zfs_vnops.c revision 330065
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 24 * Copyright 2014 Nexenta Systems, Inc. All rights reserved. 
25 * Copyright (c) 2014 Integros [integros.com] 26 */ 27 28/* Portions Copyright 2007 Jeremy Teo */ 29/* Portions Copyright 2010 Robert Milkowski */ 30 31#include <sys/types.h> 32#include <sys/param.h> 33#include <sys/time.h> 34#include <sys/systm.h> 35#include <sys/sysmacros.h> 36#include <sys/resource.h> 37#include <sys/vfs.h> 38#include <sys/vm.h> 39#include <sys/vnode.h> 40#include <sys/file.h> 41#include <sys/stat.h> 42#include <sys/kmem.h> 43#include <sys/taskq.h> 44#include <sys/uio.h> 45#include <sys/atomic.h> 46#include <sys/namei.h> 47#include <sys/mman.h> 48#include <sys/cmn_err.h> 49#include <sys/errno.h> 50#include <sys/unistd.h> 51#include <sys/zfs_dir.h> 52#include <sys/zfs_ioctl.h> 53#include <sys/fs/zfs.h> 54#include <sys/dmu.h> 55#include <sys/dmu_objset.h> 56#include <sys/spa.h> 57#include <sys/txg.h> 58#include <sys/dbuf.h> 59#include <sys/zap.h> 60#include <sys/sa.h> 61#include <sys/dirent.h> 62#include <sys/policy.h> 63#include <sys/sunddi.h> 64#include <sys/filio.h> 65#include <sys/sid.h> 66#include <sys/zfs_ctldir.h> 67#include <sys/zfs_fuid.h> 68#include <sys/zfs_sa.h> 69#include <sys/zfs_rlock.h> 70#include <sys/extdirent.h> 71#include <sys/kidmap.h> 72#include <sys/bio.h> 73#include <sys/buf.h> 74#include <sys/sched.h> 75#include <sys/acl.h> 76#include <vm/vm_param.h> 77 78/* 79 * Programming rules. 80 * 81 * Each vnode op performs some logical unit of work. To do this, the ZPL must 82 * properly lock its in-core state, create a DMU transaction, do the work, 83 * record this work in the intent log (ZIL), commit the DMU transaction, 84 * and wait for the intent log to commit if it is a synchronous operation. 85 * Moreover, the vnode ops must work in both normal and log replay context. 86 * The ordering of events is important to avoid deadlocks and references 87 * to freed memory. The example below illustrates the following Big Rules: 88 * 89 * (1) A check must be made in each zfs thread for a mounted file system. 
90 * This is done avoiding races using ZFS_ENTER(zfsvfs). 91 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes 92 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros 93 * can return EIO from the calling function. 94 * 95 * (2) VN_RELE() should always be the last thing except for zil_commit() 96 * (if necessary) and ZFS_EXIT(). This is for 3 reasons: 97 * First, if it's the last reference, the vnode/znode 98 * can be freed, so the zp may point to freed memory. Second, the last 99 * reference will call zfs_zinactive(), which may induce a lot of work -- 100 * pushing cached pages (which acquires range locks) and syncing out 101 * cached atime changes. Third, zfs_zinactive() may require a new tx, 102 * which could deadlock the system if you were already holding one. 103 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). 104 * 105 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 106 * as they can span dmu_tx_assign() calls. 107 * 108 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to 109 * dmu_tx_assign(). This is critical because we don't want to block 110 * while holding locks. 111 * 112 * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This 113 * reduces lock contention and CPU usage when we must wait (note that if 114 * throughput is constrained by the storage, nearly every transaction 115 * must wait). 116 * 117 * Note, in particular, that if a lock is sometimes acquired before 118 * the tx assigns, and sometimes after (e.g. z_lock), then failing 119 * to use a non-blocking assign can deadlock the system. The scenario: 120 * 121 * Thread A has grabbed a lock before calling dmu_tx_assign(). 122 * Thread B is in an already-assigned tx, and blocks for this lock. 123 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 124 * forever, because the previous txg can't quiesce until B's tx commits. 
125 * 126 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 127 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent 128 * calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT, 129 * to indicate that this operation has already called dmu_tx_wait(). 130 * This will ensure that we don't retry forever, waiting a short bit 131 * each time. 132 * 133 * (5) If the operation succeeded, generate the intent log entry for it 134 * before dropping locks. This ensures that the ordering of events 135 * in the intent log matches the order in which they actually occurred. 136 * During ZIL replay the zfs_log_* functions will update the sequence 137 * number to indicate the zil transaction has replayed. 138 * 139 * (6) At the end of each vnode op, the DMU tx must always commit, 140 * regardless of whether there were any errors. 141 * 142 * (7) After dropping all locks, invoke zil_commit(zilog, foid) 143 * to ensure that synchronous semantics are provided when necessary. 144 * 145 * In general, this is how things should be ordered in each vnode op: 146 * 147 * ZFS_ENTER(zfsvfs); // exit if unmounted 148 * top: 149 * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) 150 * rw_enter(...); // grab any other locks you need 151 * tx = dmu_tx_create(...); // get DMU tx 152 * dmu_tx_hold_*(); // hold each object you might modify 153 * error = dmu_tx_assign(tx, waited ? 
TXG_WAITED : TXG_NOWAIT);
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, foid);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/*
 * VOP_OPEN: deny writes to append-only files opened without FAPPEND
 * (EPERM), and deny access (EACCES) if the mandatory anti-virus scan
 * fails.  On success, count synchronous opens in z_sync_cnt.
 */
/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Append-only files may only be opened for write with FAPPEND. */
	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/* Virus scan regular, non-quarantined, non-empty files if enabled. */
	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
		if (fs_vscan(*vpp, cr, 0) != 0) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EACCES));
		}
	}

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * VOP_CLOSE: release advisory locks/shares held by this process,
 * drop the synchronous-open count on last sync close, and run the
 * on-close virus scan when enabled.  Always returns 0.
 */
/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Decrement the synchronous opens in the znode */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA).  "off" is an in/out parameter.
 * Returns ENXIO if the offset is at/past EOF or no hole/data follows.
 */
static int
zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_size;
	if (noff >= file_sz) {
		return (SET_ERROR(ENXIO));
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	/* ESRCH from the DMU means "not found"; map to lseek's ENXIO. */
	if (error == ESRCH)
		return (SET_ERROR(ENXIO));

	/*
	 * We could find a hole that begins after the logical end-of-file,
	 * because dmu_offset_next() only works on whole blocks.  If the
	 * EOF falls mid-block, then indicate that the "virtual hole"
	 * at the end of the file begins at the logical EOF, rather than
	 * at the end of the last block.
	 */
	if (noff > file_sz) {
		ASSERT(hole);
		noff = file_sz;
	}

	/* Never move the offset backwards; leave *off untouched then. */
	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}

/*
 * VOP_IOCTL: handles flush (_FIOFFS, a no-op here), the bfu
 * compatibility ioctls, hole/data seeking, and (illumos only)
 * _FIO_COUNT_FILLED.  Unknown commands return ENOTTY.
 */
/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	offset_t ndata;
	dmu_object_info_t doi;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
	{
		return (0);

		/*
		 * The following two ioctls are used by bfu.  Faking out,
		 * necessary to avoid bfu errors.
		 */
	}
	case _FIOGDIO:
	case _FIOSDIO:
	{
		return (0);
	}

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
	{
#ifdef illumos
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		/* On FreeBSD the ioctl argument is already in kernel space. */
		off = *(offset_t *)data;
#endif
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
#ifdef illumos
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		*(offset_t *)data = off;
#endif
		return (0);
	}
#ifdef illumos
	case _FIO_COUNT_FILLED:
	{
		/*
		 * _FIO_COUNT_FILLED adds a new ioctl command which
		 * exposes the number of filled blocks in a
		 * ZFS object.
		 */
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/*
		 * Wait for all dirty blocks for this object
		 * to get synced out to disk, and the DMU info
		 * updated.
		 */
		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Retrieve fill count from DMU object.
		 */
		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		ndata = doi.doi_fill_count;

		ZFS_EXIT(zfsvfs);
		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
#endif
	}
	return (SET_ERROR(ENOTTY));
}

/*
 * Find or allocate the page backing [start, start + PAGESIZE) of vp
 * and return it shared-busied, or NULL if the existing page is not
 * fully valid.  Sleeps (dropping the object lock) if the page is
 * exclusive-busy.  On success the object pip count is bumped, write
 * access is removed, and the DEV_BSIZE-aligned interior of
 * [off, off + nbytes) is marked clean.  Caller must hold the vm
 * object write-locked; page_unbusy() undoes the busy/pip state.
 */
static vm_page_t
page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
{
	vm_object_t obj;
	vm_page_t pp;
	int64_t end;

	/*
	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
	 * aligned boundaries, if the range is not aligned.  As a result a
	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
	 * the whole page would be considered clean despite having some dirty
	 * data.
	 * For this reason we should shrink the range to DEV_BSIZE aligned
	 * boundaries before calling vm_page_clear_dirty.
	 */
	end = rounddown2(off + nbytes, DEV_BSIZE);
	off = roundup2(off, DEV_BSIZE);
	nbytes = end - off;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb", true);
				zfs_vmobject_wlock(obj);
				continue;
			}
			vm_page_sbusy(pp);
		} else if (pp == NULL) {
			pp = vm_page_alloc(obj, OFF_TO_IDX(start),
			    VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED |
			    VM_ALLOC_SBUSY);
		} else {
			/* Page exists but is only partially valid: skip it. */
			ASSERT(pp != NULL && !pp->valid);
			pp = NULL;
		}

		if (pp != NULL) {
			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_object_pip_add(obj, 1);
			pmap_remove_write(pp);
			if (nbytes != 0)
				vm_page_clear_dirty(pp, off, nbytes);
		}
		break;
	}
	return (pp);
}

/*
 * Release a page obtained via page_busy(): drop the shared busy and
 * the object paging-in-progress reference.
 */
static void
page_unbusy(vm_page_t pp)
{

	vm_page_sunbusy(pp);
	vm_object_pip_subtract(pp->object, 1);
}

/*
 * Return the fully-valid resident page at offset "start" with its wire
 * hold count bumped, or NULL if absent/partially valid.  Sleeps
 * (dropping the object lock) while the page is exclusive-busy.
 * Caller must hold the vm object write-locked; release with
 * page_unhold().
 */
static vm_page_t
page_hold(vnode_t *vp, int64_t start)
{
	vm_object_t obj;
	vm_page_t pp;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb", true);
				zfs_vmobject_wlock(obj);
				continue;
			}

			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_page_lock(pp);
			vm_page_hold(pp);
			vm_page_unlock(pp);

		} else
			pp = NULL;
		break;
	}
	return (pp);
}

/*
 * Drop the hold taken by page_hold().
 */
static void
page_unhold(vm_page_t pp)
{

	vm_page_lock(pp);
	vm_page_unhold(pp);
	vm_page_unlock(pp);
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
511 */ 512static void 513update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, 514 int segflg, dmu_tx_t *tx) 515{ 516 vm_object_t obj; 517 struct sf_buf *sf; 518 caddr_t va; 519 int off; 520 521 ASSERT(segflg != UIO_NOCOPY); 522 ASSERT(vp->v_mount != NULL); 523 obj = vp->v_object; 524 ASSERT(obj != NULL); 525 526 off = start & PAGEOFFSET; 527 zfs_vmobject_wlock(obj); 528 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 529 vm_page_t pp; 530 int nbytes = imin(PAGESIZE - off, len); 531 532 if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { 533 zfs_vmobject_wunlock(obj); 534 535 va = zfs_map_page(pp, &sf); 536 (void) dmu_read(os, oid, start+off, nbytes, 537 va+off, DMU_READ_PREFETCH);; 538 zfs_unmap_page(sf); 539 540 zfs_vmobject_wlock(obj); 541 page_unbusy(pp); 542 } 543 len -= nbytes; 544 off = 0; 545 } 546 vm_object_pip_wakeupn(obj, 0); 547 zfs_vmobject_wunlock(obj); 548} 549 550/* 551 * Read with UIO_NOCOPY flag means that sendfile(2) requests 552 * ZFS to populate a range of page cache pages with data. 553 * 554 * NOTE: this function could be optimized to pre-allocate 555 * all pages in advance, drain exclusive busy on all of them, 556 * map them into contiguous KVA region and populate them 557 * in one single dmu_read() call. 
 */
/*
 * Service a sendfile(2) (UIO_NOCOPY) read: grab each page-cache page
 * in the requested range shared-busied, and populate any page that is
 * not yet valid by reading it from the DMU.  On a DMU read error a
 * freshly allocated, still-invalid page is freed rather than left in
 * the object.  uio_loffset must be page-aligned.  Returns 0 or the
 * first DMU read error.
 */
static int
mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	objset_t *os = zp->z_zfsvfs->z_os;
	struct sf_buf *sf;
	vm_object_t obj;
	vm_page_t pp;
	int64_t start;
	caddr_t va;
	int len = nbytes;
	int off;
	int error = 0;

	ASSERT(uio->uio_segflg == UIO_NOCOPY);
	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);
	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);

	zfs_vmobject_wlock(obj);
	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
		int bytes = MIN(PAGESIZE, len);

		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
		if (pp->valid == 0) {
			/* Page not yet populated: fill it from the DMU. */
			zfs_vmobject_wunlock(obj);
			va = zfs_map_page(pp, &sf);
			error = dmu_read(os, zp->z_id, start, bytes, va,
			    DMU_READ_PREFETCH);
			/* Zero the tail of a short final page. */
			if (bytes != PAGESIZE && error == 0)
				bzero(va + bytes, PAGESIZE - bytes);
			zfs_unmap_page(sf);
			zfs_vmobject_wlock(obj);
			vm_page_sunbusy(pp);
			vm_page_lock(pp);
			if (error) {
				/* Free the page only if no one else uses it. */
				if (pp->wire_count == 0 && pp->valid == 0 &&
				    !vm_page_busied(pp))
					vm_page_free(pp);
			} else {
				pp->valid = VM_PAGE_BITS_ALL;
				vm_page_activate(pp);
			}
			vm_page_unlock(pp);
		} else {
			/* Already valid: nothing to read, just unbusy. */
			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_page_sunbusy(pp);
		}
		if (error)
			break;
		uio->uio_resid -= bytes;
		uio->uio_offset += bytes;
		len -= bytes;
	}
	zfs_vmobject_wunlock(obj);
	return (error);
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Read:	We "read" preferentially from memory mapped pages,
 *		else we default from the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	 the file is memory mapped.
628 */ 629static int 630mappedread(vnode_t *vp, int nbytes, uio_t *uio) 631{ 632 znode_t *zp = VTOZ(vp); 633 vm_object_t obj; 634 int64_t start; 635 caddr_t va; 636 int len = nbytes; 637 int off; 638 int error = 0; 639 640 ASSERT(vp->v_mount != NULL); 641 obj = vp->v_object; 642 ASSERT(obj != NULL); 643 644 start = uio->uio_loffset; 645 off = start & PAGEOFFSET; 646 zfs_vmobject_wlock(obj); 647 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 648 vm_page_t pp; 649 uint64_t bytes = MIN(PAGESIZE - off, len); 650 651 if (pp = page_hold(vp, start)) { 652 struct sf_buf *sf; 653 caddr_t va; 654 655 zfs_vmobject_wunlock(obj); 656 va = zfs_map_page(pp, &sf); 657#ifdef illumos 658 error = uiomove(va + off, bytes, UIO_READ, uio); 659#else 660 error = vn_io_fault_uiomove(va + off, bytes, uio); 661#endif 662 zfs_unmap_page(sf); 663 zfs_vmobject_wlock(obj); 664 page_unhold(pp); 665 } else { 666 zfs_vmobject_wunlock(obj); 667 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), 668 uio, bytes); 669 zfs_vmobject_wlock(obj); 670 } 671 len -= bytes; 672 off = 0; 673 if (error) 674 break; 675 } 676 zfs_vmobject_wunlock(obj); 677 return (error); 678} 679 680offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ 681 682/* 683 * Read bytes from specified file into supplied buffer. 684 * 685 * IN: vp - vnode of file to be read from. 686 * uio - structure supplying read location, range info, 687 * and return buffer. 688 * ioflag - SYNC flags; used to provide FRSYNC semantics. 689 * cr - credentials of caller. 690 * ct - caller context 691 * 692 * OUT: uio - updated offset and range, buffer filled. 693 * 694 * RETURN: 0 on success, error code on failure. 
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	ssize_t		n, nbytes;
	int		error = 0;
	rl_t		*rl;
	xuio_t		*xuio = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Quarantined files may not be read. */
	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE(zp->z_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (zfsvfs->z_log &&
	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_size);
	/* Clamp the request to what actually exists before EOF. */
	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);

#ifdef illumos
	/* Zero-copy (xuio) read: pre-stage loaned ARC buffers. */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
		int nblk;
		int blksz = zp->z_blksz;
		uint64_t offset = uio->uio_loffset;

		xuio = (xuio_t *)uio;
		if ((ISP2(blksz))) {
			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
			    blksz)) / blksz;
		} else {
			ASSERT(offset + n <= blksz);
			nblk = 1;
		}
		(void) dmu_xuio_init(xuio, nblk);

		if (vn_has_cached_data(vp)) {
			/*
			 * For simplicity, we always allocate a full buffer
			 * even if we only expect to read a portion of a block.
			 */
			while (--nblk >= 0) {
				(void) dmu_xuio_add(xuio,
				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
				    blksz), 0, blksz);
			}
		}
	}
#endif	/* illumos */

	/* Copy out in zfs_read_chunk_size pieces, aligned to chunk size. */
	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

#ifdef __FreeBSD__
		if (uio->uio_segflg == UIO_NOCOPY)
			error = mappedread_sf(vp, nbytes, uio);
		else
#endif /* __FreeBSD__ */
		if (vn_has_cached_data(vp)) {
			error = mappedread(vp, nbytes, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes);
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}

		n -= nbytes;
	}
out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Write the bytes to a file.
 *
 * IN:	vp	- vnode of file to be written to.
 *	uio	- structure supplying write location, range info,
 *		  and data buffer.
 *	ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
 *		  set if in append mode.
 *	cr	- credentials of caller.
841 * ct - caller context (NFS/CIFS fem monitor only) 842 * 843 * OUT: uio - updated offset and range. 844 * 845 * RETURN: 0 on success, error code on failure. 846 * 847 * Timestamps: 848 * vp - ctime|mtime updated if byte count > 0 849 */ 850 851/* ARGSUSED */ 852static int 853zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 854{ 855 znode_t *zp = VTOZ(vp); 856 rlim64_t limit = MAXOFFSET_T; 857 ssize_t start_resid = uio->uio_resid; 858 ssize_t tx_bytes; 859 uint64_t end_size; 860 dmu_tx_t *tx; 861 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 862 zilog_t *zilog; 863 offset_t woff; 864 ssize_t n, nbytes; 865 rl_t *rl; 866 int max_blksz = zfsvfs->z_max_blksz; 867 int error = 0; 868 arc_buf_t *abuf; 869 iovec_t *aiov = NULL; 870 xuio_t *xuio = NULL; 871 int i_iov = 0; 872 int iovcnt = uio->uio_iovcnt; 873 iovec_t *iovp = uio->uio_iov; 874 int write_eof; 875 int count = 0; 876 sa_bulk_attr_t bulk[4]; 877 uint64_t mtime[2], ctime[2]; 878 879 /* 880 * Fasttrack empty write 881 */ 882 n = start_resid; 883 if (n == 0) 884 return (0); 885 886 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 887 limit = MAXOFFSET_T; 888 889 ZFS_ENTER(zfsvfs); 890 ZFS_VERIFY_ZP(zp); 891 892 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 893 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 894 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 895 &zp->z_size, 8); 896 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 897 &zp->z_pflags, 8); 898 899 /* 900 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our 901 * callers might not be able to detect properly that we are read-only, 902 * so check it explicitly here. 
903 */ 904 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 905 ZFS_EXIT(zfsvfs); 906 return (SET_ERROR(EROFS)); 907 } 908 909 /* 910 * If immutable or not appending then return EPERM 911 */ 912 if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || 913 ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && 914 (uio->uio_loffset < zp->z_size))) { 915 ZFS_EXIT(zfsvfs); 916 return (SET_ERROR(EPERM)); 917 } 918 919 zilog = zfsvfs->z_log; 920 921 /* 922 * Validate file offset 923 */ 924 woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; 925 if (woff < 0) { 926 ZFS_EXIT(zfsvfs); 927 return (SET_ERROR(EINVAL)); 928 } 929 930 /* 931 * Check for mandatory locks before calling zfs_range_lock() 932 * in order to prevent a deadlock with locks set via fcntl(). 933 */ 934 if (MANDMODE((mode_t)zp->z_mode) && 935 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { 936 ZFS_EXIT(zfsvfs); 937 return (error); 938 } 939 940#ifdef illumos 941 /* 942 * Pre-fault the pages to ensure slow (eg NFS) pages 943 * don't hold up txg. 944 * Skip this if uio contains loaned arc_buf. 945 */ 946 if ((uio->uio_extflg == UIO_XUIO) && 947 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) 948 xuio = (xuio_t *)uio; 949 else 950 uio_prefaultpages(MIN(n, max_blksz), uio); 951#endif 952 953 /* 954 * If in append mode, set the io offset pointer to eof. 955 */ 956 if (ioflag & FAPPEND) { 957 /* 958 * Obtain an appending range lock to guarantee file append 959 * semantics. We reset the write offset once we have the lock. 960 */ 961 rl = zfs_range_lock(zp, 0, n, RL_APPEND); 962 woff = rl->r_off; 963 if (rl->r_len == UINT64_MAX) { 964 /* 965 * We overlocked the file because this write will cause 966 * the file block size to increase. 967 * Note that zp_size cannot change with this lock held. 
968 */ 969 woff = zp->z_size; 970 } 971 uio->uio_loffset = woff; 972 } else { 973 /* 974 * Note that if the file block size will change as a result of 975 * this write, then this range lock will lock the entire file 976 * so that we can re-write the block safely. 977 */ 978 rl = zfs_range_lock(zp, woff, n, RL_WRITER); 979 } 980 981 if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { 982 zfs_range_unlock(rl); 983 ZFS_EXIT(zfsvfs); 984 return (EFBIG); 985 } 986 987 if (woff >= limit) { 988 zfs_range_unlock(rl); 989 ZFS_EXIT(zfsvfs); 990 return (SET_ERROR(EFBIG)); 991 } 992 993 if ((woff + n) > limit || woff > (limit - n)) 994 n = limit - woff; 995 996 /* Will this write extend the file length? */ 997 write_eof = (woff + n > zp->z_size); 998 999 end_size = MAX(zp->z_size, woff + n); 1000 1001 /* 1002 * Write the file in reasonable size chunks. Each chunk is written 1003 * in a separate transaction; this keeps the intent log records small 1004 * and allows us to do more fine-grained space accounting. 1005 */ 1006 while (n > 0) { 1007 abuf = NULL; 1008 woff = uio->uio_loffset; 1009 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || 1010 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { 1011 if (abuf != NULL) 1012 dmu_return_arcbuf(abuf); 1013 error = SET_ERROR(EDQUOT); 1014 break; 1015 } 1016 1017 if (xuio && abuf == NULL) { 1018 ASSERT(i_iov < iovcnt); 1019 aiov = &iovp[i_iov]; 1020 abuf = dmu_xuio_arcbuf(xuio, i_iov); 1021 dmu_xuio_clear(xuio, i_iov); 1022 DTRACE_PROBE3(zfs_cp_write, int, i_iov, 1023 iovec_t *, aiov, arc_buf_t *, abuf); 1024 ASSERT((aiov->iov_base == abuf->b_data) || 1025 ((char *)aiov->iov_base - (char *)abuf->b_data + 1026 aiov->iov_len == arc_buf_size(abuf))); 1027 i_iov++; 1028 } else if (abuf == NULL && n >= max_blksz && 1029 woff >= zp->z_size && 1030 P2PHASE(woff, max_blksz) == 0 && 1031 zp->z_blksz == max_blksz) { 1032 /* 1033 * This write covers a full block. "Borrow" a buffer 1034 * from the dmu so that we can fill it before we enter 1035 * a transaction. 
This avoids the possibility of 1036 * holding up the transaction if the data copy hangs 1037 * up on a pagefault (e.g., from an NFS server mapping). 1038 */ 1039 size_t cbytes; 1040 1041 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 1042 max_blksz); 1043 ASSERT(abuf != NULL); 1044 ASSERT(arc_buf_size(abuf) == max_blksz); 1045 if (error = uiocopy(abuf->b_data, max_blksz, 1046 UIO_WRITE, uio, &cbytes)) { 1047 dmu_return_arcbuf(abuf); 1048 break; 1049 } 1050 ASSERT(cbytes == max_blksz); 1051 } 1052 1053 /* 1054 * Start a transaction. 1055 */ 1056 tx = dmu_tx_create(zfsvfs->z_os); 1057 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1058 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); 1059 zfs_sa_upgrade_txholds(tx, zp); 1060 error = dmu_tx_assign(tx, TXG_WAIT); 1061 if (error) { 1062 dmu_tx_abort(tx); 1063 if (abuf != NULL) 1064 dmu_return_arcbuf(abuf); 1065 break; 1066 } 1067 1068 /* 1069 * If zfs_range_lock() over-locked we grow the blocksize 1070 * and then reduce the lock range. This will only happen 1071 * on the first iteration since zfs_range_reduce() will 1072 * shrink down r_len to the appropriate size. 1073 */ 1074 if (rl->r_len == UINT64_MAX) { 1075 uint64_t new_blksz; 1076 1077 if (zp->z_blksz > max_blksz) { 1078 /* 1079 * File's blocksize is already larger than the 1080 * "recordsize" property. Only let it grow to 1081 * the next power of 2. 1082 */ 1083 ASSERT(!ISP2(zp->z_blksz)); 1084 new_blksz = MIN(end_size, 1085 1 << highbit64(zp->z_blksz)); 1086 } else { 1087 new_blksz = MIN(end_size, max_blksz); 1088 } 1089 zfs_grow_blocksize(zp, new_blksz, tx); 1090 zfs_range_reduce(rl, woff, n); 1091 } 1092 1093 /* 1094 * XXX - should we really limit each write to z_max_blksz? 1095 * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 
1096 */ 1097 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); 1098 1099 if (woff + nbytes > zp->z_size) 1100 vnode_pager_setsize(vp, woff + nbytes); 1101 1102 if (abuf == NULL) { 1103 tx_bytes = uio->uio_resid; 1104 error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), 1105 uio, nbytes, tx); 1106 tx_bytes -= uio->uio_resid; 1107 } else { 1108 tx_bytes = nbytes; 1109 ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); 1110 /* 1111 * If this is not a full block write, but we are 1112 * extending the file past EOF and this data starts 1113 * block-aligned, use assign_arcbuf(). Otherwise, 1114 * write via dmu_write(). 1115 */ 1116 if (tx_bytes < max_blksz && (!write_eof || 1117 aiov->iov_base != abuf->b_data)) { 1118 ASSERT(xuio); 1119 dmu_write(zfsvfs->z_os, zp->z_id, woff, 1120 aiov->iov_len, aiov->iov_base, tx); 1121 dmu_return_arcbuf(abuf); 1122 xuio_stat_wbuf_copied(); 1123 } else { 1124 ASSERT(xuio || tx_bytes == max_blksz); 1125 dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), 1126 woff, abuf, tx); 1127 } 1128 ASSERT(tx_bytes <= uio->uio_resid); 1129 uioskip(uio, tx_bytes); 1130 } 1131 if (tx_bytes && vn_has_cached_data(vp)) { 1132 update_pages(vp, woff, tx_bytes, zfsvfs->z_os, 1133 zp->z_id, uio->uio_segflg, tx); 1134 } 1135 1136 /* 1137 * If we made no progress, we're done. If we made even 1138 * partial progress, update the znode and ZIL accordingly. 1139 */ 1140 if (tx_bytes == 0) { 1141 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 1142 (void *)&zp->z_size, sizeof (uint64_t), tx); 1143 dmu_tx_commit(tx); 1144 ASSERT(error != 0); 1145 break; 1146 } 1147 1148 /* 1149 * Clear Set-UID/Set-GID bits on successful write if not 1150 * privileged and at least one of the excute bits is set. 1151 * 1152 * It would be nice to to this after all writes have 1153 * been done, but that would still expose the ISUID/ISGID 1154 * to another app after the partial write is committed. 
1155 * 1156 * Note: we don't call zfs_fuid_map_id() here because 1157 * user 0 is not an ephemeral uid. 1158 */ 1159 mutex_enter(&zp->z_acl_lock); 1160 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | 1161 (S_IXUSR >> 6))) != 0 && 1162 (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && 1163 secpolicy_vnode_setid_retain(vp, cr, 1164 (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { 1165 uint64_t newmode; 1166 zp->z_mode &= ~(S_ISUID | S_ISGID); 1167 newmode = zp->z_mode; 1168 (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), 1169 (void *)&newmode, sizeof (uint64_t), tx); 1170 } 1171 mutex_exit(&zp->z_acl_lock); 1172 1173 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 1174 B_TRUE); 1175 1176 /* 1177 * Update the file size (zp_size) if it has changed; 1178 * account for possible concurrent updates. 1179 */ 1180 while ((end_size = zp->z_size) < uio->uio_loffset) { 1181 (void) atomic_cas_64(&zp->z_size, end_size, 1182 uio->uio_loffset); 1183#ifdef illumos 1184 ASSERT(error == 0); 1185#else 1186 ASSERT(error == 0 || error == EFAULT); 1187#endif 1188 } 1189 /* 1190 * If we are replaying and eof is non zero then force 1191 * the file size to the specified eof. Note, there's no 1192 * concurrency during replay. 1193 */ 1194 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) 1195 zp->z_size = zfsvfs->z_replay_eof; 1196 1197 if (error == 0) 1198 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1199 else 1200 (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1201 1202 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); 1203 dmu_tx_commit(tx); 1204 1205 if (error != 0) 1206 break; 1207 ASSERT(tx_bytes == nbytes); 1208 n -= nbytes; 1209 1210#ifdef illumos 1211 if (!xuio && n > 0) 1212 uio_prefaultpages(MIN(n, max_blksz), uio); 1213#endif 1214 } 1215 1216 zfs_range_unlock(rl); 1217 1218 /* 1219 * If we're in replay mode, or we made no progress, return error. 1220 * Otherwise, it's at least a partial write, so it's successful. 
 */
	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

#ifdef __FreeBSD__
	/*
	 * EFAULT means that at least one page of the source buffer was not
	 * available.  VFS will re-try remaining I/O upon this error.
	 */
	if (error == EFAULT) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
#endif

	if (ioflag & (FSYNC | FDSYNC) ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Completion callback for the dmu_sync()/dmu_buf_hold() work started by
 * zfs_get_data(): releases the dbuf (if any), the range lock, and the
 * vnode hold, then frees the zgd.  On success, the written block is
 * recorded with the ZIL via zil_add_block().
 */
void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;
	objset_t *os = zp->z_zfsvfs->z_os;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

	if (error == 0 && zgd->zgd_bp)
		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}

#ifdef DEBUG
/* Debug knob: when nonzero, force the next indirect-write path to fail. */
static int zil_fault_io = 0;
#endif

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	blkptr_t *bp = &lr->lr_blkptr;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;

	ASSERT(zio != NULL);
	ASSERT(size != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
		return (SET_ERROR(ENOENT));
	}

	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_zilog = zfsvfs->z_log;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
			    RL_READER);
			if (zp->z_blksz == size)
				break;
			/* blocksize changed under us; drop the lock, retry */
			offset += blkoff;
			zfs_range_unlock(zgd->zgd_rl);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef DEBUG
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			blkptr_t *obp = dmu_buf_get_blkptr(db);
			if (obp) {
				ASSERT(BP_IS_HOLE(bp));
				*bp = *obp;
			}

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= zp->z_blksz);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf.  We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			if (error == EALREADY) {
				/*
				 * The block already made it to disk via the
				 * normal sync path; log a WR_COPIED-style
				 * TX_WRITE2 record instead.
				 */
				lr->lr_common.lrc_txtype = TX_WRITE2;
				error = 0;
			}
		}
	}

	zfs_get_done(zgd, error);

	return (error);
}

/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* ACE-mask checks go through the full ACL path; else legacy rwx. */
	if (flag & V_ACE_MASK)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
	else
		error = zfs_zaccess_rwx(zp, mode, flag, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * vn_vget_ino_gen() callback used by zfs_lookup_lock() for the ".."
 * case: lock the vnode passed in via 'arg' and drop its reference on
 * failure.
 */
static int
zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
	int error;

	*vpp = arg;
	error = vn_lock(*vpp, lkflags);
	if (error != 0)
		vrele(*vpp);
	return (error);
}

/*
 * Lock the vnode 'vp' found by a lookup in directory 'dvp', obeying the
 * VFS lock order.  Three cases: "" or "." (vp == dvp, may need a lock
 * upgrade/downgrade), ".." (reverse parent/child order, handled via
 * vn_vget_ino_gen()), and a regular child (plain vn_lock()).
 */
static int
zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int error;
	int ltype;

	ASSERT_VOP_LOCKED(dvp, __func__);
#ifdef DIAGNOSTIC
	if ((zdp->z_pflags & ZFS_XATTR) == 0)
		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
#endif

	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
		ASSERT3P(dvp, ==, vp);
		vref(dvp);
		ltype = lkflags & LK_TYPE_MASK;
		if (ltype != VOP_ISLOCKED(dvp)) {
			if (ltype == LK_EXCLUSIVE)
				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
			else /* if (ltype == LK_SHARED) */
				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);

			/*
			 * Relock for the "." case could leave us with
			 * reclaimed vnode.
			 */
			if (dvp->v_iflag & VI_DOOMED) {
				vrele(dvp);
				return (SET_ERROR(ENOENT));
			}
		}
		return (0);
	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
		/*
		 * Note that in this case, dvp is the child vnode, and we
		 * are looking up the parent vnode - exactly reverse from
		 * normal operation.  Unlocking dvp requires some rather
		 * tricky unlock/relock dance to prevent mp from being freed;
		 * use vn_vget_ino_gen() which takes care of all that.
		 *
		 * XXX Note that there is a time window when both vnodes are
		 * unlocked.  It is possible, although highly unlikely, that
		 * during that window the parent-child relationship between
		 * the vnodes may change, for example, get reversed.
		 * In that case we would have a wrong lock order for the vnodes.
		 * All other filesystems seem to ignore this problem, so we
		 * do the same here.
		 * A potential solution could be implemented as follows:
		 * - using LK_NOWAIT when locking the second vnode and retrying
		 *   if necessary
		 * - checking that the parent-child relationship still holds
		 *   after locking both vnodes and retrying if it doesn't
		 */
		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
		return (error);
	} else {
		error = vn_lock(vp, lkflags);
		if (error != 0)
			vrele(vp);
		return (error);
	}
}

/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held vnode reference for it.
 *
 * IN:	dvp	- vnode of directory to search.
 *	nm	- name of entry to lookup.
 *	pnp	- full pathname to lookup [UNUSED].
 *	flags	- LOOKUP_XATTR set if looking for an attribute.
 *	rdir	- root directory vnode [UNUSED].
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * OUT:	vpp	- vnode of located entry, NULL if not found.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	NA
 */
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
    int nameiop, cred_t *cr, kthread_t *td, int flags)
{
	znode_t *zdp = VTOZ(dvp);
	znode_t *zp;
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int error = 0;

	/* fast path (should be redundant with vfs namecache) */
	if (!(flags & LOOKUP_XATTR)) {
		if (dvp->v_type != VDIR) {
			return (SET_ERROR(ENOTDIR));
		} else if (zdp->z_sa_hdl == NULL) {
			return (SET_ERROR(EIO));
		}
	}

	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zdp);

	*vpp = NULL;

	if (flags & LOOKUP_XATTR) {
#ifdef TODO
		/*
		 * If the xattr property is off, refuse the lookup request.
		 */
		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}
#endif

		/*
		 * We don't allow recursive attributes..
		 * Maybe someday we will.
		 */
		if (zdp->z_pflags & ZFS_XATTR) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */
		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
		    B_FALSE, cr)) {
			vrele(*vpp);
			*vpp = NULL;
		}

		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Check accessibility of directory.
	 */
	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}


	/*
	 * First handle the special cases.
	 */
	if ((cnp->cn_flags & ISDOTDOT) != 0) {
		/*
		 * If we are a snapshot mounted under .zfs, return
		 * the vp for the snapshot directory.
		 */
		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
			struct componentname cn;
			vnode_t *zfsctl_vp;
			int ltype;

			ZFS_EXIT(zfsvfs);
			ltype = VOP_ISLOCKED(dvp);
			VOP_UNLOCK(dvp, 0);
			error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
			    &zfsctl_vp);
			if (error == 0) {
				cn.cn_nameptr = "snapshot";
				cn.cn_namelen = strlen(cn.cn_nameptr);
				cn.cn_nameiop = cnp->cn_nameiop;
				cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
				cn.cn_lkflags = cnp->cn_lkflags;
				error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
				vput(zfsctl_vp);
			}
			vn_lock(dvp, ltype | LK_RETRY);
			return (error);
		}
	}
	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
		ZFS_EXIT(zfsvfs);
		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
			return (SET_ERROR(ENOTSUP));
		error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
		return (error);
	}

	/*
	 * The loop retries the lookup if the parent-child relationship
	 * changes during the dot-dot locking complexities.
	 */
	for (;;) {
		uint64_t parent;

		error = zfs_dirlook(zdp, nm, &zp);
		if (error == 0)
			*vpp = ZTOV(zp);

		ZFS_EXIT(zfsvfs);
		if (error != 0)
			break;

		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
		if (error != 0) {
			/*
			 * If we've got a locking error, then the vnode
			 * got reclaimed because of a force unmount.
			 * We never enter doomed vnodes into the name cache.
			 */
			*vpp = NULL;
			return (error);
		}

		if ((cnp->cn_flags & ISDOTDOT) == 0)
			break;

		/* Re-verify the parent under ZFS_ENTER; retry if it moved. */
		ZFS_ENTER(zfsvfs);
		if (zdp->z_sa_hdl == NULL) {
			error = SET_ERROR(EIO);
		} else {
			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
			    &parent, sizeof (parent));
		}
		if (error != 0) {
			ZFS_EXIT(zfsvfs);
			vput(ZTOV(zp));
			break;
		}
		if (zp->z_id == parent) {
			ZFS_EXIT(zfsvfs);
			break;
		}
		vput(ZTOV(zp));
	}

/* NOTE(review): label has no corresponding goto in this revision. */
out:
	if (error != 0)
		*vpp = NULL;

	/* Translate errors and add SAVENAME when needed. */
	if (cnp->cn_flags & ISLASTCN) {
		switch (nameiop) {
		case CREATE:
		case RENAME:
			if (error == ENOENT) {
				error = EJUSTRETURN;
				cnp->cn_flags |= SAVENAME;
				break;
			}
			/* FALLTHROUGH */
		case DELETE:
			if (error == 0)
				cnp->cn_flags |= SAVENAME;
			break;
		}
	}

	/* Insert name into cache (as non-existent) if appropriate. */
	if (zfsvfs->z_use_namecache &&
	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
		cache_enter(dvp, NULL, cnp);

	/* Insert name into cache if appropriate. */
	if (zfsvfs->z_use_namecache &&
	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
		if (!(cnp->cn_flags & ISLASTCN) ||
		    (nameiop != DELETE && nameiop != RENAME)) {
			cache_enter(dvp, *vpp, cnp);
		}
	}

	return (error);
}

/*
 * Attempt to create a new entry in a directory.  If the entry
 * already exists, truncate the file if permissible, else return
 * an error.  Return the vp of the created or trunc'd file.
 *
 * IN:	dvp	- vnode of directory to put new file entry in.
 *	name	- name of new file entry.
 *	vap	- attributes of new file.
 *	excl	- flag indicating exclusive or non-exclusive mode.
 *	mode	- mode to open file with.
 *	cr	- credentials of caller.
 *	flag	- large file flag [UNUSED].
 *	ct	- caller context
 *	vsecp	- ACL to be set
 *
 * OUT:	vpp	- vnode of created or trunc'd entry.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated if new entry created
 *	 vp - ctime|mtime always, atime if new
 */

/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
    vnode_t **vpp, cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	objset_t	*os;
	dmu_tx_t	*tx;
	int		error;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	void		*vsecp = NULL;
	int		flag = 0;
	uint64_t	txtype;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	*vpp = NULL;

	/* Strip the sticky bit if the caller may not set it. */
	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~S_ISVTX;

	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	ASSERT3P(zp, ==, NULL);

	/*
	 * Create a new file object and update the directory
	 * to reference it.
	 */
	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		goto out;
	}

	/*
	 * We only support the creation of regular files in
	 * extended attribute directories.
	 */

	if ((dzp->z_pflags & ZFS_XATTR) &&
	    (vap->va_type != VREG)) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	if ((error = zfs_acl_ids_create(dzp, 0, vap,
	    cr, vsecp, &acl_ids)) != 0)
		goto out;

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		error = SET_ERROR(EDQUOT);
		goto out;
	}

	getnewvnode_reserve(1);

	tx = dmu_tx_create(os);

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	if (!zfsvfs->z_use_sa &&
	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
		    0, acl_ids.z_aclp->z_acl_bytes);
	}
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
	    vsecp, acl_ids.z_fuidp, vap);
	zfs_acl_ids_free(&acl_ids);
	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

out:
	if (error == 0) {
		*vpp = ZTOV(zp);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Remove an entry from a directory.
 *
 * IN:	dvp	- vnode of directory to remove entry from.
 *	name	- name of entry to remove.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime
 *	 vp - ctime (if nlink > 0)
 */

/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp = VTOZ(vp);
	znode_t		*xzp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	/* NOTE(review): acl_obj and toobig appear unused in this function. */
	uint64_t	acl_obj, xattr_obj;
	uint64_t	obj = 0;
	dmu_tx_t	*tx;
	boolean_t	unlinked, toobig = FALSE;
	uint64_t	txtype;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	ZFS_VERIFY_ZP(zp);
	zilog = zfsvfs->z_log;
	/* NOTE(review): redundant -- zp was already initialized from vp. */
	zp = VTOZ(vp);

	xattr_obj = 0;
	xzp = NULL;

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (vp->v_type == VDIR) {
		error = SET_ERROR(EPERM);
		goto out;
	}

	/*
	 * NOTE(review): 'ct' is not a parameter of this function;
	 * presumably vnevent_remove() is a no-op macro on FreeBSD so the
	 * argument is never expanded -- confirm against the compat headers.
	 */
	vnevent_remove(vp, dvp, name, ct);

	obj = zp->z_id;

	/* are there any extended attributes? */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		ASSERT0(error);
	}

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);

	if (xzp) {
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	/*
	 * Mark this transaction as typically resulting in a net free of space
	 */
	dmu_tx_mark_netfree(tx);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	if (unlinked) {
		zfs_unlinked_add(zp, tx);
		vp->v_vflag |= VV_NOSYNC;
	}

	txtype = TX_REMOVE;
	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);

	dmu_tx_commit(tx);
out:

	if (xzp)
		vrele(ZTOV(xzp));

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Create a new directory and insert it into dvp using the name
 * provided.  Return a pointer to the inserted directory.
 *
 * IN:	dvp	- vnode of directory to add subdir to.
 *	dirname	- name of new directory.
 *	vap	- attributes of new directory.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *	vsecp	- ACL to be set
 *
 * OUT:	vpp	- vnode of created directory.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 *	 vp - ctime|mtime|atime updated
 */
/*ARGSUSED*/
static int
zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	txtype;
	dmu_tx_t	*tx;
	int		error;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;

	ASSERT(vap->va_type == VDIR);

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);
	if (zfsvfs->z_use_fuids == B_FALSE &&
	    ((vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/* No directories inside extended attribute directories. */
	if (dzp->z_pflags & ZFS_XATTR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	if (zfsvfs->z_utf8 && u8_validate(dirname,
	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
	    NULL, &acl_ids)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * First make sure the new directory doesn't exist.
	 *
	 * Existence is checked first to make sure we don't return
	 * EACCES instead of EEXIST which can cause some applications
	 * to fail.
	 */
	*vpp = NULL;

	if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	ASSERT3P(zp, ==, NULL);

	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EDQUOT));
	}

	/*
	 * Add a new entry to the directory.
	 */
	getnewvnode_reserve(1);
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create new node.
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/*
	 * Now put new name in parent dir.
	 */
	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);

	*vpp = ZTOV(zp);

	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
	    acl_ids.z_fuidp, vap);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Remove a directory subdir entry.  If the current working
 * directory is the same as the subdir to be removed, the
 * remove will fail.
 *
 * IN:	dvp	- vnode of directory to remove from.
 *	name	- name of directory to be removed.
 *	cwd	- vnode of current working directory.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	dmu_tx_t	*tx;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	ZFS_VERIFY_ZP(zp);
	zilog = zfsvfs->z_log;


	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	if (vp->v_type != VDIR) {
		error = SET_ERROR(ENOTDIR);
		goto out;
	}

	/*
	 * NOTE(review): 'ct' is not a parameter of this function;
	 * presumably vnevent_rmdir() is a no-op macro on FreeBSD so the
	 * argument is never expanded -- confirm against the compat headers.
	 */
	vnevent_rmdir(vp, dvp, name, ct);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	cache_purge(dvp);

	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);

	if (error == 0) {
		uint64_t txtype = TX_RMDIR;
		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
	}

	dmu_tx_commit(tx);

	cache_purge(vp);
out:
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Read as many directory entries as will fit into the provided
 * buffer from the given directory cursor position (specified in
 * the uio structure).
 *
 * IN:	vp	- vnode of directory to read.
 *	uio	- structure supplying read location, range info,
 *		  and return buffer.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * OUT:	uio	- updated offset and range, buffer filled.
 *	eofp	- set to true if end-of-file detected.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - atime updated
 *
 * Note that the low 4 bits of the cookie returned by zap is always zero.
 * This allows us to use the low range for "special" directory entries:
 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
 * we use the offset 2 for the '.zfs' directory.
 */
/* ARGSUSED */
static int
zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
{
    znode_t *zp = VTOZ(vp);
    iovec_t *iovp;
    edirent_t *eodp;
    dirent64_t *odp;
    zfsvfs_t *zfsvfs = zp->z_zfsvfs;
    objset_t *os;
    caddr_t outbuf;
    size_t bufsize;
    zap_cursor_t zc;
    zap_attribute_t zap;
    uint_t bytes_wanted;
    uint64_t offset; /* must be unsigned; checks for < 1 */
    uint64_t parent;
    int local_eof;
    int outcount;
    int error;
    uint8_t prefetch;
    boolean_t check_sysattrs;
    uint8_t type;
    int ncooks;
    u_long *cooks = NULL;
    /*
     * 'flags' is always 0 in this FreeBSD port, so the V_RDDIR_ENTFLAGS
     * and V_RDDIR_ACCFILTER branches below are currently unreachable;
     * they are retained from the illumos code.
     */
    int flags = 0;

    ZFS_ENTER(zfsvfs);
    ZFS_VERIFY_ZP(zp);

    /* Fetch the parent object number for the ".." entry. */
    if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
        &parent, sizeof (parent))) != 0) {
        ZFS_EXIT(zfsvfs);
        return (error);
    }

    /*
     * If we are not given an eof variable,
     * use a local one.
     */
    if (eofp == NULL)
        eofp = &local_eof;

    /*
     * Check for valid iov_len.
     */
    if (uio->uio_iov->iov_len <= 0) {
        ZFS_EXIT(zfsvfs);
        return (SET_ERROR(EINVAL));
    }

    /*
     * Quit if directory has been removed (posix)
     */
    if ((*eofp = zp->z_unlinked) != 0) {
        ZFS_EXIT(zfsvfs);
        return (0);
    }

    error = 0;
    os = zfsvfs->z_os;
    offset = uio->uio_loffset;
    prefetch = zp->z_zn_prefetch;

    /*
     * Initialize the iterator cursor.  Offsets 0..3 are the synthetic
     * ".", "..", and ".zfs" slots (see the block comment above); any
     * larger offset is a serialized ZAP cursor produced by a previous
     * call to this function.
     */
    if (offset <= 3) {
        /*
         * Start iteration from the beginning of the directory.
         */
        zap_cursor_init(&zc, os, zp->z_id);
    } else {
        /*
         * The offset is a serialized cursor.
         */
        zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
    }

    /*
     * Get space to change directory entries into fs independent format.
     * For a single kernel-space iovec we can fill the caller's buffer
     * in place; otherwise stage entries in a temporary kmem buffer and
     * uiomove() them out at the end.
     */
    iovp = uio->uio_iov;
    bytes_wanted = iovp->iov_len;
    if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
        bufsize = bytes_wanted;
        outbuf = kmem_alloc(bufsize, KM_SLEEP);
        odp = (struct dirent64 *)outbuf;
    } else {
        bufsize = bytes_wanted;
        outbuf = NULL;
        odp = (struct dirent64 *)iovp->iov_base;
    }
    eodp = (struct edirent *)odp;

    /*
     * FreeBSD VFS cookie support (used by the NFS server): allocate a
     * worst-case array of seek cookies, one per emitted entry.  On
     * error the array is freed before returning (see bottom of
     * function); on success unused slots are subtracted from
     * *ncookies.
     */
    if (ncookies != NULL) {
        /*
         * Minimum entry size is dirent size and 1 byte for a file name.
         */
        ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
        cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
        *cookies = cooks;
        *ncookies = ncooks;
    }
    /*
     * If this VFS supports the system attribute view interface; and
     * we're looking at an extended attribute directory; and we care
     * about normalization conflicts on this vfs; then we must check
     * for normalization conflicts with the sysattr name space.
     */
#ifdef TODO
    check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
        (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
        (flags & V_RDDIR_ENTFLAGS);
#else
    check_sysattrs = 0;
#endif

    /*
     * Transform to file-system independent format
     */
    outcount = 0;
    while (outcount < bytes_wanted) {
        ino64_t objnum;
        ushort_t reclen;
        /*
         * NOTE(review): 'next' is captured for the extended-entry
         * format but is never stored through in the visible code;
         * it appears vestigial in this port.
         */
        off64_t *next = NULL;

        /*
         * Special case `.', `..', and `.zfs'.
         */
        if (offset == 0) {
            (void) strcpy(zap.za_name, ".");
            zap.za_normalization_conflict = 0;
            objnum = zp->z_id;
            type = DT_DIR;
        } else if (offset == 1) {
            (void) strcpy(zap.za_name, "..");
            zap.za_normalization_conflict = 0;
            objnum = parent;
            type = DT_DIR;
        } else if (offset == 2 && zfs_show_ctldir(zp)) {
            (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
            zap.za_normalization_conflict = 0;
            objnum = ZFSCTL_INO_ROOT;
            type = DT_DIR;
        } else {
            /*
             * Grab next entry.
             */
            if (error = zap_cursor_retrieve(&zc, &zap)) {
                /* ENOENT from the cursor means end of directory. */
                if ((*eofp = (error == ENOENT)) != 0)
                    break;
                else
                    goto update;
            }

            /*
             * A directory ZAP entry must be a single 8-byte
             * integer (object number + type); anything else is
             * on-disk corruption.
             */
            if (zap.za_integer_length != 8 ||
                zap.za_num_integers != 1) {
                cmn_err(CE_WARN, "zap_readdir: bad directory "
                    "entry, obj = %lld, offset = %lld\n",
                    (u_longlong_t)zp->z_id,
                    (u_longlong_t)offset);
                error = SET_ERROR(ENXIO);
                goto update;
            }

            objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
            /*
             * MacOS X can extract the object type here such as:
             * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
             */
            type = ZFS_DIRENT_TYPE(zap.za_first_integer);

            if (check_sysattrs && !zap.za_normalization_conflict) {
#ifdef TODO
                zap.za_normalization_conflict =
                    xattr_sysattr_casechk(zap.za_name);
#else
                panic("%s:%u: TODO", __func__, __LINE__);
#endif
            }
        }

        if (flags & V_RDDIR_ACCFILTER) {
            /*
             * If we have no access at all, don't include
             * this entry in the returned information
             */
            znode_t *ezp;
            if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
                goto skip_entry;
            if (!zfs_has_access(ezp, cr)) {
                vrele(ZTOV(ezp));
                goto skip_entry;
            }
            vrele(ZTOV(ezp));
        }

        if (flags & V_RDDIR_ENTFLAGS)
            reclen = EDIRENT_RECLEN(strlen(zap.za_name));
        else
            reclen = DIRENT64_RECLEN(strlen(zap.za_name));

        /*
         * Will this entry fit in the buffer?
         */
        if (outcount + reclen > bufsize) {
            /*
             * Did we manage to fit anything in the buffer?
             */
            if (!outcount) {
                error = SET_ERROR(EINVAL);
                goto update;
            }
            break;
        }
        if (flags & V_RDDIR_ENTFLAGS) {
            /*
             * Add extended flag entry:
             */
            eodp->ed_ino = objnum;
            eodp->ed_reclen = reclen;
            /* NOTE: ed_off is the offset for the *next* entry */
            next = &(eodp->ed_off);
            eodp->ed_eflags = zap.za_normalization_conflict ?
                ED_CASE_CONFLICT : 0;
            (void) strncpy(eodp->ed_name, zap.za_name,
                EDIRENT_NAMELEN(reclen));
            eodp = (edirent_t *)((intptr_t)eodp + reclen);
        } else {
            /*
             * Add normal entry:
             */
            odp->d_ino = objnum;
            odp->d_reclen = reclen;
            odp->d_namlen = strlen(zap.za_name);
            (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
            odp->d_type = type;
            odp = (dirent64_t *)((intptr_t)odp + reclen);
        }
        outcount += reclen;

        ASSERT(outcount <= bufsize);

        /* Prefetch znode */
        if (prefetch)
            dmu_prefetch(os, objnum, 0, 0, 0,
                ZIO_PRIORITY_SYNC_READ);

    skip_entry:
        /*
         * Move to the next entry, fill in the previous offset.
         * Offsets <= 2 step through the synthetic entries; past
         * them the offset becomes a serialized ZAP cursor.
         */
        if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
            zap_cursor_advance(&zc);
            offset = zap_cursor_serialize(&zc);
        } else {
            offset += 1;
        }

        /* Record the seek cookie for the entry just emitted. */
        if (cooks != NULL) {
            *cooks++ = offset;
            ncooks--;
            KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
        }
    }
    zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */

    /* Subtract unused cookies */
    if (ncookies != NULL)
        *ncookies -= ncooks;

    if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
        /* Entries were written in place; just advance the uio. */
        iovp->iov_base += outcount;
        iovp->iov_len -= outcount;
        uio->uio_resid -= outcount;
    } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
        /*
         * Reset the pointer.
         */
        offset = uio->uio_loffset;
    }

update:
    zap_cursor_fini(&zc);
    if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
        kmem_free(outbuf, bufsize);

    /* ENOENT here is just end-of-directory, not an error. */
    if (error == ENOENT)
        error = 0;

    ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

    /* Persist the cursor position for the next call. */
    uio->uio_loffset = offset;
    ZFS_EXIT(zfsvfs);
    /* On failure the caller must not see a cookie array. */
    if (error != 0 && cookies != NULL) {
        free(*cookies, M_TEMP);
        *cookies = NULL;
        *ncookies = 0;
    }
    return (error);
}

/*
 * Number of synchronous writes after which a caller's subsequent writes
 * are treated as synchronous; stored in thread-specific data below.
 * NOTE(review): semantics inferred from the tsd_set() usage — confirm
 * against the consumer of zfs_fsyncer_key.
 */
ulong_t zfs_fsync_sync_cnt = 4;

/*
 * Flush all dirty data for a file to stable storage by committing the
 * intent log (ZIL) for this znode.  If sync is administratively
 * disabled on the objset, this is a no-op.
 *
 * Always returns 0.
 */
static int
zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
{
    znode_t *zp = VTOZ(vp);
    zfsvfs_t *zfsvfs = zp->z_zfsvfs;

    (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);

    if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zp);
        zil_commit(zfsvfs->z_log, zp->z_id);
        ZFS_EXIT(zfsvfs);
    }
    return (0);
}


/*
 * Get the requested file attributes and place them in the provided
 * vattr structure.
 *
 * IN:  vp      - vnode of file.
 *      vap     - va_mask identifies requested attributes.
 *              If AT_XVATTR set, then optional attrs are requested
 *      flags   - ATTR_NOACLCHECK (CIFS server context)
 *      cr      - credentials of caller.
 *      ct      - caller context
 *
 * OUT: vap     - attribute values.
 *
 * RETURN:      0 (always succeeds).
 */
/* ARGSUSED */
static int
zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
    znode_t *zp = VTOZ(vp);
    zfsvfs_t *zfsvfs = zp->z_zfsvfs;
    int error = 0;
    uint32_t blksize;
    u_longlong_t nblocks;
    uint64_t links;
    uint64_t mtime[2], ctime[2], crtime[2], rdev;
    xvattr_t *xvap = (xvattr_t *)vap;   /* vap may be an xvattr_t * */
    xoptattr_t *xoap = NULL;
    boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
    sa_bulk_attr_t bulk[4];
    int count = 0;

    ZFS_ENTER(zfsvfs);
    ZFS_VERIFY_ZP(zp);

    zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);

    /*
     * Fetch mtime/ctime/crtime (and rdev for devices) from the system
     * attribute layer in a single bulk lookup.
     */
    SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
    SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
    SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
    if (vp->v_type == VBLK || vp->v_type == VCHR)
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
            &rdev, 8);

    if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
        ZFS_EXIT(zfsvfs);
        return (error);
    }

    /*
     * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
     * Also, if we are the owner don't bother, since owner should
     * always be allowed to read basic attributes of file.
     */
    if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
        (vap->va_uid != crgetuid(cr))) {
        if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
            skipaclchk, cr)) {
            ZFS_EXIT(zfsvfs);
            return (error);
        }
    }

    /*
     * Return all attributes.  It's cheaper to provide the answer
     * than to determine whether we were asked the question.
     */

    vap->va_type = IFTOVT(zp->z_mode);
    vap->va_mode = zp->z_mode & ~S_IFMT;
#ifdef illumos
    vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
#else
    vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
#endif
    vap->va_nodeid = zp->z_id;
    /* The hidden .zfs directory adds one link to the filesystem root. */
    if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
        links = zp->z_links + 1;
    else
        links = zp->z_links;
    vap->va_nlink = MIN(links, LINK_MAX);   /* nlink_t limit! */
    vap->va_size = zp->z_size;
#ifdef illumos
    vap->va_rdev = vp->v_rdev;
#else
    if (vp->v_type == VBLK || vp->v_type == VCHR)
        vap->va_rdev = zfs_cmpldev(rdev);
#endif
    vap->va_seq = zp->z_seq;
    vap->va_flags = 0;  /* FreeBSD: Reset chflags(2) flags. */
    vap->va_filerev = zp->z_seq;

    /*
     * Add in any requested optional attributes and the create time.
     * Also set the corresponding bits in the returned attribute bitmap.
     * Optional (xvattr) attributes are only reported when the pool is
     * at a version that supports FUIDs (z_use_fuids).
     */
    if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
        if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
            xoap->xoa_archive =
                ((zp->z_pflags & ZFS_ARCHIVE) != 0);
            XVA_SET_RTN(xvap, XAT_ARCHIVE);
        }

        if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
            xoap->xoa_readonly =
                ((zp->z_pflags & ZFS_READONLY) != 0);
            XVA_SET_RTN(xvap, XAT_READONLY);
        }

        if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
            xoap->xoa_system =
                ((zp->z_pflags & ZFS_SYSTEM) != 0);
            XVA_SET_RTN(xvap, XAT_SYSTEM);
        }

        if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
            xoap->xoa_hidden =
                ((zp->z_pflags & ZFS_HIDDEN) != 0);
            XVA_SET_RTN(xvap, XAT_HIDDEN);
        }

        if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
            xoap->xoa_nounlink =
                ((zp->z_pflags & ZFS_NOUNLINK) != 0);
            XVA_SET_RTN(xvap, XAT_NOUNLINK);
        }

        if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
            xoap->xoa_immutable =
                ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
            XVA_SET_RTN(xvap, XAT_IMMUTABLE);
        }

        if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
            xoap->xoa_appendonly =
                ((zp->z_pflags & ZFS_APPENDONLY) != 0);
            XVA_SET_RTN(xvap, XAT_APPENDONLY);
        }

        if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
            xoap->xoa_nodump =
                ((zp->z_pflags & ZFS_NODUMP) != 0);
            XVA_SET_RTN(xvap, XAT_NODUMP);
        }

        if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
            xoap->xoa_opaque =
                ((zp->z_pflags & ZFS_OPAQUE) != 0);
            XVA_SET_RTN(xvap, XAT_OPAQUE);
        }

        if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
            xoap->xoa_av_quarantined =
                ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
            XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
        }

        if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
            xoap->xoa_av_modified =
                ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
            XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
        }

        if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
            vp->v_type == VREG) {
            zfs_sa_get_scanstamp(zp, xvap);
        }

        if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
            uint64_t times[2];

            (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
                times, sizeof (times));
            ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
            XVA_SET_RTN(xvap, XAT_CREATETIME);
        }

        if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
            xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
            XVA_SET_RTN(xvap, XAT_REPARSE);
        }
        if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
            xoap->xoa_generation = zp->z_gen;
            XVA_SET_RTN(xvap, XAT_GEN);
        }

        if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
            xoap->xoa_offline =
                ((zp->z_pflags & ZFS_OFFLINE) != 0);
            XVA_SET_RTN(xvap, XAT_OFFLINE);
        }

        if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
            xoap->xoa_sparse =
                ((zp->z_pflags & ZFS_SPARSE) != 0);
            XVA_SET_RTN(xvap, XAT_SPARSE);
        }
    }

    ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
    ZFS_TIME_DECODE(&vap->va_mtime, mtime);
    ZFS_TIME_DECODE(&vap->va_ctime, ctime);
    ZFS_TIME_DECODE(&vap->va_birthtime, crtime);


    sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
    vap->va_blksize = blksize;
    vap->va_bytes = nblocks << 9;   /* nblocks * 512 */

    if (zp->z_blksz == 0) {
        /*
         * Block size hasn't been set; suggest maximal I/O transfers.
         */
        vap->va_blksize = zfsvfs->z_max_blksz;
    }

    ZFS_EXIT(zfsvfs);
    return (0);
}

/*
 * Set the file attributes to the values contained in the
 * vattr structure.
 *
 * IN:  vp      - vnode of file to be modified.
 *      vap     - new attribute values.
 *              If AT_XVATTR set, then optional attrs are being set
 *      flags   - ATTR_UTIME set if non-default time values provided.
 *              - ATTR_NOACLCHECK (CIFS context only).
 *      cr      - credentials of caller.
 *      ct      - caller context
 *
 * RETURN:      0 on success, error code on failure.
 *
 * Timestamps:
 *      vp - ctime updated, mtime updated if size changed.
 */
/* ARGSUSED */
static int
zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
    znode_t *zp = VTOZ(vp);
    zfsvfs_t *zfsvfs = zp->z_zfsvfs;
    zilog_t *zilog;
    dmu_tx_t *tx;
    vattr_t oldva;
    xvattr_t tmpxvattr;
    uint_t mask = vap->va_mask;
    uint_t saved_mask = 0;
    uint64_t saved_mode;
    int trim_mask = 0;
    uint64_t new_mode;
    uint64_t new_uid, new_gid;
    uint64_t xattr_obj;
    uint64_t mtime[2], ctime[2];
    znode_t *attrzp;        /* xattr directory znode, if chown/chgrp */
    int need_policy = FALSE;
    int err, err2;
    zfs_fuid_info_t *fuidp = NULL;
    xvattr_t *xvap = (xvattr_t *)vap;   /* vap may be an xvattr_t * */
    xoptattr_t *xoap;
    zfs_acl_t *aclp;
    boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
    boolean_t fuid_dirtied = B_FALSE;
    sa_bulk_attr_t bulk[7], xattr_bulk[7];
    int count = 0, xattr_count = 0;

    if (mask == 0)
        return (0);

    if (mask & AT_NOSET)
        return (SET_ERROR(EINVAL));

    ZFS_ENTER(zfsvfs);
    ZFS_VERIFY_ZP(zp);

    zilog = zfsvfs->z_log;

    /*
     * Make sure that if we have ephemeral uid/gid or xvattr specified
     * that file system is at proper version level
     */

    if (zfsvfs->z_use_fuids == B_FALSE &&
        (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
        ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
        (mask & AT_XVATTR))) {
        ZFS_EXIT(zfsvfs);
        return (SET_ERROR(EINVAL));
    }

    if (mask & AT_SIZE && vp->v_type == VDIR) {
        ZFS_EXIT(zfsvfs);
        return (SET_ERROR(EISDIR));
    }

    if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
        ZFS_EXIT(zfsvfs);
        return (SET_ERROR(EINVAL));
    }

    /*
     * If this is an xvattr_t, then get a pointer to the structure of
     * optional attributes.  If this is NULL, then we have a vattr_t.
     */
    xoap = xva_getxoptattr(xvap);

    xva_init(&tmpxvattr);

    /*
     * Immutable files can only alter immutable bit and atime
     */
    if ((zp->z_pflags & ZFS_IMMUTABLE) &&
        ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
        ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
        ZFS_EXIT(zfsvfs);
        return (SET_ERROR(EPERM));
    }

    if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
        ZFS_EXIT(zfsvfs);
        return (SET_ERROR(EPERM));
    }

    /*
     * Verify timestamps doesn't overflow 32 bits.
     * ZFS can handle large timestamps, but 32bit syscalls can't
     * handle times greater than 2039.  This check should be removed
     * once large timestamps are fully supported.
     */
    if (mask & (AT_ATIME | AT_MTIME)) {
        if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
            ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
            ZFS_EXIT(zfsvfs);
            return (SET_ERROR(EOVERFLOW));
        }
    }

    attrzp = NULL;
    aclp = NULL;

    /* Can this be moved to before the top label? */
    if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
        ZFS_EXIT(zfsvfs);
        return (SET_ERROR(EROFS));
    }

    /*
     * First validate permissions
     */

    if (mask & AT_SIZE) {
        /*
         * XXX - Note, we are not providing any open
         * mode flags here (like FNDELAY), so we may
         * block if there are locks present... this
         * should be addressed in openat().
         */
        /* XXX - would it be OK to generate a log record here? */
        err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
        if (err) {
            ZFS_EXIT(zfsvfs);
            return (err);
        }
    }

    /*
     * Changing timestamps or any of these "DOS-ish" optional flags
     * requires write-attributes permission (or a policy override).
     */
    if (mask & (AT_ATIME|AT_MTIME) ||
        ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
        XVA_ISSET_REQ(xvap, XAT_READONLY) ||
        XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
        XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
        XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
        XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
        XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
        need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
            skipaclchk, cr);
    }

    if (mask & (AT_UID|AT_GID)) {
        int idmask = (mask & (AT_UID|AT_GID));
        int take_owner;
        int take_group;

        /*
         * NOTE: even if a new mode is being set,
         * we may clear S_ISUID/S_ISGID bits.
         */

        if (!(mask & AT_MODE))
            vap->va_mode = zp->z_mode;

        /*
         * Take ownership or chgrp to group we are a member of
         */

        take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
        take_group = (mask & AT_GID) &&
            zfs_groupmember(zfsvfs, vap->va_gid, cr);

        /*
         * If both AT_UID and AT_GID are set then take_owner and
         * take_group must both be set in order to allow taking
         * ownership.
         *
         * Otherwise, send the check through secpolicy_vnode_setattr()
         *
         */

        if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
            ((idmask == AT_UID) && take_owner) ||
            ((idmask == AT_GID) && take_group)) {
            if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
                skipaclchk, cr) == 0) {
                /*
                 * Remove setuid/setgid for non-privileged users
                 */
                secpolicy_setid_clear(vap, vp, cr);
                trim_mask = (mask & (AT_UID|AT_GID));
            } else {
                need_policy = TRUE;
            }
        } else {
            need_policy = TRUE;
        }
    }

    oldva.va_mode = zp->z_mode;
    zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
    if (mask & AT_XVATTR) {
        /*
         * Update xvattr mask to include only those attributes
         * that are actually changing.
         *
         * the bits will be restored prior to actually setting
         * the attributes so the caller thinks they were set.
         */
        if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
            if (xoap->xoa_appendonly !=
                ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
                need_policy = TRUE;
            } else {
                XVA_CLR_REQ(xvap, XAT_APPENDONLY);
                XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
            }
        }

        if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
            if (xoap->xoa_nounlink !=
                ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
                need_policy = TRUE;
            } else {
                XVA_CLR_REQ(xvap, XAT_NOUNLINK);
                XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
            }
        }

        if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
            if (xoap->xoa_immutable !=
                ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
                need_policy = TRUE;
            } else {
                XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
                XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
            }
        }

        if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
            if (xoap->xoa_nodump !=
                ((zp->z_pflags & ZFS_NODUMP) != 0)) {
                need_policy = TRUE;
            } else {
                XVA_CLR_REQ(xvap, XAT_NODUMP);
                XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
            }
        }

        if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
            if (xoap->xoa_av_modified !=
                ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
                need_policy = TRUE;
            } else {
                XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
                XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
            }
        }

        if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
            if ((vp->v_type != VREG &&
                xoap->xoa_av_quarantined) ||
                xoap->xoa_av_quarantined !=
                ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
                need_policy = TRUE;
            } else {
                XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
                XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
            }
        }

        /* The reparse flag can never be set via setattr. */
        if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
            ZFS_EXIT(zfsvfs);
            return (SET_ERROR(EPERM));
        }

        if (need_policy == FALSE &&
            (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
            XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
            need_policy = TRUE;
        }
    }

    if (mask & AT_MODE) {
        if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
            err = secpolicy_setid_setsticky_clear(vp, vap,
                &oldva, cr);
            if (err) {
                ZFS_EXIT(zfsvfs);
                return (err);
            }
            trim_mask |= AT_MODE;
        } else {
            need_policy = TRUE;
        }
    }

    if (need_policy) {
        /*
         * If trim_mask is set then take ownership
         * has been granted or write_acl is present and user
         * has the ability to modify mode.  In that case remove
         * UID|GID and or MODE from mask so that
         * secpolicy_vnode_setattr() doesn't revoke it.
         */

        if (trim_mask) {
            saved_mask = vap->va_mask;
            vap->va_mask &= ~trim_mask;
            if (trim_mask & AT_MODE) {
                /*
                 * Save the mode, as secpolicy_vnode_setattr()
                 * will overwrite it with ova.va_mode.
                 */
                saved_mode = vap->va_mode;
            }
        }
        err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
            (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
        if (err) {
            ZFS_EXIT(zfsvfs);
            return (err);
        }

        if (trim_mask) {
            vap->va_mask |= saved_mask;
            if (trim_mask & AT_MODE) {
                /*
                 * Recover the mode after
                 * secpolicy_vnode_setattr().
                 */
                vap->va_mode = saved_mode;
            }
        }
    }

    /*
     * secpolicy_vnode_setattr, or take ownership may have
     * changed va_mask
     */
    mask = vap->va_mask;

    if ((mask & (AT_UID | AT_GID))) {
        /*
         * A chown/chgrp must also be applied to the extended
         * attribute directory (attrzp), if one exists; look it
         * up and lock it now.
         */
        err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
            &xattr_obj, sizeof (xattr_obj));

        if (err == 0 && xattr_obj) {
            err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
            if (err == 0) {
                err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
                if (err != 0)
                    vrele(ZTOV(attrzp));
            }
            if (err)
                goto out2;
        }
        if (mask & AT_UID) {
            new_uid = zfs_fuid_create(zfsvfs,
                (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
            if (new_uid != zp->z_uid &&
                zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
                if (attrzp)
                    vput(ZTOV(attrzp));
                err = SET_ERROR(EDQUOT);
                goto out2;
            }
        }

        if (mask & AT_GID) {
            new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
                cr, ZFS_GROUP, &fuidp);
            if (new_gid != zp->z_gid &&
                zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
                if (attrzp)
                    vput(ZTOV(attrzp));
                err = SET_ERROR(EDQUOT);
                goto out2;
            }
        }
    }
    tx = dmu_tx_create(zfsvfs->z_os);

    if (mask & AT_MODE) {
        uint64_t pmode = zp->z_mode;
        uint64_t acl_obj;
        new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);

        if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
            !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
            err = SET_ERROR(EPERM);
            goto out;
        }

        if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
            goto out;

        if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
            /*
             * Are we upgrading ACL from old V0 format
             * to V1 format?
             */
            if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
                zfs_znode_acl_version(zp) ==
                ZFS_ACL_VERSION_INITIAL) {
                dmu_tx_hold_free(tx, acl_obj, 0,
                    DMU_OBJECT_END);
                dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
                    0, aclp->z_acl_bytes);
            } else {
                dmu_tx_hold_write(tx, acl_obj, 0,
                    aclp->z_acl_bytes);
            }
        } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
            dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
                0, aclp->z_acl_bytes);
        }
        dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
    } else {
        if ((mask & AT_XVATTR) &&
            XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
            dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
        else
            dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
    }

    if (attrzp) {
        dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
    }

    fuid_dirtied = zfsvfs->z_fuid_dirty;
    if (fuid_dirtied)
        zfs_fuid_txhold(zfsvfs, tx);

    zfs_sa_upgrade_txholds(tx, zp);

    err = dmu_tx_assign(tx, TXG_WAIT);
    if (err)
        goto out;

    count = 0;
    /*
     * Set each attribute requested.
     * We group settings according to the locks they need to acquire.
     *
     * Note: you cannot set ctime directly, although it will be
     * updated as a side-effect of calling this function.
     */

    if (mask & (AT_UID|AT_GID|AT_MODE))
        mutex_enter(&zp->z_acl_lock);

    SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
        &zp->z_pflags, sizeof (zp->z_pflags));

    if (attrzp) {
        if (mask & (AT_UID|AT_GID|AT_MODE))
            mutex_enter(&attrzp->z_acl_lock);
        SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
            SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
            sizeof (attrzp->z_pflags));
    }

    if (mask & (AT_UID|AT_GID)) {

        if (mask & AT_UID) {
            SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
                &new_uid, sizeof (new_uid));
            zp->z_uid = new_uid;
            if (attrzp) {
                SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
                    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
                    sizeof (new_uid));
                attrzp->z_uid = new_uid;
            }
        }

        if (mask & AT_GID) {
            SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
                NULL, &new_gid, sizeof (new_gid));
            zp->z_gid = new_gid;
            if (attrzp) {
                SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
                    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
                    sizeof (new_gid));
                attrzp->z_gid = new_gid;
            }
        }
        /*
         * SA_ADD_BULK_ATTR records the address of new_mode; the
         * value is read at sa_bulk_update() time, so assigning
         * new_mode after registering it here is intentional.
         */
        if (!(mask & AT_MODE)) {
            SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
                NULL, &new_mode, sizeof (new_mode));
            new_mode = zp->z_mode;
        }
        err = zfs_acl_chown_setattr(zp);
        ASSERT(err == 0);
        if (attrzp) {
            err = zfs_acl_chown_setattr(attrzp);
            ASSERT(err == 0);
        }
    }

    if (mask & AT_MODE) {
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
            &new_mode, sizeof (new_mode));
        zp->z_mode = new_mode;
        ASSERT3U((uintptr_t)aclp, !=, 0);
        err = zfs_aclset_common(zp, aclp, cr, tx);
        ASSERT0(err);
        if (zp->z_acl_cached)
            zfs_acl_free(zp->z_acl_cached);
        zp->z_acl_cached = aclp;
        aclp = NULL;    /* ownership transferred to z_acl_cached */
    }


    if (mask & AT_ATIME) {
        ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
            &zp->z_atime, sizeof (zp->z_atime));
    }

    if (mask & AT_MTIME) {
        ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
            mtime, sizeof (mtime));
    }

    /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
    if (mask & AT_SIZE && !(mask & AT_MTIME)) {
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
            NULL, mtime, sizeof (mtime));
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
            &ctime, sizeof (ctime));
        zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
            B_TRUE);
    } else if (mask != 0) {
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
            &ctime, sizeof (ctime));
        zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
            B_TRUE);
        if (attrzp) {
            SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
                SA_ZPL_CTIME(zfsvfs), NULL,
                &ctime, sizeof (ctime));
            zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
                mtime, ctime, B_TRUE);
        }
    }
    /*
     * Do this after setting timestamps to prevent timestamp
     * update from toggling bit
     */

    if (xoap && (mask & AT_XVATTR)) {

        /*
         * restore trimmed off masks
         * so that return masks can be set for caller.
         */

        if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
            XVA_SET_REQ(xvap, XAT_APPENDONLY);
        }
        if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
            XVA_SET_REQ(xvap, XAT_NOUNLINK);
        }
        if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
            XVA_SET_REQ(xvap, XAT_IMMUTABLE);
        }
        if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
            XVA_SET_REQ(xvap, XAT_NODUMP);
        }
        if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
            XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
        }
        if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
            XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
        }

        if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
            ASSERT(vp->v_type == VREG);

        zfs_xvattr_set(zp, xvap, tx);
    }

    if (fuid_dirtied)
        zfs_fuid_sync(zfsvfs, tx);

    if (mask != 0)
        zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);

    if (mask & (AT_UID|AT_GID|AT_MODE))
        mutex_exit(&zp->z_acl_lock);

    if (attrzp) {
        if (mask & (AT_UID|AT_GID|AT_MODE))
            mutex_exit(&attrzp->z_acl_lock);
    }
out:
    /* Flush staged SA changes; abort the tx if anything failed. */
    if (err == 0 && attrzp) {
        err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
            xattr_count, tx);
        ASSERT(err2 == 0);
    }

    if (attrzp)
        vput(ZTOV(attrzp));

    if (aclp)
        zfs_acl_free(aclp);

    if (fuidp) {
        zfs_fuid_info_free(fuidp);
        fuidp = NULL;
    }

    if (err) {
        dmu_tx_abort(tx);
    } else {
        err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
        dmu_tx_commit(tx);
    }

out2:
    if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
        zil_commit(zilog, 0);

    ZFS_EXIT(zfsvfs);
    return (err);
}

/*
 * We acquire all but fdvp locks using non-blocking acquisitions.  If we
 * fail to acquire any lock in the path we will drop all held locks,
 * acquire the new lock in a blocking fashion, and then release it and
 * restart the rename.
 * This acquire/release step ensures that we do not
 * spin on a lock waiting for release.  On error release all vnode locks
 * and decrement references the way tmpfs_rename() would do.
 */
static int
zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
    struct vnode *tdvp, struct vnode **tvpp,
    const struct componentname *scnp, const struct componentname *tcnp)
{
	zfsvfs_t	*zfsvfs;
	struct vnode	*nvp, *svp, *tvp;
	znode_t		*sdzp, *tdzp, *szp, *tzp;
	const char	*snm = scnp->cn_nameptr;
	const char	*tnm = tcnp->cn_nameptr;
	int error;

	/*
	 * tdvp (and *tvpp, if distinct) arrive locked from the caller;
	 * drop them so every lock below is (re)acquired in a single,
	 * restartable order starting from sdvp.
	 */
	VOP_UNLOCK(tdvp, 0);
	if (*tvpp != NULL && *tvpp != tdvp)
		VOP_UNLOCK(*tvpp, 0);

relock:
	error = vn_lock(sdvp, LK_EXCLUSIVE);
	if (error)
		goto out;
	sdzp = VTOZ(sdvp);

	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		VOP_UNLOCK(sdvp, 0);
		if (error != EBUSY)
			goto out;
		/*
		 * Wait for tdvp blocking, then drop it and restart so the
		 * whole set is re-taken in order (avoids spinning).
		 */
		error = vn_lock(tdvp, LK_EXCLUSIVE);
		if (error)
			goto out;
		VOP_UNLOCK(tdvp, 0);
		goto relock;
	}
	tdzp = VTOZ(tdvp);

	/*
	 * Before using sdzp and tdzp we must ensure that they are live.
	 * As a porting legacy from illumos we have two things to worry
	 * about.  One is typical for FreeBSD and it is that the vnode is
	 * not reclaimed (doomed).  The other is that the znode is live.
	 * The current code can invalidate the znode without acquiring the
	 * corresponding vnode lock if the object represented by the znode
	 * and vnode is no longer valid after a rollback or receive operation.
	 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
	 * that protects the znodes from the invalidation.
	 */
	zfsvfs = sdzp->z_zfsvfs;
	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
	ZFS_ENTER(zfsvfs);

	/*
	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
	 * bypassing the cleanup code in the case of an error.
	 */
	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		error = SET_ERROR(EIO);
		goto out;
	}

	/*
	 * Re-resolve svp to be certain it still exists and fetch the
	 * correct vnode.
	 */
	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
	if (error != 0) {
		/* Source entry invalid or not there. */
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
			error = SET_ERROR(EINVAL);
		goto out;
	}
	svp = ZTOV(szp);

	/*
	 * Re-resolve tvp, if it disappeared we just carry on.
	 */
	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		vrele(svp);
		if ((tcnp->cn_flags & ISDOTDOT) != 0)
			error = SET_ERROR(EINVAL);
		goto out;
	}
	if (tzp != NULL)
		tvp = ZTOV(tzp);
	else
		tvp = NULL;

	/*
	 * At present the vnode locks must be acquired before z_teardown_lock,
	 * although it would be more logical to use the opposite order.
	 */
	ZFS_EXIT(zfsvfs);

	/*
	 * Now try acquire locks on svp and tvp.
	 */
	nvp = svp;
	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		if (tvp != NULL)
			vrele(tvp);
		if (error != EBUSY) {
			vrele(nvp);
			goto out;
		}
		error = vn_lock(nvp, LK_EXCLUSIVE);
		if (error != 0) {
			vrele(nvp);
			goto out;
		}
		VOP_UNLOCK(nvp, 0);
		/*
		 * Concurrent rename race.
		 * XXX ?
		 */
		if (nvp == tdvp) {
			vrele(nvp);
			error = SET_ERROR(EINVAL);
			goto out;
		}
		vrele(*svpp);
		*svpp = nvp;
		goto relock;
	}
	/* Re-resolved source replaces the caller's reference. */
	vrele(*svpp);
	*svpp = nvp;

	if (*tvpp != NULL)
		vrele(*tvpp);
	*tvpp = NULL;
	if (tvp != NULL) {
		nvp = tvp;
		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
		if (error != 0) {
			VOP_UNLOCK(sdvp, 0);
			VOP_UNLOCK(tdvp, 0);
			VOP_UNLOCK(*svpp, 0);
			if (error != EBUSY) {
				vrele(nvp);
				goto out;
			}
			error = vn_lock(nvp, LK_EXCLUSIVE);
			if (error != 0) {
				vrele(nvp);
				goto out;
			}
			vput(nvp);
			goto relock;
		}
		*tvpp = nvp;
	}

	return (0);

out:
	return (error);
}

/*
 * Note that we must use VRELE_ASYNC in this function as it walks
 * up the directory tree and vrele may need to acquire an exclusive
 * lock if a last reference to a vnode is dropped.
 */
static int
zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
{
	zfsvfs_t	*zfsvfs;
	znode_t		*zp, *zp1;
	uint64_t	parent;
	int		error;

	zfsvfs = tdzp->z_zfsvfs;
	if (tdzp == szp)
		return (SET_ERROR(EINVAL));
	if (tdzp == sdzp)
		return (0);
	if (tdzp->z_id == zfsvfs->z_root)
		return (0);
	zp = tdzp;
	/*
	 * Walk from the target directory towards the root; finding szp on
	 * the way means the rename would move a directory under itself.
	 */
	for (;;) {
		ASSERT(!zp->z_unlinked);
		if ((error = sa_lookup(zp->z_sa_hdl,
		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
			break;

		if (parent == szp->z_id) {
			error = SET_ERROR(EINVAL);
			break;
		}
		if (parent == zfsvfs->z_root)
			break;
		if (parent == sdzp->z_id)
			break;

		error = zfs_zget(zfsvfs, parent, &zp1);
		if (error != 0)
			break;

		if (zp != tdzp)
			VN_RELE_ASYNC(ZTOV(zp),
			    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
		zp = zp1;
	}

	if (error == ENOTDIR)
		panic("checkpath: .. not a directory\n");
	if (zp != tdzp)
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
	return (error);
}

/*
 * Move an entry from the provided source directory to the target
 * directory.  Change the entry name as indicated.
 *
 * IN:	sdvp	- Source directory containing the "old entry".
 *	snm	- Old entry name.
 *	tdvp	- Target directory to contain the "new entry".
 *	tnm	- New entry name.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	sdvp,tdvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
    cred_t *cr)
{
	zfsvfs_t	*zfsvfs;
	znode_t		*sdzp, *tdzp, *szp, *tzp;
	zilog_t		*zilog = NULL;
	dmu_tx_t	*tx;
	char		*snm = scnp->cn_nameptr;
	char		*tnm = tcnp->cn_nameptr;
	int		error = 0;

	/* Reject renames across filesystems. */
	if ((*svpp)->v_mount != tdvp->v_mount ||
	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	if (zfsctl_is_node(tdvp)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	/*
	 * Lock all four vnodes to ensure safety and semantics of renaming.
	 */
	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
	if (error != 0) {
		/* no vnodes are locked in the case of error here */
		return (error);
	}

	tdzp = VTOZ(tdvp);
	sdzp = VTOZ(sdvp);
	zfsvfs = tdzp->z_zfsvfs;
	zilog = zfsvfs->z_log;

	/*
	 * After we re-enter ZFS_ENTER() we will have to revalidate all
	 * znodes involved.
	 */
	ZFS_ENTER(zfsvfs);

	if (zfsvfs->z_utf8 && u8_validate(tnm,
	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		error = SET_ERROR(EILSEQ);
		goto unlockout;
	}

	/* If source and target are the same file, there is nothing to do. */
	if ((*svpp) == (*tvpp)) {
		error = 0;
		goto unlockout;
	}

	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
	    (*tvpp)->v_mountedhere != NULL)) {
		error = SET_ERROR(EXDEV);
		goto unlockout;
	}

	/*
	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
	 * bypassing the cleanup code in the case of an error.
	 */
	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
		error = SET_ERROR(EIO);
		goto unlockout;
	}

	szp = VTOZ(*svpp);
	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
		error = SET_ERROR(EIO);
		goto unlockout;
	}

	/*
	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/outof an attribute directory.
	 * See the comment in zfs_link() for why this is considered bad.
	 */
	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
		error = SET_ERROR(EINVAL);
		goto unlockout;
	}

	/*
	 * Must have write access at the source to remove the old entry
	 * and write access at the target to create the new entry.
	 * Note that if target and source are the same, this can be
	 * done in a single check.
	 */
	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
		goto unlockout;

	if ((*svpp)->v_type == VDIR) {
		/*
		 * Avoid ".", "..", and aliases of "." for obvious reasons.
		 */
		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
		    sdzp == szp ||
		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
			error = EINVAL;
			goto unlockout;
		}

		/*
		 * Check to make sure rename is valid.
		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
		 */
		if (error = zfs_rename_check(szp, sdzp, tdzp))
			goto unlockout;
	}

	/*
	 * Does target exist?
	 */
	if (tzp) {
		/*
		 * Source and target must be the same type.
		 */
		if ((*svpp)->v_type == VDIR) {
			if ((*tvpp)->v_type != VDIR) {
				error = SET_ERROR(ENOTDIR);
				goto unlockout;
			} else {
				cache_purge(tdvp);
				if (sdvp != tdvp)
					cache_purge(sdvp);
			}
		} else {
			if ((*tvpp)->v_type == VDIR) {
				error = SET_ERROR(EISDIR);
				goto unlockout;
			}
		}
	}

	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
	if (tzp)
		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);

	/*
	 * notify the target directory if it is not the same
	 * as source directory.
	 */
	if (tdvp != sdvp) {
		vnevent_rename_dest_dir(tdvp, ct);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp) {
		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tdzp);
	}
	if (tzp) {
		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tzp);
	}

	zfs_sa_upgrade_txholds(tx, szp);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		goto unlockout;
	}


	if (tzp)	/* Attempt to remove the existing target */
		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);

	if (error == 0) {
		error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
		if (error == 0) {
			szp->z_pflags |= ZFS_AV_MODIFIED;

			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
			ASSERT0(error);

			error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
			    NULL);
			if (error == 0) {
				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
				    snm, tdzp, tnm, szp);

				/*
				 * Update path information for the target vnode
				 */
				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
			} else {
				/*
				 * At this point, we have successfully created
				 * the target name, but have failed to remove
				 * the source name.  Since the create was done
				 * with the ZRENAMING flag, there are
				 * complications; for one, the link count is
				 * wrong.  The easiest way to deal with this
				 * is to remove the newly created target, and
				 * return the original error.  This must
				 * succeed; fortunately, it is very unlikely to
				 * fail, since we just created it.
				 */
				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
				    ZRENAMING, NULL), ==, 0);
			}
		}
		if (error == 0) {
			cache_purge(*svpp);
			if (*tvpp != NULL)
				cache_purge(*tvpp);
			cache_purge_negative(tdvp);
		}
	}

	dmu_tx_commit(tx);

unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
	ZFS_EXIT(zfsvfs);
	VOP_UNLOCK(*svpp, 0);
	VOP_UNLOCK(sdvp, 0);

out:				/* original two vnodes are locked */
	/*
	 * Every direct jump to "out" (the EXDEV checks above) sets a
	 * non-zero error before zfsvfs/zilog are initialized, so the
	 * error == 0 test below also guards those dereferences.
	 */
	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	if (*tvpp != NULL)
		VOP_UNLOCK(*tvpp, 0);
	if (tdvp != *tvpp)
		VOP_UNLOCK(tdvp, 0);
	return (error);
}

/*
 * Insert the indicated symbolic reference entry into the directory.
 *
 * IN:	dvp	- Directory to contain new symbolic link.
 *	link	- Name for new symlink entry.
 *	vap	- Attributes of new entry.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 on success, error code on failure.
3978 * 3979 * Timestamps: 3980 * dvp - ctime|mtime updated 3981 */ 3982/*ARGSUSED*/ 3983static int 3984zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, 3985 cred_t *cr, kthread_t *td) 3986{ 3987 znode_t *zp, *dzp = VTOZ(dvp); 3988 dmu_tx_t *tx; 3989 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 3990 zilog_t *zilog; 3991 uint64_t len = strlen(link); 3992 int error; 3993 zfs_acl_ids_t acl_ids; 3994 boolean_t fuid_dirtied; 3995 uint64_t txtype = TX_SYMLINK; 3996 int flags = 0; 3997 3998 ASSERT(vap->va_type == VLNK); 3999 4000 ZFS_ENTER(zfsvfs); 4001 ZFS_VERIFY_ZP(dzp); 4002 zilog = zfsvfs->z_log; 4003 4004 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 4005 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 4006 ZFS_EXIT(zfsvfs); 4007 return (SET_ERROR(EILSEQ)); 4008 } 4009 4010 if (len > MAXPATHLEN) { 4011 ZFS_EXIT(zfsvfs); 4012 return (SET_ERROR(ENAMETOOLONG)); 4013 } 4014 4015 if ((error = zfs_acl_ids_create(dzp, 0, 4016 vap, cr, NULL, &acl_ids)) != 0) { 4017 ZFS_EXIT(zfsvfs); 4018 return (error); 4019 } 4020 4021 /* 4022 * Attempt to lock directory; fail if entry already exists. 
4023 */ 4024 error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); 4025 if (error) { 4026 zfs_acl_ids_free(&acl_ids); 4027 ZFS_EXIT(zfsvfs); 4028 return (error); 4029 } 4030 4031 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 4032 zfs_acl_ids_free(&acl_ids); 4033 ZFS_EXIT(zfsvfs); 4034 return (error); 4035 } 4036 4037 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { 4038 zfs_acl_ids_free(&acl_ids); 4039 ZFS_EXIT(zfsvfs); 4040 return (SET_ERROR(EDQUOT)); 4041 } 4042 4043 getnewvnode_reserve(1); 4044 tx = dmu_tx_create(zfsvfs->z_os); 4045 fuid_dirtied = zfsvfs->z_fuid_dirty; 4046 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); 4047 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 4048 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 4049 ZFS_SA_BASE_ATTR_SIZE + len); 4050 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 4051 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 4052 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 4053 acl_ids.z_aclp->z_acl_bytes); 4054 } 4055 if (fuid_dirtied) 4056 zfs_fuid_txhold(zfsvfs, tx); 4057 error = dmu_tx_assign(tx, TXG_WAIT); 4058 if (error) { 4059 zfs_acl_ids_free(&acl_ids); 4060 dmu_tx_abort(tx); 4061 getnewvnode_drop_reserve(); 4062 ZFS_EXIT(zfsvfs); 4063 return (error); 4064 } 4065 4066 /* 4067 * Create a new object for the symlink. 4068 * for version 4 ZPL datsets the symlink will be an SA attribute 4069 */ 4070 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 4071 4072 if (fuid_dirtied) 4073 zfs_fuid_sync(zfsvfs, tx); 4074 4075 if (zp->z_is_sa) 4076 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), 4077 link, len, tx); 4078 else 4079 zfs_sa_symlink(zp, link, len, tx); 4080 4081 zp->z_size = len; 4082 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 4083 &zp->z_size, sizeof (zp->z_size), tx); 4084 /* 4085 * Insert the new object into the directory. 
4086 */ 4087 (void) zfs_link_create(dzp, name, zp, tx, ZNEW); 4088 4089 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); 4090 *vpp = ZTOV(zp); 4091 4092 zfs_acl_ids_free(&acl_ids); 4093 4094 dmu_tx_commit(tx); 4095 4096 getnewvnode_drop_reserve(); 4097 4098 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4099 zil_commit(zilog, 0); 4100 4101 ZFS_EXIT(zfsvfs); 4102 return (error); 4103} 4104 4105/* 4106 * Return, in the buffer contained in the provided uio structure, 4107 * the symbolic path referred to by vp. 4108 * 4109 * IN: vp - vnode of symbolic link. 4110 * uio - structure to contain the link path. 4111 * cr - credentials of caller. 4112 * ct - caller context 4113 * 4114 * OUT: uio - structure containing the link path. 4115 * 4116 * RETURN: 0 on success, error code on failure. 4117 * 4118 * Timestamps: 4119 * vp - atime updated 4120 */ 4121/* ARGSUSED */ 4122static int 4123zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) 4124{ 4125 znode_t *zp = VTOZ(vp); 4126 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4127 int error; 4128 4129 ZFS_ENTER(zfsvfs); 4130 ZFS_VERIFY_ZP(zp); 4131 4132 if (zp->z_is_sa) 4133 error = sa_lookup_uio(zp->z_sa_hdl, 4134 SA_ZPL_SYMLINK(zfsvfs), uio); 4135 else 4136 error = zfs_sa_readlink(zp, uio); 4137 4138 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 4139 4140 ZFS_EXIT(zfsvfs); 4141 return (error); 4142} 4143 4144/* 4145 * Insert a new entry into directory tdvp referencing svp. 4146 * 4147 * IN: tdvp - Directory to contain new entry. 4148 * svp - vnode of new entry. 4149 * name - name of new entry. 4150 * cr - credentials of caller. 4151 * ct - caller context 4152 * 4153 * RETURN: 0 on success, error code on failure. 
4154 * 4155 * Timestamps: 4156 * tdvp - ctime|mtime updated 4157 * svp - ctime updated 4158 */ 4159/* ARGSUSED */ 4160static int 4161zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, 4162 caller_context_t *ct, int flags) 4163{ 4164 znode_t *dzp = VTOZ(tdvp); 4165 znode_t *tzp, *szp; 4166 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 4167 zilog_t *zilog; 4168 dmu_tx_t *tx; 4169 int error; 4170 uint64_t parent; 4171 uid_t owner; 4172 4173 ASSERT(tdvp->v_type == VDIR); 4174 4175 ZFS_ENTER(zfsvfs); 4176 ZFS_VERIFY_ZP(dzp); 4177 zilog = zfsvfs->z_log; 4178 4179 /* 4180 * POSIX dictates that we return EPERM here. 4181 * Better choices include ENOTSUP or EISDIR. 4182 */ 4183 if (svp->v_type == VDIR) { 4184 ZFS_EXIT(zfsvfs); 4185 return (SET_ERROR(EPERM)); 4186 } 4187 4188 szp = VTOZ(svp); 4189 ZFS_VERIFY_ZP(szp); 4190 4191 if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) { 4192 ZFS_EXIT(zfsvfs); 4193 return (SET_ERROR(EPERM)); 4194 } 4195 4196 /* Prevent links to .zfs/shares files */ 4197 4198 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 4199 &parent, sizeof (uint64_t))) != 0) { 4200 ZFS_EXIT(zfsvfs); 4201 return (error); 4202 } 4203 if (parent == zfsvfs->z_shares_dir) { 4204 ZFS_EXIT(zfsvfs); 4205 return (SET_ERROR(EPERM)); 4206 } 4207 4208 if (zfsvfs->z_utf8 && u8_validate(name, 4209 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 4210 ZFS_EXIT(zfsvfs); 4211 return (SET_ERROR(EILSEQ)); 4212 } 4213 4214 /* 4215 * We do not support links between attributes and non-attributes 4216 * because of the potential security risk of creating links 4217 * into "normal" file space in order to circumvent restrictions 4218 * imposed in attribute space. 
4219 */ 4220 if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { 4221 ZFS_EXIT(zfsvfs); 4222 return (SET_ERROR(EINVAL)); 4223 } 4224 4225 4226 owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); 4227 if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) { 4228 ZFS_EXIT(zfsvfs); 4229 return (SET_ERROR(EPERM)); 4230 } 4231 4232 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 4233 ZFS_EXIT(zfsvfs); 4234 return (error); 4235 } 4236 4237 /* 4238 * Attempt to lock directory; fail if entry already exists. 4239 */ 4240 error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW); 4241 if (error) { 4242 ZFS_EXIT(zfsvfs); 4243 return (error); 4244 } 4245 4246 tx = dmu_tx_create(zfsvfs->z_os); 4247 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 4248 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 4249 zfs_sa_upgrade_txholds(tx, szp); 4250 zfs_sa_upgrade_txholds(tx, dzp); 4251 error = dmu_tx_assign(tx, TXG_WAIT); 4252 if (error) { 4253 dmu_tx_abort(tx); 4254 ZFS_EXIT(zfsvfs); 4255 return (error); 4256 } 4257 4258 error = zfs_link_create(dzp, name, szp, tx, 0); 4259 4260 if (error == 0) { 4261 uint64_t txtype = TX_LINK; 4262 zfs_log_link(zilog, tx, txtype, dzp, szp, name); 4263 } 4264 4265 dmu_tx_commit(tx); 4266 4267 if (error == 0) { 4268 vnevent_link(svp, ct); 4269 } 4270 4271 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4272 zil_commit(zilog, 0); 4273 4274 ZFS_EXIT(zfsvfs); 4275 return (error); 4276} 4277 4278 4279/*ARGSUSED*/ 4280void 4281zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 4282{ 4283 znode_t *zp = VTOZ(vp); 4284 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4285 int error; 4286 4287 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 4288 if (zp->z_sa_hdl == NULL) { 4289 /* 4290 * The fs has been unmounted, or we did a 4291 * suspend/resume and this file no longer exists. 
4292 */ 4293 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4294 vrecycle(vp); 4295 return; 4296 } 4297 4298 if (zp->z_unlinked) { 4299 /* 4300 * Fast path to recycle a vnode of a removed file. 4301 */ 4302 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4303 vrecycle(vp); 4304 return; 4305 } 4306 4307 if (zp->z_atime_dirty && zp->z_unlinked == 0) { 4308 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 4309 4310 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 4311 zfs_sa_upgrade_txholds(tx, zp); 4312 error = dmu_tx_assign(tx, TXG_WAIT); 4313 if (error) { 4314 dmu_tx_abort(tx); 4315 } else { 4316 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), 4317 (void *)&zp->z_atime, sizeof (zp->z_atime), tx); 4318 zp->z_atime_dirty = 0; 4319 dmu_tx_commit(tx); 4320 } 4321 } 4322 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4323} 4324 4325 4326CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); 4327CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); 4328 4329/*ARGSUSED*/ 4330static int 4331zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) 4332{ 4333 znode_t *zp = VTOZ(vp); 4334 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4335 uint32_t gen; 4336 uint64_t gen64; 4337 uint64_t object = zp->z_id; 4338 zfid_short_t *zfid; 4339 int size, i, error; 4340 4341 ZFS_ENTER(zfsvfs); 4342 ZFS_VERIFY_ZP(zp); 4343 4344 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), 4345 &gen64, sizeof (uint64_t))) != 0) { 4346 ZFS_EXIT(zfsvfs); 4347 return (error); 4348 } 4349 4350 gen = (uint32_t)gen64; 4351 4352 size = (zfsvfs->z_parent != zfsvfs) ? 
	    LONG_FID_LEN : SHORT_FID_LEN;

#ifdef illumos
	if (fidp->fid_len < size) {
		fidp->fid_len = size;
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOSPC));
	}
#else
	fidp->fid_len = size;
#endif

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = size;

	/* Object number is stored little-endian, byte by byte. */
	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* Must have a non-zero generation number to distinguish from .zfs */
	if (gen == 0)
		gen = 1;
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));

	if (size == LONG_FID_LEN) {
		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
		zfid_long_t	*zlfid;

		zlfid = (zfid_long_t *)fidp;

		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));

		/* XXX - this should be the generation number for the objset */
		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			zlfid->zf_setgen[i] = 0;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Report configurable filesystem limits for vp (pathconf back-end).
 * Names not handled here yield EOPNOTSUPP.
 */
static int
zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp, *xzp;
	zfsvfs_t *zfsvfs;
	int error;

	switch (cmd) {
	case _PC_LINK_MAX:
		*valp = INT_MAX;
		return (0);

	case _PC_FILESIZEBITS:
		*valp = 64;
		return (0);
#ifdef illumos
	case _PC_XATTR_EXISTS:
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);
		*valp = 0;
		error = zfs_dirent_lookup(zp, "", &xzp,
		    ZXATTR | ZEXISTS | ZSHARED);
		if (error == 0) {
			if (!zfs_dirempty(xzp))
				*valp = 1;
			vrele(ZTOV(xzp));
		} else if (error == ENOENT) {
			/*
			 * If there aren't extended attributes, it's the
			 * same as having zero of them.
			 */
			error = 0;
		}
		ZFS_EXIT(zfsvfs);
		return (error);

	case _PC_SATTR_ENABLED:
	case _PC_SATTR_EXISTS:
		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
		    (vp->v_type == VREG || vp->v_type == VDIR);
		return (0);

	case _PC_ACCESS_FILTERING:
		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
		    vp->v_type == VDIR;
		return (0);

	case _PC_ACL_ENABLED:
		*valp = _ACL_ACE_ENABLED;
		return (0);
#endif	/* illumos */
	case _PC_MIN_HOLE_SIZE:
		*valp = (int)SPA_MINBLOCKSIZE;
		return (0);
#ifdef illumos
	case _PC_TIMESTAMP_RESOLUTION:
		/* nanosecond timestamp resolution */
		*valp = 1L;
		return (0);
#endif
	case _PC_ACL_EXTENDED:
		*valp = 0;
		return (0);

	case _PC_ACL_NFS4:
		*valp = 1;
		return (0);

	case _PC_ACL_PATH_MAX:
		*valp = ACL_MAX_ENTRIES;
		return (0);

	default:
		return (EOPNOTSUPP);
	}
}

/*
 * Fetch vp's ACL into *vsecp via zfs_getacl(); ATTR_NOACLCHECK skips
 * the access check.
 */
/*ARGSUSED*/
static int
zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
	ZFS_EXIT(zfsvfs);

	return (error);
}

/*ARGSUSED*/
int
zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ?
	    B_TRUE : B_FALSE;
	zilog_t	*zilog = zfsvfs->z_log;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	error = zfs_setacl(zp, vsecp, skipaclchk, cr);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Page-in worker for VOP_GETPAGES: widen the requested run to the file
 * block size where profitable, free the pages outside that run, read the
 * rest from the DMU and mark them valid.
 */
static int
zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	objset_t *os = zp->z_zfsvfs->z_os;
	vm_page_t mfirst, mlast, mreq;
	vm_object_t object;
	caddr_t va;
	struct sf_buf *sf;
	off_t startoff, endoff;
	int i, error;
	vm_pindex_t reqstart, reqend;
	int pcount, lsize, reqsize, size;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	pcount = OFF_TO_IDX(round_page(count));
	mreq = m[reqpage];
	object = mreq->object;
	error = 0;

	KASSERT(vp->v_object == object, ("mismatching object"));

	if (pcount > 1 && zp->z_blksz > PAGESIZE) {
		/*
		 * Expand the run to cover the whole file block containing
		 * the requested page, clipped to the supplied page array.
		 */
		startoff = rounddown(IDX_TO_OFF(mreq->pindex), zp->z_blksz);
		reqstart = OFF_TO_IDX(round_page(startoff));
		if (reqstart < m[0]->pindex)
			reqstart = 0;
		else
			reqstart = reqstart - m[0]->pindex;
		endoff = roundup(IDX_TO_OFF(mreq->pindex) + PAGE_SIZE,
		    zp->z_blksz);
		reqend = OFF_TO_IDX(trunc_page(endoff)) - 1;
		if (reqend > m[pcount - 1]->pindex)
			reqend = m[pcount - 1]->pindex;
		reqsize = reqend - m[reqstart]->pindex + 1;
		KASSERT(reqstart <= reqpage && reqpage < reqstart + reqsize,
		    ("reqpage beyond [reqstart, reqstart + reqsize[ bounds"));
	} else {
		reqstart = reqpage;
		reqsize = 1;
	}
	mfirst = m[reqstart];
	mlast = m[reqstart + reqsize - 1];

	zfs_vmobject_wlock(object);

	/* Free the pages before and after the run we will fill. */
	for (i = 0; i < reqstart; i++) {
		vm_page_lock(m[i]);
		vm_page_free(m[i]);
		vm_page_unlock(m[i]);
	}
	for (i = reqstart + reqsize; i < pcount; i++) {
		vm_page_lock(m[i]);
		vm_page_free(m[i]);
		vm_page_unlock(m[i]);
	}

	if (mreq->valid && reqsize == 1) {
		if (mreq->valid != VM_PAGE_BITS_ALL)
			vm_page_zero_invalid(mreq, TRUE);
		zfs_vmobject_wunlock(object);
		ZFS_EXIT(zfsvfs);
		return (zfs_vm_pagerret_ok);
	}

	PCPU_INC(cnt.v_vnodein);
	PCPU_ADD(cnt.v_vnodepgsin, reqsize);

	/* Requested page lies entirely beyond EOF: nothing to read. */
	if (IDX_TO_OFF(mreq->pindex) >= object->un_pager.vnp.vnp_size) {
		for (i = reqstart; i < reqstart + reqsize; i++) {
			if (i != reqpage) {
				vm_page_lock(m[i]);
				vm_page_free(m[i]);
				vm_page_unlock(m[i]);
			}
		}
		zfs_vmobject_wunlock(object);
		ZFS_EXIT(zfsvfs);
		return (zfs_vm_pagerret_bad);
	}

	/* Last page of the run may be partial if it straddles EOF. */
	lsize = PAGE_SIZE;
	if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size)
		lsize = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mlast->pindex);

	zfs_vmobject_wunlock(object);

	for (i = reqstart; i < reqstart + reqsize; i++) {
		size = PAGE_SIZE;
		if (i == (reqstart + reqsize - 1))
			size = lsize;
		va = zfs_map_page(m[i], &sf);
		error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex),
		    size, va, DMU_READ_PREFETCH);
		if (size != PAGE_SIZE)
			bzero(va + size, PAGE_SIZE - size);
		zfs_unmap_page(sf);
		if (error != 0)
			break;
	}

	zfs_vmobject_wlock(object);

	for (i = 0; i < reqstart; i++)
		;	/* (no-op placeholder removed: see loop below) */
	for (i = reqstart; i < reqstart + reqsize; i++) {
		if (!error)
			m[i]->valid = VM_PAGE_BITS_ALL;
		KASSERT(m[i]->dirty == 0, ("zfs_getpages: page %p is dirty", m[i]));
		if (i != reqpage)
			vm_page_readahead_finish(m[i]);
	}

	zfs_vmobject_wunlock(object);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error ? zfs_vm_pagerret_error : zfs_vm_pagerret_ok);
}

/* VOP_GETPAGES glue: unpack the vop args and call zfs_getpages(). */
static int
zfs_freebsd_getpages(ap)
	struct vop_getpages_args /* {
		struct vnode *a_vp;
		vm_page_t *a_m;
		int a_count;
		int a_reqpage;
		vm_ooffset_t a_offset;
	} */ *ap;
{

	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage));
}

/*
 * Page-out worker for VOP_PUTPAGES: write the dirty pages through the
 * DMU under a block-aligned range lock, then undirty them and log the
 * write to the ZIL.
 */
static int
zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
    int *rtvals)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	rl_t		*rl;
	dmu_tx_t	*tx;
	struct sf_buf	*sf;
	vm_object_t	object;
	vm_page_t	m;
	caddr_t		va;
	size_t		tocopy;
	size_t		lo_len;
	vm_ooffset_t	lo_off;
	vm_ooffset_t	off;
	uint_t		blksz;
	int		ncount;
	int		pcount;
	int		err;
	int		i;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	object = vp->v_object;
	pcount = btoc(len);
	ncount = pcount;

	KASSERT(ma[0]->object == object, ("mismatching object"));
	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));

	for (i = 0; i < pcount; i++)
		rtvals[i] = zfs_vm_pagerret_error;

	/* Range-lock the file region, widened to block boundaries. */
	off = IDX_TO_OFF(ma[0]->pindex);
	blksz = zp->z_blksz;
	lo_off = rounddown(off, blksz);
	lo_len = roundup(len + (off - lo_off), blksz);
	rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);

	zfs_vmobject_wlock(object);
	if (len + off > object->un_pager.vnp.vnp_size) {
		if (object->un_pager.vnp.vnp_size > off) {
			int pgoff;

			/* Clip the write to EOF. */
			len = object->un_pager.vnp.vnp_size - off;
			ncount = btoc(len);
			if ((pgoff = (int)len & PAGE_MASK) != 0) {
				/*
				 * If the object is locked and the following
				 * conditions hold, then the page's dirty
				 * field cannot be concurrently changed by a
				 * pmap operation.
				 */
				m = ma[ncount - 1];
				vm_page_assert_sbusied(m);
				KASSERT(!pmap_page_is_write_mapped(m),
				    ("zfs_putpages: page %p is not read-only", m));
				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
				    pgoff);
			}
		} else {
			len = 0;
			ncount = 0;
		}
		if (ncount < pcount) {
			for (i = ncount; i < pcount; i++) {
				rtvals[i] = zfs_vm_pagerret_bad;
			}
		}
	}
	zfs_vmobject_wunlock(object);

	if (ncount == 0)
		goto out;

	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
		goto out;
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_write(tx, zp->z_id, off, len);

	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		goto out;
	}

	if (zp->z_blksz < PAGE_SIZE) {
		/* Sub-page blocks: copy page by page through a mapping. */
		for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
			tocopy = len > PAGE_SIZE ?
PAGE_SIZE : len; 4747 va = zfs_map_page(ma[i], &sf); 4748 dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx); 4749 zfs_unmap_page(sf); 4750 } 4751 } else { 4752 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx); 4753 } 4754 4755 if (err == 0) { 4756 uint64_t mtime[2], ctime[2]; 4757 sa_bulk_attr_t bulk[3]; 4758 int count = 0; 4759 4760 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 4761 &mtime, 16); 4762 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 4763 &ctime, 16); 4764 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 4765 &zp->z_pflags, 8); 4766 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 4767 B_TRUE); 4768 (void)sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 4769 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); 4770 4771 zfs_vmobject_wlock(object); 4772 for (i = 0; i < ncount; i++) { 4773 rtvals[i] = zfs_vm_pagerret_ok; 4774 vm_page_undirty(ma[i]); 4775 } 4776 zfs_vmobject_wunlock(object); 4777 PCPU_INC(cnt.v_vnodeout); 4778 PCPU_ADD(cnt.v_vnodepgsout, ncount); 4779 } 4780 dmu_tx_commit(tx); 4781 4782out: 4783 zfs_range_unlock(rl); 4784 if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 || 4785 zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4786 zil_commit(zfsvfs->z_log, zp->z_id); 4787 ZFS_EXIT(zfsvfs); 4788 return (rtvals[0]); 4789} 4790 4791int 4792zfs_freebsd_putpages(ap) 4793 struct vop_putpages_args /* { 4794 struct vnode *a_vp; 4795 vm_page_t *a_m; 4796 int a_count; 4797 int a_sync; 4798 int *a_rtvals; 4799 vm_ooffset_t a_offset; 4800 } */ *ap; 4801{ 4802 4803 return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, 4804 ap->a_rtvals)); 4805} 4806 4807static int 4808zfs_freebsd_bmap(ap) 4809 struct vop_bmap_args /* { 4810 struct vnode *a_vp; 4811 daddr_t a_bn; 4812 struct bufobj **a_bop; 4813 daddr_t *a_bnp; 4814 int *a_runp; 4815 int *a_runb; 4816 } */ *ap; 4817{ 4818 4819 if (ap->a_bop != NULL) 4820 *ap->a_bop = &ap->a_vp->v_bufobj; 4821 if (ap->a_bnp != 
NULL)
		*ap->a_bnp = ap->a_bn;
	if (ap->a_runp != NULL)
		*ap->a_runp = 0;
	if (ap->a_runb != NULL)
		*ap->a_runb = 0;

	return (0);
}

/*
 * VOP_OPEN: delegate to zfs_open() and, on success, make sure the vnode
 * has a VM object so it can be mapped.
 */
static int
zfs_freebsd_open(ap)
	struct vop_open_args /* {
		struct vnode *a_vp;
		int a_mode;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;
	znode_t *zp = VTOZ(vp);
	int error;

	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
	if (error == 0)
		vnode_create_vobject(vp, zp->z_size, ap->a_td);
	return (error);
}

/*
 * VOP_CLOSE: forward to zfs_close() with a single-reference count.
 */
static int
zfs_freebsd_close(ap)
	struct vop_close_args /* {
		struct vnode *a_vp;
		int a_fflag;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{

	return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
}

/*
 * VOP_IOCTL: forward to zfs_ioctl().
 */
static int
zfs_freebsd_ioctl(ap)
	struct vop_ioctl_args /* {
		struct vnode *a_vp;
		u_long a_command;
		caddr_t a_data;
		int a_fflag;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{

	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
	    ap->a_fflag, ap->a_cred, NULL, NULL));
}

/*
 * Translate FreeBSD IO_* ioflag bits into the Solaris-style F* flags
 * expected by zfs_read()/zfs_write().
 */
static int
ioflags(int ioflags)
{
	int flags = 0;

	if (ioflags & IO_APPEND)
		flags |= FAPPEND;
	if (ioflags & IO_NDELAY)
		flags |= FNONBLOCK;
	if (ioflags & IO_SYNC)
		flags |= (FSYNC | FDSYNC | FRSYNC);

	return (flags);
}

/*
 * VOP_READ: forward to zfs_read() with translated flags.
 */
static int
zfs_freebsd_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{

	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
	    ap->a_cred, NULL));
}

/*
 * VOP_WRITE: forward to zfs_write() with translated flags.
 */
static int
zfs_freebsd_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{

	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
	    ap->a_cred, NULL));
}

/*
 * VOP_ACCESS: split the request between zfs_access() (the bits ZFS
 * understands) and vaccess() (the rest, e.g. VADMIN).
 */
static int
zfs_freebsd_access(ap)
	struct vop_access_args /* {
		struct vnode *a_vp;
		accmode_t a_accmode;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;
	znode_t *zp = VTOZ(vp);
	accmode_t accmode;
	int error = 0;

	/*
	 * ZFS itself only knows about VREAD, VWRITE, VEXEC and VAPPEND.
	 */
	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
	if (accmode != 0)
		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);

	/*
	 * VADMIN has to be handled by vaccess().
	 */
	if (error == 0) {
		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
		if (accmode != 0) {
			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
			    zp->z_gid, accmode, ap->a_cred, NULL);
		}
	}

	/*
	 * For VEXEC, ensure that at least one execute bit is set for
	 * non-directories.
 */
	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
		error = EACCES;
	}

	return (error);
}

/*
 * VOP_CACHEDLOOKUP: copy the (not NUL-terminated) component name into a
 * local buffer and forward to zfs_lookup().
 */
static int
zfs_freebsd_lookup(ap)
	struct vop_lookup_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;
	char nm[NAME_MAX + 1];

	ASSERT(cnp->cn_namelen < sizeof(nm));
	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));

	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
	    cnp->cn_cred, cnp->cn_thread, 0));
}

/*
 * VOP_LOOKUP: go through the VFS name cache when the dataset allows it,
 * otherwise fall back to the uncached ZFS lookup.
 */
static int
zfs_cache_lookup(ap)
	struct vop_lookup_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
	} */ *ap;
{
	zfsvfs_t *zfsvfs;

	zfsvfs = ap->a_dvp->v_mount->mnt_data;
	if (zfsvfs->z_use_namecache)
		return (vfs_cache_lookup(ap));
	else
		return (zfs_freebsd_lookup(ap));
}

/*
 * VOP_CREATE / VOP_MKNOD: create the file via zfs_create() and enter the
 * new name into the cache when the dataset uses the namecache.
 */
static int
zfs_freebsd_create(ap)
	struct vop_create_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
		struct vattr *a_vap;
	} */ *ap;
{
	zfsvfs_t *zfsvfs;
	struct componentname *cnp = ap->a_cnp;
	vattr_t *vap = ap->a_vap;
	int error, mode;

	ASSERT(cnp->cn_flags & SAVENAME);

	vattr_init_mask(vap);
	mode = vap->va_mode & ALLPERMS;
	zfsvfs = ap->a_dvp->v_mount->mnt_data;

	error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
	if (zfsvfs->z_use_namecache &&
	    error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
	return (error);
}

/*
 * VOP_REMOVE: forward to zfs_remove().
 */
static int
zfs_freebsd_remove(ap)
	struct vop_remove_args /* {
		struct vnode *a_dvp;
		struct vnode *a_vp;
		struct componentname *a_cnp;
	} */ *ap;
{

	ASSERT(ap->a_cnp->cn_flags & SAVENAME);

	return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
	    ap->a_cnp->cn_cred));
}

/*
 * VOP_MKDIR: forward to zfs_mkdir().
 */
static int
zfs_freebsd_mkdir(ap)
	struct vop_mkdir_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
		struct vattr *a_vap;
	} */ *ap;
{
	vattr_t *vap = ap->a_vap;

	ASSERT(ap->a_cnp->cn_flags & SAVENAME);

	vattr_init_mask(vap);

	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
	    ap->a_cnp->cn_cred));
}

/*
 * VOP_RMDIR: forward to zfs_rmdir().
 */
static int
zfs_freebsd_rmdir(ap)
	struct vop_rmdir_args /* {
		struct vnode *a_dvp;
		struct vnode *a_vp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;

	ASSERT(cnp->cn_flags & SAVENAME);

	return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
}

/*
 * VOP_READDIR: forward to zfs_readdir().
 */
static int
zfs_freebsd_readdir(ap)
	struct vop_readdir_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		struct ucred *a_cred;
		int *a_eofflag;
		int *a_ncookies;
		u_long **a_cookies;
	} */ *ap;
{

	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
	    ap->a_ncookies, ap->a_cookies));
}

/*
 * VOP_FSYNC: flush dirty buffers first, then let ZFS commit the file.
 */
static int
zfs_freebsd_fsync(ap)
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		int a_waitfor;
		struct thread *a_td;
	} */ *ap;
{

	vop_stdfsync(ap);
	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
}

/*
 * VOP_GETATTR: fetch regular attributes plus the extended (xvattr)
 * attributes needed to synthesize the FreeBSD chflags(2) bits.
 */
static int
zfs_freebsd_getattr(ap)
	struct vop_getattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
	} */ *ap;
{
	vattr_t *vap = ap->a_vap;
	xvattr_t xvap;
	u_long fflags = 0;
	int error;

	xva_init(&xvap);
	xvap.xva_vattr = *vap;
	xvap.xva_vattr.va_mask |=
AT_XVATTR; 5123 5124 /* Convert chflags into ZFS-type flags. */ 5125 /* XXX: what about SF_SETTABLE?. */ 5126 XVA_SET_REQ(&xvap, XAT_IMMUTABLE); 5127 XVA_SET_REQ(&xvap, XAT_APPENDONLY); 5128 XVA_SET_REQ(&xvap, XAT_NOUNLINK); 5129 XVA_SET_REQ(&xvap, XAT_NODUMP); 5130 XVA_SET_REQ(&xvap, XAT_READONLY); 5131 XVA_SET_REQ(&xvap, XAT_ARCHIVE); 5132 XVA_SET_REQ(&xvap, XAT_SYSTEM); 5133 XVA_SET_REQ(&xvap, XAT_HIDDEN); 5134 XVA_SET_REQ(&xvap, XAT_REPARSE); 5135 XVA_SET_REQ(&xvap, XAT_OFFLINE); 5136 XVA_SET_REQ(&xvap, XAT_SPARSE); 5137 5138 error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL); 5139 if (error != 0) 5140 return (error); 5141 5142 /* Convert ZFS xattr into chflags. */ 5143#define FLAG_CHECK(fflag, xflag, xfield) do { \ 5144 if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ 5145 fflags |= (fflag); \ 5146} while (0) 5147 FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, 5148 xvap.xva_xoptattrs.xoa_immutable); 5149 FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, 5150 xvap.xva_xoptattrs.xoa_appendonly); 5151 FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, 5152 xvap.xva_xoptattrs.xoa_nounlink); 5153 FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE, 5154 xvap.xva_xoptattrs.xoa_archive); 5155 FLAG_CHECK(UF_NODUMP, XAT_NODUMP, 5156 xvap.xva_xoptattrs.xoa_nodump); 5157 FLAG_CHECK(UF_READONLY, XAT_READONLY, 5158 xvap.xva_xoptattrs.xoa_readonly); 5159 FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM, 5160 xvap.xva_xoptattrs.xoa_system); 5161 FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN, 5162 xvap.xva_xoptattrs.xoa_hidden); 5163 FLAG_CHECK(UF_REPARSE, XAT_REPARSE, 5164 xvap.xva_xoptattrs.xoa_reparse); 5165 FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE, 5166 xvap.xva_xoptattrs.xoa_offline); 5167 FLAG_CHECK(UF_SPARSE, XAT_SPARSE, 5168 xvap.xva_xoptattrs.xoa_sparse); 5169 5170#undef FLAG_CHECK 5171 *vap = xvap.xva_vattr; 5172 vap->va_flags = fflags; 5173 return (0); 5174} 5175 5176static int 5177zfs_freebsd_setattr(ap) 5178 struct vop_setattr_args /* { 5179 struct vnode *a_vp; 5180 struct vattr *a_vap; 5181 struct ucred *a_cred; 5182 } */ 
*ap; 5183{ 5184 vnode_t *vp = ap->a_vp; 5185 vattr_t *vap = ap->a_vap; 5186 cred_t *cred = ap->a_cred; 5187 xvattr_t xvap; 5188 u_long fflags; 5189 uint64_t zflags; 5190 5191 vattr_init_mask(vap); 5192 vap->va_mask &= ~AT_NOSET; 5193 5194 xva_init(&xvap); 5195 xvap.xva_vattr = *vap; 5196 5197 zflags = VTOZ(vp)->z_pflags; 5198 5199 if (vap->va_flags != VNOVAL) { 5200 zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; 5201 int error; 5202 5203 if (zfsvfs->z_use_fuids == B_FALSE) 5204 return (EOPNOTSUPP); 5205 5206 fflags = vap->va_flags; 5207 /* 5208 * XXX KDM 5209 * We need to figure out whether it makes sense to allow 5210 * UF_REPARSE through, since we don't really have other 5211 * facilities to handle reparse points and zfs_setattr() 5212 * doesn't currently allow setting that attribute anyway. 5213 */ 5214 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| 5215 UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| 5216 UF_OFFLINE|UF_SPARSE)) != 0) 5217 return (EOPNOTSUPP); 5218 /* 5219 * Unprivileged processes are not permitted to unset system 5220 * flags, or modify flags if any system flags are set. 5221 * Privileged non-jail processes may not modify system flags 5222 * if securelevel > 0 and any existing system flags are set. 5223 * Privileged jail processes behave like privileged non-jail 5224 * processes if the security.jail.chflags_allowed sysctl is 5225 * is non-zero; otherwise, they behave like unprivileged 5226 * processes. 5227 */ 5228 if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || 5229 priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) { 5230 if (zflags & 5231 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { 5232 error = securelevel_gt(cred, 0); 5233 if (error != 0) 5234 return (error); 5235 } 5236 } else { 5237 /* 5238 * Callers may only modify the file flags on objects they 5239 * have VADMIN rights for. 
5240 */ 5241 if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0) 5242 return (error); 5243 if (zflags & 5244 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { 5245 return (EPERM); 5246 } 5247 if (fflags & 5248 (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { 5249 return (EPERM); 5250 } 5251 } 5252 5253#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ 5254 if (((fflags & (fflag)) && !(zflags & (zflag))) || \ 5255 ((zflags & (zflag)) && !(fflags & (fflag)))) { \ 5256 XVA_SET_REQ(&xvap, (xflag)); \ 5257 (xfield) = ((fflags & (fflag)) != 0); \ 5258 } \ 5259} while (0) 5260 /* Convert chflags into ZFS-type flags. */ 5261 /* XXX: what about SF_SETTABLE?. */ 5262 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, 5263 xvap.xva_xoptattrs.xoa_immutable); 5264 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, 5265 xvap.xva_xoptattrs.xoa_appendonly); 5266 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, 5267 xvap.xva_xoptattrs.xoa_nounlink); 5268 FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE, 5269 xvap.xva_xoptattrs.xoa_archive); 5270 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, 5271 xvap.xva_xoptattrs.xoa_nodump); 5272 FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY, 5273 xvap.xva_xoptattrs.xoa_readonly); 5274 FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM, 5275 xvap.xva_xoptattrs.xoa_system); 5276 FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN, 5277 xvap.xva_xoptattrs.xoa_hidden); 5278 FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE, 5279 xvap.xva_xoptattrs.xoa_hidden); 5280 FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE, 5281 xvap.xva_xoptattrs.xoa_offline); 5282 FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE, 5283 xvap.xva_xoptattrs.xoa_sparse); 5284#undef FLAG_CHANGE 5285 } 5286 return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL)); 5287} 5288 5289static int 5290zfs_freebsd_rename(ap) 5291 struct vop_rename_args /* { 5292 struct vnode *a_fdvp; 5293 struct vnode *a_fvp; 5294 struct componentname *a_fcnp; 5295 struct vnode *a_tdvp; 5296 
		struct vnode *a_tvp;
		struct componentname *a_tcnp;
	} */ *ap;
{
	vnode_t *fdvp = ap->a_fdvp;
	vnode_t *fvp = ap->a_fvp;
	vnode_t *tdvp = ap->a_tdvp;
	vnode_t *tvp = ap->a_tvp;
	int error;

	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));

	error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
	    ap->a_tcnp, ap->a_fcnp->cn_cred);

	/* VOP_RENAME consumes all four references regardless of outcome. */
	vrele(fdvp);
	vrele(fvp);
	vrele(tdvp);
	if (tvp != NULL)
		vrele(tvp);

	return (error);
}

/*
 * VOP_SYMLINK: forward to zfs_symlink().
 */
static int
zfs_freebsd_symlink(ap)
	struct vop_symlink_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
		struct vattr *a_vap;
		char *a_target;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;
	vattr_t *vap = ap->a_vap;

	ASSERT(cnp->cn_flags & SAVENAME);

	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
	vattr_init_mask(vap);

	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
}

/*
 * VOP_READLINK: forward to zfs_readlink().
 */
static int
zfs_freebsd_readlink(ap)
	struct vop_readlink_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		struct ucred *a_cred;
	} */ *ap;
{

	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
}

/*
 * VOP_LINK: refuse cross-mount links, then forward to zfs_link().
 */
static int
zfs_freebsd_link(ap)
	struct vop_link_args /* {
		struct vnode *a_tdvp;
		struct vnode *a_vp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;
	vnode_t *vp = ap->a_vp;
	vnode_t *tdvp = ap->a_tdvp;

	if (tdvp->v_mount != vp->v_mount)
		return (EXDEV);

	ASSERT(cnp->cn_flags & SAVENAME);

	return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
}

/*
 * VOP_INACTIVE: forward to zfs_inactive().
 */
static int
zfs_freebsd_inactive(ap)
	struct vop_inactive_args /* {
		struct vnode *a_vp;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;

	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
	return (0);
}

/*
 * VOP_RECLAIM: tear down the znode behind the vnode.
 */
static int
zfs_freebsd_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ASSERT(zp != NULL);

	/* Destroy the vm object and flush associated pages. */
	vnode_destroy_vobject(vp);

	/*
	 * z_teardown_inactive_lock protects from a race with
	 * zfs_znode_dmu_fini in zfsvfs_teardown during
	 * force unmount.
	 */
	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_sa_hdl == NULL)
		zfs_znode_free(zp);
	else
		zfs_zinactive(zp);
	rw_exit(&zfsvfs->z_teardown_inactive_lock);

	vp->v_data = NULL;
	return (0);
}

/*
 * VOP_FID: forward to zfs_fid().
 */
static int
zfs_freebsd_fid(ap)
	struct vop_fid_args /* {
		struct vnode *a_vp;
		struct fid *a_fid;
	} */ *ap;
{

	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
}

/*
 * VOP_PATHCONF: ask ZFS first; fall back to the standard implementation
 * for names ZFS does not know.
 */
static int
zfs_freebsd_pathconf(ap)
	struct vop_pathconf_args /* {
		struct vnode *a_vp;
		int a_name;
		register_t *a_retval;
	} */ *ap;
{
	ulong_t val;
	int error;

	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
	if (error == 0)
		*ap->a_retval = val;
	else if (error == EOPNOTSUPP)
		error = vop_stdpathconf(ap);
	return (error);
}

/*
 * VOP_PATHCONF for fifos: answer ACL/MAC queries from ZFS, everything
 * else from the fifo ops.
 */
static int
zfs_freebsd_fifo_pathconf(ap)
	struct vop_pathconf_args /* {
		struct vnode *a_vp;
		int a_name;
		register_t *a_retval;
	} */ *ap;
{

	switch (ap->a_name) {
	case _PC_ACL_EXTENDED:
	case _PC_ACL_NFS4:
	case _PC_ACL_PATH_MAX:
	case _PC_MAC_PRESENT:
		return (zfs_freebsd_pathconf(ap));
	default:
		return
 (fifo_specops.vop_pathconf(ap));
	}
}

/*
 * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
 * extended attribute name:
 *
 * NAMESPACE	PREFIX
 * system	freebsd:system:
 * user		(none, can be used to access ZFS fsattr(5) attributes
 *		created on Solaris)
 */
static int
zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
    size_t size)
{
	const char *namespace, *prefix, *suffix;

	/* We don't allow '/' character in attribute name. */
	if (strchr(name, '/') != NULL)
		return (EINVAL);
	/* We don't allow attribute names that start with "freebsd:" string. */
	if (strncmp(name, "freebsd:", 8) == 0)
		return (EINVAL);

	bzero(attrname, size);

	switch (attrnamespace) {
	case EXTATTR_NAMESPACE_USER:
#if 0
		prefix = "freebsd:";
		namespace = EXTATTR_NAMESPACE_USER_STRING;
		suffix = ":";
#else
		/*
		 * This is the default namespace by which we can access all
		 * attributes created on Solaris.
		 */
		prefix = namespace = suffix = "";
#endif
		break;
	case EXTATTR_NAMESPACE_SYSTEM:
		prefix = "freebsd:";
		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
		suffix = ":";
		break;
	case EXTATTR_NAMESPACE_EMPTY:
	default:
		return (EINVAL);
	}
	/* Reject names that would not fit, including the NUL terminator. */
	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
	    name) >= size) {
		return (ENAMETOOLONG);
	}
	return (0);
}

/*
 * Vnode operation to retrieve a named extended attribute.
 */
static int
zfs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
	struct thread *td = ap->a_td;
	struct nameidata nd;
	char attrname[255];
	struct vattr va;
	vnode_t *xvp = NULL, *vp;
	int error, flags;

	/* The caller needs read access to the attribute namespace. */
	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error != 0)
		return (error);

	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
	    sizeof(attrname));
	if (error != 0)
		return (error);

	ZFS_ENTER(zfsvfs);

	/* Attributes live in the file's hidden xattr directory. */
	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
	    LOOKUP_XATTR);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	flags = FREAD;
	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
	    xvp, td);
	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
	vp = nd.ni_vp;
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		/* A missing backing file means the attribute does not exist. */
		if (error == ENOENT)
			error = ENOATTR;
		return (error);
	}

	/* With a_size set, only report the attribute size; else read it. */
	if (ap->a_size != NULL) {
		error = VOP_GETATTR(vp, &va, ap->a_cred);
		if (error == 0)
			*ap->a_size = (size_t)va.va_size;
	} else if (ap->a_uio != NULL)
		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);

	VOP_UNLOCK(vp, 0);
	vn_close(vp, flags, ap->a_cred, td);
	ZFS_EXIT(zfsvfs);

	return (error);
}

/*
 * Vnode operation to remove a named attribute.
5597 */ 5598int 5599zfs_deleteextattr(struct vop_deleteextattr_args *ap) 5600/* 5601vop_deleteextattr { 5602 IN struct vnode *a_vp; 5603 IN int a_attrnamespace; 5604 IN const char *a_name; 5605 IN struct ucred *a_cred; 5606 IN struct thread *a_td; 5607}; 5608*/ 5609{ 5610 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 5611 struct thread *td = ap->a_td; 5612 struct nameidata nd; 5613 char attrname[255]; 5614 struct vattr va; 5615 vnode_t *xvp = NULL, *vp; 5616 int error, flags; 5617 5618 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 5619 ap->a_cred, ap->a_td, VWRITE); 5620 if (error != 0) 5621 return (error); 5622 5623 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 5624 sizeof(attrname)); 5625 if (error != 0) 5626 return (error); 5627 5628 ZFS_ENTER(zfsvfs); 5629 5630 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 5631 LOOKUP_XATTR); 5632 if (error != 0) { 5633 ZFS_EXIT(zfsvfs); 5634 return (error); 5635 } 5636 5637 NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, 5638 UIO_SYSSPACE, attrname, xvp, td); 5639 error = namei(&nd); 5640 vp = nd.ni_vp; 5641 if (error != 0) { 5642 ZFS_EXIT(zfsvfs); 5643 NDFREE(&nd, NDF_ONLY_PNBUF); 5644 if (error == ENOENT) 5645 error = ENOATTR; 5646 return (error); 5647 } 5648 5649 error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); 5650 NDFREE(&nd, NDF_ONLY_PNBUF); 5651 5652 vput(nd.ni_dvp); 5653 if (vp == nd.ni_dvp) 5654 vrele(vp); 5655 else 5656 vput(vp); 5657 ZFS_EXIT(zfsvfs); 5658 5659 return (error); 5660} 5661 5662/* 5663 * Vnode operation to set a named attribute. 
5664 */ 5665static int 5666zfs_setextattr(struct vop_setextattr_args *ap) 5667/* 5668vop_setextattr { 5669 IN struct vnode *a_vp; 5670 IN int a_attrnamespace; 5671 IN const char *a_name; 5672 INOUT struct uio *a_uio; 5673 IN struct ucred *a_cred; 5674 IN struct thread *a_td; 5675}; 5676*/ 5677{ 5678 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 5679 struct thread *td = ap->a_td; 5680 struct nameidata nd; 5681 char attrname[255]; 5682 struct vattr va; 5683 vnode_t *xvp = NULL, *vp; 5684 int error, flags; 5685 5686 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 5687 ap->a_cred, ap->a_td, VWRITE); 5688 if (error != 0) 5689 return (error); 5690 5691 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 5692 sizeof(attrname)); 5693 if (error != 0) 5694 return (error); 5695 5696 ZFS_ENTER(zfsvfs); 5697 5698 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 5699 LOOKUP_XATTR | CREATE_XATTR_DIR); 5700 if (error != 0) { 5701 ZFS_EXIT(zfsvfs); 5702 return (error); 5703 } 5704 5705 flags = FFLAGS(O_WRONLY | O_CREAT); 5706 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, 5707 xvp, td); 5708 error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL); 5709 vp = nd.ni_vp; 5710 NDFREE(&nd, NDF_ONLY_PNBUF); 5711 if (error != 0) { 5712 ZFS_EXIT(zfsvfs); 5713 return (error); 5714 } 5715 5716 VATTR_NULL(&va); 5717 va.va_size = 0; 5718 error = VOP_SETATTR(vp, &va, ap->a_cred); 5719 if (error == 0) 5720 VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); 5721 5722 VOP_UNLOCK(vp, 0); 5723 vn_close(vp, flags, ap->a_cred, td); 5724 ZFS_EXIT(zfsvfs); 5725 5726 return (error); 5727} 5728 5729/* 5730 * Vnode operation to retrieve extended attributes on a vnode. 
 */
static int
zfs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
	struct thread *td = ap->a_td;
	struct nameidata nd;
	char attrprefix[16];
	u_char dirbuf[sizeof(struct dirent)];
	struct dirent *dp;
	struct iovec aiov;
	struct uio auio, *uio = ap->a_uio;
	size_t *sizep = ap->a_size;
	size_t plen;
	vnode_t *xvp = NULL, *vp;
	int done, error, eof, pos;

	/* The caller needs read access to the attribute namespace. */
	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error != 0)
		return (error);

	/* Names in this namespace share a common prefix; compute it once. */
	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
	    sizeof(attrprefix));
	if (error != 0)
		return (error);
	plen = strlen(attrprefix);

	ZFS_ENTER(zfsvfs);

	if (sizep != NULL)
		*sizep = 0;

	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
	    LOOKUP_XATTR);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		/*
		 * ENOATTR means that the EA directory does not yet exist,
		 * i.e. there are no extended attributes there.
		 */
		if (error == ENOATTR)
			error = 0;
		return (error);
	}

	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
	    UIO_SYSSPACE, ".", xvp, td);
	error = namei(&nd);
	vp = nd.ni_vp;
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Scan the xattr directory one dirent buffer at a time. */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_rw = UIO_READ;
	auio.uio_offset = 0;

	do {
		u_char nlen;

		aiov.iov_base = (void *)dirbuf;
		aiov.iov_len = sizeof(dirbuf);
		auio.uio_resid = sizeof(dirbuf);
		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
		done = sizeof(dirbuf) - auio.uio_resid;
		if (error != 0)
			break;
		for (pos = 0; pos < done;) {
			dp = (struct dirent *)(dirbuf + pos);
			pos += dp->d_reclen;
			/*
			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
			 * is what we get when attribute was created on Solaris.
			 */
			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
				continue;
			/* Entries from other namespaces are skipped. */
			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
				continue;
			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
				continue;
			nlen = dp->d_namlen - plen;
			if (sizep != NULL)
				*sizep += 1 + nlen;
			else if (uio != NULL) {
				/*
				 * Format of extattr name entry is one byte for
				 * length and the rest for name.
				 */
				error = uiomove(&nlen, 1, uio->uio_rw, uio);
				if (error == 0) {
					error = uiomove(dp->d_name + plen, nlen,
					    uio->uio_rw, uio);
				}
				if (error != 0)
					break;
			}
		}
	} while (!eof && error == 0);

	vput(vp);
	ZFS_EXIT(zfsvfs);

	return (error);
}

/*
 * VOP_GETACL: fetch the NFSv4 ACL from ZFS and convert it to the
 * FreeBSD acl representation.
 */
int
zfs_freebsd_getacl(ap)
	struct vop_getacl_args /* {
		struct vnode *vp;
		acl_type_t type;
		struct acl *aclp;
		struct ucred *cred;
		struct thread *td;
	} */ *ap;
{
	int error;
	vsecattr_t vsecattr;

	if (ap->a_type != ACL_TYPE_NFS4)
		return (EINVAL);

	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
	if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
		return (error);

	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
	if (vsecattr.vsa_aclentp != NULL)
		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);

	return (error);
}

/*
 * VOP_SETACL: validate the FreeBSD NFSv4 ACL and hand it to ZFS.
 */
int
zfs_freebsd_setacl(ap)
	struct vop_setacl_args /* {
		struct vnode *vp;
		acl_type_t type;
		struct acl *aclp;
		struct ucred *cred;
		struct thread *td;
	} */ *ap;
{
	int error;
	vsecattr_t vsecattr;
	int aclbsize;	/* size of acl list in bytes */
	aclent_t *aaclp;

	if (ap->a_type != ACL_TYPE_NFS4)
		return (EINVAL);

	if (ap->a_aclp == NULL)
		return (EINVAL);

	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
		return (EINVAL);

	/*
	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
	 * splitting every entry into two and appending "canonical six"
	 * entries at the end.  Don't allow for setting an ACL that would
	 * cause chmod(2) to run out of ACL entries.
5908 */ 5909 if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) 5910 return (ENOSPC); 5911 5912 error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); 5913 if (error != 0) 5914 return (error); 5915 5916 vsecattr.vsa_mask = VSA_ACE; 5917 aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t); 5918 vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); 5919 aaclp = vsecattr.vsa_aclentp; 5920 vsecattr.vsa_aclentsz = aclbsize; 5921 5922 aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); 5923 error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL); 5924 kmem_free(aaclp, aclbsize); 5925 5926 return (error); 5927} 5928 5929int 5930zfs_freebsd_aclcheck(ap) 5931 struct vop_aclcheck_args /* { 5932 struct vnode *vp; 5933 acl_type_t type; 5934 struct acl *aclp; 5935 struct ucred *cred; 5936 struct thread *td; 5937 } */ *ap; 5938{ 5939 5940 return (EOPNOTSUPP); 5941} 5942 5943static int 5944zfs_vptocnp(struct vop_vptocnp_args *ap) 5945{ 5946 vnode_t *covered_vp; 5947 vnode_t *vp = ap->a_vp;; 5948 zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; 5949 znode_t *zp = VTOZ(vp); 5950 int ltype; 5951 int error; 5952 5953 ZFS_ENTER(zfsvfs); 5954 ZFS_VERIFY_ZP(zp); 5955 5956 /* 5957 * If we are a snapshot mounted under .zfs, run the operation 5958 * on the covered vnode. 
5959 */ 5960 if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) { 5961 char name[MAXNAMLEN + 1]; 5962 znode_t *dzp; 5963 size_t len; 5964 5965 error = zfs_znode_parent_and_name(zp, &dzp, name); 5966 if (error == 0) { 5967 len = strlen(name); 5968 if (*ap->a_buflen < len) 5969 error = SET_ERROR(ENOMEM); 5970 } 5971 if (error == 0) { 5972 *ap->a_buflen -= len; 5973 bcopy(name, ap->a_buf + *ap->a_buflen, len); 5974 *ap->a_vpp = ZTOV(dzp); 5975 } 5976 ZFS_EXIT(zfsvfs); 5977 return (error); 5978 } 5979 ZFS_EXIT(zfsvfs); 5980 5981 covered_vp = vp->v_mount->mnt_vnodecovered; 5982 vhold(covered_vp); 5983 ltype = VOP_ISLOCKED(vp); 5984 VOP_UNLOCK(vp, 0); 5985 error = vget(covered_vp, LK_SHARED, curthread); 5986 vdrop(covered_vp); 5987 if (error == 0) { 5988 error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, 5989 ap->a_buf, ap->a_buflen); 5990 vput(covered_vp); 5991 } 5992 vn_lock(vp, ltype | LK_RETRY); 5993 if ((vp->v_iflag & VI_DOOMED) != 0) 5994 error = SET_ERROR(ENOENT); 5995 return (error); 5996} 5997 5998#ifdef DIAGNOSTIC 5999static int 6000zfs_lock(ap) 6001 struct vop_lock1_args /* { 6002 struct vnode *a_vp; 6003 int a_flags; 6004 char *file; 6005 int line; 6006 } */ *ap; 6007{ 6008 vnode_t *vp; 6009 znode_t *zp; 6010 int err; 6011 6012 err = vop_stdlock(ap); 6013 if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) { 6014 vp = ap->a_vp; 6015 zp = vp->v_data; 6016 if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 && 6017 zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0) 6018 VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock)); 6019 } 6020 return (err); 6021} 6022#endif 6023 6024struct vop_vector zfs_vnodeops; 6025struct vop_vector zfs_fifoops; 6026struct vop_vector zfs_shareops; 6027 6028struct vop_vector zfs_vnodeops = { 6029 .vop_default = &default_vnodeops, 6030 .vop_inactive = zfs_freebsd_inactive, 6031 .vop_reclaim = zfs_freebsd_reclaim, 6032 .vop_access = zfs_freebsd_access, 6033 .vop_lookup = zfs_cache_lookup, 6034 .vop_cachedlookup = 
zfs_freebsd_lookup,
	.vop_getattr =		zfs_freebsd_getattr,
	.vop_setattr =		zfs_freebsd_setattr,
	.vop_create =		zfs_freebsd_create,
	.vop_mknod =		zfs_freebsd_create,
	.vop_mkdir =		zfs_freebsd_mkdir,
	.vop_readdir =		zfs_freebsd_readdir,
	.vop_fsync =		zfs_freebsd_fsync,
	.vop_open =		zfs_freebsd_open,
	.vop_close =		zfs_freebsd_close,
	.vop_rmdir =		zfs_freebsd_rmdir,
	.vop_ioctl =		zfs_freebsd_ioctl,
	.vop_link =		zfs_freebsd_link,
	.vop_symlink =		zfs_freebsd_symlink,
	.vop_readlink =		zfs_freebsd_readlink,
	.vop_read =		zfs_freebsd_read,
	.vop_write =		zfs_freebsd_write,
	.vop_remove =		zfs_freebsd_remove,
	.vop_rename =		zfs_freebsd_rename,
	.vop_pathconf =		zfs_freebsd_pathconf,
	.vop_bmap =		zfs_freebsd_bmap,
	.vop_fid =		zfs_freebsd_fid,
	.vop_getextattr =	zfs_getextattr,
	.vop_deleteextattr =	zfs_deleteextattr,
	.vop_setextattr =	zfs_setextattr,
	.vop_listextattr =	zfs_listextattr,
	.vop_getacl =		zfs_freebsd_getacl,
	.vop_setacl =		zfs_freebsd_setacl,
	.vop_aclcheck =		zfs_freebsd_aclcheck,
	.vop_getpages =		zfs_freebsd_getpages,
	.vop_putpages =		zfs_freebsd_putpages,
	.vop_vptocnp =		zfs_vptocnp,
#ifdef DIAGNOSTIC
	.vop_lock1 =		zfs_lock,
#endif
};

/*
 * Vnode operations template for fifos on ZFS: data I/O goes through the
 * fifo ops; metadata operations go through ZFS.
 */
struct vop_vector zfs_fifoops = {
	.vop_default =		&fifo_specops,
	.vop_fsync =		zfs_freebsd_fsync,
	.vop_access =		zfs_freebsd_access,
	.vop_getattr =		zfs_freebsd_getattr,
	.vop_inactive =		zfs_freebsd_inactive,
	.vop_read =		VOP_PANIC,
	.vop_reclaim =		zfs_freebsd_reclaim,
	.vop_setattr =		zfs_freebsd_setattr,
	.vop_write =		VOP_PANIC,
	.vop_pathconf = 	zfs_freebsd_fifo_pathconf,
	.vop_fid =		zfs_freebsd_fid,
	.vop_getacl =		zfs_freebsd_getacl,
	.vop_setacl =		zfs_freebsd_setacl,
	.vop_aclcheck =		zfs_freebsd_aclcheck,
};

/*
 * special share hidden files vnode operations template
 */
struct vop_vector zfs_shareops = {
	.vop_default =		&default_vnodeops,
	.vop_access =		zfs_freebsd_access,
	.vop_inactive =		zfs_freebsd_inactive,
	.vop_reclaim =		zfs_freebsd_reclaim,
	.vop_fid =		zfs_freebsd_fid,
	.vop_pathconf =		zfs_freebsd_pathconf,
};