zfs_vnops.c revision 331017
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 * Copyright 2017 Nexenta Systems, Inc. 
27 */ 28 29/* Portions Copyright 2007 Jeremy Teo */ 30/* Portions Copyright 2010 Robert Milkowski */ 31 32#include <sys/types.h> 33#include <sys/param.h> 34#include <sys/time.h> 35#include <sys/systm.h> 36#include <sys/sysmacros.h> 37#include <sys/resource.h> 38#include <sys/vfs.h> 39#include <sys/vm.h> 40#include <sys/vnode.h> 41#include <sys/file.h> 42#include <sys/stat.h> 43#include <sys/kmem.h> 44#include <sys/taskq.h> 45#include <sys/uio.h> 46#include <sys/atomic.h> 47#include <sys/namei.h> 48#include <sys/mman.h> 49#include <sys/cmn_err.h> 50#include <sys/errno.h> 51#include <sys/unistd.h> 52#include <sys/zfs_dir.h> 53#include <sys/zfs_ioctl.h> 54#include <sys/fs/zfs.h> 55#include <sys/dmu.h> 56#include <sys/dmu_objset.h> 57#include <sys/spa.h> 58#include <sys/txg.h> 59#include <sys/dbuf.h> 60#include <sys/zap.h> 61#include <sys/sa.h> 62#include <sys/dirent.h> 63#include <sys/policy.h> 64#include <sys/sunddi.h> 65#include <sys/filio.h> 66#include <sys/sid.h> 67#include <sys/zfs_ctldir.h> 68#include <sys/zfs_fuid.h> 69#include <sys/zfs_sa.h> 70#include <sys/zfs_rlock.h> 71#include <sys/extdirent.h> 72#include <sys/kidmap.h> 73#include <sys/bio.h> 74#include <sys/buf.h> 75#include <sys/sched.h> 76#include <sys/acl.h> 77#include <sys/vmmeter.h> 78#include <vm/vm_param.h> 79#include <sys/zil.h> 80 81/* 82 * Programming rules. 83 * 84 * Each vnode op performs some logical unit of work. To do this, the ZPL must 85 * properly lock its in-core state, create a DMU transaction, do the work, 86 * record this work in the intent log (ZIL), commit the DMU transaction, 87 * and wait for the intent log to commit if it is a synchronous operation. 88 * Moreover, the vnode ops must work in both normal and log replay context. 89 * The ordering of events is important to avoid deadlocks and references 90 * to freed memory. The example below illustrates the following Big Rules: 91 * 92 * (1) A check must be made in each zfs thread for a mounted file system. 
93 * This is done avoiding races using ZFS_ENTER(zfsvfs). 94 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes 95 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros 96 * can return EIO from the calling function. 97 * 98 * (2) VN_RELE() should always be the last thing except for zil_commit() 99 * (if necessary) and ZFS_EXIT(). This is for 3 reasons: 100 * First, if it's the last reference, the vnode/znode 101 * can be freed, so the zp may point to freed memory. Second, the last 102 * reference will call zfs_zinactive(), which may induce a lot of work -- 103 * pushing cached pages (which acquires range locks) and syncing out 104 * cached atime changes. Third, zfs_zinactive() may require a new tx, 105 * which could deadlock the system if you were already holding one. 106 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). 107 * 108 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 109 * as they can span dmu_tx_assign() calls. 110 * 111 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to 112 * dmu_tx_assign(). This is critical because we don't want to block 113 * while holding locks. 114 * 115 * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This 116 * reduces lock contention and CPU usage when we must wait (note that if 117 * throughput is constrained by the storage, nearly every transaction 118 * must wait). 119 * 120 * Note, in particular, that if a lock is sometimes acquired before 121 * the tx assigns, and sometimes after (e.g. z_lock), then failing 122 * to use a non-blocking assign can deadlock the system. The scenario: 123 * 124 * Thread A has grabbed a lock before calling dmu_tx_assign(). 125 * Thread B is in an already-assigned tx, and blocks for this lock. 126 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 127 * forever, because the previous txg can't quiesce until B's tx commits. 
128 * 129 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 130 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent 131 * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, 132 * to indicate that this operation has already called dmu_tx_wait(). 133 * This will ensure that we don't retry forever, waiting a short bit 134 * each time. 135 * 136 * (5) If the operation succeeded, generate the intent log entry for it 137 * before dropping locks. This ensures that the ordering of events 138 * in the intent log matches the order in which they actually occurred. 139 * During ZIL replay the zfs_log_* functions will update the sequence 140 * number to indicate the zil transaction has replayed. 141 * 142 * (6) At the end of each vnode op, the DMU tx must always commit, 143 * regardless of whether there were any errors. 144 * 145 * (7) After dropping all locks, invoke zil_commit(zilog, foid) 146 * to ensure that synchronous semantics are provided when necessary. 147 * 148 * In general, this is how things should be ordered in each vnode op: 149 * 150 * ZFS_ENTER(zfsvfs); // exit if unmounted 151 * top: 152 * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) 153 * rw_enter(...); // grab any other locks you need 154 * tx = dmu_tx_create(...); // get DMU tx 155 * dmu_tx_hold_*(); // hold each object you might modify 156 * error = dmu_tx_assign(tx, (waited ? 
 *	    TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, foid);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/*
 * VOP_OPEN: enforce append-only semantics and run the optional
 * anti-virus scan before allowing the open to proceed.
 */
/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Honor ZFS_APPENDONLY: writes must carry FAPPEND. */
	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/* Virus scan non-empty regular files if scanning is enabled. */
	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
		if (fs_vscan(*vpp, cr, 0) != 0) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EACCES));
		}
	}

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * VOP_CLOSE: release file locks/shares held by the closing process,
 * drop the synchronous-open count, and re-scan modified files.
 */
/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Decrement the synchronous opens in the znode */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
 */
static int
zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_size;
	if (noff >= file_sz) {
		return (SET_ERROR(ENXIO));
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	/* ESRCH from dmu_offset_next() means "no more holes/data". */
	if (error == ESRCH)
		return (SET_ERROR(ENXIO));

	/*
	 * We could find a hole that begins after the logical end-of-file,
	 * because dmu_offset_next() only works on whole blocks.  If the
	 * EOF falls mid-block, then indicate that the "virtual hole"
	 * at the end of the file begins at the logical EOF, rather than
	 * at the end of the last block.
	 */
	if (noff > file_sz) {
		ASSERT(hole);
		noff = file_sz;
	}

	/* Never move the offset backwards. */
	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}

/*
 * VOP_IOCTL: handle the small set of file ioctls ZFS supports
 * (seek-hole/seek-data and, on illumos, fill-count reporting).
 */
/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	offset_t ndata;
	dmu_object_info_t doi;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
	{
		return (0);

		/*
		 * The following two ioctls are used by bfu.  Faking out,
		 * necessary to avoid bfu errors.
		 */
	}
	case _FIOGDIO:
	case _FIOSDIO:
	{
		return (0);
	}

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
	{
#ifdef illumos
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		off = *(offset_t *)data;
#endif
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
#ifdef illumos
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		*(offset_t *)data = off;
#endif
		return (0);
	}
#ifdef illumos
	case _FIO_COUNT_FILLED:
	{
		/*
		 * _FIO_COUNT_FILLED adds a new ioctl command which
		 * exposes the number of filled blocks in a
		 * ZFS object.
		 */
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/*
		 * Wait for all dirty blocks for this object
		 * to get synced out to disk, and the DMU info
		 * updated.
		 */
		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Retrieve fill count from DMU object.
		 */
		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		ndata = doi.doi_fill_count;

		ZFS_EXIT(zfsvfs);
		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
#endif
	}
	return (SET_ERROR(ENOTTY));
}

/*
 * Look up the page at "start" and shared-busy it for a write-back
 * style update; returns NULL if the page is absent or invalid.
 * Caller must hold the VM object write lock.
 */
static vm_page_t
page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
{
	vm_object_t obj;
	vm_page_t pp;
	int64_t end;

	/*
	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
	 * aligned boundaries, if the range is not aligned.  As a result a
	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
	 * the whole page would be considered clean despite having some dirty
	 * data.  For this reason we should shrink the range to DEV_BSIZE
	 * aligned boundaries before calling vm_page_clear_dirty.
	 */
	end = rounddown2(off + nbytes, DEV_BSIZE);
	off = roundup2(off, DEV_BSIZE);
	nbytes = end - off;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb", true);
				zfs_vmobject_wlock(obj);
				continue;
			}
			vm_page_sbusy(pp);
		} else if (pp != NULL) {
			ASSERT(!pp->valid);
			pp = NULL;
		}

		if (pp != NULL) {
			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_object_pip_add(obj, 1);
			pmap_remove_write(pp);
			if (nbytes != 0)
				vm_page_clear_dirty(pp, off, nbytes);
		}
		break;
	}
	return (pp);
}

/*
 * Undo page_busy(): drop the shared busy state and the paging-in-progress
 * reference taken on the owning object.
 */
static void
page_unbusy(vm_page_t pp)
{

	vm_page_sunbusy(pp);
	vm_object_pip_subtract(pp->object, 1);
}

/*
 * Look up the valid page at "start" and wire it with a hold so it
 * cannot be reclaimed while we copy from it; returns NULL if the page
 * is absent or invalid.  Caller must hold the VM object write lock.
 */
static vm_page_t
page_hold(vnode_t *vp, int64_t start)
{
	vm_object_t obj;
	vm_page_t pp;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb", true);
				zfs_vmobject_wlock(obj);
				continue;
			}

			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_page_lock(pp);
			vm_page_hold(pp);
			vm_page_unlock(pp);

		} else
			pp = NULL;
		break;
	}
	return (pp);
}

/* Release a hold taken by page_hold(). */
static void
page_unhold(vm_page_t pp)
{

	vm_page_lock(pp);
	vm_page_unhold(pp);
	vm_page_unlock(pp);
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
510 */ 511static void 512update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, 513 int segflg, dmu_tx_t *tx) 514{ 515 vm_object_t obj; 516 struct sf_buf *sf; 517 caddr_t va; 518 int off; 519 520 ASSERT(segflg != UIO_NOCOPY); 521 ASSERT(vp->v_mount != NULL); 522 obj = vp->v_object; 523 ASSERT(obj != NULL); 524 525 off = start & PAGEOFFSET; 526 zfs_vmobject_wlock(obj); 527 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 528 vm_page_t pp; 529 int nbytes = imin(PAGESIZE - off, len); 530 531 if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { 532 zfs_vmobject_wunlock(obj); 533 534 va = zfs_map_page(pp, &sf); 535 (void) dmu_read(os, oid, start+off, nbytes, 536 va+off, DMU_READ_PREFETCH);; 537 zfs_unmap_page(sf); 538 539 zfs_vmobject_wlock(obj); 540 page_unbusy(pp); 541 } 542 len -= nbytes; 543 off = 0; 544 } 545 vm_object_pip_wakeupn(obj, 0); 546 zfs_vmobject_wunlock(obj); 547} 548 549/* 550 * Read with UIO_NOCOPY flag means that sendfile(2) requests 551 * ZFS to populate a range of page cache pages with data. 552 * 553 * NOTE: this function could be optimized to pre-allocate 554 * all pages in advance, drain exclusive busy on all of them, 555 * map them into contiguous KVA region and populate them 556 * in one single dmu_read() call. 
 */
static int
mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	objset_t *os = zp->z_zfsvfs->z_os;
	struct sf_buf *sf;
	vm_object_t obj;
	vm_page_t pp;
	int64_t start;
	caddr_t va;
	int len = nbytes;
	int off;
	int error = 0;

	ASSERT(uio->uio_segflg == UIO_NOCOPY);
	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);
	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);

	zfs_vmobject_wlock(obj);
	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
		int bytes = MIN(PAGESIZE, len);

		/* Allocate-or-find the page; shared-busied on return. */
		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
		if (pp->valid == 0) {
			/* Newly-grabbed page: fill it from the DMU. */
			zfs_vmobject_wunlock(obj);
			va = zfs_map_page(pp, &sf);
			error = dmu_read(os, zp->z_id, start, bytes, va,
			    DMU_READ_PREFETCH);
			/* Zero-fill the tail of a short final page. */
			if (bytes != PAGESIZE && error == 0)
				bzero(va + bytes, PAGESIZE - bytes);
			zfs_unmap_page(sf);
			zfs_vmobject_wlock(obj);
			vm_page_sunbusy(pp);
			vm_page_lock(pp);
			if (error) {
				/* Free the page only if no one else uses it. */
				if (pp->wire_count == 0 && pp->valid == 0 &&
				    !vm_page_busied(pp))
					vm_page_free(pp);
			} else {
				pp->valid = VM_PAGE_BITS_ALL;
				vm_page_activate(pp);
			}
			vm_page_unlock(pp);
		} else {
			/* Page already valid; nothing to populate. */
			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_page_sunbusy(pp);
		}
		if (error)
			break;
		uio->uio_resid -= bytes;
		uio->uio_offset += bytes;
		len -= bytes;
	}
	zfs_vmobject_wunlock(obj);
	return (error);
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Read:	We "read" preferentially from memory mapped pages,
 *		else we default from the dmu buffer.
 *
 * NOTE:	We will always "break up" the IO into PAGESIZE uiomoves when
 *		the file is memory mapped.
627 */ 628static int 629mappedread(vnode_t *vp, int nbytes, uio_t *uio) 630{ 631 znode_t *zp = VTOZ(vp); 632 vm_object_t obj; 633 int64_t start; 634 caddr_t va; 635 int len = nbytes; 636 int off; 637 int error = 0; 638 639 ASSERT(vp->v_mount != NULL); 640 obj = vp->v_object; 641 ASSERT(obj != NULL); 642 643 start = uio->uio_loffset; 644 off = start & PAGEOFFSET; 645 zfs_vmobject_wlock(obj); 646 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 647 vm_page_t pp; 648 uint64_t bytes = MIN(PAGESIZE - off, len); 649 650 if (pp = page_hold(vp, start)) { 651 struct sf_buf *sf; 652 caddr_t va; 653 654 zfs_vmobject_wunlock(obj); 655 va = zfs_map_page(pp, &sf); 656#ifdef illumos 657 error = uiomove(va + off, bytes, UIO_READ, uio); 658#else 659 error = vn_io_fault_uiomove(va + off, bytes, uio); 660#endif 661 zfs_unmap_page(sf); 662 zfs_vmobject_wlock(obj); 663 page_unhold(pp); 664 } else { 665 zfs_vmobject_wunlock(obj); 666 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), 667 uio, bytes); 668 zfs_vmobject_wlock(obj); 669 } 670 len -= bytes; 671 off = 0; 672 if (error) 673 break; 674 } 675 zfs_vmobject_wunlock(obj); 676 return (error); 677} 678 679offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ 680 681/* 682 * Read bytes from specified file into supplied buffer. 683 * 684 * IN: vp - vnode of file to be read from. 685 * uio - structure supplying read location, range info, 686 * and return buffer. 687 * ioflag - SYNC flags; used to provide FRSYNC semantics. 688 * cr - credentials of caller. 689 * ct - caller context 690 * 691 * OUT: uio - updated offset and range, buffer filled. 692 * 693 * RETURN: 0 on success, error code on failure. 
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	ssize_t		n, nbytes;
	int		error = 0;
	rl_t		*rl;
	xuio_t		*xuio = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Quarantined files may not be read. */
	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE(zp->z_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (zfsvfs->z_log &&
	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_size);
	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);

#ifdef illumos
	/* Zero-copy (xuio) read setup: loan ARC buffers to the caller. */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
		int nblk;
		int blksz = zp->z_blksz;
		uint64_t offset = uio->uio_loffset;

		xuio = (xuio_t *)uio;
		if ((ISP2(blksz))) {
			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
			    blksz)) / blksz;
		} else {
			ASSERT(offset + n <= blksz);
			nblk = 1;
		}
		(void) dmu_xuio_init(xuio, nblk);

		if (vn_has_cached_data(vp)) {
			/*
			 * For simplicity, we always allocate a full buffer
			 * even if we only expect to read a portion of a block.
			 */
			while (--nblk >= 0) {
				(void) dmu_xuio_add(xuio,
				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
				    blksz), 0, blksz);
			}
		}
	}
#endif	/* illumos */

	/* Copy out in zfs_read_chunk_size pieces, aligned to chunk edges. */
	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

#ifdef __FreeBSD__
		if (uio->uio_segflg == UIO_NOCOPY)
			error = mappedread_sf(vp, nbytes, uio);
		else
#endif /* __FreeBSD__ */
		if (vn_has_cached_data(vp)) {
			error = mappedread(vp, nbytes, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes);
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}

		n -= nbytes;
	}
out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Write the bytes to a file.
 *
 * IN:	vp	- vnode of file to be written to.
 *	uio	- structure supplying write location, range info,
 *		  and data buffer.
 *	ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
 *		  set if in append mode.
 *	cr	- credentials of caller.
 *	ct	- caller context (NFS/CIFS fem monitor only)
 *
 * OUT:	uio	- updated offset and range.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */

/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	rlim64_t limit = MAXOFFSET_T;
	ssize_t	start_resid = uio->uio_resid;
	ssize_t	tx_bytes;
	uint64_t end_size;
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	zilog_t	*zilog;
	offset_t woff;
	ssize_t	n, nbytes;
	rl_t	*rl;
	int	max_blksz = zfsvfs->z_max_blksz;
	int	error = 0;
	arc_buf_t *abuf;
	iovec_t	*aiov = NULL;
	xuio_t	*xuio = NULL;
	int	i_iov = 0;
	int	iovcnt = uio->uio_iovcnt;
	iovec_t	*iovp = uio->uio_iov;
	int	write_eof;
	int	count = 0;
	sa_bulk_attr_t	bulk[4];
	uint64_t	mtime[2], ctime[2];

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Attributes pushed to the SA layer in one bulk update per chunk. */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);

	/*
	 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
	 * callers might not be able to detect properly that we are read-only,
	 * so check it explicitly here.
	 */
	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EROFS));
	}

	/*
	 * If immutable or not appending then return EPERM.
	 * Intentionally allow ZFS_READONLY through here.
	 * See zfs_zaccess_common()
	 */
	if ((zp->z_pflags & ZFS_IMMUTABLE) ||
	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
	    (uio->uio_loffset < zp->z_size))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	zilog = zfsvfs->z_log;

	/*
	 * Validate file offset
	 */
	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
	if (woff < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Check for mandatory locks before calling zfs_range_lock()
	 * in order to prevent a deadlock with locks set via fcntl().
	 */
	if (MANDMODE((mode_t)zp->z_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

#ifdef illumos
	/*
	 * Pre-fault the pages to ensure slow (eg NFS) pages
	 * don't hold up txg.
	 * Skip this if uio contains loaned arc_buf.
	 */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
		xuio = (xuio_t *)uio;
	else
		uio_prefaultpages(MIN(n, max_blksz), uio);
#endif

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	if (ioflag & FAPPEND) {
		/*
		 * Obtain an appending range lock to guarantee file append
		 * semantics.  We reset the write offset once we have the lock.
		 */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		woff = rl->r_off;
		if (rl->r_len == UINT64_MAX) {
			/*
			 * We overlocked the file because this write will cause
			 * the file block size to increase.
			 * Note that zp_size cannot change with this lock held.
			 */
			woff = zp->z_size;
		}
		uio->uio_loffset = woff;
	} else {
		/*
		 * Note that if the file block size will change as a result of
		 * this write, then this range lock will lock the entire file
		 * so that we can re-write the block safely.
		 */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (EFBIG);
	}

	if (woff >= limit) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EFBIG));
	}

	/* Clamp the write so it does not extend past the offset limit. */
	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/* Will this write extend the file length? */
	write_eof = (woff + n > zp->z_size);

	end_size = MAX(zp->z_size, woff + n);

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		abuf = NULL;
		woff = uio->uio_loffset;
		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			error = SET_ERROR(EDQUOT);
			break;
		}

		if (xuio && abuf == NULL) {
			/* Zero-copy write: take the loaned buffer directly. */
			ASSERT(i_iov < iovcnt);
			aiov = &iovp[i_iov];
			abuf = dmu_xuio_arcbuf(xuio, i_iov);
			dmu_xuio_clear(xuio, i_iov);
			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
			    iovec_t *, aiov, arc_buf_t *, abuf);
			ASSERT((aiov->iov_base == abuf->b_data) ||
			    ((char *)aiov->iov_base - (char *)abuf->b_data +
			    aiov->iov_len == arc_buf_size(abuf)));
			i_iov++;
		} else if (abuf == NULL && n >= max_blksz &&
		    woff >= zp->z_size &&
		    P2PHASE(woff, max_blksz) == 0 &&
		    zp->z_blksz == max_blksz) {
			/*
			 * This write covers a full block.  "Borrow" a buffer
			 * from the dmu so that we can fill it before we enter
			 * a transaction.  This avoids the possibility of
			 * holding up the transaction if the data copy hangs
			 * up on a pagefault (e.g., from an NFS server mapping).
			 */
			size_t cbytes;

			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    max_blksz);
			ASSERT(abuf != NULL);
			ASSERT(arc_buf_size(abuf) == max_blksz);
			if (error = uiocopy(abuf->b_data, max_blksz,
			    UIO_WRITE, uio, &cbytes)) {
				dmu_return_arcbuf(abuf);
				break;
			}
			ASSERT(cbytes == max_blksz);
		}

		/*
		 * Start a transaction.
		 */
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			break;
		}

		/*
		 * If zfs_range_lock() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since zfs_range_reduce() will
		 * shrink down r_len to the appropriate size.
		 */
		if (rl->r_len == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				/*
				 * File's blocksize is already larger than the
				 * "recordsize" property.  Only let it grow to
				 * the next power of 2.
				 */
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size,
				    1 << highbit64(zp->z_blksz));
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_range_reduce(rl, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));

		if (woff + nbytes > zp->z_size)
			vnode_pager_setsize(vp, woff + nbytes);

		if (abuf == NULL) {
			tx_bytes = uio->uio_resid;
			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes, tx);
			tx_bytes -= uio->uio_resid;
		} else {
			tx_bytes = nbytes;
			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
			/*
			 * If this is not a full block write, but we are
			 * extending the file past EOF and this data starts
			 * block-aligned, use assign_arcbuf().  Otherwise,
			 * write via dmu_write().
			 */
			if (tx_bytes < max_blksz && (!write_eof ||
			    aiov->iov_base != abuf->b_data)) {
				ASSERT(xuio);
				dmu_write(zfsvfs->z_os, zp->z_id, woff,
				    aiov->iov_len, aiov->iov_base, tx);
				dmu_return_arcbuf(abuf);
				xuio_stat_wbuf_copied();
			} else {
				ASSERT(xuio || tx_bytes == max_blksz);
				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
				    woff, abuf, tx);
			}
			ASSERT(tx_bytes <= uio->uio_resid);
			uioskip(uio, tx_bytes);
		}
		/* Keep cached mmap pages coherent with what we just wrote. */
		if (tx_bytes && vn_has_cached_data(vp)) {
			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
			    zp->z_id, uio->uio_segflg, tx);
		}

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
			    (void *)&zp->z_size, sizeof (uint64_t), tx);
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 *
		 * Note: we don't call zfs_fuid_map_id() here because
		 * user 0 is not an ephemeral uid.
		 */
		mutex_enter(&zp->z_acl_lock);
		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(vp, cr,
		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
			uint64_t newmode;
			zp->z_mode &= ~(S_ISUID | S_ISGID);
			newmode = zp->z_mode;
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
			    (void *)&newmode, sizeof (uint64_t), tx);
		}
		mutex_exit(&zp->z_acl_lock);

		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_size) < uio->uio_loffset) {
			(void) atomic_cas_64(&zp->z_size, end_size,
			    uio->uio_loffset);
#ifdef illumos
			ASSERT(error == 0);
#else
			ASSERT(error == 0 || error == EFAULT);
#endif
		}
		/*
		 * If we are replaying and eof is non zero then force
		 * the file size to the specified eof. Note, there's no
		 * concurrency during replay.
		 */
		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
			zp->z_size = zfsvfs->z_replay_eof;

		if (error == 0)
			error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		else
			(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);

		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;

#ifdef illumos
		if (!xuio && n > 0)
			uio_prefaultpages(MIN(n, max_blksz), uio);
#endif
	}

	zfs_range_unlock(rl);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

#ifdef __FreeBSD__
	/*
	 * EFAULT means that at least one page of the source buffer was not
	 * available.  VFS will re-try remaining I/O upon this error.
	 */
	if (error == EFAULT) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
#endif

	if (ioflag & (FSYNC | FDSYNC) ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * zil_get_data callback cleanup: release the dbuf and range lock taken
 * for an in-flight ZIL write and drop the vnode reference.
 */
void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;
	objset_t *os = zp->z_zfsvfs->z_os;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

	if (error == 0 && zgd->zgd_bp)
		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}

#ifdef DEBUG
static int zil_fault_io = 0;
#endif

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;

	ASSERT3P(lwb, !=, NULL);
	ASSERT3P(zio, !=, NULL);
	ASSERT3U(size, !=, 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
		return (SET_ERROR(ENOENT));
	}

	/*
	 * The zgd tracks everything held on behalf of this record (range
	 * lock, dbuf, znode hold); it is released by zfs_get_done(), either
	 * at the bottom of this function or from the dmu_sync() completion
	 * callback.
	 */
	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_lwb = lwb;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) {	/* immediate write */
		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
			    RL_READER);
			if (zp->z_blksz == size)
				break;
			/* Blocksize changed under us; undo and retry. */
			offset += blkoff;
			zfs_range_unlock(zgd->zgd_rl);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef DEBUG
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			blkptr_t *bp = &lr->lr_blkptr;

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= size);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf.  We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			if (error == EALREADY) {
				/*
				 * dmu_sync() reports the block already
				 * committed; downgrade the record to a
				 * TX_WRITE2 and report success.
				 */
				lr->lr_common.lrc_txtype = TX_WRITE2;
				error = 0;
			}
		}
	}

	zfs_get_done(zgd, error);

	return (error);
}

/*
 * Check access permissions on a file.
 *
 *	IN:	vp	- vnode of file to check.
 *		mode	- requested access mode.
 *		flag	- V_ACE_MASK selects native ACE-style checking;
 *			  otherwise legacy rwx-style checking is used.
 *		cr	- credentials of caller.
 *		ct	- caller context (unused).
 *
 * RETURN:	0 on success, error code on failure.
 */
/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (flag & V_ACE_MASK)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
	else
		error = zfs_zaccess_rwx(zp, mode, flag, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * vn_vget_ino_gen() callback used by zfs_lookup_lock() for the ".." case:
 * lock the vnode passed in arg with the requested lock flags, dropping
 * the reference if locking fails.
 */
static int
zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
	int error;

	*vpp = arg;
	error = vn_lock(*vpp, lkflags);
	if (error != 0)
		vrele(*vpp);
	return (error);
}

/*
 * Acquire the vnode lock for a lookup result according to the FreeBSD
 * VFS protocol:
 *   - "" or "." aliases the directory itself: re-reference dvp and
 *     upgrade/downgrade its already-held lock if the requested type
 *     differs;
 *   - ".." locks the parent, which requires the unlock/relock dance
 *     performed by vn_vget_ino_gen();
 *   - any other name is simply locked directly.
 */
static int
zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int error;
	int ltype;

	ASSERT_VOP_LOCKED(dvp, __func__);
#ifdef DIAGNOSTIC
	if ((zdp->z_pflags & ZFS_XATTR) == 0)
		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
#endif

	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
		ASSERT3P(dvp, ==, vp);
		vref(dvp);
		ltype = lkflags & LK_TYPE_MASK;
		if (ltype != VOP_ISLOCKED(dvp)) {
			if (ltype == LK_EXCLUSIVE)
				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
			else /* if (ltype == LK_SHARED) */
				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);

			/*
			 * Relock for the "." case could leave us with
			 * reclaimed vnode.
			 */
			if (dvp->v_iflag & VI_DOOMED) {
				vrele(dvp);
				return (SET_ERROR(ENOENT));
			}
		}
		return (0);
	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
		/*
		 * Note that in this case, dvp is the child vnode, and we
		 * are looking up the parent vnode - exactly reverse from
		 * normal operation.  Unlocking dvp requires some rather
		 * tricky unlock/relock dance to prevent mp from being freed;
		 * use vn_vget_ino_gen() which takes care of all that.
		 *
		 * XXX Note that there is a time window when both vnodes are
		 * unlocked.  It is possible, although highly unlikely, that
		 * during that window the parent-child relationship between
		 * the vnodes may change, for example, get reversed.
		 * In that case we would have a wrong lock order for the vnodes.
		 * All other filesystems seem to ignore this problem, so we
		 * do the same here.
		 * A potential solution could be implemented as follows:
		 * - using LK_NOWAIT when locking the second vnode and retrying
		 *   if necessary
		 * - checking that the parent-child relationship still holds
		 *   after locking both vnodes and retrying if it doesn't
		 */
		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
		return (error);
	} else {
		error = vn_lock(vp, lkflags);
		if (error != 0)
			vrele(vp);
		return (error);
	}
}

/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held vnode reference for it.
 *
 * IN:	dvp	- vnode of directory to search.
 *	nm	- name of entry to lookup.
 *	pnp	- full pathname to lookup [UNUSED].
 *	flags	- LOOKUP_XATTR set if looking for an attribute.
 *	rdir	- root directory vnode [UNUSED].
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * OUT:	vpp	- vnode of located entry, NULL if not found.
 *
 * RETURN:	0 on success, error code on failure.
1509 * 1510 * Timestamps: 1511 * NA 1512 */ 1513/* ARGSUSED */ 1514static int 1515zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, 1516 int nameiop, cred_t *cr, kthread_t *td, int flags) 1517{ 1518 znode_t *zdp = VTOZ(dvp); 1519 znode_t *zp; 1520 zfsvfs_t *zfsvfs = zdp->z_zfsvfs; 1521 int error = 0; 1522 1523 /* 1524 * Fast path lookup, however we must skip DNLC lookup 1525 * for case folding or normalizing lookups because the 1526 * DNLC code only stores the passed in name. This means 1527 * creating 'a' and removing 'A' on a case insensitive 1528 * file system would work, but DNLC still thinks 'a' 1529 * exists and won't let you create it again on the next 1530 * pass through fast path. 1531 */ 1532 if (!(flags & LOOKUP_XATTR)) { 1533 if (dvp->v_type != VDIR) { 1534 return (SET_ERROR(ENOTDIR)); 1535 } else if (zdp->z_sa_hdl == NULL) { 1536 return (SET_ERROR(EIO)); 1537 } 1538 } 1539 1540 DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); 1541 1542 ZFS_ENTER(zfsvfs); 1543 ZFS_VERIFY_ZP(zdp); 1544 1545 *vpp = NULL; 1546 1547 if (flags & LOOKUP_XATTR) { 1548#ifdef TODO 1549 /* 1550 * If the xattr property is off, refuse the lookup request. 1551 */ 1552 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { 1553 ZFS_EXIT(zfsvfs); 1554 return (SET_ERROR(EINVAL)); 1555 } 1556#endif 1557 1558 /* 1559 * We don't allow recursive attributes.. 1560 * Maybe someday we will. 1561 */ 1562 if (zdp->z_pflags & ZFS_XATTR) { 1563 ZFS_EXIT(zfsvfs); 1564 return (SET_ERROR(EINVAL)); 1565 } 1566 1567 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { 1568 ZFS_EXIT(zfsvfs); 1569 return (error); 1570 } 1571 1572 /* 1573 * Do we have permission to get into attribute directory? 1574 */ 1575 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, 1576 B_FALSE, cr)) { 1577 vrele(*vpp); 1578 *vpp = NULL; 1579 } 1580 1581 ZFS_EXIT(zfsvfs); 1582 return (error); 1583 } 1584 1585 /* 1586 * Check accessibility of directory. 
1587 */ 1588 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { 1589 ZFS_EXIT(zfsvfs); 1590 return (error); 1591 } 1592 1593 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), 1594 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1595 ZFS_EXIT(zfsvfs); 1596 return (SET_ERROR(EILSEQ)); 1597 } 1598 1599 1600 /* 1601 * First handle the special cases. 1602 */ 1603 if ((cnp->cn_flags & ISDOTDOT) != 0) { 1604 /* 1605 * If we are a snapshot mounted under .zfs, return 1606 * the vp for the snapshot directory. 1607 */ 1608 if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) { 1609 struct componentname cn; 1610 vnode_t *zfsctl_vp; 1611 int ltype; 1612 1613 ZFS_EXIT(zfsvfs); 1614 ltype = VOP_ISLOCKED(dvp); 1615 VOP_UNLOCK(dvp, 0); 1616 error = zfsctl_root(zfsvfs->z_parent, LK_SHARED, 1617 &zfsctl_vp); 1618 if (error == 0) { 1619 cn.cn_nameptr = "snapshot"; 1620 cn.cn_namelen = strlen(cn.cn_nameptr); 1621 cn.cn_nameiop = cnp->cn_nameiop; 1622 cn.cn_flags = cnp->cn_flags & ~ISDOTDOT; 1623 cn.cn_lkflags = cnp->cn_lkflags; 1624 error = VOP_LOOKUP(zfsctl_vp, vpp, &cn); 1625 vput(zfsctl_vp); 1626 } 1627 vn_lock(dvp, ltype | LK_RETRY); 1628 return (error); 1629 } 1630 } 1631 if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) { 1632 ZFS_EXIT(zfsvfs); 1633 if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP) 1634 return (SET_ERROR(ENOTSUP)); 1635 error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp); 1636 return (error); 1637 } 1638 1639 /* 1640 * The loop is retry the lookup if the parent-child relationship 1641 * changes during the dot-dot locking complexities. 1642 */ 1643 for (;;) { 1644 uint64_t parent; 1645 1646 error = zfs_dirlook(zdp, nm, &zp); 1647 if (error == 0) 1648 *vpp = ZTOV(zp); 1649 1650 ZFS_EXIT(zfsvfs); 1651 if (error != 0) 1652 break; 1653 1654 error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); 1655 if (error != 0) { 1656 /* 1657 * If we've got a locking error, then the vnode 1658 * got reclaimed because of a force unmount. 
1659 * We never enter doomed vnodes into the name cache. 1660 */ 1661 *vpp = NULL; 1662 return (error); 1663 } 1664 1665 if ((cnp->cn_flags & ISDOTDOT) == 0) 1666 break; 1667 1668 ZFS_ENTER(zfsvfs); 1669 if (zdp->z_sa_hdl == NULL) { 1670 error = SET_ERROR(EIO); 1671 } else { 1672 error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 1673 &parent, sizeof (parent)); 1674 } 1675 if (error != 0) { 1676 ZFS_EXIT(zfsvfs); 1677 vput(ZTOV(zp)); 1678 break; 1679 } 1680 if (zp->z_id == parent) { 1681 ZFS_EXIT(zfsvfs); 1682 break; 1683 } 1684 vput(ZTOV(zp)); 1685 } 1686 1687out: 1688 if (error != 0) 1689 *vpp = NULL; 1690 1691 /* Translate errors and add SAVENAME when needed. */ 1692 if (cnp->cn_flags & ISLASTCN) { 1693 switch (nameiop) { 1694 case CREATE: 1695 case RENAME: 1696 if (error == ENOENT) { 1697 error = EJUSTRETURN; 1698 cnp->cn_flags |= SAVENAME; 1699 break; 1700 } 1701 /* FALLTHROUGH */ 1702 case DELETE: 1703 if (error == 0) 1704 cnp->cn_flags |= SAVENAME; 1705 break; 1706 } 1707 } 1708 1709 /* Insert name into cache (as non-existent) if appropriate. */ 1710 if (zfsvfs->z_use_namecache && 1711 error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) 1712 cache_enter(dvp, NULL, cnp); 1713 1714 /* Insert name into cache if appropriate. */ 1715 if (zfsvfs->z_use_namecache && 1716 error == 0 && (cnp->cn_flags & MAKEENTRY)) { 1717 if (!(cnp->cn_flags & ISLASTCN) || 1718 (nameiop != DELETE && nameiop != RENAME)) { 1719 cache_enter(dvp, *vpp, cnp); 1720 } 1721 } 1722 1723 return (error); 1724} 1725 1726/* 1727 * Attempt to create a new entry in a directory. If the entry 1728 * already exists, truncate the file if permissible, else return 1729 * an error. Return the vp of the created or trunc'd file. 1730 * 1731 * IN: dvp - vnode of directory to put new file entry in. 1732 * name - name of new file entry. 1733 * vap - attributes of new file. 1734 * excl - flag indicating exclusive or non-exclusive mode. 1735 * mode - mode to open file with. 
 *	cr	- credentials of caller.
 *	flag	- large file flag [UNUSED].
 *	ct	- caller context
 *	vsecp	- ACL to be set
 *
 * OUT:	vpp	- vnode of created or trunc'd entry.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated if new entry created
 *	 vp - ctime|mtime always, atime if new
 */

/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
    vnode_t **vpp, cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	objset_t	*os;
	dmu_tx_t	*tx;
	int		error;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	/*
	 * NOTE(review): vsecp and flag are constant locals on FreeBSD
	 * (no ACL / large-file argument in this entry point) — presumably
	 * retained to minimize the diff against the upstream version.
	 */
	void		*vsecp = NULL;
	int		flag = 0;
	uint64_t	txtype;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	*vpp = NULL;

	/* Strip the sticky bit unless the caller is privileged to set it. */
	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~S_ISVTX;

	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	ASSERT3P(zp, ==, NULL);

	/*
	 * Create a new file object and update the directory
	 * to reference it.
	 */
	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		goto out;
	}

	/*
	 * We only support the creation of regular files in
	 * extended attribute directories.
	 */

	if ((dzp->z_pflags & ZFS_XATTR) &&
	    (vap->va_type != VREG)) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	if ((error = zfs_acl_ids_create(dzp, 0, vap,
	    cr, vsecp, &acl_ids)) != 0)
		goto out;

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		error = SET_ERROR(EDQUOT);
		goto out;
	}

	getnewvnode_reserve(1);

	tx = dmu_tx_create(os);

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	if (!zfsvfs->z_use_sa &&
	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
		    0, acl_ids.z_aclp->z_acl_bytes);
	}
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
	    vsecp, acl_ids.z_fuidp, vap);
	zfs_acl_ids_free(&acl_ids);
	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

out:
	if (error == 0) {
		*vpp = ZTOV(zp);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Remove an entry from a directory.
 *
 * IN:	dvp	- vnode of directory to remove entry from.
 *	name	- name of entry to remove.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime
 *	 vp - ctime (if nlink > 0)
 */

/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp = VTOZ(vp);
	znode_t		*xzp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	/*
	 * NOTE(review): acl_obj and toobig are never referenced in this
	 * function's visible body — likely leftovers from the upstream
	 * variant; confirm before removing.
	 */
	uint64_t	acl_obj, xattr_obj;
	uint64_t	obj = 0;
	dmu_tx_t	*tx;
	boolean_t	unlinked, toobig = FALSE;
	uint64_t	txtype;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	ZFS_VERIFY_ZP(zp);
	zilog = zfsvfs->z_log;
	/* NOTE(review): redundant — zp was already initialized to VTOZ(vp). */
	zp = VTOZ(vp);

	xattr_obj = 0;
	xzp = NULL;

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (vp->v_type == VDIR) {
		error = SET_ERROR(EPERM);
		goto out;
	}

	/*
	 * NOTE(review): 'ct' is not declared in this function; this can
	 * only compile if vnevent_remove() is a no-op macro that drops its
	 * arguments on FreeBSD — verify against the compat headers.
	 */
	vnevent_remove(vp, dvp, name, ct);

	obj = zp->z_id;

	/* are there any extended attributes? */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		ASSERT0(error);
	}

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);

	if (xzp) {
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	/*
	 * Mark this transaction as typically resulting in a net free of space
	 */
	dmu_tx_mark_netfree(tx);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	if (unlinked) {
		zfs_unlinked_add(zp, tx);
		vp->v_vflag |= VV_NOSYNC;
	}

	txtype = TX_REMOVE;
	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);

	dmu_tx_commit(tx);
out:

	if (xzp)
		vrele(ZTOV(xzp));

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Create a new directory and insert it into dvp using the name
 * provided.  Return a pointer to the inserted directory.
 *
 * IN:	dvp	- vnode of directory to add subdir to.
 *	dirname	- name of new directory.
 *	vap	- attributes of new directory.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *	vsecp	- ACL to be set
 *
 * OUT:	vpp	- vnode of created directory.
 *
 * RETURN:	0 on success, error code on failure.
2040 * 2041 * Timestamps: 2042 * dvp - ctime|mtime updated 2043 * vp - ctime|mtime|atime updated 2044 */ 2045/*ARGSUSED*/ 2046static int 2047zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr) 2048{ 2049 znode_t *zp, *dzp = VTOZ(dvp); 2050 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 2051 zilog_t *zilog; 2052 uint64_t txtype; 2053 dmu_tx_t *tx; 2054 int error; 2055 ksid_t *ksid; 2056 uid_t uid; 2057 gid_t gid = crgetgid(cr); 2058 zfs_acl_ids_t acl_ids; 2059 boolean_t fuid_dirtied; 2060 2061 ASSERT(vap->va_type == VDIR); 2062 2063 /* 2064 * If we have an ephemeral id, ACL, or XVATTR then 2065 * make sure file system is at proper version 2066 */ 2067 2068 ksid = crgetsid(cr, KSID_OWNER); 2069 if (ksid) 2070 uid = ksid_getid(ksid); 2071 else 2072 uid = crgetuid(cr); 2073 if (zfsvfs->z_use_fuids == B_FALSE && 2074 ((vap->va_mask & AT_XVATTR) || 2075 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 2076 return (SET_ERROR(EINVAL)); 2077 2078 ZFS_ENTER(zfsvfs); 2079 ZFS_VERIFY_ZP(dzp); 2080 zilog = zfsvfs->z_log; 2081 2082 if (dzp->z_pflags & ZFS_XATTR) { 2083 ZFS_EXIT(zfsvfs); 2084 return (SET_ERROR(EINVAL)); 2085 } 2086 2087 if (zfsvfs->z_utf8 && u8_validate(dirname, 2088 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 2089 ZFS_EXIT(zfsvfs); 2090 return (SET_ERROR(EILSEQ)); 2091 } 2092 2093 if (vap->va_mask & AT_XVATTR) { 2094 if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, 2095 crgetuid(cr), cr, vap->va_type)) != 0) { 2096 ZFS_EXIT(zfsvfs); 2097 return (error); 2098 } 2099 } 2100 2101 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, 2102 NULL, &acl_ids)) != 0) { 2103 ZFS_EXIT(zfsvfs); 2104 return (error); 2105 } 2106 2107 /* 2108 * First make sure the new directory doesn't exist. 2109 * 2110 * Existence is checked first to make sure we don't return 2111 * EACCES instead of EEXIST which can cause some applications 2112 * to fail. 
2113 */ 2114 *vpp = NULL; 2115 2116 if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) { 2117 zfs_acl_ids_free(&acl_ids); 2118 ZFS_EXIT(zfsvfs); 2119 return (error); 2120 } 2121 ASSERT3P(zp, ==, NULL); 2122 2123 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { 2124 zfs_acl_ids_free(&acl_ids); 2125 ZFS_EXIT(zfsvfs); 2126 return (error); 2127 } 2128 2129 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { 2130 zfs_acl_ids_free(&acl_ids); 2131 ZFS_EXIT(zfsvfs); 2132 return (SET_ERROR(EDQUOT)); 2133 } 2134 2135 /* 2136 * Add a new entry to the directory. 2137 */ 2138 getnewvnode_reserve(1); 2139 tx = dmu_tx_create(zfsvfs->z_os); 2140 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); 2141 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 2142 fuid_dirtied = zfsvfs->z_fuid_dirty; 2143 if (fuid_dirtied) 2144 zfs_fuid_txhold(zfsvfs, tx); 2145 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 2146 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 2147 acl_ids.z_aclp->z_acl_bytes); 2148 } 2149 2150 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 2151 ZFS_SA_BASE_ATTR_SIZE); 2152 2153 error = dmu_tx_assign(tx, TXG_WAIT); 2154 if (error) { 2155 zfs_acl_ids_free(&acl_ids); 2156 dmu_tx_abort(tx); 2157 getnewvnode_drop_reserve(); 2158 ZFS_EXIT(zfsvfs); 2159 return (error); 2160 } 2161 2162 /* 2163 * Create new node. 2164 */ 2165 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 2166 2167 if (fuid_dirtied) 2168 zfs_fuid_sync(zfsvfs, tx); 2169 2170 /* 2171 * Now put new name in parent dir. 
2172 */ 2173 (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); 2174 2175 *vpp = ZTOV(zp); 2176 2177 txtype = zfs_log_create_txtype(Z_DIR, NULL, vap); 2178 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, 2179 acl_ids.z_fuidp, vap); 2180 2181 zfs_acl_ids_free(&acl_ids); 2182 2183 dmu_tx_commit(tx); 2184 2185 getnewvnode_drop_reserve(); 2186 2187 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 2188 zil_commit(zilog, 0); 2189 2190 ZFS_EXIT(zfsvfs); 2191 return (0); 2192} 2193 2194/* 2195 * Remove a directory subdir entry. If the current working 2196 * directory is the same as the subdir to be removed, the 2197 * remove will fail. 2198 * 2199 * IN: dvp - vnode of directory to remove from. 2200 * name - name of directory to be removed. 2201 * cwd - vnode of current working directory. 2202 * cr - credentials of caller. 2203 * ct - caller context 2204 * flags - case flags 2205 * 2206 * RETURN: 0 on success, error code on failure. 2207 * 2208 * Timestamps: 2209 * dvp - ctime|mtime updated 2210 */ 2211/*ARGSUSED*/ 2212static int 2213zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) 2214{ 2215 znode_t *dzp = VTOZ(dvp); 2216 znode_t *zp = VTOZ(vp); 2217 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 2218 zilog_t *zilog; 2219 dmu_tx_t *tx; 2220 int error; 2221 2222 ZFS_ENTER(zfsvfs); 2223 ZFS_VERIFY_ZP(dzp); 2224 ZFS_VERIFY_ZP(zp); 2225 zilog = zfsvfs->z_log; 2226 2227 2228 if (error = zfs_zaccess_delete(dzp, zp, cr)) { 2229 goto out; 2230 } 2231 2232 if (vp->v_type != VDIR) { 2233 error = SET_ERROR(ENOTDIR); 2234 goto out; 2235 } 2236 2237 vnevent_rmdir(vp, dvp, name, ct); 2238 2239 tx = dmu_tx_create(zfsvfs->z_os); 2240 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 2241 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 2242 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 2243 zfs_sa_upgrade_txholds(tx, zp); 2244 zfs_sa_upgrade_txholds(tx, dzp); 2245 dmu_tx_mark_netfree(tx); 2246 error = dmu_tx_assign(tx, TXG_WAIT); 2247 if (error) { 2248 dmu_tx_abort(tx); 2249 
ZFS_EXIT(zfsvfs); 2250 return (error); 2251 } 2252 2253 cache_purge(dvp); 2254 2255 error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL); 2256 2257 if (error == 0) { 2258 uint64_t txtype = TX_RMDIR; 2259 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); 2260 } 2261 2262 dmu_tx_commit(tx); 2263 2264 cache_purge(vp); 2265out: 2266 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 2267 zil_commit(zilog, 0); 2268 2269 ZFS_EXIT(zfsvfs); 2270 return (error); 2271} 2272 2273/* 2274 * Read as many directory entries as will fit into the provided 2275 * buffer from the given directory cursor position (specified in 2276 * the uio structure). 2277 * 2278 * IN: vp - vnode of directory to read. 2279 * uio - structure supplying read location, range info, 2280 * and return buffer. 2281 * cr - credentials of caller. 2282 * ct - caller context 2283 * flags - case flags 2284 * 2285 * OUT: uio - updated offset and range, buffer filled. 2286 * eofp - set to true if end-of-file detected. 2287 * 2288 * RETURN: 0 on success, error code on failure. 2289 * 2290 * Timestamps: 2291 * vp - atime updated 2292 * 2293 * Note that the low 4 bits of the cookie returned by zap is always zero. 2294 * This allows us to use the low range for "special" directory entries: 2295 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, 2296 * we use the offset 2 for the '.zfs' directory. 
 */
/* ARGSUSED */
static int
zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
{
	znode_t *zp = VTOZ(vp);
	iovec_t *iovp;
	edirent_t *eodp;
	dirent64_t *odp;
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	objset_t *os;
	caddr_t outbuf;
	size_t bufsize;
	zap_cursor_t zc;
	zap_attribute_t zap;
	uint_t bytes_wanted;
	uint64_t offset; /* must be unsigned; checks for < 1 */
	uint64_t parent;
	int local_eof;
	int outcount;
	int error;
	uint8_t prefetch;
	boolean_t check_sysattrs;
	uint8_t type;
	int ncooks;
	u_long *cooks = NULL;
	/*
	 * illumos V_RDDIR_* flags; always 0 in this FreeBSD port, so the
	 * V_RDDIR_ENTFLAGS and V_RDDIR_ACCFILTER branches below are
	 * effectively dead code retained for merge compatibility.
	 */
	int flags = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Need the parent object id to synthesize the ".." entry below. */
	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (parent))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * If we are not given an eof variable,
	 * use a local one.
	 */
	if (eofp == NULL)
		eofp = &local_eof;

	/*
	 * Check for valid iov_len.
	 */
	if (uio->uio_iov->iov_len <= 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Quit if directory has been removed (posix)
	 */
	if ((*eofp = zp->z_unlinked) != 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	error = 0;
	os = zfsvfs->z_os;
	offset = uio->uio_loffset;
	prefetch = zp->z_zn_prefetch;

	/*
	 * Initialize the iterator cursor.  Offsets 0-3 are reserved for
	 * the synthetic ".", "..", and ".zfs" entries (see the function
	 * header comment); anything larger is a serialized ZAP cursor.
	 */
	if (offset <= 3) {
		/*
		 * Start iteration from the beginning of the directory.
		 */
		zap_cursor_init(&zc, os, zp->z_id);
	} else {
		/*
		 * The offset is a serialized cursor.
		 */
		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
	}

	/*
	 * Get space to change directory entries into fs independent format.
	 * For a single system-space iovec we can format entries directly
	 * into the caller's buffer; otherwise we stage them in a kernel
	 * buffer and uiomove() them out at the end.
	 */
	iovp = uio->uio_iov;
	bytes_wanted = iovp->iov_len;
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
		bufsize = bytes_wanted;
		outbuf = kmem_alloc(bufsize, KM_SLEEP);
		odp = (struct dirent64 *)outbuf;
	} else {
		bufsize = bytes_wanted;
		outbuf = NULL;
		odp = (struct dirent64 *)iovp->iov_base;
	}
	eodp = (struct edirent *)odp;

	/*
	 * The cookie array hands the VFS/NFS layer one serialized cursor
	 * offset per entry emitted, so a client can resume after any entry.
	 */
	if (ncookies != NULL) {
		/*
		 * Minimum entry size is dirent size and 1 byte for a file name.
		 */
		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
		*cookies = cooks;
		*ncookies = ncooks;
	}
	/*
	 * If this VFS supports the system attribute view interface; and
	 * we're looking at an extended attribute directory; and we care
	 * about normalization conflicts on this vfs; then we must check
	 * for normalization conflicts with the sysattr name space.
	 */
#ifdef TODO
	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
	    (flags & V_RDDIR_ENTFLAGS);
#else
	check_sysattrs = 0;
#endif

	/*
	 * Transform to file-system independent format
	 */
	outcount = 0;
	while (outcount < bytes_wanted) {
		ino64_t objnum;
		ushort_t reclen;
		off64_t *next = NULL;

		/*
		 * Special case `.', `..', and `.zfs'.
		 */
		if (offset == 0) {
			(void) strcpy(zap.za_name, ".");
			zap.za_normalization_conflict = 0;
			objnum = zp->z_id;
			type = DT_DIR;
		} else if (offset == 1) {
			(void) strcpy(zap.za_name, "..");
			zap.za_normalization_conflict = 0;
			objnum = parent;
			type = DT_DIR;
		} else if (offset == 2 && zfs_show_ctldir(zp)) {
			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
			zap.za_normalization_conflict = 0;
			objnum = ZFSCTL_INO_ROOT;
			type = DT_DIR;
		} else {
			/*
			 * Grab next entry.
			 */
			if (error = zap_cursor_retrieve(&zc, &zap)) {
				/* ENOENT from the cursor means end of dir. */
				if ((*eofp = (error == ENOENT)) != 0)
					break;
				else
					goto update;
			}

			/*
			 * ZFS directory entries are a single 64-bit integer;
			 * anything else indicates on-disk corruption.
			 */
			if (zap.za_integer_length != 8 ||
			    zap.za_num_integers != 1) {
				cmn_err(CE_WARN, "zap_readdir: bad directory "
				    "entry, obj = %lld, offset = %lld\n",
				    (u_longlong_t)zp->z_id,
				    (u_longlong_t)offset);
				error = SET_ERROR(ENXIO);
				goto update;
			}

			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
			/*
			 * MacOS X can extract the object type here such as:
			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
			 */
			type = ZFS_DIRENT_TYPE(zap.za_first_integer);

			if (check_sysattrs && !zap.za_normalization_conflict) {
#ifdef TODO
				zap.za_normalization_conflict =
				    xattr_sysattr_casechk(zap.za_name);
#else
				panic("%s:%u: TODO", __func__, __LINE__);
#endif
			}
		}

		if (flags & V_RDDIR_ACCFILTER) {
			/*
			 * If we have no access at all, don't include
			 * this entry in the returned information
			 */
			znode_t	*ezp;
			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
				goto skip_entry;
			if (!zfs_has_access(ezp, cr)) {
				vrele(ZTOV(ezp));
				goto skip_entry;
			}
			vrele(ZTOV(ezp));
		}

		if (flags & V_RDDIR_ENTFLAGS)
			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
		else
			reclen = DIRENT64_RECLEN(strlen(zap.za_name));

		/*
		 * Will this entry fit in the buffer?
		 */
		if (outcount + reclen > bufsize) {
			/*
			 * Did we manage to fit anything in the buffer?
			 */
			if (!outcount) {
				error = SET_ERROR(EINVAL);
				goto update;
			}
			break;
		}
		if (flags & V_RDDIR_ENTFLAGS) {
			/*
			 * Add extended flag entry:
			 */
			eodp->ed_ino = objnum;
			eodp->ed_reclen = reclen;
			/* NOTE: ed_off is the offset for the *next* entry */
			next = &(eodp->ed_off);
			eodp->ed_eflags = zap.za_normalization_conflict ?
			    ED_CASE_CONFLICT : 0;
			(void) strncpy(eodp->ed_name, zap.za_name,
			    EDIRENT_NAMELEN(reclen));
			eodp = (edirent_t *)((intptr_t)eodp + reclen);
		} else {
			/*
			 * Add normal entry:
			 */
			odp->d_ino = objnum;
			odp->d_reclen = reclen;
			odp->d_namlen = strlen(zap.za_name);
			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
			odp->d_type = type;
			odp = (dirent64_t *)((intptr_t)odp + reclen);
		}
		outcount += reclen;

		ASSERT(outcount <= bufsize);

		/* Prefetch znode */
		if (prefetch)
			dmu_prefetch(os, objnum, 0, 0, 0,
			    ZIO_PRIORITY_SYNC_READ);

	skip_entry:
		/*
		 * Move to the next entry, fill in the previous offset.
		 * Synthetic entries simply bump the offset; real entries
		 * advance the ZAP cursor and reserialize it.
		 */
		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
			zap_cursor_advance(&zc);
			offset = zap_cursor_serialize(&zc);
		} else {
			offset += 1;
		}

		/* Record the resume offset for the entry just emitted. */
		if (cooks != NULL) {
			*cooks++ = offset;
			ncooks--;
			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
		}
	}
	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */

	/* Subtract unused cookies */
	if (ncookies != NULL)
		*ncookies -= ncooks;

	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
		/* Entries were formatted in place; just consume the iovec. */
		iovp->iov_base += outcount;
		iovp->iov_len -= outcount;
		uio->uio_resid -= outcount;
	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
		/*
		 * Reset the pointer.
		 */
		offset = uio->uio_loffset;
	}

update:
	zap_cursor_fini(&zc);
	/* Free the staging buffer iff the staging path allocated one. */
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
		kmem_free(outbuf, bufsize);

	if (error == ENOENT)
		error = 0;

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	uio->uio_loffset = offset;
	ZFS_EXIT(zfsvfs);
	/* On failure the caller must not see a partially filled cookie array. */
	if (error != 0 && cookies != NULL) {
		free(*cookies, M_TEMP);
		*cookies = NULL;
		*ncookies = 0;
	}
	return (error);
}

ulong_t zfs_fsync_sync_cnt = 4;

/*
 * Flush all outstanding intent-log records for this file to stable
 * storage by committing the ZIL up through this znode's object id.
 * No-op (beyond the TSD hint) when sync is administratively disabled.
 */
static int
zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/* Hint to the ZIL that this thread is an fsync()er. */
	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);

	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);
		zil_commit(zfsvfs->z_log, zp->z_id);
		ZFS_EXIT(zfsvfs);
	}
	return (0);
}


/*
 * Get the requested file attributes and place them in the provided
 * vattr structure.
 *
 * IN:	vp	- vnode of file.
 *	vap	- va_mask identifies requested attributes.
 *		  If AT_XVATTR set, then optional attrs are requested
 *	flags	- ATTR_NOACLCHECK (CIFS server context)
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * OUT:	vap	- attribute values.
 *
 * RETURN:	0 (always succeeds).
 */
/* ARGSUSED */
static int
zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error = 0;
	uint32_t blksize;
	u_longlong_t nblocks;
	uint64_t links;
	uint64_t mtime[2], ctime[2], crtime[2], rdev;
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t *xoap = NULL;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	sa_bulk_attr_t bulk[4];
	int count = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);

	/*
	 * Fetch the timestamps (and rdev for devices) from the system
	 * attribute layer in a single bulk lookup.
	 */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
		    &rdev, 8);

	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
	 * Also, if we are the owner don't bother, since owner should
	 * always be allowed to read basic attributes of file.
	 */
	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
	    (vap->va_uid != crgetuid(cr))) {
		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
		    skipaclchk, cr)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * Return all attributes.  It's cheaper to provide the answer
	 * than to determine whether we were asked the question.
	 */

	vap->va_type = IFTOVT(zp->z_mode);
	vap->va_mode = zp->z_mode & ~S_IFMT;
#ifdef illumos
	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
#else
	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
#endif
	vap->va_nodeid = zp->z_id;
	/* The hidden .zfs directory adds one link to the fs root. */
	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
		links = zp->z_links + 1;
	else
		links = zp->z_links;
	vap->va_nlink = MIN(links, LINK_MAX);	/* nlink_t limit! */
	vap->va_size = zp->z_size;
#ifdef illumos
	vap->va_rdev = vp->v_rdev;
#else
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		vap->va_rdev = zfs_cmpldev(rdev);
#endif
	vap->va_seq = zp->z_seq;
	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
	vap->va_filerev = zp->z_seq;

	/*
	 * Add in any requested optional attributes and the create time.
	 * Also set the corresponding bits in the returned attribute bitmap.
	 */
	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
			xoap->xoa_archive =
			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
			XVA_SET_RTN(xvap, XAT_ARCHIVE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
			xoap->xoa_readonly =
			    ((zp->z_pflags & ZFS_READONLY) != 0);
			XVA_SET_RTN(xvap, XAT_READONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
			xoap->xoa_system =
			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
			XVA_SET_RTN(xvap, XAT_SYSTEM);
		}

		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
			xoap->xoa_hidden =
			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
			XVA_SET_RTN(xvap, XAT_HIDDEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
			xoap->xoa_nounlink =
			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
			XVA_SET_RTN(xvap, XAT_NOUNLINK);
		}

		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
			xoap->xoa_immutable =
			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
			xoap->xoa_appendonly =
			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
			XVA_SET_RTN(xvap, XAT_APPENDONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
			xoap->xoa_nodump =
			    ((zp->z_pflags & ZFS_NODUMP) != 0);
			XVA_SET_RTN(xvap, XAT_NODUMP);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
			xoap->xoa_opaque =
			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
			XVA_SET_RTN(xvap, XAT_OPAQUE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
			xoap->xoa_av_quarantined =
			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
			xoap->xoa_av_modified =
			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
		    vp->v_type == VREG) {
			zfs_sa_get_scanstamp(zp, xvap);
		}

		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
			XVA_SET_RTN(xvap, XAT_REPARSE);
		}
		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
			xoap->xoa_generation = zp->z_gen;
			XVA_SET_RTN(xvap, XAT_GEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
			xoap->xoa_offline =
			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
			XVA_SET_RTN(xvap, XAT_OFFLINE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
			xoap->xoa_sparse =
			    ((zp->z_pflags & ZFS_SPARSE) != 0);
			XVA_SET_RTN(xvap, XAT_SPARSE);
		}
	}

	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);


	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
	vap->va_blksize = blksize;
	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */

	if (zp->z_blksz == 0) {
		/*
		 * Block size hasn't been set; suggest maximal I/O transfers.
		 */
		vap->va_blksize = zfsvfs->z_max_blksz;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Set the file attributes to the values contained in the
 * vattr structure.
 *
 * IN:	vp	- vnode of file to be modified.
 *	vap	- new attribute values.
 *		  If AT_XVATTR set, then optional attrs are being set
 *	flags	- ATTR_UTIME set if non-default time values provided.
 *		- ATTR_NOACLCHECK (CIFS context only).
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - ctime updated, mtime updated if size changed.
 */
/* ARGSUSED */
static int
zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	zilog_t *zilog;
	dmu_tx_t *tx;
	vattr_t oldva;
	xvattr_t tmpxvattr;
	uint_t mask = vap->va_mask;
	uint_t saved_mask = 0;
	uint64_t saved_mode;
	int trim_mask = 0;
	uint64_t new_mode;
	uint64_t new_uid, new_gid;
	uint64_t xattr_obj;
	uint64_t mtime[2], ctime[2];
	znode_t *attrzp;	/* extended attribute dir, if ownership changes */
	int need_policy = FALSE;
	int err, err2;
	zfs_fuid_info_t *fuidp = NULL;
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t *xoap;
	zfs_acl_t *aclp;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	boolean_t fuid_dirtied = B_FALSE;
	sa_bulk_attr_t bulk[7], xattr_bulk[7];
	int count = 0, xattr_count = 0;

	if (mask == 0)
		return (0);

	if (mask & AT_NOSET)
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	zilog = zfsvfs->z_log;

	/*
	 * Make sure that if we have ephemeral uid/gid or xvattr specified
	 * that file system is at proper version level
	 */

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
	    (mask & AT_XVATTR))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	if (mask & AT_SIZE && vp->v_type == VDIR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EISDIR));
	}

	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * If this is an xvattr_t, then get a pointer to the structure of
	 * optional attributes.  If this is NULL, then we have a vattr_t.
	 */
	xoap = xva_getxoptattr(xvap);

	xva_init(&tmpxvattr);

	/*
	 * Immutable files can only alter immutable bit and atime
	 */
	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/*
	 * Note: ZFS_READONLY is handled in zfs_zaccess_common.
	 */

	/*
	 * Verify timestamps doesn't overflow 32 bits.
	 * ZFS can handle large timestamps, but 32bit syscalls can't
	 * handle times greater than 2039.  This check should be removed
	 * once large timestamps are fully supported.
	 */
	if (mask & (AT_ATIME | AT_MTIME)) {
		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EOVERFLOW));
		}
	}
	if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
	    TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EOVERFLOW));
	}

	attrzp = NULL;
	aclp = NULL;

	/* Can this be moved to before the top label? */
	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EROFS));
	}

	/*
	 * First validate permissions
	 */

	if (mask & AT_SIZE) {
		/*
		 * XXX - Note, we are not providing any open
		 * mode flags here (like FNDELAY), so we may
		 * block if there are locks present... this
		 * should be addressed in openat().
		 */
		/* XXX - would it be OK to generate a log record here? */
		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
	}

	if (mask & (AT_ATIME|AT_MTIME) ||
	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
		    skipaclchk, cr);
	}

	if (mask & (AT_UID|AT_GID)) {
		int idmask = (mask & (AT_UID|AT_GID));
		int take_owner;
		int take_group;

		/*
		 * NOTE: even if a new mode is being set,
		 * we may clear S_ISUID/S_ISGID bits.
		 */

		if (!(mask & AT_MODE))
			vap->va_mode = zp->z_mode;

		/*
		 * Take ownership or chgrp to group we are a member of
		 */

		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
		take_group = (mask & AT_GID) &&
		    zfs_groupmember(zfsvfs, vap->va_gid, cr);

		/*
		 * If both AT_UID and AT_GID are set then take_owner and
		 * take_group must both be set in order to allow taking
		 * ownership.
		 *
		 * Otherwise, send the check through secpolicy_vnode_setattr()
		 *
		 */

		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
		    ((idmask == AT_UID) && take_owner) ||
		    ((idmask == AT_GID) && take_group)) {
			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
			    skipaclchk, cr) == 0) {
				/*
				 * Remove setuid/setgid for non-privileged users
				 */
				secpolicy_setid_clear(vap, vp, cr);
				trim_mask = (mask & (AT_UID|AT_GID));
			} else {
				need_policy = TRUE;
			}
		} else {
			need_policy = TRUE;
		}
	}

	oldva.va_mode = zp->z_mode;
	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
	if (mask & AT_XVATTR) {
		/*
		 * Update xvattr mask to include only those attributes
		 * that are actually changing.
		 *
		 * the bits will be restored prior to actually setting
		 * the attributes so the caller thinks they were set.
		 */
		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
			if (xoap->xoa_appendonly !=
			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
			if (xoap->xoa_nounlink !=
			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
			if (xoap->xoa_immutable !=
			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
			if (xoap->xoa_nodump !=
			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_NODUMP);
				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
			if (xoap->xoa_av_modified !=
			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
			if ((vp->v_type != VREG &&
			    xoap->xoa_av_quarantined) ||
			    xoap->xoa_av_quarantined !=
			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
			}
		}

		/* The reparse flag can never be changed via setattr. */
		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EPERM));
		}

		if (need_policy == FALSE &&
		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
			need_policy = TRUE;
		}
	}

	if (mask & AT_MODE) {
		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
			err = secpolicy_setid_setsticky_clear(vp, vap,
			    &oldva, cr);
			if (err) {
				ZFS_EXIT(zfsvfs);
				return (err);
			}
			trim_mask |= AT_MODE;
		} else {
			need_policy = TRUE;
		}
	}

	if (need_policy) {
		/*
		 * If trim_mask is set then take ownership
		 * has been granted or write_acl is present and user
		 * has the ability to modify mode.  In that case remove
		 * UID|GID and or MODE from mask so that
		 * secpolicy_vnode_setattr() doesn't revoke it.
		 */

		if (trim_mask) {
			saved_mask = vap->va_mask;
			vap->va_mask &= ~trim_mask;
			if (trim_mask & AT_MODE) {
				/*
				 * Save the mode, as secpolicy_vnode_setattr()
				 * will overwrite it with ova.va_mode.
				 */
				saved_mode = vap->va_mode;
			}
		}
		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}

		if (trim_mask) {
			vap->va_mask |= saved_mask;
			if (trim_mask & AT_MODE) {
				/*
				 * Recover the mode after
				 * secpolicy_vnode_setattr().
				 */
				vap->va_mode = saved_mode;
			}
		}
	}

	/*
	 * secpolicy_vnode_setattr, or take ownership may have
	 * changed va_mask
	 */
	mask = vap->va_mask;

	if ((mask & (AT_UID | AT_GID))) {
		/*
		 * Ownership changes must also be applied to the extended
		 * attribute directory, if one exists; look it up and lock it.
		 * NOTE: gotos to out2 below happen before the tx is created,
		 * so the out2 path performs no tx abort/commit.
		 */
		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
		    &xattr_obj, sizeof (xattr_obj));

		if (err == 0 && xattr_obj) {
			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
			if (err == 0) {
				err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
				if (err != 0)
					vrele(ZTOV(attrzp));
			}
			if (err)
				goto out2;
		}
		if (mask & AT_UID) {
			new_uid = zfs_fuid_create(zfsvfs,
			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
			if (new_uid != zp->z_uid &&
			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
				if (attrzp)
					vput(ZTOV(attrzp));
				err = SET_ERROR(EDQUOT);
				goto out2;
			}
		}

		if (mask & AT_GID) {
			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
			    cr, ZFS_GROUP, &fuidp);
			if (new_gid != zp->z_gid &&
			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
				if (attrzp)
					vput(ZTOV(attrzp));
				err = SET_ERROR(EDQUOT);
				goto out2;
			}
		}
	}
	/*
	 * From here on any failure must goto out so the transaction is
	 * aborted (see the out label below).
	 */
	tx = dmu_tx_create(zfsvfs->z_os);

	if (mask & AT_MODE) {
		uint64_t pmode = zp->z_mode;
		uint64_t acl_obj;
		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);

		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
			err = SET_ERROR(EPERM);
			goto out;
		}

		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
			goto out;

		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
			/*
			 * Are we upgrading ACL from old V0 format
			 * to V1 format?
			 */
			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
			    zfs_znode_acl_version(zp) ==
			    ZFS_ACL_VERSION_INITIAL) {
				dmu_tx_hold_free(tx, acl_obj, 0,
				    DMU_OBJECT_END);
				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
				    0, aclp->z_acl_bytes);
			} else {
				dmu_tx_hold_write(tx, acl_obj, 0,
				    aclp->z_acl_bytes);
			}
		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, aclp->z_acl_bytes);
		}
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
	} else {
		if ((mask & AT_XVATTR) &&
		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		else
			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	}

	if (attrzp) {
		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
	}

	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);

	zfs_sa_upgrade_txholds(tx, zp);

	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err)
		goto out;

	count = 0;
	/*
	 * Set each attribute requested.
	 * We group settings according to the locks they need to acquire.
	 *
	 * Note: you cannot set ctime directly, although it will be
	 * updated as a side-effect of calling this function.
	 */

	if (mask & (AT_UID|AT_GID|AT_MODE))
		mutex_enter(&zp->z_acl_lock);

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, sizeof (zp->z_pflags));

	if (attrzp) {
		if (mask & (AT_UID|AT_GID|AT_MODE))
			mutex_enter(&attrzp->z_acl_lock);
		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
		    sizeof (attrzp->z_pflags));
	}

	if (mask & (AT_UID|AT_GID)) {

		if (mask & AT_UID) {
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
			    &new_uid, sizeof (new_uid));
			zp->z_uid = new_uid;
			if (attrzp) {
				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
				    sizeof (new_uid));
				attrzp->z_uid = new_uid;
			}
		}

		if (mask & AT_GID) {
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
			    NULL, &new_gid, sizeof (new_gid));
			zp->z_gid = new_gid;
			if (attrzp) {
				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
				    sizeof (new_gid));
				attrzp->z_gid = new_gid;
			}
		}
		if (!(mask & AT_MODE)) {
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
			    NULL, &new_mode, sizeof (new_mode));
			new_mode = zp->z_mode;
		}
		err = zfs_acl_chown_setattr(zp);
		ASSERT(err == 0);
		if (attrzp) {
			err = zfs_acl_chown_setattr(attrzp);
			ASSERT(err == 0);
		}
	}

	if (mask & AT_MODE) {
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
		    &new_mode, sizeof (new_mode));
		zp->z_mode = new_mode;
		/* aclp was built by zfs_acl_chmod_setattr() above. */
		ASSERT3U((uintptr_t)aclp, !=, 0);
		err = zfs_aclset_common(zp, aclp, cr, tx);
		ASSERT0(err);
		if (zp->z_acl_cached)
			zfs_acl_free(zp->z_acl_cached);
		/* Ownership of aclp passes to the znode's ACL cache. */
		zp->z_acl_cached = aclp;
		aclp = NULL;
	}


	if (mask & AT_ATIME) {
		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
		    &zp->z_atime, sizeof (zp->z_atime));
	}

	if (mask & AT_MTIME) {
		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
		    mtime, sizeof (mtime));
	}

	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
		    NULL, mtime, sizeof (mtime));
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    &ctime, sizeof (ctime));
		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);
	} else if (mask != 0) {
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    &ctime, sizeof (ctime));
		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
		    B_TRUE);
		if (attrzp) {
			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
			    SA_ZPL_CTIME(zfsvfs), NULL,
			    &ctime, sizeof (ctime));
			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
			    mtime, ctime, B_TRUE);
		}
	}
	/*
	 * Do this after setting timestamps to prevent timestamp
	 * update from toggling bit
	 */

	if (xoap && (mask & AT_XVATTR)) {

		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
			xoap->xoa_createtime = vap->va_birthtime;
		/*
		 * restore trimmed off masks
		 * so that return masks can be set for caller.
		 */

		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
			XVA_SET_REQ(xvap, XAT_APPENDONLY);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
			XVA_SET_REQ(xvap, XAT_NOUNLINK);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
			XVA_SET_REQ(xvap, XAT_NODUMP);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
			ASSERT(vp->v_type == VREG);

		zfs_xvattr_set(zp, xvap, tx);
	}

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	if (mask != 0)
		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);

	if (mask & (AT_UID|AT_GID|AT_MODE))
		mutex_exit(&zp->z_acl_lock);

	if (attrzp) {
		if (mask & (AT_UID|AT_GID|AT_MODE))
			mutex_exit(&attrzp->z_acl_lock);
	}
out:
	/* tx exists here: commit it (pushing the SA updates) or abort it. */
	if (err == 0 && attrzp) {
		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
		    xattr_count, tx);
		ASSERT(err2 == 0);
	}

	if (attrzp)
		vput(ZTOV(attrzp));

	if (aclp)
		zfs_acl_free(aclp);

	if (fuidp) {
		zfs_fuid_info_free(fuidp);
		fuidp = NULL;
	}

	if (err) {
		dmu_tx_abort(tx);
	} else {
		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		dmu_tx_commit(tx);
	}

out2:
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (err);
}

/*
 * We acquire all but fdvp locks using non-blocking acquisitions.  If we
 * fail to acquire any lock in the path we will drop all held locks,
 * acquire the new lock in a blocking fashion, and then release it and
 * restart the rename.
This acquire/release step
 * ensures that we do not spin on a lock waiting for release.  On error
 * release all vnode locks and decrement references the way
 * tmpfs_rename() would do.
 */
static int
zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
    struct vnode *tdvp, struct vnode **tvpp,
    const struct componentname *scnp, const struct componentname *tcnp)
{
	zfsvfs_t	*zfsvfs;
	struct vnode	*nvp, *svp, *tvp;
	znode_t		*sdzp, *tdzp, *szp, *tzp;
	const char	*snm = scnp->cn_nameptr;
	const char	*tnm = tcnp->cn_nameptr;
	int error;

	/*
	 * Drop the locks the caller handed us; we re-acquire all four
	 * (sdvp, tdvp, source vnode, target vnode) in a deadlock-free
	 * order below, restarting from "relock" whenever a non-blocking
	 * acquisition fails.
	 */
	VOP_UNLOCK(tdvp, 0);
	if (*tvpp != NULL && *tvpp != tdvp)
		VOP_UNLOCK(*tvpp, 0);

relock:
	/* Only sdvp is taken with a blocking lock; the rest use LK_NOWAIT. */
	error = vn_lock(sdvp, LK_EXCLUSIVE);
	if (error)
		goto out;
	sdzp = VTOZ(sdvp);

	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		VOP_UNLOCK(sdvp, 0);
		if (error != EBUSY)
			goto out;
		/*
		 * Wait for the contended lock in a blocking fashion, then
		 * immediately release it and retry the whole sequence so the
		 * locks are always acquired in the same order.
		 */
		error = vn_lock(tdvp, LK_EXCLUSIVE);
		if (error)
			goto out;
		VOP_UNLOCK(tdvp, 0);
		goto relock;
	}
	tdzp = VTOZ(tdvp);

	/*
	 * Before using sdzp and tdzp we must ensure that they are live.
	 * As a porting legacy from illumos we have two things to worry
	 * about.  One is typical for FreeBSD and it is that the vnode is
	 * not reclaimed (doomed).  The other is that the znode is live.
	 * The current code can invalidate the znode without acquiring the
	 * corresponding vnode lock if the object represented by the znode
	 * and vnode is no longer valid after a rollback or receive operation.
	 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
	 * that protects the znodes from the invalidation.
	 */
	zfsvfs = sdzp->z_zfsvfs;
	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
	ZFS_ENTER(zfsvfs);

	/*
	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
	 * bypassing the cleanup code in the case of an error.
	 */
	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		error = SET_ERROR(EIO);
		goto out;
	}

	/*
	 * Re-resolve svp to be certain it still exists and fetch the
	 * correct vnode.
	 */
	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
	if (error != 0) {
		/* Source entry invalid or not there. */
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
			error = SET_ERROR(EINVAL);
		goto out;
	}
	svp = ZTOV(szp);

	/*
	 * Re-resolve tvp, if it disappeared we just carry on.
	 */
	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		vrele(svp);
		if ((tcnp->cn_flags & ISDOTDOT) != 0)
			error = SET_ERROR(EINVAL);
		goto out;
	}
	if (tzp != NULL)
		tvp = ZTOV(tzp);
	else
		tvp = NULL;

	/*
	 * At present the vnode locks must be acquired before z_teardown_lock,
	 * although it would be more logical to use the opposite order.
	 */
	ZFS_EXIT(zfsvfs);

	/*
	 * Now try acquire locks on svp and tvp.
	 */
	nvp = svp;
	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		if (tvp != NULL)
			vrele(tvp);
		if (error != EBUSY) {
			vrele(nvp);
			goto out;
		}
		error = vn_lock(nvp, LK_EXCLUSIVE);
		if (error != 0) {
			vrele(nvp);
			goto out;
		}
		VOP_UNLOCK(nvp, 0);
		/*
		 * Concurrent rename race.
		 * XXX ?
		 */
		if (nvp == tdvp) {
			vrele(nvp);
			error = SET_ERROR(EINVAL);
			goto out;
		}
		vrele(*svpp);
		*svpp = nvp;
		goto relock;
	}
	/* Hand the freshly looked-up, locked source vnode to the caller. */
	vrele(*svpp);
	*svpp = nvp;

	if (*tvpp != NULL)
		vrele(*tvpp);
	*tvpp = NULL;
	if (tvp != NULL) {
		nvp = tvp;
		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
		if (error != 0) {
			VOP_UNLOCK(sdvp, 0);
			VOP_UNLOCK(tdvp, 0);
			VOP_UNLOCK(*svpp, 0);
			if (error != EBUSY) {
				vrele(nvp);
				goto out;
			}
			error = vn_lock(nvp, LK_EXCLUSIVE);
			if (error != 0) {
				vrele(nvp);
				goto out;
			}
			vput(nvp);
			goto relock;
		}
		*tvpp = nvp;
	}

	/* Success: sdvp, tdvp, *svpp and (if non-NULL) *tvpp are all locked. */
	return (0);

out:
	/* Error: no vnode locks are held on this path. */
	return (error);
}

/*
 * Note that we must use VRELE_ASYNC in this function as it walks
 * up the directory tree and vrele may need to acquire an exclusive
 * lock if a last reference to a vnode is dropped.
 */
static int
zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
{
	zfsvfs_t	*zfsvfs;
	znode_t		*zp, *zp1;
	uint64_t	parent;
	int		error;

	zfsvfs = tdzp->z_zfsvfs;
	if (tdzp == szp)
		return (SET_ERROR(EINVAL));
	if (tdzp == sdzp)
		return (0);
	if (tdzp->z_id == zfsvfs->z_root)
		return (0);
	/*
	 * Walk from the target directory up to the root (or the source
	 * directory) via the SA parent attribute; finding szp on the way
	 * means the rename would move a directory under itself.
	 */
	zp = tdzp;
	for (;;) {
		ASSERT(!zp->z_unlinked);
		if ((error = sa_lookup(zp->z_sa_hdl,
		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
			break;

		if (parent == szp->z_id) {
			error = SET_ERROR(EINVAL);
			break;
		}
		if (parent == zfsvfs->z_root)
			break;
		if (parent == sdzp->z_id)
			break;

		error = zfs_zget(zfsvfs, parent, &zp1);
		if (error != 0)
			break;

		if (zp != tdzp)
			VN_RELE_ASYNC(ZTOV(zp),
			    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
		zp = zp1;
	}

	if (error == ENOTDIR)
		panic("checkpath: .. not a directory\n");
	if (zp != tdzp)
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
	return (error);
}

/*
 * Move an entry from the provided source directory to the target
 * directory.  Change the entry name as indicated.
 *
 * IN:	sdvp	- Source directory containing the "old entry".
 *	snm	- Old entry name.
 *	tdvp	- Target directory to contain the "new entry".
 *	tnm	- New entry name.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	sdvp,tdvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
    cred_t *cr)
{
	zfsvfs_t	*zfsvfs;
	znode_t		*sdzp, *tdzp, *szp, *tzp;
	zilog_t		*zilog = NULL;
	dmu_tx_t	*tx;
	char		*snm = scnp->cn_nameptr;
	char		*tnm = tcnp->cn_nameptr;
	int		error = 0;

	/* Reject renames across filesystems. */
	if ((*svpp)->v_mount != tdvp->v_mount ||
	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	if (zfsctl_is_node(tdvp)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	/*
	 * Lock all four vnodes to ensure safety and semantics of renaming.
	 */
	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
	if (error != 0) {
		/* no vnodes are locked in the case of error here */
		return (error);
	}

	tdzp = VTOZ(tdvp);
	sdzp = VTOZ(sdvp);
	zfsvfs = tdzp->z_zfsvfs;
	zilog = zfsvfs->z_log;

	/*
	 * After we re-enter ZFS_ENTER() we will have to revalidate all
	 * znodes involved.
	 */
	ZFS_ENTER(zfsvfs);

	if (zfsvfs->z_utf8 && u8_validate(tnm,
	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		error = SET_ERROR(EILSEQ);
		goto unlockout;
	}

	/* If source and target are the same file, there is nothing to do. */
	if ((*svpp) == (*tvpp)) {
		error = 0;
		goto unlockout;
	}

	/* Refuse to move a mount point (in either direction). */
	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
	    (*tvpp)->v_mountedhere != NULL)) {
		error = SET_ERROR(EXDEV);
		goto unlockout;
	}

	/*
	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
	 * bypassing the cleanup code in the case of an error.
	 */
	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
		error = SET_ERROR(EIO);
		goto unlockout;
	}

	szp = VTOZ(*svpp);
	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
		error = SET_ERROR(EIO);
		goto unlockout;
	}

	/*
	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/outof an attribute directory.
	 * See the comment in zfs_link() for why this is considered bad.
	 */
	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
		error = SET_ERROR(EINVAL);
		goto unlockout;
	}

	/*
	 * Must have write access at the source to remove the old entry
	 * and write access at the target to create the new entry.
	 * Note that if target and source are the same, this can be
	 * done in a single check.
	 */
	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
		goto unlockout;

	if ((*svpp)->v_type == VDIR) {
		/*
		 * Avoid ".", "..", and aliases of "." for obvious reasons.
		 */
		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
		    sdzp == szp ||
		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
			error = EINVAL;
			goto unlockout;
		}

		/*
		 * Check to make sure rename is valid.
		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
		 */
		if (error = zfs_rename_check(szp, sdzp, tdzp))
			goto unlockout;
	}

	/*
	 * Does target exist?
	 */
	if (tzp) {
		/*
		 * Source and target must be the same type.
		 */
		if ((*svpp)->v_type == VDIR) {
			if ((*tvpp)->v_type != VDIR) {
				error = SET_ERROR(ENOTDIR);
				goto unlockout;
			} else {
				cache_purge(tdvp);
				if (sdvp != tdvp)
					cache_purge(sdvp);
			}
		} else {
			if ((*tvpp)->v_type == VDIR) {
				error = SET_ERROR(EISDIR);
				goto unlockout;
			}
		}
	}

	/*
	 * NOTE(review): 'ct' is not a parameter of this FreeBSD variant of
	 * zfs_rename(); the vnevent_*() calls below presumably expand to
	 * no-op macros that never evaluate their arguments — confirm against
	 * sys/cddl/compat headers.
	 */
	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
	if (tzp)
		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);

	/*
	 * notify the target directory if it is not the same
	 * as source directory.
	 */
	if (tdvp != sdvp) {
		vnevent_rename_dest_dir(tdvp, ct);
	}

	/*
	 * Assemble one transaction covering both directory ZAPs, all
	 * affected znodes' SAs, and the unlinked set (in case the old
	 * target is destroyed).
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp) {
		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tdzp);
	}
	if (tzp) {
		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tzp);
	}

	zfs_sa_upgrade_txholds(tx, szp);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		goto unlockout;
	}


	if (tzp)	/* Attempt to remove the existing target */
		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);

	if (error == 0) {
		error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
		if (error == 0) {
			szp->z_pflags |= ZFS_AV_MODIFIED;

			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
			ASSERT0(error);

			error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
			    NULL);
			if (error == 0) {
				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
				    snm, tdzp, tnm, szp);

				/*
				 * Update path information for the target vnode
				 */
				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
			} else {
				/*
				 * At this point, we have successfully created
				 * the target name, but have failed to remove
				 * the source name.  Since the create was done
				 * with the ZRENAMING flag, there are
				 * complications; for one, the link count is
				 * wrong.  The easiest way to deal with this
				 * is to remove the newly created target, and
				 * return the original error.  This must
				 * succeed; fortunately, it is very unlikely to
				 * fail, since we just created it.
				 */
				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
				    ZRENAMING, NULL), ==, 0);
			}
		}
		if (error == 0) {
			cache_purge(*svpp);
			if (*tvpp != NULL)
				cache_purge(*tvpp);
			cache_purge_negative(tdvp);
		}
	}

	dmu_tx_commit(tx);

unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
	ZFS_EXIT(zfsvfs);
	VOP_UNLOCK(*svpp, 0);
	VOP_UNLOCK(sdvp, 0);

out:				/* original two vnodes are locked */
	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	if (*tvpp != NULL)
		VOP_UNLOCK(*tvpp, 0);
	if (tdvp != *tvpp)
		VOP_UNLOCK(tdvp, 0);
	return (error);
}

/*
 * Insert the indicated symbolic reference entry into the directory.
 *
 * IN:	dvp	- Directory to contain new symbolic link.
 *	link	- Name for new symlink entry.
 *	vap	- Attributes of new entry.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 on success, error code on failure.
3980 * 3981 * Timestamps: 3982 * dvp - ctime|mtime updated 3983 */ 3984/*ARGSUSED*/ 3985static int 3986zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, 3987 cred_t *cr, kthread_t *td) 3988{ 3989 znode_t *zp, *dzp = VTOZ(dvp); 3990 dmu_tx_t *tx; 3991 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 3992 zilog_t *zilog; 3993 uint64_t len = strlen(link); 3994 int error; 3995 zfs_acl_ids_t acl_ids; 3996 boolean_t fuid_dirtied; 3997 uint64_t txtype = TX_SYMLINK; 3998 int flags = 0; 3999 4000 ASSERT(vap->va_type == VLNK); 4001 4002 ZFS_ENTER(zfsvfs); 4003 ZFS_VERIFY_ZP(dzp); 4004 zilog = zfsvfs->z_log; 4005 4006 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 4007 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 4008 ZFS_EXIT(zfsvfs); 4009 return (SET_ERROR(EILSEQ)); 4010 } 4011 4012 if (len > MAXPATHLEN) { 4013 ZFS_EXIT(zfsvfs); 4014 return (SET_ERROR(ENAMETOOLONG)); 4015 } 4016 4017 if ((error = zfs_acl_ids_create(dzp, 0, 4018 vap, cr, NULL, &acl_ids)) != 0) { 4019 ZFS_EXIT(zfsvfs); 4020 return (error); 4021 } 4022 4023 /* 4024 * Attempt to lock directory; fail if entry already exists. 
4025 */ 4026 error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); 4027 if (error) { 4028 zfs_acl_ids_free(&acl_ids); 4029 ZFS_EXIT(zfsvfs); 4030 return (error); 4031 } 4032 4033 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 4034 zfs_acl_ids_free(&acl_ids); 4035 ZFS_EXIT(zfsvfs); 4036 return (error); 4037 } 4038 4039 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { 4040 zfs_acl_ids_free(&acl_ids); 4041 ZFS_EXIT(zfsvfs); 4042 return (SET_ERROR(EDQUOT)); 4043 } 4044 4045 getnewvnode_reserve(1); 4046 tx = dmu_tx_create(zfsvfs->z_os); 4047 fuid_dirtied = zfsvfs->z_fuid_dirty; 4048 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); 4049 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 4050 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 4051 ZFS_SA_BASE_ATTR_SIZE + len); 4052 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 4053 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 4054 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 4055 acl_ids.z_aclp->z_acl_bytes); 4056 } 4057 if (fuid_dirtied) 4058 zfs_fuid_txhold(zfsvfs, tx); 4059 error = dmu_tx_assign(tx, TXG_WAIT); 4060 if (error) { 4061 zfs_acl_ids_free(&acl_ids); 4062 dmu_tx_abort(tx); 4063 getnewvnode_drop_reserve(); 4064 ZFS_EXIT(zfsvfs); 4065 return (error); 4066 } 4067 4068 /* 4069 * Create a new object for the symlink. 4070 * for version 4 ZPL datsets the symlink will be an SA attribute 4071 */ 4072 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 4073 4074 if (fuid_dirtied) 4075 zfs_fuid_sync(zfsvfs, tx); 4076 4077 if (zp->z_is_sa) 4078 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), 4079 link, len, tx); 4080 else 4081 zfs_sa_symlink(zp, link, len, tx); 4082 4083 zp->z_size = len; 4084 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 4085 &zp->z_size, sizeof (zp->z_size), tx); 4086 /* 4087 * Insert the new object into the directory. 
4088 */ 4089 (void) zfs_link_create(dzp, name, zp, tx, ZNEW); 4090 4091 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); 4092 *vpp = ZTOV(zp); 4093 4094 zfs_acl_ids_free(&acl_ids); 4095 4096 dmu_tx_commit(tx); 4097 4098 getnewvnode_drop_reserve(); 4099 4100 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4101 zil_commit(zilog, 0); 4102 4103 ZFS_EXIT(zfsvfs); 4104 return (error); 4105} 4106 4107/* 4108 * Return, in the buffer contained in the provided uio structure, 4109 * the symbolic path referred to by vp. 4110 * 4111 * IN: vp - vnode of symbolic link. 4112 * uio - structure to contain the link path. 4113 * cr - credentials of caller. 4114 * ct - caller context 4115 * 4116 * OUT: uio - structure containing the link path. 4117 * 4118 * RETURN: 0 on success, error code on failure. 4119 * 4120 * Timestamps: 4121 * vp - atime updated 4122 */ 4123/* ARGSUSED */ 4124static int 4125zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) 4126{ 4127 znode_t *zp = VTOZ(vp); 4128 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4129 int error; 4130 4131 ZFS_ENTER(zfsvfs); 4132 ZFS_VERIFY_ZP(zp); 4133 4134 if (zp->z_is_sa) 4135 error = sa_lookup_uio(zp->z_sa_hdl, 4136 SA_ZPL_SYMLINK(zfsvfs), uio); 4137 else 4138 error = zfs_sa_readlink(zp, uio); 4139 4140 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 4141 4142 ZFS_EXIT(zfsvfs); 4143 return (error); 4144} 4145 4146/* 4147 * Insert a new entry into directory tdvp referencing svp. 4148 * 4149 * IN: tdvp - Directory to contain new entry. 4150 * svp - vnode of new entry. 4151 * name - name of new entry. 4152 * cr - credentials of caller. 4153 * ct - caller context 4154 * 4155 * RETURN: 0 on success, error code on failure. 
4156 * 4157 * Timestamps: 4158 * tdvp - ctime|mtime updated 4159 * svp - ctime updated 4160 */ 4161/* ARGSUSED */ 4162static int 4163zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, 4164 caller_context_t *ct, int flags) 4165{ 4166 znode_t *dzp = VTOZ(tdvp); 4167 znode_t *tzp, *szp; 4168 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 4169 zilog_t *zilog; 4170 dmu_tx_t *tx; 4171 int error; 4172 uint64_t parent; 4173 uid_t owner; 4174 4175 ASSERT(tdvp->v_type == VDIR); 4176 4177 ZFS_ENTER(zfsvfs); 4178 ZFS_VERIFY_ZP(dzp); 4179 zilog = zfsvfs->z_log; 4180 4181 /* 4182 * POSIX dictates that we return EPERM here. 4183 * Better choices include ENOTSUP or EISDIR. 4184 */ 4185 if (svp->v_type == VDIR) { 4186 ZFS_EXIT(zfsvfs); 4187 return (SET_ERROR(EPERM)); 4188 } 4189 4190 szp = VTOZ(svp); 4191 ZFS_VERIFY_ZP(szp); 4192 4193 if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) { 4194 ZFS_EXIT(zfsvfs); 4195 return (SET_ERROR(EPERM)); 4196 } 4197 4198 /* Prevent links to .zfs/shares files */ 4199 4200 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 4201 &parent, sizeof (uint64_t))) != 0) { 4202 ZFS_EXIT(zfsvfs); 4203 return (error); 4204 } 4205 if (parent == zfsvfs->z_shares_dir) { 4206 ZFS_EXIT(zfsvfs); 4207 return (SET_ERROR(EPERM)); 4208 } 4209 4210 if (zfsvfs->z_utf8 && u8_validate(name, 4211 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 4212 ZFS_EXIT(zfsvfs); 4213 return (SET_ERROR(EILSEQ)); 4214 } 4215 4216 /* 4217 * We do not support links between attributes and non-attributes 4218 * because of the potential security risk of creating links 4219 * into "normal" file space in order to circumvent restrictions 4220 * imposed in attribute space. 
4221 */ 4222 if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { 4223 ZFS_EXIT(zfsvfs); 4224 return (SET_ERROR(EINVAL)); 4225 } 4226 4227 4228 owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); 4229 if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) { 4230 ZFS_EXIT(zfsvfs); 4231 return (SET_ERROR(EPERM)); 4232 } 4233 4234 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 4235 ZFS_EXIT(zfsvfs); 4236 return (error); 4237 } 4238 4239 /* 4240 * Attempt to lock directory; fail if entry already exists. 4241 */ 4242 error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW); 4243 if (error) { 4244 ZFS_EXIT(zfsvfs); 4245 return (error); 4246 } 4247 4248 tx = dmu_tx_create(zfsvfs->z_os); 4249 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 4250 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 4251 zfs_sa_upgrade_txholds(tx, szp); 4252 zfs_sa_upgrade_txholds(tx, dzp); 4253 error = dmu_tx_assign(tx, TXG_WAIT); 4254 if (error) { 4255 dmu_tx_abort(tx); 4256 ZFS_EXIT(zfsvfs); 4257 return (error); 4258 } 4259 4260 error = zfs_link_create(dzp, name, szp, tx, 0); 4261 4262 if (error == 0) { 4263 uint64_t txtype = TX_LINK; 4264 zfs_log_link(zilog, tx, txtype, dzp, szp, name); 4265 } 4266 4267 dmu_tx_commit(tx); 4268 4269 if (error == 0) { 4270 vnevent_link(svp, ct); 4271 } 4272 4273 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4274 zil_commit(zilog, 0); 4275 4276 ZFS_EXIT(zfsvfs); 4277 return (error); 4278} 4279 4280 4281/*ARGSUSED*/ 4282void 4283zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 4284{ 4285 znode_t *zp = VTOZ(vp); 4286 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4287 int error; 4288 4289 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 4290 if (zp->z_sa_hdl == NULL) { 4291 /* 4292 * The fs has been unmounted, or we did a 4293 * suspend/resume and this file no longer exists. 
4294 */ 4295 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4296 vrecycle(vp); 4297 return; 4298 } 4299 4300 if (zp->z_unlinked) { 4301 /* 4302 * Fast path to recycle a vnode of a removed file. 4303 */ 4304 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4305 vrecycle(vp); 4306 return; 4307 } 4308 4309 if (zp->z_atime_dirty && zp->z_unlinked == 0) { 4310 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 4311 4312 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 4313 zfs_sa_upgrade_txholds(tx, zp); 4314 error = dmu_tx_assign(tx, TXG_WAIT); 4315 if (error) { 4316 dmu_tx_abort(tx); 4317 } else { 4318 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), 4319 (void *)&zp->z_atime, sizeof (zp->z_atime), tx); 4320 zp->z_atime_dirty = 0; 4321 dmu_tx_commit(tx); 4322 } 4323 } 4324 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4325} 4326 4327 4328CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); 4329CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); 4330 4331/*ARGSUSED*/ 4332static int 4333zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) 4334{ 4335 znode_t *zp = VTOZ(vp); 4336 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4337 uint32_t gen; 4338 uint64_t gen64; 4339 uint64_t object = zp->z_id; 4340 zfid_short_t *zfid; 4341 int size, i, error; 4342 4343 ZFS_ENTER(zfsvfs); 4344 ZFS_VERIFY_ZP(zp); 4345 4346 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), 4347 &gen64, sizeof (uint64_t))) != 0) { 4348 ZFS_EXIT(zfsvfs); 4349 return (error); 4350 } 4351 4352 gen = (uint32_t)gen64; 4353 4354 size = (zfsvfs->z_parent != zfsvfs) ? 
LONG_FID_LEN : SHORT_FID_LEN; 4355 4356#ifdef illumos 4357 if (fidp->fid_len < size) { 4358 fidp->fid_len = size; 4359 ZFS_EXIT(zfsvfs); 4360 return (SET_ERROR(ENOSPC)); 4361 } 4362#else 4363 fidp->fid_len = size; 4364#endif 4365 4366 zfid = (zfid_short_t *)fidp; 4367 4368 zfid->zf_len = size; 4369 4370 for (i = 0; i < sizeof (zfid->zf_object); i++) 4371 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 4372 4373 /* Must have a non-zero generation number to distinguish from .zfs */ 4374 if (gen == 0) 4375 gen = 1; 4376 for (i = 0; i < sizeof (zfid->zf_gen); i++) 4377 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 4378 4379 if (size == LONG_FID_LEN) { 4380 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); 4381 zfid_long_t *zlfid; 4382 4383 zlfid = (zfid_long_t *)fidp; 4384 4385 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 4386 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); 4387 4388 /* XXX - this should be the generation number for the objset */ 4389 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 4390 zlfid->zf_setgen[i] = 0; 4391 } 4392 4393 ZFS_EXIT(zfsvfs); 4394 return (0); 4395} 4396 4397static int 4398zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, 4399 caller_context_t *ct) 4400{ 4401 znode_t *zp, *xzp; 4402 zfsvfs_t *zfsvfs; 4403 int error; 4404 4405 switch (cmd) { 4406 case _PC_LINK_MAX: 4407 *valp = INT_MAX; 4408 return (0); 4409 4410 case _PC_FILESIZEBITS: 4411 *valp = 64; 4412 return (0); 4413#ifdef illumos 4414 case _PC_XATTR_EXISTS: 4415 zp = VTOZ(vp); 4416 zfsvfs = zp->z_zfsvfs; 4417 ZFS_ENTER(zfsvfs); 4418 ZFS_VERIFY_ZP(zp); 4419 *valp = 0; 4420 error = zfs_dirent_lookup(zp, "", &xzp, 4421 ZXATTR | ZEXISTS | ZSHARED); 4422 if (error == 0) { 4423 if (!zfs_dirempty(xzp)) 4424 *valp = 1; 4425 vrele(ZTOV(xzp)); 4426 } else if (error == ENOENT) { 4427 /* 4428 * If there aren't extended attributes, it's the 4429 * same as having zero of them. 
4430 */ 4431 error = 0; 4432 } 4433 ZFS_EXIT(zfsvfs); 4434 return (error); 4435 4436 case _PC_SATTR_ENABLED: 4437 case _PC_SATTR_EXISTS: 4438 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && 4439 (vp->v_type == VREG || vp->v_type == VDIR); 4440 return (0); 4441 4442 case _PC_ACCESS_FILTERING: 4443 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) && 4444 vp->v_type == VDIR; 4445 return (0); 4446 4447 case _PC_ACL_ENABLED: 4448 *valp = _ACL_ACE_ENABLED; 4449 return (0); 4450#endif /* illumos */ 4451 case _PC_MIN_HOLE_SIZE: 4452 *valp = (int)SPA_MINBLOCKSIZE; 4453 return (0); 4454#ifdef illumos 4455 case _PC_TIMESTAMP_RESOLUTION: 4456 /* nanosecond timestamp resolution */ 4457 *valp = 1L; 4458 return (0); 4459#endif 4460 case _PC_ACL_EXTENDED: 4461 *valp = 0; 4462 return (0); 4463 4464 case _PC_ACL_NFS4: 4465 *valp = 1; 4466 return (0); 4467 4468 case _PC_ACL_PATH_MAX: 4469 *valp = ACL_MAX_ENTRIES; 4470 return (0); 4471 4472 default: 4473 return (EOPNOTSUPP); 4474 } 4475} 4476 4477/*ARGSUSED*/ 4478static int 4479zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 4480 caller_context_t *ct) 4481{ 4482 znode_t *zp = VTOZ(vp); 4483 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4484 int error; 4485 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 4486 4487 ZFS_ENTER(zfsvfs); 4488 ZFS_VERIFY_ZP(zp); 4489 error = zfs_getacl(zp, vsecp, skipaclchk, cr); 4490 ZFS_EXIT(zfsvfs); 4491 4492 return (error); 4493} 4494 4495/*ARGSUSED*/ 4496int 4497zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 4498 caller_context_t *ct) 4499{ 4500 znode_t *zp = VTOZ(vp); 4501 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4502 int error; 4503 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; 4504 zilog_t *zilog = zfsvfs->z_log; 4505 4506 ZFS_ENTER(zfsvfs); 4507 ZFS_VERIFY_ZP(zp); 4508 4509 error = zfs_setacl(zp, vsecp, skipaclchk, cr); 4510 4511 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4512 zil_commit(zilog, 0); 4513 4514 ZFS_EXIT(zfsvfs); 4515 return (error); 4516} 4517 4518static int 4519zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, 4520 int *rahead) 4521{ 4522 znode_t *zp = VTOZ(vp); 4523 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4524 objset_t *os = zp->z_zfsvfs->z_os; 4525 rl_t *rl; 4526 vm_object_t object; 4527 off_t start, end, obj_size; 4528 uint_t blksz; 4529 int pgsin_b, pgsin_a; 4530 int error; 4531 4532 ZFS_ENTER(zfsvfs); 4533 ZFS_VERIFY_ZP(zp); 4534 4535 start = IDX_TO_OFF(ma[0]->pindex); 4536 end = IDX_TO_OFF(ma[count - 1]->pindex + 1); 4537 4538 /* 4539 * Lock a range covering all required and optional pages. 4540 * Note that we need to handle the case of the block size growing. 4541 */ 4542 for (;;) { 4543 blksz = zp->z_blksz; 4544 rl = zfs_range_lock(zp, rounddown(start, blksz), 4545 roundup(end, blksz) - rounddown(start, blksz), RL_READER); 4546 if (blksz == zp->z_blksz) 4547 break; 4548 zfs_range_unlock(rl); 4549 } 4550 4551 object = ma[0]->object; 4552 zfs_vmobject_wlock(object); 4553 obj_size = object->un_pager.vnp.vnp_size; 4554 zfs_vmobject_wunlock(object); 4555 if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) { 4556 zfs_range_unlock(rl); 4557 ZFS_EXIT(zfsvfs); 4558 return (zfs_vm_pagerret_bad); 4559 } 4560 4561 pgsin_b = 0; 4562 if (rbehind != NULL) { 4563 pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz)); 4564 pgsin_b = MIN(*rbehind, pgsin_b); 4565 } 4566 4567 pgsin_a = 0; 4568 if (rahead != NULL) { 4569 pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end); 4570 if (end + IDX_TO_OFF(pgsin_a) >= obj_size) 4571 pgsin_a = OFF_TO_IDX(round_page(obj_size) - end); 4572 pgsin_a = MIN(*rahead, pgsin_a); 4573 } 4574 4575 /* 4576 * NB: we need to pass the exact byte size of the data that we expect 
4577 * to read after accounting for the file size. This is required because 4578 * ZFS will panic if we request DMU to read beyond the end of the last 4579 * allocated block. 4580 */ 4581 error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a, 4582 MIN(end, obj_size) - (end - PAGE_SIZE)); 4583 4584 zfs_range_unlock(rl); 4585 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 4586 ZFS_EXIT(zfsvfs); 4587 4588 if (error != 0) 4589 return (zfs_vm_pagerret_error); 4590 4591 PCPU_INC(cnt.v_vnodein); 4592 PCPU_ADD(cnt.v_vnodepgsin, count + pgsin_b + pgsin_a); 4593 if (rbehind != NULL) 4594 *rbehind = pgsin_b; 4595 if (rahead != NULL) 4596 *rahead = pgsin_a; 4597 return (zfs_vm_pagerret_ok); 4598} 4599 4600static int 4601zfs_freebsd_getpages(ap) 4602 struct vop_getpages_args /* { 4603 struct vnode *a_vp; 4604 vm_page_t *a_m; 4605 int a_count; 4606 int *a_rbehind; 4607 int *a_rahead; 4608 } */ *ap; 4609{ 4610 4611 return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, 4612 ap->a_rahead)); 4613} 4614 4615static int 4616zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, 4617 int *rtvals) 4618{ 4619 znode_t *zp = VTOZ(vp); 4620 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4621 rl_t *rl; 4622 dmu_tx_t *tx; 4623 struct sf_buf *sf; 4624 vm_object_t object; 4625 vm_page_t m; 4626 caddr_t va; 4627 size_t tocopy; 4628 size_t lo_len; 4629 vm_ooffset_t lo_off; 4630 vm_ooffset_t off; 4631 uint_t blksz; 4632 int ncount; 4633 int pcount; 4634 int err; 4635 int i; 4636 4637 ZFS_ENTER(zfsvfs); 4638 ZFS_VERIFY_ZP(zp); 4639 4640 object = vp->v_object; 4641 pcount = btoc(len); 4642 ncount = pcount; 4643 4644 KASSERT(ma[0]->object == object, ("mismatching object")); 4645 KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length")); 4646 4647 for (i = 0; i < pcount; i++) 4648 rtvals[i] = zfs_vm_pagerret_error; 4649 4650 off = IDX_TO_OFF(ma[0]->pindex); 4651 blksz = zp->z_blksz; 4652 lo_off = rounddown(off, blksz); 4653 lo_len = roundup(len + (off - lo_off), blksz); 4654 rl = 
zfs_range_lock(zp, lo_off, lo_len, RL_WRITER); 4655 4656 zfs_vmobject_wlock(object); 4657 if (len + off > object->un_pager.vnp.vnp_size) { 4658 if (object->un_pager.vnp.vnp_size > off) { 4659 int pgoff; 4660 4661 len = object->un_pager.vnp.vnp_size - off; 4662 ncount = btoc(len); 4663 if ((pgoff = (int)len & PAGE_MASK) != 0) { 4664 /* 4665 * If the object is locked and the following 4666 * conditions hold, then the page's dirty 4667 * field cannot be concurrently changed by a 4668 * pmap operation. 4669 */ 4670 m = ma[ncount - 1]; 4671 vm_page_assert_sbusied(m); 4672 KASSERT(!pmap_page_is_write_mapped(m), 4673 ("zfs_putpages: page %p is not read-only", m)); 4674 vm_page_clear_dirty(m, pgoff, PAGE_SIZE - 4675 pgoff); 4676 } 4677 } else { 4678 len = 0; 4679 ncount = 0; 4680 } 4681 if (ncount < pcount) { 4682 for (i = ncount; i < pcount; i++) { 4683 rtvals[i] = zfs_vm_pagerret_bad; 4684 } 4685 } 4686 } 4687 zfs_vmobject_wunlock(object); 4688 4689 if (ncount == 0) 4690 goto out; 4691 4692 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || 4693 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { 4694 goto out; 4695 } 4696 4697 tx = dmu_tx_create(zfsvfs->z_os); 4698 dmu_tx_hold_write(tx, zp->z_id, off, len); 4699 4700 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 4701 zfs_sa_upgrade_txholds(tx, zp); 4702 err = dmu_tx_assign(tx, TXG_WAIT); 4703 if (err != 0) { 4704 dmu_tx_abort(tx); 4705 goto out; 4706 } 4707 4708 if (zp->z_blksz < PAGE_SIZE) { 4709 for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) { 4710 tocopy = len > PAGE_SIZE ? 
PAGE_SIZE : len; 4711 va = zfs_map_page(ma[i], &sf); 4712 dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx); 4713 zfs_unmap_page(sf); 4714 } 4715 } else { 4716 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx); 4717 } 4718 4719 if (err == 0) { 4720 uint64_t mtime[2], ctime[2]; 4721 sa_bulk_attr_t bulk[3]; 4722 int count = 0; 4723 4724 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 4725 &mtime, 16); 4726 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 4727 &ctime, 16); 4728 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 4729 &zp->z_pflags, 8); 4730 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 4731 B_TRUE); 4732 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 4733 ASSERT0(err); 4734 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); 4735 4736 zfs_vmobject_wlock(object); 4737 for (i = 0; i < ncount; i++) { 4738 rtvals[i] = zfs_vm_pagerret_ok; 4739 vm_page_undirty(ma[i]); 4740 } 4741 zfs_vmobject_wunlock(object); 4742 PCPU_INC(cnt.v_vnodeout); 4743 PCPU_ADD(cnt.v_vnodepgsout, ncount); 4744 } 4745 dmu_tx_commit(tx); 4746 4747out: 4748 zfs_range_unlock(rl); 4749 if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 || 4750 zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4751 zil_commit(zfsvfs->z_log, zp->z_id); 4752 ZFS_EXIT(zfsvfs); 4753 return (rtvals[0]); 4754} 4755 4756int 4757zfs_freebsd_putpages(ap) 4758 struct vop_putpages_args /* { 4759 struct vnode *a_vp; 4760 vm_page_t *a_m; 4761 int a_count; 4762 int a_sync; 4763 int *a_rtvals; 4764 } */ *ap; 4765{ 4766 4767 return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, 4768 ap->a_rtvals)); 4769} 4770 4771static int 4772zfs_freebsd_bmap(ap) 4773 struct vop_bmap_args /* { 4774 struct vnode *a_vp; 4775 daddr_t a_bn; 4776 struct bufobj **a_bop; 4777 daddr_t *a_bnp; 4778 int *a_runp; 4779 int *a_runb; 4780 } */ *ap; 4781{ 4782 4783 if (ap->a_bop != NULL) 4784 *ap->a_bop = &ap->a_vp->v_bufobj; 4785 if (ap->a_bnp != NULL) 4786 
*ap->a_bnp = ap->a_bn; 4787 if (ap->a_runp != NULL) 4788 *ap->a_runp = 0; 4789 if (ap->a_runb != NULL) 4790 *ap->a_runb = 0; 4791 4792 return (0); 4793} 4794 4795static int 4796zfs_freebsd_open(ap) 4797 struct vop_open_args /* { 4798 struct vnode *a_vp; 4799 int a_mode; 4800 struct ucred *a_cred; 4801 struct thread *a_td; 4802 } */ *ap; 4803{ 4804 vnode_t *vp = ap->a_vp; 4805 znode_t *zp = VTOZ(vp); 4806 int error; 4807 4808 error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL); 4809 if (error == 0) 4810 vnode_create_vobject(vp, zp->z_size, ap->a_td); 4811 return (error); 4812} 4813 4814static int 4815zfs_freebsd_close(ap) 4816 struct vop_close_args /* { 4817 struct vnode *a_vp; 4818 int a_fflag; 4819 struct ucred *a_cred; 4820 struct thread *a_td; 4821 } */ *ap; 4822{ 4823 4824 return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL)); 4825} 4826 4827static int 4828zfs_freebsd_ioctl(ap) 4829 struct vop_ioctl_args /* { 4830 struct vnode *a_vp; 4831 u_long a_command; 4832 caddr_t a_data; 4833 int a_fflag; 4834 struct ucred *cred; 4835 struct thread *td; 4836 } */ *ap; 4837{ 4838 4839 return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, 4840 ap->a_fflag, ap->a_cred, NULL, NULL)); 4841} 4842 4843static int 4844ioflags(int ioflags) 4845{ 4846 int flags = 0; 4847 4848 if (ioflags & IO_APPEND) 4849 flags |= FAPPEND; 4850 if (ioflags & IO_NDELAY) 4851 flags |= FNONBLOCK; 4852 if (ioflags & IO_SYNC) 4853 flags |= (FSYNC | FDSYNC | FRSYNC); 4854 4855 return (flags); 4856} 4857 4858static int 4859zfs_freebsd_read(ap) 4860 struct vop_read_args /* { 4861 struct vnode *a_vp; 4862 struct uio *a_uio; 4863 int a_ioflag; 4864 struct ucred *a_cred; 4865 } */ *ap; 4866{ 4867 4868 return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), 4869 ap->a_cred, NULL)); 4870} 4871 4872static int 4873zfs_freebsd_write(ap) 4874 struct vop_write_args /* { 4875 struct vnode *a_vp; 4876 struct uio *a_uio; 4877 int a_ioflag; 4878 struct ucred *a_cred; 4879 } */ *ap; 4880{ 4881 
4882 return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), 4883 ap->a_cred, NULL)); 4884} 4885 4886static int 4887zfs_freebsd_access(ap) 4888 struct vop_access_args /* { 4889 struct vnode *a_vp; 4890 accmode_t a_accmode; 4891 struct ucred *a_cred; 4892 struct thread *a_td; 4893 } */ *ap; 4894{ 4895 vnode_t *vp = ap->a_vp; 4896 znode_t *zp = VTOZ(vp); 4897 accmode_t accmode; 4898 int error = 0; 4899 4900 /* 4901 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND, 4902 */ 4903 accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); 4904 if (accmode != 0) 4905 error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL); 4906 4907 /* 4908 * VADMIN has to be handled by vaccess(). 4909 */ 4910 if (error == 0) { 4911 accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND); 4912 if (accmode != 0) { 4913 error = vaccess(vp->v_type, zp->z_mode, zp->z_uid, 4914 zp->z_gid, accmode, ap->a_cred, NULL); 4915 } 4916 } 4917 4918 /* 4919 * For VEXEC, ensure that at least one execute bit is set for 4920 * non-directories. 
4921 */ 4922 if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR && 4923 (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { 4924 error = EACCES; 4925 } 4926 4927 return (error); 4928} 4929 4930static int 4931zfs_freebsd_lookup(ap) 4932 struct vop_lookup_args /* { 4933 struct vnode *a_dvp; 4934 struct vnode **a_vpp; 4935 struct componentname *a_cnp; 4936 } */ *ap; 4937{ 4938 struct componentname *cnp = ap->a_cnp; 4939 char nm[NAME_MAX + 1]; 4940 4941 ASSERT(cnp->cn_namelen < sizeof(nm)); 4942 strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); 4943 4944 return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, 4945 cnp->cn_cred, cnp->cn_thread, 0)); 4946} 4947 4948static int 4949zfs_cache_lookup(ap) 4950 struct vop_lookup_args /* { 4951 struct vnode *a_dvp; 4952 struct vnode **a_vpp; 4953 struct componentname *a_cnp; 4954 } */ *ap; 4955{ 4956 zfsvfs_t *zfsvfs; 4957 4958 zfsvfs = ap->a_dvp->v_mount->mnt_data; 4959 if (zfsvfs->z_use_namecache) 4960 return (vfs_cache_lookup(ap)); 4961 else 4962 return (zfs_freebsd_lookup(ap)); 4963} 4964 4965static int 4966zfs_freebsd_create(ap) 4967 struct vop_create_args /* { 4968 struct vnode *a_dvp; 4969 struct vnode **a_vpp; 4970 struct componentname *a_cnp; 4971 struct vattr *a_vap; 4972 } */ *ap; 4973{ 4974 zfsvfs_t *zfsvfs; 4975 struct componentname *cnp = ap->a_cnp; 4976 vattr_t *vap = ap->a_vap; 4977 int error, mode; 4978 4979 ASSERT(cnp->cn_flags & SAVENAME); 4980 4981 vattr_init_mask(vap); 4982 mode = vap->va_mode & ALLPERMS; 4983 zfsvfs = ap->a_dvp->v_mount->mnt_data; 4984 4985 error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode, 4986 ap->a_vpp, cnp->cn_cred, cnp->cn_thread); 4987 if (zfsvfs->z_use_namecache && 4988 error == 0 && (cnp->cn_flags & MAKEENTRY) != 0) 4989 cache_enter(ap->a_dvp, *ap->a_vpp, cnp); 4990 return (error); 4991} 4992 4993static int 4994zfs_freebsd_remove(ap) 4995 struct vop_remove_args /* { 4996 struct vnode *a_dvp; 4997 struct vnode *a_vp; 4998 
struct componentname *a_cnp; 4999 } */ *ap; 5000{ 5001 5002 ASSERT(ap->a_cnp->cn_flags & SAVENAME); 5003 5004 return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, 5005 ap->a_cnp->cn_cred)); 5006} 5007 5008static int 5009zfs_freebsd_mkdir(ap) 5010 struct vop_mkdir_args /* { 5011 struct vnode *a_dvp; 5012 struct vnode **a_vpp; 5013 struct componentname *a_cnp; 5014 struct vattr *a_vap; 5015 } */ *ap; 5016{ 5017 vattr_t *vap = ap->a_vap; 5018 5019 ASSERT(ap->a_cnp->cn_flags & SAVENAME); 5020 5021 vattr_init_mask(vap); 5022 5023 return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, 5024 ap->a_cnp->cn_cred)); 5025} 5026 5027static int 5028zfs_freebsd_rmdir(ap) 5029 struct vop_rmdir_args /* { 5030 struct vnode *a_dvp; 5031 struct vnode *a_vp; 5032 struct componentname *a_cnp; 5033 } */ *ap; 5034{ 5035 struct componentname *cnp = ap->a_cnp; 5036 5037 ASSERT(cnp->cn_flags & SAVENAME); 5038 5039 return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); 5040} 5041 5042static int 5043zfs_freebsd_readdir(ap) 5044 struct vop_readdir_args /* { 5045 struct vnode *a_vp; 5046 struct uio *a_uio; 5047 struct ucred *a_cred; 5048 int *a_eofflag; 5049 int *a_ncookies; 5050 u_long **a_cookies; 5051 } */ *ap; 5052{ 5053 5054 return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, 5055 ap->a_ncookies, ap->a_cookies)); 5056} 5057 5058static int 5059zfs_freebsd_fsync(ap) 5060 struct vop_fsync_args /* { 5061 struct vnode *a_vp; 5062 int a_waitfor; 5063 struct thread *a_td; 5064 } */ *ap; 5065{ 5066 5067 vop_stdfsync(ap); 5068 return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); 5069} 5070 5071static int 5072zfs_freebsd_getattr(ap) 5073 struct vop_getattr_args /* { 5074 struct vnode *a_vp; 5075 struct vattr *a_vap; 5076 struct ucred *a_cred; 5077 } */ *ap; 5078{ 5079 vattr_t *vap = ap->a_vap; 5080 xvattr_t xvap; 5081 u_long fflags = 0; 5082 int error; 5083 5084 xva_init(&xvap); 5085 xvap.xva_vattr = *vap; 5086 xvap.xva_vattr.va_mask |= 
AT_XVATTR; 5087 5088 /* Convert chflags into ZFS-type flags. */ 5089 /* XXX: what about SF_SETTABLE?. */ 5090 XVA_SET_REQ(&xvap, XAT_IMMUTABLE); 5091 XVA_SET_REQ(&xvap, XAT_APPENDONLY); 5092 XVA_SET_REQ(&xvap, XAT_NOUNLINK); 5093 XVA_SET_REQ(&xvap, XAT_NODUMP); 5094 XVA_SET_REQ(&xvap, XAT_READONLY); 5095 XVA_SET_REQ(&xvap, XAT_ARCHIVE); 5096 XVA_SET_REQ(&xvap, XAT_SYSTEM); 5097 XVA_SET_REQ(&xvap, XAT_HIDDEN); 5098 XVA_SET_REQ(&xvap, XAT_REPARSE); 5099 XVA_SET_REQ(&xvap, XAT_OFFLINE); 5100 XVA_SET_REQ(&xvap, XAT_SPARSE); 5101 5102 error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL); 5103 if (error != 0) 5104 return (error); 5105 5106 /* Convert ZFS xattr into chflags. */ 5107#define FLAG_CHECK(fflag, xflag, xfield) do { \ 5108 if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ 5109 fflags |= (fflag); \ 5110} while (0) 5111 FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, 5112 xvap.xva_xoptattrs.xoa_immutable); 5113 FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, 5114 xvap.xva_xoptattrs.xoa_appendonly); 5115 FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, 5116 xvap.xva_xoptattrs.xoa_nounlink); 5117 FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE, 5118 xvap.xva_xoptattrs.xoa_archive); 5119 FLAG_CHECK(UF_NODUMP, XAT_NODUMP, 5120 xvap.xva_xoptattrs.xoa_nodump); 5121 FLAG_CHECK(UF_READONLY, XAT_READONLY, 5122 xvap.xva_xoptattrs.xoa_readonly); 5123 FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM, 5124 xvap.xva_xoptattrs.xoa_system); 5125 FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN, 5126 xvap.xva_xoptattrs.xoa_hidden); 5127 FLAG_CHECK(UF_REPARSE, XAT_REPARSE, 5128 xvap.xva_xoptattrs.xoa_reparse); 5129 FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE, 5130 xvap.xva_xoptattrs.xoa_offline); 5131 FLAG_CHECK(UF_SPARSE, XAT_SPARSE, 5132 xvap.xva_xoptattrs.xoa_sparse); 5133 5134#undef FLAG_CHECK 5135 *vap = xvap.xva_vattr; 5136 vap->va_flags = fflags; 5137 return (0); 5138} 5139 5140static int 5141zfs_freebsd_setattr(ap) 5142 struct vop_setattr_args /* { 5143 struct vnode *a_vp; 5144 struct vattr *a_vap; 5145 struct ucred *a_cred; 5146 } */ 
*ap; 5147{ 5148 vnode_t *vp = ap->a_vp; 5149 vattr_t *vap = ap->a_vap; 5150 cred_t *cred = ap->a_cred; 5151 xvattr_t xvap; 5152 u_long fflags; 5153 uint64_t zflags; 5154 5155 vattr_init_mask(vap); 5156 vap->va_mask &= ~AT_NOSET; 5157 5158 xva_init(&xvap); 5159 xvap.xva_vattr = *vap; 5160 5161 zflags = VTOZ(vp)->z_pflags; 5162 5163 if (vap->va_flags != VNOVAL) { 5164 zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; 5165 int error; 5166 5167 if (zfsvfs->z_use_fuids == B_FALSE) 5168 return (EOPNOTSUPP); 5169 5170 fflags = vap->va_flags; 5171 /* 5172 * XXX KDM 5173 * We need to figure out whether it makes sense to allow 5174 * UF_REPARSE through, since we don't really have other 5175 * facilities to handle reparse points and zfs_setattr() 5176 * doesn't currently allow setting that attribute anyway. 5177 */ 5178 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| 5179 UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| 5180 UF_OFFLINE|UF_SPARSE)) != 0) 5181 return (EOPNOTSUPP); 5182 /* 5183 * Unprivileged processes are not permitted to unset system 5184 * flags, or modify flags if any system flags are set. 5185 * Privileged non-jail processes may not modify system flags 5186 * if securelevel > 0 and any existing system flags are set. 5187 * Privileged jail processes behave like privileged non-jail 5188 * processes if the security.jail.chflags_allowed sysctl is 5189 * is non-zero; otherwise, they behave like unprivileged 5190 * processes. 5191 */ 5192 if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || 5193 priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) { 5194 if (zflags & 5195 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { 5196 error = securelevel_gt(cred, 0); 5197 if (error != 0) 5198 return (error); 5199 } 5200 } else { 5201 /* 5202 * Callers may only modify the file flags on objects they 5203 * have VADMIN rights for. 
5204 */ 5205 if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0) 5206 return (error); 5207 if (zflags & 5208 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { 5209 return (EPERM); 5210 } 5211 if (fflags & 5212 (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { 5213 return (EPERM); 5214 } 5215 } 5216 5217#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ 5218 if (((fflags & (fflag)) && !(zflags & (zflag))) || \ 5219 ((zflags & (zflag)) && !(fflags & (fflag)))) { \ 5220 XVA_SET_REQ(&xvap, (xflag)); \ 5221 (xfield) = ((fflags & (fflag)) != 0); \ 5222 } \ 5223} while (0) 5224 /* Convert chflags into ZFS-type flags. */ 5225 /* XXX: what about SF_SETTABLE?. */ 5226 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, 5227 xvap.xva_xoptattrs.xoa_immutable); 5228 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, 5229 xvap.xva_xoptattrs.xoa_appendonly); 5230 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, 5231 xvap.xva_xoptattrs.xoa_nounlink); 5232 FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE, 5233 xvap.xva_xoptattrs.xoa_archive); 5234 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, 5235 xvap.xva_xoptattrs.xoa_nodump); 5236 FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY, 5237 xvap.xva_xoptattrs.xoa_readonly); 5238 FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM, 5239 xvap.xva_xoptattrs.xoa_system); 5240 FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN, 5241 xvap.xva_xoptattrs.xoa_hidden); 5242 FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE, 5243 xvap.xva_xoptattrs.xoa_hidden); 5244 FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE, 5245 xvap.xva_xoptattrs.xoa_offline); 5246 FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE, 5247 xvap.xva_xoptattrs.xoa_sparse); 5248#undef FLAG_CHANGE 5249 } 5250 if (vap->va_birthtime.tv_sec != VNOVAL) { 5251 xvap.xva_vattr.va_mask |= AT_XVATTR; 5252 XVA_SET_REQ(&xvap, XAT_CREATETIME); 5253 } 5254 return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL)); 5255} 5256 5257static int 5258zfs_freebsd_rename(ap) 5259 struct 
vop_rename_args /* { 5260 struct vnode *a_fdvp; 5261 struct vnode *a_fvp; 5262 struct componentname *a_fcnp; 5263 struct vnode *a_tdvp; 5264 struct vnode *a_tvp; 5265 struct componentname *a_tcnp; 5266 } */ *ap; 5267{ 5268 vnode_t *fdvp = ap->a_fdvp; 5269 vnode_t *fvp = ap->a_fvp; 5270 vnode_t *tdvp = ap->a_tdvp; 5271 vnode_t *tvp = ap->a_tvp; 5272 int error; 5273 5274 ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); 5275 ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); 5276 5277 error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, 5278 ap->a_tcnp, ap->a_fcnp->cn_cred); 5279 5280 vrele(fdvp); 5281 vrele(fvp); 5282 vrele(tdvp); 5283 if (tvp != NULL) 5284 vrele(tvp); 5285 5286 return (error); 5287} 5288 5289static int 5290zfs_freebsd_symlink(ap) 5291 struct vop_symlink_args /* { 5292 struct vnode *a_dvp; 5293 struct vnode **a_vpp; 5294 struct componentname *a_cnp; 5295 struct vattr *a_vap; 5296 char *a_target; 5297 } */ *ap; 5298{ 5299 struct componentname *cnp = ap->a_cnp; 5300 vattr_t *vap = ap->a_vap; 5301 5302 ASSERT(cnp->cn_flags & SAVENAME); 5303 5304 vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. 
*/ 5305 vattr_init_mask(vap); 5306 5307 return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap, 5308 ap->a_target, cnp->cn_cred, cnp->cn_thread)); 5309} 5310 5311static int 5312zfs_freebsd_readlink(ap) 5313 struct vop_readlink_args /* { 5314 struct vnode *a_vp; 5315 struct uio *a_uio; 5316 struct ucred *a_cred; 5317 } */ *ap; 5318{ 5319 5320 return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); 5321} 5322 5323static int 5324zfs_freebsd_link(ap) 5325 struct vop_link_args /* { 5326 struct vnode *a_tdvp; 5327 struct vnode *a_vp; 5328 struct componentname *a_cnp; 5329 } */ *ap; 5330{ 5331 struct componentname *cnp = ap->a_cnp; 5332 vnode_t *vp = ap->a_vp; 5333 vnode_t *tdvp = ap->a_tdvp; 5334 5335 if (tdvp->v_mount != vp->v_mount) 5336 return (EXDEV); 5337 5338 ASSERT(cnp->cn_flags & SAVENAME); 5339 5340 return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0)); 5341} 5342 5343static int 5344zfs_freebsd_inactive(ap) 5345 struct vop_inactive_args /* { 5346 struct vnode *a_vp; 5347 struct thread *a_td; 5348 } */ *ap; 5349{ 5350 vnode_t *vp = ap->a_vp; 5351 5352 zfs_inactive(vp, ap->a_td->td_ucred, NULL); 5353 return (0); 5354} 5355 5356static int 5357zfs_freebsd_reclaim(ap) 5358 struct vop_reclaim_args /* { 5359 struct vnode *a_vp; 5360 struct thread *a_td; 5361 } */ *ap; 5362{ 5363 vnode_t *vp = ap->a_vp; 5364 znode_t *zp = VTOZ(vp); 5365 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5366 5367 ASSERT(zp != NULL); 5368 5369 /* Destroy the vm object and flush associated pages. */ 5370 vnode_destroy_vobject(vp); 5371 5372 /* 5373 * z_teardown_inactive_lock protects from a race with 5374 * zfs_znode_dmu_fini in zfsvfs_teardown during 5375 * force unmount. 
5376 */ 5377 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 5378 if (zp->z_sa_hdl == NULL) 5379 zfs_znode_free(zp); 5380 else 5381 zfs_zinactive(zp); 5382 rw_exit(&zfsvfs->z_teardown_inactive_lock); 5383 5384 vp->v_data = NULL; 5385 return (0); 5386} 5387 5388static int 5389zfs_freebsd_fid(ap) 5390 struct vop_fid_args /* { 5391 struct vnode *a_vp; 5392 struct fid *a_fid; 5393 } */ *ap; 5394{ 5395 5396 return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); 5397} 5398 5399static int 5400zfs_freebsd_pathconf(ap) 5401 struct vop_pathconf_args /* { 5402 struct vnode *a_vp; 5403 int a_name; 5404 register_t *a_retval; 5405 } */ *ap; 5406{ 5407 ulong_t val; 5408 int error; 5409 5410 error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); 5411 if (error == 0) { 5412 *ap->a_retval = val; 5413 return (error); 5414 } 5415 if (error != EOPNOTSUPP) 5416 return (error); 5417 5418 switch (ap->a_name) { 5419 case _PC_NAME_MAX: 5420 *ap->a_retval = NAME_MAX; 5421 return (0); 5422 case _PC_PIPE_BUF: 5423 if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) { 5424 *ap->a_retval = PIPE_BUF; 5425 return (0); 5426 } 5427 return (EINVAL); 5428 default: 5429 return (vop_stdpathconf(ap)); 5430 } 5431} 5432 5433/* 5434 * FreeBSD's extended attributes namespace defines file name prefix for ZFS' 5435 * extended attribute name: 5436 * 5437 * NAMESPACE PREFIX 5438 * system freebsd:system: 5439 * user (none, can be used to access ZFS fsattr(5) attributes 5440 * created on Solaris) 5441 */ 5442static int 5443zfs_create_attrname(int attrnamespace, const char *name, char *attrname, 5444 size_t size) 5445{ 5446 const char *namespace, *prefix, *suffix; 5447 5448 /* We don't allow '/' character in attribute name. */ 5449 if (strchr(name, '/') != NULL) 5450 return (EINVAL); 5451 /* We don't allow attribute names that start with "freebsd:" string. 
*/ 5452 if (strncmp(name, "freebsd:", 8) == 0) 5453 return (EINVAL); 5454 5455 bzero(attrname, size); 5456 5457 switch (attrnamespace) { 5458 case EXTATTR_NAMESPACE_USER: 5459#if 0 5460 prefix = "freebsd:"; 5461 namespace = EXTATTR_NAMESPACE_USER_STRING; 5462 suffix = ":"; 5463#else 5464 /* 5465 * This is the default namespace by which we can access all 5466 * attributes created on Solaris. 5467 */ 5468 prefix = namespace = suffix = ""; 5469#endif 5470 break; 5471 case EXTATTR_NAMESPACE_SYSTEM: 5472 prefix = "freebsd:"; 5473 namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; 5474 suffix = ":"; 5475 break; 5476 case EXTATTR_NAMESPACE_EMPTY: 5477 default: 5478 return (EINVAL); 5479 } 5480 if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, 5481 name) >= size) { 5482 return (ENAMETOOLONG); 5483 } 5484 return (0); 5485} 5486 5487/* 5488 * Vnode operating to retrieve a named extended attribute. 5489 */ 5490static int 5491zfs_getextattr(struct vop_getextattr_args *ap) 5492/* 5493vop_getextattr { 5494 IN struct vnode *a_vp; 5495 IN int a_attrnamespace; 5496 IN const char *a_name; 5497 INOUT struct uio *a_uio; 5498 OUT size_t *a_size; 5499 IN struct ucred *a_cred; 5500 IN struct thread *a_td; 5501}; 5502*/ 5503{ 5504 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 5505 struct thread *td = ap->a_td; 5506 struct nameidata nd; 5507 char attrname[255]; 5508 struct vattr va; 5509 vnode_t *xvp = NULL, *vp; 5510 int error, flags; 5511 5512 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 5513 ap->a_cred, ap->a_td, VREAD); 5514 if (error != 0) 5515 return (error); 5516 5517 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 5518 sizeof(attrname)); 5519 if (error != 0) 5520 return (error); 5521 5522 ZFS_ENTER(zfsvfs); 5523 5524 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 5525 LOOKUP_XATTR); 5526 if (error != 0) { 5527 ZFS_EXIT(zfsvfs); 5528 return (error); 5529 } 5530 5531 flags = FREAD; 5532 NDINIT_ATVP(&nd, LOOKUP, 
NOFOLLOW, UIO_SYSSPACE, attrname, 5533 xvp, td); 5534 error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL); 5535 vp = nd.ni_vp; 5536 NDFREE(&nd, NDF_ONLY_PNBUF); 5537 if (error != 0) { 5538 ZFS_EXIT(zfsvfs); 5539 if (error == ENOENT) 5540 error = ENOATTR; 5541 return (error); 5542 } 5543 5544 if (ap->a_size != NULL) { 5545 error = VOP_GETATTR(vp, &va, ap->a_cred); 5546 if (error == 0) 5547 *ap->a_size = (size_t)va.va_size; 5548 } else if (ap->a_uio != NULL) 5549 error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred); 5550 5551 VOP_UNLOCK(vp, 0); 5552 vn_close(vp, flags, ap->a_cred, td); 5553 ZFS_EXIT(zfsvfs); 5554 5555 return (error); 5556} 5557 5558/* 5559 * Vnode operation to remove a named attribute. 5560 */ 5561int 5562zfs_deleteextattr(struct vop_deleteextattr_args *ap) 5563/* 5564vop_deleteextattr { 5565 IN struct vnode *a_vp; 5566 IN int a_attrnamespace; 5567 IN const char *a_name; 5568 IN struct ucred *a_cred; 5569 IN struct thread *a_td; 5570}; 5571*/ 5572{ 5573 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 5574 struct thread *td = ap->a_td; 5575 struct nameidata nd; 5576 char attrname[255]; 5577 struct vattr va; 5578 vnode_t *xvp = NULL, *vp; 5579 int error, flags; 5580 5581 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 5582 ap->a_cred, ap->a_td, VWRITE); 5583 if (error != 0) 5584 return (error); 5585 5586 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 5587 sizeof(attrname)); 5588 if (error != 0) 5589 return (error); 5590 5591 ZFS_ENTER(zfsvfs); 5592 5593 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 5594 LOOKUP_XATTR); 5595 if (error != 0) { 5596 ZFS_EXIT(zfsvfs); 5597 return (error); 5598 } 5599 5600 NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, 5601 UIO_SYSSPACE, attrname, xvp, td); 5602 error = namei(&nd); 5603 vp = nd.ni_vp; 5604 if (error != 0) { 5605 ZFS_EXIT(zfsvfs); 5606 NDFREE(&nd, NDF_ONLY_PNBUF); 5607 if (error == ENOENT) 5608 error = ENOATTR; 5609 return (error); 5610 } 
5611 5612 error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); 5613 NDFREE(&nd, NDF_ONLY_PNBUF); 5614 5615 vput(nd.ni_dvp); 5616 if (vp == nd.ni_dvp) 5617 vrele(vp); 5618 else 5619 vput(vp); 5620 ZFS_EXIT(zfsvfs); 5621 5622 return (error); 5623} 5624 5625/* 5626 * Vnode operation to set a named attribute. 5627 */ 5628static int 5629zfs_setextattr(struct vop_setextattr_args *ap) 5630/* 5631vop_setextattr { 5632 IN struct vnode *a_vp; 5633 IN int a_attrnamespace; 5634 IN const char *a_name; 5635 INOUT struct uio *a_uio; 5636 IN struct ucred *a_cred; 5637 IN struct thread *a_td; 5638}; 5639*/ 5640{ 5641 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 5642 struct thread *td = ap->a_td; 5643 struct nameidata nd; 5644 char attrname[255]; 5645 struct vattr va; 5646 vnode_t *xvp = NULL, *vp; 5647 int error, flags; 5648 5649 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 5650 ap->a_cred, ap->a_td, VWRITE); 5651 if (error != 0) 5652 return (error); 5653 5654 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 5655 sizeof(attrname)); 5656 if (error != 0) 5657 return (error); 5658 5659 ZFS_ENTER(zfsvfs); 5660 5661 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 5662 LOOKUP_XATTR | CREATE_XATTR_DIR); 5663 if (error != 0) { 5664 ZFS_EXIT(zfsvfs); 5665 return (error); 5666 } 5667 5668 flags = FFLAGS(O_WRONLY | O_CREAT); 5669 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, 5670 xvp, td); 5671 error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL); 5672 vp = nd.ni_vp; 5673 NDFREE(&nd, NDF_ONLY_PNBUF); 5674 if (error != 0) { 5675 ZFS_EXIT(zfsvfs); 5676 return (error); 5677 } 5678 5679 VATTR_NULL(&va); 5680 va.va_size = 0; 5681 error = VOP_SETATTR(vp, &va, ap->a_cred); 5682 if (error == 0) 5683 VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); 5684 5685 VOP_UNLOCK(vp, 0); 5686 vn_close(vp, flags, ap->a_cred, td); 5687 ZFS_EXIT(zfsvfs); 5688 5689 return (error); 5690} 5691 5692/* 5693 * Vnode operation to retrieve extended 
attributes on a vnode. 5694 */ 5695static int 5696zfs_listextattr(struct vop_listextattr_args *ap) 5697/* 5698vop_listextattr { 5699 IN struct vnode *a_vp; 5700 IN int a_attrnamespace; 5701 INOUT struct uio *a_uio; 5702 OUT size_t *a_size; 5703 IN struct ucred *a_cred; 5704 IN struct thread *a_td; 5705}; 5706*/ 5707{ 5708 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 5709 struct thread *td = ap->a_td; 5710 struct nameidata nd; 5711 char attrprefix[16]; 5712 u_char dirbuf[sizeof(struct dirent)]; 5713 struct dirent *dp; 5714 struct iovec aiov; 5715 struct uio auio, *uio = ap->a_uio; 5716 size_t *sizep = ap->a_size; 5717 size_t plen; 5718 vnode_t *xvp = NULL, *vp; 5719 int done, error, eof, pos; 5720 5721 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 5722 ap->a_cred, ap->a_td, VREAD); 5723 if (error != 0) 5724 return (error); 5725 5726 error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, 5727 sizeof(attrprefix)); 5728 if (error != 0) 5729 return (error); 5730 plen = strlen(attrprefix); 5731 5732 ZFS_ENTER(zfsvfs); 5733 5734 if (sizep != NULL) 5735 *sizep = 0; 5736 5737 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 5738 LOOKUP_XATTR); 5739 if (error != 0) { 5740 ZFS_EXIT(zfsvfs); 5741 /* 5742 * ENOATTR means that the EA directory does not yet exist, 5743 * i.e. there are no extended attributes there. 
5744 */ 5745 if (error == ENOATTR) 5746 error = 0; 5747 return (error); 5748 } 5749 5750 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, 5751 UIO_SYSSPACE, ".", xvp, td); 5752 error = namei(&nd); 5753 vp = nd.ni_vp; 5754 NDFREE(&nd, NDF_ONLY_PNBUF); 5755 if (error != 0) { 5756 ZFS_EXIT(zfsvfs); 5757 return (error); 5758 } 5759 5760 auio.uio_iov = &aiov; 5761 auio.uio_iovcnt = 1; 5762 auio.uio_segflg = UIO_SYSSPACE; 5763 auio.uio_td = td; 5764 auio.uio_rw = UIO_READ; 5765 auio.uio_offset = 0; 5766 5767 do { 5768 u_char nlen; 5769 5770 aiov.iov_base = (void *)dirbuf; 5771 aiov.iov_len = sizeof(dirbuf); 5772 auio.uio_resid = sizeof(dirbuf); 5773 error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); 5774 done = sizeof(dirbuf) - auio.uio_resid; 5775 if (error != 0) 5776 break; 5777 for (pos = 0; pos < done;) { 5778 dp = (struct dirent *)(dirbuf + pos); 5779 pos += dp->d_reclen; 5780 /* 5781 * XXX: Temporarily we also accept DT_UNKNOWN, as this 5782 * is what we get when attribute was created on Solaris. 5783 */ 5784 if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) 5785 continue; 5786 if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0) 5787 continue; 5788 else if (strncmp(dp->d_name, attrprefix, plen) != 0) 5789 continue; 5790 nlen = dp->d_namlen - plen; 5791 if (sizep != NULL) 5792 *sizep += 1 + nlen; 5793 else if (uio != NULL) { 5794 /* 5795 * Format of extattr name entry is one byte for 5796 * length and the rest for name. 
5797 */ 5798 error = uiomove(&nlen, 1, uio->uio_rw, uio); 5799 if (error == 0) { 5800 error = uiomove(dp->d_name + plen, nlen, 5801 uio->uio_rw, uio); 5802 } 5803 if (error != 0) 5804 break; 5805 } 5806 } 5807 } while (!eof && error == 0); 5808 5809 vput(vp); 5810 ZFS_EXIT(zfsvfs); 5811 5812 return (error); 5813} 5814 5815int 5816zfs_freebsd_getacl(ap) 5817 struct vop_getacl_args /* { 5818 struct vnode *vp; 5819 acl_type_t type; 5820 struct acl *aclp; 5821 struct ucred *cred; 5822 struct thread *td; 5823 } */ *ap; 5824{ 5825 int error; 5826 vsecattr_t vsecattr; 5827 5828 if (ap->a_type != ACL_TYPE_NFS4) 5829 return (EINVAL); 5830 5831 vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; 5832 if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)) 5833 return (error); 5834 5835 error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt); 5836 if (vsecattr.vsa_aclentp != NULL) 5837 kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); 5838 5839 return (error); 5840} 5841 5842int 5843zfs_freebsd_setacl(ap) 5844 struct vop_setacl_args /* { 5845 struct vnode *vp; 5846 acl_type_t type; 5847 struct acl *aclp; 5848 struct ucred *cred; 5849 struct thread *td; 5850 } */ *ap; 5851{ 5852 int error; 5853 vsecattr_t vsecattr; 5854 int aclbsize; /* size of acl list in bytes */ 5855 aclent_t *aaclp; 5856 5857 if (ap->a_type != ACL_TYPE_NFS4) 5858 return (EINVAL); 5859 5860 if (ap->a_aclp == NULL) 5861 return (EINVAL); 5862 5863 if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) 5864 return (EINVAL); 5865 5866 /* 5867 * With NFSv4 ACLs, chmod(2) may need to add additional entries, 5868 * splitting every entry into two and appending "canonical six" 5869 * entries at the end. Don't allow for setting an ACL that would 5870 * cause chmod(2) to run out of ACL entries. 
5871 */ 5872 if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) 5873 return (ENOSPC); 5874 5875 error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); 5876 if (error != 0) 5877 return (error); 5878 5879 vsecattr.vsa_mask = VSA_ACE; 5880 aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t); 5881 vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); 5882 aaclp = vsecattr.vsa_aclentp; 5883 vsecattr.vsa_aclentsz = aclbsize; 5884 5885 aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); 5886 error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL); 5887 kmem_free(aaclp, aclbsize); 5888 5889 return (error); 5890} 5891 5892int 5893zfs_freebsd_aclcheck(ap) 5894 struct vop_aclcheck_args /* { 5895 struct vnode *vp; 5896 acl_type_t type; 5897 struct acl *aclp; 5898 struct ucred *cred; 5899 struct thread *td; 5900 } */ *ap; 5901{ 5902 5903 return (EOPNOTSUPP); 5904} 5905 5906static int 5907zfs_vptocnp(struct vop_vptocnp_args *ap) 5908{ 5909 vnode_t *covered_vp; 5910 vnode_t *vp = ap->a_vp;; 5911 zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; 5912 znode_t *zp = VTOZ(vp); 5913 int ltype; 5914 int error; 5915 5916 ZFS_ENTER(zfsvfs); 5917 ZFS_VERIFY_ZP(zp); 5918 5919 /* 5920 * If we are a snapshot mounted under .zfs, run the operation 5921 * on the covered vnode. 
5922 */ 5923 if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) { 5924 char name[MAXNAMLEN + 1]; 5925 znode_t *dzp; 5926 size_t len; 5927 5928 error = zfs_znode_parent_and_name(zp, &dzp, name); 5929 if (error == 0) { 5930 len = strlen(name); 5931 if (*ap->a_buflen < len) 5932 error = SET_ERROR(ENOMEM); 5933 } 5934 if (error == 0) { 5935 *ap->a_buflen -= len; 5936 bcopy(name, ap->a_buf + *ap->a_buflen, len); 5937 *ap->a_vpp = ZTOV(dzp); 5938 } 5939 ZFS_EXIT(zfsvfs); 5940 return (error); 5941 } 5942 ZFS_EXIT(zfsvfs); 5943 5944 covered_vp = vp->v_mount->mnt_vnodecovered; 5945 vhold(covered_vp); 5946 ltype = VOP_ISLOCKED(vp); 5947 VOP_UNLOCK(vp, 0); 5948 error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread); 5949 if (error == 0) { 5950 error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, 5951 ap->a_buf, ap->a_buflen); 5952 vput(covered_vp); 5953 } 5954 vn_lock(vp, ltype | LK_RETRY); 5955 if ((vp->v_iflag & VI_DOOMED) != 0) 5956 error = SET_ERROR(ENOENT); 5957 return (error); 5958} 5959 5960#ifdef DIAGNOSTIC 5961static int 5962zfs_lock(ap) 5963 struct vop_lock1_args /* { 5964 struct vnode *a_vp; 5965 int a_flags; 5966 char *file; 5967 int line; 5968 } */ *ap; 5969{ 5970 vnode_t *vp; 5971 znode_t *zp; 5972 int err; 5973 5974 err = vop_stdlock(ap); 5975 if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) { 5976 vp = ap->a_vp; 5977 zp = vp->v_data; 5978 if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 && 5979 zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0) 5980 VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock)); 5981 } 5982 return (err); 5983} 5984#endif 5985 5986struct vop_vector zfs_vnodeops; 5987struct vop_vector zfs_fifoops; 5988struct vop_vector zfs_shareops; 5989 5990struct vop_vector zfs_vnodeops = { 5991 .vop_default = &default_vnodeops, 5992 .vop_inactive = zfs_freebsd_inactive, 5993 .vop_reclaim = zfs_freebsd_reclaim, 5994 .vop_access = zfs_freebsd_access, 5995 .vop_lookup = zfs_cache_lookup, 5996 .vop_cachedlookup = zfs_freebsd_lookup, 
.vop_getattr =		zfs_freebsd_getattr,
	.vop_setattr =		zfs_freebsd_setattr,
	.vop_create =		zfs_freebsd_create,
	.vop_mknod =		zfs_freebsd_create,
	.vop_mkdir =		zfs_freebsd_mkdir,
	.vop_readdir =		zfs_freebsd_readdir,
	.vop_fsync =		zfs_freebsd_fsync,
	.vop_open =		zfs_freebsd_open,
	.vop_close =		zfs_freebsd_close,
	.vop_rmdir =		zfs_freebsd_rmdir,
	.vop_ioctl =		zfs_freebsd_ioctl,
	.vop_link =		zfs_freebsd_link,
	.vop_symlink =		zfs_freebsd_symlink,
	.vop_readlink =		zfs_freebsd_readlink,
	.vop_read =		zfs_freebsd_read,
	.vop_write =		zfs_freebsd_write,
	.vop_remove =		zfs_freebsd_remove,
	.vop_rename =		zfs_freebsd_rename,
	.vop_pathconf =		zfs_freebsd_pathconf,
	.vop_bmap =		zfs_freebsd_bmap,
	.vop_fid =		zfs_freebsd_fid,
	.vop_getextattr =	zfs_getextattr,
	.vop_deleteextattr =	zfs_deleteextattr,
	.vop_setextattr =	zfs_setextattr,
	.vop_listextattr =	zfs_listextattr,
	.vop_getacl =		zfs_freebsd_getacl,
	.vop_setacl =		zfs_freebsd_setacl,
	.vop_aclcheck =		zfs_freebsd_aclcheck,
	.vop_getpages =		zfs_freebsd_getpages,
	.vop_putpages =		zfs_freebsd_putpages,
	.vop_vptocnp =		zfs_vptocnp,
#ifdef DIAGNOSTIC
	.vop_lock1 =		zfs_lock,
#endif
};

/*
 * Vnode operations for FIFOs on ZFS: data I/O goes through the FIFO
 * layer (read/write panic if reached), metadata operations go to ZFS.
 */
struct vop_vector zfs_fifoops = {
	.vop_default =		&fifo_specops,
	.vop_fsync =		zfs_freebsd_fsync,
	.vop_access =		zfs_freebsd_access,
	.vop_getattr =		zfs_freebsd_getattr,
	.vop_inactive =		zfs_freebsd_inactive,
	.vop_read =		VOP_PANIC,
	.vop_reclaim =		zfs_freebsd_reclaim,
	.vop_setattr =		zfs_freebsd_setattr,
	.vop_write =		VOP_PANIC,
	.vop_pathconf =		zfs_freebsd_pathconf,
	.vop_fid =		zfs_freebsd_fid,
	.vop_getacl =		zfs_freebsd_getacl,
	.vop_setacl =		zfs_freebsd_setacl,
	.vop_aclcheck =		zfs_freebsd_aclcheck,
};

/*
 * special share hidden files vnode operations template
 */
struct vop_vector zfs_shareops = {
	.vop_default =		&default_vnodeops,
	.vop_access =		zfs_freebsd_access,
	.vop_inactive =		zfs_freebsd_inactive,
	.vop_reclaim =		zfs_freebsd_reclaim,
	.vop_fid =		zfs_freebsd_fid,
	.vop_pathconf =		zfs_freebsd_pathconf,
};