dmu_send.c revision 273510
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 25 * Copyright (c) 2014, Joyent, Inc. All rights reserved. 26 * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved. 27 * Copyright 2014 HybridCluster. All rights reserved. 28 */ 29 30#include <sys/dmu.h> 31#include <sys/dmu_impl.h> 32#include <sys/dmu_tx.h> 33#include <sys/dbuf.h> 34#include <sys/dnode.h> 35#include <sys/zfs_context.h> 36#include <sys/dmu_objset.h> 37#include <sys/dmu_traverse.h> 38#include <sys/dsl_dataset.h> 39#include <sys/dsl_dir.h> 40#include <sys/dsl_prop.h> 41#include <sys/dsl_pool.h> 42#include <sys/dsl_synctask.h> 43#include <sys/zfs_ioctl.h> 44#include <sys/zap.h> 45#include <sys/zio_checksum.h> 46#include <sys/zfs_znode.h> 47#include <zfs_fletcher.h> 48#include <sys/avl.h> 49#include <sys/ddt.h> 50#include <sys/zfs_onexit.h> 51#include <sys/dmu_send.h> 52#include <sys/dsl_destroy.h> 53#include <sys/blkptr.h> 54#include <sys/dsl_bookmark.h> 55#include <sys/zfeature.h> 56 57#ifdef __FreeBSD__ 58#undef dump_write 59#define dump_write dmu_dump_write 60#endif 61 62/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ 63int zfs_send_corrupt_data = B_FALSE; 64 65static char *dmu_recv_tag = "dmu_recv_tag"; 66static const char *recv_clone_name = "%recv"; 67 68static int 69dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) 70{ 71 dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset; 72 struct uio auio; 73 struct iovec aiov; 74 ASSERT0(len % 8); 75 76 fletcher_4_incremental_native(buf, len, &dsp->dsa_zc); 77 aiov.iov_base = buf; 78 aiov.iov_len = len; 79 auio.uio_iov = &aiov; 80 auio.uio_iovcnt = 1; 81 auio.uio_resid = len; 82 auio.uio_segflg = UIO_SYSSPACE; 83 auio.uio_rw = UIO_WRITE; 84 auio.uio_offset = (off_t)-1; 85 auio.uio_td = dsp->dsa_td; 86#ifdef _KERNEL 87 if (dsp->dsa_fp->f_type == DTYPE_VNODE) 88 bwillwrite(); 89 dsp->dsa_err = fo_write(dsp->dsa_fp, &auio, dsp->dsa_td->td_ucred, 0, 90 dsp->dsa_td); 91#else 92 fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); 93 dsp->dsa_err = EOPNOTSUPP; 94#endif 95 mutex_enter(&ds->ds_sendstream_lock); 96 *dsp->dsa_off += len; 97 mutex_exit(&ds->ds_sendstream_lock); 98 99 return (dsp->dsa_err); 100} 101 102static int 103dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, 104 uint64_t length) 105{ 106 struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); 107 108 /* 109 * When we receive a free record, dbuf_free_range() assumes 110 * that the receiving system doesn't have any dbufs in the range 111 * being freed. This is always true because there is a one-record 112 * constraint: we only send one WRITE record for any given 113 * object+offset. We know that the one-record constraint is 114 * true because we always send data in increasing order by 115 * object,offset. 116 * 117 * If the increasing-order constraint ever changes, we should find 118 * another way to assert that the one-record constraint is still 119 * satisfied. 120 */ 121 ASSERT(object > dsp->dsa_last_data_object || 122 (object == dsp->dsa_last_data_object && 123 offset > dsp->dsa_last_data_offset)); 124 125 /* 126 * If we are doing a non-incremental send, then there can't 127 * be any data in the dataset we're receiving into. Therefore 128 * a free record would simply be a no-op. Save space by not 129 * sending it to begin with. 130 */ 131 if (!dsp->dsa_incremental) 132 return (0); 133 134 if (length != -1ULL && offset + length < offset) 135 length = -1ULL; 136 137 /* 138 * If there is a pending op, but it's not PENDING_FREE, push it out, 139 * since free block aggregation can only be done for blocks of the 140 * same type (i.e., DRR_FREE records can only be aggregated with 141 * other DRR_FREE records. DRR_FREEOBJECTS records can only be 142 * aggregated with other DRR_FREEOBJECTS records. 143 */ 144 if (dsp->dsa_pending_op != PENDING_NONE && 145 dsp->dsa_pending_op != PENDING_FREE) { 146 if (dump_bytes(dsp, dsp->dsa_drr, 147 sizeof (dmu_replay_record_t)) != 0) 148 return (SET_ERROR(EINTR)); 149 dsp->dsa_pending_op = PENDING_NONE; 150 } 151 152 if (dsp->dsa_pending_op == PENDING_FREE) { 153 /* 154 * There should never be a PENDING_FREE if length is -1 155 * (because dump_dnode is the only place where this 156 * function is called with a -1, and only after flushing 157 * any pending record). 158 */ 159 ASSERT(length != -1ULL); 160 /* 161 * Check to see whether this free block can be aggregated 162 * with pending one. 163 */ 164 if (drrf->drr_object == object && drrf->drr_offset + 165 drrf->drr_length == offset) { 166 drrf->drr_length += length; 167 return (0); 168 } else { 169 /* not a continuation. Push out pending record */ 170 if (dump_bytes(dsp, dsp->dsa_drr, 171 sizeof (dmu_replay_record_t)) != 0) 172 return (SET_ERROR(EINTR)); 173 dsp->dsa_pending_op = PENDING_NONE; 174 } 175 } 176 /* create a FREE record and make it pending */ 177 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 178 dsp->dsa_drr->drr_type = DRR_FREE; 179 drrf->drr_object = object; 180 drrf->drr_offset = offset; 181 drrf->drr_length = length; 182 drrf->drr_toguid = dsp->dsa_toguid; 183 if (length == -1ULL) { 184 if (dump_bytes(dsp, dsp->dsa_drr, 185 sizeof (dmu_replay_record_t)) != 0) 186 return (SET_ERROR(EINTR)); 187 } else { 188 dsp->dsa_pending_op = PENDING_FREE; 189 } 190 191 return (0); 192} 193 194static int 195dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, 196 uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) 197{ 198 struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); 199 200 /* 201 * We send data in increasing object, offset order. 202 * See comment in dump_free() for details. 203 */ 204 ASSERT(object > dsp->dsa_last_data_object || 205 (object == dsp->dsa_last_data_object && 206 offset > dsp->dsa_last_data_offset)); 207 dsp->dsa_last_data_object = object; 208 dsp->dsa_last_data_offset = offset + blksz - 1; 209 210 /* 211 * If there is any kind of pending aggregation (currently either 212 * a grouping of free objects or free blocks), push it out to 213 * the stream, since aggregation can't be done across operations 214 * of different types. 215 */ 216 if (dsp->dsa_pending_op != PENDING_NONE) { 217 if (dump_bytes(dsp, dsp->dsa_drr, 218 sizeof (dmu_replay_record_t)) != 0) 219 return (SET_ERROR(EINTR)); 220 dsp->dsa_pending_op = PENDING_NONE; 221 } 222 /* write a DATA record */ 223 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 224 dsp->dsa_drr->drr_type = DRR_WRITE; 225 drrw->drr_object = object; 226 drrw->drr_type = type; 227 drrw->drr_offset = offset; 228 drrw->drr_length = blksz; 229 drrw->drr_toguid = dsp->dsa_toguid; 230 if (BP_IS_EMBEDDED(bp)) { 231 /* 232 * There's no pre-computed checksum of embedded BP's, so 233 * (like fletcher4-checkummed blocks) userland will have 234 * to compute a dedup-capable checksum itself. 235 */ 236 drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; 237 } else { 238 drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); 239 if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) 240 drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; 241 DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); 242 DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); 243 DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); 244 drrw->drr_key.ddk_cksum = bp->blk_cksum; 245 } 246 247 if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) 248 return (SET_ERROR(EINTR)); 249 if (dump_bytes(dsp, data, blksz) != 0) 250 return (SET_ERROR(EINTR)); 251 return (0); 252} 253 254static int 255dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, 256 int blksz, const blkptr_t *bp) 257{ 258 char buf[BPE_PAYLOAD_SIZE]; 259 struct drr_write_embedded *drrw = 260 &(dsp->dsa_drr->drr_u.drr_write_embedded); 261 262 if (dsp->dsa_pending_op != PENDING_NONE) { 263 if (dump_bytes(dsp, dsp->dsa_drr, 264 sizeof (dmu_replay_record_t)) != 0) 265 return (EINTR); 266 dsp->dsa_pending_op = PENDING_NONE; 267 } 268 269 ASSERT(BP_IS_EMBEDDED(bp)); 270 271 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 272 dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED; 273 drrw->drr_object = object; 274 drrw->drr_offset = offset; 275 drrw->drr_length = blksz; 276 drrw->drr_toguid = dsp->dsa_toguid; 277 drrw->drr_compression = BP_GET_COMPRESS(bp); 278 drrw->drr_etype = BPE_GET_ETYPE(bp); 279 drrw->drr_lsize = BPE_GET_LSIZE(bp); 280 drrw->drr_psize = BPE_GET_PSIZE(bp); 281 282 decode_embedded_bp_compressed(bp, buf); 283 284 if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) 285 return (EINTR); 286 if (dump_bytes(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) 287 return (EINTR); 288 return (0); 289} 290 291static int 292dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) 293{ 294 struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); 295 296 if (dsp->dsa_pending_op != PENDING_NONE) { 297 if (dump_bytes(dsp, dsp->dsa_drr, 298 sizeof (dmu_replay_record_t)) != 0) 299 return (SET_ERROR(EINTR)); 300 dsp->dsa_pending_op = PENDING_NONE; 301 } 302 303 /* write a SPILL record */ 304 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 305 dsp->dsa_drr->drr_type = DRR_SPILL; 306 drrs->drr_object = object; 307 drrs->drr_length = blksz; 308 drrs->drr_toguid = dsp->dsa_toguid; 309 310 if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t))) 311 return (SET_ERROR(EINTR)); 312 if (dump_bytes(dsp, data, blksz)) 313 return (SET_ERROR(EINTR)); 314 return (0); 315} 316 317static int 318dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) 319{ 320 struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); 321 322 /* See comment in dump_free(). */ 323 if (!dsp->dsa_incremental) 324 return (0); 325 326 /* 327 * If there is a pending op, but it's not PENDING_FREEOBJECTS, 328 * push it out, since free block aggregation can only be done for 329 * blocks of the same type (i.e., DRR_FREE records can only be 330 * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records 331 * can only be aggregated with other DRR_FREEOBJECTS records. 332 */ 333 if (dsp->dsa_pending_op != PENDING_NONE && 334 dsp->dsa_pending_op != PENDING_FREEOBJECTS) { 335 if (dump_bytes(dsp, dsp->dsa_drr, 336 sizeof (dmu_replay_record_t)) != 0) 337 return (SET_ERROR(EINTR)); 338 dsp->dsa_pending_op = PENDING_NONE; 339 } 340 if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) { 341 /* 342 * See whether this free object array can be aggregated 343 * with pending one 344 */ 345 if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { 346 drrfo->drr_numobjs += numobjs; 347 return (0); 348 } else { 349 /* can't be aggregated. Push out pending record */ 350 if (dump_bytes(dsp, dsp->dsa_drr, 351 sizeof (dmu_replay_record_t)) != 0) 352 return (SET_ERROR(EINTR)); 353 dsp->dsa_pending_op = PENDING_NONE; 354 } 355 } 356 357 /* write a FREEOBJECTS record */ 358 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 359 dsp->dsa_drr->drr_type = DRR_FREEOBJECTS; 360 drrfo->drr_firstobj = firstobj; 361 drrfo->drr_numobjs = numobjs; 362 drrfo->drr_toguid = dsp->dsa_toguid; 363 364 dsp->dsa_pending_op = PENDING_FREEOBJECTS; 365 366 return (0); 367} 368 369static int 370dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) 371{ 372 struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); 373 374 if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) 375 return (dump_freeobjects(dsp, object, 1)); 376 377 if (dsp->dsa_pending_op != PENDING_NONE) { 378 if (dump_bytes(dsp, dsp->dsa_drr, 379 sizeof (dmu_replay_record_t)) != 0) 380 return (SET_ERROR(EINTR)); 381 dsp->dsa_pending_op = PENDING_NONE; 382 } 383 384 /* write an OBJECT record */ 385 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 386 dsp->dsa_drr->drr_type = DRR_OBJECT; 387 drro->drr_object = object; 388 drro->drr_type = dnp->dn_type; 389 drro->drr_bonustype = dnp->dn_bonustype; 390 drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; 391 drro->drr_bonuslen = dnp->dn_bonuslen; 392 drro->drr_checksumtype = dnp->dn_checksum; 393 drro->drr_compress = dnp->dn_compress; 394 drro->drr_toguid = dsp->dsa_toguid; 395 396 if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) 397 return (SET_ERROR(EINTR)); 398 399 if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) 400 return (SET_ERROR(EINTR)); 401 402 /* Free anything past the end of the file. */ 403 if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * 404 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0) 405 return (SET_ERROR(EINTR)); 406 if (dsp->dsa_err != 0) 407 return (SET_ERROR(EINTR)); 408 return (0); 409} 410 411static boolean_t 412backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) 413{ 414 if (!BP_IS_EMBEDDED(bp)) 415 return (B_FALSE); 416 417 /* 418 * Compression function must be legacy, or explicitly enabled. 419 */ 420 if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && 421 !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4))) 422 return (B_FALSE); 423 424 /* 425 * Embed type must be explicitly enabled. 426 */ 427 switch (BPE_GET_ETYPE(bp)) { 428 case BP_EMBEDDED_TYPE_DATA: 429 if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) 430 return (B_TRUE); 431 break; 432 default: 433 return (B_FALSE); 434 } 435 return (B_FALSE); 436} 437 438#define BP_SPAN(dnp, level) \ 439 (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ 440 (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) 441 442/* ARGSUSED */ 443static int 444backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 445 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 446{ 447 dmu_sendarg_t *dsp = arg; 448 dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; 449 int err = 0; 450 451 if (issig(JUSTLOOKING) && issig(FORREAL)) 452 return (SET_ERROR(EINTR)); 453 454 if (zb->zb_object != DMU_META_DNODE_OBJECT && 455 DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { 456 return (0); 457 } else if (zb->zb_level == ZB_ZIL_LEVEL) { 458 /* 459 * If we are sending a non-snapshot (which is allowed on 460 * read-only pools), it may have a ZIL, which must be ignored. 461 */ 462 return (0); 463 } else if (BP_IS_HOLE(bp) && 464 zb->zb_object == DMU_META_DNODE_OBJECT) { 465 uint64_t span = BP_SPAN(dnp, zb->zb_level); 466 uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; 467 err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT); 468 } else if (BP_IS_HOLE(bp)) { 469 uint64_t span = BP_SPAN(dnp, zb->zb_level); 470 err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span); 471 } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { 472 return (0); 473 } else if (type == DMU_OT_DNODE) { 474 dnode_phys_t *blk; 475 int i; 476 int blksz = BP_GET_LSIZE(bp); 477 uint32_t aflags = ARC_WAIT; 478 arc_buf_t *abuf; 479 480 if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 481 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 482 &aflags, zb) != 0) 483 return (SET_ERROR(EIO)); 484 485 blk = abuf->b_data; 486 for (i = 0; i < blksz >> DNODE_SHIFT; i++) { 487 uint64_t dnobj = (zb->zb_blkid << 488 (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; 489 err = dump_dnode(dsp, dnobj, blk+i); 490 if (err != 0) 491 break; 492 } 493 (void) arc_buf_remove_ref(abuf, &abuf); 494 } else if (type == DMU_OT_SA) { 495 uint32_t aflags = ARC_WAIT; 496 arc_buf_t *abuf; 497 int blksz = BP_GET_LSIZE(bp); 498 499 if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 500 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 501 &aflags, zb) != 0) 502 return (SET_ERROR(EIO)); 503 504 err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); 505 (void) arc_buf_remove_ref(abuf, &abuf); 506 } else if (backup_do_embed(dsp, bp)) { 507 /* it's an embedded level-0 block of a regular object */ 508 int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; 509 err = dump_write_embedded(dsp, zb->zb_object, 510 zb->zb_blkid * blksz, blksz, bp); 511 } else { /* it's a level-0 block of a regular object */ 512 uint32_t aflags = ARC_WAIT; 513 arc_buf_t *abuf; 514 int blksz = BP_GET_LSIZE(bp); 515 516 ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); 517 ASSERT0(zb->zb_level); 518 if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 519 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 520 &aflags, zb) != 0) { 521 if (zfs_send_corrupt_data) { 522 /* Send a block filled with 0x"zfs badd bloc" */ 523 abuf = arc_buf_alloc(spa, blksz, &abuf, 524 ARC_BUFC_DATA); 525 uint64_t *ptr; 526 for (ptr = abuf->b_data; 527 (char *)ptr < (char *)abuf->b_data + blksz; 528 ptr++) 529 *ptr = 0x2f5baddb10c; 530 } else { 531 return (SET_ERROR(EIO)); 532 } 533 } 534 535 err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz, 536 blksz, bp, abuf->b_data); 537 (void) arc_buf_remove_ref(abuf, &abuf); 538 } 539 540 ASSERT(err == 0 || err == EINTR); 541 return (err); 542} 543 544/* 545 * Releases dp using the specified tag. 546 */ 547static int 548dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, 549 zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok, 550#ifdef illumos 551 int outfd, vnode_t *vp, offset_t *off) 552#else 553 int outfd, struct file *fp, offset_t *off) 554#endif 555{ 556 objset_t *os; 557 dmu_replay_record_t *drr; 558 dmu_sendarg_t *dsp; 559 int err; 560 uint64_t fromtxg = 0; 561 uint64_t featureflags = 0; 562 563 err = dmu_objset_from_ds(ds, &os); 564 if (err != 0) { 565 dsl_pool_rele(dp, tag); 566 return (err); 567 } 568 569 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); 570 drr->drr_type = DRR_BEGIN; 571 drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; 572 DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, 573 DMU_SUBSTREAM); 574 575#ifdef _KERNEL 576 if (dmu_objset_type(os) == DMU_OST_ZFS) { 577 uint64_t version; 578 if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) { 579 kmem_free(drr, sizeof (dmu_replay_record_t)); 580 dsl_pool_rele(dp, tag); 581 return (SET_ERROR(EINVAL)); 582 } 583 if (version >= ZPL_VERSION_SA) { 584 featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; 585 } 586 } 587#endif 588 589 if (embedok && 590 spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { 591 featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; 592 if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) 593 featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4; 594 } else { 595 embedok = B_FALSE; 596 } 597 598 DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, 599 featureflags); 600 601 drr->drr_u.drr_begin.drr_creation_time = 602 ds->ds_phys->ds_creation_time; 603 drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); 604 if (is_clone) 605 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; 606 drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; 607 if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) 608 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; 609 610 if (fromzb != NULL) { 611 drr->drr_u.drr_begin.drr_fromguid = fromzb->zbm_guid; 612 fromtxg = fromzb->zbm_creation_txg; 613 } 614 dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); 615 if (!dsl_dataset_is_snapshot(ds)) { 616 (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", 617 sizeof (drr->drr_u.drr_begin.drr_toname)); 618 } 619 620 dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); 621 622 dsp->dsa_drr = drr; 623 dsp->dsa_outfd = outfd; 624 dsp->dsa_proc = curproc; 625 dsp->dsa_td = curthread; 626 dsp->dsa_fp = fp; 627 dsp->dsa_os = os; 628 dsp->dsa_off = off; 629 dsp->dsa_toguid = ds->ds_phys->ds_guid; 630 ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0); 631 dsp->dsa_pending_op = PENDING_NONE; 632 dsp->dsa_incremental = (fromzb != NULL); 633 dsp->dsa_featureflags = featureflags; 634 635 mutex_enter(&ds->ds_sendstream_lock); 636 list_insert_head(&ds->ds_sendstreams, dsp); 637 mutex_exit(&ds->ds_sendstream_lock); 638 639 dsl_dataset_long_hold(ds, FTAG); 640 dsl_pool_rele(dp, tag); 641 642 if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { 643 err = dsp->dsa_err; 644 goto out; 645 } 646 647 err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, 648 backup_cb, dsp); 649 650 if (dsp->dsa_pending_op != PENDING_NONE) 651 if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) 652 err = SET_ERROR(EINTR); 653 654 if (err != 0) { 655 if (err == EINTR && dsp->dsa_err != 0) 656 err = dsp->dsa_err; 657 goto out; 658 } 659 660 bzero(drr, sizeof (dmu_replay_record_t)); 661 drr->drr_type = DRR_END; 662 drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; 663 drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; 664 665 if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { 666 err = dsp->dsa_err; 667 goto out; 668 } 669 670out: 671 mutex_enter(&ds->ds_sendstream_lock); 672 list_remove(&ds->ds_sendstreams, dsp); 673 mutex_exit(&ds->ds_sendstream_lock); 674 675 kmem_free(drr, sizeof (dmu_replay_record_t)); 676 kmem_free(dsp, sizeof (dmu_sendarg_t)); 677 678 dsl_dataset_long_rele(ds, FTAG); 679 680 return (err); 681} 682 683int 684dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, 685#ifdef illumos 686 boolean_t embedok, int outfd, vnode_t *vp, offset_t *off) 687#else 688 boolean_t embedok, int outfd, struct file *fp, offset_t *off) 689#endif 690{ 691 dsl_pool_t *dp; 692 dsl_dataset_t *ds; 693 dsl_dataset_t *fromds = NULL; 694 int err; 695 696 err = dsl_pool_hold(pool, FTAG, &dp); 697 if (err != 0) 698 return (err); 699 700 err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds); 701 if (err != 0) { 702 dsl_pool_rele(dp, FTAG); 703 return (err); 704 } 705 706 if (fromsnap != 0) { 707 zfs_bookmark_phys_t zb; 708 boolean_t is_clone; 709 710 err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds); 711 if (err != 0) { 712 dsl_dataset_rele(ds, FTAG); 713 dsl_pool_rele(dp, FTAG); 714 return (err); 715 } 716 if (!dsl_dataset_is_before(ds, fromds, 0)) 717 err = SET_ERROR(EXDEV); 718 zb.zbm_creation_time = fromds->ds_phys->ds_creation_time; 719 zb.zbm_creation_txg = fromds->ds_phys->ds_creation_txg; 720 zb.zbm_guid = fromds->ds_phys->ds_guid; 721 is_clone = (fromds->ds_dir != ds->ds_dir); 722 dsl_dataset_rele(fromds, FTAG); 723 err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok, 724 outfd, fp, off); 725 } else { 726 err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok, 727 outfd, fp, off); 728 } 729 dsl_dataset_rele(ds, FTAG); 730 return (err); 731} 732 733int 734dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, 735#ifdef illumos 736 int outfd, vnode_t *vp, offset_t *off) 737#else 738 int outfd, struct file *fp, offset_t *off) 739#endif 740{ 741 dsl_pool_t *dp; 742 dsl_dataset_t *ds; 743 int err; 744 boolean_t owned = B_FALSE; 745 746 if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) 747 return (SET_ERROR(EINVAL)); 748 749 err = dsl_pool_hold(tosnap, FTAG, &dp); 750 if (err != 0) 751 return (err); 752 753 if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) { 754 /* 755 * We are sending a filesystem or volume. Ensure 756 * that it doesn't change by owning the dataset. 757 */ 758 err = dsl_dataset_own(dp, tosnap, FTAG, &ds); 759 owned = B_TRUE; 760 } else { 761 err = dsl_dataset_hold(dp, tosnap, FTAG, &ds); 762 } 763 if (err != 0) { 764 dsl_pool_rele(dp, FTAG); 765 return (err); 766 } 767 768 if (fromsnap != NULL) { 769 zfs_bookmark_phys_t zb; 770 boolean_t is_clone = B_FALSE; 771 int fsnamelen = strchr(tosnap, '@') - tosnap; 772 773 /* 774 * If the fromsnap is in a different filesystem, then 775 * mark the send stream as a clone. 776 */ 777 if (strncmp(tosnap, fromsnap, fsnamelen) != 0 || 778 (fromsnap[fsnamelen] != '@' && 779 fromsnap[fsnamelen] != '#')) { 780 is_clone = B_TRUE; 781 } 782 783 if (strchr(fromsnap, '@')) { 784 dsl_dataset_t *fromds; 785 err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds); 786 if (err == 0) { 787 if (!dsl_dataset_is_before(ds, fromds, 0)) 788 err = SET_ERROR(EXDEV); 789 zb.zbm_creation_time = 790 fromds->ds_phys->ds_creation_time; 791 zb.zbm_creation_txg = 792 fromds->ds_phys->ds_creation_txg; 793 zb.zbm_guid = fromds->ds_phys->ds_guid; 794 is_clone = (ds->ds_dir != fromds->ds_dir); 795 dsl_dataset_rele(fromds, FTAG); 796 } 797 } else { 798 err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb); 799 } 800 if (err != 0) { 801 dsl_dataset_rele(ds, FTAG); 802 dsl_pool_rele(dp, FTAG); 803 return (err); 804 } 805 err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok, 806 outfd, fp, off); 807 } else { 808 err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok, 809 outfd, fp, off); 810 } 811 if (owned) 812 dsl_dataset_disown(ds, FTAG); 813 else 814 dsl_dataset_rele(ds, FTAG); 815 return (err); 816} 817 818int 819dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep) 820{ 821 dsl_pool_t *dp = ds->ds_dir->dd_pool; 822 int err; 823 uint64_t size; 824 825 ASSERT(dsl_pool_config_held(dp)); 826 827 /* tosnap must be a snapshot */ 828 if (!dsl_dataset_is_snapshot(ds)) 829 return (SET_ERROR(EINVAL)); 830 831 /* 832 * fromsnap must be an earlier snapshot from the same fs as tosnap, 833 * or the origin's fs. 834 */ 835 if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0)) 836 return (SET_ERROR(EXDEV)); 837 838 /* Get uncompressed size estimate of changed data. */ 839 if (fromds == NULL) { 840 size = ds->ds_phys->ds_uncompressed_bytes; 841 } else { 842 uint64_t used, comp; 843 err = dsl_dataset_space_written(fromds, ds, 844 &used, &comp, &size); 845 if (err != 0) 846 return (err); 847 } 848 849 /* 850 * Assume that space (both on-disk and in-stream) is dominated by 851 * data. We will adjust for indirect blocks and the copies property, 852 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). 853 */ 854 855 /* 856 * Subtract out approximate space used by indirect blocks. 857 * Assume most space is used by data blocks (non-indirect, non-dnode). 858 * Assume all blocks are recordsize. Assume ditto blocks and 859 * internal fragmentation counter out compression. 860 * 861 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per 862 * block, which we observe in practice. 863 */ 864 uint64_t recordsize; 865 err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize); 866 if (err != 0) 867 return (err); 868 size -= size / recordsize * sizeof (blkptr_t); 869 870 /* Add in the space for the record associated with each block. */ 871 size += size / recordsize * sizeof (dmu_replay_record_t); 872 873 *sizep = size; 874 875 return (0); 876} 877 878typedef struct dmu_recv_begin_arg { 879 const char *drba_origin; 880 dmu_recv_cookie_t *drba_cookie; 881 cred_t *drba_cred; 882 uint64_t drba_snapobj; 883} dmu_recv_begin_arg_t; 884 885static int 886recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, 887 uint64_t fromguid) 888{ 889 uint64_t val; 890 int error; 891 dsl_pool_t *dp = ds->ds_dir->dd_pool; 892 893 /* temporary clone name must not exist */ 894 error = zap_lookup(dp->dp_meta_objset, 895 ds->ds_dir->dd_phys->dd_child_dir_zapobj, recv_clone_name, 896 8, 1, &val); 897 if (error != ENOENT) 898 return (error == 0 ? EBUSY : error); 899 900 /* new snapshot name must not exist */ 901 error = zap_lookup(dp->dp_meta_objset, 902 ds->ds_phys->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap, 903 8, 1, &val); 904 if (error != ENOENT) 905 return (error == 0 ? EEXIST : error); 906 907 /* 908 * Check snapshot limit before receiving. We'll recheck again at the 909 * end, but might as well abort before receiving if we're already over 910 * the limit. 911 * 912 * Note that we do not check the file system limit with 913 * dsl_dir_fscount_check because the temporary %clones don't count 914 * against that limit. 915 */ 916 error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, 917 NULL, drba->drba_cred); 918 if (error != 0) 919 return (error); 920 921 if (fromguid != 0) { 922 dsl_dataset_t *snap; 923 uint64_t obj = ds->ds_phys->ds_prev_snap_obj; 924 925 /* Find snapshot in this dir that matches fromguid. */ 926 while (obj != 0) { 927 error = dsl_dataset_hold_obj(dp, obj, FTAG, 928 &snap); 929 if (error != 0) 930 return (SET_ERROR(ENODEV)); 931 if (snap->ds_dir != ds->ds_dir) { 932 dsl_dataset_rele(snap, FTAG); 933 return (SET_ERROR(ENODEV)); 934 } 935 if (snap->ds_phys->ds_guid == fromguid) 936 break; 937 obj = snap->ds_phys->ds_prev_snap_obj; 938 dsl_dataset_rele(snap, FTAG); 939 } 940 if (obj == 0) 941 return (SET_ERROR(ENODEV)); 942 943 if (drba->drba_cookie->drc_force) { 944 drba->drba_snapobj = obj; 945 } else { 946 /* 947 * If we are not forcing, there must be no 948 * changes since fromsnap. 949 */ 950 if (dsl_dataset_modified_since_snap(ds, snap)) { 951 dsl_dataset_rele(snap, FTAG); 952 return (SET_ERROR(ETXTBSY)); 953 } 954 drba->drba_snapobj = ds->ds_prev->ds_object; 955 } 956 957 dsl_dataset_rele(snap, FTAG); 958 } else { 959 /* if full, most recent snapshot must be $ORIGIN */ 960 if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL) 961 return (SET_ERROR(ENODEV)); 962 drba->drba_snapobj = ds->ds_phys->ds_prev_snap_obj; 963 } 964 965 return (0); 966 967} 968 969static int 970dmu_recv_begin_check(void *arg, dmu_tx_t *tx) 971{ 972 dmu_recv_begin_arg_t *drba = arg; 973 dsl_pool_t *dp = dmu_tx_pool(tx); 974 struct drr_begin *drrb = drba->drba_cookie->drc_drrb; 975 uint64_t fromguid = drrb->drr_fromguid; 976 int flags = drrb->drr_flags; 977 int error; 978 uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); 979 dsl_dataset_t *ds; 980 const char *tofs = drba->drba_cookie->drc_tofs; 981 982 /* already checked */ 983 ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); 984 985 if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == 986 DMU_COMPOUNDSTREAM || 987 drrb->drr_type >= DMU_OST_NUMTYPES || 988 ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL)) 989 return (SET_ERROR(EINVAL)); 990 991 /* Verify pool version supports SA if SA_SPILL feature set */ 992 if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && 993 spa_version(dp->dp_spa) < SPA_VERSION_SA) 994 return (SET_ERROR(ENOTSUP)); 995 996 /* 997 * The receiving code doesn't know how to translate a WRITE_EMBEDDED 998 * record to a plan WRITE record, so the pool must have the 999 * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED 1000 * records. Same with WRITE_EMBEDDED records that use LZ4 compression. 1001 */ 1002 if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && 1003 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) 1004 return (SET_ERROR(ENOTSUP)); 1005 if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) && 1006 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) 1007 return (SET_ERROR(ENOTSUP)); 1008 1009 error = dsl_dataset_hold(dp, tofs, FTAG, &ds); 1010 if (error == 0) { 1011 /* target fs already exists; recv into temp clone */ 1012 1013 /* Can't recv a clone into an existing fs */ 1014 if (flags & DRR_FLAG_CLONE) { 1015 dsl_dataset_rele(ds, FTAG); 1016 return (SET_ERROR(EINVAL)); 1017 } 1018 1019 error = recv_begin_check_existing_impl(drba, ds, fromguid); 1020 dsl_dataset_rele(ds, FTAG); 1021 } else if (error == ENOENT) { 1022 /* target fs does not exist; must be a full backup or clone */ 1023 char buf[MAXNAMELEN]; 1024 1025 /* 1026 * If it's a non-clone incremental, we are missing the 1027 * target fs, so fail the recv. 1028 */ 1029 if (fromguid != 0 && !(flags & DRR_FLAG_CLONE)) 1030 return (SET_ERROR(ENOENT)); 1031 1032 /* Open the parent of tofs */ 1033 ASSERT3U(strlen(tofs), <, MAXNAMELEN); 1034 (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); 1035 error = dsl_dataset_hold(dp, buf, FTAG, &ds); 1036 if (error != 0) 1037 return (error); 1038 1039 /* 1040 * Check filesystem and snapshot limits before receiving. We'll 1041 * recheck snapshot limits again at the end (we create the 1042 * filesystems and increment those counts during begin_sync). 1043 */ 1044 error = dsl_fs_ss_limit_check(ds->ds_dir, 1, 1045 ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred); 1046 if (error != 0) { 1047 dsl_dataset_rele(ds, FTAG); 1048 return (error); 1049 } 1050 1051 error = dsl_fs_ss_limit_check(ds->ds_dir, 1, 1052 ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); 1053 if (error != 0) { 1054 dsl_dataset_rele(ds, FTAG); 1055 return (error); 1056 } 1057 1058 if (drba->drba_origin != NULL) { 1059 dsl_dataset_t *origin; 1060 error = dsl_dataset_hold(dp, drba->drba_origin, 1061 FTAG, &origin); 1062 if (error != 0) { 1063 dsl_dataset_rele(ds, FTAG); 1064 return (error); 1065 } 1066 if (!dsl_dataset_is_snapshot(origin)) { 1067 dsl_dataset_rele(origin, FTAG); 1068 dsl_dataset_rele(ds, FTAG); 1069 return (SET_ERROR(EINVAL)); 1070 } 1071 if (origin->ds_phys->ds_guid != fromguid) { 1072 dsl_dataset_rele(origin, FTAG); 1073 dsl_dataset_rele(ds, FTAG); 1074 return (SET_ERROR(ENODEV)); 1075 } 1076 dsl_dataset_rele(origin, FTAG); 1077 } 1078 dsl_dataset_rele(ds, FTAG); 1079 error = 0; 1080 } 1081 return (error); 1082} 1083 1084static void 1085dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) 1086{ 1087 dmu_recv_begin_arg_t *drba = arg; 1088 dsl_pool_t *dp = dmu_tx_pool(tx); 1089 struct drr_begin *drrb = drba->drba_cookie->drc_drrb; 1090 const char *tofs = drba->drba_cookie->drc_tofs; 1091 dsl_dataset_t *ds, *newds; 1092 uint64_t dsobj; 1093 int error; 1094 uint64_t crflags; 1095 1096 crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ? 1097 DS_FLAG_CI_DATASET : 0; 1098 1099 error = dsl_dataset_hold(dp, tofs, FTAG, &ds); 1100 if (error == 0) { 1101 /* create temporary clone */ 1102 dsl_dataset_t *snap = NULL; 1103 if (drba->drba_snapobj != 0) { 1104 VERIFY0(dsl_dataset_hold_obj(dp, 1105 drba->drba_snapobj, FTAG, &snap)); 1106 } 1107 dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, 1108 snap, crflags, drba->drba_cred, tx); 1109 dsl_dataset_rele(snap, FTAG); 1110 dsl_dataset_rele(ds, FTAG); 1111 } else { 1112 dsl_dir_t *dd; 1113 const char *tail; 1114 dsl_dataset_t *origin = NULL; 1115 1116 VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); 1117 1118 if (drba->drba_origin != NULL) { 1119 VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, 1120 FTAG, &origin)); 1121 } 1122 1123 /* Create new dataset. */ 1124 dsobj = dsl_dataset_create_sync(dd, 1125 strrchr(tofs, '/') + 1, 1126 origin, crflags, drba->drba_cred, tx); 1127 if (origin != NULL) 1128 dsl_dataset_rele(origin, FTAG); 1129 dsl_dir_rele(dd, FTAG); 1130 drba->drba_cookie->drc_newfs = B_TRUE; 1131 } 1132 VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); 1133 1134 dmu_buf_will_dirty(newds->ds_dbuf, tx); 1135 newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; 1136 1137 /* 1138 * If we actually created a non-clone, we need to create the 1139 * objset in our new dataset. 1140 */ 1141 if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) { 1142 (void) dmu_objset_create_impl(dp->dp_spa, 1143 newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); 1144 } 1145 1146 drba->drba_cookie->drc_ds = newds; 1147 1148 spa_history_log_internal_ds(newds, "receive", tx, ""); 1149} 1150 1151/* 1152 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() 1153 * succeeds; otherwise we will leak the holds on the datasets. 1154 */ 1155int 1156dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, 1157 boolean_t force, char *origin, dmu_recv_cookie_t *drc) 1158{ 1159 dmu_recv_begin_arg_t drba = { 0 }; 1160 dmu_replay_record_t *drr; 1161 1162 bzero(drc, sizeof (dmu_recv_cookie_t)); 1163 drc->drc_drrb = drrb; 1164 drc->drc_tosnap = tosnap; 1165 drc->drc_tofs = tofs; 1166 drc->drc_force = force; 1167 drc->drc_cred = CRED(); 1168 1169 if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) 1170 drc->drc_byteswap = B_TRUE; 1171 else if (drrb->drr_magic != DMU_BACKUP_MAGIC) 1172 return (SET_ERROR(EINVAL)); 1173 1174 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); 1175 drr->drr_type = DRR_BEGIN; 1176 drr->drr_u.drr_begin = *drc->drc_drrb; 1177 if (drc->drc_byteswap) { 1178 fletcher_4_incremental_byteswap(drr, 1179 sizeof (dmu_replay_record_t), &drc->drc_cksum); 1180 } else { 1181 fletcher_4_incremental_native(drr, 1182 sizeof (dmu_replay_record_t), &drc->drc_cksum); 1183 } 1184 kmem_free(drr, sizeof (dmu_replay_record_t)); 1185 1186 if (drc->drc_byteswap) { 1187 drrb->drr_magic = BSWAP_64(drrb->drr_magic); 1188 drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); 1189 drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); 1190 drrb->drr_type = BSWAP_32(drrb->drr_type); 1191 drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); 1192 drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); 1193 } 1194 1195 drba.drba_origin = origin; 1196 drba.drba_cookie = drc; 1197 drba.drba_cred = CRED(); 1198 1199 return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync, 1200 &drba, 5, ZFS_SPACE_CHECK_NORMAL)); 1201} 1202 1203struct restorearg { 1204 int err; 1205 boolean_t byteswap; 1206 kthread_t *td; 1207 struct file *fp; 1208 char *buf; 1209 uint64_t voff; 1210 int bufsize; /* amount of memory allocated for buf */ 1211 zio_cksum_t cksum; 1212 avl_tree_t *guid_to_ds_map; 1213}; 1214 1215typedef struct guid_map_entry { 1216 uint64_t guid; 1217 dsl_dataset_t *gme_ds; 1218 avl_node_t avlnode; 1219} guid_map_entry_t; 1220 1221static int 1222guid_compare(const void *arg1, const void *arg2) 1223{ 1224 const guid_map_entry_t *gmep1 = arg1; 1225 const guid_map_entry_t *gmep2 = arg2; 1226 1227 if (gmep1->guid < gmep2->guid) 1228 return (-1); 1229 else if (gmep1->guid > gmep2->guid) 1230 return (1); 1231 return (0); 1232} 1233 1234static void 1235free_guid_map_onexit(void *arg) 1236{ 1237 avl_tree_t *ca = arg; 1238 void *cookie = NULL; 1239 guid_map_entry_t *gmep; 1240 1241 while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { 1242 dsl_dataset_long_rele(gmep->gme_ds, gmep); 1243 dsl_dataset_rele(gmep->gme_ds, gmep); 1244 kmem_free(gmep, sizeof (guid_map_entry_t)); 1245 } 1246 avl_destroy(ca); 1247 kmem_free(ca, sizeof (avl_tree_t)); 1248} 1249 1250static int 1251restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *resid) 1252{ 1253 struct uio auio; 1254 struct iovec aiov; 1255 int error; 1256 1257 aiov.iov_base = buf; 1258 aiov.iov_len = len; 1259 auio.uio_iov = &aiov; 1260 auio.uio_iovcnt = 1; 1261 auio.uio_resid = len; 1262 auio.uio_segflg = UIO_SYSSPACE; 1263 auio.uio_rw = UIO_READ; 1264 auio.uio_offset = off; 1265 auio.uio_td = ra->td; 1266#ifdef _KERNEL 1267 error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td); 1268#else 1269 fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); 1270 error = EOPNOTSUPP; 1271#endif 1272 *resid = auio.uio_resid; 1273 return (error); 1274} 1275 1276static void * 1277restore_read(struct restorearg *ra, int len, char *buf) 1278{ 1279 int done = 0; 1280 1281 if (buf == NULL) 1282 buf = ra->buf; 1283 1284 /* some things will require 8-byte alignment, so everything must */ 1285 ASSERT0(len % 8); 1286 1287 while (done < len) { 1288 ssize_t resid; 1289 1290 ra->err = restore_bytes(ra, buf + done, 1291 len - done, ra->voff, &resid); 1292 1293 if (resid == len - done) 1294 ra->err = SET_ERROR(EINVAL); 1295 ra->voff += len - done - resid; 1296 done = len - resid; 1297 if (ra->err != 0) 1298 return (NULL); 1299 } 1300 1301 ASSERT3U(done, ==, len); 1302 if (ra->byteswap) 1303 fletcher_4_incremental_byteswap(buf, len, &ra->cksum); 1304 else 1305 fletcher_4_incremental_native(buf, len, &ra->cksum); 1306 return (buf); 1307} 1308 1309static void 1310backup_byteswap(dmu_replay_record_t *drr) 1311{ 1312#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) 1313#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) 1314 drr->drr_type = BSWAP_32(drr->drr_type); 1315 drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); 1316 switch (drr->drr_type) { 1317 case DRR_BEGIN: 1318 DO64(drr_begin.drr_magic); 1319 DO64(drr_begin.drr_versioninfo); 1320 DO64(drr_begin.drr_creation_time); 1321 DO32(drr_begin.drr_type); 1322 DO32(drr_begin.drr_flags); 1323 DO64(drr_begin.drr_toguid); 1324 DO64(drr_begin.drr_fromguid); 1325 break; 1326 case DRR_OBJECT: 1327 DO64(drr_object.drr_object); 1328 DO32(drr_object.drr_type); 1329 DO32(drr_object.drr_bonustype); 1330 DO32(drr_object.drr_blksz); 1331 DO32(drr_object.drr_bonuslen); 1332 DO64(drr_object.drr_toguid); 1333 break; 1334 case DRR_FREEOBJECTS: 1335 DO64(drr_freeobjects.drr_firstobj); 1336 DO64(drr_freeobjects.drr_numobjs); 1337 DO64(drr_freeobjects.drr_toguid); 1338 break; 1339 case DRR_WRITE: 1340 DO64(drr_write.drr_object); 1341 DO32(drr_write.drr_type); 1342 DO64(drr_write.drr_offset); 1343 DO64(drr_write.drr_length); 1344 DO64(drr_write.drr_toguid); 1345 DO64(drr_write.drr_key.ddk_cksum.zc_word[0]); 1346 DO64(drr_write.drr_key.ddk_cksum.zc_word[1]); 1347 DO64(drr_write.drr_key.ddk_cksum.zc_word[2]); 1348 DO64(drr_write.drr_key.ddk_cksum.zc_word[3]); 1349 DO64(drr_write.drr_key.ddk_prop); 1350 break; 1351 case DRR_WRITE_BYREF: 1352 DO64(drr_write_byref.drr_object); 1353 DO64(drr_write_byref.drr_offset); 1354 DO64(drr_write_byref.drr_length); 1355 DO64(drr_write_byref.drr_toguid); 1356 DO64(drr_write_byref.drr_refguid); 1357 DO64(drr_write_byref.drr_refobject); 1358 DO64(drr_write_byref.drr_refoffset); 1359 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]); 1360 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]); 1361 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]); 1362 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]); 1363 DO64(drr_write_byref.drr_key.ddk_prop); 1364 break; 1365 case DRR_WRITE_EMBEDDED: 1366 DO64(drr_write_embedded.drr_object); 1367 DO64(drr_write_embedded.drr_offset); 1368 DO64(drr_write_embedded.drr_length); 1369 DO64(drr_write_embedded.drr_toguid); 1370 DO32(drr_write_embedded.drr_lsize); 1371 DO32(drr_write_embedded.drr_psize); 1372 break; 1373 case DRR_FREE: 1374 DO64(drr_free.drr_object); 1375 DO64(drr_free.drr_offset); 1376 DO64(drr_free.drr_length); 1377 DO64(drr_free.drr_toguid); 1378 break; 1379 case DRR_SPILL: 1380 DO64(drr_spill.drr_object); 1381 DO64(drr_spill.drr_length); 1382 DO64(drr_spill.drr_toguid); 1383 break; 1384 case DRR_END: 1385 DO64(drr_end.drr_checksum.zc_word[0]); 1386 DO64(drr_end.drr_checksum.zc_word[1]); 1387 DO64(drr_end.drr_checksum.zc_word[2]); 1388 DO64(drr_end.drr_checksum.zc_word[3]); 1389 DO64(drr_end.drr_toguid); 1390 break; 1391 } 1392#undef DO64 1393#undef DO32 1394} 1395 1396static inline uint8_t 1397deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) 1398{ 1399 if (bonus_type == DMU_OT_SA) { 1400 return (1); 1401 } else { 1402 return (1 + 1403 ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT)); 1404 } 1405} 1406 1407static int 1408restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) 1409{ 1410 dmu_object_info_t doi; 1411 dmu_tx_t *tx; 1412 void *data = NULL; 1413 uint64_t object; 1414 int err; 1415 1416 if (drro->drr_type == DMU_OT_NONE || 1417 !DMU_OT_IS_VALID(drro->drr_type) || 1418 !DMU_OT_IS_VALID(drro->drr_bonustype) || 1419 drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || 1420 drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || 1421 P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || 1422 drro->drr_blksz < SPA_MINBLOCKSIZE || 1423 drro->drr_blksz > SPA_MAXBLOCKSIZE || 1424 drro->drr_bonuslen > DN_MAX_BONUSLEN) { 1425 return (SET_ERROR(EINVAL)); 1426 } 1427 1428 err = dmu_object_info(os, drro->drr_object, &doi); 1429 1430 if (err != 0 && err != ENOENT) 1431 return (SET_ERROR(EINVAL)); 1432 object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT; 1433 1434 if (drro->drr_bonuslen) { 1435 data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8), NULL); 1436 if (ra->err != 0) 1437 return (ra->err); 1438 } 1439 1440 /* 1441 * If we are losing blkptrs or changing the block size this must 1442 * be a new file instance. We must clear out the previous file 1443 * contents before we can change this type of metadata in the dnode. 1444 */ 1445 if (err == 0) { 1446 int nblkptr; 1447 1448 nblkptr = deduce_nblkptr(drro->drr_bonustype, 1449 drro->drr_bonuslen); 1450 1451 if (drro->drr_blksz != doi.doi_data_block_size || 1452 nblkptr < doi.doi_nblkptr) { 1453 err = dmu_free_long_range(os, drro->drr_object, 1454 0, DMU_OBJECT_END); 1455 if (err != 0) 1456 return (SET_ERROR(EINVAL)); 1457 } 1458 } 1459 1460 tx = dmu_tx_create(os); 1461 dmu_tx_hold_bonus(tx, object); 1462 err = dmu_tx_assign(tx, TXG_WAIT); 1463 if (err != 0) { 1464 dmu_tx_abort(tx); 1465 return (err); 1466 } 1467 1468 if (object == DMU_NEW_OBJECT) { 1469 /* currently free, want to be allocated */ 1470 err = dmu_object_claim(os, drro->drr_object, 1471 drro->drr_type, drro->drr_blksz, 1472 drro->drr_bonustype, drro->drr_bonuslen, tx); 1473 } else if (drro->drr_type != doi.doi_type || 1474 drro->drr_blksz != doi.doi_data_block_size || 1475 drro->drr_bonustype != doi.doi_bonus_type || 1476 drro->drr_bonuslen != doi.doi_bonus_size) { 1477 /* currently allocated, but with different properties */ 1478 err = dmu_object_reclaim(os, drro->drr_object, 1479 drro->drr_type, drro->drr_blksz, 1480 drro->drr_bonustype, drro->drr_bonuslen, tx); 1481 } 1482 if (err != 0) { 1483 dmu_tx_commit(tx); 1484 return (SET_ERROR(EINVAL)); 1485 } 1486 1487 dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype, 1488 tx); 1489 dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); 1490 1491 if (data != NULL) { 1492 dmu_buf_t *db; 1493 1494 VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); 1495 dmu_buf_will_dirty(db, tx); 1496 1497 ASSERT3U(db->db_size, >=, drro->drr_bonuslen); 1498 bcopy(data, db->db_data, drro->drr_bonuslen); 1499 if (ra->byteswap) { 1500 dmu_object_byteswap_t byteswap = 1501 DMU_OT_BYTESWAP(drro->drr_bonustype); 1502 dmu_ot_byteswap[byteswap].ob_func(db->db_data, 1503 drro->drr_bonuslen); 1504 } 1505 dmu_buf_rele(db, FTAG); 1506 } 1507 dmu_tx_commit(tx); 1508 return (0); 1509} 1510 1511/* ARGSUSED */ 1512static int 1513restore_freeobjects(struct restorearg *ra, objset_t *os, 1514 struct drr_freeobjects *drrfo) 1515{ 1516 uint64_t obj; 1517 1518 if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) 1519 return (SET_ERROR(EINVAL)); 1520 1521 for (obj = drrfo->drr_firstobj; 1522 obj < drrfo->drr_firstobj + drrfo->drr_numobjs; 1523 (void) dmu_object_next(os, &obj, FALSE, 0)) { 1524 int err; 1525 1526 if (dmu_object_info(os, obj, NULL) != 0) 1527 continue; 1528 1529 err = dmu_free_long_object(os, obj); 1530 if (err != 0) 1531 return (err); 1532 } 1533 return (0); 1534} 1535 1536static int 1537restore_write(struct restorearg *ra, objset_t *os, 1538 struct drr_write *drrw) 1539{ 1540 dmu_tx_t *tx; 1541 void *data; 1542 int err; 1543 1544 if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || 1545 !DMU_OT_IS_VALID(drrw->drr_type)) 1546 return (SET_ERROR(EINVAL)); 1547 1548 if (dmu_object_info(os, drrw->drr_object, NULL) != 0) 1549 return (SET_ERROR(EINVAL)); 1550 1551 dmu_buf_t *bonus; 1552 if (dmu_bonus_hold(os, drrw->drr_object, FTAG, &bonus) != 0) 1553 return (SET_ERROR(EINVAL)); 1554 1555 arc_buf_t *abuf = dmu_request_arcbuf(bonus, drrw->drr_length); 1556 1557 data = restore_read(ra, drrw->drr_length, abuf->b_data); 1558 if (data == NULL) { 1559 dmu_return_arcbuf(abuf); 1560 dmu_buf_rele(bonus, FTAG); 1561 return (ra->err); 1562 } 1563 1564 tx = dmu_tx_create(os); 1565 1566 dmu_tx_hold_write(tx, drrw->drr_object, 1567 drrw->drr_offset, drrw->drr_length); 1568 err = dmu_tx_assign(tx, TXG_WAIT); 1569 if (err != 0) { 1570 dmu_return_arcbuf(abuf); 1571 dmu_buf_rele(bonus, FTAG); 1572 dmu_tx_abort(tx); 1573 return (err); 1574 } 1575 if (ra->byteswap) { 1576 dmu_object_byteswap_t byteswap = 1577 DMU_OT_BYTESWAP(drrw->drr_type); 1578 dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length); 1579 } 1580 dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); 1581 dmu_tx_commit(tx); 1582 dmu_buf_rele(bonus, FTAG); 1583 return (0); 1584} 1585 1586/* 1587 * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed 1588 * streams to refer to a copy of the data that is already on the 1589 * system because it came in earlier in the stream. This function 1590 * finds the earlier copy of the data, and uses that copy instead of 1591 * data from the stream to fulfill this write. 1592 */ 1593static int 1594restore_write_byref(struct restorearg *ra, objset_t *os, 1595 struct drr_write_byref *drrwbr) 1596{ 1597 dmu_tx_t *tx; 1598 int err; 1599 guid_map_entry_t gmesrch; 1600 guid_map_entry_t *gmep; 1601 avl_index_t where; 1602 objset_t *ref_os = NULL; 1603 dmu_buf_t *dbp; 1604 1605 if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) 1606 return (SET_ERROR(EINVAL)); 1607 1608 /* 1609 * If the GUID of the referenced dataset is different from the 1610 * GUID of the target dataset, find the referenced dataset. 1611 */ 1612 if (drrwbr->drr_toguid != drrwbr->drr_refguid) { 1613 gmesrch.guid = drrwbr->drr_refguid; 1614 if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch, 1615 &where)) == NULL) { 1616 return (SET_ERROR(EINVAL)); 1617 } 1618 if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) 1619 return (SET_ERROR(EINVAL)); 1620 } else { 1621 ref_os = os; 1622 } 1623 1624 err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, 1625 drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); 1626 if (err != 0) 1627 return (err); 1628 1629 tx = dmu_tx_create(os); 1630 1631 dmu_tx_hold_write(tx, drrwbr->drr_object, 1632 drrwbr->drr_offset, drrwbr->drr_length); 1633 err = dmu_tx_assign(tx, TXG_WAIT); 1634 if (err != 0) { 1635 dmu_tx_abort(tx); 1636 return (err); 1637 } 1638 dmu_write(os, drrwbr->drr_object, 1639 drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); 1640 dmu_buf_rele(dbp, FTAG); 1641 dmu_tx_commit(tx); 1642 return (0); 1643} 1644 1645static int 1646restore_write_embedded(struct restorearg *ra, objset_t *os, 1647 struct drr_write_embedded *drrwnp) 1648{ 1649 dmu_tx_t *tx; 1650 int err; 1651 void *data; 1652 1653 if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset) 1654 return (EINVAL); 1655 1656 if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE) 1657 return (EINVAL); 1658 1659 if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES) 1660 return (EINVAL); 1661 if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS) 1662 return (EINVAL); 1663 1664 data = restore_read(ra, P2ROUNDUP(drrwnp->drr_psize, 8), NULL); 1665 if (data == NULL) 1666 return (ra->err); 1667 1668 tx = dmu_tx_create(os); 1669 1670 dmu_tx_hold_write(tx, drrwnp->drr_object, 1671 drrwnp->drr_offset, drrwnp->drr_length); 1672 err = dmu_tx_assign(tx, TXG_WAIT); 1673 if (err != 0) { 1674 dmu_tx_abort(tx); 1675 return (err); 1676 } 1677 1678 dmu_write_embedded(os, drrwnp->drr_object, 1679 drrwnp->drr_offset, data, drrwnp->drr_etype, 1680 drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize, 1681 ra->byteswap ^ ZFS_HOST_BYTEORDER, tx); 1682 1683 dmu_tx_commit(tx); 1684 return (0); 1685} 1686 1687static int 1688restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) 1689{ 1690 dmu_tx_t *tx; 1691 void *data; 1692 dmu_buf_t *db, *db_spill; 1693 int err; 1694 1695 if (drrs->drr_length < SPA_MINBLOCKSIZE || 1696 drrs->drr_length > SPA_MAXBLOCKSIZE) 1697 return (SET_ERROR(EINVAL)); 1698 1699 data = restore_read(ra, drrs->drr_length, NULL); 1700 if (data == NULL) 1701 return (ra->err); 1702 1703 if (dmu_object_info(os, drrs->drr_object, NULL) != 0) 1704 return (SET_ERROR(EINVAL)); 1705 1706 VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db)); 1707 if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { 1708 dmu_buf_rele(db, FTAG); 1709 return (err); 1710 } 1711 1712 tx = dmu_tx_create(os); 1713 1714 dmu_tx_hold_spill(tx, db->db_object); 1715 1716 err = dmu_tx_assign(tx, TXG_WAIT); 1717 if (err != 0) { 1718 dmu_buf_rele(db, FTAG); 1719 dmu_buf_rele(db_spill, FTAG); 1720 dmu_tx_abort(tx); 1721 return (err); 1722 } 1723 dmu_buf_will_dirty(db_spill, tx); 1724 1725 if (db_spill->db_size < drrs->drr_length) 1726 VERIFY(0 == dbuf_spill_set_blksz(db_spill, 1727 drrs->drr_length, tx)); 1728 bcopy(data, db_spill->db_data, drrs->drr_length); 1729 1730 dmu_buf_rele(db, FTAG); 1731 dmu_buf_rele(db_spill, FTAG); 1732 1733 dmu_tx_commit(tx); 1734 return (0); 1735} 1736 1737/* ARGSUSED */ 1738static int 1739restore_free(struct restorearg *ra, objset_t *os, 1740 struct drr_free *drrf) 1741{ 1742 int err; 1743 1744 if (drrf->drr_length != -1ULL && 1745 drrf->drr_offset + drrf->drr_length < drrf->drr_offset) 1746 return (SET_ERROR(EINVAL)); 1747 1748 if (dmu_object_info(os, drrf->drr_object, NULL) != 0) 1749 return (SET_ERROR(EINVAL)); 1750 1751 err = dmu_free_long_range(os, drrf->drr_object, 1752 drrf->drr_offset, drrf->drr_length); 1753 return (err); 1754} 1755 1756/* used to destroy the drc_ds on error */ 1757static void 1758dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) 1759{ 1760 char name[MAXNAMELEN]; 1761 dsl_dataset_name(drc->drc_ds, name); 1762 dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); 1763 (void) dsl_destroy_head(name); 1764} 1765 1766/* 1767 * NB: callers *must* call dmu_recv_end() if this succeeds. 1768 */ 1769int 1770dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, 1771 int cleanup_fd, uint64_t *action_handlep) 1772{ 1773 struct restorearg ra = { 0 }; 1774 dmu_replay_record_t *drr; 1775 objset_t *os; 1776 zio_cksum_t pcksum; 1777 int featureflags; 1778 1779 ra.byteswap = drc->drc_byteswap; 1780 ra.cksum = drc->drc_cksum; 1781 ra.td = curthread; 1782 ra.fp = fp; 1783 ra.voff = *voffp; 1784 ra.bufsize = 1<<20; 1785 ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); 1786 1787 /* these were verified in dmu_recv_begin */ 1788 ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, 1789 DMU_SUBSTREAM); 1790 ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES); 1791 1792 /* 1793 * Open the objset we are modifying. 1794 */ 1795 VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os)); 1796 1797 ASSERT(drc->drc_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); 1798 1799 featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); 1800 1801 /* if this stream is dedup'ed, set up the avl tree for guid mapping */ 1802 if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { 1803 minor_t minor; 1804 1805 if (cleanup_fd == -1) { 1806 ra.err = SET_ERROR(EBADF); 1807 goto out; 1808 } 1809 ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); 1810 if (ra.err != 0) { 1811 cleanup_fd = -1; 1812 goto out; 1813 } 1814 1815 if (*action_handlep == 0) { 1816 ra.guid_to_ds_map = 1817 kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); 1818 avl_create(ra.guid_to_ds_map, guid_compare, 1819 sizeof (guid_map_entry_t), 1820 offsetof(guid_map_entry_t, avlnode)); 1821 ra.err = zfs_onexit_add_cb(minor, 1822 free_guid_map_onexit, ra.guid_to_ds_map, 1823 action_handlep); 1824 if (ra.err != 0) 1825 goto out; 1826 } else { 1827 ra.err = zfs_onexit_cb_data(minor, *action_handlep, 1828 (void **)&ra.guid_to_ds_map); 1829 if (ra.err != 0) 1830 goto out; 1831 } 1832 1833 drc->drc_guid_to_ds_map = ra.guid_to_ds_map; 1834 } 1835 1836 /* 1837 * Read records and process them. 1838 */ 1839 pcksum = ra.cksum; 1840 while (ra.err == 0 && 1841 NULL != (drr = restore_read(&ra, sizeof (*drr), NULL))) { 1842 if (issig(JUSTLOOKING) && issig(FORREAL)) { 1843 ra.err = SET_ERROR(EINTR); 1844 goto out; 1845 } 1846 1847 if (ra.byteswap) 1848 backup_byteswap(drr); 1849 1850 switch (drr->drr_type) { 1851 case DRR_OBJECT: 1852 { 1853 /* 1854 * We need to make a copy of the record header, 1855 * because restore_{object,write} may need to 1856 * restore_read(), which will invalidate drr. 1857 */ 1858 struct drr_object drro = drr->drr_u.drr_object; 1859 ra.err = restore_object(&ra, os, &drro); 1860 break; 1861 } 1862 case DRR_FREEOBJECTS: 1863 { 1864 struct drr_freeobjects drrfo = 1865 drr->drr_u.drr_freeobjects; 1866 ra.err = restore_freeobjects(&ra, os, &drrfo); 1867 break; 1868 } 1869 case DRR_WRITE: 1870 { 1871 struct drr_write drrw = drr->drr_u.drr_write; 1872 ra.err = restore_write(&ra, os, &drrw); 1873 break; 1874 } 1875 case DRR_WRITE_BYREF: 1876 { 1877 struct drr_write_byref drrwbr = 1878 drr->drr_u.drr_write_byref; 1879 ra.err = restore_write_byref(&ra, os, &drrwbr); 1880 break; 1881 } 1882 case DRR_WRITE_EMBEDDED: 1883 { 1884 struct drr_write_embedded drrwe = 1885 drr->drr_u.drr_write_embedded; 1886 ra.err = restore_write_embedded(&ra, os, &drrwe); 1887 break; 1888 } 1889 case DRR_FREE: 1890 { 1891 struct drr_free drrf = drr->drr_u.drr_free; 1892 ra.err = restore_free(&ra, os, &drrf); 1893 break; 1894 } 1895 case DRR_END: 1896 { 1897 struct drr_end drre = drr->drr_u.drr_end; 1898 /* 1899 * We compare against the *previous* checksum 1900 * value, because the stored checksum is of 1901 * everything before the DRR_END record. 1902 */ 1903 if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum)) 1904 ra.err = SET_ERROR(ECKSUM); 1905 goto out; 1906 } 1907 case DRR_SPILL: 1908 { 1909 struct drr_spill drrs = drr->drr_u.drr_spill; 1910 ra.err = restore_spill(&ra, os, &drrs); 1911 break; 1912 } 1913 default: 1914 ra.err = SET_ERROR(EINVAL); 1915 goto out; 1916 } 1917 pcksum = ra.cksum; 1918 } 1919 ASSERT(ra.err != 0); 1920 1921out: 1922 if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) 1923 zfs_onexit_fd_rele(cleanup_fd); 1924 1925 if (ra.err != 0) { 1926 /* 1927 * destroy what we created, so we don't leave it in the 1928 * inconsistent restoring state. 1929 */ 1930 dmu_recv_cleanup_ds(drc); 1931 } 1932 1933 kmem_free(ra.buf, ra.bufsize); 1934 *voffp = ra.voff; 1935 return (ra.err); 1936} 1937 1938static int 1939dmu_recv_end_check(void *arg, dmu_tx_t *tx) 1940{ 1941 dmu_recv_cookie_t *drc = arg; 1942 dsl_pool_t *dp = dmu_tx_pool(tx); 1943 int error; 1944 1945 ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); 1946 1947 if (!drc->drc_newfs) { 1948 dsl_dataset_t *origin_head; 1949 1950 error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); 1951 if (error != 0) 1952 return (error); 1953 if (drc->drc_force) { 1954 /* 1955 * We will destroy any snapshots in tofs (i.e. before 1956 * origin_head) that are after the origin (which is 1957 * the snap before drc_ds, because drc_ds can not 1958 * have any snaps of its own). 1959 */ 1960 uint64_t obj = origin_head->ds_phys->ds_prev_snap_obj; 1961 while (obj != drc->drc_ds->ds_phys->ds_prev_snap_obj) { 1962 dsl_dataset_t *snap; 1963 error = dsl_dataset_hold_obj(dp, obj, FTAG, 1964 &snap); 1965 if (error != 0) 1966 return (error); 1967 if (snap->ds_dir != origin_head->ds_dir) 1968 error = SET_ERROR(EINVAL); 1969 if (error == 0) { 1970 error = dsl_destroy_snapshot_check_impl( 1971 snap, B_FALSE); 1972 } 1973 obj = snap->ds_phys->ds_prev_snap_obj; 1974 dsl_dataset_rele(snap, FTAG); 1975 if (error != 0) 1976 return (error); 1977 } 1978 } 1979 error = dsl_dataset_clone_swap_check_impl(drc->drc_ds, 1980 origin_head, drc->drc_force, drc->drc_owner, tx); 1981 if (error != 0) { 1982 dsl_dataset_rele(origin_head, FTAG); 1983 return (error); 1984 } 1985 error = dsl_dataset_snapshot_check_impl(origin_head, 1986 drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); 1987 dsl_dataset_rele(origin_head, FTAG); 1988 if (error != 0) 1989 return (error); 1990 1991 error = dsl_destroy_head_check_impl(drc->drc_ds, 1); 1992 } else { 1993 error = dsl_dataset_snapshot_check_impl(drc->drc_ds, 1994 drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); 1995 } 1996 return (error); 1997} 1998 1999static void 2000dmu_recv_end_sync(void *arg, dmu_tx_t *tx) 2001{ 2002 dmu_recv_cookie_t *drc = arg; 2003 dsl_pool_t *dp = dmu_tx_pool(tx); 2004 2005 spa_history_log_internal_ds(drc->drc_ds, "finish receiving", 2006 tx, "snap=%s", drc->drc_tosnap); 2007 2008 if (!drc->drc_newfs) { 2009 dsl_dataset_t *origin_head; 2010 2011 VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, 2012 &origin_head)); 2013 2014 if (drc->drc_force) { 2015 /* 2016 * Destroy any snapshots of drc_tofs (origin_head) 2017 * after the origin (the snap before drc_ds). 2018 */ 2019 uint64_t obj = origin_head->ds_phys->ds_prev_snap_obj; 2020 while (obj != drc->drc_ds->ds_phys->ds_prev_snap_obj) { 2021 dsl_dataset_t *snap; 2022 VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, 2023 &snap)); 2024 ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir); 2025 obj = snap->ds_phys->ds_prev_snap_obj; 2026 dsl_destroy_snapshot_sync_impl(snap, 2027 B_FALSE, tx); 2028 dsl_dataset_rele(snap, FTAG); 2029 } 2030 } 2031 VERIFY3P(drc->drc_ds->ds_prev, ==, 2032 origin_head->ds_prev); 2033 2034 dsl_dataset_clone_swap_sync_impl(drc->drc_ds, 2035 origin_head, tx); 2036 dsl_dataset_snapshot_sync_impl(origin_head, 2037 drc->drc_tosnap, tx); 2038 2039 /* set snapshot's creation time and guid */ 2040 dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx); 2041 origin_head->ds_prev->ds_phys->ds_creation_time = 2042 drc->drc_drrb->drr_creation_time; 2043 origin_head->ds_prev->ds_phys->ds_guid = 2044 drc->drc_drrb->drr_toguid; 2045 origin_head->ds_prev->ds_phys->ds_flags &= 2046 ~DS_FLAG_INCONSISTENT; 2047 2048 dmu_buf_will_dirty(origin_head->ds_dbuf, tx); 2049 origin_head->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; 2050 2051 dsl_dataset_rele(origin_head, FTAG); 2052 dsl_destroy_head_sync_impl(drc->drc_ds, tx); 2053 2054 if (drc->drc_owner != NULL) 2055 VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner); 2056 } else { 2057 dsl_dataset_t *ds = drc->drc_ds; 2058 2059 dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx); 2060 2061 /* set snapshot's creation time and guid */ 2062 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); 2063 ds->ds_prev->ds_phys->ds_creation_time = 2064 drc->drc_drrb->drr_creation_time; 2065 ds->ds_prev->ds_phys->ds_guid = drc->drc_drrb->drr_toguid; 2066 ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; 2067 2068 dmu_buf_will_dirty(ds->ds_dbuf, tx); 2069 ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; 2070 } 2071 drc->drc_newsnapobj = drc->drc_ds->ds_phys->ds_prev_snap_obj; 2072 /* 2073 * Release the hold from dmu_recv_begin. This must be done before 2074 * we return to open context, so that when we free the dataset's dnode, 2075 * we can evict its bonus buffer. 2076 */ 2077 dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); 2078 drc->drc_ds = NULL; 2079} 2080 2081static int 2082add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj) 2083{ 2084 dsl_pool_t *dp; 2085 dsl_dataset_t *snapds; 2086 guid_map_entry_t *gmep; 2087 int err; 2088 2089 ASSERT(guid_map != NULL); 2090 2091 err = dsl_pool_hold(name, FTAG, &dp); 2092 if (err != 0) 2093 return (err); 2094 gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP); 2095 err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds); 2096 if (err == 0) { 2097 gmep->guid = snapds->ds_phys->ds_guid; 2098 gmep->gme_ds = snapds; 2099 avl_add(guid_map, gmep); 2100 dsl_dataset_long_hold(snapds, gmep); 2101 } else 2102 kmem_free(gmep, sizeof (*gmep)); 2103 2104 dsl_pool_rele(dp, FTAG); 2105 return (err); 2106} 2107 2108static int dmu_recv_end_modified_blocks = 3; 2109 2110static int 2111dmu_recv_existing_end(dmu_recv_cookie_t *drc) 2112{ 2113 int error; 2114 char name[MAXNAMELEN]; 2115 2116#ifdef _KERNEL 2117 /* 2118 * We will be destroying the ds; make sure its origin is unmounted if 2119 * necessary. 2120 */ 2121 dsl_dataset_name(drc->drc_ds, name); 2122 zfs_destroy_unmount_origin(name); 2123#endif 2124 2125 error = dsl_sync_task(drc->drc_tofs, 2126 dmu_recv_end_check, dmu_recv_end_sync, drc, 2127 dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL); 2128 2129 if (error != 0) 2130 dmu_recv_cleanup_ds(drc); 2131 return (error); 2132} 2133 2134static int 2135dmu_recv_new_end(dmu_recv_cookie_t *drc) 2136{ 2137 int error; 2138 2139 error = dsl_sync_task(drc->drc_tofs, 2140 dmu_recv_end_check, dmu_recv_end_sync, drc, 2141 dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL); 2142 2143 if (error != 0) { 2144 dmu_recv_cleanup_ds(drc); 2145 } else if (drc->drc_guid_to_ds_map != NULL) { 2146 (void) add_ds_to_guidmap(drc->drc_tofs, 2147 drc->drc_guid_to_ds_map, 2148 drc->drc_newsnapobj); 2149 } 2150 return (error); 2151} 2152 2153int 2154dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) 2155{ 2156 drc->drc_owner = owner; 2157 2158 if (drc->drc_newfs) 2159 return (dmu_recv_new_end(drc)); 2160 else 2161 return (dmu_recv_existing_end(drc)); 2162} 2163 2164/* 2165 * Return TRUE if this objset is currently being received into. 2166 */ 2167boolean_t 2168dmu_objset_is_receiving(objset_t *os) 2169{ 2170 return (os->os_dsl_dataset != NULL && 2171 os->os_dsl_dataset->ds_owner == dmu_recv_tag); 2172} 2173