/* dmu_send.c — FreeBSD revision 273350 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
 * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_znode.h>
#include <zfs_fletcher.h>
#include <sys/avl.h>
#include <sys/ddt.h>
#include <sys/zfs_onexit.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/blkptr.h>
#include <sys/dsl_bookmark.h>
#include <sys/zfeature.h>

/*
 * On FreeBSD, "dump_write" collides with another kernel symbol, so the
 * local function below is renamed to dmu_dump_write at the preprocessor
 * level.
 */
#ifdef __FreeBSD__
#undef dump_write
#define	dump_write dmu_dump_write
#endif

/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE;

/* Hold tag used for dataset ownership across a receive. */
static char *dmu_recv_tag = "dmu_recv_tag";
/* Name of the temporary clone created while receiving into an existing fs. */
static const char *recv_clone_name = "%recv";

/*
 * Write "len" bytes from "buf" to the send stream's output file, folding
 * the data into the stream's running fletcher-4 checksum and advancing
 * *dsa_off so observers can track progress.  "len" must be a multiple of
 * 8 (record payloads are 8-byte aligned).  The write status is latched in
 * dsa_err and also returned.
 */
static int
dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
{
	dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset;
	struct uio auio;
	struct iovec aiov;
	ASSERT0(len % 8);

	fletcher_4_incremental_native(buf, len, &dsp->dsa_zc);
	aiov.iov_base = buf;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	/* -1 offset: append at the file's current position. */
	auio.uio_offset = (off_t)-1;
	auio.uio_td = dsp->dsa_td;
#ifdef _KERNEL
	if (dsp->dsa_fp->f_type == DTYPE_VNODE)
		bwillwrite();
	dsp->dsa_err = fo_write(dsp->dsa_fp, &auio, dsp->dsa_td->td_ucred, 0,
	    dsp->dsa_td);
#else
	fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
	dsp->dsa_err = EOPNOTSUPP;
#endif
	/* dsa_off is shared with readers of the send-stream list. */
	mutex_enter(&ds->ds_sendstream_lock);
	*dsp->dsa_off += len;
	mutex_exit(&ds->ds_sendstream_lock);

	return (dsp->dsa_err);
}

/*
 * Emit (or aggregate into a pending) DRR_FREE record covering
 * [offset, offset + length) of "object".  length == -1ULL means
 * "free to end of object".  Returns 0 or EINTR if the stream write failed.
 */
static int
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    uint64_t length)
{
	struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);

	/*
	 * When we receive a free record, dbuf_free_range() assumes
	 * that the receiving system doesn't have any dbufs in the range
	 * being freed. This is always true because there is a one-record
	 * constraint: we only send one WRITE record for any given
	 * object+offset. We know that the one-record constraint is
	 * true because we always send data in increasing order by
	 * object,offset.
	 *
	 * If the increasing-order constraint ever changes, we should find
	 * another way to assert that the one-record constraint is still
	 * satisfied.
	 */
	ASSERT(object > dsp->dsa_last_data_object ||
	    (object == dsp->dsa_last_data_object &&
	    offset > dsp->dsa_last_data_offset));

	/*
	 * If we are doing a non-incremental send, then there can't
	 * be any data in the dataset we're receiving into. Therefore
	 * a free record would simply be a no-op. Save space by not
	 * sending it to begin with.
	 */
	if (!dsp->dsa_incremental)
		return (0);

	/* Clamp a range that wraps past the end of the offset space. */
	if (length != -1ULL && offset + length < offset)
		length = -1ULL;

	/*
	 * If there is a pending op, but it's not PENDING_FREE, push it out,
	 * since free block aggregation can only be done for blocks of the
	 * same type (i.e., DRR_FREE records can only be aggregated with
	 * other DRR_FREE records. DRR_FREEOBJECTS records can only be
	 * aggregated with other DRR_FREEOBJECTS records.
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	if (dsp->dsa_pending_op == PENDING_FREE) {
		/*
		 * There should never be a PENDING_FREE if length is -1
		 * (because dump_dnode is the only place where this
		 * function is called with a -1, and only after flushing
		 * any pending record).
		 */
		ASSERT(length != -1ULL);
		/*
		 * Check to see whether this free block can be aggregated
		 * with pending one.
		 */
		if (drrf->drr_object == object && drrf->drr_offset +
		    drrf->drr_length == offset) {
			drrf->drr_length += length;
			return (0);
		} else {
			/* not a continuation. Push out pending record */
			if (dump_bytes(dsp, dsp->dsa_drr,
			    sizeof (dmu_replay_record_t)) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}
	/* create a FREE record and make it pending */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREE;
	drrf->drr_object = object;
	drrf->drr_offset = offset;
	drrf->drr_length = length;
	drrf->drr_toguid = dsp->dsa_toguid;
	if (length == -1ULL) {
		/* "Free to end" is never aggregated; flush it immediately. */
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
	} else {
		dsp->dsa_pending_op = PENDING_FREE;
	}

	return (0);
}

/*
 * Emit a DRR_WRITE record (header plus "blksz" bytes of "data") for the
 * given object/offset.  Also records dedup checksum information from the
 * block pointer when available.  Returns 0 or EINTR on write failure.
 */
static int
dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
    uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
{
	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);

	/*
	 * We send data in increasing object, offset order.
	 * See comment in dump_free() for details.
	 */
	ASSERT(object > dsp->dsa_last_data_object ||
	    (object == dsp->dsa_last_data_object &&
	    offset > dsp->dsa_last_data_offset));
	dsp->dsa_last_data_object = object;
	dsp->dsa_last_data_offset = offset + blksz - 1;

	/*
	 * If there is any kind of pending aggregation (currently either
	 * a grouping of free objects or free blocks), push it out to
	 * the stream, since aggregation can't be done across operations
	 * of different types.
	 */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	/* write a DATA record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE;
	drrw->drr_object = object;
	drrw->drr_type = type;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dsp->dsa_toguid;
	if (BP_IS_EMBEDDED(bp)) {
		/*
		 * There's no pre-computed checksum of embedded BP's, so
		 * (like fletcher4-checksummed blocks) userland will have
		 * to compute a dedup-capable checksum itself.
		 */
		drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
	} else {
		drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
		if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
			drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
		DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
		DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
		DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
		drrw->drr_key.ddk_cksum = bp->blk_cksum;
	}

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (SET_ERROR(EINTR));
	if (dump_bytes(dsp, data, blksz) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}

/*
 * Emit a DRR_WRITE_EMBEDDED record for a block whose (compressed) payload
 * is embedded directly in the block pointer.  The payload is copied out of
 * the bp and padded to an 8-byte boundary on the stream.
 *
 * NOTE(review): this function returns bare EINTR rather than
 * SET_ERROR(EINTR) like its siblings — appears to be an inconsistency
 * rather than intentional; behavior is the same, only error tracing
 * differs.
 */
static int
dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    int blksz, const blkptr_t *bp)
{
	char buf[BPE_PAYLOAD_SIZE];
	struct drr_write_embedded *drrw =
	    &(dsp->dsa_drr->drr_u.drr_write_embedded);

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		dsp->dsa_pending_op = PENDING_NONE;
	}

	ASSERT(BP_IS_EMBEDDED(bp));

	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
	drrw->drr_object = object;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dsp->dsa_toguid;
	drrw->drr_compression = BP_GET_COMPRESS(bp);
	drrw->drr_etype = BPE_GET_ETYPE(bp);
	drrw->drr_lsize = BPE_GET_LSIZE(bp);
	drrw->drr_psize = BPE_GET_PSIZE(bp);

	decode_embedded_bp_compressed(bp, buf);

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (EINTR);
	/* Stream payloads are 8-byte aligned; pad the physical size up. */
	if (dump_bytes(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
		return (EINTR);
	return (0);
}

/*
 * Emit a DRR_SPILL record (spill/SA overflow block) for "object".
 * Returns 0 or EINTR on write failure.
 */
static int
dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
{
	struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);

	/* Flush any pending aggregation before a record of another type. */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write a SPILL record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_SPILL;
	drrs->drr_object = object;
	drrs->drr_length = blksz;
	drrs->drr_toguid = dsp->dsa_toguid;

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)))
		return (SET_ERROR(EINTR));
	if (dump_bytes(dsp, data, blksz))
		return (SET_ERROR(EINTR));
	return (0);
}

/*
 * Emit (or aggregate into a pending) DRR_FREEOBJECTS record covering
 * "numobjs" objects starting at "firstobj".  No-op for full sends.
 */
static int
dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
{
	struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);

	/* See comment in dump_free(). */
	if (!dsp->dsa_incremental)
		return (0);

	/*
	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
	 * push it out, since free block aggregation can only be done for
	 * blocks of the same type (i.e., DRR_FREE records can only be
	 * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records
	 * can only be aggregated with other DRR_FREEOBJECTS records.
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
		/*
		 * See whether this free object array can be aggregated
		 * with pending one
		 */
		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
			drrfo->drr_numobjs += numobjs;
			return (0);
		} else {
			/* can't be aggregated. Push out pending record */
			if (dump_bytes(dsp, dsp->dsa_drr,
			    sizeof (dmu_replay_record_t)) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}

	/* write a FREEOBJECTS record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
	drrfo->drr_firstobj = firstobj;
	drrfo->drr_numobjs = numobjs;
	drrfo->drr_toguid = dsp->dsa_toguid;

	dsp->dsa_pending_op = PENDING_FREEOBJECTS;

	return (0);
}

/*
 * Emit a DRR_OBJECT record describing dnode "dnp" (object number "object"),
 * including its bonus buffer, followed by a free-to-end record for any space
 * past the object's last block.  A NULL/unallocated dnode becomes a
 * one-object DRR_FREEOBJECTS instead.
 */
static int
dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
{
	struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);

	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
		return (dump_freeobjects(dsp, object, 1));

	/* Flush any pending aggregation before a record of another type. */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write an OBJECT record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_OBJECT;
	drro->drr_object = object;
	drro->drr_type = dnp->dn_type;
	drro->drr_bonustype = dnp->dn_bonustype;
	drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	drro->drr_bonuslen = dnp->dn_bonuslen;
	drro->drr_checksumtype = dnp->dn_checksum;
	drro->drr_compress = dnp->dn_compress;
	drro->drr_toguid = dsp->dsa_toguid;

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (SET_ERROR(EINTR));

	/* Bonus payload is padded to 8 bytes on the stream. */
	if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
		return (SET_ERROR(EINTR));

	/* Free anything past the end of the file. */
	if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0)
		return (SET_ERROR(EINTR));
	if (dsp->dsa_err != 0)
		return (SET_ERROR(EINTR));
	return (0);
}

/*
 * Decide whether "bp" may be sent as a DRR_WRITE_EMBEDDED record, based
 * on the feature flags negotiated for this stream.
 */
static boolean_t
backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
{
	if (!BP_IS_EMBEDDED(bp))
		return (B_FALSE);

	/*
	 * Compression function must be legacy, or explicitly enabled.
	 */
	if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
	    !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)))
		return (B_FALSE);

	/*
	 * Embed type must be explicitly enabled.
	 */
	switch (BPE_GET_ETYPE(bp)) {
	case BP_EMBEDDED_TYPE_DATA:
		if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
			return (B_TRUE);
		break;
	default:
		return (B_FALSE);
	}
	return (B_FALSE);
}

/* Bytes of logical file space covered by one blkptr at "level". */
#define	BP_SPAN(dnp, level) \
	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))

/*
 * traverse_dataset() callback: translate each visited block pointer into
 * the appropriate send-stream record(s) via the dump_* helpers above.
 */
/* ARGSUSED */
static int
backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	dmu_sendarg_t *dsp = arg;
	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
	int err = 0;

	/* Abort promptly if the sending process has been signalled. */
	if (issig(JUSTLOOKING) && issig(FORREAL))
		return (SET_ERROR(EINTR));

	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
		return (0);
	} else if (zb->zb_level == ZB_ZIL_LEVEL) {
		/*
		 * If we are sending a non-snapshot (which is allowed on
		 * read-only pools), it may have a ZIL, which must be ignored.
		 */
		return (0);
	} else if (BP_IS_HOLE(bp) &&
	    zb->zb_object == DMU_META_DNODE_OBJECT) {
		/* A hole in the meta-dnode is a run of free object numbers. */
		uint64_t span = BP_SPAN(dnp, zb->zb_level);
		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
		err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT);
	} else if (BP_IS_HOLE(bp)) {
		/* A hole in a regular object is a freed byte range. */
		uint64_t span = BP_SPAN(dnp, zb->zb_level);
		err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span);
	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
		/* Indirect blocks and objset blocks carry no stream data. */
		return (0);
	} else if (type == DMU_OT_DNODE) {
		dnode_phys_t *blk;
		int i;
		int blksz = BP_GET_LSIZE(bp);
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    &aflags, zb) != 0)
			return (SET_ERROR(EIO));

		/* Emit one OBJECT record per dnode in this block. */
		blk = abuf->b_data;
		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
			uint64_t dnobj = (zb->zb_blkid <<
			    (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
			err = dump_dnode(dsp, dnobj, blk+i);
			if (err != 0)
				break;
		}
		(void) arc_buf_remove_ref(abuf, &abuf);
	} else if (type == DMU_OT_SA) {
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;
		int blksz = BP_GET_LSIZE(bp);

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    &aflags, zb) != 0)
			return (SET_ERROR(EIO));

		err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
		(void) arc_buf_remove_ref(abuf, &abuf);
	} else if (backup_do_embed(dsp, bp)) {
		/* it's an embedded level-0 block of a regular object */
		int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
		err = dump_write_embedded(dsp, zb->zb_object,
		    zb->zb_blkid * blksz, blksz, bp);
	} else { /* it's a level-0 block of a regular object */
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;
		int blksz = BP_GET_LSIZE(bp);

		ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
		ASSERT0(zb->zb_level);
		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    &aflags, zb) != 0) {
			if (zfs_send_corrupt_data) {
				/* Send a block filled with 0x"zfs badd bloc" */
				abuf = arc_buf_alloc(spa, blksz, &abuf,
				    ARC_BUFC_DATA);
				uint64_t *ptr;
				for (ptr = abuf->b_data;
				    (char *)ptr < (char *)abuf->b_data + blksz;
				    ptr++)
					*ptr = 0x2f5baddb10c;
			} else {
				return (SET_ERROR(EIO));
			}
		}

		err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
		    blksz, bp, abuf->b_data);
		(void) arc_buf_remove_ref(abuf, &abuf);
	}

	ASSERT(err == 0 || err == EINTR);
	return (err);
}

/*
 * Generate a full send stream for "ds": BEGIN record, dataset traversal
 * (producing OBJECT/WRITE/FREE/... records via backup_cb), then an END
 * record carrying the accumulated checksum.
 *
 * Releases dp using the specified tag.
 */
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
    zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
#ifdef illumos
    int outfd, vnode_t *vp, offset_t *off)
#else
    int outfd, struct file *fp, offset_t *off)
#endif
{
	objset_t *os;
	dmu_replay_record_t *drr;
	dmu_sendarg_t *dsp;
	int err;
	uint64_t fromtxg = 0;
	uint64_t featureflags = 0;

	err = dmu_objset_from_ds(ds, &os);
	if (err != 0) {
		dsl_pool_rele(dp, tag);
		return (err);
	}

	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
	DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
	    DMU_SUBSTREAM);

#ifdef _KERNEL
	/* SA spill blocks only appear at ZPL version >= SA. */
	if (dmu_objset_type(os) == DMU_OST_ZFS) {
		uint64_t version;
		if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
			kmem_free(drr, sizeof (dmu_replay_record_t));
			dsl_pool_rele(dp, tag);
			return (SET_ERROR(EINVAL));
		}
		if (version >= ZPL_VERSION_SA) {
			featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
		}
	}
#endif

	/* Only advertise embedded-data support if the pool uses it. */
	if (embedok &&
	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
		featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
		if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
			featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
	} else {
		embedok = B_FALSE;
	}

	DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
	    featureflags);

	drr->drr_u.drr_begin.drr_creation_time =
	    ds->ds_phys->ds_creation_time;
	drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
	if (is_clone)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
	drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;

	if (fromzb != NULL) {
		drr->drr_u.drr_begin.drr_fromguid = fromzb->zbm_guid;
		fromtxg = fromzb->zbm_creation_txg;
	}
	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
	if (!dsl_dataset_is_snapshot(ds)) {
		/* Sending a head dataset; mark the name as such. */
		(void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
		    sizeof (drr->drr_u.drr_begin.drr_toname));
	}

	dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);

	dsp->dsa_drr = drr;
	dsp->dsa_outfd = outfd;
	dsp->dsa_proc = curproc;
	dsp->dsa_td = curthread;
	dsp->dsa_fp = fp;
	dsp->dsa_os = os;
	dsp->dsa_off = off;
	dsp->dsa_toguid = ds->ds_phys->ds_guid;
	ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
	dsp->dsa_pending_op = PENDING_NONE;
	dsp->dsa_incremental = (fromzb != NULL);
	dsp->dsa_featureflags = featureflags;

	/* Publish this stream so it is visible to zfs_send progress code. */
	mutex_enter(&ds->ds_sendstream_lock);
	list_insert_head(&ds->ds_sendstreams, dsp);
	mutex_exit(&ds->ds_sendstream_lock);

	/* Swap the pool hold for a long hold on the dataset. */
	dsl_dataset_long_hold(ds, FTAG);
	dsl_pool_rele(dp, tag);

	if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
		err = dsp->dsa_err;
		goto out;
	}

	err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
	    backup_cb, dsp);

	/* Flush any record still pending aggregation. */
	if (dsp->dsa_pending_op != PENDING_NONE)
		if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0)
			err = SET_ERROR(EINTR);

	if (err != 0) {
		/* Prefer the underlying write error over generic EINTR. */
		if (err == EINTR && dsp->dsa_err != 0)
			err = dsp->dsa_err;
		goto out;
	}

	bzero(drr, sizeof (dmu_replay_record_t));
	drr->drr_type = DRR_END;
	drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
	drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;

	if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
		err = dsp->dsa_err;
		goto out;
	}

out:
	mutex_enter(&ds->ds_sendstream_lock);
	list_remove(&ds->ds_sendstreams, dsp);
	mutex_exit(&ds->ds_sendstream_lock);

	kmem_free(drr, sizeof (dmu_replay_record_t));
	kmem_free(dsp, sizeof (dmu_sendarg_t));

	dsl_dataset_long_rele(ds, FTAG);

	return (err);
}

/*
 * Send a stream identified by object numbers (tosnap/fromsnap are dataset
 * object ids within "pool").  Used by the ioctl path when the caller has
 * already resolved names to objects.
 */
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
#ifdef illumos
    boolean_t embedok, int outfd, vnode_t *vp, offset_t *off)
#else
    boolean_t embedok, int outfd, struct file *fp, offset_t *off)
#endif
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	dsl_dataset_t *fromds = NULL;
	int err;

	err = dsl_pool_hold(pool, FTAG, &dp);
	if (err != 0)
		return (err);

	err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds);
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	if (fromsnap != 0) {
		zfs_bookmark_phys_t zb;
		boolean_t is_clone;

		err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
		if (err != 0) {
			dsl_dataset_rele(ds, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (err);
		}
		/*
		 * NOTE(review): EXDEV set here is overwritten by the
		 * dmu_send_impl() call below rather than returned
		 * directly — verify against upstream whether this is
		 * intentional (dmu_send_impl may fail equivalently).
		 */
		if (!dsl_dataset_is_before(ds, fromds, 0))
			err = SET_ERROR(EXDEV);
		zb.zbm_creation_time = fromds->ds_phys->ds_creation_time;
		zb.zbm_creation_txg = fromds->ds_phys->ds_creation_txg;
		zb.zbm_guid = fromds->ds_phys->ds_guid;
		is_clone = (fromds->ds_dir != ds->ds_dir);
		dsl_dataset_rele(fromds, FTAG);
		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
		    outfd, fp, off);
	} else {
		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
		    outfd, fp, off);
	}
	dsl_dataset_rele(ds, FTAG);
	return (err);
}

/*
 * Send a stream identified by name.  "fromsnap" may be a snapshot ("@")
 * or a bookmark ("#"); "tosnap" may be a snapshot or (on a writeable
 * pool) a head filesystem/volume, which is then owned for the duration
 * to keep it from changing.
 */
int
dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
#ifdef illumos
    int outfd, vnode_t *vp, offset_t *off)
#else
    int outfd, struct file *fp, offset_t *off)
#endif
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int err;
	boolean_t owned = B_FALSE;

	/* The incremental source must name a snapshot or bookmark. */
	if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
		return (SET_ERROR(EINVAL));

	err = dsl_pool_hold(tosnap, FTAG, &dp);
	if (err != 0)
		return (err);

	if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) {
		/*
		 * We are sending a filesystem or volume.  Ensure
		 * that it doesn't change by owning the dataset.
		 */
		err = dsl_dataset_own(dp, tosnap, FTAG, &ds);
		owned = B_TRUE;
	} else {
		err = dsl_dataset_hold(dp, tosnap, FTAG, &ds);
	}
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	if (fromsnap != NULL) {
		zfs_bookmark_phys_t zb;
		boolean_t is_clone = B_FALSE;
		int fsnamelen = strchr(tosnap, '@') - tosnap;

		/*
		 * If the fromsnap is in a different filesystem, then
		 * mark the send stream as a clone.
		 */
		if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
		    (fromsnap[fsnamelen] != '@' &&
		    fromsnap[fsnamelen] != '#')) {
			is_clone = B_TRUE;
		}

		if (strchr(fromsnap, '@')) {
			dsl_dataset_t *fromds;
			err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
			if (err == 0) {
				if (!dsl_dataset_is_before(ds, fromds, 0))
					err = SET_ERROR(EXDEV);
				zb.zbm_creation_time =
				    fromds->ds_phys->ds_creation_time;
				zb.zbm_creation_txg =
				    fromds->ds_phys->ds_creation_txg;
				zb.zbm_guid = fromds->ds_phys->ds_guid;
				is_clone = (ds->ds_dir != fromds->ds_dir);
				dsl_dataset_rele(fromds, FTAG);
			}
		} else {
			/* "#" source: resolve it as a bookmark. */
			err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb);
		}
		if (err != 0) {
			dsl_dataset_rele(ds, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (err);
		}
		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
		    outfd, fp, off);
	} else {
		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
		    outfd, fp, off);
	}
	if (owned)
		dsl_dataset_disown(ds, FTAG);
	else
		dsl_dataset_rele(ds, FTAG);
	return (err);
}

/*
 * Estimate the (uncompressed) size of the stream that would be produced
 * by sending "ds" (full send if fromds == NULL, incremental otherwise).
 * Result is returned via *sizep.
 */
int
dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	int err;
	uint64_t size;

	ASSERT(dsl_pool_config_held(dp));

	/* tosnap must be a snapshot */
	if (!dsl_dataset_is_snapshot(ds))
		return (SET_ERROR(EINVAL));

	/*
	 * fromsnap must be an earlier snapshot from the same fs as tosnap,
	 * or the origin's fs.
	 */
	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0))
		return (SET_ERROR(EXDEV));

	/* Get uncompressed size estimate of changed data. */
	if (fromds == NULL) {
		size = ds->ds_phys->ds_uncompressed_bytes;
	} else {
		uint64_t used, comp;
		err = dsl_dataset_space_written(fromds, ds,
		    &used, &comp, &size);
		if (err != 0)
			return (err);
	}

	/*
	 * Assume that space (both on-disk and in-stream) is dominated by
	 * data.  We will adjust for indirect blocks and the copies property,
	 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
	 */

	/*
	 * Subtract out approximate space used by indirect blocks.
	 * Assume most space is used by data blocks (non-indirect, non-dnode).
	 * Assume all blocks are recordsize.  Assume ditto blocks and
	 * internal fragmentation counter out compression.
	 *
	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
	 * block, which we observe in practice.
	 */
	uint64_t recordsize;
	err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize);
	if (err != 0)
		return (err);
	size -= size / recordsize * sizeof (blkptr_t);

	/* Add in the space for the record associated with each block. */
	size += size / recordsize * sizeof (dmu_replay_record_t);

	*sizep = size;

	return (0);
}

/* State passed from dmu_recv_begin() to its check/sync callbacks. */
typedef struct dmu_recv_begin_arg {
	const char *drba_origin;	/* origin snapshot name (clone recv) */
	dmu_recv_cookie_t *drba_cookie;	/* caller's receive cookie */
	cred_t *drba_cred;		/* credentials for limit checks */
	uint64_t drba_snapobj;		/* snap to base the temp clone on */
} dmu_recv_begin_arg_t;

/*
 * Validate receiving into an existing filesystem "ds": no conflicting
 * temporary clone or snapshot name, snapshot limit not exceeded, and
 * (for incrementals) a local snapshot matching "fromguid" from which the
 * temporary clone can be created.  On success, drba_snapobj is set to the
 * snapshot the temp clone will branch from.
 */
static int
recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
    uint64_t fromguid)
{
	uint64_t val;
	int error;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	/* temporary clone name must not exist */
	error = zap_lookup(dp->dp_meta_objset,
	    ds->ds_dir->dd_phys->dd_child_dir_zapobj, recv_clone_name,
	    8, 1, &val);
	if (error != ENOENT)
		return (error == 0 ? EBUSY : error);

	/* new snapshot name must not exist */
	error = zap_lookup(dp->dp_meta_objset,
	    ds->ds_phys->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap,
	    8, 1, &val);
	if (error != ENOENT)
		return (error == 0 ? EEXIST : error);

	/*
	 * Check snapshot limit before receiving. We'll recheck again at the
	 * end, but might as well abort before receiving if we're already over
	 * the limit.
	 *
	 * Note that we do not check the file system limit with
	 * dsl_dir_fscount_check because the temporary %clones don't count
	 * against that limit.
	 */
	error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
	    NULL, drba->drba_cred);
	if (error != 0)
		return (error);

	if (fromguid != 0) {
		dsl_dataset_t *snap;
		uint64_t obj = ds->ds_phys->ds_prev_snap_obj;

		/* Find snapshot in this dir that matches fromguid. */
		while (obj != 0) {
			error = dsl_dataset_hold_obj(dp, obj, FTAG,
			    &snap);
			if (error != 0)
				return (SET_ERROR(ENODEV));
			if (snap->ds_dir != ds->ds_dir) {
				dsl_dataset_rele(snap, FTAG);
				return (SET_ERROR(ENODEV));
			}
			if (snap->ds_phys->ds_guid == fromguid)
				break;
			obj = snap->ds_phys->ds_prev_snap_obj;
			dsl_dataset_rele(snap, FTAG);
		}
		if (obj == 0)
			return (SET_ERROR(ENODEV));

		if (drba->drba_cookie->drc_force) {
			drba->drba_snapobj = obj;
		} else {
			/*
			 * If we are not forcing, there must be no
			 * changes since fromsnap.
			 */
			if (dsl_dataset_modified_since_snap(ds, snap)) {
				dsl_dataset_rele(snap, FTAG);
				return (SET_ERROR(ETXTBSY));
			}
			drba->drba_snapobj = ds->ds_prev->ds_object;
		}

		dsl_dataset_rele(snap, FTAG);
	} else {
		/* if full, most recent snapshot must be $ORIGIN */
		if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
			return (SET_ERROR(ENODEV));
		drba->drba_snapobj = ds->ds_phys->ds_prev_snap_obj;
	}

	return (0);

}

/*
 * Check callback for the dmu_recv_begin() sync task: validate the stream
 * header against the pool (feature support, stream type) and against the
 * target dataset (existing-fs receive vs. new fs/clone creation).
 */
static int
dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
{
	dmu_recv_begin_arg_t *drba = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
	uint64_t fromguid = drrb->drr_fromguid;
	int flags = drrb->drr_flags;
	int error;
	uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
	dsl_dataset_t *ds;
	const char *tofs = drba->drba_cookie->drc_tofs;

	/* already checked */
	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);

	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
	    DMU_COMPOUNDSTREAM ||
	    drrb->drr_type >= DMU_OST_NUMTYPES ||
	    ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
		return (SET_ERROR(EINVAL));

	/* Verify pool version supports SA if SA_SPILL feature set */
	if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
	    spa_version(dp->dp_spa) < SPA_VERSION_SA)
		return (SET_ERROR(ENOTSUP));

	/*
	 * The receiving code doesn't know how to translate a WRITE_EMBEDDED
	 * record to a plain WRITE record, so the pool must have the
	 * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
	 * records. Same with WRITE_EMBEDDED records that use LZ4 compression.
	 */
	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
		return (SET_ERROR(ENOTSUP));
	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
		return (SET_ERROR(ENOTSUP));

	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
	if (error == 0) {
		/* target fs already exists; recv into temp clone */

		/* Can't recv a clone into an existing fs */
		if (flags & DRR_FLAG_CLONE) {
			dsl_dataset_rele(ds, FTAG);
			return (SET_ERROR(EINVAL));
		}

		error = recv_begin_check_existing_impl(drba, ds, fromguid);
		dsl_dataset_rele(ds, FTAG);
	} else if (error == ENOENT) {
		/* target fs does not exist; must be a full backup or clone */
		char buf[MAXNAMELEN];

		/*
		 * If it's a non-clone incremental, we are missing the
		 * target fs, so fail the recv.
		 */
		if (fromguid != 0 && !(flags & DRR_FLAG_CLONE))
			return (SET_ERROR(ENOENT));

		/* Open the parent of tofs */
		ASSERT3U(strlen(tofs), <, MAXNAMELEN);
		(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
		error = dsl_dataset_hold(dp, buf, FTAG, &ds);
		if (error != 0)
			return (error);

		/*
		 * Check filesystem and snapshot limits before receiving. We'll
		 * recheck snapshot limits again at the end (we create the
		 * filesystems and increment those counts during begin_sync).
		 */
		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
		    ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred);
		if (error != 0) {
			dsl_dataset_rele(ds, FTAG);
			return (error);
		}

		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
		    ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred);
		if (error != 0) {
			dsl_dataset_rele(ds, FTAG);
			return (error);
		}

		if (drba->drba_origin != NULL) {
			dsl_dataset_t *origin;
			error = dsl_dataset_hold(dp, drba->drba_origin,
			    FTAG, &origin);
			if (error != 0) {
				dsl_dataset_rele(ds, FTAG);
				return (error);
			}
			if (!dsl_dataset_is_snapshot(origin)) {
				dsl_dataset_rele(origin, FTAG);
				dsl_dataset_rele(ds, FTAG);
				return (SET_ERROR(EINVAL));
			}
			if (origin->ds_phys->ds_guid != fromguid) {
				dsl_dataset_rele(origin, FTAG);
				dsl_dataset_rele(ds, FTAG);
				return (SET_ERROR(ENODEV));
			}
			dsl_dataset_rele(origin, FTAG);
		}
		dsl_dataset_rele(ds, FTAG);
		error = 0;
	}
	return (error);
}

/*
 * Sync callback for dmu_recv_begin(): create the dataset that will absorb
 * the stream — a temporary "%recv" clone when receiving into an existing
 * fs, or a brand-new fs/clone otherwise — own it with dmu_recv_tag, and
 * mark it inconsistent until the receive completes.
 */
static void
dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
{
	dmu_recv_begin_arg_t *drba = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
	const char *tofs = drba->drba_cookie->drc_tofs;
	dsl_dataset_t *ds, *newds;
	uint64_t dsobj;
	int error;
	uint64_t crflags;

	crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ?
	    DS_FLAG_CI_DATASET : 0;

	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
	if (error == 0) {
		/* create temporary clone */
		dsl_dataset_t *snap = NULL;
		if (drba->drba_snapobj != 0) {
			VERIFY0(dsl_dataset_hold_obj(dp,
			    drba->drba_snapobj, FTAG, &snap));
		}
		dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
		    snap, crflags, drba->drba_cred, tx);
		/*
		 * NOTE(review): snap is released unconditionally here;
		 * presumably drba_snapobj is always nonzero on this path
		 * (the check callback sets it from a found snapshot or
		 * $ORIGIN) — confirm dsl_dataset_rele is never passed NULL.
		 */
		dsl_dataset_rele(snap, FTAG);
		dsl_dataset_rele(ds, FTAG);
	} else {
		dsl_dir_t *dd;
		const char *tail;
		dsl_dataset_t *origin = NULL;

		VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail));

		if (drba->drba_origin != NULL) {
			VERIFY0(dsl_dataset_hold(dp, drba->drba_origin,
			    FTAG, &origin));
		}

		/* Create new dataset. */
		dsobj = dsl_dataset_create_sync(dd,
		    strrchr(tofs, '/') + 1,
		    origin, crflags, drba->drba_cred, tx);
		if (origin != NULL)
			dsl_dataset_rele(origin, FTAG);
		dsl_dir_rele(dd, FTAG);
		drba->drba_cookie->drc_newfs = B_TRUE;
	}
	VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));

	dmu_buf_will_dirty(newds->ds_dbuf, tx);
	newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;

	/*
	 * If we actually created a non-clone, we need to create the
	 * objset in our new dataset.
	 */
	if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) {
		(void) dmu_objset_create_impl(dp->dp_spa,
		    newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
	}

	drba->drba_cookie->drc_ds = newds;

	spa_history_log_internal_ds(newds, "receive", tx, "");
}

/*
 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
 * succeeds; otherwise we will leak the holds on the datasets.
 */
int
dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
    boolean_t force, char *origin, dmu_recv_cookie_t *drc)
{
	dmu_recv_begin_arg_t drba = { 0 };
	dmu_replay_record_t *drr;

	/* Initialize the receive cookie handed to the rest of the recv. */
	bzero(drc, sizeof (dmu_recv_cookie_t));
	drc->drc_drrb = drrb;
	drc->drc_tosnap = tosnap;
	drc->drc_tofs = tofs;
	drc->drc_force = force;
	drc->drc_cred = CRED();

	/* Detect stream byte order from the magic number. */
	if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
		drc->drc_byteswap = B_TRUE;
	else if (drrb->drr_magic != DMU_BACKUP_MAGIC)
		return (SET_ERROR(EINVAL));

	/*
	 * Seed the running stream checksum with the BEGIN record as it
	 * appeared on the wire (a zeroed record with only drr_type and
	 * the begin payload set).
	 */
	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin = *drc->drc_drrb;
	if (drc->drc_byteswap) {
		fletcher_4_incremental_byteswap(drr,
		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
	} else {
		fletcher_4_incremental_native(drr,
		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
	}
	kmem_free(drr, sizeof (dmu_replay_record_t));

	/*
	 * Byteswap the header in place so later code can use native order.
	 * NOTE(review): drr_flags is not swapped here — confirm whether
	 * its consumers expect that.
	 */
	if (drc->drc_byteswap) {
		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
		drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
		drrb->drr_type = BSWAP_32(drrb->drr_type);
		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
	}

	drba.drba_origin = origin;
	drba.drba_cookie = drc;
	drba.drba_cred = CRED();

	return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync,
	    &drba, 5, ZFS_SPACE_CHECK_NORMAL));
}

/* State threaded through the restore_* record handlers. */
struct restorearg {
	int err;		/* first error encountered; sticky */
	boolean_t byteswap;	/* stream is opposite-endian */
	kthread_t *td;		/* receiving thread (FreeBSD fo_read) */
	struct file *fp;	/* stream input file */
	char *buf;		/* scratch buffer for record payloads */
	uint64_t voff;		/* current stream offset */
	int bufsize; /* amount of memory allocated for buf */
	zio_cksum_t cksum;	/* running fletcher4 of the stream */
	avl_tree_t *guid_to_ds_map; /* dedup: snapshot guid -> dataset */
};

/* Entry in the dedup guid -> dataset AVL map. */
typedef struct guid_map_entry {
	uint64_t guid;
	dsl_dataset_t *gme_ds;
	avl_node_t avlnode;
} guid_map_entry_t;

/* AVL comparator ordering guid_map_entry_t by guid. */
static int
guid_compare(const void *arg1, const void *arg2)
{
	const guid_map_entry_t *gmep1 = arg1;
	const guid_map_entry_t *gmep2 = arg2;

	if (gmep1->guid < gmep2->guid)
		return (-1);
	else if (gmep1->guid > gmep2->guid)
		return (1);
	return (0);
}

/*
 * zfs_onexit callback: tear down the dedup guid map, dropping the
 * long-term holds taken in add_ds_to_guidmap().
 */
static void
free_guid_map_onexit(void *arg)
{
	avl_tree_t *ca = arg;
	void *cookie = NULL;
	guid_map_entry_t *gmep;

	while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
		dsl_dataset_long_rele(gmep->gme_ds, gmep);
		dsl_dataset_rele(gmep->gme_ds, gmep);
		kmem_free(gmep, sizeof (guid_map_entry_t));
	}
	avl_destroy(ca);
	kmem_free(ca, sizeof (avl_tree_t));
}

/*
 * Read len bytes from the stream file at offset off into buf.
 * *resid is set to the number of bytes NOT read (0 on a full read).
 */
static int
restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *resid)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_base = buf;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_offset = off;
	auio.uio_td = ra->td;
#ifdef _KERNEL
	error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td);
#else
	fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
	error = EOPNOTSUPP;
#endif
	*resid = auio.uio_resid;
	return (error);
}

/*
 * Read exactly len bytes from the stream (looping over short reads) and
 * fold them into the running checksum.  Returns buf (or ra->buf if buf
 * is NULL) on success, NULL with ra->err set on failure.
 */
static void *
restore_read(struct restorearg *ra, int len, char *buf)
{
	int done = 0;

	if (buf == NULL)
		buf = ra->buf;

	/* some things will require 8-byte alignment, so everything must */
	ASSERT0(len % 8);

	while (done < len) {
		ssize_t resid;

		ra->err = restore_bytes(ra, buf + done,
		    len - done, ra->voff, &resid);

		/* no progress at all means a truncated stream */
		if (resid == len - done)
			ra->err = SET_ERROR(EINVAL);
		ra->voff += len - done - resid;
		done = len - resid;
		if (ra->err != 0)
			return (NULL);
	}

	ASSERT3U(done, ==, len);
	if (ra->byteswap)
		fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
	else
		fletcher_4_incremental_native(buf, len, &ra->cksum);
	return (buf);
}

/*
 * Byteswap a replay record header in place, dispatching on the (already
 * swapped) record type.  Payloads are swapped separately by the handlers.
 */
static void
backup_byteswap(dmu_replay_record_t *drr)
{
#define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
#define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
	drr->drr_type = BSWAP_32(drr->drr_type);
	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
	switch (drr->drr_type) {
	case DRR_BEGIN:
		DO64(drr_begin.drr_magic);
		DO64(drr_begin.drr_versioninfo);
		DO64(drr_begin.drr_creation_time);
		DO32(drr_begin.drr_type);
		DO32(drr_begin.drr_flags);
		DO64(drr_begin.drr_toguid);
		DO64(drr_begin.drr_fromguid);
		break;
	case DRR_OBJECT:
		DO64(drr_object.drr_object);
		DO32(drr_object.drr_type);
		DO32(drr_object.drr_bonustype);
		DO32(drr_object.drr_blksz);
		DO32(drr_object.drr_bonuslen);
		DO64(drr_object.drr_toguid);
		break;
	case DRR_FREEOBJECTS:
		DO64(drr_freeobjects.drr_firstobj);
		DO64(drr_freeobjects.drr_numobjs);
		DO64(drr_freeobjects.drr_toguid);
		break;
	case DRR_WRITE:
		DO64(drr_write.drr_object);
		DO32(drr_write.drr_type);
		DO64(drr_write.drr_offset);
		DO64(drr_write.drr_length);
		DO64(drr_write.drr_toguid);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write.drr_key.ddk_prop);
		break;
	case DRR_WRITE_BYREF:
		DO64(drr_write_byref.drr_object);
		DO64(drr_write_byref.drr_offset);
		DO64(drr_write_byref.drr_length);
		DO64(drr_write_byref.drr_toguid);
		DO64(drr_write_byref.drr_refguid);
		DO64(drr_write_byref.drr_refobject);
		DO64(drr_write_byref.drr_refoffset);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write_byref.drr_key.ddk_prop);
		break;
	case DRR_WRITE_EMBEDDED:
		DO64(drr_write_embedded.drr_object);
		DO64(drr_write_embedded.drr_offset);
		DO64(drr_write_embedded.drr_length);
		DO64(drr_write_embedded.drr_toguid);
		DO32(drr_write_embedded.drr_lsize);
		DO32(drr_write_embedded.drr_psize);
		break;
	case DRR_FREE:
		DO64(drr_free.drr_object);
		DO64(drr_free.drr_offset);
		DO64(drr_free.drr_length);
		DO64(drr_free.drr_toguid);
		break;
	case DRR_SPILL:
		DO64(drr_spill.drr_object);
		DO64(drr_spill.drr_length);
		DO64(drr_spill.drr_toguid);
		break;
	case DRR_END:
		DO64(drr_end.drr_checksum.zc_word[0]);
		DO64(drr_end.drr_checksum.zc_word[1]);
		DO64(drr_end.drr_checksum.zc_word[2]);
		DO64(drr_end.drr_checksum.zc_word[3]);
		DO64(drr_end.drr_toguid);
		break;
	}
#undef DO64
#undef DO32
}

/*
 * Handle a DRR_OBJECT record: validate it, (re)allocate the object with
 * the requested block size / bonus layout, set checksum/compression, and
 * install the bonus buffer contents if present.
 */
static int
restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
{
	int err;
	dmu_tx_t *tx;
	void *data = NULL;

	/* Reject records with out-of-range or misaligned fields. */
	if (drro->drr_type == DMU_OT_NONE ||
	    !DMU_OT_IS_VALID(drro->drr_type) ||
	    !DMU_OT_IS_VALID(drro->drr_bonustype) ||
	    drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
		return (SET_ERROR(EINVAL));
	}

	err = dmu_object_info(os, drro->drr_object, NULL);

	if (err != 0 && err != ENOENT)
		return (SET_ERROR(EINVAL));

	/* Bonus payload is padded to 8 bytes in the stream. */
	if (drro->drr_bonuslen) {
		data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8), NULL);
		if (ra->err != 0)
			return (ra->err);
	}

	if (err == ENOENT) {
		/* currently free, want to be allocated */
		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err != 0) {
			dmu_tx_abort(tx);
			return (err);
		}
		err = dmu_object_claim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen, tx);
		dmu_tx_commit(tx);
	} else {
		/* currently allocated, want to be allocated */
		err = dmu_object_reclaim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen);
	}
	if (err != 0) {
		return (SET_ERROR(EINVAL));
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, drro->drr_object);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}

	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
	    tx);
	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);

	if (data != NULL) {
		dmu_buf_t *db;

		VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
		dmu_buf_will_dirty(db, tx);

		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
		bcopy(data, db->db_data, drro->drr_bonuslen);
		/* bonus data is stored in stream byte order; fix it up */
		if (ra->byteswap) {
			dmu_object_byteswap_t byteswap =
			    DMU_OT_BYTESWAP(drro->drr_bonustype);
			dmu_ot_byteswap[byteswap].ob_func(db->db_data,
			    drro->drr_bonuslen);
		}
		dmu_buf_rele(db, FTAG);
	}
	dmu_tx_commit(tx);
	return (0);
}

/*
 * Handle a DRR_FREEOBJECTS record: free every allocated object in the
 * range [drr_firstobj, drr_firstobj + drr_numobjs).
 */
/* ARGSUSED */
static int
restore_freeobjects(struct restorearg *ra, objset_t *os,
    struct drr_freeobjects *drrfo)
{
	uint64_t obj;

	/* guard against 64-bit wrap of the object range */
	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
		return (SET_ERROR(EINVAL));

	for (obj = drrfo->drr_firstobj;
	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
	    (void) dmu_object_next(os, &obj, FALSE, 0)) {
		int err;

		if (dmu_object_info(os, obj, NULL) != 0)
			continue;

		err = dmu_free_long_object(os, obj);
		if (err != 0)
			return (err);
	}
	return (0);
}

/*
 * Handle a DRR_WRITE record: read the payload into a loaned arc buffer
 * and assign it to the target object at the given offset.
 */
static int
restore_write(struct restorearg *ra, objset_t *os,
    struct drr_write *drrw)
{
	dmu_tx_t *tx;
	void *data;
	int err;

	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
	    !DMU_OT_IS_VALID(drrw->drr_type))
		return (SET_ERROR(EINVAL));

	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
		return (SET_ERROR(EINVAL));

	dmu_buf_t *bonus;
	if (dmu_bonus_hold(os, drrw->drr_object, FTAG, &bonus) != 0)
		return (SET_ERROR(EINVAL));

	/* loan an arc buffer so the write can avoid a copy */
	arc_buf_t *abuf = dmu_request_arcbuf(bonus, drrw->drr_length);

	data = restore_read(ra, drrw->drr_length, abuf->b_data);
	if (data == NULL) {
		dmu_return_arcbuf(abuf);
		dmu_buf_rele(bonus, FTAG);
		return (ra->err);
	}

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_return_arcbuf(abuf);
		dmu_buf_rele(bonus, FTAG);
		dmu_tx_abort(tx);
		return (err);
	}
	if (ra->byteswap) {
		dmu_object_byteswap_t byteswap =
		    DMU_OT_BYTESWAP(drrw->drr_type);
		dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length);
	}
	/* dmu_assign_arcbuf takes over ownership of abuf */
	dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx);
	dmu_tx_commit(tx);
	dmu_buf_rele(bonus, FTAG);
	return (0);
}

/*
 * Handle a DRR_WRITE_BYREF record.  This record is used in dedup'ed
 * streams to refer to a copy of the data that is already on the
 * system because it came in earlier in the stream.
 This function
 * finds the earlier copy of the data, and uses that copy instead of
 * data from the stream to fulfill this write.
 */
static int
restore_write_byref(struct restorearg *ra, objset_t *os,
    struct drr_write_byref *drrwbr)
{
	dmu_tx_t *tx;
	int err;
	guid_map_entry_t gmesrch;
	guid_map_entry_t *gmep;
	avl_index_t where;
	objset_t *ref_os = NULL;
	dmu_buf_t *dbp;

	if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
		return (SET_ERROR(EINVAL));

	/*
	 * If the GUID of the referenced dataset is different from the
	 * GUID of the target dataset, find the referenced dataset.
	 */
	if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
		gmesrch.guid = drrwbr->drr_refguid;
		if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
		    &where)) == NULL) {
			return (SET_ERROR(EINVAL));
		}
		if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
			return (SET_ERROR(EINVAL));
	} else {
		ref_os = os;
	}

	err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
	    drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH);
	if (err != 0)
		return (err);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		/*
		 * NOTE(review): dbp is not released on this error path —
		 * looks like a leaked dmu_buf_hold(); confirm against
		 * upstream before relying on this path.
		 */
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_write(os, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
	dmu_buf_rele(dbp, FTAG);
	dmu_tx_commit(tx);
	return (0);
}

/*
 * Handle a DRR_WRITE_EMBEDDED record: read the (compressed) payload and
 * store it directly in the block pointer via dmu_write_embedded().
 *
 * NOTE(review): this function returns bare EINVAL rather than
 * SET_ERROR(EINVAL) like its siblings — same value, but inconsistent
 * with the file's error-tracing convention.
 */
static int
restore_write_embedded(struct restorearg *ra, objset_t *os,
    struct drr_write_embedded *drrwnp)
{
	dmu_tx_t *tx;
	int err;
	void *data;

	if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset)
		return (EINVAL);

	if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE)
		return (EINVAL);

	if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES)
		return (EINVAL);
	if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
		return (EINVAL);

	/* payload is padded to 8 bytes in the stream */
	data = restore_read(ra, P2ROUNDUP(drrwnp->drr_psize, 8), NULL);
	if (data == NULL)
		return (ra->err);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrwnp->drr_object,
	    drrwnp->drr_offset, drrwnp->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}

	dmu_write_embedded(os, drrwnp->drr_object,
	    drrwnp->drr_offset, data, drrwnp->drr_etype,
	    drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize,
	    ra->byteswap ^ ZFS_HOST_BYTEORDER, tx);

	dmu_tx_commit(tx);
	return (0);
}

/*
 * Handle a DRR_SPILL record: replace the target object's spill block
 * contents, growing the spill block if necessary.
 */
static int
restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
{
	dmu_tx_t *tx;
	void *data;
	dmu_buf_t *db, *db_spill;
	int err;

	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
	    drrs->drr_length > SPA_MAXBLOCKSIZE)
		return (SET_ERROR(EINVAL));

	data = restore_read(ra, drrs->drr_length, NULL);
	if (data == NULL)
		return (ra->err);

	if (dmu_object_info(os, drrs->drr_object, NULL) != 0)
		return (SET_ERROR(EINVAL));

	VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db));
	if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
		dmu_buf_rele(db, FTAG);
		return (err);
	}

	tx = dmu_tx_create(os);

	dmu_tx_hold_spill(tx, db->db_object);

	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_buf_rele(db, FTAG);
		dmu_buf_rele(db_spill, FTAG);
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_buf_will_dirty(db_spill, tx);

	if (db_spill->db_size < drrs->drr_length)
		VERIFY(0 == dbuf_spill_set_blksz(db_spill,
		    drrs->drr_length, tx));
	bcopy(data, db_spill->db_data, drrs->drr_length);

	dmu_buf_rele(db, FTAG);
	dmu_buf_rele(db_spill, FTAG);

	dmu_tx_commit(tx);
	return (0);
}

/*
 * Handle a DRR_FREE record: punch a hole in the target object.
 * drr_length of -1ULL means "free to end of object".
 */
/* ARGSUSED */
static int
restore_free(struct restorearg *ra, objset_t *os,
    struct drr_free *drrf)
{
	int err;

	if (drrf->drr_length != -1ULL &&
	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
		return (SET_ERROR(EINVAL));

	if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
		return (SET_ERROR(EINVAL));

	err = dmu_free_long_range(os, drrf->drr_object,
	    drrf->drr_offset, drrf->drr_length);
	return (err);
}

/* used to destroy the drc_ds on error */
static void
dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
{
	char name[MAXNAMELEN];
	dsl_dataset_name(drc->drc_ds, name);
	dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
	(void) dsl_destroy_head(name);
}

/*
 * NB: callers *must* call dmu_recv_end() if this succeeds.
 */
int
dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
    int cleanup_fd, uint64_t *action_handlep)
{
	struct restorearg ra = { 0 };
	dmu_replay_record_t *drr;
	objset_t *os;
	zio_cksum_t pcksum;
	int featureflags;

	ra.byteswap = drc->drc_byteswap;
	ra.cksum = drc->drc_cksum;
	ra.td = curthread;
	ra.fp = fp;
	ra.voff = *voffp;
	ra.bufsize = 1<<20;	/* 1 MB scratch buffer for record payloads */
	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);

	/* these were verified in dmu_recv_begin */
	ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
	    DMU_SUBSTREAM);
	ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);

	/*
	 * Open the objset we are modifying.
	 */
	VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os));

	ASSERT(drc->drc_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);

	featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);

	/* if this stream is dedup'ed, set up the avl tree for guid mapping */
	if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
		minor_t minor;

		if (cleanup_fd == -1) {
			ra.err = SET_ERROR(EBADF);
			goto out;
		}
		ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor);
		if (ra.err != 0) {
			/* prevent the fd_rele at "out" from running */
			cleanup_fd = -1;
			goto out;
		}

		if (*action_handlep == 0) {
			/* first substream: create the map and register
			 * free_guid_map_onexit to tear it down on close */
			ra.guid_to_ds_map =
			    kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
			avl_create(ra.guid_to_ds_map, guid_compare,
			    sizeof (guid_map_entry_t),
			    offsetof(guid_map_entry_t, avlnode));
			ra.err = zfs_onexit_add_cb(minor,
			    free_guid_map_onexit, ra.guid_to_ds_map,
			    action_handlep);
			if (ra.err != 0)
				goto out;
		} else {
			/* later substream: look up the existing map */
			ra.err = zfs_onexit_cb_data(minor, *action_handlep,
			    (void **)&ra.guid_to_ds_map);
			if (ra.err != 0)
				goto out;
		}

		drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
	}

	/*
	 * Read records and process them.
	 */
	pcksum = ra.cksum;
	while (ra.err == 0 &&
	    NULL != (drr = restore_read(&ra, sizeof (*drr), NULL))) {
		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			ra.err = SET_ERROR(EINTR);
			goto out;
		}

		if (ra.byteswap)
			backup_byteswap(drr);

		switch (drr->drr_type) {
		case DRR_OBJECT:
		{
			/*
			 * We need to make a copy of the record header,
			 * because restore_{object,write} may need to
			 * restore_read(), which will invalidate drr.
			 */
			struct drr_object drro = drr->drr_u.drr_object;
			ra.err = restore_object(&ra, os, &drro);
			break;
		}
		case DRR_FREEOBJECTS:
		{
			struct drr_freeobjects drrfo =
			    drr->drr_u.drr_freeobjects;
			ra.err = restore_freeobjects(&ra, os, &drrfo);
			break;
		}
		case DRR_WRITE:
		{
			struct drr_write drrw = drr->drr_u.drr_write;
			ra.err = restore_write(&ra, os, &drrw);
			break;
		}
		case DRR_WRITE_BYREF:
		{
			struct drr_write_byref drrwbr =
			    drr->drr_u.drr_write_byref;
			ra.err = restore_write_byref(&ra, os, &drrwbr);
			break;
		}
		case DRR_WRITE_EMBEDDED:
		{
			struct drr_write_embedded drrwe =
			    drr->drr_u.drr_write_embedded;
			ra.err = restore_write_embedded(&ra, os, &drrwe);
			break;
		}
		case DRR_FREE:
		{
			struct drr_free drrf = drr->drr_u.drr_free;
			ra.err = restore_free(&ra, os, &drrf);
			break;
		}
		case DRR_END:
		{
			struct drr_end drre = drr->drr_u.drr_end;
			/*
			 * We compare against the *previous* checksum
			 * value, because the stored checksum is of
			 * everything before the DRR_END record.
			 */
			if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
				ra.err = SET_ERROR(ECKSUM);
			goto out;
		}
		case DRR_SPILL:
		{
			struct drr_spill drrs = drr->drr_u.drr_spill;
			ra.err = restore_spill(&ra, os, &drrs);
			break;
		}
		default:
			ra.err = SET_ERROR(EINVAL);
			goto out;
		}
		pcksum = ra.cksum;
	}
	/*
	 * The only clean exit from the loop is the DRR_END goto above,
	 * so falling out of it implies a read/record error.
	 */
	ASSERT(ra.err != 0);

out:
	if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
		zfs_onexit_fd_rele(cleanup_fd);

	if (ra.err != 0) {
		/*
		 * destroy what we created, so we don't leave it in the
		 * inconsistent restoring state.
		 */
		dmu_recv_cleanup_ds(drc);
	}

	kmem_free(ra.buf, ra.bufsize);
	*voffp = ra.voff;
	return (ra.err);
}

/*
 * Check phase of the receive-end sync task: verifies the clone swap
 * (incremental into existing fs) or the final snapshot (new fs) can be
 * performed.
 */
static int
dmu_recv_end_check(void *arg, dmu_tx_t *tx)
{
	dmu_recv_cookie_t *drc = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	int error;

	ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);

	if (!drc->drc_newfs) {
		dsl_dataset_t *origin_head;

		error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
		if (error != 0)
			return (error);
		if (drc->drc_force) {
			/*
			 * We will destroy any snapshots in tofs (i.e. before
			 * origin_head) that are after the origin (which is
			 * the snap before drc_ds, because drc_ds can not
			 * have any snaps of its own).
			 */
			uint64_t obj = origin_head->ds_phys->ds_prev_snap_obj;
			while (obj != drc->drc_ds->ds_phys->ds_prev_snap_obj) {
				dsl_dataset_t *snap;
				error = dsl_dataset_hold_obj(dp, obj, FTAG,
				    &snap);
				/*
				 * NOTE(review): these error returns do not
				 * release the origin_head hold taken above —
				 * confirm whether that FTAG hold is leaked
				 * on this path.
				 */
				if (error != 0)
					return (error);
				if (snap->ds_dir != origin_head->ds_dir)
					error = SET_ERROR(EINVAL);
				if (error == 0) {
					error = dsl_destroy_snapshot_check_impl(
					    snap, B_FALSE);
				}
				obj = snap->ds_phys->ds_prev_snap_obj;
				dsl_dataset_rele(snap, FTAG);
				if (error != 0)
					return (error);
			}
		}
		error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
		    origin_head, drc->drc_force, drc->drc_owner, tx);
		if (error != 0) {
			dsl_dataset_rele(origin_head, FTAG);
			return (error);
		}
		error = dsl_dataset_snapshot_check_impl(origin_head,
		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
		dsl_dataset_rele(origin_head, FTAG);
		if (error != 0)
			return (error);

		error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
	} else {
		error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
	}
	return (error);
}

static void
/*
 * Sync phase of the receive-end sync task.  For an incremental into an
 * existing fs: swap the received temporary clone with origin_head,
 * snapshot it, and destroy the clone.  For a new fs: just snapshot it.
 * Either way the received snapshot gets the stream's creation time and
 * guid, and the DS_FLAG_INCONSISTENT marker is cleared.
 */
dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
{
	dmu_recv_cookie_t *drc = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);

	spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
	    tx, "snap=%s", drc->drc_tosnap);

	if (!drc->drc_newfs) {
		dsl_dataset_t *origin_head;

		VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
		    &origin_head));

		if (drc->drc_force) {
			/*
			 * Destroy any snapshots of drc_tofs (origin_head)
			 * after the origin (the snap before drc_ds).
			 */
			uint64_t obj = origin_head->ds_phys->ds_prev_snap_obj;
			while (obj != drc->drc_ds->ds_phys->ds_prev_snap_obj) {
				dsl_dataset_t *snap;
				VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
				    &snap));
				ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
				obj = snap->ds_phys->ds_prev_snap_obj;
				dsl_destroy_snapshot_sync_impl(snap,
				    B_FALSE, tx);
				dsl_dataset_rele(snap, FTAG);
			}
		}
		VERIFY3P(drc->drc_ds->ds_prev, ==,
		    origin_head->ds_prev);

		dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
		    origin_head, tx);
		dsl_dataset_snapshot_sync_impl(origin_head,
		    drc->drc_tosnap, tx);

		/* set snapshot's creation time and guid */
		dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
		origin_head->ds_prev->ds_phys->ds_creation_time =
		    drc->drc_drrb->drr_creation_time;
		origin_head->ds_prev->ds_phys->ds_guid =
		    drc->drc_drrb->drr_toguid;
		origin_head->ds_prev->ds_phys->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
		origin_head->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;

		dsl_dataset_rele(origin_head, FTAG);
		dsl_destroy_head_sync_impl(drc->drc_ds, tx);

		/*
		 * NOTE(review): origin_head is dereferenced here after the
		 * FTAG rele above — presumably safe because drc_owner still
		 * owns the dataset, but confirm.
		 */
		if (drc->drc_owner != NULL)
			VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
	} else {
		dsl_dataset_t *ds = drc->drc_ds;

		dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);

		/* set snapshot's creation time and guid */
		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
		ds->ds_prev->ds_phys->ds_creation_time =
		    drc->drc_drrb->drr_creation_time;
		ds->ds_prev->ds_phys->ds_guid = drc->drc_drrb->drr_toguid;
		ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
	}
	drc->drc_newsnapobj = drc->drc_ds->ds_phys->ds_prev_snap_obj;
	/*
	 * Release the hold from dmu_recv_begin.  This must be done before
	 * we return to open context, so that when we free the dataset's dnode,
	 * we can evict its bonus buffer.
	 */
	dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
	drc->drc_ds = NULL;
}

/*
 * Record the received snapshot in the dedup guid map so later
 * DRR_WRITE_BYREF records can reference its data.  Takes a long-term
 * hold on the snapshot, released by free_guid_map_onexit().
 */
static int
add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj)
{
	dsl_pool_t *dp;
	dsl_dataset_t *snapds;
	guid_map_entry_t *gmep;
	int err;

	ASSERT(guid_map != NULL);

	err = dsl_pool_hold(name, FTAG, &dp);
	if (err != 0)
		return (err);
	gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP);
	err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds);
	if (err == 0) {
		gmep->guid = snapds->ds_phys->ds_guid;
		gmep->gme_ds = snapds;
		avl_add(guid_map, gmep);
		dsl_dataset_long_hold(snapds, gmep);
	} else
		kmem_free(gmep, sizeof (*gmep));

	dsl_pool_rele(dp, FTAG);
	return (err);
}

static int dmu_recv_end_modified_blocks = 3;

/* Finish a receive into an existing filesystem (clone-swap path). */
static int
dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
	int error;
	char name[MAXNAMELEN];

#ifdef _KERNEL
	/*
	 * We will be destroying the ds; make sure its origin is unmounted if
	 * necessary.
	 */
	dsl_dataset_name(drc->drc_ds, name);
	zfs_destroy_unmount_origin(name);
#endif

	error = dsl_sync_task(drc->drc_tofs,
	    dmu_recv_end_check, dmu_recv_end_sync, drc,
	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);

	if (error != 0)
		dmu_recv_cleanup_ds(drc);
	return (error);
}

/* Finish a receive that created a new filesystem. */
static int
dmu_recv_new_end(dmu_recv_cookie_t *drc)
{
	int error;

	error = dsl_sync_task(drc->drc_tofs,
	    dmu_recv_end_check, dmu_recv_end_sync, drc,
	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);

	if (error != 0) {
		dmu_recv_cleanup_ds(drc);
	} else if (drc->drc_guid_to_ds_map != NULL) {
		/* dedup stream: make the new snapshot visible to byref */
		(void) add_ds_to_guidmap(drc->drc_tofs,
		    drc->drc_guid_to_ds_map,
		    drc->drc_newsnapobj);
	}
	return (error);
}

/*
 * Complete the receive begun by dmu_recv_begin()/dmu_recv_stream().
 * owner (may be NULL) is the long-term owner of the target dataset.
 */
int
dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
{
	drc->drc_owner = owner;

	if (drc->drc_newfs)
		return (dmu_recv_new_end(drc));
	else
		return (dmu_recv_existing_end(drc));
}

/*
 * Return TRUE if this objset is currently being received into.
 */
boolean_t
dmu_objset_is_receiving(objset_t *os)
{
	return (os->os_dsl_dataset != NULL &&
	    os->os_dsl_dataset->ds_owner == dmu_recv_tag);
}