dmu_send.c revision 269006
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 25 * Copyright (c) 2014, Joyent, Inc. All rights reserved. 26 * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved. 27 */ 28 29#include <sys/dmu.h> 30#include <sys/dmu_impl.h> 31#include <sys/dmu_tx.h> 32#include <sys/dbuf.h> 33#include <sys/dnode.h> 34#include <sys/zfs_context.h> 35#include <sys/dmu_objset.h> 36#include <sys/dmu_traverse.h> 37#include <sys/dsl_dataset.h> 38#include <sys/dsl_dir.h> 39#include <sys/dsl_prop.h> 40#include <sys/dsl_pool.h> 41#include <sys/dsl_synctask.h> 42#include <sys/zfs_ioctl.h> 43#include <sys/zap.h> 44#include <sys/zio_checksum.h> 45#include <sys/zfs_znode.h> 46#include <zfs_fletcher.h> 47#include <sys/avl.h> 48#include <sys/ddt.h> 49#include <sys/zfs_onexit.h> 50#include <sys/dmu_send.h> 51#include <sys/dsl_destroy.h> 52#include <sys/blkptr.h> 53#include <sys/dsl_bookmark.h> 54#include <sys/zfeature.h> 55 56#ifdef __FreeBSD__ 57#undef dump_write 58#define dump_write dmu_dump_write 59#endif 60 61/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ 62int zfs_send_corrupt_data = B_FALSE; 63 64static char *dmu_recv_tag = "dmu_recv_tag"; 65static const char *recv_clone_name = "%recv"; 66 67static int 68dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) 69{ 70 dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset; 71 struct uio auio; 72 struct iovec aiov; 73 ASSERT0(len % 8); 74 75 fletcher_4_incremental_native(buf, len, &dsp->dsa_zc); 76 aiov.iov_base = buf; 77 aiov.iov_len = len; 78 auio.uio_iov = &aiov; 79 auio.uio_iovcnt = 1; 80 auio.uio_resid = len; 81 auio.uio_segflg = UIO_SYSSPACE; 82 auio.uio_rw = UIO_WRITE; 83 auio.uio_offset = (off_t)-1; 84 auio.uio_td = dsp->dsa_td; 85#ifdef _KERNEL 86 if (dsp->dsa_fp->f_type == DTYPE_VNODE) 87 bwillwrite(); 88 dsp->dsa_err = fo_write(dsp->dsa_fp, &auio, dsp->dsa_td->td_ucred, 0, 89 dsp->dsa_td); 90#else 91 fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); 92 dsp->dsa_err = EOPNOTSUPP; 93#endif 94 mutex_enter(&ds->ds_sendstream_lock); 95 *dsp->dsa_off += len; 96 mutex_exit(&ds->ds_sendstream_lock); 97 98 return (dsp->dsa_err); 99} 100 101static int 102dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, 103 uint64_t length) 104{ 105 struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); 106 107 /* 108 * When we receive a free record, dbuf_free_range() assumes 109 * that the receiving system doesn't have any dbufs in the range 110 * being freed. This is always true because there is a one-record 111 * constraint: we only send one WRITE record for any given 112 * object+offset. We know that the one-record constraint is 113 * true because we always send data in increasing order by 114 * object,offset. 115 * 116 * If the increasing-order constraint ever changes, we should find 117 * another way to assert that the one-record constraint is still 118 * satisfied. 119 */ 120 ASSERT(object > dsp->dsa_last_data_object || 121 (object == dsp->dsa_last_data_object && 122 offset > dsp->dsa_last_data_offset)); 123 124 /* 125 * If we are doing a non-incremental send, then there can't 126 * be any data in the dataset we're receiving into. Therefore 127 * a free record would simply be a no-op. Save space by not 128 * sending it to begin with. 129 */ 130 if (!dsp->dsa_incremental) 131 return (0); 132 133 if (length != -1ULL && offset + length < offset) 134 length = -1ULL; 135 136 /* 137 * If there is a pending op, but it's not PENDING_FREE, push it out, 138 * since free block aggregation can only be done for blocks of the 139 * same type (i.e., DRR_FREE records can only be aggregated with 140 * other DRR_FREE records. DRR_FREEOBJECTS records can only be 141 * aggregated with other DRR_FREEOBJECTS records. 142 */ 143 if (dsp->dsa_pending_op != PENDING_NONE && 144 dsp->dsa_pending_op != PENDING_FREE) { 145 if (dump_bytes(dsp, dsp->dsa_drr, 146 sizeof (dmu_replay_record_t)) != 0) 147 return (SET_ERROR(EINTR)); 148 dsp->dsa_pending_op = PENDING_NONE; 149 } 150 151 if (dsp->dsa_pending_op == PENDING_FREE) { 152 /* 153 * There should never be a PENDING_FREE if length is -1 154 * (because dump_dnode is the only place where this 155 * function is called with a -1, and only after flushing 156 * any pending record). 157 */ 158 ASSERT(length != -1ULL); 159 /* 160 * Check to see whether this free block can be aggregated 161 * with pending one. 162 */ 163 if (drrf->drr_object == object && drrf->drr_offset + 164 drrf->drr_length == offset) { 165 drrf->drr_length += length; 166 return (0); 167 } else { 168 /* not a continuation. Push out pending record */ 169 if (dump_bytes(dsp, dsp->dsa_drr, 170 sizeof (dmu_replay_record_t)) != 0) 171 return (SET_ERROR(EINTR)); 172 dsp->dsa_pending_op = PENDING_NONE; 173 } 174 } 175 /* create a FREE record and make it pending */ 176 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 177 dsp->dsa_drr->drr_type = DRR_FREE; 178 drrf->drr_object = object; 179 drrf->drr_offset = offset; 180 drrf->drr_length = length; 181 drrf->drr_toguid = dsp->dsa_toguid; 182 if (length == -1ULL) { 183 if (dump_bytes(dsp, dsp->dsa_drr, 184 sizeof (dmu_replay_record_t)) != 0) 185 return (SET_ERROR(EINTR)); 186 } else { 187 dsp->dsa_pending_op = PENDING_FREE; 188 } 189 190 return (0); 191} 192 193static int 194dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, 195 uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) 196{ 197 struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); 198 199 /* 200 * We send data in increasing object, offset order. 201 * See comment in dump_free() for details. 202 */ 203 ASSERT(object > dsp->dsa_last_data_object || 204 (object == dsp->dsa_last_data_object && 205 offset > dsp->dsa_last_data_offset)); 206 dsp->dsa_last_data_object = object; 207 dsp->dsa_last_data_offset = offset + blksz - 1; 208 209 /* 210 * If there is any kind of pending aggregation (currently either 211 * a grouping of free objects or free blocks), push it out to 212 * the stream, since aggregation can't be done across operations 213 * of different types. 214 */ 215 if (dsp->dsa_pending_op != PENDING_NONE) { 216 if (dump_bytes(dsp, dsp->dsa_drr, 217 sizeof (dmu_replay_record_t)) != 0) 218 return (SET_ERROR(EINTR)); 219 dsp->dsa_pending_op = PENDING_NONE; 220 } 221 /* write a DATA record */ 222 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 223 dsp->dsa_drr->drr_type = DRR_WRITE; 224 drrw->drr_object = object; 225 drrw->drr_type = type; 226 drrw->drr_offset = offset; 227 drrw->drr_length = blksz; 228 drrw->drr_toguid = dsp->dsa_toguid; 229 if (BP_IS_EMBEDDED(bp)) { 230 /* 231 * There's no pre-computed checksum of embedded BP's, so 232 * (like fletcher4-checkummed blocks) userland will have 233 * to compute a dedup-capable checksum itself. 234 */ 235 drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; 236 } else { 237 drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); 238 if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) 239 drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; 240 DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); 241 DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); 242 DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); 243 drrw->drr_key.ddk_cksum = bp->blk_cksum; 244 } 245 246 if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) 247 return (SET_ERROR(EINTR)); 248 if (dump_bytes(dsp, data, blksz) != 0) 249 return (SET_ERROR(EINTR)); 250 return (0); 251} 252 253static int 254dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, 255 int blksz, const blkptr_t *bp) 256{ 257 char buf[BPE_PAYLOAD_SIZE]; 258 struct drr_write_embedded *drrw = 259 &(dsp->dsa_drr->drr_u.drr_write_embedded); 260 261 if (dsp->dsa_pending_op != PENDING_NONE) { 262 if (dump_bytes(dsp, dsp->dsa_drr, 263 sizeof (dmu_replay_record_t)) != 0) 264 return (EINTR); 265 dsp->dsa_pending_op = PENDING_NONE; 266 } 267 268 ASSERT(BP_IS_EMBEDDED(bp)); 269 270 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 271 dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED; 272 drrw->drr_object = object; 273 drrw->drr_offset = offset; 274 drrw->drr_length = blksz; 275 drrw->drr_toguid = dsp->dsa_toguid; 276 drrw->drr_compression = BP_GET_COMPRESS(bp); 277 drrw->drr_etype = BPE_GET_ETYPE(bp); 278 drrw->drr_lsize = BPE_GET_LSIZE(bp); 279 drrw->drr_psize = BPE_GET_PSIZE(bp); 280 281 decode_embedded_bp_compressed(bp, buf); 282 283 if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) 284 return (EINTR); 285 if (dump_bytes(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) 286 return (EINTR); 287 return (0); 288} 289 290static int 291dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) 292{ 293 struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); 294 295 if (dsp->dsa_pending_op != PENDING_NONE) { 296 if (dump_bytes(dsp, dsp->dsa_drr, 297 sizeof (dmu_replay_record_t)) != 0) 298 return (SET_ERROR(EINTR)); 299 dsp->dsa_pending_op = PENDING_NONE; 300 } 301 302 /* write a SPILL record */ 303 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 304 dsp->dsa_drr->drr_type = DRR_SPILL; 305 drrs->drr_object = object; 306 drrs->drr_length = blksz; 307 drrs->drr_toguid = dsp->dsa_toguid; 308 309 if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t))) 310 return (SET_ERROR(EINTR)); 311 if (dump_bytes(dsp, data, blksz)) 312 return (SET_ERROR(EINTR)); 313 return (0); 314} 315 316static int 317dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) 318{ 319 struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); 320 321 /* See comment in dump_free(). */ 322 if (!dsp->dsa_incremental) 323 return (0); 324 325 /* 326 * If there is a pending op, but it's not PENDING_FREEOBJECTS, 327 * push it out, since free block aggregation can only be done for 328 * blocks of the same type (i.e., DRR_FREE records can only be 329 * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records 330 * can only be aggregated with other DRR_FREEOBJECTS records. 331 */ 332 if (dsp->dsa_pending_op != PENDING_NONE && 333 dsp->dsa_pending_op != PENDING_FREEOBJECTS) { 334 if (dump_bytes(dsp, dsp->dsa_drr, 335 sizeof (dmu_replay_record_t)) != 0) 336 return (SET_ERROR(EINTR)); 337 dsp->dsa_pending_op = PENDING_NONE; 338 } 339 if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) { 340 /* 341 * See whether this free object array can be aggregated 342 * with pending one 343 */ 344 if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { 345 drrfo->drr_numobjs += numobjs; 346 return (0); 347 } else { 348 /* can't be aggregated. Push out pending record */ 349 if (dump_bytes(dsp, dsp->dsa_drr, 350 sizeof (dmu_replay_record_t)) != 0) 351 return (SET_ERROR(EINTR)); 352 dsp->dsa_pending_op = PENDING_NONE; 353 } 354 } 355 356 /* write a FREEOBJECTS record */ 357 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 358 dsp->dsa_drr->drr_type = DRR_FREEOBJECTS; 359 drrfo->drr_firstobj = firstobj; 360 drrfo->drr_numobjs = numobjs; 361 drrfo->drr_toguid = dsp->dsa_toguid; 362 363 dsp->dsa_pending_op = PENDING_FREEOBJECTS; 364 365 return (0); 366} 367 368static int 369dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) 370{ 371 struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); 372 373 if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) 374 return (dump_freeobjects(dsp, object, 1)); 375 376 if (dsp->dsa_pending_op != PENDING_NONE) { 377 if (dump_bytes(dsp, dsp->dsa_drr, 378 sizeof (dmu_replay_record_t)) != 0) 379 return (SET_ERROR(EINTR)); 380 dsp->dsa_pending_op = PENDING_NONE; 381 } 382 383 /* write an OBJECT record */ 384 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 385 dsp->dsa_drr->drr_type = DRR_OBJECT; 386 drro->drr_object = object; 387 drro->drr_type = dnp->dn_type; 388 drro->drr_bonustype = dnp->dn_bonustype; 389 drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; 390 drro->drr_bonuslen = dnp->dn_bonuslen; 391 drro->drr_checksumtype = dnp->dn_checksum; 392 drro->drr_compress = dnp->dn_compress; 393 drro->drr_toguid = dsp->dsa_toguid; 394 395 if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) 396 return (SET_ERROR(EINTR)); 397 398 if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) 399 return (SET_ERROR(EINTR)); 400 401 /* Free anything past the end of the file. */ 402 if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * 403 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0) 404 return (SET_ERROR(EINTR)); 405 if (dsp->dsa_err != 0) 406 return (SET_ERROR(EINTR)); 407 return (0); 408} 409 410static boolean_t 411backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) 412{ 413 if (!BP_IS_EMBEDDED(bp)) 414 return (B_FALSE); 415 416 /* 417 * Compression function must be legacy, or explicitly enabled. 418 */ 419 if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && 420 !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4))) 421 return (B_FALSE); 422 423 /* 424 * Embed type must be explicitly enabled. 425 */ 426 switch (BPE_GET_ETYPE(bp)) { 427 case BP_EMBEDDED_TYPE_DATA: 428 if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) 429 return (B_TRUE); 430 break; 431 default: 432 return (B_FALSE); 433 } 434 return (B_FALSE); 435} 436 437#define BP_SPAN(dnp, level) \ 438 (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ 439 (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) 440 441/* ARGSUSED */ 442static int 443backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 444 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 445{ 446 dmu_sendarg_t *dsp = arg; 447 dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; 448 int err = 0; 449 450 if (issig(JUSTLOOKING) && issig(FORREAL)) 451 return (SET_ERROR(EINTR)); 452 453 if (zb->zb_object != DMU_META_DNODE_OBJECT && 454 DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { 455 return (0); 456 } else if (zb->zb_level == ZB_ZIL_LEVEL) { 457 /* 458 * If we are sending a non-snapshot (which is allowed on 459 * read-only pools), it may have a ZIL, which must be ignored. 460 */ 461 return (0); 462 } else if (BP_IS_HOLE(bp) && 463 zb->zb_object == DMU_META_DNODE_OBJECT) { 464 uint64_t span = BP_SPAN(dnp, zb->zb_level); 465 uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; 466 err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT); 467 } else if (BP_IS_HOLE(bp)) { 468 uint64_t span = BP_SPAN(dnp, zb->zb_level); 469 err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span); 470 } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { 471 return (0); 472 } else if (type == DMU_OT_DNODE) { 473 dnode_phys_t *blk; 474 int i; 475 int blksz = BP_GET_LSIZE(bp); 476 uint32_t aflags = ARC_WAIT; 477 arc_buf_t *abuf; 478 479 if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 480 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 481 &aflags, zb) != 0) 482 return (SET_ERROR(EIO)); 483 484 blk = abuf->b_data; 485 for (i = 0; i < blksz >> DNODE_SHIFT; i++) { 486 uint64_t dnobj = (zb->zb_blkid << 487 (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; 488 err = dump_dnode(dsp, dnobj, blk+i); 489 if (err != 0) 490 break; 491 } 492 (void) arc_buf_remove_ref(abuf, &abuf); 493 } else if (type == DMU_OT_SA) { 494 uint32_t aflags = ARC_WAIT; 495 arc_buf_t *abuf; 496 int blksz = BP_GET_LSIZE(bp); 497 498 if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 499 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 500 &aflags, zb) != 0) 501 return (SET_ERROR(EIO)); 502 503 err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); 504 (void) arc_buf_remove_ref(abuf, &abuf); 505 } else if (backup_do_embed(dsp, bp)) { 506 /* it's an embedded level-0 block of a regular object */ 507 int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; 508 err = dump_write_embedded(dsp, zb->zb_object, 509 zb->zb_blkid * blksz, blksz, bp); 510 } else { /* it's a level-0 block of a regular object */ 511 uint32_t aflags = ARC_WAIT; 512 arc_buf_t *abuf; 513 int blksz = BP_GET_LSIZE(bp); 514 515 ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); 516 ASSERT0(zb->zb_level); 517 if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 518 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 519 &aflags, zb) != 0) { 520 if (zfs_send_corrupt_data) { 521 /* Send a block filled with 0x"zfs badd bloc" */ 522 abuf = arc_buf_alloc(spa, blksz, &abuf, 523 ARC_BUFC_DATA); 524 uint64_t *ptr; 525 for (ptr = abuf->b_data; 526 (char *)ptr < (char *)abuf->b_data + blksz; 527 ptr++) 528 *ptr = 0x2f5baddb10c; 529 } else { 530 return (SET_ERROR(EIO)); 531 } 532 } 533 534 err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz, 535 blksz, bp, abuf->b_data); 536 (void) arc_buf_remove_ref(abuf, &abuf); 537 } 538 539 ASSERT(err == 0 || err == EINTR); 540 return (err); 541} 542 543/* 544 * Releases dp using the specified tag. 545 */ 546static int 547dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, 548 zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok, 549#ifdef illumos 550 int outfd, vnode_t *vp, offset_t *off) 551#else 552 int outfd, struct file *fp, offset_t *off) 553#endif 554{ 555 objset_t *os; 556 dmu_replay_record_t *drr; 557 dmu_sendarg_t *dsp; 558 int err; 559 uint64_t fromtxg = 0; 560 uint64_t featureflags = 0; 561 562 err = dmu_objset_from_ds(ds, &os); 563 if (err != 0) { 564 dsl_pool_rele(dp, tag); 565 return (err); 566 } 567 568 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); 569 drr->drr_type = DRR_BEGIN; 570 drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; 571 DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, 572 DMU_SUBSTREAM); 573 574#ifdef _KERNEL 575 if (dmu_objset_type(os) == DMU_OST_ZFS) { 576 uint64_t version; 577 if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) { 578 kmem_free(drr, sizeof (dmu_replay_record_t)); 579 dsl_pool_rele(dp, tag); 580 return (SET_ERROR(EINVAL)); 581 } 582 if (version >= ZPL_VERSION_SA) { 583 featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; 584 } 585 } 586#endif 587 588 if (embedok && 589 spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { 590 featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; 591 if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) 592 featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4; 593 } else { 594 embedok = B_FALSE; 595 } 596 597 DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, 598 featureflags); 599 600 drr->drr_u.drr_begin.drr_creation_time = 601 ds->ds_phys->ds_creation_time; 602 drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); 603 if (is_clone) 604 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; 605 drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; 606 if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) 607 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; 608 609 if (fromzb != NULL) { 610 drr->drr_u.drr_begin.drr_fromguid = fromzb->zbm_guid; 611 fromtxg = fromzb->zbm_creation_txg; 612 } 613 dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); 614 if (!dsl_dataset_is_snapshot(ds)) { 615 (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", 616 sizeof (drr->drr_u.drr_begin.drr_toname)); 617 } 618 619 dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); 620 621 dsp->dsa_drr = drr; 622 dsp->dsa_outfd = outfd; 623 dsp->dsa_proc = curproc; 624 dsp->dsa_td = curthread; 625 dsp->dsa_fp = fp; 626 dsp->dsa_os = os; 627 dsp->dsa_off = off; 628 dsp->dsa_toguid = ds->ds_phys->ds_guid; 629 ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0); 630 dsp->dsa_pending_op = PENDING_NONE; 631 dsp->dsa_incremental = (fromzb != NULL); 632 dsp->dsa_featureflags = featureflags; 633 634 mutex_enter(&ds->ds_sendstream_lock); 635 list_insert_head(&ds->ds_sendstreams, dsp); 636 mutex_exit(&ds->ds_sendstream_lock); 637 638 dsl_dataset_long_hold(ds, FTAG); 639 dsl_pool_rele(dp, tag); 640 641 if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { 642 err = dsp->dsa_err; 643 goto out; 644 } 645 646 err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, 647 backup_cb, dsp); 648 649 if (dsp->dsa_pending_op != PENDING_NONE) 650 if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) 651 err = SET_ERROR(EINTR); 652 653 if (err != 0) { 654 if (err == EINTR && dsp->dsa_err != 0) 655 err = dsp->dsa_err; 656 goto out; 657 } 658 659 bzero(drr, sizeof (dmu_replay_record_t)); 660 drr->drr_type = DRR_END; 661 drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; 662 drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; 663 664 if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { 665 err = dsp->dsa_err; 666 goto out; 667 } 668 669out: 670 mutex_enter(&ds->ds_sendstream_lock); 671 list_remove(&ds->ds_sendstreams, dsp); 672 mutex_exit(&ds->ds_sendstream_lock); 673 674 kmem_free(drr, sizeof (dmu_replay_record_t)); 675 kmem_free(dsp, sizeof (dmu_sendarg_t)); 676 677 dsl_dataset_long_rele(ds, FTAG); 678 679 return (err); 680} 681 682int 683dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, 684#ifdef illumos 685 boolean_t embedok, int outfd, vnode_t *vp, offset_t *off) 686#else 687 boolean_t embedok, int outfd, struct file *fp, offset_t *off) 688#endif 689{ 690 dsl_pool_t *dp; 691 dsl_dataset_t *ds; 692 dsl_dataset_t *fromds = NULL; 693 int err; 694 695 err = dsl_pool_hold(pool, FTAG, &dp); 696 if (err != 0) 697 return (err); 698 699 err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds); 700 if (err != 0) { 701 dsl_pool_rele(dp, FTAG); 702 return (err); 703 } 704 705 if (fromsnap != 0) { 706 zfs_bookmark_phys_t zb; 707 boolean_t is_clone; 708 709 err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds); 710 if (err != 0) { 711 dsl_dataset_rele(ds, FTAG); 712 dsl_pool_rele(dp, FTAG); 713 return (err); 714 } 715 if (!dsl_dataset_is_before(ds, fromds, 0)) 716 err = SET_ERROR(EXDEV); 717 zb.zbm_creation_time = fromds->ds_phys->ds_creation_time; 718 zb.zbm_creation_txg = fromds->ds_phys->ds_creation_txg; 719 zb.zbm_guid = fromds->ds_phys->ds_guid; 720 is_clone = (fromds->ds_dir != ds->ds_dir); 721 dsl_dataset_rele(fromds, FTAG); 722 err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok, 723 outfd, fp, off); 724 } else { 725 err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok, 726 outfd, fp, off); 727 } 728 dsl_dataset_rele(ds, FTAG); 729 return (err); 730} 731 732int 733dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, 734#ifdef illumos 735 int outfd, vnode_t *vp, offset_t *off) 736#else 737 int outfd, struct file *fp, offset_t *off) 738#endif 739{ 740 dsl_pool_t *dp; 741 dsl_dataset_t *ds; 742 int err; 743 boolean_t owned = B_FALSE; 744 745 if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) 746 return (SET_ERROR(EINVAL)); 747 748 err = dsl_pool_hold(tosnap, FTAG, &dp); 749 if (err != 0) 750 return (err); 751 752 if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) { 753 /* 754 * We are sending a filesystem or volume. Ensure 755 * that it doesn't change by owning the dataset. 756 */ 757 err = dsl_dataset_own(dp, tosnap, FTAG, &ds); 758 owned = B_TRUE; 759 } else { 760 err = dsl_dataset_hold(dp, tosnap, FTAG, &ds); 761 } 762 if (err != 0) { 763 dsl_pool_rele(dp, FTAG); 764 return (err); 765 } 766 767 if (fromsnap != NULL) { 768 zfs_bookmark_phys_t zb; 769 boolean_t is_clone = B_FALSE; 770 int fsnamelen = strchr(tosnap, '@') - tosnap; 771 772 /* 773 * If the fromsnap is in a different filesystem, then 774 * mark the send stream as a clone. 775 */ 776 if (strncmp(tosnap, fromsnap, fsnamelen) != 0 || 777 (fromsnap[fsnamelen] != '@' && 778 fromsnap[fsnamelen] != '#')) { 779 is_clone = B_TRUE; 780 } 781 782 if (strchr(fromsnap, '@')) { 783 dsl_dataset_t *fromds; 784 err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds); 785 if (err == 0) { 786 if (!dsl_dataset_is_before(ds, fromds, 0)) 787 err = SET_ERROR(EXDEV); 788 zb.zbm_creation_time = 789 fromds->ds_phys->ds_creation_time; 790 zb.zbm_creation_txg = 791 fromds->ds_phys->ds_creation_txg; 792 zb.zbm_guid = fromds->ds_phys->ds_guid; 793 is_clone = (ds->ds_dir != fromds->ds_dir); 794 dsl_dataset_rele(fromds, FTAG); 795 } 796 } else { 797 err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb); 798 } 799 if (err != 0) { 800 dsl_dataset_rele(ds, FTAG); 801 dsl_pool_rele(dp, FTAG); 802 return (err); 803 } 804 err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok, 805 outfd, fp, off); 806 } else { 807 err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok, 808 outfd, fp, off); 809 } 810 if (owned) 811 dsl_dataset_disown(ds, FTAG); 812 else 813 dsl_dataset_rele(ds, FTAG); 814 return (err); 815} 816 817int 818dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep) 819{ 820 dsl_pool_t *dp = ds->ds_dir->dd_pool; 821 int err; 822 uint64_t size; 823 824 ASSERT(dsl_pool_config_held(dp)); 825 826 /* tosnap must be a snapshot */ 827 if (!dsl_dataset_is_snapshot(ds)) 828 return (SET_ERROR(EINVAL)); 829 830 /* 831 * fromsnap must be an earlier snapshot from the same fs as tosnap, 832 * or the origin's fs. 833 */ 834 if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0)) 835 return (SET_ERROR(EXDEV)); 836 837 /* Get uncompressed size estimate of changed data. */ 838 if (fromds == NULL) { 839 size = ds->ds_phys->ds_uncompressed_bytes; 840 } else { 841 uint64_t used, comp; 842 err = dsl_dataset_space_written(fromds, ds, 843 &used, &comp, &size); 844 if (err != 0) 845 return (err); 846 } 847 848 /* 849 * Assume that space (both on-disk and in-stream) is dominated by 850 * data. We will adjust for indirect blocks and the copies property, 851 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). 852 */ 853 854 /* 855 * Subtract out approximate space used by indirect blocks. 856 * Assume most space is used by data blocks (non-indirect, non-dnode). 857 * Assume all blocks are recordsize. Assume ditto blocks and 858 * internal fragmentation counter out compression. 859 * 860 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per 861 * block, which we observe in practice. 862 */ 863 uint64_t recordsize; 864 err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize); 865 if (err != 0) 866 return (err); 867 size -= size / recordsize * sizeof (blkptr_t); 868 869 /* Add in the space for the record associated with each block. */ 870 size += size / recordsize * sizeof (dmu_replay_record_t); 871 872 *sizep = size; 873 874 return (0); 875} 876 877typedef struct dmu_recv_begin_arg { 878 const char *drba_origin; 879 dmu_recv_cookie_t *drba_cookie; 880 cred_t *drba_cred; 881 uint64_t drba_snapobj; 882} dmu_recv_begin_arg_t; 883 884static int 885recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, 886 uint64_t fromguid) 887{ 888 uint64_t val; 889 int error; 890 dsl_pool_t *dp = ds->ds_dir->dd_pool; 891 892 /* temporary clone name must not exist */ 893 error = zap_lookup(dp->dp_meta_objset, 894 ds->ds_dir->dd_phys->dd_child_dir_zapobj, recv_clone_name, 895 8, 1, &val); 896 if (error != ENOENT) 897 return (error == 0 ? EBUSY : error); 898 899 /* new snapshot name must not exist */ 900 error = zap_lookup(dp->dp_meta_objset, 901 ds->ds_phys->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap, 902 8, 1, &val); 903 if (error != ENOENT) 904 return (error == 0 ? EEXIST : error); 905 906 /* 907 * Check snapshot limit before receiving. We'll recheck again at the 908 * end, but might as well abort before receiving if we're already over 909 * the limit. 910 * 911 * Note that we do not check the file system limit with 912 * dsl_dir_fscount_check because the temporary %clones don't count 913 * against that limit. 914 */ 915 error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, 916 NULL, drba->drba_cred); 917 if (error != 0) 918 return (error); 919 920 if (fromguid != 0) { 921 dsl_dataset_t *snap; 922 uint64_t obj = ds->ds_phys->ds_prev_snap_obj; 923 924 /* Find snapshot in this dir that matches fromguid. */ 925 while (obj != 0) { 926 error = dsl_dataset_hold_obj(dp, obj, FTAG, 927 &snap); 928 if (error != 0) 929 return (SET_ERROR(ENODEV)); 930 if (snap->ds_dir != ds->ds_dir) { 931 dsl_dataset_rele(snap, FTAG); 932 return (SET_ERROR(ENODEV)); 933 } 934 if (snap->ds_phys->ds_guid == fromguid) 935 break; 936 obj = snap->ds_phys->ds_prev_snap_obj; 937 dsl_dataset_rele(snap, FTAG); 938 } 939 if (obj == 0) 940 return (SET_ERROR(ENODEV)); 941 942 if (drba->drba_cookie->drc_force) { 943 drba->drba_snapobj = obj; 944 } else { 945 /* 946 * If we are not forcing, there must be no 947 * changes since fromsnap. 948 */ 949 if (dsl_dataset_modified_since_snap(ds, snap)) { 950 dsl_dataset_rele(snap, FTAG); 951 return (SET_ERROR(ETXTBSY)); 952 } 953 drba->drba_snapobj = ds->ds_prev->ds_object; 954 } 955 956 dsl_dataset_rele(snap, FTAG); 957 } else { 958 /* if full, most recent snapshot must be $ORIGIN */ 959 if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL) 960 return (SET_ERROR(ENODEV)); 961 drba->drba_snapobj = ds->ds_phys->ds_prev_snap_obj; 962 } 963 964 return (0); 965 966} 967 968static int 969dmu_recv_begin_check(void *arg, dmu_tx_t *tx) 970{ 971 dmu_recv_begin_arg_t *drba = arg; 972 dsl_pool_t *dp = dmu_tx_pool(tx); 973 struct drr_begin *drrb = drba->drba_cookie->drc_drrb; 974 uint64_t fromguid = drrb->drr_fromguid; 975 int flags = drrb->drr_flags; 976 int error; 977 uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); 978 dsl_dataset_t *ds; 979 const char *tofs = drba->drba_cookie->drc_tofs; 980 981 /* already checked */ 982 ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); 983 984 if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == 985 DMU_COMPOUNDSTREAM || 986 drrb->drr_type >= DMU_OST_NUMTYPES || 987 ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL)) 988 return (SET_ERROR(EINVAL)); 989 990 /* Verify pool version supports SA if SA_SPILL feature set */ 991 if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && 992 spa_version(dp->dp_spa) < SPA_VERSION_SA) 993 return (SET_ERROR(ENOTSUP)); 994 995 /* 996 * The receiving code doesn't know how to translate a WRITE_EMBEDDED 997 * record to a plan WRITE record, so the pool must have the 998 * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED 999 * records. Same with WRITE_EMBEDDED records that use LZ4 compression. 1000 */ 1001 if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && 1002 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) 1003 return (SET_ERROR(ENOTSUP)); 1004 if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) && 1005 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) 1006 return (SET_ERROR(ENOTSUP)); 1007 1008 error = dsl_dataset_hold(dp, tofs, FTAG, &ds); 1009 if (error == 0) { 1010 /* target fs already exists; recv into temp clone */ 1011 1012 /* Can't recv a clone into an existing fs */ 1013 if (flags & DRR_FLAG_CLONE) { 1014 dsl_dataset_rele(ds, FTAG); 1015 return (SET_ERROR(EINVAL)); 1016 } 1017 1018 error = recv_begin_check_existing_impl(drba, ds, fromguid); 1019 dsl_dataset_rele(ds, FTAG); 1020 } else if (error == ENOENT) { 1021 /* target fs does not exist; must be a full backup or clone */ 1022 char buf[MAXNAMELEN]; 1023 1024 /* 1025 * If it's a non-clone incremental, we are missing the 1026 * target fs, so fail the recv. 1027 */ 1028 if (fromguid != 0 && !(flags & DRR_FLAG_CLONE)) 1029 return (SET_ERROR(ENOENT)); 1030 1031 /* Open the parent of tofs */ 1032 ASSERT3U(strlen(tofs), <, MAXNAMELEN); 1033 (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); 1034 error = dsl_dataset_hold(dp, buf, FTAG, &ds); 1035 if (error != 0) 1036 return (error); 1037 1038 /* 1039 * Check filesystem and snapshot limits before receiving. We'll 1040 * recheck snapshot limits again at the end (we create the 1041 * filesystems and increment those counts during begin_sync). 1042 */ 1043 error = dsl_fs_ss_limit_check(ds->ds_dir, 1, 1044 ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred); 1045 if (error != 0) { 1046 dsl_dataset_rele(ds, FTAG); 1047 return (error); 1048 } 1049 1050 error = dsl_fs_ss_limit_check(ds->ds_dir, 1, 1051 ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); 1052 if (error != 0) { 1053 dsl_dataset_rele(ds, FTAG); 1054 return (error); 1055 } 1056 1057 if (drba->drba_origin != NULL) { 1058 dsl_dataset_t *origin; 1059 error = dsl_dataset_hold(dp, drba->drba_origin, 1060 FTAG, &origin); 1061 if (error != 0) { 1062 dsl_dataset_rele(ds, FTAG); 1063 return (error); 1064 } 1065 if (!dsl_dataset_is_snapshot(origin)) { 1066 dsl_dataset_rele(origin, FTAG); 1067 dsl_dataset_rele(ds, FTAG); 1068 return (SET_ERROR(EINVAL)); 1069 } 1070 if (origin->ds_phys->ds_guid != fromguid) { 1071 dsl_dataset_rele(origin, FTAG); 1072 dsl_dataset_rele(ds, FTAG); 1073 return (SET_ERROR(ENODEV)); 1074 } 1075 dsl_dataset_rele(origin, FTAG); 1076 } 1077 dsl_dataset_rele(ds, FTAG); 1078 error = 0; 1079 } 1080 return (error); 1081} 1082 1083static void 1084dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) 1085{ 1086 dmu_recv_begin_arg_t *drba = arg; 1087 dsl_pool_t *dp = dmu_tx_pool(tx); 1088 struct drr_begin *drrb = drba->drba_cookie->drc_drrb; 1089 const char *tofs = drba->drba_cookie->drc_tofs; 1090 dsl_dataset_t *ds, *newds; 1091 uint64_t dsobj; 1092 int error; 1093 uint64_t crflags; 1094 1095 crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ? 1096 DS_FLAG_CI_DATASET : 0; 1097 1098 error = dsl_dataset_hold(dp, tofs, FTAG, &ds); 1099 if (error == 0) { 1100 /* create temporary clone */ 1101 dsl_dataset_t *snap = NULL; 1102 if (drba->drba_snapobj != 0) { 1103 VERIFY0(dsl_dataset_hold_obj(dp, 1104 drba->drba_snapobj, FTAG, &snap)); 1105 } 1106 dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, 1107 snap, crflags, drba->drba_cred, tx); 1108 dsl_dataset_rele(snap, FTAG); 1109 dsl_dataset_rele(ds, FTAG); 1110 } else { 1111 dsl_dir_t *dd; 1112 const char *tail; 1113 dsl_dataset_t *origin = NULL; 1114 1115 VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); 1116 1117 if (drba->drba_origin != NULL) { 1118 VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, 1119 FTAG, &origin)); 1120 } 1121 1122 /* Create new dataset. */ 1123 dsobj = dsl_dataset_create_sync(dd, 1124 strrchr(tofs, '/') + 1, 1125 origin, crflags, drba->drba_cred, tx); 1126 if (origin != NULL) 1127 dsl_dataset_rele(origin, FTAG); 1128 dsl_dir_rele(dd, FTAG); 1129 drba->drba_cookie->drc_newfs = B_TRUE; 1130 } 1131 VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); 1132 1133 dmu_buf_will_dirty(newds->ds_dbuf, tx); 1134 newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; 1135 1136 /* 1137 * If we actually created a non-clone, we need to create the 1138 * objset in our new dataset. 1139 */ 1140 if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) { 1141 (void) dmu_objset_create_impl(dp->dp_spa, 1142 newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); 1143 } 1144 1145 drba->drba_cookie->drc_ds = newds; 1146 1147 spa_history_log_internal_ds(newds, "receive", tx, ""); 1148} 1149 1150/* 1151 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() 1152 * succeeds; otherwise we will leak the holds on the datasets. 1153 */ 1154int 1155dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, 1156 boolean_t force, char *origin, dmu_recv_cookie_t *drc) 1157{ 1158 dmu_recv_begin_arg_t drba = { 0 }; 1159 dmu_replay_record_t *drr; 1160 1161 bzero(drc, sizeof (dmu_recv_cookie_t)); 1162 drc->drc_drrb = drrb; 1163 drc->drc_tosnap = tosnap; 1164 drc->drc_tofs = tofs; 1165 drc->drc_force = force; 1166 drc->drc_cred = CRED(); 1167 1168 if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) 1169 drc->drc_byteswap = B_TRUE; 1170 else if (drrb->drr_magic != DMU_BACKUP_MAGIC) 1171 return (SET_ERROR(EINVAL)); 1172 1173 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); 1174 drr->drr_type = DRR_BEGIN; 1175 drr->drr_u.drr_begin = *drc->drc_drrb; 1176 if (drc->drc_byteswap) { 1177 fletcher_4_incremental_byteswap(drr, 1178 sizeof (dmu_replay_record_t), &drc->drc_cksum); 1179 } else { 1180 fletcher_4_incremental_native(drr, 1181 sizeof (dmu_replay_record_t), &drc->drc_cksum); 1182 } 1183 kmem_free(drr, sizeof (dmu_replay_record_t)); 1184 1185 if (drc->drc_byteswap) { 1186 drrb->drr_magic = BSWAP_64(drrb->drr_magic); 1187 drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); 1188 drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); 1189 drrb->drr_type = BSWAP_32(drrb->drr_type); 1190 drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); 1191 drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); 1192 } 1193 1194 drba.drba_origin = origin; 1195 drba.drba_cookie = drc; 1196 drba.drba_cred = CRED(); 1197 1198 return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync, 1199 &drba, 5, ZFS_SPACE_CHECK_NORMAL)); 1200} 1201 1202struct restorearg { 1203 int err; 1204 boolean_t byteswap; 1205 kthread_t *td; 1206 struct file *fp; 1207 char *buf; 1208 uint64_t voff; 1209 int bufsize; /* amount of memory allocated for buf */ 1210 zio_cksum_t cksum; 1211 avl_tree_t *guid_to_ds_map; 1212}; 1213 1214typedef struct guid_map_entry { 1215 uint64_t guid; 1216 dsl_dataset_t *gme_ds; 1217 avl_node_t avlnode; 1218} guid_map_entry_t; 1219 1220static int 1221guid_compare(const void *arg1, const void *arg2) 1222{ 1223 const guid_map_entry_t *gmep1 = arg1; 1224 const guid_map_entry_t *gmep2 = arg2; 1225 1226 if (gmep1->guid < gmep2->guid) 1227 return (-1); 1228 else if (gmep1->guid > gmep2->guid) 1229 return (1); 1230 return (0); 1231} 1232 1233static void 1234free_guid_map_onexit(void *arg) 1235{ 1236 avl_tree_t *ca = arg; 1237 void *cookie = NULL; 1238 guid_map_entry_t *gmep; 1239 1240 while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { 1241 dsl_dataset_long_rele(gmep->gme_ds, gmep); 1242 dsl_dataset_rele(gmep->gme_ds, gmep); 1243 kmem_free(gmep, sizeof (guid_map_entry_t)); 1244 } 1245 avl_destroy(ca); 1246 kmem_free(ca, sizeof (avl_tree_t)); 1247} 1248 1249static int 1250restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *resid) 1251{ 1252 struct uio auio; 1253 struct iovec aiov; 1254 int error; 1255 1256 aiov.iov_base = buf; 1257 aiov.iov_len = len; 1258 auio.uio_iov = &aiov; 1259 auio.uio_iovcnt = 1; 1260 auio.uio_resid = len; 1261 auio.uio_segflg = UIO_SYSSPACE; 1262 auio.uio_rw = UIO_READ; 1263 auio.uio_offset = off; 1264 auio.uio_td = ra->td; 1265#ifdef _KERNEL 1266 error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td); 1267#else 1268 fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); 1269 error = EOPNOTSUPP; 1270#endif 1271 *resid = auio.uio_resid; 1272 return (error); 1273} 1274 1275static void * 1276restore_read(struct restorearg *ra, int len) 1277{ 1278 void *rv; 1279 int done = 0; 1280 1281 /* some things will require 8-byte alignment, so everything must */ 1282 ASSERT0(len % 8); 1283 1284 while (done < len) { 1285 ssize_t resid; 1286 1287 ra->err = restore_bytes(ra, (caddr_t)ra->buf + done, 1288 len - done, ra->voff, &resid); 1289 1290 if (resid == len - done) 1291 ra->err = SET_ERROR(EINVAL); 1292 ra->voff += len - done - resid; 1293 done = len - resid; 1294 if (ra->err != 0) 1295 return (NULL); 1296 } 1297 1298 ASSERT3U(done, ==, len); 1299 rv = ra->buf; 1300 if (ra->byteswap) 1301 fletcher_4_incremental_byteswap(rv, len, &ra->cksum); 1302 else 1303 fletcher_4_incremental_native(rv, len, &ra->cksum); 1304 return (rv); 1305} 1306 1307static void 1308backup_byteswap(dmu_replay_record_t *drr) 1309{ 1310#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) 1311#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) 1312 drr->drr_type = BSWAP_32(drr->drr_type); 1313 drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); 1314 switch (drr->drr_type) { 1315 case DRR_BEGIN: 1316 DO64(drr_begin.drr_magic); 1317 DO64(drr_begin.drr_versioninfo); 1318 DO64(drr_begin.drr_creation_time); 1319 DO32(drr_begin.drr_type); 1320 DO32(drr_begin.drr_flags); 1321 DO64(drr_begin.drr_toguid); 1322 DO64(drr_begin.drr_fromguid); 1323 break; 1324 case DRR_OBJECT: 1325 DO64(drr_object.drr_object); 1326 DO32(drr_object.drr_type); 1327 DO32(drr_object.drr_bonustype); 1328 DO32(drr_object.drr_blksz); 1329 DO32(drr_object.drr_bonuslen); 1330 DO64(drr_object.drr_toguid); 1331 break; 1332 case DRR_FREEOBJECTS: 1333 DO64(drr_freeobjects.drr_firstobj); 1334 DO64(drr_freeobjects.drr_numobjs); 1335 DO64(drr_freeobjects.drr_toguid); 1336 break; 1337 case DRR_WRITE: 1338 DO64(drr_write.drr_object); 1339 DO32(drr_write.drr_type); 1340 DO64(drr_write.drr_offset); 1341 DO64(drr_write.drr_length); 1342 DO64(drr_write.drr_toguid); 1343 DO64(drr_write.drr_key.ddk_cksum.zc_word[0]); 1344 DO64(drr_write.drr_key.ddk_cksum.zc_word[1]); 1345 DO64(drr_write.drr_key.ddk_cksum.zc_word[2]); 1346 DO64(drr_write.drr_key.ddk_cksum.zc_word[3]); 1347 DO64(drr_write.drr_key.ddk_prop); 1348 break; 1349 case DRR_WRITE_BYREF: 1350 DO64(drr_write_byref.drr_object); 1351 DO64(drr_write_byref.drr_offset); 1352 DO64(drr_write_byref.drr_length); 1353 DO64(drr_write_byref.drr_toguid); 1354 DO64(drr_write_byref.drr_refguid); 1355 DO64(drr_write_byref.drr_refobject); 1356 DO64(drr_write_byref.drr_refoffset); 1357 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]); 1358 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]); 1359 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]); 1360 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]); 1361 DO64(drr_write_byref.drr_key.ddk_prop); 1362 break; 1363 case DRR_WRITE_EMBEDDED: 1364 DO64(drr_write_embedded.drr_object); 1365 DO64(drr_write_embedded.drr_offset); 1366 DO64(drr_write_embedded.drr_length); 1367 DO64(drr_write_embedded.drr_toguid); 1368 DO32(drr_write_embedded.drr_lsize); 1369 DO32(drr_write_embedded.drr_psize); 1370 break; 1371 case DRR_FREE: 1372 DO64(drr_free.drr_object); 1373 DO64(drr_free.drr_offset); 1374 DO64(drr_free.drr_length); 1375 DO64(drr_free.drr_toguid); 1376 break; 1377 case DRR_SPILL: 1378 DO64(drr_spill.drr_object); 1379 DO64(drr_spill.drr_length); 1380 DO64(drr_spill.drr_toguid); 1381 break; 1382 case DRR_END: 1383 DO64(drr_end.drr_checksum.zc_word[0]); 1384 DO64(drr_end.drr_checksum.zc_word[1]); 1385 DO64(drr_end.drr_checksum.zc_word[2]); 1386 DO64(drr_end.drr_checksum.zc_word[3]); 1387 DO64(drr_end.drr_toguid); 1388 break; 1389 } 1390#undef DO64 1391#undef DO32 1392} 1393 1394static int 1395restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) 1396{ 1397 int err; 1398 dmu_tx_t *tx; 1399 void *data = NULL; 1400 1401 if (drro->drr_type == DMU_OT_NONE || 1402 !DMU_OT_IS_VALID(drro->drr_type) || 1403 !DMU_OT_IS_VALID(drro->drr_bonustype) || 1404 drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || 1405 drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || 1406 P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || 1407 drro->drr_blksz < SPA_MINBLOCKSIZE || 1408 drro->drr_blksz > SPA_MAXBLOCKSIZE || 1409 drro->drr_bonuslen > DN_MAX_BONUSLEN) { 1410 return (SET_ERROR(EINVAL)); 1411 } 1412 1413 err = dmu_object_info(os, drro->drr_object, NULL); 1414 1415 if (err != 0 && err != ENOENT) 1416 return (SET_ERROR(EINVAL)); 1417 1418 if (drro->drr_bonuslen) { 1419 data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8)); 1420 if (ra->err != 0) 1421 return (ra->err); 1422 } 1423 1424 if (err == ENOENT) { 1425 /* currently free, want to be allocated */ 1426 tx = dmu_tx_create(os); 1427 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1428 err = dmu_tx_assign(tx, TXG_WAIT); 1429 if (err != 0) { 1430 dmu_tx_abort(tx); 1431 return (err); 1432 } 1433 err = dmu_object_claim(os, drro->drr_object, 1434 drro->drr_type, drro->drr_blksz, 1435 drro->drr_bonustype, drro->drr_bonuslen, tx); 1436 dmu_tx_commit(tx); 1437 } else { 1438 /* currently allocated, want to be allocated */ 1439 err = dmu_object_reclaim(os, drro->drr_object, 1440 drro->drr_type, drro->drr_blksz, 1441 drro->drr_bonustype, drro->drr_bonuslen); 1442 } 1443 if (err != 0) { 1444 return (SET_ERROR(EINVAL)); 1445 } 1446 1447 tx = dmu_tx_create(os); 1448 dmu_tx_hold_bonus(tx, drro->drr_object); 1449 err = dmu_tx_assign(tx, TXG_WAIT); 1450 if (err != 0) { 1451 dmu_tx_abort(tx); 1452 return (err); 1453 } 1454 1455 dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype, 1456 tx); 1457 dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); 1458 1459 if (data != NULL) { 1460 dmu_buf_t *db; 1461 1462 VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); 1463 dmu_buf_will_dirty(db, tx); 1464 1465 ASSERT3U(db->db_size, >=, drro->drr_bonuslen); 1466 bcopy(data, db->db_data, drro->drr_bonuslen); 1467 if (ra->byteswap) { 1468 dmu_object_byteswap_t byteswap = 1469 DMU_OT_BYTESWAP(drro->drr_bonustype); 1470 dmu_ot_byteswap[byteswap].ob_func(db->db_data, 1471 drro->drr_bonuslen); 1472 } 1473 dmu_buf_rele(db, FTAG); 1474 } 1475 dmu_tx_commit(tx); 1476 return (0); 1477} 1478 1479/* ARGSUSED */ 1480static int 1481restore_freeobjects(struct restorearg *ra, objset_t *os, 1482 struct drr_freeobjects *drrfo) 1483{ 1484 uint64_t obj; 1485 1486 if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) 1487 return (SET_ERROR(EINVAL)); 1488 1489 for (obj = drrfo->drr_firstobj; 1490 obj < drrfo->drr_firstobj + drrfo->drr_numobjs; 1491 (void) dmu_object_next(os, &obj, FALSE, 0)) { 1492 int err; 1493 1494 if (dmu_object_info(os, obj, NULL) != 0) 1495 continue; 1496 1497 err = dmu_free_long_object(os, obj); 1498 if (err != 0) 1499 return (err); 1500 } 1501 return (0); 1502} 1503 1504static int 1505restore_write(struct restorearg *ra, objset_t *os, 1506 struct drr_write *drrw) 1507{ 1508 dmu_tx_t *tx; 1509 void *data; 1510 int err; 1511 1512 if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || 1513 !DMU_OT_IS_VALID(drrw->drr_type)) 1514 return (SET_ERROR(EINVAL)); 1515 1516 data = restore_read(ra, drrw->drr_length); 1517 if (data == NULL) 1518 return (ra->err); 1519 1520 if (dmu_object_info(os, drrw->drr_object, NULL) != 0) 1521 return (SET_ERROR(EINVAL)); 1522 1523 tx = dmu_tx_create(os); 1524 1525 dmu_tx_hold_write(tx, drrw->drr_object, 1526 drrw->drr_offset, drrw->drr_length); 1527 err = dmu_tx_assign(tx, TXG_WAIT); 1528 if (err != 0) { 1529 dmu_tx_abort(tx); 1530 return (err); 1531 } 1532 if (ra->byteswap) { 1533 dmu_object_byteswap_t byteswap = 1534 DMU_OT_BYTESWAP(drrw->drr_type); 1535 dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length); 1536 } 1537 dmu_write(os, drrw->drr_object, 1538 drrw->drr_offset, drrw->drr_length, data, tx); 1539 dmu_tx_commit(tx); 1540 return (0); 1541} 1542 1543/* 1544 * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed 1545 * streams to refer to a copy of the data that is already on the 1546 * system because it came in earlier in the stream. This function 1547 * finds the earlier copy of the data, and uses that copy instead of 1548 * data from the stream to fulfill this write. 1549 */ 1550static int 1551restore_write_byref(struct restorearg *ra, objset_t *os, 1552 struct drr_write_byref *drrwbr) 1553{ 1554 dmu_tx_t *tx; 1555 int err; 1556 guid_map_entry_t gmesrch; 1557 guid_map_entry_t *gmep; 1558 avl_index_t where; 1559 objset_t *ref_os = NULL; 1560 dmu_buf_t *dbp; 1561 1562 if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) 1563 return (SET_ERROR(EINVAL)); 1564 1565 /* 1566 * If the GUID of the referenced dataset is different from the 1567 * GUID of the target dataset, find the referenced dataset. 1568 */ 1569 if (drrwbr->drr_toguid != drrwbr->drr_refguid) { 1570 gmesrch.guid = drrwbr->drr_refguid; 1571 if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch, 1572 &where)) == NULL) { 1573 return (SET_ERROR(EINVAL)); 1574 } 1575 if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) 1576 return (SET_ERROR(EINVAL)); 1577 } else { 1578 ref_os = os; 1579 } 1580 1581 err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, 1582 drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); 1583 if (err != 0) 1584 return (err); 1585 1586 tx = dmu_tx_create(os); 1587 1588 dmu_tx_hold_write(tx, drrwbr->drr_object, 1589 drrwbr->drr_offset, drrwbr->drr_length); 1590 err = dmu_tx_assign(tx, TXG_WAIT); 1591 if (err != 0) { 1592 dmu_tx_abort(tx); 1593 return (err); 1594 } 1595 dmu_write(os, drrwbr->drr_object, 1596 drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); 1597 dmu_buf_rele(dbp, FTAG); 1598 dmu_tx_commit(tx); 1599 return (0); 1600} 1601 1602static int 1603restore_write_embedded(struct restorearg *ra, objset_t *os, 1604 struct drr_write_embedded *drrwnp) 1605{ 1606 dmu_tx_t *tx; 1607 int err; 1608 void *data; 1609 1610 if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset) 1611 return (EINVAL); 1612 1613 if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE) 1614 return (EINVAL); 1615 1616 if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES) 1617 return (EINVAL); 1618 if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS) 1619 return (EINVAL); 1620 1621 data = restore_read(ra, P2ROUNDUP(drrwnp->drr_psize, 8)); 1622 if (data == NULL) 1623 return (ra->err); 1624 1625 tx = dmu_tx_create(os); 1626 1627 dmu_tx_hold_write(tx, drrwnp->drr_object, 1628 drrwnp->drr_offset, drrwnp->drr_length); 1629 err = dmu_tx_assign(tx, TXG_WAIT); 1630 if (err != 0) { 1631 dmu_tx_abort(tx); 1632 return (err); 1633 } 1634 1635 dmu_write_embedded(os, drrwnp->drr_object, 1636 drrwnp->drr_offset, data, drrwnp->drr_etype, 1637 drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize, 1638 ra->byteswap ^ ZFS_HOST_BYTEORDER, tx); 1639 1640 dmu_tx_commit(tx); 1641 return (0); 1642} 1643 1644static int 1645restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) 1646{ 1647 dmu_tx_t *tx; 1648 void *data; 1649 dmu_buf_t *db, *db_spill; 1650 int err; 1651 1652 if (drrs->drr_length < SPA_MINBLOCKSIZE || 1653 drrs->drr_length > SPA_MAXBLOCKSIZE) 1654 return (SET_ERROR(EINVAL)); 1655 1656 data = restore_read(ra, drrs->drr_length); 1657 if (data == NULL) 1658 return (ra->err); 1659 1660 if (dmu_object_info(os, drrs->drr_object, NULL) != 0) 1661 return (SET_ERROR(EINVAL)); 1662 1663 VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db)); 1664 if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { 1665 dmu_buf_rele(db, FTAG); 1666 return (err); 1667 } 1668 1669 tx = dmu_tx_create(os); 1670 1671 dmu_tx_hold_spill(tx, db->db_object); 1672 1673 err = dmu_tx_assign(tx, TXG_WAIT); 1674 if (err != 0) { 1675 dmu_buf_rele(db, FTAG); 1676 dmu_buf_rele(db_spill, FTAG); 1677 dmu_tx_abort(tx); 1678 return (err); 1679 } 1680 dmu_buf_will_dirty(db_spill, tx); 1681 1682 if (db_spill->db_size < drrs->drr_length) 1683 VERIFY(0 == dbuf_spill_set_blksz(db_spill, 1684 drrs->drr_length, tx)); 1685 bcopy(data, db_spill->db_data, drrs->drr_length); 1686 1687 dmu_buf_rele(db, FTAG); 1688 dmu_buf_rele(db_spill, FTAG); 1689 1690 dmu_tx_commit(tx); 1691 return (0); 1692} 1693 1694/* ARGSUSED */ 1695static int 1696restore_free(struct restorearg *ra, objset_t *os, 1697 struct drr_free *drrf) 1698{ 1699 int err; 1700 1701 if (drrf->drr_length != -1ULL && 1702 drrf->drr_offset + drrf->drr_length < drrf->drr_offset) 1703 return (SET_ERROR(EINVAL)); 1704 1705 if (dmu_object_info(os, drrf->drr_object, NULL) != 0) 1706 return (SET_ERROR(EINVAL)); 1707 1708 err = dmu_free_long_range(os, drrf->drr_object, 1709 drrf->drr_offset, drrf->drr_length); 1710 return (err); 1711} 1712 1713/* used to destroy the drc_ds on error */ 1714static void 1715dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) 1716{ 1717 char name[MAXNAMELEN]; 1718 dsl_dataset_name(drc->drc_ds, name); 1719 dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); 1720 (void) dsl_destroy_head(name); 1721} 1722 1723/* 1724 * NB: callers *must* call dmu_recv_end() if this succeeds. 1725 */ 1726int 1727dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, 1728 int cleanup_fd, uint64_t *action_handlep) 1729{ 1730 struct restorearg ra = { 0 }; 1731 dmu_replay_record_t *drr; 1732 objset_t *os; 1733 zio_cksum_t pcksum; 1734 int featureflags; 1735 1736 ra.byteswap = drc->drc_byteswap; 1737 ra.cksum = drc->drc_cksum; 1738 ra.td = curthread; 1739 ra.fp = fp; 1740 ra.voff = *voffp; 1741 ra.bufsize = 1<<20; 1742 ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); 1743 1744 /* these were verified in dmu_recv_begin */ 1745 ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, 1746 DMU_SUBSTREAM); 1747 ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES); 1748 1749 /* 1750 * Open the objset we are modifying. 1751 */ 1752 VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os)); 1753 1754 ASSERT(drc->drc_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); 1755 1756 featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); 1757 1758 /* if this stream is dedup'ed, set up the avl tree for guid mapping */ 1759 if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { 1760 minor_t minor; 1761 1762 if (cleanup_fd == -1) { 1763 ra.err = SET_ERROR(EBADF); 1764 goto out; 1765 } 1766 ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); 1767 if (ra.err != 0) { 1768 cleanup_fd = -1; 1769 goto out; 1770 } 1771 1772 if (*action_handlep == 0) { 1773 ra.guid_to_ds_map = 1774 kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); 1775 avl_create(ra.guid_to_ds_map, guid_compare, 1776 sizeof (guid_map_entry_t), 1777 offsetof(guid_map_entry_t, avlnode)); 1778 ra.err = zfs_onexit_add_cb(minor, 1779 free_guid_map_onexit, ra.guid_to_ds_map, 1780 action_handlep); 1781 if (ra.err != 0) 1782 goto out; 1783 } else { 1784 ra.err = zfs_onexit_cb_data(minor, *action_handlep, 1785 (void **)&ra.guid_to_ds_map); 1786 if (ra.err != 0) 1787 goto out; 1788 } 1789 1790 drc->drc_guid_to_ds_map = ra.guid_to_ds_map; 1791 } 1792 1793 /* 1794 * Read records and process them. 1795 */ 1796 pcksum = ra.cksum; 1797 while (ra.err == 0 && 1798 NULL != (drr = restore_read(&ra, sizeof (*drr)))) { 1799 if (issig(JUSTLOOKING) && issig(FORREAL)) { 1800 ra.err = SET_ERROR(EINTR); 1801 goto out; 1802 } 1803 1804 if (ra.byteswap) 1805 backup_byteswap(drr); 1806 1807 switch (drr->drr_type) { 1808 case DRR_OBJECT: 1809 { 1810 /* 1811 * We need to make a copy of the record header, 1812 * because restore_{object,write} may need to 1813 * restore_read(), which will invalidate drr. 1814 */ 1815 struct drr_object drro = drr->drr_u.drr_object; 1816 ra.err = restore_object(&ra, os, &drro); 1817 break; 1818 } 1819 case DRR_FREEOBJECTS: 1820 { 1821 struct drr_freeobjects drrfo = 1822 drr->drr_u.drr_freeobjects; 1823 ra.err = restore_freeobjects(&ra, os, &drrfo); 1824 break; 1825 } 1826 case DRR_WRITE: 1827 { 1828 struct drr_write drrw = drr->drr_u.drr_write; 1829 ra.err = restore_write(&ra, os, &drrw); 1830 break; 1831 } 1832 case DRR_WRITE_BYREF: 1833 { 1834 struct drr_write_byref drrwbr = 1835 drr->drr_u.drr_write_byref; 1836 ra.err = restore_write_byref(&ra, os, &drrwbr); 1837 break; 1838 } 1839 case DRR_WRITE_EMBEDDED: 1840 { 1841 struct drr_write_embedded drrwe = 1842 drr->drr_u.drr_write_embedded; 1843 ra.err = restore_write_embedded(&ra, os, &drrwe); 1844 break; 1845 } 1846 case DRR_FREE: 1847 { 1848 struct drr_free drrf = drr->drr_u.drr_free; 1849 ra.err = restore_free(&ra, os, &drrf); 1850 break; 1851 } 1852 case DRR_END: 1853 { 1854 struct drr_end drre = drr->drr_u.drr_end; 1855 /* 1856 * We compare against the *previous* checksum 1857 * value, because the stored checksum is of 1858 * everything before the DRR_END record. 1859 */ 1860 if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum)) 1861 ra.err = SET_ERROR(ECKSUM); 1862 goto out; 1863 } 1864 case DRR_SPILL: 1865 { 1866 struct drr_spill drrs = drr->drr_u.drr_spill; 1867 ra.err = restore_spill(&ra, os, &drrs); 1868 break; 1869 } 1870 default: 1871 ra.err = SET_ERROR(EINVAL); 1872 goto out; 1873 } 1874 pcksum = ra.cksum; 1875 } 1876 ASSERT(ra.err != 0); 1877 1878out: 1879 if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) 1880 zfs_onexit_fd_rele(cleanup_fd); 1881 1882 if (ra.err != 0) { 1883 /* 1884 * destroy what we created, so we don't leave it in the 1885 * inconsistent restoring state. 1886 */ 1887 dmu_recv_cleanup_ds(drc); 1888 } 1889 1890 kmem_free(ra.buf, ra.bufsize); 1891 *voffp = ra.voff; 1892 return (ra.err); 1893} 1894 1895static int 1896dmu_recv_end_check(void *arg, dmu_tx_t *tx) 1897{ 1898 dmu_recv_cookie_t *drc = arg; 1899 dsl_pool_t *dp = dmu_tx_pool(tx); 1900 int error; 1901 1902 ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); 1903 1904 if (!drc->drc_newfs) { 1905 dsl_dataset_t *origin_head; 1906 1907 error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); 1908 if (error != 0) 1909 return (error); 1910 if (drc->drc_force) { 1911 /* 1912 * We will destroy any snapshots in tofs (i.e. before 1913 * origin_head) that are after the origin (which is 1914 * the snap before drc_ds, because drc_ds can not 1915 * have any snaps of its own). 1916 */ 1917 uint64_t obj = origin_head->ds_phys->ds_prev_snap_obj; 1918 while (obj != drc->drc_ds->ds_phys->ds_prev_snap_obj) { 1919 dsl_dataset_t *snap; 1920 error = dsl_dataset_hold_obj(dp, obj, FTAG, 1921 &snap); 1922 if (error != 0) 1923 return (error); 1924 if (snap->ds_dir != origin_head->ds_dir) 1925 error = SET_ERROR(EINVAL); 1926 if (error == 0) { 1927 error = dsl_destroy_snapshot_check_impl( 1928 snap, B_FALSE); 1929 } 1930 obj = snap->ds_phys->ds_prev_snap_obj; 1931 dsl_dataset_rele(snap, FTAG); 1932 if (error != 0) 1933 return (error); 1934 } 1935 } 1936 error = dsl_dataset_clone_swap_check_impl(drc->drc_ds, 1937 origin_head, drc->drc_force, drc->drc_owner, tx); 1938 if (error != 0) { 1939 dsl_dataset_rele(origin_head, FTAG); 1940 return (error); 1941 } 1942 error = dsl_dataset_snapshot_check_impl(origin_head, 1943 drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); 1944 dsl_dataset_rele(origin_head, FTAG); 1945 if (error != 0) 1946 return (error); 1947 1948 error = dsl_destroy_head_check_impl(drc->drc_ds, 1); 1949 } else { 1950 error = dsl_dataset_snapshot_check_impl(drc->drc_ds, 1951 drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); 1952 } 1953 return (error); 1954} 1955 1956static void 1957dmu_recv_end_sync(void *arg, dmu_tx_t *tx) 1958{ 1959 dmu_recv_cookie_t *drc = arg; 1960 dsl_pool_t *dp = dmu_tx_pool(tx); 1961 1962 spa_history_log_internal_ds(drc->drc_ds, "finish receiving", 1963 tx, "snap=%s", drc->drc_tosnap); 1964 1965 if (!drc->drc_newfs) { 1966 dsl_dataset_t *origin_head; 1967 1968 VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, 1969 &origin_head)); 1970 1971 if (drc->drc_force) { 1972 /* 1973 * Destroy any snapshots of drc_tofs (origin_head) 1974 * after the origin (the snap before drc_ds). 1975 */ 1976 uint64_t obj = origin_head->ds_phys->ds_prev_snap_obj; 1977 while (obj != drc->drc_ds->ds_phys->ds_prev_snap_obj) { 1978 dsl_dataset_t *snap; 1979 VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, 1980 &snap)); 1981 ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir); 1982 obj = snap->ds_phys->ds_prev_snap_obj; 1983 dsl_destroy_snapshot_sync_impl(snap, 1984 B_FALSE, tx); 1985 dsl_dataset_rele(snap, FTAG); 1986 } 1987 } 1988 VERIFY3P(drc->drc_ds->ds_prev, ==, 1989 origin_head->ds_prev); 1990 1991 dsl_dataset_clone_swap_sync_impl(drc->drc_ds, 1992 origin_head, tx); 1993 dsl_dataset_snapshot_sync_impl(origin_head, 1994 drc->drc_tosnap, tx); 1995 1996 /* set snapshot's creation time and guid */ 1997 dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx); 1998 origin_head->ds_prev->ds_phys->ds_creation_time = 1999 drc->drc_drrb->drr_creation_time; 2000 origin_head->ds_prev->ds_phys->ds_guid = 2001 drc->drc_drrb->drr_toguid; 2002 origin_head->ds_prev->ds_phys->ds_flags &= 2003 ~DS_FLAG_INCONSISTENT; 2004 2005 dmu_buf_will_dirty(origin_head->ds_dbuf, tx); 2006 origin_head->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; 2007 2008 dsl_dataset_rele(origin_head, FTAG); 2009 dsl_destroy_head_sync_impl(drc->drc_ds, tx); 2010 2011 if (drc->drc_owner != NULL) 2012 VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner); 2013 } else { 2014 dsl_dataset_t *ds = drc->drc_ds; 2015 2016 dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx); 2017 2018 /* set snapshot's creation time and guid */ 2019 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); 2020 ds->ds_prev->ds_phys->ds_creation_time = 2021 drc->drc_drrb->drr_creation_time; 2022 ds->ds_prev->ds_phys->ds_guid = drc->drc_drrb->drr_toguid; 2023 ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; 2024 2025 dmu_buf_will_dirty(ds->ds_dbuf, tx); 2026 ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; 2027 } 2028 drc->drc_newsnapobj = drc->drc_ds->ds_phys->ds_prev_snap_obj; 2029 /* 2030 * Release the hold from dmu_recv_begin. This must be done before 2031 * we return to open context, so that when we free the dataset's dnode, 2032 * we can evict its bonus buffer. 2033 */ 2034 dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); 2035 drc->drc_ds = NULL; 2036} 2037 2038static int 2039add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj) 2040{ 2041 dsl_pool_t *dp; 2042 dsl_dataset_t *snapds; 2043 guid_map_entry_t *gmep; 2044 int err; 2045 2046 ASSERT(guid_map != NULL); 2047 2048 err = dsl_pool_hold(name, FTAG, &dp); 2049 if (err != 0) 2050 return (err); 2051 gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP); 2052 err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds); 2053 if (err == 0) { 2054 gmep->guid = snapds->ds_phys->ds_guid; 2055 gmep->gme_ds = snapds; 2056 avl_add(guid_map, gmep); 2057 dsl_dataset_long_hold(snapds, gmep); 2058 } else 2059 kmem_free(gmep, sizeof (*gmep)); 2060 2061 dsl_pool_rele(dp, FTAG); 2062 return (err); 2063} 2064 2065static int dmu_recv_end_modified_blocks = 3; 2066 2067static int 2068dmu_recv_existing_end(dmu_recv_cookie_t *drc) 2069{ 2070 int error; 2071 char name[MAXNAMELEN]; 2072 2073#ifdef _KERNEL 2074 /* 2075 * We will be destroying the ds; make sure its origin is unmounted if 2076 * necessary. 2077 */ 2078 dsl_dataset_name(drc->drc_ds, name); 2079 zfs_destroy_unmount_origin(name); 2080#endif 2081 2082 error = dsl_sync_task(drc->drc_tofs, 2083 dmu_recv_end_check, dmu_recv_end_sync, drc, 2084 dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL); 2085 2086 if (error != 0) 2087 dmu_recv_cleanup_ds(drc); 2088 return (error); 2089} 2090 2091static int 2092dmu_recv_new_end(dmu_recv_cookie_t *drc) 2093{ 2094 int error; 2095 2096 error = dsl_sync_task(drc->drc_tofs, 2097 dmu_recv_end_check, dmu_recv_end_sync, drc, 2098 dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL); 2099 2100 if (error != 0) { 2101 dmu_recv_cleanup_ds(drc); 2102 } else if (drc->drc_guid_to_ds_map != NULL) { 2103 (void) add_ds_to_guidmap(drc->drc_tofs, 2104 drc->drc_guid_to_ds_map, 2105 drc->drc_newsnapobj); 2106 } 2107 return (error); 2108} 2109 2110int 2111dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) 2112{ 2113 drc->drc_owner = owner; 2114 2115 if (drc->drc_newfs) 2116 return (dmu_recv_new_end(drc)); 2117 else 2118 return (dmu_recv_existing_end(drc)); 2119} 2120 2121/* 2122 * Return TRUE if this objset is currently being received into. 2123 */ 2124boolean_t 2125dmu_objset_is_receiving(objset_t *os) 2126{ 2127 return (os->os_dsl_dataset != NULL && 2128 os->os_dsl_dataset->ds_owner == dmu_recv_tag); 2129} 2130