dsl_scan.c revision 277584
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 */

#include <sys/dsl_scan.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/dnode.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zil_impl.h>
#include <sys/zio_checksum.h>
#include <sys/ddt.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
#ifdef _KERNEL
#include <sys/zfs_vfsops.h>
#endif

typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
    const zbookmark_phys_t *);

static scan_cb_t dsl_scan_scrub_cb;
static void dsl_scan_cancel_sync(void *, dmu_tx_t *);
static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);

unsigned int zfs_top_maxinflight = 32;	/* maximum I/Os per top-level */
unsigned int zfs_resilver_delay = 2;	/* number of ticks to delay resilver */
unsigned int zfs_scrub_delay = 4;	/* number of ticks to delay scrub */
unsigned int zfs_scan_idle = 50;	/* idle window in clock ticks */

unsigned int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver
						 per txg */
boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */

SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.top_maxinflight", &zfs_top_maxinflight);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RW,
    &zfs_top_maxinflight, 0, "Maximum I/Os per top-level vdev");
TUNABLE_INT("vfs.zfs.resilver_delay", &zfs_resilver_delay);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RW,
    &zfs_resilver_delay, 0, "Number of ticks to delay resilver");
TUNABLE_INT("vfs.zfs.scrub_delay", &zfs_scrub_delay);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RW,
    &zfs_scrub_delay, 0, "Number of ticks to delay scrub");
TUNABLE_INT("vfs.zfs.scan_idle", &zfs_scan_idle);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RW,
    &zfs_scan_idle, 0, "Idle scan window in clock ticks");
TUNABLE_INT("vfs.zfs.scan_min_time_ms", &zfs_scan_min_time_ms);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RW,
    &zfs_scan_min_time_ms, 0, "Min millisecs to scrub per txg");
TUNABLE_INT("vfs.zfs.free_min_time_ms", &zfs_free_min_time_ms);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RW,
    &zfs_free_min_time_ms, 0, "Min millisecs to free per txg");
TUNABLE_INT("vfs.zfs.resilver_min_time_ms", &zfs_resilver_min_time_ms);
SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RW,
    &zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg");
TUNABLE_INT("vfs.zfs.no_scrub_io", &zfs_no_scrub_io);
SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RW,
    &zfs_no_scrub_io, 0, "Disable scrub I/O");
TUNABLE_INT("vfs.zfs.no_scrub_prefetch", &zfs_no_scrub_prefetch);
SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RW,
    &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching");
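/*
 * Each knob above is declared twice on purpose: TUNABLE_INT() picks up
 * a loader.conf(5) value at module load time, while the paired SYSCTL
 * exposes the same variable for runtime inspection and adjustment
 * (e.g. "sysctl vfs.zfs.scrub_delay=4").
 */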

enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
/* max number of blocks to free in a single TXG */
uint64_t zfs_free_max_blocks = UINT64_MAX;
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN,
    &zfs_free_max_blocks, 0, "Maximum number of blocks to free in one TXG");

#define	DSL_SCAN_IS_SCRUB_RESILVER(scn) \
	((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
	(scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)

extern int zfs_txg_timeout;

/* the order has to match pool_scan_type */
static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
	NULL,
	dsl_scan_scrub_cb,	/* POOL_SCAN_SCRUB */
	dsl_scan_scrub_cb,	/* POOL_SCAN_RESILVER */
};
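/*
 * Scrub and resilver share dsl_scan_scrub_cb; the two modes differ
 * mainly in the txg window set up by dsl_scan_setup_sync() below (a
 * resilver restricts itself to the range reported by
 * vdev_resilver_needed()).
 */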

int
dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
{
	int err;
	dsl_scan_t *scn;
	spa_t *spa = dp->dp_spa;
	uint64_t f;

	scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
	scn->scn_dp = dp;

	/*
	 * It's possible that we're resuming a scan after a reboot so
	 * make sure that the scan_async_destroying flag is initialized
	 * appropriately.
	 */
	ASSERT(!scn->scn_async_destroying);
	scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
	    SPA_FEATURE_ASYNC_DESTROY);

	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    "scrub_func", sizeof (uint64_t), 1, &f);
	if (err == 0) {
		/*
		 * There was an old-style scrub in progress.  Restart a
		 * new-style scrub from the beginning.
		 */
		scn->scn_restart_txg = txg;
		zfs_dbgmsg("old-style scrub was in progress; "
		    "restarting new-style scrub in txg %llu",
		    scn->scn_restart_txg);

		/*
		 * Load the queue obj from the old location so that it
		 * can be freed by dsl_scan_done().
		 */
		(void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    "scrub_queue", sizeof (uint64_t), 1,
		    &scn->scn_phys.scn_queue_obj);
	} else {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
		    &scn->scn_phys);
		if (err == ENOENT)
			return (0);
		else if (err)
			return (err);

		if (scn->scn_phys.scn_state == DSS_SCANNING &&
		    spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
			/*
			 * A new-type scrub was in progress on an old
			 * pool, and the pool was accessed by old
			 * software.  Restart from the beginning, since
			 * the old software may have changed the pool in
			 * the meantime.
			 */
			scn->scn_restart_txg = txg;
			zfs_dbgmsg("new-style scrub was modified "
			    "by old software; restarting in txg %llu",
			    scn->scn_restart_txg);
		}
	}

	spa_scan_stat_init(spa);
	return (0);
}
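/*
 * dsl_scan_init() runs in the pool-open path, so a scan interrupted by
 * an export or reboot resumes from the scn_phys state persisted in the
 * MOS rather than starting over.
 */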

void
dsl_scan_fini(dsl_pool_t *dp)
{
	if (dp->dp_scan) {
		kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
		dp->dp_scan = NULL;
	}
}

/* ARGSUSED */
static int
dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
{
	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;

	if (scn->scn_phys.scn_state == DSS_SCANNING)
		return (SET_ERROR(EBUSY));

	return (0);
}

static void
dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
{
	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
	pool_scan_func_t *funcp = arg;
	dmu_object_type_t ot = 0;
	dsl_pool_t *dp = scn->scn_dp;
	spa_t *spa = dp->dp_spa;

	ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
	ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
	bzero(&scn->scn_phys, sizeof (scn->scn_phys));
	scn->scn_phys.scn_func = *funcp;
	scn->scn_phys.scn_state = DSS_SCANNING;
	scn->scn_phys.scn_min_txg = 0;
	scn->scn_phys.scn_max_txg = tx->tx_txg;
	scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
	scn->scn_phys.scn_start_time = gethrestime_sec();
	scn->scn_phys.scn_errors = 0;
	scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
	scn->scn_restart_txg = 0;
	scn->scn_done_txg = 0;
	spa_scan_stat_init(spa);

	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
		scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;

		/* rewrite all disk labels */
		vdev_config_dirty(spa->spa_root_vdev);

		if (vdev_resilver_needed(spa->spa_root_vdev,
		    &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
			spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
		} else {
			spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START);
		}

		spa->spa_scrub_started = B_TRUE;
		/*
		 * If this is an incremental scrub, limit the DDT scrub phase
		 * to just the auto-ditto class (for correctness); the rest
		 * of the scrub should go faster using top-down pruning.
		 */
		if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
			scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
	}

	/* back to the generic stuff */

	if (dp->dp_blkstats == NULL) {
		dp->dp_blkstats =
		    kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
	}
	bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));

	if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
		ot = DMU_OT_ZAP_OTHER;

	scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
	    ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);

	dsl_scan_sync_state(scn, tx);

	spa_history_log_internal(spa, "scan setup", tx,
	    "func=%u mintxg=%llu maxtxg=%llu",
	    *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
}
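/*
 * Illustrative sketch only: later in this file, dsl_scan() submits the
 * check/sync pair above as a single synctask, roughly
 *
 *	pool_scan_func_t func = POOL_SCAN_SCRUB;
 *	(void) dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
 *	    dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE);
 *
 * so both run in syncing context against a consistent view of scn_phys.
 */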

/* ARGSUSED */
static void
dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
{
	static const char *old_names[] = {
		"scrub_bookmark",
		"scrub_ddt_bookmark",
		"scrub_ddt_class_max",
		"scrub_queue",
		"scrub_min_txg",
		"scrub_max_txg",
		"scrub_func",
		"scrub_errors",
		NULL
	};

	dsl_pool_t *dp = scn->scn_dp;
	spa_t *spa = dp->dp_spa;
	int i;

	/* Remove any remnants of an old-style scrub. */
	for (i = 0; old_names[i]; i++) {
		(void) zap_remove(dp->dp_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
	}

	if (scn->scn_phys.scn_queue_obj != 0) {
		VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, tx));
		scn->scn_phys.scn_queue_obj = 0;
	}

	/*
	 * If we were "restarted" from a stopped state, don't bother
	 * with anything else.
	 */
	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return;

	if (complete)
		scn->scn_phys.scn_state = DSS_FINISHED;
	else
		scn->scn_phys.scn_state = DSS_CANCELED;

	spa_history_log_internal(spa, "scan done", tx,
	    "complete=%u", complete);

	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
		mutex_enter(&spa->spa_scrub_lock);
		while (spa->spa_scrub_inflight > 0) {
			cv_wait(&spa->spa_scrub_io_cv,
			    &spa->spa_scrub_lock);
		}
		mutex_exit(&spa->spa_scrub_lock);
		spa->spa_scrub_started = B_FALSE;
		spa->spa_scrub_active = B_FALSE;

		/*
		 * If the scrub/resilver completed, update all DTLs to
		 * reflect this.  Whether it succeeded or not, vacate
		 * all temporary scrub DTLs.
		 */
		vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
		    complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
		if (complete) {
			spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ?
			    ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
		}
		spa_errlog_rotate(spa);

		/*
		 * We may have finished replacing a device.
		 * Let the async thread assess this and handle the detach.
		 */
		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
	}

	scn->scn_phys.scn_end_time = gethrestime_sec();
}
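/*
 * dsl_scan_done() only updates the in-core scn_phys; callers are
 * expected to follow up with dsl_scan_sync_state() (as
 * dsl_scan_cancel_sync() below does) so the terminal state reaches
 * disk in the same txg.
 */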

/* ARGSUSED */
static int
dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
{
	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;

	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return (SET_ERROR(ENOENT));
	return (0);
}

/* ARGSUSED */
static void
dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
{
	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;

	dsl_scan_done(scn, B_FALSE, tx);
	dsl_scan_sync_state(scn, tx);
}

int
dsl_scan_cancel(dsl_pool_t *dp)
{
	return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
	    dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
}
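/*
 * Example caller (for orientation, not part of this file): "zpool
 * scrub -s <pool>" reaches this through spa_scan_stop(), roughly
 *
 *	error = dsl_scan_cancel(spa->spa_dsl_pool);
 *
 * returning ENOENT if no scan was in progress.
 */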

static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
    dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
    dmu_objset_type_t ostype, dmu_tx_t *tx);
static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds,
    dmu_objset_type_t ostype,
    dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);

void
dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
{
	zio_free(dp->dp_spa, txg, bp);
}

void
dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
{
	ASSERT(dsl_pool_sync_context(dp));
	zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp),
	    pio->io_flags));
}

static uint64_t
dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
{
	uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
	if (dsl_dataset_is_snapshot(ds))
		return (MIN(smt, ds->ds_phys->ds_creation_txg));
	return (smt);
}

static void
dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
{
	VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
	    &scn->scn_phys, tx));
}
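/*
 * The whole scan position (bookmark, DDT cursor, counters) lives in
 * scn_phys, stored as the single DMU_POOL_SCAN entry in the MOS
 * directory object, so one zap_update() per txg checkpoints everything
 * needed to resume.
 */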

extern int zfs_vdev_async_write_active_min_dirty_percent;

static boolean_t
dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_phys_t *zb)
{
	/* we never skip user/group accounting objects */
	if (zb && (int64_t)zb->zb_object < 0)
		return (B_FALSE);

	if (scn->scn_pausing)
		return (B_TRUE); /* we're already pausing */

	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
		return (B_FALSE); /* we're resuming */

	/* We only know how to resume from level-0 blocks. */
	if (zb && zb->zb_level != 0)
		return (B_FALSE);

	/*
	 * We pause if:
	 *  - we have scanned for the maximum time: an entire txg
	 *    timeout (default 5 sec)
	 *  or
	 *  - we have scanned for at least the minimum time (default 1 sec
	 *    for scrub, 3 sec for resilver), and either we have sufficient
	 *    dirty data that we are starting to write more quickly
	 *    (default 30%), or someone is explicitly waiting for this txg
	 *    to complete.
	 *  or
	 *  - the spa is shutting down because this pool is being exported
	 *    or the machine is rebooting.
	 */
	int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
	    zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
	uint64_t elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
	int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
	if (elapsed_nanosecs / NANOSEC >= zfs_txg_timeout ||
	    (NSEC2MSEC(elapsed_nanosecs) > mintime &&
	    (txg_sync_waiting(scn->scn_dp) ||
	    dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) ||
	    spa_shutting_down(scn->scn_dp->dp_spa)) {
		if (zb) {
			dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
			    (longlong_t)zb->zb_objset,
			    (longlong_t)zb->zb_object,
			    (longlong_t)zb->zb_level,
			    (longlong_t)zb->zb_blkid);
			scn->scn_phys.scn_bookmark = *zb;
		}
		dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
		scn->scn_pausing = B_TRUE;
		return (B_TRUE);
	}
	return (B_FALSE);
}
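/*
 * Worked example with illustrative numbers: if zfs_dirty_data_max is
 * 4GB and dp_dirty_total is 1.5GB, dirty_pct computes to 37.  Once the
 * per-txg minimum (1s scrub / 3s resilver) has elapsed, 37 >= 30 (the
 * default zfs_vdev_async_write_active_min_dirty_percent) is enough to
 * pause the scan and let the pending dirty data sync out.
 */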

typedef struct zil_scan_arg {
	dsl_pool_t	*zsa_dp;
	zil_header_t	*zsa_zh;
} zil_scan_arg_t;

/* ARGSUSED */
static int
dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
	zil_scan_arg_t *zsa = arg;
	dsl_pool_t *dp = zsa->zsa_dp;
	dsl_scan_t *scn = dp->dp_scan;
	zil_header_t *zh = zsa->zsa_zh;
	zbookmark_phys_t zb;

	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
		return (0);

	/*
	 * One block ("stubby") may have been allocated a long time ago;
	 * we want to visit it because it has been allocated (on-disk)
	 * even if it hasn't been claimed (even though for scrub there's
	 * nothing to do to it).
	 */
	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
		return (0);

	SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

	VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
	return (0);
}

/* ARGSUSED */
static int
dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
	if (lrc->lrc_txtype == TX_WRITE) {
		zil_scan_arg_t *zsa = arg;
		dsl_pool_t *dp = zsa->zsa_dp;
		dsl_scan_t *scn = dp->dp_scan;
		zil_header_t *zh = zsa->zsa_zh;
		lr_write_t *lr = (lr_write_t *)lrc;
		blkptr_t *bp = &lr->lr_blkptr;
		zbookmark_phys_t zb;

		if (BP_IS_HOLE(bp) ||
		    bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
			return (0);

		/*
		 * birth can be < claim_txg if this record's txg is
		 * already txg sync'ed (but this log block contains
		 * other records that are not synced)
		 */
		if (claim_txg == 0 || bp->blk_birth < claim_txg)
			return (0);

		SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
		    lr->lr_foid, ZB_ZIL_LEVEL,
		    lr->lr_offset / BP_GET_LSIZE(bp));

		VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
	}
	return (0);
}

static void
dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
{
	uint64_t claim_txg = zh->zh_claim_txg;
	zil_scan_arg_t zsa = { dp, zh };
	zilog_t *zilog;

	/*
	 * We only want to visit blocks that have been claimed but not yet
	 * replayed (or, in read-only mode, blocks that *would* be claimed).
	 */
	if (claim_txg == 0 && spa_writeable(dp->dp_spa))
		return;

	zilog = zil_alloc(dp->dp_meta_objset, zh);

	(void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
	    claim_txg);

	zil_free(zilog);
}
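/*
 * zil_parse() drives the two callbacks above over the intent-log
 * chain: dsl_scan_zil_block() for each log block and
 * dsl_scan_zil_record() for each record, so TX_WRITE payload blocks
 * referenced only by the log still get scrubbed.
 */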

/* ARGSUSED */
static void
dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
    uint64_t objset, uint64_t object, uint64_t blkid)
{
	zbookmark_phys_t czb;
	uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;

	if (zfs_no_scrub_prefetch)
		return;

	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
		return;

	SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);

	(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
	    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
}

static boolean_t
dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
    const zbookmark_phys_t *zb)
{
	/*
	 * We never skip over user/group accounting objects (obj<0)
	 */
	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
	    (int64_t)zb->zb_object >= 0) {
		/*
		 * If we already visited this bp & everything below (in
		 * a prior txg sync), don't bother doing it again.
		 */
		if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
			return (B_TRUE);

		/*
		 * If we found the block we're trying to resume from, or
		 * we went past it to a different object, zero it out to
		 * indicate that it's OK to start checking for pausing
		 * again.
		 */
		if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
		    zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
			dprintf("resuming at %llx/%llx/%llx/%llx\n",
			    (longlong_t)zb->zb_objset,
			    (longlong_t)zb->zb_object,
			    (longlong_t)zb->zb_level,
			    (longlong_t)zb->zb_blkid);
			bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
		}
	}
	return (B_FALSE);
}

/*
 * Recursively visit the indirect/dnode/objset structure below bp.
 * Return nonzero on i/o error.
 */
static int
dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
    dnode_phys_t *dnp, const blkptr_t *bp,
    const zbookmark_phys_t *zb, dmu_tx_t *tx)
{
	dsl_pool_t *dp = scn->scn_dp;
	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
	int err;

	if (BP_GET_LEVEL(bp) > 0) {
		uint32_t flags = ARC_WAIT;
		int i;
		blkptr_t *cbp;
		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
		arc_buf_t *buf;

		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err) {
			scn->scn_phys.scn_errors++;
			return (err);
		}
		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
			dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset,
			    zb->zb_object, zb->zb_blkid * epb + i);
		}
		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
			zbookmark_phys_t czb;

			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			dsl_scan_visitbp(cbp, &czb, dnp,
			    ds, scn, ostype, tx);
		}
		(void) arc_buf_remove_ref(buf, &buf);
	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
		uint32_t flags = ARC_WAIT;
		dnode_phys_t *cdnp;
		int i, j;
		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
		arc_buf_t *buf;

		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err) {
			scn->scn_phys.scn_errors++;
			return (err);
		}
		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
			for (j = 0; j < cdnp->dn_nblkptr; j++) {
				blkptr_t *cbp = &cdnp->dn_blkptr[j];
				dsl_scan_prefetch(scn, buf, cbp,
				    zb->zb_objset, zb->zb_blkid * epb + i, j);
			}
		}
		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
			dsl_scan_visitdnode(scn, ds, ostype,
			    cdnp, zb->zb_blkid * epb + i, tx);
		}

		(void) arc_buf_remove_ref(buf, &buf);
	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
		uint32_t flags = ARC_WAIT;
		objset_phys_t *osp;
		arc_buf_t *buf;

		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err) {
			scn->scn_phys.scn_errors++;
			return (err);
		}

		osp = buf->b_data;

		dsl_scan_visitdnode(scn, ds, osp->os_type,
		    &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx);

		if (OBJSET_BUF_HAS_USERUSED(buf)) {
			/*
			 * We also always visit user/group accounting
			 * objects, and never skip them, even if we are
			 * pausing.  This is necessary so that the space
			 * deltas from this txg get integrated.
			 */
			dsl_scan_visitdnode(scn, ds, osp->os_type,
			    &osp->os_groupused_dnode,
			    DMU_GROUPUSED_OBJECT, tx);
			dsl_scan_visitdnode(scn, ds, osp->os_type,
			    &osp->os_userused_dnode,
			    DMU_USERUSED_OBJECT, tx);
		}
		(void) arc_buf_remove_ref(buf, &buf);
	}

	return (0);
}
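/*
 * Note the two-pass pattern above: each indirect or dnode buffer is
 * first swept with dsl_scan_prefetch() (asynchronous, ARC_NOWAIT) and
 * then visited with blocking reads (ARC_WAIT), so the prefetches
 * overlap the synchronous descent.
 */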

static void
dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
    dmu_objset_type_t ostype, dnode_phys_t *dnp,
    uint64_t object, dmu_tx_t *tx)
{
	int j;

	for (j = 0; j < dnp->dn_nblkptr; j++) {
		zbookmark_phys_t czb;

		SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
		    dnp->dn_nlevels - 1, j);
		dsl_scan_visitbp(&dnp->dn_blkptr[j],
		    &czb, dnp, ds, scn, ostype, tx);
	}

	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
		zbookmark_phys_t czb;
		SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
		    0, DMU_SPILL_BLKID);
		dsl_scan_visitbp(&dnp->dn_spill,
		    &czb, dnp, ds, scn, ostype, tx);
	}
}

/*
 * The arguments are in this order because mdb can only print the
 * first 5; we want them to be useful.
 */
static void
dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
    dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
    dmu_objset_type_t ostype, dmu_tx_t *tx)
{
	dsl_pool_t *dp = scn->scn_dp;
	arc_buf_t *buf = NULL;
	blkptr_t bp_toread = *bp;

	if (dsl_scan_check_pause(scn, zb))
		return;

	if (dsl_scan_check_resume(scn, dnp, zb))
		return;

	if (BP_IS_HOLE(bp))
		return;

	scn->scn_visited_this_txg++;

	dprintf_bp(bp,
	    "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p",
	    ds, ds ? ds->ds_object : 0,
	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
	    bp);

	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
		return;

	if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx) != 0)
		return;

	/*
	 * If dsl_scan_ddt() has already visited this block, it will have
	 * already done any translations or scrubbing, so don't call the
	 * callback again.
	 */
	if (ddt_class_contains(dp->dp_spa,
	    scn->scn_phys.scn_ddt_class_max, bp)) {
		ASSERT(buf == NULL);
		return;
	}

	/*
	 * If this block is from the future (after cur_max_txg), then we
	 * are doing this on behalf of a deleted snapshot, and we will
	 * revisit the future block on the next pass of this dataset.
	 * Don't scan it now unless we need to because something
	 * under it was modified.
	 */
	if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) {
		scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
	}
}
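/*
 * Net effect of the checks above: only blocks whose birth txg falls in
 * (scn_cur_min_txg, scn_cur_max_txg] reach the scan callback; older
 * blocks were covered by a previous pass and newer ones belong to a
 * later pass over this dataset.
 */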

static void
dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
    dmu_tx_t *tx)
{
	zbookmark_phys_t zb;

	SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	dsl_scan_visitbp(bp, &zb, NULL,
	    ds, scn, DMU_OST_NONE, tx);

	dprintf_ds(ds, "finished scan%s", "");
}

void
dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	dsl_scan_t *scn = dp->dp_scan;
	uint64_t mintxg;

	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return;

	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
		if (dsl_dataset_is_snapshot(ds)) {
			/* Note, scn_cur_{min,max}_txg stays the same. */
			scn->scn_phys.scn_bookmark.zb_objset =
			    ds->ds_phys->ds_next_snap_obj;
			zfs_dbgmsg("destroying ds %llu; currently traversing; "
			    "reset zb_objset to %llu",
			    (u_longlong_t)ds->ds_object,
			    (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
			scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
		} else {
			SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
			    ZB_DESTROYED_OBJSET, 0, 0, 0);
			zfs_dbgmsg("destroying ds %llu; currently traversing; "
			    "reset bookmark to -1,0,0,0",
			    (u_longlong_t)ds->ds_object);
		}
	} else if (zap_lookup_int_key(dp->dp_meta_objset,
	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
		ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
		if (dsl_dataset_is_snapshot(ds)) {
			/*
			 * We keep the same mintxg; it could be >
			 * ds_creation_txg if the previous snapshot was
			 * deleted too.
			 */
			VERIFY(zap_add_int_key(dp->dp_meta_objset,
			    scn->scn_phys.scn_queue_obj,
			    ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0);
			zfs_dbgmsg("destroying ds %llu; in queue; "
			    "replacing with %llu",
			    (u_longlong_t)ds->ds_object,
			    (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
		} else {
			zfs_dbgmsg("destroying ds %llu; in queue; removing",
			    (u_longlong_t)ds->ds_object);
		}
	} else {
		zfs_dbgmsg("destroying ds %llu; ignoring",
		    (u_longlong_t)ds->ds_object);
	}

	/*
	 * dsl_scan_sync() should be called after this, and should sync
	 * out our changed state, but just to be safe, do it here.
	 */
	dsl_scan_sync_state(scn, tx);
}

void
dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	dsl_scan_t *scn = dp->dp_scan;
	uint64_t mintxg;

	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return;

	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);

	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
		scn->scn_phys.scn_bookmark.zb_objset =
		    ds->ds_phys->ds_prev_snap_obj;
		zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
		    "reset zb_objset to %llu",
		    (u_longlong_t)ds->ds_object,
		    (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
	} else if (zap_lookup_int_key(dp->dp_meta_objset,
	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
		VERIFY(zap_add_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj,
		    ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0);
		zfs_dbgmsg("snapshotting ds %llu; in queue; "
		    "replacing with %llu",
		    (u_longlong_t)ds->ds_object,
		    (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
	}
	dsl_scan_sync_state(scn, tx);
}

void
dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds1->ds_dir->dd_pool;
	dsl_scan_t *scn = dp->dp_scan;
	uint64_t mintxg;

	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return;

	if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
		scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
		zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
		    "reset zb_objset to %llu",
		    (u_longlong_t)ds1->ds_object,
		    (u_longlong_t)ds2->ds_object);
	} else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
		scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
		zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
		    "reset zb_objset to %llu",
		    (u_longlong_t)ds2->ds_object,
		    (u_longlong_t)ds1->ds_object);
	}

	if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
	    ds1->ds_object, &mintxg) == 0) {
		int err;

		ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
		ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
		err = zap_add_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
		VERIFY(err == 0 || err == EEXIST);
		if (err == EEXIST) {
			/* Both were there to begin with */
			VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
			    scn->scn_phys.scn_queue_obj,
			    ds1->ds_object, mintxg, tx));
		}
		zfs_dbgmsg("clone_swap ds %llu; in queue; "
		    "replacing with %llu",
		    (u_longlong_t)ds1->ds_object,
		    (u_longlong_t)ds2->ds_object);
	} else if (zap_lookup_int_key(dp->dp_meta_objset,
	    scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
		ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
		ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
		VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
		zfs_dbgmsg("clone_swap ds %llu; in queue; "
		    "replacing with %llu",
		    (u_longlong_t)ds2->ds_object,
		    (u_longlong_t)ds1->ds_object);
	}

	dsl_scan_sync_state(scn, tx);
}

struct enqueue_clones_arg {
	dmu_tx_t *tx;
	uint64_t originobj;
};

/* ARGSUSED */
static int
enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
{
	struct enqueue_clones_arg *eca = arg;
	dsl_dataset_t *ds;
	int err;
	dsl_scan_t *scn = dp->dp_scan;

	if (hds->ds_dir->dd_phys->dd_origin_obj != eca->originobj)
		return (0);

	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
	if (err)
		return (err);

	while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
		dsl_dataset_t *prev;
		err = dsl_dataset_hold_obj(dp,
		    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);

		dsl_dataset_rele(ds, FTAG);
		if (err)
			return (err);
		ds = prev;
	}
	VERIFY(zap_add_int_key(dp->dp_meta_objset,
	    scn->scn_phys.scn_queue_obj, ds->ds_object,
	    ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0);
	dsl_dataset_rele(ds, FTAG);
	return (0);
}

static void
dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
{
	dsl_pool_t *dp = scn->scn_dp;
	dsl_dataset_t *ds;
	objset_t *os;

	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));

	if (dmu_objset_from_ds(ds, &os))
		goto out;

	/*
	 * Only the ZIL in the head (non-snapshot) is valid.  Even though
	 * snapshots can have ZIL block pointers (which may be the same
	 * BP as in the head), they must be ignored.  So we traverse the
	 * ZIL here, rather than in scan_recurse(), because the regular
	 * snapshot block-sharing rules don't apply to it.
	 */
	if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds))
		dsl_scan_zil(dp, &os->os_zil_header);

	/*
	 * Iterate over the bps in this ds.
	 */
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx);

	char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP);
	dsl_dataset_name(ds, dsname);
	zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
	    "pausing=%u",
	    (longlong_t)dsobj, dsname,
	    (longlong_t)scn->scn_phys.scn_cur_min_txg,
	    (longlong_t)scn->scn_phys.scn_cur_max_txg,
	    (int)scn->scn_pausing);
	kmem_free(dsname, ZFS_MAXNAMELEN);

	if (scn->scn_pausing)
		goto out;

	/*
	 * We've finished this pass over this dataset.
	 */

	/*
	 * If we did not completely visit this dataset, do another pass.
	 */
	if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
		zfs_dbgmsg("incomplete pass; visiting again");
		scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
		VERIFY(zap_add_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds->ds_object,
		    scn->scn_phys.scn_cur_max_txg, tx) == 0);
		goto out;
	}

	/*
	 * Add descendent datasets to work queue.
	 */
	if (ds->ds_phys->ds_next_snap_obj != 0) {
		VERIFY(zap_add_int_key(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj,
		    ds->ds_phys->ds_creation_txg, tx) == 0);
	}
	if (ds->ds_phys->ds_num_children > 1) {
		boolean_t usenext = B_FALSE;
		if (ds->ds_phys->ds_next_clones_obj != 0) {
			uint64_t count;
			/*
			 * A bug in a previous version of the code could
			 * cause upgrade_clones_cb() to not set
			 * ds_next_snap_obj when it should, leading to a
			 * missing entry.  Therefore we can only use the
			 * next_clones_obj when its count is correct.
			 */
			int err = zap_count(dp->dp_meta_objset,
			    ds->ds_phys->ds_next_clones_obj, &count);
			if (err == 0 &&
			    count == ds->ds_phys->ds_num_children - 1)
				usenext = B_TRUE;
		}

		if (usenext) {
			VERIFY0(zap_join_key(dp->dp_meta_objset,
			    ds->ds_phys->ds_next_clones_obj,
			    scn->scn_phys.scn_queue_obj,
			    ds->ds_phys->ds_creation_txg, tx));
		} else {
			struct enqueue_clones_arg eca;
			eca.tx = tx;
			eca.originobj = ds->ds_object;

			VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
			    enqueue_clones_cb, &eca, DS_FIND_CHILDREN));
		}
	}

out:
	dsl_dataset_rele(ds, FTAG);
}
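/*
 * Work-queue shape: scn_phys.scn_queue_obj is a ZAP mapping dataset
 * object number -> the mintxg with which it should be visited.
 * Finishing a dataset enqueues its next snapshot and (directly or via
 * enqueue_clones_cb()) its clones, so each branch of the snapshot tree
 * is walked oldest-to-newest.
 */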

/* ARGSUSED */
static int
enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds;
	int err;
	dsl_scan_t *scn = dp->dp_scan;

	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
	if (err)
		return (err);

	while (ds->ds_phys->ds_prev_snap_obj != 0) {
		dsl_dataset_t *prev;
		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
		    FTAG, &prev);
		if (err) {
			dsl_dataset_rele(ds, FTAG);
			return (err);
		}

		/*
		 * If this is a clone, we don't need to worry about it for now.
		 */
		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
			dsl_dataset_rele(ds, FTAG);
			dsl_dataset_rele(prev, FTAG);
			return (0);
		}
		dsl_dataset_rele(ds, FTAG);
		ds = prev;
	}

	VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
	    ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0);
	dsl_dataset_rele(ds, FTAG);
	return (0);
}
117319878Swollman
11742742Swollman/*
1175138323Swollman * Scrub/dedup interaction.
1176138323Swollman *
1177138323Swollman * If there are N references to a deduped block, we don't want to scrub it
1178138323Swollman * N times -- ideally, we should scrub it exactly once.
1179138323Swollman *
11802742Swollman * We leverage the fact that the dde's replication class (enum ddt_class)
11812742Swollman * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
1182158421Swollman * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
11832742Swollman *
118419878Swollman * To prevent excess scrubbing, the scrub begins by walking the DDT
118519878Swollman * to find all blocks with refcnt > 1, and scrubs each of these once.
11862742Swollman * Since there are two replication classes which contain blocks with
11872742Swollman * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
11882742Swollman * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
1189158421Swollman *
119019878Swollman * There would be nothing more to say if a block's refcnt couldn't change
11912742Swollman * during a scrub, but of course it can so we must account for changes
1192158421Swollman * in a block's replication class.
119319878Swollman *
1194158421Swollman * Here's an example of what can occur:
11952742Swollman *
119619878Swollman * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
11972742Swollman * when visited during the top-down scrub phase, it will be scrubbed twice.
1198158421Swollman * This negates our scrub optimization, but is otherwise harmless.
119919878Swollman *
12002742Swollman * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
120119878Swollman * on each visit during the top-down scrub phase, it will never be scrubbed.
12022742Swollman * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
120319878Swollman * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to
12042742Swollman * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
120519878Swollman * while a scrub is in progress, it scrubs the block right then.
12062742Swollman */
static void
dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
{
	ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
	ddt_entry_t dde = { 0 };
	int error;
	uint64_t n = 0;

	while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
		ddt_t *ddt;

		if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
			break;
		dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
		    (longlong_t)ddb->ddb_class,
		    (longlong_t)ddb->ddb_type,
		    (longlong_t)ddb->ddb_checksum,
		    (longlong_t)ddb->ddb_cursor);

		/* There should be no pending changes to the dedup table */
		ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
		ASSERT(avl_first(&ddt->ddt_tree) == NULL);

		dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
		n++;

		if (dsl_scan_check_pause(scn, NULL))
			break;
	}

	zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u",
	    (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max,
	    (int)scn->scn_pausing);

	ASSERT(error == 0 || error == ENOENT);
	ASSERT(error != ENOENT ||
	    ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
}
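
/*
 * Illustrative sketch (not part of the original file): the walk above stops
 * as soon as the bookmark's class passes scn_ddt_class_max.  Because
 * enum ddt_class is ordered from DDT_CLASS_DITTO down to DDT_CLASS_UNIQUE,
 * a class_max of DDT_CLASS_DUPLICATE confines this phase to blocks with
 * refcnt > 1.  A hypothetical helper expressing the phase-done test:
 */
static boolean_t
dsl_scan_ddt_phase_done(const dsl_scan_t *scn)
{
	/* TRUE once the bookmark has walked past the last dedup class */
	return (scn->scn_phys.scn_ddt_bookmark.ddb_class >
	    scn->scn_phys.scn_ddt_class_max);
}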

/* ARGSUSED */
void
dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
    ddt_entry_t *dde, dmu_tx_t *tx)
{
	const ddt_key_t *ddk = &dde->dde_key;
	ddt_phys_t *ddp = dde->dde_phys;
	blkptr_t bp;
	zbookmark_phys_t zb = { 0 };

	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return;

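	/*
	 * Each DDT entry carries up to DDT_PHYS_TYPES physical variants of
	 * the same logical block.  Rebuild a blkptr_t for each variant born
	 * inside the scan's txg window and hand it to the scan function.
	 */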
	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		if (ddp->ddp_phys_birth == 0 ||
		    ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
			continue;
		ddt_bp_create(checksum, ddk, ddp, &bp);

		scn->scn_visited_this_txg++;
		scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
	}
}

static void
dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
{
	dsl_pool_t *dp = scn->scn_dp;
	zap_cursor_t zc;
	zap_attribute_t za;

	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
	    scn->scn_phys.scn_ddt_class_max) {
		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
		dsl_scan_ddt(scn, tx);
		if (scn->scn_pausing)
			return;
	}

	if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
		/* First do the MOS & ORIGIN */

		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
		dsl_scan_visit_rootbp(scn, NULL,
		    &dp->dp_meta_rootbp, tx);
		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
		if (scn->scn_pausing)
			return;

		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
			VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
			    enqueue_cb, tx, DS_FIND_CHILDREN));
		} else {
			dsl_scan_visitds(scn,
			    dp->dp_origin_snap->ds_object, tx);
		}
		ASSERT(!scn->scn_pausing);
	} else if (scn->scn_phys.scn_bookmark.zb_objset !=
	    ZB_DESTROYED_OBJSET) {
		/*
		 * If we were paused, continue from here.  Note that if the
		 * ds we were paused on was deleted, the zb_objset may
		 * be -1, so we will skip this and find a new objset
		 * below.
		 */
		dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
		if (scn->scn_pausing)
			return;
	}

	/*
	 * In case we were paused right at the end of the ds, zero the
	 * bookmark so we don't think that we're still trying to resume.
	 */
	bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t));

	/*
	 * Keep pulling things out of the ZAP-object-as-queue; the enqueue
	 * side is sketched after this function.
	 */
	while (zap_cursor_init(&zc, dp->dp_meta_objset,
	    scn->scn_phys.scn_queue_obj),
	    zap_cursor_retrieve(&zc, &za) == 0) {
		dsl_dataset_t *ds;
		uint64_t dsobj;

		dsobj = strtonum(za.za_name, NULL);
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj, dsobj, tx));

		/* Set up min/max txg */
		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
		if (za.za_first_integer != 0) {
			scn->scn_phys.scn_cur_min_txg =
			    MAX(scn->scn_phys.scn_min_txg,
			    za.za_first_integer);
		} else {
			scn->scn_phys.scn_cur_min_txg =
			    MAX(scn->scn_phys.scn_min_txg,
			    ds->ds_phys->ds_prev_snap_txg);
		}
		scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
		dsl_dataset_rele(ds, FTAG);

		dsl_scan_visitds(scn, dsobj, tx);
		zap_cursor_fini(&zc);
		if (scn->scn_pausing)
			return;
	}
	zap_cursor_fini(&zc);
}
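
/*
 * Illustrative sketch (not part of the original file): the queue drained
 * above is an ordinary ZAP object whose entry names are dataset object
 * numbers and whose values carry the txg to resume from.  Assuming an
 * enqueue side that pairs with the zap_remove_int() above, it would look
 * roughly like this (the helper name is hypothetical):
 */
static void
dsl_scan_queue_ds(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg, dmu_tx_t *tx)
{
	/* name = dsobj rendered as an integer key, first integer = txg */
	VERIFY0(zap_add_int_key(scn->scn_dp->dp_meta_objset,
	    scn->scn_phys.scn_queue_obj, dsobj, txg, tx));
}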

static boolean_t
dsl_scan_free_should_pause(dsl_scan_t *scn)
{
	uint64_t elapsed_nanosecs;

	if (zfs_recover)
		return (B_FALSE);

	if (scn->scn_visited_this_txg >= zfs_free_max_blocks)
		return (B_TRUE);

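	/*
	 * Otherwise pause once this sync has run long enough: the full
	 * txg timeout has elapsed, or we have already spent
	 * zfs_free_min_time_ms while the txg sync thread is waiting on
	 * us, or the pool is shutting down.
	 */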
	elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
	return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
	    (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms &&
	    txg_sync_waiting(scn->scn_dp)) ||
	    spa_shutting_down(scn->scn_dp->dp_spa));
}

static int
dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	dsl_scan_t *scn = arg;

	if (!scn->scn_is_bptree ||
	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
		if (dsl_scan_free_should_pause(scn))
			return (SET_ERROR(ERESTART));
	}

	zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
	    dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0));
	dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
	    -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
	    -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
	scn->scn_visited_this_txg++;
	return (0);
}

boolean_t
dsl_scan_active(dsl_scan_t *scn)
{
	spa_t *spa = scn->scn_dp->dp_spa;
	uint64_t used = 0, comp, uncomp;

	if (spa->spa_load_state != SPA_LOAD_NONE)
		return (B_FALSE);
	if (spa_shutting_down(spa))
		return (B_FALSE);
	if (scn->scn_phys.scn_state == DSS_SCANNING ||
	    (scn->scn_async_destroying && !scn->scn_async_stalled))
		return (B_TRUE);

	if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
		(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
		    &used, &comp, &uncomp);
	}
	return (used != 0);
}

void
dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
{
	dsl_scan_t *scn = dp->dp_scan;
	spa_t *spa = dp->dp_spa;
	int err = 0;

	/*
	 * Check for scn_restart_txg before checking spa_load_state, so
	 * that we can restart an old-style scan while the pool is being
	 * imported (see dsl_scan_init).
	 */
	if (scn->scn_restart_txg != 0 &&
	    scn->scn_restart_txg <= tx->tx_txg) {
		pool_scan_func_t func = POOL_SCAN_SCRUB;
		dsl_scan_done(scn, B_FALSE, tx);
		if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
			func = POOL_SCAN_RESILVER;
		zfs_dbgmsg("restarting scan func=%u txg=%llu",
		    func, tx->tx_txg);
		dsl_scan_setup_sync(&func, tx);
	}

	/*
	 * If the scan is inactive due to a stalled async destroy, try again.
	 */
	if ((!scn->scn_async_stalled && !dsl_scan_active(scn)) ||
	    spa_sync_pass(dp->dp_spa) > 1)
		return;

	scn->scn_visited_this_txg = 0;
	scn->scn_pausing = B_FALSE;
	scn->scn_sync_start_time = gethrtime();
	spa->spa_scrub_active = B_TRUE;

	/*
	 * First process the async destroys.  If we pause, don't do
	 * any scrubbing or resilvering.  This ensures that there are no
	 * async destroys while we are scanning, so the scan code doesn't
	 * have to worry about traversing them.  It is also faster to free
	 * the blocks than to scrub them.  (The zio fan-out pattern used
	 * below is sketched after this function.)
	 */
	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
		scn->scn_is_bptree = B_FALSE;
		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
		    NULL, ZIO_FLAG_MUSTSUCCEED);
		err = bpobj_iterate(&dp->dp_free_bpobj,
		    dsl_scan_free_block_cb, scn, tx);
		VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));

		if (err != 0 && err != ERESTART)
			zfs_panic_recover("error %u from bpobj_iterate()", err);
	}

	if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
		ASSERT(scn->scn_async_destroying);
		scn->scn_is_bptree = B_TRUE;
		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
		    NULL, ZIO_FLAG_MUSTSUCCEED);
		err = bptree_iterate(dp->dp_meta_objset,
		    dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
		VERIFY0(zio_wait(scn->scn_zio_root));

		if (err == EIO || err == ECKSUM) {
			err = 0;
		} else if (err != 0 && err != ERESTART) {
			zfs_panic_recover("error %u from "
			    "traverse_dataset_destroyed()", err);
		}

		if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
			/* finished; deactivate async destroy feature */
			spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
			ASSERT(!spa_feature_is_active(spa,
			    SPA_FEATURE_ASYNC_DESTROY));
			VERIFY0(zap_remove(dp->dp_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT,
			    DMU_POOL_BPTREE_OBJ, tx));
			VERIFY0(bptree_free(dp->dp_meta_objset,
			    dp->dp_bptree_obj, tx));
			dp->dp_bptree_obj = 0;
			scn->scn_async_destroying = B_FALSE;
			scn->scn_async_stalled = B_FALSE;
		} else {
			/*
			 * If we didn't make progress, mark the async
			 * destroy as stalled, so that we will not initiate
			 * a spa_sync() on its behalf.  Note that we only
			 * check this if we are not finished, because if the
			 * bptree had no blocks for us to visit, we can
			 * finish without "making progress".
			 */
			scn->scn_async_stalled =
			    (scn->scn_visited_this_txg == 0);
		}
	}
	if (scn->scn_visited_this_txg) {
		zfs_dbgmsg("freed %llu blocks in %llums from "
		    "free_bpobj/bptree txg %llu; err=%d",
		    (longlong_t)scn->scn_visited_this_txg,
		    (longlong_t)
		    NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
		    (longlong_t)tx->tx_txg, err);
		scn->scn_visited_this_txg = 0;

		/*
		 * Write out changes to the DDT that may be required as a
		 * result of the blocks freed.  This ensures that the DDT
		 * is clean when a scrub/resilver runs.
		 */
		ddt_sync(spa, tx->tx_txg);
	}
	if (err != 0)
		return;
	if (!scn->scn_async_destroying && zfs_free_leak_on_eio &&
	    (dp->dp_free_dir->dd_phys->dd_used_bytes != 0 ||
	    dp->dp_free_dir->dd_phys->dd_compressed_bytes != 0 ||
	    dp->dp_free_dir->dd_phys->dd_uncompressed_bytes != 0)) {
		/*
		 * We have finished background destroying, but there is still
		 * some space left in the dp_free_dir.  Transfer this leaked
		 * space to the dp_leak_dir.
		 */
		if (dp->dp_leak_dir == NULL) {
			rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
			(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
			    LEAK_DIR_NAME, tx);
			VERIFY0(dsl_pool_open_special_dir(dp,
			    LEAK_DIR_NAME, &dp->dp_leak_dir));
			rrw_exit(&dp->dp_config_rwlock, FTAG);
		}
		dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
		    dp->dp_free_dir->dd_phys->dd_used_bytes,
		    dp->dp_free_dir->dd_phys->dd_compressed_bytes,
		    dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx);
		dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
		    -dp->dp_free_dir->dd_phys->dd_used_bytes,
		    -dp->dp_free_dir->dd_phys->dd_compressed_bytes,
		    -dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx);
	}
	if (!scn->scn_async_destroying) {
		/* finished; verify that space accounting went to zero */
		ASSERT0(dp->dp_free_dir->dd_phys->dd_used_bytes);
		ASSERT0(dp->dp_free_dir->dd_phys->dd_compressed_bytes);
		ASSERT0(dp->dp_free_dir->dd_phys->dd_uncompressed_bytes);
	}

	if (scn->scn_phys.scn_state != DSS_SCANNING)
		return;

	if (scn->scn_done_txg == tx->tx_txg) {
		ASSERT(!scn->scn_pausing);
		/* finished with scan. */
		zfs_dbgmsg("txg %llu scan complete", tx->tx_txg);
		dsl_scan_done(scn, B_TRUE, tx);
		ASSERT3U(spa->spa_scrub_inflight, ==, 0);
		dsl_scan_sync_state(scn, tx);
		return;
	}

	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
	    scn->scn_phys.scn_ddt_class_max) {
		zfs_dbgmsg("doing scan sync txg %llu; "
		    "ddt bm=%llu/%llu/%llu/%llx",
		    (longlong_t)tx->tx_txg,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
		ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
		ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
		ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
		ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
	} else {
		zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
		    (longlong_t)tx->tx_txg,
		    (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
		    (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
		    (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
		    (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
	}

	scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
	    NULL, ZIO_FLAG_CANFAIL);
	dsl_pool_config_enter(dp, FTAG);
	dsl_scan_visit(scn, tx);
	dsl_pool_config_exit(dp, FTAG);
	(void) zio_wait(scn->scn_zio_root);
	scn->scn_zio_root = NULL;

	zfs_dbgmsg("visited %llu blocks in %llums",
	    (longlong_t)scn->scn_visited_this_txg,
	    (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));

	if (!scn->scn_pausing) {
		scn->scn_done_txg = tx->tx_txg + 1;
		zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu",
		    tx->tx_txg, scn->scn_done_txg);
	}

	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
		mutex_enter(&spa->spa_scrub_lock);
		while (spa->spa_scrub_inflight > 0) {
			cv_wait(&spa->spa_scrub_io_cv,
			    &spa->spa_scrub_lock);
		}
		mutex_exit(&spa->spa_scrub_lock);
	}

	dsl_scan_sync_state(scn, tx);
}
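
/*
 * Illustrative sketch (not part of the original file): dsl_scan_sync()
 * leans on the same zio fan-out pattern three times -- create a root zio,
 * issue children against it with zio_nowait(), then zio_wait() on the root
 * to reap them all.  Reduced to its essentials (the function name is
 * hypothetical):
 */
static int
example_zio_fanout(spa_t *spa)
{
	zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

	/* ... issue child I/Os with zio_nowait(zio_read(rio, spa, ...)) ... */

	return (zio_wait(rio));	/* waits for every child issued above */
}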

/*
 * This will start a new scan, or restart an existing one.
 */
void
dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
{
	if (txg == 0) {
		dmu_tx_t *tx;
		tx = dmu_tx_create_dd(dp->dp_mos_dir);
		VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));

		txg = dmu_tx_get_txg(tx);
		dp->dp_scan->scn_restart_txg = txg;
		dmu_tx_commit(tx);
	} else {
		dp->dp_scan->scn_restart_txg = txg;
	}
	zfs_dbgmsg("restarting resilver txg=%llu", txg);
}
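
/*
 * Illustrative usage (not part of the original file): callers that just
 * want a resilver scheduled in some upcoming txg -- e.g. the async
 * resilver task -- pass txg == 0 and let this function open its own
 * transaction:
 *
 *	dsl_resilver_restart(spa_get_dsl(spa), 0);
 */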

boolean_t
dsl_scan_resilvering(dsl_pool_t *dp)
{
	return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
	    dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
}

/*
 * scrub consumers
 */

static void
count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
{
	int i;

	/*
	 * If we resume after a reboot, zab will be NULL; don't record
	 * incomplete stats in that case.
	 */
	if (zab == NULL)
		return;

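	/*
	 * Each block is tallied four ways so the stats can be sliced by
	 * indirection level, by object type, or both:
	 *	i == 0: (this level, DMU_OT_TOTAL)
	 *	i == 1: (this level, this type)
	 *	i == 2: (DN_MAX_LEVELS, DMU_OT_TOTAL)	-- grand total
	 *	i == 3: (DN_MAX_LEVELS, this type)	-- per-type total
	 */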
	for (i = 0; i < 4; i++) {
		int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
		int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
		if (t & DMU_OT_NEWTYPE)
			t = DMU_OT_OTHER;
		zfs_blkstat_t *zb = &zab->zab_type[l][t];
		int equal;

		zb->zb_count++;
		zb->zb_asize += BP_GET_ASIZE(bp);
		zb->zb_lsize += BP_GET_LSIZE(bp);
		zb->zb_psize += BP_GET_PSIZE(bp);
		zb->zb_gangs += BP_COUNT_GANG(bp);

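		/*
		 * Count "ditto" copies that landed on the same top-level
		 * vdev.  With three DVAs, the pairwise comparisons below
		 * sum to 1 when exactly two DVAs share a vdev and to 3
		 * when all three do (a sum of 2 is impossible).
		 */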
		switch (BP_GET_NDVAS(bp)) {
		case 2:
			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[1]))
				zb->zb_ditto_2_of_2_samevdev++;
			break;
		case 3:
			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[1])) +
			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[2])) +
			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
			    DVA_GET_VDEV(&bp->blk_dva[2]));
			if (equal == 1)
				zb->zb_ditto_2_of_3_samevdev++;
			else if (equal == 3)
				zb->zb_ditto_3_of_3_samevdev++;
			break;
		}
	}
}

static void
dsl_scan_scrub_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	zio_data_buf_free(zio->io_data, zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight--;
	cv_broadcast(&spa->spa_scrub_io_cv);

	if (zio->io_error && (zio->io_error != ECKSUM ||
	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
		spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
	}
	mutex_exit(&spa->spa_scrub_lock);
}

static int
dsl_scan_scrub_cb(dsl_pool_t *dp,
    const blkptr_t *bp, const zbookmark_phys_t *zb)
{
	dsl_scan_t *scn = dp->dp_scan;
	size_t size = BP_GET_PSIZE(bp);
	spa_t *spa = dp->dp_spa;
	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
	boolean_t needs_io;
	int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
	unsigned int scan_delay = 0;

	if (phys_birth <= scn->scn_phys.scn_min_txg ||
	    phys_birth >= scn->scn_phys.scn_max_txg)
		return (0);

	count_block(dp->dp_blkstats, bp);

	if (BP_IS_EMBEDDED(bp))
		return (0);

	ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
	if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
		zio_flags |= ZIO_FLAG_SCRUB;
		needs_io = B_TRUE;
		scan_delay = zfs_scrub_delay;
	} else {
		ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
		zio_flags |= ZIO_FLAG_RESILVER;
		needs_io = B_FALSE;
		scan_delay = zfs_resilver_delay;
	}

	/* If it's an intent log block, failure is expected. */
	if (zb->zb_level == ZB_ZIL_LEVEL)
		zio_flags |= ZIO_FLAG_SPECULATIVE;

	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
		vdev_t *vd = vdev_lookup_top(spa,
		    DVA_GET_VDEV(&bp->blk_dva[d]));

		/*
		 * Keep track of how much data we've examined so that
		 * zpool(1M) status can make useful progress reports.
		 */
		scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
		spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);

		/* if it's a resilver, this may not be in the target range */
		if (!needs_io) {
			if (DVA_GET_GANG(&bp->blk_dva[d])) {
				/*
				 * Gang members may be spread across multiple
				 * vdevs, so the best estimate we have is the
				 * scrub range, which has already been checked.
				 * XXX -- it would be better to change our
				 * allocation policy to ensure that all
				 * gang members reside on the same vdev.
				 */
				needs_io = B_TRUE;
			} else {
				needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
				    phys_birth, 1);
			}
		}
	}

	if (needs_io && !zfs_no_scrub_io) {
		vdev_t *rvd = spa->spa_root_vdev;
		uint64_t maxinflight = rvd->vdev_children *
		    MAX(zfs_top_maxinflight, 1);
		void *data = zio_data_buf_alloc(size);

		mutex_enter(&spa->spa_scrub_lock);
		while (spa->spa_scrub_inflight >= maxinflight)
			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
		spa->spa_scrub_inflight++;
		mutex_exit(&spa->spa_scrub_lock);

		/*
		 * If we're seeing recent (zfs_scan_idle) "important" I/Os,
		 * then throttle our workload to limit the impact of a scan.
		 * (The throttling scheme is sketched after this function.)
		 */
		if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
			delay(MAX((int)scan_delay, 0));

		zio_nowait(zio_read(NULL, spa, bp, data, size,
		    dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB,
		    zio_flags, zb));
	}

	/* do not relocate this block */
	return (0);
}
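
/*
 * Illustrative sketch (not part of the original file): scan I/O is
 * throttled two ways above -- a hard cap on concurrent reads, and a
 * per-I/O delay while the pool is seeing other traffic.  The cap is one
 * zfs_top_maxinflight allowance per top-level vdev (the helper name is
 * hypothetical):
 */
static uint64_t
example_scrub_maxinflight(spa_t *spa)
{
	/* e.g. 8 top-level vdevs allow 8 * zfs_top_maxinflight reads */
	return (spa->spa_root_vdev->vdev_children *
	    MAX(zfs_top_maxinflight, 1));
}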

int
dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
{
	spa_t *spa = dp->dp_spa;

	/*
	 * Purge all vdev caches and probe all devices.  We do this here
	 * rather than in sync context because this requires a writer lock
	 * on the spa_config lock, which we can't do from sync context.  The
	 * spa_scrub_reopen flag indicates that vdev_open() should not
	 * attempt to start another scrub.
	 */
	spa_vdev_state_enter(spa, SCL_NONE);
	spa->spa_scrub_reopen = B_TRUE;
	vdev_reopen(spa->spa_root_vdev);
	spa->spa_scrub_reopen = B_FALSE;
	(void) spa_vdev_state_exit(spa, NULL, 0);

	return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
	    dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
}
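
/*
 * Illustrative usage (not part of the original file): "zpool scrub"
 * ultimately lands here via spa_scan(), roughly:
 *
 *	error = dsl_scan(spa_get_dsl(spa), POOL_SCAN_SCRUB);
 *
 * dsl_scan() only reopens the vdevs in open context; the scan state is
 * then established by dsl_scan_setup_sync() running as a sync task.
 */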
18412742Swollman