/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2017 Joyent, Inc.
 * Copyright (c) 2017, Intel Corporation.
 */

/*
 * The objective of this program is to provide a DMU/ZAP/SPA stress test
 * that runs entirely in userland, is easy to use, and easy to extend.
 *
 * The overall design of the ztest program is as follows:
 *
 * (1) For each major functional area (e.g. adding vdevs to a pool,
 *     creating and destroying datasets, reading and writing objects, etc)
 *     we have a simple routine to test that functionality. These
 *     individual routines do not have to do anything "stressful".
 *
 * (2) We turn these simple functionality tests into a stress test by
 *     running them all in parallel, with as many threads as desired,
 *     and spread across as many datasets, objects, and vdevs as desired.
 *
 * (3) While all this is happening, we inject faults into the pool to
 *     verify that self-healing data really works.
 *
 * (4) Every time we open a dataset, we change its checksum and compression
 *     functions. Thus even individual objects vary from block to block
 *     in which checksum they use and whether they're compressed.
 *
 * (5) To verify that we never lose on-disk consistency after a crash,
 *     we run the entire test in a child of the main process.
 *     At random times, the child self-immolates with a SIGKILL.
 *     This is the software equivalent of pulling the power cord.
 *     The parent then runs the test again, using the existing
 *     storage pool, as many times as desired. If backwards compatibility
 *     testing is enabled ztest will sometimes run the "older" version
 *     of ztest after a SIGKILL.
 *
 * (6) To verify that we don't have future leaks or temporal incursions,
 *     many of the functional tests record the transaction group number
 *     as part of their data. When reading old data, they verify that
 *     the transaction group number is less than the current, open txg.
 *     If you add a new test, please do this if applicable.
 *
 * (7) Threads are created with a reduced stack size, for sanity checking.
 *     Therefore, it's important not to allocate huge buffers on the stack.
 *
 * When run with no arguments, ztest runs for about five minutes and
 * produces no output if successful. To get a little bit of information,
 * specify -V. To get more information, specify -VV, and so on.
 *
 * To turn this into an overnight stress test, use -T to specify run time.
 *
 * You can ask for more vdevs [-v], datasets [-d], or threads [-t]
 * to increase the pool capacity, fanout, and overall stress level.
 *
 * Use the -k option to set the desired frequency of kills.
 *
 * When ztest invokes itself it passes all relevant information through a
 * temporary file which is mmap-ed in the child process.
This allows shared 84 * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always 85 * stored at offset 0 of this file and contains information on the size and 86 * number of shared structures in the file. The information stored in this file 87 * must remain backwards compatible with older versions of ztest so that 88 * ztest can invoke them during backwards compatibility testing (-B). 89 */ 90 91#include <sys/zfs_context.h> 92#include <sys/spa.h> 93#include <sys/dmu.h> 94#include <sys/txg.h> 95#include <sys/dbuf.h> 96#include <sys/zap.h> 97#include <sys/dmu_objset.h> 98#include <sys/poll.h> 99#include <sys/stat.h> 100#include <sys/time.h> 101#include <sys/wait.h> 102#include <sys/mman.h> 103#include <sys/resource.h> 104#include <sys/zio.h> 105#include <sys/zil.h> 106#include <sys/zil_impl.h> 107#include <sys/vdev_draid.h> 108#include <sys/vdev_impl.h> 109#include <sys/vdev_file.h> 110#include <sys/vdev_initialize.h> 111#include <sys/vdev_raidz.h> 112#include <sys/vdev_trim.h> 113#include <sys/spa_impl.h> 114#include <sys/metaslab_impl.h> 115#include <sys/dsl_prop.h> 116#include <sys/dsl_dataset.h> 117#include <sys/dsl_destroy.h> 118#include <sys/dsl_scan.h> 119#include <sys/zio_checksum.h> 120#include <sys/zfs_refcount.h> 121#include <sys/zfeature.h> 122#include <sys/dsl_userhold.h> 123#include <sys/abd.h> 124#include <stdio.h> 125#include <stdlib.h> 126#include <unistd.h> 127#include <getopt.h> 128#include <signal.h> 129#include <umem.h> 130#include <ctype.h> 131#include <math.h> 132#include <sys/fs/zfs.h> 133#include <zfs_fletcher.h> 134#include <libnvpair.h> 135#include <libzutil.h> 136#include <sys/crypto/icp.h> 137#if (__GLIBC__ && !__UCLIBC__) 138#include <execinfo.h> /* for backtrace() */ 139#endif 140 141static int ztest_fd_data = -1; 142static int ztest_fd_rand = -1; 143 144typedef struct ztest_shared_hdr { 145 uint64_t zh_hdr_size; 146 uint64_t zh_opts_size; 147 uint64_t zh_size; 148 uint64_t zh_stats_size; 149 uint64_t zh_stats_count; 
150 uint64_t zh_ds_size; 151 uint64_t zh_ds_count; 152} ztest_shared_hdr_t; 153 154static ztest_shared_hdr_t *ztest_shared_hdr; 155 156enum ztest_class_state { 157 ZTEST_VDEV_CLASS_OFF, 158 ZTEST_VDEV_CLASS_ON, 159 ZTEST_VDEV_CLASS_RND 160}; 161 162#define ZO_GVARS_MAX_ARGLEN ((size_t)64) 163#define ZO_GVARS_MAX_COUNT ((size_t)10) 164 165typedef struct ztest_shared_opts { 166 char zo_pool[ZFS_MAX_DATASET_NAME_LEN]; 167 char zo_dir[ZFS_MAX_DATASET_NAME_LEN]; 168 char zo_alt_ztest[MAXNAMELEN]; 169 char zo_alt_libpath[MAXNAMELEN]; 170 uint64_t zo_vdevs; 171 uint64_t zo_vdevtime; 172 size_t zo_vdev_size; 173 int zo_ashift; 174 int zo_mirrors; 175 int zo_raid_children; 176 int zo_raid_parity; 177 char zo_raid_type[8]; 178 int zo_draid_data; 179 int zo_draid_spares; 180 int zo_datasets; 181 int zo_threads; 182 uint64_t zo_passtime; 183 uint64_t zo_killrate; 184 int zo_verbose; 185 int zo_init; 186 uint64_t zo_time; 187 uint64_t zo_maxloops; 188 uint64_t zo_metaslab_force_ganging; 189 int zo_mmp_test; 190 int zo_special_vdevs; 191 int zo_dump_dbgmsg; 192 int zo_gvars_count; 193 char zo_gvars[ZO_GVARS_MAX_COUNT][ZO_GVARS_MAX_ARGLEN]; 194} ztest_shared_opts_t; 195 196/* Default values for command line options. 
*/ 197#define DEFAULT_POOL "ztest" 198#define DEFAULT_VDEV_DIR "/tmp" 199#define DEFAULT_VDEV_COUNT 5 200#define DEFAULT_VDEV_SIZE (SPA_MINDEVSIZE * 4) /* 256m default size */ 201#define DEFAULT_VDEV_SIZE_STR "256M" 202#define DEFAULT_ASHIFT SPA_MINBLOCKSHIFT 203#define DEFAULT_MIRRORS 2 204#define DEFAULT_RAID_CHILDREN 4 205#define DEFAULT_RAID_PARITY 1 206#define DEFAULT_DRAID_DATA 4 207#define DEFAULT_DRAID_SPARES 1 208#define DEFAULT_DATASETS_COUNT 7 209#define DEFAULT_THREADS 23 210#define DEFAULT_RUN_TIME 300 /* 300 seconds */ 211#define DEFAULT_RUN_TIME_STR "300 sec" 212#define DEFAULT_PASS_TIME 60 /* 60 seconds */ 213#define DEFAULT_PASS_TIME_STR "60 sec" 214#define DEFAULT_KILL_RATE 70 /* 70% kill rate */ 215#define DEFAULT_KILLRATE_STR "70%" 216#define DEFAULT_INITS 1 217#define DEFAULT_MAX_LOOPS 50 /* 5 minutes */ 218#define DEFAULT_FORCE_GANGING (64 << 10) 219#define DEFAULT_FORCE_GANGING_STR "64K" 220 221/* Simplifying assumption: -1 is not a valid default. */ 222#define NO_DEFAULT -1 223 224static const ztest_shared_opts_t ztest_opts_defaults = { 225 .zo_pool = DEFAULT_POOL, 226 .zo_dir = DEFAULT_VDEV_DIR, 227 .zo_alt_ztest = { '\0' }, 228 .zo_alt_libpath = { '\0' }, 229 .zo_vdevs = DEFAULT_VDEV_COUNT, 230 .zo_ashift = DEFAULT_ASHIFT, 231 .zo_mirrors = DEFAULT_MIRRORS, 232 .zo_raid_children = DEFAULT_RAID_CHILDREN, 233 .zo_raid_parity = DEFAULT_RAID_PARITY, 234 .zo_raid_type = VDEV_TYPE_RAIDZ, 235 .zo_vdev_size = DEFAULT_VDEV_SIZE, 236 .zo_draid_data = DEFAULT_DRAID_DATA, /* data drives */ 237 .zo_draid_spares = DEFAULT_DRAID_SPARES, /* distributed spares */ 238 .zo_datasets = DEFAULT_DATASETS_COUNT, 239 .zo_threads = DEFAULT_THREADS, 240 .zo_passtime = DEFAULT_PASS_TIME, 241 .zo_killrate = DEFAULT_KILL_RATE, 242 .zo_verbose = 0, 243 .zo_mmp_test = 0, 244 .zo_init = DEFAULT_INITS, 245 .zo_time = DEFAULT_RUN_TIME, 246 .zo_maxloops = DEFAULT_MAX_LOOPS, /* max loops during spa_freeze() */ 247 .zo_metaslab_force_ganging = DEFAULT_FORCE_GANGING, 248 
.zo_special_vdevs = ZTEST_VDEV_CLASS_RND, 249 .zo_gvars_count = 0, 250}; 251 252extern uint64_t metaslab_force_ganging; 253extern uint64_t metaslab_df_alloc_threshold; 254extern unsigned long zfs_deadman_synctime_ms; 255extern int metaslab_preload_limit; 256extern boolean_t zfs_compressed_arc_enabled; 257extern int zfs_abd_scatter_enabled; 258extern int dmu_object_alloc_chunk_shift; 259extern boolean_t zfs_force_some_double_word_sm_entries; 260extern unsigned long zio_decompress_fail_fraction; 261extern unsigned long zfs_reconstruct_indirect_damage_fraction; 262 263 264static ztest_shared_opts_t *ztest_shared_opts; 265static ztest_shared_opts_t ztest_opts; 266static char *ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345"; 267 268typedef struct ztest_shared_ds { 269 uint64_t zd_seq; 270} ztest_shared_ds_t; 271 272static ztest_shared_ds_t *ztest_shared_ds; 273#define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d]) 274 275#define BT_MAGIC 0x123456789abcdefULL 276#define MAXFAULTS(zs) \ 277 (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1) 278 279enum ztest_io_type { 280 ZTEST_IO_WRITE_TAG, 281 ZTEST_IO_WRITE_PATTERN, 282 ZTEST_IO_WRITE_ZEROES, 283 ZTEST_IO_TRUNCATE, 284 ZTEST_IO_SETATTR, 285 ZTEST_IO_REWRITE, 286 ZTEST_IO_TYPES 287}; 288 289typedef struct ztest_block_tag { 290 uint64_t bt_magic; 291 uint64_t bt_objset; 292 uint64_t bt_object; 293 uint64_t bt_dnodesize; 294 uint64_t bt_offset; 295 uint64_t bt_gen; 296 uint64_t bt_txg; 297 uint64_t bt_crtxg; 298} ztest_block_tag_t; 299 300typedef struct bufwad { 301 uint64_t bw_index; 302 uint64_t bw_txg; 303 uint64_t bw_data; 304} bufwad_t; 305 306/* 307 * It would be better to use a rangelock_t per object. Unfortunately 308 * the rangelock_t is not a drop-in replacement for rl_t, because we 309 * still need to map from object ID to rangelock_t. 
310 */ 311typedef enum { 312 RL_READER, 313 RL_WRITER, 314 RL_APPEND 315} rl_type_t; 316 317typedef struct rll { 318 void *rll_writer; 319 int rll_readers; 320 kmutex_t rll_lock; 321 kcondvar_t rll_cv; 322} rll_t; 323 324typedef struct rl { 325 uint64_t rl_object; 326 uint64_t rl_offset; 327 uint64_t rl_size; 328 rll_t *rl_lock; 329} rl_t; 330 331#define ZTEST_RANGE_LOCKS 64 332#define ZTEST_OBJECT_LOCKS 64 333 334/* 335 * Object descriptor. Used as a template for object lookup/create/remove. 336 */ 337typedef struct ztest_od { 338 uint64_t od_dir; 339 uint64_t od_object; 340 dmu_object_type_t od_type; 341 dmu_object_type_t od_crtype; 342 uint64_t od_blocksize; 343 uint64_t od_crblocksize; 344 uint64_t od_crdnodesize; 345 uint64_t od_gen; 346 uint64_t od_crgen; 347 char od_name[ZFS_MAX_DATASET_NAME_LEN]; 348} ztest_od_t; 349 350/* 351 * Per-dataset state. 352 */ 353typedef struct ztest_ds { 354 ztest_shared_ds_t *zd_shared; 355 objset_t *zd_os; 356 pthread_rwlock_t zd_zilog_lock; 357 zilog_t *zd_zilog; 358 ztest_od_t *zd_od; /* debugging aid */ 359 char zd_name[ZFS_MAX_DATASET_NAME_LEN]; 360 kmutex_t zd_dirobj_lock; 361 rll_t zd_object_lock[ZTEST_OBJECT_LOCKS]; 362 rll_t zd_range_lock[ZTEST_RANGE_LOCKS]; 363} ztest_ds_t; 364 365/* 366 * Per-iteration state. 
367 */ 368typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id); 369 370typedef struct ztest_info { 371 ztest_func_t *zi_func; /* test function */ 372 uint64_t zi_iters; /* iterations per execution */ 373 uint64_t *zi_interval; /* execute every <interval> seconds */ 374 const char *zi_funcname; /* name of test function */ 375} ztest_info_t; 376 377typedef struct ztest_shared_callstate { 378 uint64_t zc_count; /* per-pass count */ 379 uint64_t zc_time; /* per-pass time */ 380 uint64_t zc_next; /* next time to call this function */ 381} ztest_shared_callstate_t; 382 383static ztest_shared_callstate_t *ztest_shared_callstate; 384#define ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c]) 385 386ztest_func_t ztest_dmu_read_write; 387ztest_func_t ztest_dmu_write_parallel; 388ztest_func_t ztest_dmu_object_alloc_free; 389ztest_func_t ztest_dmu_object_next_chunk; 390ztest_func_t ztest_dmu_commit_callbacks; 391ztest_func_t ztest_zap; 392ztest_func_t ztest_zap_parallel; 393ztest_func_t ztest_zil_commit; 394ztest_func_t ztest_zil_remount; 395ztest_func_t ztest_dmu_read_write_zcopy; 396ztest_func_t ztest_dmu_objset_create_destroy; 397ztest_func_t ztest_dmu_prealloc; 398ztest_func_t ztest_fzap; 399ztest_func_t ztest_dmu_snapshot_create_destroy; 400ztest_func_t ztest_dsl_prop_get_set; 401ztest_func_t ztest_spa_prop_get_set; 402ztest_func_t ztest_spa_create_destroy; 403ztest_func_t ztest_fault_inject; 404ztest_func_t ztest_dmu_snapshot_hold; 405ztest_func_t ztest_mmp_enable_disable; 406ztest_func_t ztest_scrub; 407ztest_func_t ztest_dsl_dataset_promote_busy; 408ztest_func_t ztest_vdev_attach_detach; 409ztest_func_t ztest_vdev_LUN_growth; 410ztest_func_t ztest_vdev_add_remove; 411ztest_func_t ztest_vdev_class_add; 412ztest_func_t ztest_vdev_aux_add_remove; 413ztest_func_t ztest_split_pool; 414ztest_func_t ztest_reguid; 415ztest_func_t ztest_spa_upgrade; 416ztest_func_t ztest_device_removal; 417ztest_func_t ztest_spa_checkpoint_create_discard; 418ztest_func_t 
ztest_initialize; 419ztest_func_t ztest_trim; 420ztest_func_t ztest_fletcher; 421ztest_func_t ztest_fletcher_incr; 422ztest_func_t ztest_verify_dnode_bt; 423 424uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ 425uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ 426uint64_t zopt_often = 1ULL * NANOSEC; /* every second */ 427uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */ 428uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */ 429 430#define ZTI_INIT(func, iters, interval) \ 431 { .zi_func = (func), \ 432 .zi_iters = (iters), \ 433 .zi_interval = (interval), \ 434 .zi_funcname = # func } 435 436ztest_info_t ztest_info[] = { 437 ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always), 438 ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always), 439 ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always), 440 ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes), 441 ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always), 442 ZTI_INIT(ztest_zap, 30, &zopt_always), 443 ZTI_INIT(ztest_zap_parallel, 100, &zopt_always), 444 ZTI_INIT(ztest_split_pool, 1, &zopt_always), 445 ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant), 446 ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes), 447 ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often), 448 ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often), 449 ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often), 450 ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes), 451#if 0 452 ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes), 453#endif 454 ZTI_INIT(ztest_fzap, 1, &zopt_sometimes), 455 ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes), 456 ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes), 457 ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes), 458 ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes), 459 ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes), 460 ZTI_INIT(ztest_reguid, 1, &zopt_rarely), 461 ZTI_INIT(ztest_scrub, 1, &zopt_rarely), 462 
ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely), 463 ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely), 464 ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes), 465 ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely), 466 ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime), 467 ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime), 468 ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime), 469 ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes), 470 ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely), 471 ZTI_INIT(ztest_initialize, 1, &zopt_sometimes), 472 ZTI_INIT(ztest_trim, 1, &zopt_sometimes), 473 ZTI_INIT(ztest_fletcher, 1, &zopt_rarely), 474 ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely), 475 ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), 476}; 477 478#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) 479 480/* 481 * The following struct is used to hold a list of uncalled commit callbacks. 482 * The callbacks are ordered by txg number. 483 */ 484typedef struct ztest_cb_list { 485 kmutex_t zcl_callbacks_lock; 486 list_t zcl_callbacks; 487} ztest_cb_list_t; 488 489/* 490 * Stuff we need to share writably between parent and child. 
491 */ 492typedef struct ztest_shared { 493 boolean_t zs_do_init; 494 hrtime_t zs_proc_start; 495 hrtime_t zs_proc_stop; 496 hrtime_t zs_thread_start; 497 hrtime_t zs_thread_stop; 498 hrtime_t zs_thread_kill; 499 uint64_t zs_enospc_count; 500 uint64_t zs_vdev_next_leaf; 501 uint64_t zs_vdev_aux; 502 uint64_t zs_alloc; 503 uint64_t zs_space; 504 uint64_t zs_splits; 505 uint64_t zs_mirrors; 506 uint64_t zs_metaslab_sz; 507 uint64_t zs_metaslab_df_alloc_threshold; 508 uint64_t zs_guid; 509} ztest_shared_t; 510 511#define ID_PARALLEL -1ULL 512 513static char ztest_dev_template[] = "%s/%s.%llua"; 514static char ztest_aux_template[] = "%s/%s.%s.%llu"; 515ztest_shared_t *ztest_shared; 516 517static spa_t *ztest_spa = NULL; 518static ztest_ds_t *ztest_ds; 519 520static kmutex_t ztest_vdev_lock; 521static boolean_t ztest_device_removal_active = B_FALSE; 522static boolean_t ztest_pool_scrubbed = B_FALSE; 523static kmutex_t ztest_checkpoint_lock; 524 525/* 526 * The ztest_name_lock protects the pool and dataset namespace used by 527 * the individual tests. To modify the namespace, consumers must grab 528 * this lock as writer. Grabbing the lock as reader will ensure that the 529 * namespace does not change while the lock is held. 530 */ 531static pthread_rwlock_t ztest_name_lock; 532 533static boolean_t ztest_dump_core = B_TRUE; 534static boolean_t ztest_exiting; 535 536/* Global commit callback list */ 537static ztest_cb_list_t zcl; 538/* Commit cb delay */ 539static uint64_t zc_min_txg_delay = UINT64_MAX; 540static int zc_cb_counter = 0; 541 542/* 543 * Minimum number of commit callbacks that need to be registered for us to check 544 * whether the minimum txg delay is acceptable. 545 */ 546#define ZTEST_COMMIT_CB_MIN_REG 100 547 548/* 549 * If a number of txgs equal to this threshold have been created after a commit 550 * callback has been registered but not called, then we assume there is an 551 * implementation bug. 
552 */ 553#define ZTEST_COMMIT_CB_THRESH (TXG_CONCURRENT_STATES + 1000) 554 555enum ztest_object { 556 ZTEST_META_DNODE = 0, 557 ZTEST_DIROBJ, 558 ZTEST_OBJECTS 559}; 560 561static void usage(boolean_t) __NORETURN; 562static int ztest_scrub_impl(spa_t *spa); 563 564/* 565 * These libumem hooks provide a reasonable set of defaults for the allocator's 566 * debugging facilities. 567 */ 568const char * 569_umem_debug_init(void) 570{ 571 return ("default,verbose"); /* $UMEM_DEBUG setting */ 572} 573 574const char * 575_umem_logging_init(void) 576{ 577 return ("fail,contents"); /* $UMEM_LOGGING setting */ 578} 579 580static void 581dump_debug_buffer(void) 582{ 583 ssize_t ret __attribute__((unused)); 584 585 if (!ztest_opts.zo_dump_dbgmsg) 586 return; 587 588 /* 589 * We use write() instead of printf() so that this function 590 * is safe to call from a signal handler. 591 */ 592 ret = write(STDOUT_FILENO, "\n", 1); 593 zfs_dbgmsg_print("ztest"); 594} 595 596#define BACKTRACE_SZ 100 597 598static void sig_handler(int signo) 599{ 600 struct sigaction action; 601#if (__GLIBC__ && !__UCLIBC__) /* backtrace() is a GNU extension */ 602 int nptrs; 603 void *buffer[BACKTRACE_SZ]; 604 605 nptrs = backtrace(buffer, BACKTRACE_SZ); 606 backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO); 607#endif 608 dump_debug_buffer(); 609 610 /* 611 * Restore default action and re-raise signal so SIGSEGV and 612 * SIGABRT can trigger a core dump. 613 */ 614 action.sa_handler = SIG_DFL; 615 sigemptyset(&action.sa_mask); 616 action.sa_flags = 0; 617 (void) sigaction(signo, &action, NULL); 618 raise(signo); 619} 620 621#define FATAL_MSG_SZ 1024 622 623char *fatal_msg; 624 625static void 626fatal(int do_perror, char *message, ...) 
627{ 628 va_list args; 629 int save_errno = errno; 630 char *buf; 631 632 (void) fflush(stdout); 633 buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL); 634 635 va_start(args, message); 636 (void) sprintf(buf, "ztest: "); 637 /* LINTED */ 638 (void) vsprintf(buf + strlen(buf), message, args); 639 va_end(args); 640 if (do_perror) { 641 (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf), 642 ": %s", strerror(save_errno)); 643 } 644 (void) fprintf(stderr, "%s\n", buf); 645 fatal_msg = buf; /* to ease debugging */ 646 647 if (ztest_dump_core) 648 abort(); 649 else 650 dump_debug_buffer(); 651 652 exit(3); 653} 654 655static int 656str2shift(const char *buf) 657{ 658 const char *ends = "BKMGTPEZ"; 659 int i; 660 661 if (buf[0] == '\0') 662 return (0); 663 for (i = 0; i < strlen(ends); i++) { 664 if (toupper(buf[0]) == ends[i]) 665 break; 666 } 667 if (i == strlen(ends)) { 668 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", 669 buf); 670 usage(B_FALSE); 671 } 672 if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) { 673 return (10*i); 674 } 675 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); 676 usage(B_FALSE); 677 /* NOTREACHED */ 678} 679 680static uint64_t 681nicenumtoull(const char *buf) 682{ 683 char *end; 684 uint64_t val; 685 686 val = strtoull(buf, &end, 0); 687 if (end == buf) { 688 (void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf); 689 usage(B_FALSE); 690 } else if (end[0] == '.') { 691 double fval = strtod(buf, &end); 692 fval *= pow(2, str2shift(end)); 693 /* 694 * UINT64_MAX is not exactly representable as a double. 695 * The closest representation is UINT64_MAX + 1, so we 696 * use a >= comparison instead of > for the bounds check. 
697 */ 698 if (fval >= (double)UINT64_MAX) { 699 (void) fprintf(stderr, "ztest: value too large: %s\n", 700 buf); 701 usage(B_FALSE); 702 } 703 val = (uint64_t)fval; 704 } else { 705 int shift = str2shift(end); 706 if (shift >= 64 || (val << shift) >> shift != val) { 707 (void) fprintf(stderr, "ztest: value too large: %s\n", 708 buf); 709 usage(B_FALSE); 710 } 711 val <<= shift; 712 } 713 return (val); 714} 715 716typedef struct ztest_option { 717 const char short_opt; 718 const char *long_opt; 719 const char *long_opt_param; 720 const char *comment; 721 unsigned int default_int; 722 char *default_str; 723} ztest_option_t; 724 725/* 726 * The following option_table is used for generating the usage info as well as 727 * the long and short option information for calling getopt_long(). 728 */ 729static ztest_option_t option_table[] = { 730 { 'v', "vdevs", "INTEGER", "Number of vdevs", DEFAULT_VDEV_COUNT, 731 NULL}, 732 { 's', "vdev-size", "INTEGER", "Size of each vdev", 733 NO_DEFAULT, DEFAULT_VDEV_SIZE_STR}, 734 { 'a', "alignment-shift", "INTEGER", 735 "Alignment shift; use 0 for random", DEFAULT_ASHIFT, NULL}, 736 { 'm', "mirror-copies", "INTEGER", "Number of mirror copies", 737 DEFAULT_MIRRORS, NULL}, 738 { 'r', "raid-disks", "INTEGER", "Number of raidz/draid disks", 739 DEFAULT_RAID_CHILDREN, NULL}, 740 { 'R', "raid-parity", "INTEGER", "Raid parity", 741 DEFAULT_RAID_PARITY, NULL}, 742 { 'K', "raid-kind", "raidz|draid|random", "Raid kind", 743 NO_DEFAULT, "random"}, 744 { 'D', "draid-data", "INTEGER", "Number of draid data drives", 745 DEFAULT_DRAID_DATA, NULL}, 746 { 'S', "draid-spares", "INTEGER", "Number of draid spares", 747 DEFAULT_DRAID_SPARES, NULL}, 748 { 'd', "datasets", "INTEGER", "Number of datasets", 749 DEFAULT_DATASETS_COUNT, NULL}, 750 { 't', "threads", "INTEGER", "Number of ztest threads", 751 DEFAULT_THREADS, NULL}, 752 { 'g', "gang-block-threshold", "INTEGER", 753 "Metaslab gang block threshold", 754 NO_DEFAULT, DEFAULT_FORCE_GANGING_STR}, 755 { 
'i', "init-count", "INTEGER", "Number of times to initialize pool", 756 DEFAULT_INITS, NULL}, 757 { 'k', "kill-percentage", "INTEGER", "Kill percentage", 758 NO_DEFAULT, DEFAULT_KILLRATE_STR}, 759 { 'p', "pool-name", "STRING", "Pool name", 760 NO_DEFAULT, DEFAULT_POOL}, 761 { 'f', "vdev-file-directory", "PATH", "File directory for vdev files", 762 NO_DEFAULT, DEFAULT_VDEV_DIR}, 763 { 'M', "multi-host", NULL, 764 "Multi-host; simulate pool imported on remote host", 765 NO_DEFAULT, NULL}, 766 { 'E', "use-existing-pool", NULL, 767 "Use existing pool instead of creating new one", NO_DEFAULT, NULL}, 768 { 'T', "run-time", "INTEGER", "Total run time", 769 NO_DEFAULT, DEFAULT_RUN_TIME_STR}, 770 { 'P', "pass-time", "INTEGER", "Time per pass", 771 NO_DEFAULT, DEFAULT_PASS_TIME_STR}, 772 { 'F', "freeze-loops", "INTEGER", "Max loops in spa_freeze()", 773 DEFAULT_MAX_LOOPS, NULL}, 774 { 'B', "alt-ztest", "PATH", "Alternate ztest path", 775 NO_DEFAULT, NULL}, 776 { 'C', "vdev-class-state", "on|off|random", "vdev class state", 777 NO_DEFAULT, "random"}, 778 { 'o', "option", "\"OPTION=INTEGER\"", 779 "Set global variable to an unsigned 32-bit integer value", 780 NO_DEFAULT, NULL}, 781 { 'G', "dump-debug-msg", NULL, 782 "Dump zfs_dbgmsg buffer before exiting due to an error", 783 NO_DEFAULT, NULL}, 784 { 'V', "verbose", NULL, 785 "Verbose (use multiple times for ever more verbosity)", 786 NO_DEFAULT, NULL}, 787 { 'h', "help", NULL, "Show this help", 788 NO_DEFAULT, NULL}, 789 {0, 0, 0, 0, 0, 0} 790}; 791 792static struct option *long_opts = NULL; 793static char *short_opts = NULL; 794 795static void 796init_options(void) 797{ 798 ASSERT3P(long_opts, ==, NULL); 799 ASSERT3P(short_opts, ==, NULL); 800 801 int count = sizeof (option_table) / sizeof (option_table[0]); 802 long_opts = umem_alloc(sizeof (struct option) * count, UMEM_NOFAIL); 803 804 short_opts = umem_alloc(sizeof (char) * 2 * count, UMEM_NOFAIL); 805 int short_opt_index = 0; 806 807 for (int i = 0; i < count; i++) { 808 
long_opts[i].val = option_table[i].short_opt; 809 long_opts[i].name = option_table[i].long_opt; 810 long_opts[i].has_arg = option_table[i].long_opt_param != NULL 811 ? required_argument : no_argument; 812 long_opts[i].flag = NULL; 813 short_opts[short_opt_index++] = option_table[i].short_opt; 814 if (option_table[i].long_opt_param != NULL) { 815 short_opts[short_opt_index++] = ':'; 816 } 817 } 818} 819 820static void 821fini_options(void) 822{ 823 int count = sizeof (option_table) / sizeof (option_table[0]); 824 825 umem_free(long_opts, sizeof (struct option) * count); 826 umem_free(short_opts, sizeof (char) * 2 * count); 827 828 long_opts = NULL; 829 short_opts = NULL; 830} 831 832static void 833usage(boolean_t requested) 834{ 835 char option[80]; 836 FILE *fp = requested ? stdout : stderr; 837 838 (void) fprintf(fp, "Usage: %s [OPTIONS...]\n", DEFAULT_POOL); 839 for (int i = 0; option_table[i].short_opt != 0; i++) { 840 if (option_table[i].long_opt_param != NULL) { 841 (void) sprintf(option, " -%c --%s=%s", 842 option_table[i].short_opt, 843 option_table[i].long_opt, 844 option_table[i].long_opt_param); 845 } else { 846 (void) sprintf(option, " -%c --%s", 847 option_table[i].short_opt, 848 option_table[i].long_opt); 849 } 850 (void) fprintf(fp, " %-40s%s", option, 851 option_table[i].comment); 852 853 if (option_table[i].long_opt_param != NULL) { 854 if (option_table[i].default_str != NULL) { 855 (void) fprintf(fp, " (default: %s)", 856 option_table[i].default_str); 857 } else if (option_table[i].default_int != NO_DEFAULT) { 858 (void) fprintf(fp, " (default: %u)", 859 option_table[i].default_int); 860 } 861 } 862 (void) fprintf(fp, "\n"); 863 } 864 exit(requested ? 
0 : 1); 865} 866 867static uint64_t 868ztest_random(uint64_t range) 869{ 870 uint64_t r; 871 872 ASSERT3S(ztest_fd_rand, >=, 0); 873 874 if (range == 0) 875 return (0); 876 877 if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) 878 fatal(1, "short read from /dev/urandom"); 879 880 return (r % range); 881} 882 883static void 884ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) 885{ 886 char name[32]; 887 char *value; 888 int state = ZTEST_VDEV_CLASS_RND; 889 890 (void) strlcpy(name, input, sizeof (name)); 891 892 value = strchr(name, '='); 893 if (value == NULL) { 894 (void) fprintf(stderr, "missing value in property=value " 895 "'-C' argument (%s)\n", input); 896 usage(B_FALSE); 897 } 898 *(value) = '\0'; 899 value++; 900 901 if (strcmp(value, "on") == 0) { 902 state = ZTEST_VDEV_CLASS_ON; 903 } else if (strcmp(value, "off") == 0) { 904 state = ZTEST_VDEV_CLASS_OFF; 905 } else if (strcmp(value, "random") == 0) { 906 state = ZTEST_VDEV_CLASS_RND; 907 } else { 908 (void) fprintf(stderr, "invalid property value '%s'\n", value); 909 usage(B_FALSE); 910 } 911 912 if (strcmp(name, "special") == 0) { 913 zo->zo_special_vdevs = state; 914 } else { 915 (void) fprintf(stderr, "invalid property name '%s'\n", name); 916 usage(B_FALSE); 917 } 918 if (zo->zo_verbose >= 3) 919 (void) printf("%s vdev state is '%s'\n", name, value); 920} 921 922static void 923process_options(int argc, char **argv) 924{ 925 char *path; 926 ztest_shared_opts_t *zo = &ztest_opts; 927 928 int opt; 929 uint64_t value; 930 char altdir[MAXNAMELEN] = { 0 }; 931 char raid_kind[8] = { "random" }; 932 933 bcopy(&ztest_opts_defaults, zo, sizeof (*zo)); 934 935 init_options(); 936 937 while ((opt = getopt_long(argc, argv, short_opts, long_opts, 938 NULL)) != EOF) { 939 value = 0; 940 switch (opt) { 941 case 'v': 942 case 's': 943 case 'a': 944 case 'm': 945 case 'r': 946 case 'R': 947 case 'D': 948 case 'S': 949 case 'd': 950 case 't': 951 case 'g': 952 case 'i': 953 case 'k': 954 case 'T': 
955 case 'P': 956 case 'F': 957 value = nicenumtoull(optarg); 958 } 959 switch (opt) { 960 case 'v': 961 zo->zo_vdevs = value; 962 break; 963 case 's': 964 zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value); 965 break; 966 case 'a': 967 zo->zo_ashift = value; 968 break; 969 case 'm': 970 zo->zo_mirrors = value; 971 break; 972 case 'r': 973 zo->zo_raid_children = MAX(1, value); 974 break; 975 case 'R': 976 zo->zo_raid_parity = MIN(MAX(value, 1), 3); 977 break; 978 case 'K': 979 (void) strlcpy(raid_kind, optarg, sizeof (raid_kind)); 980 break; 981 case 'D': 982 zo->zo_draid_data = MAX(1, value); 983 break; 984 case 'S': 985 zo->zo_draid_spares = MAX(1, value); 986 break; 987 case 'd': 988 zo->zo_datasets = MAX(1, value); 989 break; 990 case 't': 991 zo->zo_threads = MAX(1, value); 992 break; 993 case 'g': 994 zo->zo_metaslab_force_ganging = 995 MAX(SPA_MINBLOCKSIZE << 1, value); 996 break; 997 case 'i': 998 zo->zo_init = value; 999 break; 1000 case 'k': 1001 zo->zo_killrate = value; 1002 break; 1003 case 'p': 1004 (void) strlcpy(zo->zo_pool, optarg, 1005 sizeof (zo->zo_pool)); 1006 break; 1007 case 'f': 1008 path = realpath(optarg, NULL); 1009 if (path == NULL) { 1010 (void) fprintf(stderr, "error: %s: %s\n", 1011 optarg, strerror(errno)); 1012 usage(B_FALSE); 1013 } else { 1014 (void) strlcpy(zo->zo_dir, path, 1015 sizeof (zo->zo_dir)); 1016 free(path); 1017 } 1018 break; 1019 case 'M': 1020 zo->zo_mmp_test = 1; 1021 break; 1022 case 'V': 1023 zo->zo_verbose++; 1024 break; 1025 case 'E': 1026 zo->zo_init = 0; 1027 break; 1028 case 'T': 1029 zo->zo_time = value; 1030 break; 1031 case 'P': 1032 zo->zo_passtime = MAX(1, value); 1033 break; 1034 case 'F': 1035 zo->zo_maxloops = MAX(1, value); 1036 break; 1037 case 'B': 1038 (void) strlcpy(altdir, optarg, sizeof (altdir)); 1039 break; 1040 case 'C': 1041 ztest_parse_name_value(optarg, zo); 1042 break; 1043 case 'o': 1044 if (zo->zo_gvars_count >= ZO_GVARS_MAX_COUNT) { 1045 (void) fprintf(stderr, 1046 "max global var count 
(%zu) exceeded\n", 1047 ZO_GVARS_MAX_COUNT); 1048 usage(B_FALSE); 1049 } 1050 char *v = zo->zo_gvars[zo->zo_gvars_count]; 1051 if (strlcpy(v, optarg, ZO_GVARS_MAX_ARGLEN) >= 1052 ZO_GVARS_MAX_ARGLEN) { 1053 (void) fprintf(stderr, 1054 "global var option '%s' is too long\n", 1055 optarg); 1056 usage(B_FALSE); 1057 } 1058 zo->zo_gvars_count++; 1059 break; 1060 case 'G': 1061 zo->zo_dump_dbgmsg = 1; 1062 break; 1063 case 'h': 1064 usage(B_TRUE); 1065 break; 1066 case '?': 1067 default: 1068 usage(B_FALSE); 1069 break; 1070 } 1071 } 1072 1073 fini_options(); 1074 1075 /* When raid choice is 'random' add a draid pool 50% of the time */ 1076 if (strcmp(raid_kind, "random") == 0) { 1077 (void) strlcpy(raid_kind, (ztest_random(2) == 0) ? 1078 "draid" : "raidz", sizeof (raid_kind)); 1079 1080 if (ztest_opts.zo_verbose >= 3) 1081 (void) printf("choosing RAID type '%s'\n", raid_kind); 1082 } 1083 1084 if (strcmp(raid_kind, "draid") == 0) { 1085 uint64_t min_devsize; 1086 1087 /* With fewer disk use 256M, otherwise 128M is OK */ 1088 min_devsize = (ztest_opts.zo_raid_children < 16) ? 
1089 (256ULL << 20) : (128ULL << 20); 1090 1091 /* No top-level mirrors with dRAID for now */ 1092 zo->zo_mirrors = 0; 1093 1094 /* Use more appropriate defaults for dRAID */ 1095 if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs) 1096 zo->zo_vdevs = 1; 1097 if (zo->zo_raid_children == 1098 ztest_opts_defaults.zo_raid_children) 1099 zo->zo_raid_children = 16; 1100 if (zo->zo_ashift < 12) 1101 zo->zo_ashift = 12; 1102 if (zo->zo_vdev_size < min_devsize) 1103 zo->zo_vdev_size = min_devsize; 1104 1105 if (zo->zo_draid_data + zo->zo_raid_parity > 1106 zo->zo_raid_children - zo->zo_draid_spares) { 1107 (void) fprintf(stderr, "error: too few draid " 1108 "children (%d) for stripe width (%d)\n", 1109 zo->zo_raid_children, 1110 zo->zo_draid_data + zo->zo_raid_parity); 1111 usage(B_FALSE); 1112 } 1113 1114 (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID, 1115 sizeof (zo->zo_raid_type)); 1116 1117 } else /* using raidz */ { 1118 ASSERT0(strcmp(raid_kind, "raidz")); 1119 1120 zo->zo_raid_parity = MIN(zo->zo_raid_parity, 1121 zo->zo_raid_children - 1); 1122 } 1123 1124 zo->zo_vdevtime = 1125 (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs : 1126 UINT64_MAX >> 2); 1127 1128 if (strlen(altdir) > 0) { 1129 char *cmd; 1130 char *realaltdir; 1131 char *bin; 1132 char *ztest; 1133 char *isa; 1134 int isalen; 1135 1136 cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 1137 realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 1138 1139 VERIFY3P(NULL, !=, realpath(getexecname(), cmd)); 1140 if (0 != access(altdir, F_OK)) { 1141 ztest_dump_core = B_FALSE; 1142 fatal(B_TRUE, "invalid alternate ztest path: %s", 1143 altdir); 1144 } 1145 VERIFY3P(NULL, !=, realpath(altdir, realaltdir)); 1146 1147 /* 1148 * 'cmd' should be of the form "<anything>/usr/bin/<isa>/ztest". 1149 * We want to extract <isa> to determine if we should use 1150 * 32 or 64 bit binaries. 
1151 */ 1152 bin = strstr(cmd, "/usr/bin/"); 1153 ztest = strstr(bin, "/ztest"); 1154 isa = bin + 9; 1155 isalen = ztest - isa; 1156 (void) snprintf(zo->zo_alt_ztest, sizeof (zo->zo_alt_ztest), 1157 "%s/usr/bin/%.*s/ztest", realaltdir, isalen, isa); 1158 (void) snprintf(zo->zo_alt_libpath, sizeof (zo->zo_alt_libpath), 1159 "%s/usr/lib/%.*s", realaltdir, isalen, isa); 1160 1161 if (0 != access(zo->zo_alt_ztest, X_OK)) { 1162 ztest_dump_core = B_FALSE; 1163 fatal(B_TRUE, "invalid alternate ztest: %s", 1164 zo->zo_alt_ztest); 1165 } else if (0 != access(zo->zo_alt_libpath, X_OK)) { 1166 ztest_dump_core = B_FALSE; 1167 fatal(B_TRUE, "invalid alternate lib directory %s", 1168 zo->zo_alt_libpath); 1169 } 1170 1171 umem_free(cmd, MAXPATHLEN); 1172 umem_free(realaltdir, MAXPATHLEN); 1173 } 1174} 1175 1176static void 1177ztest_kill(ztest_shared_t *zs) 1178{ 1179 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa)); 1180 zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa)); 1181 1182 /* 1183 * Before we kill off ztest, make sure that the config is updated. 1184 * See comment above spa_write_cachefile(). 
1185 */ 1186 mutex_enter(&spa_namespace_lock); 1187 spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE); 1188 mutex_exit(&spa_namespace_lock); 1189 1190 (void) kill(getpid(), SIGKILL); 1191} 1192 1193/* ARGSUSED */ 1194static void 1195ztest_record_enospc(const char *s) 1196{ 1197 ztest_shared->zs_enospc_count++; 1198} 1199 1200static uint64_t 1201ztest_get_ashift(void) 1202{ 1203 if (ztest_opts.zo_ashift == 0) 1204 return (SPA_MINBLOCKSHIFT + ztest_random(5)); 1205 return (ztest_opts.zo_ashift); 1206} 1207 1208static boolean_t 1209ztest_is_draid_spare(const char *name) 1210{ 1211 uint64_t spare_id = 0, parity = 0, vdev_id = 0; 1212 1213 if (sscanf(name, VDEV_TYPE_DRAID "%llu-%llu-%llu", 1214 (u_longlong_t *)&parity, (u_longlong_t *)&vdev_id, 1215 (u_longlong_t *)&spare_id) == 3) { 1216 return (B_TRUE); 1217 } 1218 1219 return (B_FALSE); 1220} 1221 1222static nvlist_t * 1223make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) 1224{ 1225 char *pathbuf; 1226 uint64_t vdev; 1227 nvlist_t *file; 1228 boolean_t draid_spare = B_FALSE; 1229 1230 pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 1231 1232 if (ashift == 0) 1233 ashift = ztest_get_ashift(); 1234 1235 if (path == NULL) { 1236 path = pathbuf; 1237 1238 if (aux != NULL) { 1239 vdev = ztest_shared->zs_vdev_aux; 1240 (void) snprintf(path, MAXPATHLEN, 1241 ztest_aux_template, ztest_opts.zo_dir, 1242 pool == NULL ? ztest_opts.zo_pool : pool, 1243 aux, vdev); 1244 } else { 1245 vdev = ztest_shared->zs_vdev_next_leaf++; 1246 (void) snprintf(path, MAXPATHLEN, 1247 ztest_dev_template, ztest_opts.zo_dir, 1248 pool == NULL ? 
ztest_opts.zo_pool : pool, vdev); 1249 } 1250 } else { 1251 draid_spare = ztest_is_draid_spare(path); 1252 } 1253 1254 if (size != 0 && !draid_spare) { 1255 int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); 1256 if (fd == -1) 1257 fatal(1, "can't open %s", path); 1258 if (ftruncate(fd, size) != 0) 1259 fatal(1, "can't ftruncate %s", path); 1260 (void) close(fd); 1261 } 1262 1263 file = fnvlist_alloc(); 1264 fnvlist_add_string(file, ZPOOL_CONFIG_TYPE, 1265 draid_spare ? VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE); 1266 fnvlist_add_string(file, ZPOOL_CONFIG_PATH, path); 1267 fnvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift); 1268 umem_free(pathbuf, MAXPATHLEN); 1269 1270 return (file); 1271} 1272 1273static nvlist_t * 1274make_vdev_raid(char *path, char *aux, char *pool, size_t size, 1275 uint64_t ashift, int r) 1276{ 1277 nvlist_t *raid, **child; 1278 int c; 1279 1280 if (r < 2) 1281 return (make_vdev_file(path, aux, pool, size, ashift)); 1282 child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL); 1283 1284 for (c = 0; c < r; c++) 1285 child[c] = make_vdev_file(path, aux, pool, size, ashift); 1286 1287 raid = fnvlist_alloc(); 1288 fnvlist_add_string(raid, ZPOOL_CONFIG_TYPE, 1289 ztest_opts.zo_raid_type); 1290 fnvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY, 1291 ztest_opts.zo_raid_parity); 1292 fnvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN, child, r); 1293 1294 if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) { 1295 uint64_t ndata = ztest_opts.zo_draid_data; 1296 uint64_t nparity = ztest_opts.zo_raid_parity; 1297 uint64_t nspares = ztest_opts.zo_draid_spares; 1298 uint64_t children = ztest_opts.zo_raid_children; 1299 uint64_t ngroups = 1; 1300 1301 /* 1302 * Calculate the minimum number of groups required to fill a 1303 * slice. This is the LCM of the stripe width (data + parity) 1304 * and the number of data drives (children - spares). 
1305 */ 1306 while (ngroups * (ndata + nparity) % (children - nspares) != 0) 1307 ngroups++; 1308 1309 /* Store the basic dRAID configuration. */ 1310 fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata); 1311 fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares); 1312 fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); 1313 } 1314 1315 for (c = 0; c < r; c++) 1316 fnvlist_free(child[c]); 1317 1318 umem_free(child, r * sizeof (nvlist_t *)); 1319 1320 return (raid); 1321} 1322 1323static nvlist_t * 1324make_vdev_mirror(char *path, char *aux, char *pool, size_t size, 1325 uint64_t ashift, int r, int m) 1326{ 1327 nvlist_t *mirror, **child; 1328 int c; 1329 1330 if (m < 1) 1331 return (make_vdev_raid(path, aux, pool, size, ashift, r)); 1332 1333 child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); 1334 1335 for (c = 0; c < m; c++) 1336 child[c] = make_vdev_raid(path, aux, pool, size, ashift, r); 1337 1338 mirror = fnvlist_alloc(); 1339 fnvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, VDEV_TYPE_MIRROR); 1340 fnvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN, child, m); 1341 1342 for (c = 0; c < m; c++) 1343 fnvlist_free(child[c]); 1344 1345 umem_free(child, m * sizeof (nvlist_t *)); 1346 1347 return (mirror); 1348} 1349 1350static nvlist_t * 1351make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift, 1352 const char *class, int r, int m, int t) 1353{ 1354 nvlist_t *root, **child; 1355 int c; 1356 boolean_t log; 1357 1358 ASSERT3S(t, >, 0); 1359 1360 log = (class != NULL && strcmp(class, "log") == 0); 1361 1362 child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); 1363 1364 for (c = 0; c < t; c++) { 1365 child[c] = make_vdev_mirror(path, aux, pool, size, ashift, 1366 r, m); 1367 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log); 1368 1369 if (class != NULL && class[0] != '\0') { 1370 ASSERT(m > 1 || log); /* expecting a mirror */ 1371 fnvlist_add_string(child[c], 1372 ZPOOL_CONFIG_ALLOCATION_BIAS, class); 
1373 } 1374 } 1375 1376 root = fnvlist_alloc(); 1377 fnvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 1378 fnvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN, 1379 child, t); 1380 1381 for (c = 0; c < t; c++) 1382 fnvlist_free(child[c]); 1383 1384 umem_free(child, t * sizeof (nvlist_t *)); 1385 1386 return (root); 1387} 1388 1389/* 1390 * Find a random spa version. Returns back a random spa version in the 1391 * range [initial_version, SPA_VERSION_FEATURES]. 1392 */ 1393static uint64_t 1394ztest_random_spa_version(uint64_t initial_version) 1395{ 1396 uint64_t version = initial_version; 1397 1398 if (version <= SPA_VERSION_BEFORE_FEATURES) { 1399 version = version + 1400 ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1); 1401 } 1402 1403 if (version > SPA_VERSION_BEFORE_FEATURES) 1404 version = SPA_VERSION_FEATURES; 1405 1406 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 1407 return (version); 1408} 1409 1410static int 1411ztest_random_blocksize(void) 1412{ 1413 ASSERT3U(ztest_spa->spa_max_ashift, !=, 0); 1414 1415 /* 1416 * Choose a block size >= the ashift. 1417 * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks. 1418 */ 1419 int maxbs = SPA_OLD_MAXBLOCKSHIFT; 1420 if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE) 1421 maxbs = 20; 1422 uint64_t block_shift = 1423 ztest_random(maxbs - ztest_spa->spa_max_ashift + 1); 1424 return (1 << (SPA_MINBLOCKSHIFT + block_shift)); 1425} 1426 1427static int 1428ztest_random_dnodesize(void) 1429{ 1430 int slots; 1431 int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT; 1432 1433 if (max_slots == DNODE_MIN_SLOTS) 1434 return (DNODE_MIN_SIZE); 1435 1436 /* 1437 * Weight the random distribution more heavily toward smaller 1438 * dnode sizes since that is more likely to reflect real-world 1439 * usage. 1440 */ 1441 ASSERT3U(max_slots, >, 4); 1442 switch (ztest_random(10)) { 1443 case 0: 1444 slots = 5 + ztest_random(max_slots - 4); 1445 break; 1446 case 1 ... 
4: 1447 slots = 2 + ztest_random(3); 1448 break; 1449 default: 1450 slots = 1; 1451 break; 1452 } 1453 1454 return (slots << DNODE_SHIFT); 1455} 1456 1457static int 1458ztest_random_ibshift(void) 1459{ 1460 return (DN_MIN_INDBLKSHIFT + 1461 ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1)); 1462} 1463 1464static uint64_t 1465ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) 1466{ 1467 uint64_t top; 1468 vdev_t *rvd = spa->spa_root_vdev; 1469 vdev_t *tvd; 1470 1471 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 1472 1473 do { 1474 top = ztest_random(rvd->vdev_children); 1475 tvd = rvd->vdev_child[top]; 1476 } while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) || 1477 tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL); 1478 1479 return (top); 1480} 1481 1482static uint64_t 1483ztest_random_dsl_prop(zfs_prop_t prop) 1484{ 1485 uint64_t value; 1486 1487 do { 1488 value = zfs_prop_random_value(prop, ztest_random(-1ULL)); 1489 } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF); 1490 1491 return (value); 1492} 1493 1494static int 1495ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, 1496 boolean_t inherit) 1497{ 1498 const char *propname = zfs_prop_to_name(prop); 1499 const char *valname; 1500 char *setpoint; 1501 uint64_t curval; 1502 int error; 1503 1504 error = dsl_prop_set_int(osname, propname, 1505 (inherit ? 
ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value); 1506 1507 if (error == ENOSPC) { 1508 ztest_record_enospc(FTAG); 1509 return (error); 1510 } 1511 ASSERT0(error); 1512 1513 setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 1514 VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint)); 1515 1516 if (ztest_opts.zo_verbose >= 6) { 1517 int err; 1518 1519 err = zfs_prop_index_to_string(prop, curval, &valname); 1520 if (err) 1521 (void) printf("%s %s = %llu at '%s'\n", osname, 1522 propname, (unsigned long long)curval, setpoint); 1523 else 1524 (void) printf("%s %s = %s at '%s'\n", 1525 osname, propname, valname, setpoint); 1526 } 1527 umem_free(setpoint, MAXPATHLEN); 1528 1529 return (error); 1530} 1531 1532static int 1533ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value) 1534{ 1535 spa_t *spa = ztest_spa; 1536 nvlist_t *props = NULL; 1537 int error; 1538 1539 props = fnvlist_alloc(); 1540 fnvlist_add_uint64(props, zpool_prop_to_name(prop), value); 1541 1542 error = spa_prop_set(spa, props); 1543 1544 fnvlist_free(props); 1545 1546 if (error == ENOSPC) { 1547 ztest_record_enospc(FTAG); 1548 return (error); 1549 } 1550 ASSERT0(error); 1551 1552 return (error); 1553} 1554 1555static int 1556ztest_dmu_objset_own(const char *name, dmu_objset_type_t type, 1557 boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp) 1558{ 1559 int err; 1560 char *cp = NULL; 1561 char ddname[ZFS_MAX_DATASET_NAME_LEN]; 1562 1563 strcpy(ddname, name); 1564 cp = strchr(ddname, '@'); 1565 if (cp != NULL) 1566 *cp = '\0'; 1567 1568 err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); 1569 while (decrypt && err == EACCES) { 1570 dsl_crypto_params_t *dcp; 1571 nvlist_t *crypto_args = fnvlist_alloc(); 1572 1573 fnvlist_add_uint8_array(crypto_args, "wkeydata", 1574 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 1575 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, 1576 crypto_args, &dcp)); 1577 err = spa_keystore_load_wkey(ddname, dcp, B_FALSE); 1578 /* 1579 * 
Note: if there was an error loading, the wkey was not 1580 * consumed, and needs to be freed. 1581 */ 1582 dsl_crypto_params_free(dcp, (err != 0)); 1583 fnvlist_free(crypto_args); 1584 1585 if (err == EINVAL) { 1586 /* 1587 * We couldn't load a key for this dataset so try 1588 * the parent. This loop will eventually hit the 1589 * encryption root since ztest only makes clones 1590 * as children of their origin datasets. 1591 */ 1592 cp = strrchr(ddname, '/'); 1593 if (cp == NULL) 1594 return (err); 1595 1596 *cp = '\0'; 1597 err = EACCES; 1598 continue; 1599 } else if (err != 0) { 1600 break; 1601 } 1602 1603 err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); 1604 break; 1605 } 1606 1607 return (err); 1608} 1609 1610static void 1611ztest_rll_init(rll_t *rll) 1612{ 1613 rll->rll_writer = NULL; 1614 rll->rll_readers = 0; 1615 mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL); 1616 cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL); 1617} 1618 1619static void 1620ztest_rll_destroy(rll_t *rll) 1621{ 1622 ASSERT3P(rll->rll_writer, ==, NULL); 1623 ASSERT0(rll->rll_readers); 1624 mutex_destroy(&rll->rll_lock); 1625 cv_destroy(&rll->rll_cv); 1626} 1627 1628static void 1629ztest_rll_lock(rll_t *rll, rl_type_t type) 1630{ 1631 mutex_enter(&rll->rll_lock); 1632 1633 if (type == RL_READER) { 1634 while (rll->rll_writer != NULL) 1635 (void) cv_wait(&rll->rll_cv, &rll->rll_lock); 1636 rll->rll_readers++; 1637 } else { 1638 while (rll->rll_writer != NULL || rll->rll_readers) 1639 (void) cv_wait(&rll->rll_cv, &rll->rll_lock); 1640 rll->rll_writer = curthread; 1641 } 1642 1643 mutex_exit(&rll->rll_lock); 1644} 1645 1646static void 1647ztest_rll_unlock(rll_t *rll) 1648{ 1649 mutex_enter(&rll->rll_lock); 1650 1651 if (rll->rll_writer) { 1652 ASSERT0(rll->rll_readers); 1653 rll->rll_writer = NULL; 1654 } else { 1655 ASSERT3S(rll->rll_readers, >, 0); 1656 ASSERT3P(rll->rll_writer, ==, NULL); 1657 rll->rll_readers--; 1658 } 1659 1660 if (rll->rll_writer == NULL && 
rll->rll_readers == 0) 1661 cv_broadcast(&rll->rll_cv); 1662 1663 mutex_exit(&rll->rll_lock); 1664} 1665 1666static void 1667ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type) 1668{ 1669 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; 1670 1671 ztest_rll_lock(rll, type); 1672} 1673 1674static void 1675ztest_object_unlock(ztest_ds_t *zd, uint64_t object) 1676{ 1677 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; 1678 1679 ztest_rll_unlock(rll); 1680} 1681 1682static rl_t * 1683ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, 1684 uint64_t size, rl_type_t type) 1685{ 1686 uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); 1687 rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; 1688 rl_t *rl; 1689 1690 rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); 1691 rl->rl_object = object; 1692 rl->rl_offset = offset; 1693 rl->rl_size = size; 1694 rl->rl_lock = rll; 1695 1696 ztest_rll_lock(rll, type); 1697 1698 return (rl); 1699} 1700 1701static void 1702ztest_range_unlock(rl_t *rl) 1703{ 1704 rll_t *rll = rl->rl_lock; 1705 1706 ztest_rll_unlock(rll); 1707 1708 umem_free(rl, sizeof (*rl)); 1709} 1710 1711static void 1712ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os) 1713{ 1714 zd->zd_os = os; 1715 zd->zd_zilog = dmu_objset_zil(os); 1716 zd->zd_shared = szd; 1717 dmu_objset_name(os, zd->zd_name); 1718 int l; 1719 1720 if (zd->zd_shared != NULL) 1721 zd->zd_shared->zd_seq = 0; 1722 1723 VERIFY0(pthread_rwlock_init(&zd->zd_zilog_lock, NULL)); 1724 mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL); 1725 1726 for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) 1727 ztest_rll_init(&zd->zd_object_lock[l]); 1728 1729 for (l = 0; l < ZTEST_RANGE_LOCKS; l++) 1730 ztest_rll_init(&zd->zd_range_lock[l]); 1731} 1732 1733static void 1734ztest_zd_fini(ztest_ds_t *zd) 1735{ 1736 int l; 1737 1738 mutex_destroy(&zd->zd_dirobj_lock); 1739 (void) 
pthread_rwlock_destroy(&zd->zd_zilog_lock); 1740 1741 for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) 1742 ztest_rll_destroy(&zd->zd_object_lock[l]); 1743 1744 for (l = 0; l < ZTEST_RANGE_LOCKS; l++) 1745 ztest_rll_destroy(&zd->zd_range_lock[l]); 1746} 1747 1748#define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT) 1749 1750static uint64_t 1751ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) 1752{ 1753 uint64_t txg; 1754 int error; 1755 1756 /* 1757 * Attempt to assign tx to some transaction group. 1758 */ 1759 error = dmu_tx_assign(tx, txg_how); 1760 if (error) { 1761 if (error == ERESTART) { 1762 ASSERT3U(txg_how, ==, TXG_NOWAIT); 1763 dmu_tx_wait(tx); 1764 } else { 1765 ASSERT3U(error, ==, ENOSPC); 1766 ztest_record_enospc(tag); 1767 } 1768 dmu_tx_abort(tx); 1769 return (0); 1770 } 1771 txg = dmu_tx_get_txg(tx); 1772 ASSERT3U(txg, !=, 0); 1773 return (txg); 1774} 1775 1776static void 1777ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, 1778 uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, 1779 uint64_t crtxg) 1780{ 1781 bt->bt_magic = BT_MAGIC; 1782 bt->bt_objset = dmu_objset_id(os); 1783 bt->bt_object = object; 1784 bt->bt_dnodesize = dnodesize; 1785 bt->bt_offset = offset; 1786 bt->bt_gen = gen; 1787 bt->bt_txg = txg; 1788 bt->bt_crtxg = crtxg; 1789} 1790 1791static void 1792ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, 1793 uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, 1794 uint64_t crtxg) 1795{ 1796 ASSERT3U(bt->bt_magic, ==, BT_MAGIC); 1797 ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); 1798 ASSERT3U(bt->bt_object, ==, object); 1799 ASSERT3U(bt->bt_dnodesize, ==, dnodesize); 1800 ASSERT3U(bt->bt_offset, ==, offset); 1801 ASSERT3U(bt->bt_gen, <=, gen); 1802 ASSERT3U(bt->bt_txg, <=, txg); 1803 ASSERT3U(bt->bt_crtxg, ==, crtxg); 1804} 1805 1806static ztest_block_tag_t * 1807ztest_bt_bonus(dmu_buf_t *db) 1808{ 1809 dmu_object_info_t doi; 1810 
ztest_block_tag_t *bt; 1811 1812 dmu_object_info_from_db(db, &doi); 1813 ASSERT3U(doi.doi_bonus_size, <=, db->db_size); 1814 ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); 1815 bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); 1816 1817 return (bt); 1818} 1819 1820/* 1821 * Generate a token to fill up unused bonus buffer space. Try to make 1822 * it unique to the object, generation, and offset to verify that data 1823 * is not getting overwritten by data from other dnodes. 1824 */ 1825#define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \ 1826 (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset)) 1827 1828/* 1829 * Fill up the unused bonus buffer region before the block tag with a 1830 * verifiable pattern. Filling the whole bonus area with non-zero data 1831 * helps ensure that all dnode traversal code properly skips the 1832 * interior regions of large dnodes. 1833 */ 1834static void 1835ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1836 objset_t *os, uint64_t gen) 1837{ 1838 uint64_t *bonusp; 1839 1840 ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8)); 1841 1842 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1843 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1844 gen, bonusp - (uint64_t *)db->db_data); 1845 *bonusp = token; 1846 } 1847} 1848 1849/* 1850 * Verify that the unused area of a bonus buffer is filled with the 1851 * expected tokens. 
1852 */ 1853static void 1854ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1855 objset_t *os, uint64_t gen) 1856{ 1857 uint64_t *bonusp; 1858 1859 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1860 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1861 gen, bonusp - (uint64_t *)db->db_data); 1862 VERIFY3U(*bonusp, ==, token); 1863 } 1864} 1865 1866/* 1867 * ZIL logging ops 1868 */ 1869 1870#define lrz_type lr_mode 1871#define lrz_blocksize lr_uid 1872#define lrz_ibshift lr_gid 1873#define lrz_bonustype lr_rdev 1874#define lrz_dnodesize lr_crtime[1] 1875 1876static void 1877ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) 1878{ 1879 char *name = (void *)(lr + 1); /* name follows lr */ 1880 size_t namesize = strlen(name) + 1; 1881 itx_t *itx; 1882 1883 if (zil_replaying(zd->zd_zilog, tx)) 1884 return; 1885 1886 itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); 1887 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1888 sizeof (*lr) + namesize - sizeof (lr_t)); 1889 1890 zil_itx_assign(zd->zd_zilog, itx, tx); 1891} 1892 1893static void 1894ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) 1895{ 1896 char *name = (void *)(lr + 1); /* name follows lr */ 1897 size_t namesize = strlen(name) + 1; 1898 itx_t *itx; 1899 1900 if (zil_replaying(zd->zd_zilog, tx)) 1901 return; 1902 1903 itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); 1904 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1905 sizeof (*lr) + namesize - sizeof (lr_t)); 1906 1907 itx->itx_oid = object; 1908 zil_itx_assign(zd->zd_zilog, itx, tx); 1909} 1910 1911static void 1912ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) 1913{ 1914 itx_t *itx; 1915 itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); 1916 1917 if (zil_replaying(zd->zd_zilog, tx)) 1918 return; 1919 1920 if (lr->lr_length > zil_max_log_data(zd->zd_zilog)) 1921 write_state = WR_INDIRECT; 1922 1923 itx = zil_itx_create(TX_WRITE, 
1924 sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0)); 1925 1926 if (write_state == WR_COPIED && 1927 dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, 1928 ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { 1929 zil_itx_destroy(itx); 1930 itx = zil_itx_create(TX_WRITE, sizeof (*lr)); 1931 write_state = WR_NEED_COPY; 1932 } 1933 itx->itx_private = zd; 1934 itx->itx_wr_state = write_state; 1935 itx->itx_sync = (ztest_random(8) == 0); 1936 1937 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1938 sizeof (*lr) - sizeof (lr_t)); 1939 1940 zil_itx_assign(zd->zd_zilog, itx, tx); 1941} 1942 1943static void 1944ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) 1945{ 1946 itx_t *itx; 1947 1948 if (zil_replaying(zd->zd_zilog, tx)) 1949 return; 1950 1951 itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); 1952 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1953 sizeof (*lr) - sizeof (lr_t)); 1954 1955 itx->itx_sync = B_FALSE; 1956 zil_itx_assign(zd->zd_zilog, itx, tx); 1957} 1958 1959static void 1960ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) 1961{ 1962 itx_t *itx; 1963 1964 if (zil_replaying(zd->zd_zilog, tx)) 1965 return; 1966 1967 itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); 1968 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1969 sizeof (*lr) - sizeof (lr_t)); 1970 1971 itx->itx_sync = B_FALSE; 1972 zil_itx_assign(zd->zd_zilog, itx, tx); 1973} 1974 1975/* 1976 * ZIL replay ops 1977 */ 1978static int 1979ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) 1980{ 1981 ztest_ds_t *zd = arg1; 1982 lr_create_t *lr = arg2; 1983 char *name = (void *)(lr + 1); /* name follows lr */ 1984 objset_t *os = zd->zd_os; 1985 ztest_block_tag_t *bbt; 1986 dmu_buf_t *db; 1987 dmu_tx_t *tx; 1988 uint64_t txg; 1989 int error = 0; 1990 int bonuslen; 1991 1992 if (byteswap) 1993 byteswap_uint64_array(lr, sizeof (*lr)); 1994 1995 ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 1996 ASSERT3S(name[0], !=, '\0'); 1997 1998 tx 
= dmu_tx_create(os); 1999 2000 dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); 2001 2002 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 2003 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 2004 } else { 2005 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 2006 } 2007 2008 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2009 if (txg == 0) 2010 return (ENOSPC); 2011 2012 ASSERT3U(dmu_objset_zil(os)->zl_replay, ==, !!lr->lr_foid); 2013 bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); 2014 2015 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 2016 if (lr->lr_foid == 0) { 2017 lr->lr_foid = zap_create_dnsize(os, 2018 lr->lrz_type, lr->lrz_bonustype, 2019 bonuslen, lr->lrz_dnodesize, tx); 2020 } else { 2021 error = zap_create_claim_dnsize(os, lr->lr_foid, 2022 lr->lrz_type, lr->lrz_bonustype, 2023 bonuslen, lr->lrz_dnodesize, tx); 2024 } 2025 } else { 2026 if (lr->lr_foid == 0) { 2027 lr->lr_foid = dmu_object_alloc_dnsize(os, 2028 lr->lrz_type, 0, lr->lrz_bonustype, 2029 bonuslen, lr->lrz_dnodesize, tx); 2030 } else { 2031 error = dmu_object_claim_dnsize(os, lr->lr_foid, 2032 lr->lrz_type, 0, lr->lrz_bonustype, 2033 bonuslen, lr->lrz_dnodesize, tx); 2034 } 2035 } 2036 2037 if (error) { 2038 ASSERT3U(error, ==, EEXIST); 2039 ASSERT(zd->zd_zilog->zl_replay); 2040 dmu_tx_commit(tx); 2041 return (error); 2042 } 2043 2044 ASSERT3U(lr->lr_foid, !=, 0); 2045 2046 if (lr->lrz_type != DMU_OT_ZAP_OTHER) 2047 VERIFY0(dmu_object_set_blocksize(os, lr->lr_foid, 2048 lr->lrz_blocksize, lr->lrz_ibshift, tx)); 2049 2050 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2051 bbt = ztest_bt_bonus(db); 2052 dmu_buf_will_dirty(db, tx); 2053 ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, 2054 lr->lr_gen, txg, txg); 2055 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); 2056 dmu_buf_rele(db, FTAG); 2057 2058 VERIFY0(zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, 2059 &lr->lr_foid, tx)); 2060 2061 (void) ztest_log_create(zd, tx, lr); 2062 2063 dmu_tx_commit(tx); 2064 2065 return (0); 
2066} 2067 2068static int 2069ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) 2070{ 2071 ztest_ds_t *zd = arg1; 2072 lr_remove_t *lr = arg2; 2073 char *name = (void *)(lr + 1); /* name follows lr */ 2074 objset_t *os = zd->zd_os; 2075 dmu_object_info_t doi; 2076 dmu_tx_t *tx; 2077 uint64_t object, txg; 2078 2079 if (byteswap) 2080 byteswap_uint64_array(lr, sizeof (*lr)); 2081 2082 ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); 2083 ASSERT3S(name[0], !=, '\0'); 2084 2085 VERIFY0( 2086 zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); 2087 ASSERT3U(object, !=, 0); 2088 2089 ztest_object_lock(zd, object, RL_WRITER); 2090 2091 VERIFY0(dmu_object_info(os, object, &doi)); 2092 2093 tx = dmu_tx_create(os); 2094 2095 dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); 2096 dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); 2097 2098 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2099 if (txg == 0) { 2100 ztest_object_unlock(zd, object); 2101 return (ENOSPC); 2102 } 2103 2104 if (doi.doi_type == DMU_OT_ZAP_OTHER) { 2105 VERIFY0(zap_destroy(os, object, tx)); 2106 } else { 2107 VERIFY0(dmu_object_free(os, object, tx)); 2108 } 2109 2110 VERIFY0(zap_remove(os, lr->lr_doid, name, tx)); 2111 2112 (void) ztest_log_remove(zd, tx, lr, object); 2113 2114 dmu_tx_commit(tx); 2115 2116 ztest_object_unlock(zd, object); 2117 2118 return (0); 2119} 2120 2121static int 2122ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) 2123{ 2124 ztest_ds_t *zd = arg1; 2125 lr_write_t *lr = arg2; 2126 objset_t *os = zd->zd_os; 2127 void *data = lr + 1; /* data follows lr */ 2128 uint64_t offset, length; 2129 ztest_block_tag_t *bt = data; 2130 ztest_block_tag_t *bbt; 2131 uint64_t gen, txg, lrtxg, crtxg; 2132 dmu_object_info_t doi; 2133 dmu_tx_t *tx; 2134 dmu_buf_t *db; 2135 arc_buf_t *abuf = NULL; 2136 rl_t *rl; 2137 2138 if (byteswap) 2139 byteswap_uint64_array(lr, sizeof (*lr)); 2140 2141 offset = lr->lr_offset; 2142 length = lr->lr_length; 2143 2144 /* If it's a dmu_sync() 
block, write the whole block */ 2145 if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { 2146 uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); 2147 if (length < blocksize) { 2148 offset -= offset % blocksize; 2149 length = blocksize; 2150 } 2151 } 2152 2153 if (bt->bt_magic == BSWAP_64(BT_MAGIC)) 2154 byteswap_uint64_array(bt, sizeof (*bt)); 2155 2156 if (bt->bt_magic != BT_MAGIC) 2157 bt = NULL; 2158 2159 ztest_object_lock(zd, lr->lr_foid, RL_READER); 2160 rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); 2161 2162 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2163 2164 dmu_object_info_from_db(db, &doi); 2165 2166 bbt = ztest_bt_bonus(db); 2167 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2168 gen = bbt->bt_gen; 2169 crtxg = bbt->bt_crtxg; 2170 lrtxg = lr->lr_common.lrc_txg; 2171 2172 tx = dmu_tx_create(os); 2173 2174 dmu_tx_hold_write(tx, lr->lr_foid, offset, length); 2175 2176 if (ztest_random(8) == 0 && length == doi.doi_data_block_size && 2177 P2PHASE(offset, length) == 0) 2178 abuf = dmu_request_arcbuf(db, length); 2179 2180 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2181 if (txg == 0) { 2182 if (abuf != NULL) 2183 dmu_return_arcbuf(abuf); 2184 dmu_buf_rele(db, FTAG); 2185 ztest_range_unlock(rl); 2186 ztest_object_unlock(zd, lr->lr_foid); 2187 return (ENOSPC); 2188 } 2189 2190 if (bt != NULL) { 2191 /* 2192 * Usually, verify the old data before writing new data -- 2193 * but not always, because we also want to verify correct 2194 * behavior when the data was not recently read into cache. 2195 */ 2196 ASSERT0(offset % doi.doi_data_block_size); 2197 if (ztest_random(4) != 0) { 2198 int prefetch = ztest_random(2) ? 
2199 DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; 2200 ztest_block_tag_t rbt; 2201 2202 VERIFY(dmu_read(os, lr->lr_foid, offset, 2203 sizeof (rbt), &rbt, prefetch) == 0); 2204 if (rbt.bt_magic == BT_MAGIC) { 2205 ztest_bt_verify(&rbt, os, lr->lr_foid, 0, 2206 offset, gen, txg, crtxg); 2207 } 2208 } 2209 2210 /* 2211 * Writes can appear to be newer than the bonus buffer because 2212 * the ztest_get_data() callback does a dmu_read() of the 2213 * open-context data, which may be different than the data 2214 * as it was when the write was generated. 2215 */ 2216 if (zd->zd_zilog->zl_replay) { 2217 ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, 2218 MAX(gen, bt->bt_gen), MAX(txg, lrtxg), 2219 bt->bt_crtxg); 2220 } 2221 2222 /* 2223 * Set the bt's gen/txg to the bonus buffer's gen/txg 2224 * so that all of the usual ASSERTs will work. 2225 */ 2226 ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, 2227 crtxg); 2228 } 2229 2230 if (abuf == NULL) { 2231 dmu_write(os, lr->lr_foid, offset, length, data, tx); 2232 } else { 2233 bcopy(data, abuf->b_data, length); 2234 dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx); 2235 } 2236 2237 (void) ztest_log_write(zd, tx, lr); 2238 2239 dmu_buf_rele(db, FTAG); 2240 2241 dmu_tx_commit(tx); 2242 2243 ztest_range_unlock(rl); 2244 ztest_object_unlock(zd, lr->lr_foid); 2245 2246 return (0); 2247} 2248 2249static int 2250ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) 2251{ 2252 ztest_ds_t *zd = arg1; 2253 lr_truncate_t *lr = arg2; 2254 objset_t *os = zd->zd_os; 2255 dmu_tx_t *tx; 2256 uint64_t txg; 2257 rl_t *rl; 2258 2259 if (byteswap) 2260 byteswap_uint64_array(lr, sizeof (*lr)); 2261 2262 ztest_object_lock(zd, lr->lr_foid, RL_READER); 2263 rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, 2264 RL_WRITER); 2265 2266 tx = dmu_tx_create(os); 2267 2268 dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); 2269 2270 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2271 if (txg == 0) { 2272 
ztest_range_unlock(rl); 2273 ztest_object_unlock(zd, lr->lr_foid); 2274 return (ENOSPC); 2275 } 2276 2277 VERIFY0(dmu_free_range(os, lr->lr_foid, lr->lr_offset, 2278 lr->lr_length, tx)); 2279 2280 (void) ztest_log_truncate(zd, tx, lr); 2281 2282 dmu_tx_commit(tx); 2283 2284 ztest_range_unlock(rl); 2285 ztest_object_unlock(zd, lr->lr_foid); 2286 2287 return (0); 2288} 2289 2290static int 2291ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) 2292{ 2293 ztest_ds_t *zd = arg1; 2294 lr_setattr_t *lr = arg2; 2295 objset_t *os = zd->zd_os; 2296 dmu_tx_t *tx; 2297 dmu_buf_t *db; 2298 ztest_block_tag_t *bbt; 2299 uint64_t txg, lrtxg, crtxg, dnodesize; 2300 2301 if (byteswap) 2302 byteswap_uint64_array(lr, sizeof (*lr)); 2303 2304 ztest_object_lock(zd, lr->lr_foid, RL_WRITER); 2305 2306 VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 2307 2308 tx = dmu_tx_create(os); 2309 dmu_tx_hold_bonus(tx, lr->lr_foid); 2310 2311 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2312 if (txg == 0) { 2313 dmu_buf_rele(db, FTAG); 2314 ztest_object_unlock(zd, lr->lr_foid); 2315 return (ENOSPC); 2316 } 2317 2318 bbt = ztest_bt_bonus(db); 2319 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2320 crtxg = bbt->bt_crtxg; 2321 lrtxg = lr->lr_common.lrc_txg; 2322 dnodesize = bbt->bt_dnodesize; 2323 2324 if (zd->zd_zilog->zl_replay) { 2325 ASSERT3U(lr->lr_size, !=, 0); 2326 ASSERT3U(lr->lr_mode, !=, 0); 2327 ASSERT3U(lrtxg, !=, 0); 2328 } else { 2329 /* 2330 * Randomly change the size and increment the generation. 2331 */ 2332 lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * 2333 sizeof (*bbt); 2334 lr->lr_mode = bbt->bt_gen + 1; 2335 ASSERT0(lrtxg); 2336 } 2337 2338 /* 2339 * Verify that the current bonus buffer is not newer than our txg. 
2340 */ 2341 ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2342 MAX(txg, lrtxg), crtxg); 2343 2344 dmu_buf_will_dirty(db, tx); 2345 2346 ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); 2347 ASSERT3U(lr->lr_size, <=, db->db_size); 2348 VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); 2349 bbt = ztest_bt_bonus(db); 2350 2351 ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 2352 txg, crtxg); 2353 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); 2354 dmu_buf_rele(db, FTAG); 2355 2356 (void) ztest_log_setattr(zd, tx, lr); 2357 2358 dmu_tx_commit(tx); 2359 2360 ztest_object_unlock(zd, lr->lr_foid); 2361 2362 return (0); 2363} 2364 2365zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { 2366 NULL, /* 0 no such transaction type */ 2367 ztest_replay_create, /* TX_CREATE */ 2368 NULL, /* TX_MKDIR */ 2369 NULL, /* TX_MKXATTR */ 2370 NULL, /* TX_SYMLINK */ 2371 ztest_replay_remove, /* TX_REMOVE */ 2372 NULL, /* TX_RMDIR */ 2373 NULL, /* TX_LINK */ 2374 NULL, /* TX_RENAME */ 2375 ztest_replay_write, /* TX_WRITE */ 2376 ztest_replay_truncate, /* TX_TRUNCATE */ 2377 ztest_replay_setattr, /* TX_SETATTR */ 2378 NULL, /* TX_ACL */ 2379 NULL, /* TX_CREATE_ACL */ 2380 NULL, /* TX_CREATE_ATTR */ 2381 NULL, /* TX_CREATE_ACL_ATTR */ 2382 NULL, /* TX_MKDIR_ACL */ 2383 NULL, /* TX_MKDIR_ATTR */ 2384 NULL, /* TX_MKDIR_ACL_ATTR */ 2385 NULL, /* TX_WRITE2 */ 2386}; 2387 2388/* 2389 * ZIL get_data callbacks 2390 */ 2391 2392/* ARGSUSED */ 2393static void 2394ztest_get_done(zgd_t *zgd, int error) 2395{ 2396 ztest_ds_t *zd = zgd->zgd_private; 2397 uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; 2398 2399 if (zgd->zgd_db) 2400 dmu_buf_rele(zgd->zgd_db, zgd); 2401 2402 ztest_range_unlock((rl_t *)zgd->zgd_lr); 2403 ztest_object_unlock(zd, object); 2404 2405 umem_free(zgd, sizeof (*zgd)); 2406} 2407 2408static int 2409ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, 2410 struct lwb *lwb, zio_t *zio) 2411{ 2412 ztest_ds_t 
*zd = arg; 2413 objset_t *os = zd->zd_os; 2414 uint64_t object = lr->lr_foid; 2415 uint64_t offset = lr->lr_offset; 2416 uint64_t size = lr->lr_length; 2417 uint64_t txg = lr->lr_common.lrc_txg; 2418 uint64_t crtxg; 2419 dmu_object_info_t doi; 2420 dmu_buf_t *db; 2421 zgd_t *zgd; 2422 int error; 2423 2424 ASSERT3P(lwb, !=, NULL); 2425 ASSERT3P(zio, !=, NULL); 2426 ASSERT3U(size, !=, 0); 2427 2428 ztest_object_lock(zd, object, RL_READER); 2429 error = dmu_bonus_hold(os, object, FTAG, &db); 2430 if (error) { 2431 ztest_object_unlock(zd, object); 2432 return (error); 2433 } 2434 2435 crtxg = ztest_bt_bonus(db)->bt_crtxg; 2436 2437 if (crtxg == 0 || crtxg > txg) { 2438 dmu_buf_rele(db, FTAG); 2439 ztest_object_unlock(zd, object); 2440 return (ENOENT); 2441 } 2442 2443 dmu_object_info_from_db(db, &doi); 2444 dmu_buf_rele(db, FTAG); 2445 db = NULL; 2446 2447 zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); 2448 zgd->zgd_lwb = lwb; 2449 zgd->zgd_private = zd; 2450 2451 if (buf != NULL) { /* immediate write */ 2452 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2453 object, offset, size, RL_READER); 2454 2455 error = dmu_read(os, object, offset, size, buf, 2456 DMU_READ_NO_PREFETCH); 2457 ASSERT0(error); 2458 } else { 2459 size = doi.doi_data_block_size; 2460 if (ISP2(size)) { 2461 offset = P2ALIGN(offset, size); 2462 } else { 2463 ASSERT3U(offset, <, size); 2464 offset = 0; 2465 } 2466 2467 zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, 2468 object, offset, size, RL_READER); 2469 2470 error = dmu_buf_hold(os, object, offset, zgd, &db, 2471 DMU_READ_NO_PREFETCH); 2472 2473 if (error == 0) { 2474 blkptr_t *bp = &lr->lr_blkptr; 2475 2476 zgd->zgd_db = db; 2477 zgd->zgd_bp = bp; 2478 2479 ASSERT3U(db->db_offset, ==, offset); 2480 ASSERT3U(db->db_size, ==, size); 2481 2482 error = dmu_sync(zio, lr->lr_common.lrc_txg, 2483 ztest_get_done, zgd); 2484 2485 if (error == 0) 2486 return (0); 2487 } 2488 } 2489 2490 ztest_get_done(zgd, error); 2491 2492 
return (error); 2493} 2494 2495static void * 2496ztest_lr_alloc(size_t lrsize, char *name) 2497{ 2498 char *lr; 2499 size_t namesize = name ? strlen(name) + 1 : 0; 2500 2501 lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); 2502 2503 if (name) 2504 bcopy(name, lr + lrsize, namesize); 2505 2506 return (lr); 2507} 2508 2509static void 2510ztest_lr_free(void *lr, size_t lrsize, char *name) 2511{ 2512 size_t namesize = name ? strlen(name) + 1 : 0; 2513 2514 umem_free(lr, lrsize + namesize); 2515} 2516 2517/* 2518 * Lookup a bunch of objects. Returns the number of objects not found. 2519 */ 2520static int 2521ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) 2522{ 2523 int missing = 0; 2524 int error; 2525 int i; 2526 2527 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2528 2529 for (i = 0; i < count; i++, od++) { 2530 od->od_object = 0; 2531 error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, 2532 sizeof (uint64_t), 1, &od->od_object); 2533 if (error) { 2534 ASSERT3S(error, ==, ENOENT); 2535 ASSERT0(od->od_object); 2536 missing++; 2537 } else { 2538 dmu_buf_t *db; 2539 ztest_block_tag_t *bbt; 2540 dmu_object_info_t doi; 2541 2542 ASSERT3U(od->od_object, !=, 0); 2543 ASSERT0(missing); /* there should be no gaps */ 2544 2545 ztest_object_lock(zd, od->od_object, RL_READER); 2546 VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object, 2547 FTAG, &db)); 2548 dmu_object_info_from_db(db, &doi); 2549 bbt = ztest_bt_bonus(db); 2550 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2551 od->od_type = doi.doi_type; 2552 od->od_blocksize = doi.doi_data_block_size; 2553 od->od_gen = bbt->bt_gen; 2554 dmu_buf_rele(db, FTAG); 2555 ztest_object_unlock(zd, od->od_object); 2556 } 2557 } 2558 2559 return (missing); 2560} 2561 2562static int 2563ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) 2564{ 2565 int missing = 0; 2566 int i; 2567 2568 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2569 2570 for (i = 0; i < count; i++, od++) { 2571 if (missing) { 2572 od->od_object = 0; 2573 missing++; 2574 
continue; 2575 } 2576 2577 lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2578 2579 lr->lr_doid = od->od_dir; 2580 lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ 2581 lr->lrz_type = od->od_crtype; 2582 lr->lrz_blocksize = od->od_crblocksize; 2583 lr->lrz_ibshift = ztest_random_ibshift(); 2584 lr->lrz_bonustype = DMU_OT_UINT64_OTHER; 2585 lr->lrz_dnodesize = od->od_crdnodesize; 2586 lr->lr_gen = od->od_crgen; 2587 lr->lr_crtime[0] = time(NULL); 2588 2589 if (ztest_replay_create(zd, lr, B_FALSE) != 0) { 2590 ASSERT0(missing); 2591 od->od_object = 0; 2592 missing++; 2593 } else { 2594 od->od_object = lr->lr_foid; 2595 od->od_type = od->od_crtype; 2596 od->od_blocksize = od->od_crblocksize; 2597 od->od_gen = od->od_crgen; 2598 ASSERT3U(od->od_object, !=, 0); 2599 } 2600 2601 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2602 } 2603 2604 return (missing); 2605} 2606 2607static int 2608ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) 2609{ 2610 int missing = 0; 2611 int error; 2612 int i; 2613 2614 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2615 2616 od += count - 1; 2617 2618 for (i = count - 1; i >= 0; i--, od--) { 2619 if (missing) { 2620 missing++; 2621 continue; 2622 } 2623 2624 /* 2625 * No object was found. 
2626 */ 2627 if (od->od_object == 0) 2628 continue; 2629 2630 lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2631 2632 lr->lr_doid = od->od_dir; 2633 2634 if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { 2635 ASSERT3U(error, ==, ENOSPC); 2636 missing++; 2637 } else { 2638 od->od_object = 0; 2639 } 2640 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2641 } 2642 2643 return (missing); 2644} 2645 2646static int 2647ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, 2648 void *data) 2649{ 2650 lr_write_t *lr; 2651 int error; 2652 2653 lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); 2654 2655 lr->lr_foid = object; 2656 lr->lr_offset = offset; 2657 lr->lr_length = size; 2658 lr->lr_blkoff = 0; 2659 BP_ZERO(&lr->lr_blkptr); 2660 2661 bcopy(data, lr + 1, size); 2662 2663 error = ztest_replay_write(zd, lr, B_FALSE); 2664 2665 ztest_lr_free(lr, sizeof (*lr) + size, NULL); 2666 2667 return (error); 2668} 2669 2670static int 2671ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2672{ 2673 lr_truncate_t *lr; 2674 int error; 2675 2676 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2677 2678 lr->lr_foid = object; 2679 lr->lr_offset = offset; 2680 lr->lr_length = size; 2681 2682 error = ztest_replay_truncate(zd, lr, B_FALSE); 2683 2684 ztest_lr_free(lr, sizeof (*lr), NULL); 2685 2686 return (error); 2687} 2688 2689static int 2690ztest_setattr(ztest_ds_t *zd, uint64_t object) 2691{ 2692 lr_setattr_t *lr; 2693 int error; 2694 2695 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2696 2697 lr->lr_foid = object; 2698 lr->lr_size = 0; 2699 lr->lr_mode = 0; 2700 2701 error = ztest_replay_setattr(zd, lr, B_FALSE); 2702 2703 ztest_lr_free(lr, sizeof (*lr), NULL); 2704 2705 return (error); 2706} 2707 2708static void 2709ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2710{ 2711 objset_t *os = zd->zd_os; 2712 dmu_tx_t *tx; 2713 uint64_t txg; 2714 rl_t *rl; 2715 2716 
txg_wait_synced(dmu_objset_pool(os), 0); 2717 2718 ztest_object_lock(zd, object, RL_READER); 2719 rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); 2720 2721 tx = dmu_tx_create(os); 2722 2723 dmu_tx_hold_write(tx, object, offset, size); 2724 2725 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2726 2727 if (txg != 0) { 2728 dmu_prealloc(os, object, offset, size, tx); 2729 dmu_tx_commit(tx); 2730 txg_wait_synced(dmu_objset_pool(os), txg); 2731 } else { 2732 (void) dmu_free_long_range(os, object, offset, size); 2733 } 2734 2735 ztest_range_unlock(rl); 2736 ztest_object_unlock(zd, object); 2737} 2738 2739static void 2740ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) 2741{ 2742 int err; 2743 ztest_block_tag_t wbt; 2744 dmu_object_info_t doi; 2745 enum ztest_io_type io_type; 2746 uint64_t blocksize; 2747 void *data; 2748 2749 VERIFY0(dmu_object_info(zd->zd_os, object, &doi)); 2750 blocksize = doi.doi_data_block_size; 2751 data = umem_alloc(blocksize, UMEM_NOFAIL); 2752 2753 /* 2754 * Pick an i/o type at random, biased toward writing block tags. 2755 */ 2756 io_type = ztest_random(ZTEST_IO_TYPES); 2757 if (ztest_random(2) == 0) 2758 io_type = ZTEST_IO_WRITE_TAG; 2759 2760 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2761 2762 switch (io_type) { 2763 2764 case ZTEST_IO_WRITE_TAG: 2765 ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, 2766 offset, 0, 0, 0); 2767 (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); 2768 break; 2769 2770 case ZTEST_IO_WRITE_PATTERN: 2771 (void) memset(data, 'a' + (object + offset) % 5, blocksize); 2772 if (ztest_random(2) == 0) { 2773 /* 2774 * Induce fletcher2 collisions to ensure that 2775 * zio_ddt_collision() detects and resolves them 2776 * when using fletcher2-verify for deduplication. 
2777 */ 2778 ((uint64_t *)data)[0] ^= 1ULL << 63; 2779 ((uint64_t *)data)[4] ^= 1ULL << 63; 2780 } 2781 (void) ztest_write(zd, object, offset, blocksize, data); 2782 break; 2783 2784 case ZTEST_IO_WRITE_ZEROES: 2785 bzero(data, blocksize); 2786 (void) ztest_write(zd, object, offset, blocksize, data); 2787 break; 2788 2789 case ZTEST_IO_TRUNCATE: 2790 (void) ztest_truncate(zd, object, offset, blocksize); 2791 break; 2792 2793 case ZTEST_IO_SETATTR: 2794 (void) ztest_setattr(zd, object); 2795 break; 2796 default: 2797 break; 2798 2799 case ZTEST_IO_REWRITE: 2800 (void) pthread_rwlock_rdlock(&ztest_name_lock); 2801 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2802 ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), 2803 B_FALSE); 2804 VERIFY(err == 0 || err == ENOSPC); 2805 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2806 ZFS_PROP_COMPRESSION, 2807 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), 2808 B_FALSE); 2809 VERIFY(err == 0 || err == ENOSPC); 2810 (void) pthread_rwlock_unlock(&ztest_name_lock); 2811 2812 VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, 2813 DMU_READ_NO_PREFETCH)); 2814 2815 (void) ztest_write(zd, object, offset, blocksize, data); 2816 break; 2817 } 2818 2819 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2820 2821 umem_free(data, blocksize); 2822} 2823 2824/* 2825 * Initialize an object description template. 2826 */ 2827static void 2828ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index, 2829 dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, 2830 uint64_t gen) 2831{ 2832 od->od_dir = ZTEST_DIROBJ; 2833 od->od_object = 0; 2834 2835 od->od_crtype = type; 2836 od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); 2837 od->od_crdnodesize = dnodesize ? 
dnodesize : ztest_random_dnodesize(); 2838 od->od_crgen = gen; 2839 2840 od->od_type = DMU_OT_NONE; 2841 od->od_blocksize = 0; 2842 od->od_gen = 0; 2843 2844 (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]", 2845 tag, (longlong_t)id, (u_longlong_t)index); 2846} 2847 2848/* 2849 * Lookup or create the objects for a test using the od template. 2850 * If the objects do not all exist, or if 'remove' is specified, 2851 * remove any existing objects and create new ones. Otherwise, 2852 * use the existing objects. 2853 */ 2854static int 2855ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) 2856{ 2857 int count = size / sizeof (*od); 2858 int rv = 0; 2859 2860 mutex_enter(&zd->zd_dirobj_lock); 2861 if ((ztest_lookup(zd, od, count) != 0 || remove) && 2862 (ztest_remove(zd, od, count) != 0 || 2863 ztest_create(zd, od, count) != 0)) 2864 rv = -1; 2865 zd->zd_od = od; 2866 mutex_exit(&zd->zd_dirobj_lock); 2867 2868 return (rv); 2869} 2870 2871/* ARGSUSED */ 2872void 2873ztest_zil_commit(ztest_ds_t *zd, uint64_t id) 2874{ 2875 zilog_t *zilog = zd->zd_zilog; 2876 2877 (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); 2878 2879 zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); 2880 2881 /* 2882 * Remember the committed values in zd, which is in parent/child 2883 * shared memory. If we die, the next iteration of ztest_run() 2884 * will verify that the log really does contain this record. 2885 */ 2886 mutex_enter(&zilog->zl_lock); 2887 ASSERT3P(zd->zd_shared, !=, NULL); 2888 ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); 2889 zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; 2890 mutex_exit(&zilog->zl_lock); 2891 2892 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2893} 2894 2895/* 2896 * This function is designed to simulate the operations that occur during a 2897 * mount/unmount operation. We hold the dataset across these operations in an 2898 * attempt to expose any implicit assumptions about ZIL management. 
2899 */ 2900/* ARGSUSED */ 2901void 2902ztest_zil_remount(ztest_ds_t *zd, uint64_t id) 2903{ 2904 objset_t *os = zd->zd_os; 2905 2906 /* 2907 * We hold the ztest_vdev_lock so we don't cause problems with 2908 * other threads that wish to remove a log device, such as 2909 * ztest_device_removal(). 2910 */ 2911 mutex_enter(&ztest_vdev_lock); 2912 2913 /* 2914 * We grab the zd_dirobj_lock to ensure that no other thread is 2915 * updating the zil (i.e. adding in-memory log records) and the 2916 * zd_zilog_lock to block any I/O. 2917 */ 2918 mutex_enter(&zd->zd_dirobj_lock); 2919 (void) pthread_rwlock_wrlock(&zd->zd_zilog_lock); 2920 2921 /* zfsvfs_teardown() */ 2922 zil_close(zd->zd_zilog); 2923 2924 /* zfsvfs_setup() */ 2925 VERIFY3P(zil_open(os, ztest_get_data), ==, zd->zd_zilog); 2926 zil_replay(os, zd, ztest_replay_vector); 2927 2928 (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); 2929 mutex_exit(&zd->zd_dirobj_lock); 2930 mutex_exit(&ztest_vdev_lock); 2931} 2932 2933/* 2934 * Verify that we can't destroy an active pool, create an existing pool, 2935 * or create a pool with a bad vdev spec. 2936 */ 2937/* ARGSUSED */ 2938void 2939ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) 2940{ 2941 ztest_shared_opts_t *zo = &ztest_opts; 2942 spa_t *spa; 2943 nvlist_t *nvroot; 2944 2945 if (zo->zo_mmp_test) 2946 return; 2947 2948 /* 2949 * Attempt to create using a bad file. 2950 */ 2951 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 2952 VERIFY3U(ENOENT, ==, 2953 spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); 2954 fnvlist_free(nvroot); 2955 2956 /* 2957 * Attempt to create using a bad mirror. 2958 */ 2959 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); 2960 VERIFY3U(ENOENT, ==, 2961 spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); 2962 fnvlist_free(nvroot); 2963 2964 /* 2965 * Attempt to create an existing pool. It shouldn't matter 2966 * what's in the nvroot; we should fail with EEXIST. 
2967 */ 2968 (void) pthread_rwlock_rdlock(&ztest_name_lock); 2969 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); 2970 VERIFY3U(EEXIST, ==, 2971 spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); 2972 fnvlist_free(nvroot); 2973 2974 /* 2975 * We open a reference to the spa and then we try to export it 2976 * expecting one of the following errors: 2977 * 2978 * EBUSY 2979 * Because of the reference we just opened. 2980 * 2981 * ZFS_ERR_EXPORT_IN_PROGRESS 2982 * For the case that there is another ztest thread doing 2983 * an export concurrently. 2984 */ 2985 VERIFY0(spa_open(zo->zo_pool, &spa, FTAG)); 2986 int error = spa_destroy(zo->zo_pool); 2987 if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) { 2988 fatal(0, "spa_destroy(%s) returned unexpected value %d", 2989 spa->spa_name, error); 2990 } 2991 spa_close(spa, FTAG); 2992 2993 (void) pthread_rwlock_unlock(&ztest_name_lock); 2994} 2995 2996/* 2997 * Start and then stop the MMP threads to ensure the startup and shutdown code 2998 * works properly. Actual protection and property-related code tested via ZTS. 2999 */ 3000/* ARGSUSED */ 3001void 3002ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) 3003{ 3004 ztest_shared_opts_t *zo = &ztest_opts; 3005 spa_t *spa = ztest_spa; 3006 3007 if (zo->zo_mmp_test) 3008 return; 3009 3010 /* 3011 * Since enabling MMP involves setting a property, it could not be done 3012 * while the pool is suspended. 
3013 */ 3014 if (spa_suspended(spa)) 3015 return; 3016 3017 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3018 mutex_enter(&spa->spa_props_lock); 3019 3020 zfs_multihost_fail_intervals = 0; 3021 3022 if (!spa_multihost(spa)) { 3023 spa->spa_multihost = B_TRUE; 3024 mmp_thread_start(spa); 3025 } 3026 3027 mutex_exit(&spa->spa_props_lock); 3028 spa_config_exit(spa, SCL_CONFIG, FTAG); 3029 3030 txg_wait_synced(spa_get_dsl(spa), 0); 3031 mmp_signal_all_threads(); 3032 txg_wait_synced(spa_get_dsl(spa), 0); 3033 3034 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3035 mutex_enter(&spa->spa_props_lock); 3036 3037 if (spa_multihost(spa)) { 3038 mmp_thread_stop(spa); 3039 spa->spa_multihost = B_FALSE; 3040 } 3041 3042 mutex_exit(&spa->spa_props_lock); 3043 spa_config_exit(spa, SCL_CONFIG, FTAG); 3044} 3045 3046/* ARGSUSED */ 3047void 3048ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) 3049{ 3050 spa_t *spa; 3051 uint64_t initial_version = SPA_VERSION_INITIAL; 3052 uint64_t version, newversion; 3053 nvlist_t *nvroot, *props; 3054 char *name; 3055 3056 if (ztest_opts.zo_mmp_test) 3057 return; 3058 3059 /* dRAID added after feature flags, skip upgrade test. */ 3060 if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) 3061 return; 3062 3063 mutex_enter(&ztest_vdev_lock); 3064 name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); 3065 3066 /* 3067 * Clean up from previous runs. 3068 */ 3069 (void) spa_destroy(name); 3070 3071 nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, 3072 NULL, ztest_opts.zo_raid_children, ztest_opts.zo_mirrors, 1); 3073 3074 /* 3075 * If we're configuring a RAIDZ device then make sure that the 3076 * initial version is capable of supporting that feature. 
3077 */ 3078 switch (ztest_opts.zo_raid_parity) { 3079 case 0: 3080 case 1: 3081 initial_version = SPA_VERSION_INITIAL; 3082 break; 3083 case 2: 3084 initial_version = SPA_VERSION_RAIDZ2; 3085 break; 3086 case 3: 3087 initial_version = SPA_VERSION_RAIDZ3; 3088 break; 3089 } 3090 3091 /* 3092 * Create a pool with a spa version that can be upgraded. Pick 3093 * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. 3094 */ 3095 do { 3096 version = ztest_random_spa_version(initial_version); 3097 } while (version > SPA_VERSION_BEFORE_FEATURES); 3098 3099 props = fnvlist_alloc(); 3100 fnvlist_add_uint64(props, 3101 zpool_prop_to_name(ZPOOL_PROP_VERSION), version); 3102 VERIFY0(spa_create(name, nvroot, props, NULL, NULL)); 3103 fnvlist_free(nvroot); 3104 fnvlist_free(props); 3105 3106 VERIFY0(spa_open(name, &spa, FTAG)); 3107 VERIFY3U(spa_version(spa), ==, version); 3108 newversion = ztest_random_spa_version(version + 1); 3109 3110 if (ztest_opts.zo_verbose >= 4) { 3111 (void) printf("upgrading spa version from %llu to %llu\n", 3112 (u_longlong_t)version, (u_longlong_t)newversion); 3113 } 3114 3115 spa_upgrade(spa, newversion); 3116 VERIFY3U(spa_version(spa), >, version); 3117 VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, 3118 zpool_prop_to_name(ZPOOL_PROP_VERSION))); 3119 spa_close(spa, FTAG); 3120 3121 kmem_strfree(name); 3122 mutex_exit(&ztest_vdev_lock); 3123} 3124 3125static void 3126ztest_spa_checkpoint(spa_t *spa) 3127{ 3128 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3129 3130 int error = spa_checkpoint(spa->spa_name); 3131 3132 switch (error) { 3133 case 0: 3134 case ZFS_ERR_DEVRM_IN_PROGRESS: 3135 case ZFS_ERR_DISCARDING_CHECKPOINT: 3136 case ZFS_ERR_CHECKPOINT_EXISTS: 3137 break; 3138 case ENOSPC: 3139 ztest_record_enospc(FTAG); 3140 break; 3141 default: 3142 fatal(0, "spa_checkpoint(%s) = %d", spa->spa_name, error); 3143 } 3144} 3145 3146static void 3147ztest_spa_discard_checkpoint(spa_t *spa) 3148{ 3149 
ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 3150 3151 int error = spa_checkpoint_discard(spa->spa_name); 3152 3153 switch (error) { 3154 case 0: 3155 case ZFS_ERR_DISCARDING_CHECKPOINT: 3156 case ZFS_ERR_NO_CHECKPOINT: 3157 break; 3158 default: 3159 fatal(0, "spa_discard_checkpoint(%s) = %d", 3160 spa->spa_name, error); 3161 } 3162 3163} 3164 3165/* ARGSUSED */ 3166void 3167ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id) 3168{ 3169 spa_t *spa = ztest_spa; 3170 3171 mutex_enter(&ztest_checkpoint_lock); 3172 if (ztest_random(2) == 0) { 3173 ztest_spa_checkpoint(spa); 3174 } else { 3175 ztest_spa_discard_checkpoint(spa); 3176 } 3177 mutex_exit(&ztest_checkpoint_lock); 3178} 3179 3180 3181static vdev_t * 3182vdev_lookup_by_path(vdev_t *vd, const char *path) 3183{ 3184 vdev_t *mvd; 3185 int c; 3186 3187 if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) 3188 return (vd); 3189 3190 for (c = 0; c < vd->vdev_children; c++) 3191 if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != 3192 NULL) 3193 return (mvd); 3194 3195 return (NULL); 3196} 3197 3198static int 3199spa_num_top_vdevs(spa_t *spa) 3200{ 3201 vdev_t *rvd = spa->spa_root_vdev; 3202 ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV); 3203 return (rvd->vdev_children); 3204} 3205 3206/* 3207 * Verify that vdev_add() works as expected. 3208 */ 3209/* ARGSUSED */ 3210void 3211ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) 3212{ 3213 ztest_shared_t *zs = ztest_shared; 3214 spa_t *spa = ztest_spa; 3215 uint64_t leaves; 3216 uint64_t guid; 3217 nvlist_t *nvroot; 3218 int error; 3219 3220 if (ztest_opts.zo_mmp_test) 3221 return; 3222 3223 mutex_enter(&ztest_vdev_lock); 3224 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * 3225 ztest_opts.zo_raid_children; 3226 3227 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3228 3229 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3230 3231 /* 3232 * If we have slogs then remove them 1/4 of the time. 
3233 */ 3234 if (spa_has_slogs(spa) && ztest_random(4) == 0) { 3235 metaslab_group_t *mg; 3236 3237 /* 3238 * find the first real slog in log allocation class 3239 */ 3240 mg = spa_log_class(spa)->mc_allocator[0].mca_rotor; 3241 while (!mg->mg_vd->vdev_islog) 3242 mg = mg->mg_next; 3243 3244 guid = mg->mg_vd->vdev_guid; 3245 3246 spa_config_exit(spa, SCL_VDEV, FTAG); 3247 3248 /* 3249 * We have to grab the zs_name_lock as writer to 3250 * prevent a race between removing a slog (dmu_objset_find) 3251 * and destroying a dataset. Removing the slog will 3252 * grab a reference on the dataset which may cause 3253 * dsl_destroy_head() to fail with EBUSY thus 3254 * leaving the dataset in an inconsistent state. 3255 */ 3256 pthread_rwlock_wrlock(&ztest_name_lock); 3257 error = spa_vdev_remove(spa, guid, B_FALSE); 3258 pthread_rwlock_unlock(&ztest_name_lock); 3259 3260 switch (error) { 3261 case 0: 3262 case EEXIST: /* Generic zil_reset() error */ 3263 case EBUSY: /* Replay required */ 3264 case EACCES: /* Crypto key not loaded */ 3265 case ZFS_ERR_CHECKPOINT_EXISTS: 3266 case ZFS_ERR_DISCARDING_CHECKPOINT: 3267 break; 3268 default: 3269 fatal(0, "spa_vdev_remove() = %d", error); 3270 } 3271 } else { 3272 spa_config_exit(spa, SCL_VDEV, FTAG); 3273 3274 /* 3275 * Make 1/4 of the devices be log devices 3276 */ 3277 nvroot = make_vdev_root(NULL, NULL, NULL, 3278 ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? 
3279 "log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 3280 1); 3281 3282 error = spa_vdev_add(spa, nvroot); 3283 fnvlist_free(nvroot); 3284 3285 switch (error) { 3286 case 0: 3287 break; 3288 case ENOSPC: 3289 ztest_record_enospc("spa_vdev_add"); 3290 break; 3291 default: 3292 fatal(0, "spa_vdev_add() = %d", error); 3293 } 3294 } 3295 3296 mutex_exit(&ztest_vdev_lock); 3297} 3298 3299/* ARGSUSED */ 3300void 3301ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) 3302{ 3303 ztest_shared_t *zs = ztest_shared; 3304 spa_t *spa = ztest_spa; 3305 uint64_t leaves; 3306 nvlist_t *nvroot; 3307 const char *class = (ztest_random(2) == 0) ? 3308 VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; 3309 int error; 3310 3311 /* 3312 * By default add a special vdev 50% of the time 3313 */ 3314 if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || 3315 (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && 3316 ztest_random(2) == 0)) { 3317 return; 3318 } 3319 3320 mutex_enter(&ztest_vdev_lock); 3321 3322 /* Only test with mirrors */ 3323 if (zs->zs_mirrors < 2) { 3324 mutex_exit(&ztest_vdev_lock); 3325 return; 3326 } 3327 3328 /* requires feature@allocation_classes */ 3329 if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { 3330 mutex_exit(&ztest_vdev_lock); 3331 return; 3332 } 3333 3334 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * 3335 ztest_opts.zo_raid_children; 3336 3337 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3338 ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; 3339 spa_config_exit(spa, SCL_VDEV, FTAG); 3340 3341 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 3342 class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); 3343 3344 error = spa_vdev_add(spa, nvroot); 3345 fnvlist_free(nvroot); 3346 3347 if (error == ENOSPC) 3348 ztest_record_enospc("spa_vdev_add"); 3349 else if (error != 0) 3350 fatal(0, "spa_vdev_add() = %d", error); 3351 3352 /* 3353 * 50% of the time allow small blocks in the 
special class 3354 */ 3355 if (error == 0 && 3356 spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { 3357 if (ztest_opts.zo_verbose >= 3) 3358 (void) printf("Enabling special VDEV small blocks\n"); 3359 (void) ztest_dsl_prop_set_uint64(zd->zd_name, 3360 ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); 3361 } 3362 3363 mutex_exit(&ztest_vdev_lock); 3364 3365 if (ztest_opts.zo_verbose >= 3) { 3366 metaslab_class_t *mc; 3367 3368 if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) 3369 mc = spa_special_class(spa); 3370 else 3371 mc = spa_dedup_class(spa); 3372 (void) printf("Added a %s mirrored vdev (of %d)\n", 3373 class, (int)mc->mc_groups); 3374 } 3375} 3376 3377/* 3378 * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. 3379 */ 3380/* ARGSUSED */ 3381void 3382ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) 3383{ 3384 ztest_shared_t *zs = ztest_shared; 3385 spa_t *spa = ztest_spa; 3386 vdev_t *rvd = spa->spa_root_vdev; 3387 spa_aux_vdev_t *sav; 3388 char *aux; 3389 char *path; 3390 uint64_t guid = 0; 3391 int error, ignore_err = 0; 3392 3393 if (ztest_opts.zo_mmp_test) 3394 return; 3395 3396 path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3397 3398 if (ztest_random(2) == 0) { 3399 sav = &spa->spa_spares; 3400 aux = ZPOOL_CONFIG_SPARES; 3401 } else { 3402 sav = &spa->spa_l2cache; 3403 aux = ZPOOL_CONFIG_L2CACHE; 3404 } 3405 3406 mutex_enter(&ztest_vdev_lock); 3407 3408 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3409 3410 if (sav->sav_count != 0 && ztest_random(4) == 0) { 3411 /* 3412 * Pick a random device to remove. 3413 */ 3414 vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3415 3416 /* dRAID spares cannot be removed; try anyways to see ENOTSUP */ 3417 if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL) 3418 ignore_err = ENOTSUP; 3419 3420 guid = svd->vdev_guid; 3421 } else { 3422 /* 3423 * Find an unused device we can add. 
3424 */ 3425 zs->zs_vdev_aux = 0; 3426 for (;;) { 3427 int c; 3428 (void) snprintf(path, MAXPATHLEN, ztest_aux_template, 3429 ztest_opts.zo_dir, ztest_opts.zo_pool, aux, 3430 zs->zs_vdev_aux); 3431 for (c = 0; c < sav->sav_count; c++) 3432 if (strcmp(sav->sav_vdevs[c]->vdev_path, 3433 path) == 0) 3434 break; 3435 if (c == sav->sav_count && 3436 vdev_lookup_by_path(rvd, path) == NULL) 3437 break; 3438 zs->zs_vdev_aux++; 3439 } 3440 } 3441 3442 spa_config_exit(spa, SCL_VDEV, FTAG); 3443 3444 if (guid == 0) { 3445 /* 3446 * Add a new device. 3447 */ 3448 nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, 3449 (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); 3450 error = spa_vdev_add(spa, nvroot); 3451 3452 switch (error) { 3453 case 0: 3454 break; 3455 default: 3456 fatal(0, "spa_vdev_add(%p) = %d", nvroot, error); 3457 } 3458 fnvlist_free(nvroot); 3459 } else { 3460 /* 3461 * Remove an existing device. Sometimes, dirty its 3462 * vdev state first to make sure we handle removal 3463 * of devices that have pending state changes. 
3464 */ 3465 if (ztest_random(2) == 0) 3466 (void) vdev_online(spa, guid, 0, NULL); 3467 3468 error = spa_vdev_remove(spa, guid, B_FALSE); 3469 3470 switch (error) { 3471 case 0: 3472 case EBUSY: 3473 case ZFS_ERR_CHECKPOINT_EXISTS: 3474 case ZFS_ERR_DISCARDING_CHECKPOINT: 3475 break; 3476 default: 3477 if (error != ignore_err) 3478 fatal(0, "spa_vdev_remove(%llu) = %d", guid, 3479 error); 3480 } 3481 } 3482 3483 mutex_exit(&ztest_vdev_lock); 3484 3485 umem_free(path, MAXPATHLEN); 3486} 3487 3488/* 3489 * split a pool if it has mirror tlvdevs 3490 */ 3491/* ARGSUSED */ 3492void 3493ztest_split_pool(ztest_ds_t *zd, uint64_t id) 3494{ 3495 ztest_shared_t *zs = ztest_shared; 3496 spa_t *spa = ztest_spa; 3497 vdev_t *rvd = spa->spa_root_vdev; 3498 nvlist_t *tree, **child, *config, *split, **schild; 3499 uint_t c, children, schildren = 0, lastlogid = 0; 3500 int error = 0; 3501 3502 if (ztest_opts.zo_mmp_test) 3503 return; 3504 3505 mutex_enter(&ztest_vdev_lock); 3506 3507 /* ensure we have a usable config; mirrors of raidz aren't supported */ 3508 if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) { 3509 mutex_exit(&ztest_vdev_lock); 3510 return; 3511 } 3512 3513 /* clean up the old pool, if any */ 3514 (void) spa_destroy("splitp"); 3515 3516 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3517 3518 /* generate a config from the existing config */ 3519 mutex_enter(&spa->spa_props_lock); 3520 tree = fnvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE); 3521 mutex_exit(&spa->spa_props_lock); 3522 3523 VERIFY0(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, 3524 &child, &children)); 3525 3526 schild = malloc(rvd->vdev_children * sizeof (nvlist_t *)); 3527 for (c = 0; c < children; c++) { 3528 vdev_t *tvd = rvd->vdev_child[c]; 3529 nvlist_t **mchild; 3530 uint_t mchildren; 3531 3532 if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { 3533 schild[schildren] = fnvlist_alloc(); 3534 fnvlist_add_string(schild[schildren], 3535 
ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE); 3536 fnvlist_add_uint64(schild[schildren], 3537 ZPOOL_CONFIG_IS_HOLE, 1); 3538 if (lastlogid == 0) 3539 lastlogid = schildren; 3540 ++schildren; 3541 continue; 3542 } 3543 lastlogid = 0; 3544 VERIFY0(nvlist_lookup_nvlist_array(child[c], 3545 ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren)); 3546 schild[schildren++] = fnvlist_dup(mchild[0]); 3547 } 3548 3549 /* OK, create a config that can be used to split */ 3550 split = fnvlist_alloc(); 3551 fnvlist_add_string(split, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); 3552 fnvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild, 3553 lastlogid != 0 ? lastlogid : schildren); 3554 3555 config = fnvlist_alloc(); 3556 fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split); 3557 3558 for (c = 0; c < schildren; c++) 3559 fnvlist_free(schild[c]); 3560 free(schild); 3561 fnvlist_free(split); 3562 3563 spa_config_exit(spa, SCL_VDEV, FTAG); 3564 3565 (void) pthread_rwlock_wrlock(&ztest_name_lock); 3566 error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); 3567 (void) pthread_rwlock_unlock(&ztest_name_lock); 3568 3569 fnvlist_free(config); 3570 3571 if (error == 0) { 3572 (void) printf("successful split - results:\n"); 3573 mutex_enter(&spa_namespace_lock); 3574 show_pool_stats(spa); 3575 show_pool_stats(spa_lookup("splitp")); 3576 mutex_exit(&spa_namespace_lock); 3577 ++zs->zs_splits; 3578 --zs->zs_mirrors; 3579 } 3580 mutex_exit(&ztest_vdev_lock); 3581} 3582 3583/* 3584 * Verify that we can attach and detach devices. 
3585 */ 3586/* ARGSUSED */ 3587void 3588ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) 3589{ 3590 ztest_shared_t *zs = ztest_shared; 3591 spa_t *spa = ztest_spa; 3592 spa_aux_vdev_t *sav = &spa->spa_spares; 3593 vdev_t *rvd = spa->spa_root_vdev; 3594 vdev_t *oldvd, *newvd, *pvd; 3595 nvlist_t *root; 3596 uint64_t leaves; 3597 uint64_t leaf, top; 3598 uint64_t ashift = ztest_get_ashift(); 3599 uint64_t oldguid, pguid; 3600 uint64_t oldsize, newsize; 3601 char *oldpath, *newpath; 3602 int replacing; 3603 int oldvd_has_siblings = B_FALSE; 3604 int newvd_is_spare = B_FALSE; 3605 int newvd_is_dspare = B_FALSE; 3606 int oldvd_is_log; 3607 int error, expected_error; 3608 3609 if (ztest_opts.zo_mmp_test) 3610 return; 3611 3612 oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3613 newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 3614 3615 mutex_enter(&ztest_vdev_lock); 3616 leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; 3617 3618 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3619 3620 /* 3621 * If a vdev is in the process of being removed, its removal may 3622 * finish while we are in progress, leading to an unexpected error 3623 * value. Don't bother trying to attach while we are in the middle 3624 * of removal. 3625 */ 3626 if (ztest_device_removal_active) { 3627 spa_config_exit(spa, SCL_ALL, FTAG); 3628 goto out; 3629 } 3630 3631 /* 3632 * Decide whether to do an attach or a replace. 3633 */ 3634 replacing = ztest_random(2); 3635 3636 /* 3637 * Pick a random top-level vdev. 3638 */ 3639 top = ztest_random_vdev_top(spa, B_TRUE); 3640 3641 /* 3642 * Pick a random leaf within it. 3643 */ 3644 leaf = ztest_random(leaves); 3645 3646 /* 3647 * Locate this vdev. 
3648 */ 3649 oldvd = rvd->vdev_child[top]; 3650 3651 /* pick a child from the mirror */ 3652 if (zs->zs_mirrors >= 1) { 3653 ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops); 3654 ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors); 3655 oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raid_children]; 3656 } 3657 3658 /* pick a child out of the raidz group */ 3659 if (ztest_opts.zo_raid_children > 1) { 3660 if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0) 3661 ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); 3662 else 3663 ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); 3664 ASSERT3U(oldvd->vdev_children, ==, ztest_opts.zo_raid_children); 3665 oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raid_children]; 3666 } 3667 3668 /* 3669 * If we're already doing an attach or replace, oldvd may be a 3670 * mirror vdev -- in which case, pick a random child. 3671 */ 3672 while (oldvd->vdev_children != 0) { 3673 oldvd_has_siblings = B_TRUE; 3674 ASSERT3U(oldvd->vdev_children, >=, 2); 3675 oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; 3676 } 3677 3678 oldguid = oldvd->vdev_guid; 3679 oldsize = vdev_get_min_asize(oldvd); 3680 oldvd_is_log = oldvd->vdev_top->vdev_islog; 3681 (void) strcpy(oldpath, oldvd->vdev_path); 3682 pvd = oldvd->vdev_parent; 3683 pguid = pvd->vdev_guid; 3684 3685 /* 3686 * If oldvd has siblings, then half of the time, detach it. Prior 3687 * to the detach the pool is scrubbed in order to prevent creating 3688 * unrepairable blocks as a result of the data corruption injection. 
3689 */ 3690 if (oldvd_has_siblings && ztest_random(2) == 0) { 3691 spa_config_exit(spa, SCL_ALL, FTAG); 3692 3693 error = ztest_scrub_impl(spa); 3694 if (error) 3695 goto out; 3696 3697 error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); 3698 if (error != 0 && error != ENODEV && error != EBUSY && 3699 error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && 3700 error != ZFS_ERR_DISCARDING_CHECKPOINT) 3701 fatal(0, "detach (%s) returned %d", oldpath, error); 3702 goto out; 3703 } 3704 3705 /* 3706 * For the new vdev, choose with equal probability between the two 3707 * standard paths (ending in either 'a' or 'b') or a random hot spare. 3708 */ 3709 if (sav->sav_count != 0 && ztest_random(3) == 0) { 3710 newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3711 newvd_is_spare = B_TRUE; 3712 3713 if (newvd->vdev_ops == &vdev_draid_spare_ops) 3714 newvd_is_dspare = B_TRUE; 3715 3716 (void) strcpy(newpath, newvd->vdev_path); 3717 } else { 3718 (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, 3719 ztest_opts.zo_dir, ztest_opts.zo_pool, 3720 top * leaves + leaf); 3721 if (ztest_random(2) == 0) 3722 newpath[strlen(newpath) - 1] = 'b'; 3723 newvd = vdev_lookup_by_path(rvd, newpath); 3724 } 3725 3726 if (newvd) { 3727 /* 3728 * Reopen to ensure the vdev's asize field isn't stale. 3729 */ 3730 vdev_reopen(newvd); 3731 newsize = vdev_get_min_asize(newvd); 3732 } else { 3733 /* 3734 * Make newsize a little bigger or smaller than oldsize. 3735 * If it's smaller, the attach should fail. 3736 * If it's larger, and we're doing a replace, 3737 * we should get dynamic LUN growth when we're done. 3738 */ 3739 newsize = 10 * oldsize / (9 + ztest_random(3)); 3740 } 3741 3742 /* 3743 * If pvd is not a mirror or root, the attach should fail with ENOTSUP, 3744 * unless it's a replace; in that case any non-replacing parent is OK. 3745 * 3746 * If newvd is already part of the pool, it should fail with EBUSY. 3747 * 3748 * If newvd is too small, it should fail with EOVERFLOW. 
3749 * 3750 * If newvd is a distributed spare and it's being attached to a 3751 * dRAID which is not its parent it should fail with EINVAL. 3752 */ 3753 if (pvd->vdev_ops != &vdev_mirror_ops && 3754 pvd->vdev_ops != &vdev_root_ops && (!replacing || 3755 pvd->vdev_ops == &vdev_replacing_ops || 3756 pvd->vdev_ops == &vdev_spare_ops)) 3757 expected_error = ENOTSUP; 3758 else if (newvd_is_spare && (!replacing || oldvd_is_log)) 3759 expected_error = ENOTSUP; 3760 else if (newvd == oldvd) 3761 expected_error = replacing ? 0 : EBUSY; 3762 else if (vdev_lookup_by_path(rvd, newpath) != NULL) 3763 expected_error = EBUSY; 3764 else if (!newvd_is_dspare && newsize < oldsize) 3765 expected_error = EOVERFLOW; 3766 else if (ashift > oldvd->vdev_top->vdev_ashift) 3767 expected_error = EDOM; 3768 else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) 3769 expected_error = ENOTSUP; 3770 else 3771 expected_error = 0; 3772 3773 spa_config_exit(spa, SCL_ALL, FTAG); 3774 3775 /* 3776 * Build the nvlist describing newpath. 3777 */ 3778 root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, 3779 ashift, NULL, 0, 0, 1); 3780 3781 /* 3782 * When supported select either a healing or sequential resilver. 3783 */ 3784 boolean_t rebuilding = B_FALSE; 3785 if (pvd->vdev_ops == &vdev_mirror_ops || 3786 pvd->vdev_ops == &vdev_root_ops) { 3787 rebuilding = !!ztest_random(2); 3788 } 3789 3790 error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding); 3791 3792 fnvlist_free(root); 3793 3794 /* 3795 * If our parent was the replacing vdev, but the replace completed, 3796 * then instead of failing with ENOTSUP we may either succeed, 3797 * fail with ENODEV, or fail with EOVERFLOW. 3798 */ 3799 if (expected_error == ENOTSUP && 3800 (error == 0 || error == ENODEV || error == EOVERFLOW)) 3801 expected_error = error; 3802 3803 /* 3804 * If someone grew the LUN, the replacement may be too small. 
3805 */ 3806 if (error == EOVERFLOW || error == EBUSY) 3807 expected_error = error; 3808 3809 if (error == ZFS_ERR_CHECKPOINT_EXISTS || 3810 error == ZFS_ERR_DISCARDING_CHECKPOINT || 3811 error == ZFS_ERR_RESILVER_IN_PROGRESS || 3812 error == ZFS_ERR_REBUILD_IN_PROGRESS) 3813 expected_error = error; 3814 3815 if (error != expected_error && expected_error != EBUSY) { 3816 fatal(0, "attach (%s %llu, %s %llu, %d) " 3817 "returned %d, expected %d", 3818 oldpath, oldsize, newpath, 3819 newsize, replacing, error, expected_error); 3820 } 3821out: 3822 mutex_exit(&ztest_vdev_lock); 3823 3824 umem_free(oldpath, MAXPATHLEN); 3825 umem_free(newpath, MAXPATHLEN); 3826} 3827 3828/* ARGSUSED */ 3829void 3830ztest_device_removal(ztest_ds_t *zd, uint64_t id) 3831{ 3832 spa_t *spa = ztest_spa; 3833 vdev_t *vd; 3834 uint64_t guid; 3835 int error; 3836 3837 mutex_enter(&ztest_vdev_lock); 3838 3839 if (ztest_device_removal_active) { 3840 mutex_exit(&ztest_vdev_lock); 3841 return; 3842 } 3843 3844 /* 3845 * Remove a random top-level vdev and wait for removal to finish. 3846 */ 3847 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3848 vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); 3849 guid = vd->vdev_guid; 3850 spa_config_exit(spa, SCL_VDEV, FTAG); 3851 3852 error = spa_vdev_remove(spa, guid, B_FALSE); 3853 if (error == 0) { 3854 ztest_device_removal_active = B_TRUE; 3855 mutex_exit(&ztest_vdev_lock); 3856 3857 /* 3858 * spa->spa_vdev_removal is created in a sync task that 3859 * is initiated via dsl_sync_task_nowait(). Since the 3860 * task may not run before spa_vdev_remove() returns, we 3861 * must wait at least 1 txg to ensure that the removal 3862 * struct has been created. 
3863 */ 3864 txg_wait_synced(spa_get_dsl(spa), 0); 3865 3866 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 3867 txg_wait_synced(spa_get_dsl(spa), 0); 3868 } else { 3869 mutex_exit(&ztest_vdev_lock); 3870 return; 3871 } 3872 3873 /* 3874 * The pool needs to be scrubbed after completing device removal. 3875 * Failure to do so may result in checksum errors due to the 3876 * strategy employed by ztest_fault_inject() when selecting which 3877 * offset are redundant and can be damaged. 3878 */ 3879 error = spa_scan(spa, POOL_SCAN_SCRUB); 3880 if (error == 0) { 3881 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 3882 txg_wait_synced(spa_get_dsl(spa), 0); 3883 } 3884 3885 mutex_enter(&ztest_vdev_lock); 3886 ztest_device_removal_active = B_FALSE; 3887 mutex_exit(&ztest_vdev_lock); 3888} 3889 3890/* 3891 * Callback function which expands the physical size of the vdev. 3892 */ 3893static vdev_t * 3894grow_vdev(vdev_t *vd, void *arg) 3895{ 3896 spa_t *spa __maybe_unused = vd->vdev_spa; 3897 size_t *newsize = arg; 3898 size_t fsize; 3899 int fd; 3900 3901 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 3902 ASSERT(vd->vdev_ops->vdev_op_leaf); 3903 3904 if ((fd = open(vd->vdev_path, O_RDWR)) == -1) 3905 return (vd); 3906 3907 fsize = lseek(fd, 0, SEEK_END); 3908 VERIFY0(ftruncate(fd, *newsize)); 3909 3910 if (ztest_opts.zo_verbose >= 6) { 3911 (void) printf("%s grew from %lu to %lu bytes\n", 3912 vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); 3913 } 3914 (void) close(fd); 3915 return (NULL); 3916} 3917 3918/* 3919 * Callback function which expands a given vdev by calling vdev_online(). 
3920 */ 3921/* ARGSUSED */ 3922static vdev_t * 3923online_vdev(vdev_t *vd, void *arg) 3924{ 3925 spa_t *spa = vd->vdev_spa; 3926 vdev_t *tvd = vd->vdev_top; 3927 uint64_t guid = vd->vdev_guid; 3928 uint64_t generation = spa->spa_config_generation + 1; 3929 vdev_state_t newstate = VDEV_STATE_UNKNOWN; 3930 int error; 3931 3932 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); 3933 ASSERT(vd->vdev_ops->vdev_op_leaf); 3934 3935 /* Calling vdev_online will initialize the new metaslabs */ 3936 spa_config_exit(spa, SCL_STATE, spa); 3937 error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); 3938 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 3939 3940 /* 3941 * If vdev_online returned an error or the underlying vdev_open 3942 * failed then we abort the expand. The only way to know that 3943 * vdev_open fails is by checking the returned newstate. 3944 */ 3945 if (error || newstate != VDEV_STATE_HEALTHY) { 3946 if (ztest_opts.zo_verbose >= 5) { 3947 (void) printf("Unable to expand vdev, state %llu, " 3948 "error %d\n", (u_longlong_t)newstate, error); 3949 } 3950 return (vd); 3951 } 3952 ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); 3953 3954 /* 3955 * Since we dropped the lock we need to ensure that we're 3956 * still talking to the original vdev. It's possible this 3957 * vdev may have been detached/replaced while we were 3958 * trying to online it. 3959 */ 3960 if (generation != spa->spa_config_generation) { 3961 if (ztest_opts.zo_verbose >= 5) { 3962 (void) printf("vdev configuration has changed, " 3963 "guid %llu, state %llu, expected gen %llu, " 3964 "got gen %llu\n", 3965 (u_longlong_t)guid, 3966 (u_longlong_t)tvd->vdev_state, 3967 (u_longlong_t)generation, 3968 (u_longlong_t)spa->spa_config_generation); 3969 } 3970 return (vd); 3971 } 3972 return (NULL); 3973} 3974 3975/* 3976 * Traverse the vdev tree calling the supplied function. 
3977 * We continue to walk the tree until we either have walked all 3978 * children or we receive a non-NULL return from the callback. 3979 * If a NULL callback is passed, then we just return back the first 3980 * leaf vdev we encounter. 3981 */ 3982static vdev_t * 3983vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) 3984{ 3985 uint_t c; 3986 3987 if (vd->vdev_ops->vdev_op_leaf) { 3988 if (func == NULL) 3989 return (vd); 3990 else 3991 return (func(vd, arg)); 3992 } 3993 3994 for (c = 0; c < vd->vdev_children; c++) { 3995 vdev_t *cvd = vd->vdev_child[c]; 3996 if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) 3997 return (cvd); 3998 } 3999 return (NULL); 4000} 4001 4002/* 4003 * Verify that dynamic LUN growth works as expected. 4004 */ 4005/* ARGSUSED */ 4006void 4007ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) 4008{ 4009 spa_t *spa = ztest_spa; 4010 vdev_t *vd, *tvd; 4011 metaslab_class_t *mc; 4012 metaslab_group_t *mg; 4013 size_t psize, newsize; 4014 uint64_t top; 4015 uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; 4016 4017 mutex_enter(&ztest_checkpoint_lock); 4018 mutex_enter(&ztest_vdev_lock); 4019 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4020 4021 /* 4022 * If there is a vdev removal in progress, it could complete while 4023 * we are running, in which case we would not be able to verify 4024 * that the metaslab_class space increased (because it decreases 4025 * when the device removal completes). 
4026 */ 4027 if (ztest_device_removal_active) { 4028 spa_config_exit(spa, SCL_STATE, spa); 4029 mutex_exit(&ztest_vdev_lock); 4030 mutex_exit(&ztest_checkpoint_lock); 4031 return; 4032 } 4033 4034 top = ztest_random_vdev_top(spa, B_TRUE); 4035 4036 tvd = spa->spa_root_vdev->vdev_child[top]; 4037 mg = tvd->vdev_mg; 4038 mc = mg->mg_class; 4039 old_ms_count = tvd->vdev_ms_count; 4040 old_class_space = metaslab_class_get_space(mc); 4041 4042 /* 4043 * Determine the size of the first leaf vdev associated with 4044 * our top-level device. 4045 */ 4046 vd = vdev_walk_tree(tvd, NULL, NULL); 4047 ASSERT3P(vd, !=, NULL); 4048 ASSERT(vd->vdev_ops->vdev_op_leaf); 4049 4050 psize = vd->vdev_psize; 4051 4052 /* 4053 * We only try to expand the vdev if it's healthy, less than 4x its 4054 * original size, and it has a valid psize. 4055 */ 4056 if (tvd->vdev_state != VDEV_STATE_HEALTHY || 4057 psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { 4058 spa_config_exit(spa, SCL_STATE, spa); 4059 mutex_exit(&ztest_vdev_lock); 4060 mutex_exit(&ztest_checkpoint_lock); 4061 return; 4062 } 4063 ASSERT3U(psize, >, 0); 4064 newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); 4065 ASSERT3U(newsize, >, psize); 4066 4067 if (ztest_opts.zo_verbose >= 6) { 4068 (void) printf("Expanding LUN %s from %lu to %lu\n", 4069 vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); 4070 } 4071 4072 /* 4073 * Growing the vdev is a two step process: 4074 * 1). expand the physical size (i.e. relabel) 4075 * 2). 
online the vdev to create the new metaslabs 4076 */ 4077 if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || 4078 vdev_walk_tree(tvd, online_vdev, NULL) != NULL || 4079 tvd->vdev_state != VDEV_STATE_HEALTHY) { 4080 if (ztest_opts.zo_verbose >= 5) { 4081 (void) printf("Could not expand LUN because " 4082 "the vdev configuration changed.\n"); 4083 } 4084 spa_config_exit(spa, SCL_STATE, spa); 4085 mutex_exit(&ztest_vdev_lock); 4086 mutex_exit(&ztest_checkpoint_lock); 4087 return; 4088 } 4089 4090 spa_config_exit(spa, SCL_STATE, spa); 4091 4092 /* 4093 * Expanding the LUN will update the config asynchronously, 4094 * thus we must wait for the async thread to complete any 4095 * pending tasks before proceeding. 4096 */ 4097 for (;;) { 4098 boolean_t done; 4099 mutex_enter(&spa->spa_async_lock); 4100 done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); 4101 mutex_exit(&spa->spa_async_lock); 4102 if (done) 4103 break; 4104 txg_wait_synced(spa_get_dsl(spa), 0); 4105 (void) poll(NULL, 0, 100); 4106 } 4107 4108 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 4109 4110 tvd = spa->spa_root_vdev->vdev_child[top]; 4111 new_ms_count = tvd->vdev_ms_count; 4112 new_class_space = metaslab_class_get_space(mc); 4113 4114 if (tvd->vdev_mg != mg || mg->mg_class != mc) { 4115 if (ztest_opts.zo_verbose >= 5) { 4116 (void) printf("Could not verify LUN expansion due to " 4117 "intervening vdev offline or remove.\n"); 4118 } 4119 spa_config_exit(spa, SCL_STATE, spa); 4120 mutex_exit(&ztest_vdev_lock); 4121 mutex_exit(&ztest_checkpoint_lock); 4122 return; 4123 } 4124 4125 /* 4126 * Make sure we were able to grow the vdev. 4127 */ 4128 if (new_ms_count <= old_ms_count) { 4129 fatal(0, "LUN expansion failed: ms_count %llu < %llu\n", 4130 old_ms_count, new_ms_count); 4131 } 4132 4133 /* 4134 * Make sure we were able to grow the pool. 
4135 */ 4136 if (new_class_space <= old_class_space) { 4137 fatal(0, "LUN expansion failed: class_space %llu < %llu\n", 4138 old_class_space, new_class_space); 4139 } 4140 4141 if (ztest_opts.zo_verbose >= 5) { 4142 char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; 4143 4144 nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); 4145 nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); 4146 (void) printf("%s grew from %s to %s\n", 4147 spa->spa_name, oldnumbuf, newnumbuf); 4148 } 4149 4150 spa_config_exit(spa, SCL_STATE, spa); 4151 mutex_exit(&ztest_vdev_lock); 4152 mutex_exit(&ztest_checkpoint_lock); 4153} 4154 4155/* 4156 * Verify that dmu_objset_{create,destroy,open,close} work as expected. 4157 */ 4158/* ARGSUSED */ 4159static void 4160ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) 4161{ 4162 /* 4163 * Create the objects common to all ztest datasets. 4164 */ 4165 VERIFY0(zap_create_claim(os, ZTEST_DIROBJ, 4166 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx)); 4167} 4168 4169static int 4170ztest_dataset_create(char *dsname) 4171{ 4172 int err; 4173 uint64_t rand; 4174 dsl_crypto_params_t *dcp = NULL; 4175 4176 /* 4177 * 50% of the time, we create encrypted datasets 4178 * using a random cipher suite and a hard-coded 4179 * wrapping key. 4180 */ 4181 rand = ztest_random(2); 4182 if (rand != 0) { 4183 nvlist_t *crypto_args = fnvlist_alloc(); 4184 nvlist_t *props = fnvlist_alloc(); 4185 4186 /* slight bias towards the default cipher suite */ 4187 rand = ztest_random(ZIO_CRYPT_FUNCTIONS); 4188 if (rand < ZIO_CRYPT_AES_128_CCM) 4189 rand = ZIO_CRYPT_ON; 4190 4191 fnvlist_add_uint64(props, 4192 zfs_prop_to_name(ZFS_PROP_ENCRYPTION), rand); 4193 fnvlist_add_uint8_array(crypto_args, "wkeydata", 4194 (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); 4195 4196 /* 4197 * These parameters aren't really used by the kernel. They 4198 * are simply stored so that userspace knows how to load 4199 * the wrapping key. 
4200 */ 4201 fnvlist_add_uint64(props, 4202 zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW); 4203 fnvlist_add_string(props, 4204 zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt"); 4205 fnvlist_add_uint64(props, 4206 zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL); 4207 fnvlist_add_uint64(props, 4208 zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL); 4209 4210 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props, 4211 crypto_args, &dcp)); 4212 4213 /* 4214 * Cycle through all available encryption implementations 4215 * to verify interoperability. 4216 */ 4217 VERIFY0(gcm_impl_set("cycle")); 4218 VERIFY0(aes_impl_set("cycle")); 4219 4220 fnvlist_free(crypto_args); 4221 fnvlist_free(props); 4222 } 4223 4224 err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp, 4225 ztest_objset_create_cb, NULL); 4226 dsl_crypto_params_free(dcp, !!err); 4227 4228 rand = ztest_random(100); 4229 if (err || rand < 80) 4230 return (err); 4231 4232 if (ztest_opts.zo_verbose >= 5) 4233 (void) printf("Setting dataset %s to sync always\n", dsname); 4234 return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, 4235 ZFS_SYNC_ALWAYS, B_FALSE)); 4236} 4237 4238/* ARGSUSED */ 4239static int 4240ztest_objset_destroy_cb(const char *name, void *arg) 4241{ 4242 objset_t *os; 4243 dmu_object_info_t doi; 4244 int error; 4245 4246 /* 4247 * Verify that the dataset contains a directory object. 4248 */ 4249 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, 4250 B_TRUE, FTAG, &os)); 4251 error = dmu_object_info(os, ZTEST_DIROBJ, &doi); 4252 if (error != ENOENT) { 4253 /* We could have crashed in the middle of destroying it */ 4254 ASSERT0(error); 4255 ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); 4256 ASSERT3S(doi.doi_physical_blocks_512, >=, 0); 4257 } 4258 dmu_objset_disown(os, B_TRUE, FTAG); 4259 4260 /* 4261 * Destroy the dataset. 
4262 */ 4263 if (strchr(name, '@') != NULL) { 4264 VERIFY0(dsl_destroy_snapshot(name, B_TRUE)); 4265 } else { 4266 error = dsl_destroy_head(name); 4267 if (error == ENOSPC) { 4268 /* There could be checkpoint or insufficient slop */ 4269 ztest_record_enospc(FTAG); 4270 } else if (error != EBUSY) { 4271 /* There could be a hold on this dataset */ 4272 ASSERT0(error); 4273 } 4274 } 4275 return (0); 4276} 4277 4278static boolean_t 4279ztest_snapshot_create(char *osname, uint64_t id) 4280{ 4281 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4282 int error; 4283 4284 (void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id); 4285 4286 error = dmu_objset_snapshot_one(osname, snapname); 4287 if (error == ENOSPC) { 4288 ztest_record_enospc(FTAG); 4289 return (B_FALSE); 4290 } 4291 if (error != 0 && error != EEXIST) { 4292 fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname, 4293 snapname, error); 4294 } 4295 return (B_TRUE); 4296} 4297 4298static boolean_t 4299ztest_snapshot_destroy(char *osname, uint64_t id) 4300{ 4301 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 4302 int error; 4303 4304 (void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname, 4305 (u_longlong_t)id); 4306 4307 error = dsl_destroy_snapshot(snapname, B_FALSE); 4308 if (error != 0 && error != ENOENT) 4309 fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error); 4310 return (B_TRUE); 4311} 4312 4313/* ARGSUSED */ 4314void 4315ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) 4316{ 4317 ztest_ds_t *zdtmp; 4318 int iters; 4319 int error; 4320 objset_t *os, *os2; 4321 char name[ZFS_MAX_DATASET_NAME_LEN]; 4322 zilog_t *zilog; 4323 int i; 4324 4325 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 4326 4327 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4328 4329 (void) snprintf(name, sizeof (name), "%s/temp_%llu", 4330 ztest_opts.zo_pool, (u_longlong_t)id); 4331 4332 /* 4333 * If this dataset exists from a previous run, process its replay log 4334 * half of the time. 
If we don't replay it, then dsl_destroy_head() 4335 * (invoked from ztest_objset_destroy_cb()) should just throw it away. 4336 */ 4337 if (ztest_random(2) == 0 && 4338 ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 4339 B_TRUE, FTAG, &os) == 0) { 4340 ztest_zd_init(zdtmp, NULL, os); 4341 zil_replay(os, zdtmp, ztest_replay_vector); 4342 ztest_zd_fini(zdtmp); 4343 dmu_objset_disown(os, B_TRUE, FTAG); 4344 } 4345 4346 /* 4347 * There may be an old instance of the dataset we're about to 4348 * create lying around from a previous run. If so, destroy it 4349 * and all of its snapshots. 4350 */ 4351 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 4352 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 4353 4354 /* 4355 * Verify that the destroyed dataset is no longer in the namespace. 4356 */ 4357 VERIFY3U(ENOENT, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, 4358 B_TRUE, FTAG, &os)); 4359 4360 /* 4361 * Verify that we can create a new dataset. 4362 */ 4363 error = ztest_dataset_create(name); 4364 if (error) { 4365 if (error == ENOSPC) { 4366 ztest_record_enospc(FTAG); 4367 goto out; 4368 } 4369 fatal(0, "dmu_objset_create(%s) = %d", name, error); 4370 } 4371 4372 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, 4373 FTAG, &os)); 4374 4375 ztest_zd_init(zdtmp, NULL, os); 4376 4377 /* 4378 * Open the intent log for it. 4379 */ 4380 zilog = zil_open(os, ztest_get_data); 4381 4382 /* 4383 * Put some objects in there, do a little I/O to them, 4384 * and randomly take a couple of snapshots along the way. 4385 */ 4386 iters = ztest_random(5); 4387 for (i = 0; i < iters; i++) { 4388 ztest_dmu_object_alloc_free(zdtmp, id); 4389 if (ztest_random(iters) == 0) 4390 (void) ztest_snapshot_create(name, i); 4391 } 4392 4393 /* 4394 * Verify that we cannot create an existing dataset. 4395 */ 4396 VERIFY3U(EEXIST, ==, 4397 dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL)); 4398 4399 /* 4400 * Verify that we can hold an objset that is also owned. 
4401 */ 4402 VERIFY0(dmu_objset_hold(name, FTAG, &os2)); 4403 dmu_objset_rele(os2, FTAG); 4404 4405 /* 4406 * Verify that we cannot own an objset that is already owned. 4407 */ 4408 VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, 4409 B_FALSE, B_TRUE, FTAG, &os2)); 4410 4411 zil_close(zilog); 4412 dmu_objset_disown(os, B_TRUE, FTAG); 4413 ztest_zd_fini(zdtmp); 4414out: 4415 (void) pthread_rwlock_unlock(&ztest_name_lock); 4416 4417 umem_free(zdtmp, sizeof (ztest_ds_t)); 4418} 4419 4420/* 4421 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. 4422 */ 4423void 4424ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) 4425{ 4426 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4427 (void) ztest_snapshot_destroy(zd->zd_name, id); 4428 (void) ztest_snapshot_create(zd->zd_name, id); 4429 (void) pthread_rwlock_unlock(&ztest_name_lock); 4430} 4431 4432/* 4433 * Cleanup non-standard snapshots and clones. 4434 */ 4435static void 4436ztest_dsl_dataset_cleanup(char *osname, uint64_t id) 4437{ 4438 char *snap1name; 4439 char *clone1name; 4440 char *snap2name; 4441 char *clone2name; 4442 char *snap3name; 4443 int error; 4444 4445 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4446 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4447 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4448 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4449 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4450 4451 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, 4452 "%s@s1_%llu", osname, (u_longlong_t)id); 4453 (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, 4454 "%s/c1_%llu", osname, (u_longlong_t)id); 4455 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, 4456 "%s@s2_%llu", clone1name, (u_longlong_t)id); 4457 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, 4458 "%s/c2_%llu", osname, (u_longlong_t)id); 4459 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, 
4460 "%s@s3_%llu", clone1name, (u_longlong_t)id); 4461 4462 error = dsl_destroy_head(clone2name); 4463 if (error && error != ENOENT) 4464 fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error); 4465 error = dsl_destroy_snapshot(snap3name, B_FALSE); 4466 if (error && error != ENOENT) 4467 fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error); 4468 error = dsl_destroy_snapshot(snap2name, B_FALSE); 4469 if (error && error != ENOENT) 4470 fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error); 4471 error = dsl_destroy_head(clone1name); 4472 if (error && error != ENOENT) 4473 fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error); 4474 error = dsl_destroy_snapshot(snap1name, B_FALSE); 4475 if (error && error != ENOENT) 4476 fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error); 4477 4478 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4479 umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); 4480 umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); 4481 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4482 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4483} 4484 4485/* 4486 * Verify dsl_dataset_promote handles EBUSY 4487 */ 4488void 4489ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) 4490{ 4491 objset_t *os; 4492 char *snap1name; 4493 char *clone1name; 4494 char *snap2name; 4495 char *clone2name; 4496 char *snap3name; 4497 char *osname = zd->zd_name; 4498 int error; 4499 4500 snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4501 clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4502 snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4503 clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4504 snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); 4505 4506 (void) pthread_rwlock_rdlock(&ztest_name_lock); 4507 4508 ztest_dsl_dataset_cleanup(osname, id); 4509 4510 (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, 4511 "%s@s1_%llu", osname, (u_longlong_t)id); 4512 (void) 
snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, 4513 "%s/c1_%llu", osname, (u_longlong_t)id); 4514 (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, 4515 "%s@s2_%llu", clone1name, (u_longlong_t)id); 4516 (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, 4517 "%s/c2_%llu", osname, (u_longlong_t)id); 4518 (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, 4519 "%s@s3_%llu", clone1name, (u_longlong_t)id); 4520 4521 error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); 4522 if (error && error != EEXIST) { 4523 if (error == ENOSPC) { 4524 ztest_record_enospc(FTAG); 4525 goto out; 4526 } 4527 fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error); 4528 } 4529 4530 error = dmu_objset_clone(clone1name, snap1name); 4531 if (error) { 4532 if (error == ENOSPC) { 4533 ztest_record_enospc(FTAG); 4534 goto out; 4535 } 4536 fatal(0, "dmu_objset_create(%s) = %d", clone1name, error); 4537 } 4538 4539 error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); 4540 if (error && error != EEXIST) { 4541 if (error == ENOSPC) { 4542 ztest_record_enospc(FTAG); 4543 goto out; 4544 } 4545 fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error); 4546 } 4547 4548 error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); 4549 if (error && error != EEXIST) { 4550 if (error == ENOSPC) { 4551 ztest_record_enospc(FTAG); 4552 goto out; 4553 } 4554 fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error); 4555 } 4556 4557 error = dmu_objset_clone(clone2name, snap3name); 4558 if (error) { 4559 if (error == ENOSPC) { 4560 ztest_record_enospc(FTAG); 4561 goto out; 4562 } 4563 fatal(0, "dmu_objset_create(%s) = %d", clone2name, error); 4564 } 4565 4566 error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE, 4567 FTAG, &os); 4568 if (error) 4569 fatal(0, "dmu_objset_own(%s) = %d", snap2name, error); 4570 error = dsl_dataset_promote(clone2name, NULL); 4571 if (error == ENOSPC) { 4572 dmu_objset_disown(os, B_TRUE, FTAG); 4573 
ztest_record_enospc(FTAG); 4574 goto out; 4575 } 4576 if (error != EBUSY) 4577 fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name, 4578 error); 4579 dmu_objset_disown(os, B_TRUE, FTAG); 4580 4581out: 4582 ztest_dsl_dataset_cleanup(osname, id); 4583 4584 (void) pthread_rwlock_unlock(&ztest_name_lock); 4585 4586 umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); 4587 umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); 4588 umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); 4589 umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); 4590 umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); 4591} 4592 4593#undef OD_ARRAY_SIZE 4594#define OD_ARRAY_SIZE 4 4595 4596/* 4597 * Verify that dmu_object_{alloc,free} work as expected. 4598 */ 4599void 4600ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) 4601{ 4602 ztest_od_t *od; 4603 int batchsize; 4604 int size; 4605 int b; 4606 4607 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4608 od = umem_alloc(size, UMEM_NOFAIL); 4609 batchsize = OD_ARRAY_SIZE; 4610 4611 for (b = 0; b < batchsize; b++) 4612 ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER, 4613 0, 0, 0); 4614 4615 /* 4616 * Destroy the previous batch of objects, create a new batch, 4617 * and do some I/O on the new objects. 4618 */ 4619 if (ztest_object_init(zd, od, size, B_TRUE) != 0) 4620 return; 4621 4622 while (ztest_random(4 * batchsize) != 0) 4623 ztest_io(zd, od[ztest_random(batchsize)].od_object, 4624 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 4625 4626 umem_free(od, size); 4627} 4628 4629/* 4630 * Rewind the global allocator to verify object allocation backfilling. 4631 */ 4632void 4633ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) 4634{ 4635 objset_t *os = zd->zd_os; 4636 int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; 4637 uint64_t object; 4638 4639 /* 4640 * Rewind the global allocator randomly back to a lower object number 4641 * to force backfilling and reclamation of recently freed dnodes. 
4642 */ 4643 mutex_enter(&os->os_obj_lock); 4644 object = ztest_random(os->os_obj_next_chunk); 4645 os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk); 4646 mutex_exit(&os->os_obj_lock); 4647} 4648 4649#undef OD_ARRAY_SIZE 4650#define OD_ARRAY_SIZE 2 4651 4652/* 4653 * Verify that dmu_{read,write} work as expected. 4654 */ 4655void 4656ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) 4657{ 4658 int size; 4659 ztest_od_t *od; 4660 4661 objset_t *os = zd->zd_os; 4662 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4663 od = umem_alloc(size, UMEM_NOFAIL); 4664 dmu_tx_t *tx; 4665 int i, freeit, error; 4666 uint64_t n, s, txg; 4667 bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; 4668 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 4669 uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); 4670 uint64_t regions = 997; 4671 uint64_t stride = 123456789ULL; 4672 uint64_t width = 40; 4673 int free_percent = 5; 4674 4675 /* 4676 * This test uses two objects, packobj and bigobj, that are always 4677 * updated together (i.e. in the same tx) so that their contents are 4678 * in sync and can be compared. Their contents relate to each other 4679 * in a simple way: packobj is a dense array of 'bufwad' structures, 4680 * while bigobj is a sparse array of the same bufwads. Specifically, 4681 * for any index n, there are three bufwads that should be identical: 4682 * 4683 * packobj, at offset n * sizeof (bufwad_t) 4684 * bigobj, at the head of the nth chunk 4685 * bigobj, at the tail of the nth chunk 4686 * 4687 * The chunk size is arbitrary. It doesn't have to be a power of two, 4688 * and it doesn't have any relation to the object blocksize. 4689 * The only requirement is that it can hold at least two bufwads. 4690 * 4691 * Normally, we write the bufwad to each of these locations. 4692 * However, free_percent of the time we instead write zeroes to 4693 * packobj and perform a dmu_free_range() on bigobj. 
By comparing 4694 * bigobj to packobj, we can verify that the DMU is correctly 4695 * tracking which parts of an object are allocated and free, 4696 * and that the contents of the allocated blocks are correct. 4697 */ 4698 4699 /* 4700 * Read the directory info. If it's the first time, set things up. 4701 */ 4702 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize); 4703 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 4704 chunksize); 4705 4706 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 4707 umem_free(od, size); 4708 return; 4709 } 4710 4711 bigobj = od[0].od_object; 4712 packobj = od[1].od_object; 4713 chunksize = od[0].od_gen; 4714 ASSERT3U(chunksize, ==, od[1].od_gen); 4715 4716 /* 4717 * Prefetch a random chunk of the big object. 4718 * Our aim here is to get some async reads in flight 4719 * for blocks that we may free below; the DMU should 4720 * handle this race correctly. 4721 */ 4722 n = ztest_random(regions) * stride + ztest_random(width); 4723 s = 1 + ztest_random(2 * width - 1); 4724 dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, 4725 ZIO_PRIORITY_SYNC_READ); 4726 4727 /* 4728 * Pick a random index and compute the offsets into packobj and bigobj. 4729 */ 4730 n = ztest_random(regions) * stride + ztest_random(width); 4731 s = 1 + ztest_random(width - 1); 4732 4733 packoff = n * sizeof (bufwad_t); 4734 packsize = s * sizeof (bufwad_t); 4735 4736 bigoff = n * chunksize; 4737 bigsize = s * chunksize; 4738 4739 packbuf = umem_alloc(packsize, UMEM_NOFAIL); 4740 bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); 4741 4742 /* 4743 * free_percent of the time, free a range of bigobj rather than 4744 * overwriting it. 4745 */ 4746 freeit = (ztest_random(100) < free_percent); 4747 4748 /* 4749 * Read the current contents of our objects. 
4750 */ 4751 error = dmu_read(os, packobj, packoff, packsize, packbuf, 4752 DMU_READ_PREFETCH); 4753 ASSERT0(error); 4754 error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, 4755 DMU_READ_PREFETCH); 4756 ASSERT0(error); 4757 4758 /* 4759 * Get a tx for the mods to both packobj and bigobj. 4760 */ 4761 tx = dmu_tx_create(os); 4762 4763 dmu_tx_hold_write(tx, packobj, packoff, packsize); 4764 4765 if (freeit) 4766 dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); 4767 else 4768 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 4769 4770 /* This accounts for setting the checksum/compression. */ 4771 dmu_tx_hold_bonus(tx, bigobj); 4772 4773 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4774 if (txg == 0) { 4775 umem_free(packbuf, packsize); 4776 umem_free(bigbuf, bigsize); 4777 umem_free(od, size); 4778 return; 4779 } 4780 4781 enum zio_checksum cksum; 4782 do { 4783 cksum = (enum zio_checksum) 4784 ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); 4785 } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); 4786 dmu_object_set_checksum(os, bigobj, cksum, tx); 4787 4788 enum zio_compress comp; 4789 do { 4790 comp = (enum zio_compress) 4791 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); 4792 } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); 4793 dmu_object_set_compress(os, bigobj, comp, tx); 4794 4795 /* 4796 * For each index from n to n + s, verify that the existing bufwad 4797 * in packobj matches the bufwads at the head and tail of the 4798 * corresponding chunk in bigobj. Then update all three bufwads 4799 * with the new values we want to write out. 
4800 */ 4801 for (i = 0; i < s; i++) { 4802 /* LINTED */ 4803 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 4804 /* LINTED */ 4805 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 4806 /* LINTED */ 4807 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 4808 4809 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 4810 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 4811 4812 if (pack->bw_txg > txg) 4813 fatal(0, "future leak: got %llx, open txg is %llx", 4814 pack->bw_txg, txg); 4815 4816 if (pack->bw_data != 0 && pack->bw_index != n + i) 4817 fatal(0, "wrong index: got %llx, wanted %llx+%llx", 4818 pack->bw_index, n, i); 4819 4820 if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) 4821 fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); 4822 4823 if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) 4824 fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); 4825 4826 if (freeit) { 4827 bzero(pack, sizeof (bufwad_t)); 4828 } else { 4829 pack->bw_index = n + i; 4830 pack->bw_txg = txg; 4831 pack->bw_data = 1 + ztest_random(-2ULL); 4832 } 4833 *bigH = *pack; 4834 *bigT = *pack; 4835 } 4836 4837 /* 4838 * We've verified all the old bufwads, and made new ones. 4839 * Now write them out. 4840 */ 4841 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 4842 4843 if (freeit) { 4844 if (ztest_opts.zo_verbose >= 7) { 4845 (void) printf("freeing offset %llx size %llx" 4846 " txg %llx\n", 4847 (u_longlong_t)bigoff, 4848 (u_longlong_t)bigsize, 4849 (u_longlong_t)txg); 4850 } 4851 VERIFY0(dmu_free_range(os, bigobj, bigoff, bigsize, tx)); 4852 } else { 4853 if (ztest_opts.zo_verbose >= 7) { 4854 (void) printf("writing offset %llx size %llx" 4855 " txg %llx\n", 4856 (u_longlong_t)bigoff, 4857 (u_longlong_t)bigsize, 4858 (u_longlong_t)txg); 4859 } 4860 dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); 4861 } 4862 4863 dmu_tx_commit(tx); 4864 4865 /* 4866 * Sanity check the stuff we just wrote. 
4867 */ 4868 { 4869 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 4870 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 4871 4872 VERIFY0(dmu_read(os, packobj, packoff, 4873 packsize, packcheck, DMU_READ_PREFETCH)); 4874 VERIFY0(dmu_read(os, bigobj, bigoff, 4875 bigsize, bigcheck, DMU_READ_PREFETCH)); 4876 4877 ASSERT0(bcmp(packbuf, packcheck, packsize)); 4878 ASSERT0(bcmp(bigbuf, bigcheck, bigsize)); 4879 4880 umem_free(packcheck, packsize); 4881 umem_free(bigcheck, bigsize); 4882 } 4883 4884 umem_free(packbuf, packsize); 4885 umem_free(bigbuf, bigsize); 4886 umem_free(od, size); 4887} 4888 4889static void 4890compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, 4891 uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) 4892{ 4893 uint64_t i; 4894 bufwad_t *pack; 4895 bufwad_t *bigH; 4896 bufwad_t *bigT; 4897 4898 /* 4899 * For each index from n to n + s, verify that the existing bufwad 4900 * in packobj matches the bufwads at the head and tail of the 4901 * corresponding chunk in bigobj. Then update all three bufwads 4902 * with the new values we want to write out. 
4903 */ 4904 for (i = 0; i < s; i++) { 4905 /* LINTED */ 4906 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 4907 /* LINTED */ 4908 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 4909 /* LINTED */ 4910 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 4911 4912 ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); 4913 ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); 4914 4915 if (pack->bw_txg > txg) 4916 fatal(0, "future leak: got %llx, open txg is %llx", 4917 pack->bw_txg, txg); 4918 4919 if (pack->bw_data != 0 && pack->bw_index != n + i) 4920 fatal(0, "wrong index: got %llx, wanted %llx+%llx", 4921 pack->bw_index, n, i); 4922 4923 if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) 4924 fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); 4925 4926 if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) 4927 fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); 4928 4929 pack->bw_index = n + i; 4930 pack->bw_txg = txg; 4931 pack->bw_data = 1 + ztest_random(-2ULL); 4932 4933 *bigH = *pack; 4934 *bigT = *pack; 4935 } 4936} 4937 4938#undef OD_ARRAY_SIZE 4939#define OD_ARRAY_SIZE 2 4940 4941void 4942ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) 4943{ 4944 objset_t *os = zd->zd_os; 4945 ztest_od_t *od; 4946 dmu_tx_t *tx; 4947 uint64_t i; 4948 int error; 4949 int size; 4950 uint64_t n, s, txg; 4951 bufwad_t *packbuf, *bigbuf; 4952 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 4953 uint64_t blocksize = ztest_random_blocksize(); 4954 uint64_t chunksize = blocksize; 4955 uint64_t regions = 997; 4956 uint64_t stride = 123456789ULL; 4957 uint64_t width = 9; 4958 dmu_buf_t *bonus_db; 4959 arc_buf_t **bigbuf_arcbufs; 4960 dmu_object_info_t doi; 4961 4962 size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; 4963 od = umem_alloc(size, UMEM_NOFAIL); 4964 4965 /* 4966 * This test uses two objects, packobj and bigobj, that are always 4967 * updated together (i.e. in the same tx) so that their contents are 4968 * in sync and can be compared. 
Their contents relate to each other 4969 * in a simple way: packobj is a dense array of 'bufwad' structures, 4970 * while bigobj is a sparse array of the same bufwads. Specifically, 4971 * for any index n, there are three bufwads that should be identical: 4972 * 4973 * packobj, at offset n * sizeof (bufwad_t) 4974 * bigobj, at the head of the nth chunk 4975 * bigobj, at the tail of the nth chunk 4976 * 4977 * The chunk size is set equal to bigobj block size so that 4978 * dmu_assign_arcbuf_by_dbuf() can be tested for object updates. 4979 */ 4980 4981 /* 4982 * Read the directory info. If it's the first time, set things up. 4983 */ 4984 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 4985 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 4986 chunksize); 4987 4988 4989 if (ztest_object_init(zd, od, size, B_FALSE) != 0) { 4990 umem_free(od, size); 4991 return; 4992 } 4993 4994 bigobj = od[0].od_object; 4995 packobj = od[1].od_object; 4996 blocksize = od[0].od_blocksize; 4997 chunksize = blocksize; 4998 ASSERT3U(chunksize, ==, od[1].od_gen); 4999 5000 VERIFY0(dmu_object_info(os, bigobj, &doi)); 5001 VERIFY(ISP2(doi.doi_data_block_size)); 5002 VERIFY3U(chunksize, ==, doi.doi_data_block_size); 5003 VERIFY3U(chunksize, >=, 2 * sizeof (bufwad_t)); 5004 5005 /* 5006 * Pick a random index and compute the offsets into packobj and bigobj. 5007 */ 5008 n = ztest_random(regions) * stride + ztest_random(width); 5009 s = 1 + ztest_random(width - 1); 5010 5011 packoff = n * sizeof (bufwad_t); 5012 packsize = s * sizeof (bufwad_t); 5013 5014 bigoff = n * chunksize; 5015 bigsize = s * chunksize; 5016 5017 packbuf = umem_zalloc(packsize, UMEM_NOFAIL); 5018 bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); 5019 5020 VERIFY0(dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); 5021 5022 bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); 5023 5024 /* 5025 * Iteration 0 test zcopy for DB_UNCACHED dbufs. 
5026 * Iteration 1 test zcopy to already referenced dbufs. 5027 * Iteration 2 test zcopy to dirty dbuf in the same txg. 5028 * Iteration 3 test zcopy to dbuf dirty in previous txg. 5029 * Iteration 4 test zcopy when dbuf is no longer dirty. 5030 * Iteration 5 test zcopy when it can't be done. 5031 * Iteration 6 one more zcopy write. 5032 */ 5033 for (i = 0; i < 7; i++) { 5034 uint64_t j; 5035 uint64_t off; 5036 5037 /* 5038 * In iteration 5 (i == 5) use arcbufs 5039 * that don't match bigobj blksz to test 5040 * dmu_assign_arcbuf_by_dbuf() when it can't directly 5041 * assign an arcbuf to a dbuf. 5042 */ 5043 for (j = 0; j < s; j++) { 5044 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5045 bigbuf_arcbufs[j] = 5046 dmu_request_arcbuf(bonus_db, chunksize); 5047 } else { 5048 bigbuf_arcbufs[2 * j] = 5049 dmu_request_arcbuf(bonus_db, chunksize / 2); 5050 bigbuf_arcbufs[2 * j + 1] = 5051 dmu_request_arcbuf(bonus_db, chunksize / 2); 5052 } 5053 } 5054 5055 /* 5056 * Get a tx for the mods to both packobj and bigobj. 5057 */ 5058 tx = dmu_tx_create(os); 5059 5060 dmu_tx_hold_write(tx, packobj, packoff, packsize); 5061 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 5062 5063 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5064 if (txg == 0) { 5065 umem_free(packbuf, packsize); 5066 umem_free(bigbuf, bigsize); 5067 for (j = 0; j < s; j++) { 5068 if (i != 5 || 5069 chunksize < (SPA_MINBLOCKSIZE * 2)) { 5070 dmu_return_arcbuf(bigbuf_arcbufs[j]); 5071 } else { 5072 dmu_return_arcbuf( 5073 bigbuf_arcbufs[2 * j]); 5074 dmu_return_arcbuf( 5075 bigbuf_arcbufs[2 * j + 1]); 5076 } 5077 } 5078 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5079 umem_free(od, size); 5080 dmu_buf_rele(bonus_db, FTAG); 5081 return; 5082 } 5083 5084 /* 5085 * 50% of the time don't read objects in the 1st iteration to 5086 * test dmu_assign_arcbuf_by_dbuf() for the case when there are 5087 * no existing dbufs for the specified offsets. 
5088 */ 5089 if (i != 0 || ztest_random(2) != 0) { 5090 error = dmu_read(os, packobj, packoff, 5091 packsize, packbuf, DMU_READ_PREFETCH); 5092 ASSERT0(error); 5093 error = dmu_read(os, bigobj, bigoff, bigsize, 5094 bigbuf, DMU_READ_PREFETCH); 5095 ASSERT0(error); 5096 } 5097 compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, 5098 n, chunksize, txg); 5099 5100 /* 5101 * We've verified all the old bufwads, and made new ones. 5102 * Now write them out. 5103 */ 5104 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 5105 if (ztest_opts.zo_verbose >= 7) { 5106 (void) printf("writing offset %llx size %llx" 5107 " txg %llx\n", 5108 (u_longlong_t)bigoff, 5109 (u_longlong_t)bigsize, 5110 (u_longlong_t)txg); 5111 } 5112 for (off = bigoff, j = 0; j < s; j++, off += chunksize) { 5113 dmu_buf_t *dbt; 5114 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5115 bcopy((caddr_t)bigbuf + (off - bigoff), 5116 bigbuf_arcbufs[j]->b_data, chunksize); 5117 } else { 5118 bcopy((caddr_t)bigbuf + (off - bigoff), 5119 bigbuf_arcbufs[2 * j]->b_data, 5120 chunksize / 2); 5121 bcopy((caddr_t)bigbuf + (off - bigoff) + 5122 chunksize / 2, 5123 bigbuf_arcbufs[2 * j + 1]->b_data, 5124 chunksize / 2); 5125 } 5126 5127 if (i == 1) { 5128 VERIFY(dmu_buf_hold(os, bigobj, off, 5129 FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); 5130 } 5131 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 5132 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5133 off, bigbuf_arcbufs[j], tx)); 5134 } else { 5135 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5136 off, bigbuf_arcbufs[2 * j], tx)); 5137 VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, 5138 off + chunksize / 2, 5139 bigbuf_arcbufs[2 * j + 1], tx)); 5140 } 5141 if (i == 1) { 5142 dmu_buf_rele(dbt, FTAG); 5143 } 5144 } 5145 dmu_tx_commit(tx); 5146 5147 /* 5148 * Sanity check the stuff we just wrote. 
5149 */ 5150 { 5151 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 5152 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 5153 5154 VERIFY0(dmu_read(os, packobj, packoff, 5155 packsize, packcheck, DMU_READ_PREFETCH)); 5156 VERIFY0(dmu_read(os, bigobj, bigoff, 5157 bigsize, bigcheck, DMU_READ_PREFETCH)); 5158 5159 ASSERT0(bcmp(packbuf, packcheck, packsize)); 5160 ASSERT0(bcmp(bigbuf, bigcheck, bigsize)); 5161 5162 umem_free(packcheck, packsize); 5163 umem_free(bigcheck, bigsize); 5164 } 5165 if (i == 2) { 5166 txg_wait_open(dmu_objset_pool(os), 0, B_TRUE); 5167 } else if (i == 3) { 5168 txg_wait_synced(dmu_objset_pool(os), 0); 5169 } 5170 } 5171 5172 dmu_buf_rele(bonus_db, FTAG); 5173 umem_free(packbuf, packsize); 5174 umem_free(bigbuf, bigsize); 5175 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 5176 umem_free(od, size); 5177} 5178 5179/* ARGSUSED */ 5180void 5181ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) 5182{ 5183 ztest_od_t *od; 5184 5185 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5186 uint64_t offset = (1ULL << (ztest_random(20) + 43)) + 5187 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5188 5189 /* 5190 * Have multiple threads write to large offsets in an object 5191 * to verify that parallel writes to an object -- even to the 5192 * same blocks within the object -- doesn't cause any trouble. 
5193 */ 5194 ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 5195 5196 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) 5197 return; 5198 5199 while (ztest_random(10) != 0) 5200 ztest_io(zd, od->od_object, offset); 5201 5202 umem_free(od, sizeof (ztest_od_t)); 5203} 5204 5205void 5206ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) 5207{ 5208 ztest_od_t *od; 5209 uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + 5210 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 5211 uint64_t count = ztest_random(20) + 1; 5212 uint64_t blocksize = ztest_random_blocksize(); 5213 void *data; 5214 5215 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5216 5217 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); 5218 5219 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5220 !ztest_random(2)) != 0) { 5221 umem_free(od, sizeof (ztest_od_t)); 5222 return; 5223 } 5224 5225 if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) { 5226 umem_free(od, sizeof (ztest_od_t)); 5227 return; 5228 } 5229 5230 ztest_prealloc(zd, od->od_object, offset, count * blocksize); 5231 5232 data = umem_zalloc(blocksize, UMEM_NOFAIL); 5233 5234 while (ztest_random(count) != 0) { 5235 uint64_t randoff = offset + (ztest_random(count) * blocksize); 5236 if (ztest_write(zd, od->od_object, randoff, blocksize, 5237 data) != 0) 5238 break; 5239 while (ztest_random(4) != 0) 5240 ztest_io(zd, od->od_object, randoff); 5241 } 5242 5243 umem_free(data, blocksize); 5244 umem_free(od, sizeof (ztest_od_t)); 5245} 5246 5247/* 5248 * Verify that zap_{create,destroy,add,remove,update} work as expected. 
5249 */ 5250#define ZTEST_ZAP_MIN_INTS 1 5251#define ZTEST_ZAP_MAX_INTS 4 5252#define ZTEST_ZAP_MAX_PROPS 1000 5253 5254void 5255ztest_zap(ztest_ds_t *zd, uint64_t id) 5256{ 5257 objset_t *os = zd->zd_os; 5258 ztest_od_t *od; 5259 uint64_t object; 5260 uint64_t txg, last_txg; 5261 uint64_t value[ZTEST_ZAP_MAX_INTS]; 5262 uint64_t zl_ints, zl_intsize, prop; 5263 int i, ints; 5264 dmu_tx_t *tx; 5265 char propname[100], txgname[100]; 5266 int error; 5267 char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; 5268 5269 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5270 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5271 5272 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5273 !ztest_random(2)) != 0) 5274 goto out; 5275 5276 object = od->od_object; 5277 5278 /* 5279 * Generate a known hash collision, and verify that 5280 * we can lookup and remove both entries. 5281 */ 5282 tx = dmu_tx_create(os); 5283 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5284 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5285 if (txg == 0) 5286 goto out; 5287 for (i = 0; i < 2; i++) { 5288 value[i] = i; 5289 VERIFY0(zap_add(os, object, hc[i], sizeof (uint64_t), 5290 1, &value[i], tx)); 5291 } 5292 for (i = 0; i < 2; i++) { 5293 VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], 5294 sizeof (uint64_t), 1, &value[i], tx)); 5295 VERIFY0( 5296 zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); 5297 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5298 ASSERT3U(zl_ints, ==, 1); 5299 } 5300 for (i = 0; i < 2; i++) { 5301 VERIFY0(zap_remove(os, object, hc[i], tx)); 5302 } 5303 dmu_tx_commit(tx); 5304 5305 /* 5306 * Generate a bunch of random entries. 
5307 */ 5308 ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); 5309 5310 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5311 (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); 5312 (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); 5313 bzero(value, sizeof (value)); 5314 last_txg = 0; 5315 5316 /* 5317 * If these zap entries already exist, validate their contents. 5318 */ 5319 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5320 if (error == 0) { 5321 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5322 ASSERT3U(zl_ints, ==, 1); 5323 5324 VERIFY0(zap_lookup(os, object, txgname, zl_intsize, 5325 zl_ints, &last_txg)); 5326 5327 VERIFY0(zap_length(os, object, propname, &zl_intsize, 5328 &zl_ints)); 5329 5330 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 5331 ASSERT3U(zl_ints, ==, ints); 5332 5333 VERIFY0(zap_lookup(os, object, propname, zl_intsize, 5334 zl_ints, value)); 5335 5336 for (i = 0; i < ints; i++) { 5337 ASSERT3U(value[i], ==, last_txg + object + i); 5338 } 5339 } else { 5340 ASSERT3U(error, ==, ENOENT); 5341 } 5342 5343 /* 5344 * Atomically update two entries in our zap object. 5345 * The first is named txg_%llu, and contains the txg 5346 * in which the property was last updated. The second 5347 * is named prop_%llu, and the nth element of its value 5348 * should be txg + object + n. 5349 */ 5350 tx = dmu_tx_create(os); 5351 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5352 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5353 if (txg == 0) 5354 goto out; 5355 5356 if (last_txg > txg) 5357 fatal(0, "zap future leak: old %llu new %llu", last_txg, txg); 5358 5359 for (i = 0; i < ints; i++) 5360 value[i] = txg + object + i; 5361 5362 VERIFY0(zap_update(os, object, txgname, sizeof (uint64_t), 5363 1, &txg, tx)); 5364 VERIFY0(zap_update(os, object, propname, sizeof (uint64_t), 5365 ints, value, tx)); 5366 5367 dmu_tx_commit(tx); 5368 5369 /* 5370 * Remove a random pair of entries. 
5371 */ 5372 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 5373 (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); 5374 (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); 5375 5376 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 5377 5378 if (error == ENOENT) 5379 goto out; 5380 5381 ASSERT0(error); 5382 5383 tx = dmu_tx_create(os); 5384 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5385 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5386 if (txg == 0) 5387 goto out; 5388 VERIFY0(zap_remove(os, object, txgname, tx)); 5389 VERIFY0(zap_remove(os, object, propname, tx)); 5390 dmu_tx_commit(tx); 5391out: 5392 umem_free(od, sizeof (ztest_od_t)); 5393} 5394 5395/* 5396 * Test case to test the upgrading of a microzap to fatzap. 5397 */ 5398void 5399ztest_fzap(ztest_ds_t *zd, uint64_t id) 5400{ 5401 objset_t *os = zd->zd_os; 5402 ztest_od_t *od; 5403 uint64_t object, txg; 5404 int i; 5405 5406 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5407 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 5408 5409 if (ztest_object_init(zd, od, sizeof (ztest_od_t), 5410 !ztest_random(2)) != 0) 5411 goto out; 5412 object = od->od_object; 5413 5414 /* 5415 * Add entries to this ZAP and make sure it spills over 5416 * and gets upgraded to a fatzap. Also, since we are adding 5417 * 2050 entries we should see ptrtbl growth and leaf-block split. 
5418 */ 5419 for (i = 0; i < 2050; i++) { 5420 char name[ZFS_MAX_DATASET_NAME_LEN]; 5421 uint64_t value = i; 5422 dmu_tx_t *tx; 5423 int error; 5424 5425 (void) snprintf(name, sizeof (name), "fzap-%llu-%llu", 5426 (u_longlong_t)id, (u_longlong_t)value); 5427 5428 tx = dmu_tx_create(os); 5429 dmu_tx_hold_zap(tx, object, B_TRUE, name); 5430 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5431 if (txg == 0) 5432 goto out; 5433 error = zap_add(os, object, name, sizeof (uint64_t), 1, 5434 &value, tx); 5435 ASSERT(error == 0 || error == EEXIST); 5436 dmu_tx_commit(tx); 5437 } 5438out: 5439 umem_free(od, sizeof (ztest_od_t)); 5440} 5441 5442/* ARGSUSED */ 5443void 5444ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) 5445{ 5446 objset_t *os = zd->zd_os; 5447 ztest_od_t *od; 5448 uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; 5449 dmu_tx_t *tx; 5450 int i, namelen, error; 5451 int micro = ztest_random(2); 5452 char name[20], string_value[20]; 5453 void *data; 5454 5455 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5456 ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0); 5457 5458 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { 5459 umem_free(od, sizeof (ztest_od_t)); 5460 return; 5461 } 5462 5463 object = od->od_object; 5464 5465 /* 5466 * Generate a random name of the form 'xxx.....' where each 5467 * x is a random printable character and the dots are dots. 5468 * There are 94 such characters, and the name length goes from 5469 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. 5470 */ 5471 namelen = ztest_random(sizeof (name) - 5) + 5 + 1; 5472 5473 for (i = 0; i < 3; i++) 5474 name[i] = '!' + ztest_random('~' - '!' 
+ 1); 5475 for (; i < namelen - 1; i++) 5476 name[i] = '.'; 5477 name[i] = '\0'; 5478 5479 if ((namelen & 1) || micro) { 5480 wsize = sizeof (txg); 5481 wc = 1; 5482 data = &txg; 5483 } else { 5484 wsize = 1; 5485 wc = namelen; 5486 data = string_value; 5487 } 5488 5489 count = -1ULL; 5490 VERIFY0(zap_count(os, object, &count)); 5491 ASSERT3S(count, !=, -1ULL); 5492 5493 /* 5494 * Select an operation: length, lookup, add, update, remove. 5495 */ 5496 i = ztest_random(5); 5497 5498 if (i >= 2) { 5499 tx = dmu_tx_create(os); 5500 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 5501 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 5502 if (txg == 0) { 5503 umem_free(od, sizeof (ztest_od_t)); 5504 return; 5505 } 5506 bcopy(name, string_value, namelen); 5507 } else { 5508 tx = NULL; 5509 txg = 0; 5510 bzero(string_value, namelen); 5511 } 5512 5513 switch (i) { 5514 5515 case 0: 5516 error = zap_length(os, object, name, &zl_wsize, &zl_wc); 5517 if (error == 0) { 5518 ASSERT3U(wsize, ==, zl_wsize); 5519 ASSERT3U(wc, ==, zl_wc); 5520 } else { 5521 ASSERT3U(error, ==, ENOENT); 5522 } 5523 break; 5524 5525 case 1: 5526 error = zap_lookup(os, object, name, wsize, wc, data); 5527 if (error == 0) { 5528 if (data == string_value && 5529 bcmp(name, data, namelen) != 0) 5530 fatal(0, "name '%s' != val '%s' len %d", 5531 name, data, namelen); 5532 } else { 5533 ASSERT3U(error, ==, ENOENT); 5534 } 5535 break; 5536 5537 case 2: 5538 error = zap_add(os, object, name, wsize, wc, data, tx); 5539 ASSERT(error == 0 || error == EEXIST); 5540 break; 5541 5542 case 3: 5543 VERIFY0(zap_update(os, object, name, wsize, wc, data, tx)); 5544 break; 5545 5546 case 4: 5547 error = zap_remove(os, object, name, tx); 5548 ASSERT(error == 0 || error == ENOENT); 5549 break; 5550 } 5551 5552 if (tx != NULL) 5553 dmu_tx_commit(tx); 5554 5555 umem_free(od, sizeof (ztest_od_t)); 5556} 5557 5558/* 5559 * Commit callback data. 
5560 */ 5561typedef struct ztest_cb_data { 5562 list_node_t zcd_node; 5563 uint64_t zcd_txg; 5564 int zcd_expected_err; 5565 boolean_t zcd_added; 5566 boolean_t zcd_called; 5567 spa_t *zcd_spa; 5568} ztest_cb_data_t; 5569 5570/* This is the actual commit callback function */ 5571static void 5572ztest_commit_callback(void *arg, int error) 5573{ 5574 ztest_cb_data_t *data = arg; 5575 uint64_t synced_txg; 5576 5577 VERIFY3P(data, !=, NULL); 5578 VERIFY3S(data->zcd_expected_err, ==, error); 5579 VERIFY(!data->zcd_called); 5580 5581 synced_txg = spa_last_synced_txg(data->zcd_spa); 5582 if (data->zcd_txg > synced_txg) 5583 fatal(0, "commit callback of txg %" PRIu64 " called prematurely" 5584 ", last synced txg = %" PRIu64 "\n", data->zcd_txg, 5585 synced_txg); 5586 5587 data->zcd_called = B_TRUE; 5588 5589 if (error == ECANCELED) { 5590 ASSERT0(data->zcd_txg); 5591 ASSERT(!data->zcd_added); 5592 5593 /* 5594 * The private callback data should be destroyed here, but 5595 * since we are going to check the zcd_called field after 5596 * dmu_tx_abort(), we will destroy it there. 
5597 */ 5598 return; 5599 } 5600 5601 ASSERT(data->zcd_added); 5602 ASSERT3U(data->zcd_txg, !=, 0); 5603 5604 (void) mutex_enter(&zcl.zcl_callbacks_lock); 5605 5606 /* See if this cb was called more quickly */ 5607 if ((synced_txg - data->zcd_txg) < zc_min_txg_delay) 5608 zc_min_txg_delay = synced_txg - data->zcd_txg; 5609 5610 /* Remove our callback from the list */ 5611 list_remove(&zcl.zcl_callbacks, data); 5612 5613 (void) mutex_exit(&zcl.zcl_callbacks_lock); 5614 5615 umem_free(data, sizeof (ztest_cb_data_t)); 5616} 5617 5618/* Allocate and initialize callback data structure */ 5619static ztest_cb_data_t * 5620ztest_create_cb_data(objset_t *os, uint64_t txg) 5621{ 5622 ztest_cb_data_t *cb_data; 5623 5624 cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); 5625 5626 cb_data->zcd_txg = txg; 5627 cb_data->zcd_spa = dmu_objset_spa(os); 5628 list_link_init(&cb_data->zcd_node); 5629 5630 return (cb_data); 5631} 5632 5633/* 5634 * Commit callback test. 5635 */ 5636void 5637ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) 5638{ 5639 objset_t *os = zd->zd_os; 5640 ztest_od_t *od; 5641 dmu_tx_t *tx; 5642 ztest_cb_data_t *cb_data[3], *tmp_cb; 5643 uint64_t old_txg, txg; 5644 int i, error = 0; 5645 5646 od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); 5647 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 5648 5649 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { 5650 umem_free(od, sizeof (ztest_od_t)); 5651 return; 5652 } 5653 5654 tx = dmu_tx_create(os); 5655 5656 cb_data[0] = ztest_create_cb_data(os, 0); 5657 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); 5658 5659 dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t)); 5660 5661 /* Every once in a while, abort the transaction on purpose */ 5662 if (ztest_random(100) == 0) 5663 error = -1; 5664 5665 if (!error) 5666 error = dmu_tx_assign(tx, TXG_NOWAIT); 5667 5668 txg = error ? 
0 : dmu_tx_get_txg(tx); 5669 5670 cb_data[0]->zcd_txg = txg; 5671 cb_data[1] = ztest_create_cb_data(os, txg); 5672 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); 5673 5674 if (error) { 5675 /* 5676 * It's not a strict requirement to call the registered 5677 * callbacks from inside dmu_tx_abort(), but that's what 5678 * it's supposed to happen in the current implementation 5679 * so we will check for that. 5680 */ 5681 for (i = 0; i < 2; i++) { 5682 cb_data[i]->zcd_expected_err = ECANCELED; 5683 VERIFY(!cb_data[i]->zcd_called); 5684 } 5685 5686 dmu_tx_abort(tx); 5687 5688 for (i = 0; i < 2; i++) { 5689 VERIFY(cb_data[i]->zcd_called); 5690 umem_free(cb_data[i], sizeof (ztest_cb_data_t)); 5691 } 5692 5693 umem_free(od, sizeof (ztest_od_t)); 5694 return; 5695 } 5696 5697 cb_data[2] = ztest_create_cb_data(os, txg); 5698 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); 5699 5700 /* 5701 * Read existing data to make sure there isn't a future leak. 5702 */ 5703 VERIFY0(dmu_read(os, od->od_object, 0, sizeof (uint64_t), 5704 &old_txg, DMU_READ_PREFETCH)); 5705 5706 if (old_txg > txg) 5707 fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64, 5708 old_txg, txg); 5709 5710 dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx); 5711 5712 (void) mutex_enter(&zcl.zcl_callbacks_lock); 5713 5714 /* 5715 * Since commit callbacks don't have any ordering requirement and since 5716 * it is theoretically possible for a commit callback to be called 5717 * after an arbitrary amount of time has elapsed since its txg has been 5718 * synced, it is difficult to reliably determine whether a commit 5719 * callback hasn't been called due to high load or due to a flawed 5720 * implementation. 5721 * 5722 * In practice, we will assume that if after a certain number of txgs a 5723 * commit callback hasn't been called, then most likely there's an 5724 * implementation bug.. 
5725 */ 5726 tmp_cb = list_head(&zcl.zcl_callbacks); 5727 if (tmp_cb != NULL && 5728 tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) { 5729 fatal(0, "Commit callback threshold exceeded, oldest txg: %" 5730 PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg); 5731 } 5732 5733 /* 5734 * Let's find the place to insert our callbacks. 5735 * 5736 * Even though the list is ordered by txg, it is possible for the 5737 * insertion point to not be the end because our txg may already be 5738 * quiescing at this point and other callbacks in the open txg 5739 * (from other objsets) may have sneaked in. 5740 */ 5741 tmp_cb = list_tail(&zcl.zcl_callbacks); 5742 while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) 5743 tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); 5744 5745 /* Add the 3 callbacks to the list */ 5746 for (i = 0; i < 3; i++) { 5747 if (tmp_cb == NULL) 5748 list_insert_head(&zcl.zcl_callbacks, cb_data[i]); 5749 else 5750 list_insert_after(&zcl.zcl_callbacks, tmp_cb, 5751 cb_data[i]); 5752 5753 cb_data[i]->zcd_added = B_TRUE; 5754 VERIFY(!cb_data[i]->zcd_called); 5755 5756 tmp_cb = cb_data[i]; 5757 } 5758 5759 zc_cb_counter += 3; 5760 5761 (void) mutex_exit(&zcl.zcl_callbacks_lock); 5762 5763 dmu_tx_commit(tx); 5764 5765 umem_free(od, sizeof (ztest_od_t)); 5766} 5767 5768/* 5769 * Visit each object in the dataset. Verify that its properties 5770 * are consistent what was stored in the block tag when it was created, 5771 * and that its unused bonus buffer space has not been overwritten. 
5772 */ 5773/* ARGSUSED */ 5774void 5775ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) 5776{ 5777 objset_t *os = zd->zd_os; 5778 uint64_t obj; 5779 int err = 0; 5780 5781 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { 5782 ztest_block_tag_t *bt = NULL; 5783 dmu_object_info_t doi; 5784 dmu_buf_t *db; 5785 5786 ztest_object_lock(zd, obj, RL_READER); 5787 if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) { 5788 ztest_object_unlock(zd, obj); 5789 continue; 5790 } 5791 5792 dmu_object_info_from_db(db, &doi); 5793 if (doi.doi_bonus_size >= sizeof (*bt)) 5794 bt = ztest_bt_bonus(db); 5795 5796 if (bt && bt->bt_magic == BT_MAGIC) { 5797 ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, 5798 bt->bt_offset, bt->bt_gen, bt->bt_txg, 5799 bt->bt_crtxg); 5800 ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); 5801 } 5802 5803 dmu_buf_rele(db, FTAG); 5804 ztest_object_unlock(zd, obj); 5805 } 5806} 5807 5808/* ARGSUSED */ 5809void 5810ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) 5811{ 5812 zfs_prop_t proplist[] = { 5813 ZFS_PROP_CHECKSUM, 5814 ZFS_PROP_COMPRESSION, 5815 ZFS_PROP_COPIES, 5816 ZFS_PROP_DEDUP 5817 }; 5818 int p; 5819 5820 (void) pthread_rwlock_rdlock(&ztest_name_lock); 5821 5822 for (p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) 5823 (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], 5824 ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); 5825 5826 VERIFY0(ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE, 5827 ztest_random_blocksize(), (int)ztest_random(2))); 5828 5829 (void) pthread_rwlock_unlock(&ztest_name_lock); 5830} 5831 5832/* ARGSUSED */ 5833void 5834ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) 5835{ 5836 nvlist_t *props = NULL; 5837 5838 (void) pthread_rwlock_rdlock(&ztest_name_lock); 5839 5840 (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); 5841 5842 VERIFY0(spa_prop_get(ztest_spa, &props)); 5843 5844 if (ztest_opts.zo_verbose >= 6) 5845 dump_nvlist(props, 
4); 5846 5847 fnvlist_free(props); 5848 5849 (void) pthread_rwlock_unlock(&ztest_name_lock); 5850} 5851 5852static int 5853user_release_one(const char *snapname, const char *holdname) 5854{ 5855 nvlist_t *snaps, *holds; 5856 int error; 5857 5858 snaps = fnvlist_alloc(); 5859 holds = fnvlist_alloc(); 5860 fnvlist_add_boolean(holds, holdname); 5861 fnvlist_add_nvlist(snaps, snapname, holds); 5862 fnvlist_free(holds); 5863 error = dsl_dataset_user_release(snaps, NULL); 5864 fnvlist_free(snaps); 5865 return (error); 5866} 5867 5868/* 5869 * Test snapshot hold/release and deferred destroy. 5870 */ 5871void 5872ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) 5873{ 5874 int error; 5875 objset_t *os = zd->zd_os; 5876 objset_t *origin; 5877 char snapname[100]; 5878 char fullname[100]; 5879 char clonename[100]; 5880 char tag[100]; 5881 char osname[ZFS_MAX_DATASET_NAME_LEN]; 5882 nvlist_t *holds; 5883 5884 (void) pthread_rwlock_rdlock(&ztest_name_lock); 5885 5886 dmu_objset_name(os, osname); 5887 5888 (void) snprintf(snapname, sizeof (snapname), "sh1_%llu", 5889 (u_longlong_t)id); 5890 (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); 5891 (void) snprintf(clonename, sizeof (clonename), 5892 "%s/ch1_%llu", osname, (u_longlong_t)id); 5893 (void) snprintf(tag, sizeof (tag), "tag_%llu", (u_longlong_t)id); 5894 5895 /* 5896 * Clean up from any previous run. 5897 */ 5898 error = dsl_destroy_head(clonename); 5899 if (error != ENOENT) 5900 ASSERT0(error); 5901 error = user_release_one(fullname, tag); 5902 if (error != ESRCH && error != ENOENT) 5903 ASSERT0(error); 5904 error = dsl_destroy_snapshot(fullname, B_FALSE); 5905 if (error != ENOENT) 5906 ASSERT0(error); 5907 5908 /* 5909 * Create snapshot, clone it, mark snap for deferred destroy, 5910 * destroy clone, verify snap was also destroyed. 
5911 */ 5912 error = dmu_objset_snapshot_one(osname, snapname); 5913 if (error) { 5914 if (error == ENOSPC) { 5915 ztest_record_enospc("dmu_objset_snapshot"); 5916 goto out; 5917 } 5918 fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); 5919 } 5920 5921 error = dmu_objset_clone(clonename, fullname); 5922 if (error) { 5923 if (error == ENOSPC) { 5924 ztest_record_enospc("dmu_objset_clone"); 5925 goto out; 5926 } 5927 fatal(0, "dmu_objset_clone(%s) = %d", clonename, error); 5928 } 5929 5930 error = dsl_destroy_snapshot(fullname, B_TRUE); 5931 if (error) { 5932 fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 5933 fullname, error); 5934 } 5935 5936 error = dsl_destroy_head(clonename); 5937 if (error) 5938 fatal(0, "dsl_destroy_head(%s) = %d", clonename, error); 5939 5940 error = dmu_objset_hold(fullname, FTAG, &origin); 5941 if (error != ENOENT) 5942 fatal(0, "dmu_objset_hold(%s) = %d", fullname, error); 5943 5944 /* 5945 * Create snapshot, add temporary hold, verify that we can't 5946 * destroy a held snapshot, mark for deferred destroy, 5947 * release hold, verify snapshot was destroyed. 
5948 */ 5949 error = dmu_objset_snapshot_one(osname, snapname); 5950 if (error) { 5951 if (error == ENOSPC) { 5952 ztest_record_enospc("dmu_objset_snapshot"); 5953 goto out; 5954 } 5955 fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); 5956 } 5957 5958 holds = fnvlist_alloc(); 5959 fnvlist_add_string(holds, fullname, tag); 5960 error = dsl_dataset_user_hold(holds, 0, NULL); 5961 fnvlist_free(holds); 5962 5963 if (error == ENOSPC) { 5964 ztest_record_enospc("dsl_dataset_user_hold"); 5965 goto out; 5966 } else if (error) { 5967 fatal(0, "dsl_dataset_user_hold(%s, %s) = %u", 5968 fullname, tag, error); 5969 } 5970 5971 error = dsl_destroy_snapshot(fullname, B_FALSE); 5972 if (error != EBUSY) { 5973 fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d", 5974 fullname, error); 5975 } 5976 5977 error = dsl_destroy_snapshot(fullname, B_TRUE); 5978 if (error) { 5979 fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 5980 fullname, error); 5981 } 5982 5983 error = user_release_one(fullname, tag); 5984 if (error) 5985 fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error); 5986 5987 VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); 5988 5989out: 5990 (void) pthread_rwlock_unlock(&ztest_name_lock); 5991} 5992 5993/* 5994 * Inject random faults into the on-disk data. 
5995 */ 5996/* ARGSUSED */ 5997void 5998ztest_fault_inject(ztest_ds_t *zd, uint64_t id) 5999{ 6000 ztest_shared_t *zs = ztest_shared; 6001 spa_t *spa = ztest_spa; 6002 int fd; 6003 uint64_t offset; 6004 uint64_t leaves; 6005 uint64_t bad = 0x1990c0ffeedecadeull; 6006 uint64_t top, leaf; 6007 char *path0; 6008 char *pathrand; 6009 size_t fsize; 6010 int bshift = SPA_MAXBLOCKSHIFT + 2; 6011 int iters = 1000; 6012 int maxfaults; 6013 int mirror_save; 6014 vdev_t *vd0 = NULL; 6015 uint64_t guid0 = 0; 6016 boolean_t islog = B_FALSE; 6017 6018 path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 6019 pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 6020 6021 mutex_enter(&ztest_vdev_lock); 6022 6023 /* 6024 * Device removal is in progress, fault injection must be disabled 6025 * until it completes and the pool is scrubbed. The fault injection 6026 * strategy for damaging blocks does not take in to account evacuated 6027 * blocks which may have already been damaged. 6028 */ 6029 if (ztest_device_removal_active) { 6030 mutex_exit(&ztest_vdev_lock); 6031 goto out; 6032 } 6033 6034 maxfaults = MAXFAULTS(zs); 6035 leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; 6036 mirror_save = zs->zs_mirrors; 6037 mutex_exit(&ztest_vdev_lock); 6038 6039 ASSERT3U(leaves, >=, 1); 6040 6041 /* 6042 * While ztest is running the number of leaves will not change. This 6043 * is critical for the fault injection logic as it determines where 6044 * errors can be safely injected such that they are always repairable. 6045 * 6046 * When restarting ztest a different number of leaves may be requested 6047 * which will shift the regions to be damaged. This is fine as long 6048 * as the pool has been scrubbed prior to using the new mapping. 6049 * Failure to do can result in non-repairable damage being injected. 6050 */ 6051 if (ztest_pool_scrubbed == B_FALSE) 6052 goto out; 6053 6054 /* 6055 * Grab the name lock as reader. 
There are some operations 6056 * which don't like to have their vdevs changed while 6057 * they are in progress (i.e. spa_change_guid). Those 6058 * operations will have grabbed the name lock as writer. 6059 */ 6060 (void) pthread_rwlock_rdlock(&ztest_name_lock); 6061 6062 /* 6063 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. 6064 */ 6065 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6066 6067 if (ztest_random(2) == 0) { 6068 /* 6069 * Inject errors on a normal data device or slog device. 6070 */ 6071 top = ztest_random_vdev_top(spa, B_TRUE); 6072 leaf = ztest_random(leaves) + zs->zs_splits; 6073 6074 /* 6075 * Generate paths to the first leaf in this top-level vdev, 6076 * and to the random leaf we selected. We'll induce transient 6077 * write failures and random online/offline activity on leaf 0, 6078 * and we'll write random garbage to the randomly chosen leaf. 6079 */ 6080 (void) snprintf(path0, MAXPATHLEN, ztest_dev_template, 6081 ztest_opts.zo_dir, ztest_opts.zo_pool, 6082 top * leaves + zs->zs_splits); 6083 (void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template, 6084 ztest_opts.zo_dir, ztest_opts.zo_pool, 6085 top * leaves + leaf); 6086 6087 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); 6088 if (vd0 != NULL && vd0->vdev_top->vdev_islog) 6089 islog = B_TRUE; 6090 6091 /* 6092 * If the top-level vdev needs to be resilvered 6093 * then we only allow faults on the device that is 6094 * resilvering. 6095 */ 6096 if (vd0 != NULL && maxfaults != 1 && 6097 (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || 6098 vd0->vdev_resilver_txg != 0)) { 6099 /* 6100 * Make vd0 explicitly claim to be unreadable, 6101 * or unwritable, or reach behind its back 6102 * and close the underlying fd. We can do this if 6103 * maxfaults == 0 because we'll fail and reexecute, 6104 * and we can do it if maxfaults >= 2 because we'll 6105 * have enough redundancy. 
If maxfaults == 1, the 6106 * combination of this with injection of random data 6107 * corruption below exceeds the pool's fault tolerance. 6108 */ 6109 vdev_file_t *vf = vd0->vdev_tsd; 6110 6111 zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", 6112 (long long)vd0->vdev_id, (int)maxfaults); 6113 6114 if (vf != NULL && ztest_random(3) == 0) { 6115 (void) close(vf->vf_file->f_fd); 6116 vf->vf_file->f_fd = -1; 6117 } else if (ztest_random(2) == 0) { 6118 vd0->vdev_cant_read = B_TRUE; 6119 } else { 6120 vd0->vdev_cant_write = B_TRUE; 6121 } 6122 guid0 = vd0->vdev_guid; 6123 } 6124 } else { 6125 /* 6126 * Inject errors on an l2cache device. 6127 */ 6128 spa_aux_vdev_t *sav = &spa->spa_l2cache; 6129 6130 if (sav->sav_count == 0) { 6131 spa_config_exit(spa, SCL_STATE, FTAG); 6132 (void) pthread_rwlock_unlock(&ztest_name_lock); 6133 goto out; 6134 } 6135 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; 6136 guid0 = vd0->vdev_guid; 6137 (void) strcpy(path0, vd0->vdev_path); 6138 (void) strcpy(pathrand, vd0->vdev_path); 6139 6140 leaf = 0; 6141 leaves = 1; 6142 maxfaults = INT_MAX; /* no limit on cache devices */ 6143 } 6144 6145 spa_config_exit(spa, SCL_STATE, FTAG); 6146 (void) pthread_rwlock_unlock(&ztest_name_lock); 6147 6148 /* 6149 * If we can tolerate two or more faults, or we're dealing 6150 * with a slog, randomly online/offline vd0. 6151 */ 6152 if ((maxfaults >= 2 || islog) && guid0 != 0) { 6153 if (ztest_random(10) < 6) { 6154 int flags = (ztest_random(2) == 0 ? 6155 ZFS_OFFLINE_TEMPORARY : 0); 6156 6157 /* 6158 * We have to grab the zs_name_lock as writer to 6159 * prevent a race between offlining a slog and 6160 * destroying a dataset. Offlining the slog will 6161 * grab a reference on the dataset which may cause 6162 * dsl_destroy_head() to fail with EBUSY thus 6163 * leaving the dataset in an inconsistent state. 
6164 */ 6165 if (islog) 6166 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6167 6168 VERIFY3U(vdev_offline(spa, guid0, flags), !=, EBUSY); 6169 6170 if (islog) 6171 (void) pthread_rwlock_unlock(&ztest_name_lock); 6172 } else { 6173 /* 6174 * Ideally we would like to be able to randomly 6175 * call vdev_[on|off]line without holding locks 6176 * to force unpredictable failures but the side 6177 * effects of vdev_[on|off]line prevent us from 6178 * doing so. We grab the ztest_vdev_lock here to 6179 * prevent a race between injection testing and 6180 * aux_vdev removal. 6181 */ 6182 mutex_enter(&ztest_vdev_lock); 6183 (void) vdev_online(spa, guid0, 0, NULL); 6184 mutex_exit(&ztest_vdev_lock); 6185 } 6186 } 6187 6188 if (maxfaults == 0) 6189 goto out; 6190 6191 /* 6192 * We have at least single-fault tolerance, so inject data corruption. 6193 */ 6194 fd = open(pathrand, O_RDWR); 6195 6196 if (fd == -1) /* we hit a gap in the device namespace */ 6197 goto out; 6198 6199 fsize = lseek(fd, 0, SEEK_END); 6200 6201 while (--iters != 0) { 6202 /* 6203 * The offset must be chosen carefully to ensure that 6204 * we do not inject a given logical block with errors 6205 * on two different leaf devices, because ZFS can not 6206 * tolerate that (if maxfaults==1). 6207 * 6208 * To achieve this we divide each leaf device into 6209 * chunks of size (# leaves * SPA_MAXBLOCKSIZE * 4). 6210 * Each chunk is further divided into error-injection 6211 * ranges (can accept errors) and clear ranges (we do 6212 * not inject errors in those). Each error-injection 6213 * range can accept errors only for a single leaf vdev. 6214 * Error-injection ranges are separated by clear ranges. 
6215 * 6216 * For example, with 3 leaves, each chunk looks like: 6217 * 0 to 32M: injection range for leaf 0 6218 * 32M to 64M: clear range - no injection allowed 6219 * 64M to 96M: injection range for leaf 1 6220 * 96M to 128M: clear range - no injection allowed 6221 * 128M to 160M: injection range for leaf 2 6222 * 160M to 192M: clear range - no injection allowed 6223 * 6224 * Each clear range must be large enough such that a 6225 * single block cannot straddle it. This way a block 6226 * can't be a target in two different injection ranges 6227 * (on different leaf vdevs). 6228 */ 6229 offset = ztest_random(fsize / (leaves << bshift)) * 6230 (leaves << bshift) + (leaf << bshift) + 6231 (ztest_random(1ULL << (bshift - 1)) & -8ULL); 6232 6233 /* 6234 * Only allow damage to the labels at one end of the vdev. 6235 * 6236 * If all labels are damaged, the device will be totally 6237 * inaccessible, which will result in loss of data, 6238 * because we also damage (parts of) the other side of 6239 * the mirror/raidz. 6240 * 6241 * Additionally, we will always have both an even and an 6242 * odd label, so that we can handle crashes in the 6243 * middle of vdev_config_sync(). 6244 */ 6245 if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) 6246 continue; 6247 6248 /* 6249 * The two end labels are stored at the "end" of the disk, but 6250 * the end of the disk (vdev_psize) is aligned to 6251 * sizeof (vdev_label_t). 
6252 */ 6253 uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t)); 6254 if ((leaf & 1) == 1 && 6255 offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) 6256 continue; 6257 6258 mutex_enter(&ztest_vdev_lock); 6259 if (mirror_save != zs->zs_mirrors) { 6260 mutex_exit(&ztest_vdev_lock); 6261 (void) close(fd); 6262 goto out; 6263 } 6264 6265 if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) 6266 fatal(1, "can't inject bad word at 0x%llx in %s", 6267 offset, pathrand); 6268 6269 mutex_exit(&ztest_vdev_lock); 6270 6271 if (ztest_opts.zo_verbose >= 7) 6272 (void) printf("injected bad word into %s," 6273 " offset 0x%llx\n", pathrand, (u_longlong_t)offset); 6274 } 6275 6276 (void) close(fd); 6277out: 6278 umem_free(path0, MAXPATHLEN); 6279 umem_free(pathrand, MAXPATHLEN); 6280} 6281 6282/* 6283 * By design ztest will never inject uncorrectable damage in to the pool. 6284 * Issue a scrub, wait for it to complete, and verify there is never any 6285 * persistent damage. 6286 * 6287 * Only after a full scrub has been completed is it safe to start injecting 6288 * data corruption. See the comment in zfs_fault_inject(). 6289 */ 6290static int 6291ztest_scrub_impl(spa_t *spa) 6292{ 6293 int error = spa_scan(spa, POOL_SCAN_SCRUB); 6294 if (error) 6295 return (error); 6296 6297 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 6298 txg_wait_synced(spa_get_dsl(spa), 0); 6299 6300 if (spa_get_errlog_size(spa) > 0) 6301 return (ECKSUM); 6302 6303 ztest_pool_scrubbed = B_TRUE; 6304 6305 return (0); 6306} 6307 6308/* 6309 * Scrub the pool. 6310 */ 6311/* ARGSUSED */ 6312void 6313ztest_scrub(ztest_ds_t *zd, uint64_t id) 6314{ 6315 spa_t *spa = ztest_spa; 6316 int error; 6317 6318 /* 6319 * Scrub in progress by device removal. 6320 */ 6321 if (ztest_device_removal_active) 6322 return; 6323 6324 /* 6325 * Start a scrub, wait a moment, then force a restart. 
6326 */ 6327 (void) spa_scan(spa, POOL_SCAN_SCRUB); 6328 (void) poll(NULL, 0, 100); 6329 6330 error = ztest_scrub_impl(spa); 6331 if (error == EBUSY) 6332 error = 0; 6333 ASSERT0(error); 6334} 6335 6336/* 6337 * Change the guid for the pool. 6338 */ 6339/* ARGSUSED */ 6340void 6341ztest_reguid(ztest_ds_t *zd, uint64_t id) 6342{ 6343 spa_t *spa = ztest_spa; 6344 uint64_t orig, load; 6345 int error; 6346 6347 if (ztest_opts.zo_mmp_test) 6348 return; 6349 6350 orig = spa_guid(spa); 6351 load = spa_load_guid(spa); 6352 6353 (void) pthread_rwlock_wrlock(&ztest_name_lock); 6354 error = spa_change_guid(spa); 6355 (void) pthread_rwlock_unlock(&ztest_name_lock); 6356 6357 if (error != 0) 6358 return; 6359 6360 if (ztest_opts.zo_verbose >= 4) { 6361 (void) printf("Changed guid old %llu -> %llu\n", 6362 (u_longlong_t)orig, (u_longlong_t)spa_guid(spa)); 6363 } 6364 6365 VERIFY3U(orig, !=, spa_guid(spa)); 6366 VERIFY3U(load, ==, spa_load_guid(spa)); 6367} 6368 6369void 6370ztest_fletcher(ztest_ds_t *zd, uint64_t id) 6371{ 6372 hrtime_t end = gethrtime() + NANOSEC; 6373 6374 while (gethrtime() <= end) { 6375 int run_count = 100; 6376 void *buf; 6377 struct abd *abd_data, *abd_meta; 6378 uint32_t size; 6379 int *ptr; 6380 int i; 6381 zio_cksum_t zc_ref; 6382 zio_cksum_t zc_ref_byteswap; 6383 6384 size = ztest_random_blocksize(); 6385 6386 buf = umem_alloc(size, UMEM_NOFAIL); 6387 abd_data = abd_alloc(size, B_FALSE); 6388 abd_meta = abd_alloc(size, B_TRUE); 6389 6390 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6391 *ptr = ztest_random(UINT_MAX); 6392 6393 abd_copy_from_buf_off(abd_data, buf, 0, size); 6394 abd_copy_from_buf_off(abd_meta, buf, 0, size); 6395 6396 VERIFY0(fletcher_4_impl_set("scalar")); 6397 fletcher_4_native(buf, size, NULL, &zc_ref); 6398 fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap); 6399 6400 VERIFY0(fletcher_4_impl_set("cycle")); 6401 while (run_count-- > 0) { 6402 zio_cksum_t zc; 6403 zio_cksum_t zc_byteswap; 6404 6405 
fletcher_4_byteswap(buf, size, NULL, &zc_byteswap); 6406 fletcher_4_native(buf, size, NULL, &zc); 6407 6408 VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc))); 6409 VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap, 6410 sizeof (zc_byteswap))); 6411 6412 /* Test ABD - data */ 6413 abd_fletcher_4_byteswap(abd_data, size, NULL, 6414 &zc_byteswap); 6415 abd_fletcher_4_native(abd_data, size, NULL, &zc); 6416 6417 VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc))); 6418 VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap, 6419 sizeof (zc_byteswap))); 6420 6421 /* Test ABD - metadata */ 6422 abd_fletcher_4_byteswap(abd_meta, size, NULL, 6423 &zc_byteswap); 6424 abd_fletcher_4_native(abd_meta, size, NULL, &zc); 6425 6426 VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc))); 6427 VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap, 6428 sizeof (zc_byteswap))); 6429 6430 } 6431 6432 umem_free(buf, size); 6433 abd_free(abd_data); 6434 abd_free(abd_meta); 6435 } 6436} 6437 6438void 6439ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) 6440{ 6441 void *buf; 6442 size_t size; 6443 int *ptr; 6444 int i; 6445 zio_cksum_t zc_ref; 6446 zio_cksum_t zc_ref_bswap; 6447 6448 hrtime_t end = gethrtime() + NANOSEC; 6449 6450 while (gethrtime() <= end) { 6451 int run_count = 100; 6452 6453 size = ztest_random_blocksize(); 6454 buf = umem_alloc(size, UMEM_NOFAIL); 6455 6456 for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) 6457 *ptr = ztest_random(UINT_MAX); 6458 6459 VERIFY0(fletcher_4_impl_set("scalar")); 6460 fletcher_4_native(buf, size, NULL, &zc_ref); 6461 fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap); 6462 6463 VERIFY0(fletcher_4_impl_set("cycle")); 6464 6465 while (run_count-- > 0) { 6466 zio_cksum_t zc; 6467 zio_cksum_t zc_bswap; 6468 size_t pos = 0; 6469 6470 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 6471 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 6472 6473 while (pos < size) { 6474 size_t inc = 64 * ztest_random(size / 67); 6475 /* sometimes add few bytes to test non-simd */ 6476 if (ztest_random(100) < 10) 6477 inc += 
P2ALIGN(ztest_random(64), 6478 sizeof (uint32_t)); 6479 6480 if (inc > (size - pos)) 6481 inc = size - pos; 6482 6483 fletcher_4_incremental_native(buf + pos, inc, 6484 &zc); 6485 fletcher_4_incremental_byteswap(buf + pos, inc, 6486 &zc_bswap); 6487 6488 pos += inc; 6489 } 6490 6491 VERIFY3U(pos, ==, size); 6492 6493 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 6494 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 6495 6496 /* 6497 * verify if incremental on the whole buffer is 6498 * equivalent to non-incremental version 6499 */ 6500 ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); 6501 ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); 6502 6503 fletcher_4_incremental_native(buf, size, &zc); 6504 fletcher_4_incremental_byteswap(buf, size, &zc_bswap); 6505 6506 VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); 6507 VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); 6508 } 6509 6510 umem_free(buf, size); 6511 } 6512} 6513 6514static int 6515ztest_set_global_vars(void) 6516{ 6517 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 6518 char *kv = ztest_opts.zo_gvars[i]; 6519 VERIFY3U(strlen(kv), <=, ZO_GVARS_MAX_ARGLEN); 6520 VERIFY3U(strlen(kv), >, 0); 6521 int err = set_global_var(kv); 6522 if (ztest_opts.zo_verbose > 0) { 6523 (void) printf("setting global var %s ... %s\n", kv, 6524 err ? 
"failed" : "ok"); 6525 } 6526 if (err != 0) { 6527 (void) fprintf(stderr, 6528 "failed to set global var '%s'\n", kv); 6529 return (err); 6530 } 6531 } 6532 return (0); 6533} 6534 6535static char ** 6536ztest_global_vars_to_zdb_args(void) 6537{ 6538 char **args = calloc(2*ztest_opts.zo_gvars_count + 1, sizeof (char *)); 6539 char **cur = args; 6540 for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { 6541 char *kv = ztest_opts.zo_gvars[i]; 6542 *cur = "-o"; 6543 cur++; 6544 *cur = strdup(kv); 6545 cur++; 6546 } 6547 ASSERT3P(cur, ==, &args[2*ztest_opts.zo_gvars_count]); 6548 *cur = NULL; 6549 return (args); 6550} 6551 6552/* The end of strings is indicated by a NULL element */ 6553static char * 6554join_strings(char **strings, const char *sep) 6555{ 6556 size_t totallen = 0; 6557 for (char **sp = strings; *sp != NULL; sp++) { 6558 totallen += strlen(*sp); 6559 totallen += strlen(sep); 6560 } 6561 if (totallen > 0) { 6562 ASSERT(totallen >= strlen(sep)); 6563 totallen -= strlen(sep); 6564 } 6565 6566 size_t buflen = totallen + 1; 6567 char *o = malloc(buflen); /* trailing 0 byte */ 6568 o[0] = '\0'; 6569 for (char **sp = strings; *sp != NULL; sp++) { 6570 size_t would; 6571 would = strlcat(o, *sp, buflen); 6572 VERIFY3U(would, <, buflen); 6573 if (*(sp+1) == NULL) { 6574 break; 6575 } 6576 would = strlcat(o, sep, buflen); 6577 VERIFY3U(would, <, buflen); 6578 } 6579 ASSERT3S(strlen(o), ==, totallen); 6580 return (o); 6581} 6582 6583static int 6584ztest_check_path(char *path) 6585{ 6586 struct stat s; 6587 /* return true on success */ 6588 return (!stat(path, &s)); 6589} 6590 6591static void 6592ztest_get_zdb_bin(char *bin, int len) 6593{ 6594 char *zdb_path; 6595 /* 6596 * Try to use ZDB_PATH and in-tree zdb path. If not successful, just 6597 * let popen to search through PATH. 
6598 */ 6599 if ((zdb_path = getenv("ZDB_PATH"))) { 6600 strlcpy(bin, zdb_path, len); /* In env */ 6601 if (!ztest_check_path(bin)) { 6602 ztest_dump_core = 0; 6603 fatal(1, "invalid ZDB_PATH '%s'", bin); 6604 } 6605 return; 6606 } 6607 6608 VERIFY3P(realpath(getexecname(), bin), !=, NULL); 6609 if (strstr(bin, "/ztest/")) { 6610 strstr(bin, "/ztest/")[0] = '\0'; /* In-tree */ 6611 strcat(bin, "/zdb/zdb"); 6612 if (ztest_check_path(bin)) 6613 return; 6614 } 6615 strcpy(bin, "zdb"); 6616} 6617 6618static vdev_t * 6619ztest_random_concrete_vdev_leaf(vdev_t *vd) 6620{ 6621 if (vd == NULL) 6622 return (NULL); 6623 6624 if (vd->vdev_children == 0) 6625 return (vd); 6626 6627 vdev_t *eligible[vd->vdev_children]; 6628 int eligible_idx = 0, i; 6629 for (i = 0; i < vd->vdev_children; i++) { 6630 vdev_t *cvd = vd->vdev_child[i]; 6631 if (cvd->vdev_top->vdev_removing) 6632 continue; 6633 if (cvd->vdev_children > 0 || 6634 (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { 6635 eligible[eligible_idx++] = cvd; 6636 } 6637 } 6638 VERIFY3S(eligible_idx, >, 0); 6639 6640 uint64_t child_no = ztest_random(eligible_idx); 6641 return (ztest_random_concrete_vdev_leaf(eligible[child_no])); 6642} 6643 6644/* ARGSUSED */ 6645void 6646ztest_initialize(ztest_ds_t *zd, uint64_t id) 6647{ 6648 spa_t *spa = ztest_spa; 6649 int error = 0; 6650 6651 mutex_enter(&ztest_vdev_lock); 6652 6653 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 6654 6655 /* Random leaf vdev */ 6656 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 6657 if (rand_vd == NULL) { 6658 spa_config_exit(spa, SCL_VDEV, FTAG); 6659 mutex_exit(&ztest_vdev_lock); 6660 return; 6661 } 6662 6663 /* 6664 * The random vdev we've selected may change as soon as we 6665 * drop the spa_config_lock. We create local copies of things 6666 * we're interested in. 
6667 */ 6668 uint64_t guid = rand_vd->vdev_guid; 6669 char *path = strdup(rand_vd->vdev_path); 6670 boolean_t active = rand_vd->vdev_initialize_thread != NULL; 6671 6672 zfs_dbgmsg("vd %px, guid %llu", rand_vd, guid); 6673 spa_config_exit(spa, SCL_VDEV, FTAG); 6674 6675 uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); 6676 6677 nvlist_t *vdev_guids = fnvlist_alloc(); 6678 nvlist_t *vdev_errlist = fnvlist_alloc(); 6679 fnvlist_add_uint64(vdev_guids, path, guid); 6680 error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist); 6681 fnvlist_free(vdev_guids); 6682 fnvlist_free(vdev_errlist); 6683 6684 switch (cmd) { 6685 case POOL_INITIALIZE_CANCEL: 6686 if (ztest_opts.zo_verbose >= 4) { 6687 (void) printf("Cancel initialize %s", path); 6688 if (!active) 6689 (void) printf(" failed (no initialize active)"); 6690 (void) printf("\n"); 6691 } 6692 break; 6693 case POOL_INITIALIZE_START: 6694 if (ztest_opts.zo_verbose >= 4) { 6695 (void) printf("Start initialize %s", path); 6696 if (active && error == 0) 6697 (void) printf(" failed (already active)"); 6698 else if (error != 0) 6699 (void) printf(" failed (error %d)", error); 6700 (void) printf("\n"); 6701 } 6702 break; 6703 case POOL_INITIALIZE_SUSPEND: 6704 if (ztest_opts.zo_verbose >= 4) { 6705 (void) printf("Suspend initialize %s", path); 6706 if (!active) 6707 (void) printf(" failed (no initialize active)"); 6708 (void) printf("\n"); 6709 } 6710 break; 6711 } 6712 free(path); 6713 mutex_exit(&ztest_vdev_lock); 6714} 6715 6716/* ARGSUSED */ 6717void 6718ztest_trim(ztest_ds_t *zd, uint64_t id) 6719{ 6720 spa_t *spa = ztest_spa; 6721 int error = 0; 6722 6723 mutex_enter(&ztest_vdev_lock); 6724 6725 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 6726 6727 /* Random leaf vdev */ 6728 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 6729 if (rand_vd == NULL) { 6730 spa_config_exit(spa, SCL_VDEV, FTAG); 6731 mutex_exit(&ztest_vdev_lock); 6732 return; 6733 } 6734 6735 /* 6736 * The random vdev 
we've selected may change as soon as we 6737 * drop the spa_config_lock. We create local copies of things 6738 * we're interested in. 6739 */ 6740 uint64_t guid = rand_vd->vdev_guid; 6741 char *path = strdup(rand_vd->vdev_path); 6742 boolean_t active = rand_vd->vdev_trim_thread != NULL; 6743 6744 zfs_dbgmsg("vd %p, guid %llu", rand_vd, guid); 6745 spa_config_exit(spa, SCL_VDEV, FTAG); 6746 6747 uint64_t cmd = ztest_random(POOL_TRIM_FUNCS); 6748 uint64_t rate = 1 << ztest_random(30); 6749 boolean_t partial = (ztest_random(5) > 0); 6750 boolean_t secure = (ztest_random(5) > 0); 6751 6752 nvlist_t *vdev_guids = fnvlist_alloc(); 6753 nvlist_t *vdev_errlist = fnvlist_alloc(); 6754 fnvlist_add_uint64(vdev_guids, path, guid); 6755 error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial, 6756 secure, vdev_errlist); 6757 fnvlist_free(vdev_guids); 6758 fnvlist_free(vdev_errlist); 6759 6760 switch (cmd) { 6761 case POOL_TRIM_CANCEL: 6762 if (ztest_opts.zo_verbose >= 4) { 6763 (void) printf("Cancel TRIM %s", path); 6764 if (!active) 6765 (void) printf(" failed (no TRIM active)"); 6766 (void) printf("\n"); 6767 } 6768 break; 6769 case POOL_TRIM_START: 6770 if (ztest_opts.zo_verbose >= 4) { 6771 (void) printf("Start TRIM %s", path); 6772 if (active && error == 0) 6773 (void) printf(" failed (already active)"); 6774 else if (error != 0) 6775 (void) printf(" failed (error %d)", error); 6776 (void) printf("\n"); 6777 } 6778 break; 6779 case POOL_TRIM_SUSPEND: 6780 if (ztest_opts.zo_verbose >= 4) { 6781 (void) printf("Suspend TRIM %s", path); 6782 if (!active) 6783 (void) printf(" failed (no TRIM active)"); 6784 (void) printf("\n"); 6785 } 6786 break; 6787 } 6788 free(path); 6789 mutex_exit(&ztest_vdev_lock); 6790} 6791 6792/* 6793 * Verify pool integrity by running zdb. 
6794 */ 6795static void 6796ztest_run_zdb(char *pool) 6797{ 6798 int status; 6799 char *bin; 6800 char *zdb; 6801 char *zbuf; 6802 const int len = MAXPATHLEN + MAXNAMELEN + 20; 6803 FILE *fp; 6804 6805 bin = umem_alloc(len, UMEM_NOFAIL); 6806 zdb = umem_alloc(len, UMEM_NOFAIL); 6807 zbuf = umem_alloc(1024, UMEM_NOFAIL); 6808 6809 ztest_get_zdb_bin(bin, len); 6810 6811 char **set_gvars_args = ztest_global_vars_to_zdb_args(); 6812 char *set_gvars_args_joined = join_strings(set_gvars_args, " "); 6813 free(set_gvars_args); 6814 6815 size_t would = snprintf(zdb, len, 6816 "%s -bcc%s%s -G -d -Y -e -y %s -p %s %s", 6817 bin, 6818 ztest_opts.zo_verbose >= 3 ? "s" : "", 6819 ztest_opts.zo_verbose >= 4 ? "v" : "", 6820 set_gvars_args_joined, 6821 ztest_opts.zo_dir, 6822 pool); 6823 ASSERT3U(would, <, len); 6824 6825 free(set_gvars_args_joined); 6826 6827 if (ztest_opts.zo_verbose >= 5) 6828 (void) printf("Executing %s\n", strstr(zdb, "zdb ")); 6829 6830 fp = popen(zdb, "r"); 6831 6832 while (fgets(zbuf, 1024, fp) != NULL) 6833 if (ztest_opts.zo_verbose >= 3) 6834 (void) printf("%s", zbuf); 6835 6836 status = pclose(fp); 6837 6838 if (status == 0) 6839 goto out; 6840 6841 ztest_dump_core = 0; 6842 if (WIFEXITED(status)) 6843 fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status)); 6844 else 6845 fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status)); 6846out: 6847 umem_free(bin, len); 6848 umem_free(zdb, len); 6849 umem_free(zbuf, 1024); 6850} 6851 6852static void 6853ztest_walk_pool_directory(char *header) 6854{ 6855 spa_t *spa = NULL; 6856 6857 if (ztest_opts.zo_verbose >= 6) 6858 (void) printf("%s\n", header); 6859 6860 mutex_enter(&spa_namespace_lock); 6861 while ((spa = spa_next(spa)) != NULL) 6862 if (ztest_opts.zo_verbose >= 6) 6863 (void) printf("\t%s\n", spa_name(spa)); 6864 mutex_exit(&spa_namespace_lock); 6865} 6866 6867static void 6868ztest_spa_import_export(char *oldname, char *newname) 6869{ 6870 nvlist_t *config, *newconfig; 6871 uint64_t pool_guid; 6872 
spa_t *spa; 6873 int error; 6874 6875 if (ztest_opts.zo_verbose >= 4) { 6876 (void) printf("import/export: old = %s, new = %s\n", 6877 oldname, newname); 6878 } 6879 6880 /* 6881 * Clean up from previous runs. 6882 */ 6883 (void) spa_destroy(newname); 6884 6885 /* 6886 * Get the pool's configuration and guid. 6887 */ 6888 VERIFY0(spa_open(oldname, &spa, FTAG)); 6889 6890 /* 6891 * Kick off a scrub to tickle scrub/export races. 6892 */ 6893 if (ztest_random(2) == 0) 6894 (void) spa_scan(spa, POOL_SCAN_SCRUB); 6895 6896 pool_guid = spa_guid(spa); 6897 spa_close(spa, FTAG); 6898 6899 ztest_walk_pool_directory("pools before export"); 6900 6901 /* 6902 * Export it. 6903 */ 6904 VERIFY0(spa_export(oldname, &config, B_FALSE, B_FALSE)); 6905 6906 ztest_walk_pool_directory("pools after export"); 6907 6908 /* 6909 * Try to import it. 6910 */ 6911 newconfig = spa_tryimport(config); 6912 ASSERT3P(newconfig, !=, NULL); 6913 fnvlist_free(newconfig); 6914 6915 /* 6916 * Import it under the new name. 6917 */ 6918 error = spa_import(newname, config, NULL, 0); 6919 if (error != 0) { 6920 dump_nvlist(config, 0); 6921 fatal(B_FALSE, "couldn't import pool %s as %s: error %u", 6922 oldname, newname, error); 6923 } 6924 6925 ztest_walk_pool_directory("pools after import"); 6926 6927 /* 6928 * Try to import it again -- should fail with EEXIST. 6929 */ 6930 VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); 6931 6932 /* 6933 * Try to import it under a different name -- should fail with EEXIST. 6934 */ 6935 VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); 6936 6937 /* 6938 * Verify that the pool is no longer visible under the old name. 6939 */ 6940 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); 6941 6942 /* 6943 * Verify that we can open and close the pool using the new name. 
6944 */ 6945 VERIFY0(spa_open(newname, &spa, FTAG)); 6946 ASSERT3U(pool_guid, ==, spa_guid(spa)); 6947 spa_close(spa, FTAG); 6948 6949 fnvlist_free(config); 6950} 6951 6952static void 6953ztest_resume(spa_t *spa) 6954{ 6955 if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) 6956 (void) printf("resuming from suspended state\n"); 6957 spa_vdev_state_enter(spa, SCL_NONE); 6958 vdev_clear(spa, NULL); 6959 (void) spa_vdev_state_exit(spa, NULL, 0); 6960 (void) zio_resume(spa); 6961} 6962 6963static void 6964ztest_resume_thread(void *arg) 6965{ 6966 spa_t *spa = arg; 6967 6968 while (!ztest_exiting) { 6969 if (spa_suspended(spa)) 6970 ztest_resume(spa); 6971 (void) poll(NULL, 0, 100); 6972 6973 /* 6974 * Periodically change the zfs_compressed_arc_enabled setting. 6975 */ 6976 if (ztest_random(10) == 0) 6977 zfs_compressed_arc_enabled = ztest_random(2); 6978 6979 /* 6980 * Periodically change the zfs_abd_scatter_enabled setting. 6981 */ 6982 if (ztest_random(10) == 0) 6983 zfs_abd_scatter_enabled = ztest_random(2); 6984 } 6985 6986 thread_exit(); 6987} 6988 6989static void 6990ztest_deadman_thread(void *arg) 6991{ 6992 ztest_shared_t *zs = arg; 6993 spa_t *spa = ztest_spa; 6994 hrtime_t delay, overdue, last_run = gethrtime(); 6995 6996 delay = (zs->zs_thread_stop - zs->zs_thread_start) + 6997 MSEC2NSEC(zfs_deadman_synctime_ms); 6998 6999 while (!ztest_exiting) { 7000 /* 7001 * Wait for the delay timer while checking occasionally 7002 * if we should stop. 7003 */ 7004 if (gethrtime() < last_run + delay) { 7005 (void) poll(NULL, 0, 1000); 7006 continue; 7007 } 7008 7009 /* 7010 * If the pool is suspended then fail immediately. Otherwise, 7011 * check to see if the pool is making any progress. If 7012 * vdev_deadman() discovers that there hasn't been any recent 7013 * I/Os then it will end up aborting the tests. 
7014 */ 7015 if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { 7016 fatal(0, "aborting test after %llu seconds because " 7017 "pool has transitioned to a suspended state.", 7018 zfs_deadman_synctime_ms / 1000); 7019 } 7020 vdev_deadman(spa->spa_root_vdev, FTAG); 7021 7022 /* 7023 * If the process doesn't complete within a grace period of 7024 * zfs_deadman_synctime_ms over the expected finish time, 7025 * then it may be hung and is terminated. 7026 */ 7027 overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms); 7028 if (gethrtime() > overdue) { 7029 fatal(0, "aborting test after %llu seconds because " 7030 "the process is overdue for termination.", 7031 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7032 } 7033 7034 (void) printf("ztest has been running for %lld seconds\n", 7035 (gethrtime() - zs->zs_proc_start) / NANOSEC); 7036 7037 last_run = gethrtime(); 7038 delay = MSEC2NSEC(zfs_deadman_checktime_ms); 7039 } 7040 7041 thread_exit(); 7042} 7043 7044static void 7045ztest_execute(int test, ztest_info_t *zi, uint64_t id) 7046{ 7047 ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; 7048 ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); 7049 hrtime_t functime = gethrtime(); 7050 int i; 7051 7052 for (i = 0; i < zi->zi_iters; i++) 7053 zi->zi_func(zd, id); 7054 7055 functime = gethrtime() - functime; 7056 7057 atomic_add_64(&zc->zc_count, 1); 7058 atomic_add_64(&zc->zc_time, functime); 7059 7060 if (ztest_opts.zo_verbose >= 4) 7061 (void) printf("%6.2f sec in %s\n", 7062 (double)functime / NANOSEC, zi->zi_funcname); 7063} 7064 7065static void 7066ztest_thread(void *arg) 7067{ 7068 int rand; 7069 uint64_t id = (uintptr_t)arg; 7070 ztest_shared_t *zs = ztest_shared; 7071 uint64_t call_next; 7072 hrtime_t now; 7073 ztest_info_t *zi; 7074 ztest_shared_callstate_t *zc; 7075 7076 while ((now = gethrtime()) < zs->zs_thread_stop) { 7077 /* 7078 * See if it's time to force a crash. 
7079 */ 7080 if (now > zs->zs_thread_kill) 7081 ztest_kill(zs); 7082 7083 /* 7084 * If we're getting ENOSPC with some regularity, stop. 7085 */ 7086 if (zs->zs_enospc_count > 10) 7087 break; 7088 7089 /* 7090 * Pick a random function to execute. 7091 */ 7092 rand = ztest_random(ZTEST_FUNCS); 7093 zi = &ztest_info[rand]; 7094 zc = ZTEST_GET_SHARED_CALLSTATE(rand); 7095 call_next = zc->zc_next; 7096 7097 if (now >= call_next && 7098 atomic_cas_64(&zc->zc_next, call_next, call_next + 7099 ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { 7100 ztest_execute(rand, zi, id); 7101 } 7102 } 7103 7104 thread_exit(); 7105} 7106 7107static void 7108ztest_dataset_name(char *dsname, char *pool, int d) 7109{ 7110 (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); 7111} 7112 7113static void 7114ztest_dataset_destroy(int d) 7115{ 7116 char name[ZFS_MAX_DATASET_NAME_LEN]; 7117 int t; 7118 7119 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7120 7121 if (ztest_opts.zo_verbose >= 3) 7122 (void) printf("Destroying %s to free up space\n", name); 7123 7124 /* 7125 * Cleanup any non-standard clones and snapshots. In general, 7126 * ztest thread t operates on dataset (t % zopt_datasets), 7127 * so there may be more than one thing to clean up. 7128 */ 7129 for (t = d; t < ztest_opts.zo_threads; 7130 t += ztest_opts.zo_datasets) 7131 ztest_dsl_dataset_cleanup(name, t); 7132 7133 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 7134 DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); 7135} 7136 7137static void 7138ztest_dataset_dirobj_verify(ztest_ds_t *zd) 7139{ 7140 uint64_t usedobjs, dirobjs, scratch; 7141 7142 /* 7143 * ZTEST_DIROBJ is the object directory for the entire dataset. 7144 * Therefore, the number of objects in use should equal the 7145 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. 7146 * If not, we have an object leak. 
7147 * 7148 * Note that we can only check this in ztest_dataset_open(), 7149 * when the open-context and syncing-context values agree. 7150 * That's because zap_count() returns the open-context value, 7151 * while dmu_objset_space() returns the rootbp fill count. 7152 */ 7153 VERIFY0(zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); 7154 dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); 7155 ASSERT3U(dirobjs + 1, ==, usedobjs); 7156} 7157 7158static int 7159ztest_dataset_open(int d) 7160{ 7161 ztest_ds_t *zd = &ztest_ds[d]; 7162 uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; 7163 objset_t *os; 7164 zilog_t *zilog; 7165 char name[ZFS_MAX_DATASET_NAME_LEN]; 7166 int error; 7167 7168 ztest_dataset_name(name, ztest_opts.zo_pool, d); 7169 7170 (void) pthread_rwlock_rdlock(&ztest_name_lock); 7171 7172 error = ztest_dataset_create(name); 7173 if (error == ENOSPC) { 7174 (void) pthread_rwlock_unlock(&ztest_name_lock); 7175 ztest_record_enospc(FTAG); 7176 return (error); 7177 } 7178 ASSERT(error == 0 || error == EEXIST); 7179 7180 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, 7181 B_TRUE, zd, &os)); 7182 (void) pthread_rwlock_unlock(&ztest_name_lock); 7183 7184 ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); 7185 7186 zilog = zd->zd_zilog; 7187 7188 if (zilog->zl_header->zh_claim_lr_seq != 0 && 7189 zilog->zl_header->zh_claim_lr_seq < committed_seq) 7190 fatal(0, "missing log records: claimed %llu < committed %llu", 7191 zilog->zl_header->zh_claim_lr_seq, committed_seq); 7192 7193 ztest_dataset_dirobj_verify(zd); 7194 7195 zil_replay(os, zd, ztest_replay_vector); 7196 7197 ztest_dataset_dirobj_verify(zd); 7198 7199 if (ztest_opts.zo_verbose >= 6) 7200 (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", 7201 zd->zd_name, 7202 (u_longlong_t)zilog->zl_parse_blk_count, 7203 (u_longlong_t)zilog->zl_parse_lr_count, 7204 (u_longlong_t)zilog->zl_replaying_seq); 7205 7206 zilog = zil_open(os, ztest_get_data); 7207 7208 if 
(zilog->zl_replaying_seq != 0 && 7209 zilog->zl_replaying_seq < committed_seq) 7210 fatal(0, "missing log records: replayed %llu < committed %llu", 7211 zilog->zl_replaying_seq, committed_seq); 7212 7213 return (0); 7214} 7215 7216static void 7217ztest_dataset_close(int d) 7218{ 7219 ztest_ds_t *zd = &ztest_ds[d]; 7220 7221 zil_close(zd->zd_zilog); 7222 dmu_objset_disown(zd->zd_os, B_TRUE, zd); 7223 7224 ztest_zd_fini(zd); 7225} 7226 7227/* ARGSUSED */ 7228static int 7229ztest_replay_zil_cb(const char *name, void *arg) 7230{ 7231 objset_t *os; 7232 ztest_ds_t *zdtmp; 7233 7234 VERIFY0(ztest_dmu_objset_own(name, DMU_OST_ANY, B_TRUE, 7235 B_TRUE, FTAG, &os)); 7236 7237 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); 7238 7239 ztest_zd_init(zdtmp, NULL, os); 7240 zil_replay(os, zdtmp, ztest_replay_vector); 7241 ztest_zd_fini(zdtmp); 7242 7243 if (dmu_objset_zil(os)->zl_parse_lr_count != 0 && 7244 ztest_opts.zo_verbose >= 6) { 7245 zilog_t *zilog = dmu_objset_zil(os); 7246 7247 (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", 7248 name, 7249 (u_longlong_t)zilog->zl_parse_blk_count, 7250 (u_longlong_t)zilog->zl_parse_lr_count, 7251 (u_longlong_t)zilog->zl_replaying_seq); 7252 } 7253 7254 umem_free(zdtmp, sizeof (ztest_ds_t)); 7255 7256 dmu_objset_disown(os, B_TRUE, FTAG); 7257 return (0); 7258} 7259 7260static void 7261ztest_freeze(void) 7262{ 7263 ztest_ds_t *zd = &ztest_ds[0]; 7264 spa_t *spa; 7265 int numloops = 0; 7266 7267 if (ztest_opts.zo_verbose >= 3) 7268 (void) printf("testing spa_freeze()...\n"); 7269 7270 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7271 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7272 VERIFY0(ztest_dataset_open(0)); 7273 ztest_spa = spa; 7274 7275 /* 7276 * Force the first log block to be transactionally allocated. 7277 * We have to do this before we freeze the pool -- otherwise 7278 * the log chain won't be anchored. 
7279 */ 7280 while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { 7281 ztest_dmu_object_alloc_free(zd, 0); 7282 zil_commit(zd->zd_zilog, 0); 7283 } 7284 7285 txg_wait_synced(spa_get_dsl(spa), 0); 7286 7287 /* 7288 * Freeze the pool. This stops spa_sync() from doing anything, 7289 * so that the only way to record changes from now on is the ZIL. 7290 */ 7291 spa_freeze(spa); 7292 7293 /* 7294 * Because it is hard to predict how much space a write will actually 7295 * require beforehand, we leave ourselves some fudge space to write over 7296 * capacity. 7297 */ 7298 uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; 7299 7300 /* 7301 * Run tests that generate log records but don't alter the pool config 7302 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). 7303 * We do a txg_wait_synced() after each iteration to force the txg 7304 * to increase well beyond the last synced value in the uberblock. 7305 * The ZIL should be OK with that. 7306 * 7307 * Run a random number of times less than zo_maxloops and ensure we do 7308 * not run out of space on the pool. 7309 */ 7310 while (ztest_random(10) != 0 && 7311 numloops++ < ztest_opts.zo_maxloops && 7312 metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { 7313 ztest_od_t od; 7314 ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 7315 VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); 7316 ztest_io(zd, od.od_object, 7317 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 7318 txg_wait_synced(spa_get_dsl(spa), 0); 7319 } 7320 7321 /* 7322 * Commit all of the changes we just generated. 7323 */ 7324 zil_commit(zd->zd_zilog, 0); 7325 txg_wait_synced(spa_get_dsl(spa), 0); 7326 7327 /* 7328 * Close our dataset and close the pool. 7329 */ 7330 ztest_dataset_close(0); 7331 spa_close(spa, FTAG); 7332 kernel_fini(); 7333 7334 /* 7335 * Open and close the pool and dataset to induce log replay. 
7336 */ 7337 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7338 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7339 ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); 7340 VERIFY0(ztest_dataset_open(0)); 7341 ztest_spa = spa; 7342 txg_wait_synced(spa_get_dsl(spa), 0); 7343 ztest_dataset_close(0); 7344 ztest_reguid(NULL, 0); 7345 7346 spa_close(spa, FTAG); 7347 kernel_fini(); 7348} 7349 7350static void 7351ztest_import_impl(ztest_shared_t *zs) 7352{ 7353 importargs_t args = { 0 }; 7354 nvlist_t *cfg = NULL; 7355 int nsearch = 1; 7356 char *searchdirs[nsearch]; 7357 int flags = ZFS_IMPORT_MISSING_LOG; 7358 7359 searchdirs[0] = ztest_opts.zo_dir; 7360 args.paths = nsearch; 7361 args.path = searchdirs; 7362 args.can_be_active = B_FALSE; 7363 7364 VERIFY0(zpool_find_config(NULL, ztest_opts.zo_pool, &cfg, &args, 7365 &libzpool_config_ops)); 7366 VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags)); 7367 fnvlist_free(cfg); 7368} 7369 7370/* 7371 * Import a storage pool with the given name. 7372 */ 7373static void 7374ztest_import(ztest_shared_t *zs) 7375{ 7376 spa_t *spa; 7377 7378 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 7379 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 7380 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 7381 7382 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7383 7384 ztest_import_impl(zs); 7385 7386 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7387 zs->zs_metaslab_sz = 7388 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 7389 spa_close(spa, FTAG); 7390 7391 kernel_fini(); 7392 7393 if (!ztest_opts.zo_mmp_test) { 7394 ztest_run_zdb(ztest_opts.zo_pool); 7395 ztest_freeze(); 7396 ztest_run_zdb(ztest_opts.zo_pool); 7397 } 7398 7399 (void) pthread_rwlock_destroy(&ztest_name_lock); 7400 mutex_destroy(&ztest_vdev_lock); 7401 mutex_destroy(&ztest_checkpoint_lock); 7402} 7403 7404/* 7405 * Kick off threads to run tests on all datasets in parallel. 
7406 */ 7407static void 7408ztest_run(ztest_shared_t *zs) 7409{ 7410 spa_t *spa; 7411 objset_t *os; 7412 kthread_t *resume_thread, *deadman_thread; 7413 kthread_t **run_threads; 7414 uint64_t object; 7415 int error; 7416 int t, d; 7417 7418 ztest_exiting = B_FALSE; 7419 7420 /* 7421 * Initialize parent/child shared state. 7422 */ 7423 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 7424 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 7425 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 7426 7427 zs->zs_thread_start = gethrtime(); 7428 zs->zs_thread_stop = 7429 zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; 7430 zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); 7431 zs->zs_thread_kill = zs->zs_thread_stop; 7432 if (ztest_random(100) < ztest_opts.zo_killrate) { 7433 zs->zs_thread_kill -= 7434 ztest_random(ztest_opts.zo_passtime * NANOSEC); 7435 } 7436 7437 mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL); 7438 7439 list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), 7440 offsetof(ztest_cb_data_t, zcd_node)); 7441 7442 /* 7443 * Open our pool. It may need to be imported first depending on 7444 * what tests were running when the previous pass was terminated. 
7445 */ 7446 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7447 error = spa_open(ztest_opts.zo_pool, &spa, FTAG); 7448 if (error) { 7449 VERIFY3S(error, ==, ENOENT); 7450 ztest_import_impl(zs); 7451 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7452 zs->zs_metaslab_sz = 7453 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 7454 } 7455 7456 metaslab_preload_limit = ztest_random(20) + 1; 7457 ztest_spa = spa; 7458 7459 VERIFY0(vdev_raidz_impl_set("cycle")); 7460 7461 dmu_objset_stats_t dds; 7462 VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, 7463 DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); 7464 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 7465 dmu_objset_fast_stat(os, &dds); 7466 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 7467 zs->zs_guid = dds.dds_guid; 7468 dmu_objset_disown(os, B_TRUE, FTAG); 7469 7470 /* 7471 * Create a thread to periodically resume suspended I/O. 7472 */ 7473 resume_thread = thread_create(NULL, 0, ztest_resume_thread, 7474 spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 7475 7476 /* 7477 * Create a deadman thread and set to panic if we hang. 7478 */ 7479 deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, 7480 zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); 7481 7482 spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; 7483 7484 /* 7485 * Verify that we can safely inquire about any object, 7486 * whether it's allocated or not. To make it interesting, 7487 * we probe a 5-wide window around each power of two. 7488 * This hits all edge cases, including zero and the max. 7489 */ 7490 for (t = 0; t < 64; t++) { 7491 for (d = -5; d <= 5; d++) { 7492 error = dmu_object_info(spa->spa_meta_objset, 7493 (1ULL << t) + d, NULL); 7494 ASSERT(error == 0 || error == ENOENT || 7495 error == EINVAL); 7496 } 7497 } 7498 7499 /* 7500 * If we got any ENOSPC errors on the previous run, destroy something. 
7501 */ 7502 if (zs->zs_enospc_count != 0) { 7503 int d = ztest_random(ztest_opts.zo_datasets); 7504 ztest_dataset_destroy(d); 7505 } 7506 zs->zs_enospc_count = 0; 7507 7508 /* 7509 * If we were in the middle of ztest_device_removal() and were killed 7510 * we need to ensure the removal and scrub complete before running 7511 * any tests that check ztest_device_removal_active. The removal will 7512 * be restarted automatically when the spa is opened, but we need to 7513 * initiate the scrub manually if it is not already in progress. Note 7514 * that we always run the scrub whenever an indirect vdev exists 7515 * because we have no way of knowing for sure if ztest_device_removal() 7516 * fully completed its scrub before the pool was reimported. 7517 */ 7518 if (spa->spa_removing_phys.sr_state == DSS_SCANNING || 7519 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 7520 while (spa->spa_removing_phys.sr_state == DSS_SCANNING) 7521 txg_wait_synced(spa_get_dsl(spa), 0); 7522 7523 error = ztest_scrub_impl(spa); 7524 if (error == EBUSY) 7525 error = 0; 7526 ASSERT0(error); 7527 } 7528 7529 run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), 7530 UMEM_NOFAIL); 7531 7532 if (ztest_opts.zo_verbose >= 4) 7533 (void) printf("starting main threads...\n"); 7534 7535 /* 7536 * Replay all logs of all datasets in the pool. This is primarily for 7537 * temporary datasets which wouldn't otherwise get replayed, which 7538 * can trigger failures when attempting to offline a SLOG in 7539 * ztest_fault_inject(). 7540 */ 7541 (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, 7542 NULL, DS_FIND_CHILDREN); 7543 7544 /* 7545 * Kick off all the tests that run in parallel. 
7546 */ 7547 for (t = 0; t < ztest_opts.zo_threads; t++) { 7548 if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { 7549 umem_free(run_threads, ztest_opts.zo_threads * 7550 sizeof (kthread_t *)); 7551 return; 7552 } 7553 7554 run_threads[t] = thread_create(NULL, 0, ztest_thread, 7555 (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, 7556 defclsyspri); 7557 } 7558 7559 /* 7560 * Wait for all of the tests to complete. 7561 */ 7562 for (t = 0; t < ztest_opts.zo_threads; t++) 7563 VERIFY0(thread_join(run_threads[t])); 7564 7565 /* 7566 * Close all datasets. This must be done after all the threads 7567 * are joined so we can be sure none of the datasets are in-use 7568 * by any of the threads. 7569 */ 7570 for (t = 0; t < ztest_opts.zo_threads; t++) { 7571 if (t < ztest_opts.zo_datasets) 7572 ztest_dataset_close(t); 7573 } 7574 7575 txg_wait_synced(spa_get_dsl(spa), 0); 7576 7577 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 7578 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 7579 7580 umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); 7581 7582 /* Kill the resume and deadman threads */ 7583 ztest_exiting = B_TRUE; 7584 VERIFY0(thread_join(resume_thread)); 7585 VERIFY0(thread_join(deadman_thread)); 7586 ztest_resume(spa); 7587 7588 /* 7589 * Right before closing the pool, kick off a bunch of async I/O; 7590 * spa_close() should wait for it to complete. 7591 */ 7592 for (object = 1; object < 50; object++) { 7593 dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, 7594 ZIO_PRIORITY_SYNC_READ); 7595 } 7596 7597 /* Verify that at least one commit cb was called in a timely fashion */ 7598 if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) 7599 VERIFY0(zc_min_txg_delay); 7600 7601 spa_close(spa, FTAG); 7602 7603 /* 7604 * Verify that we can loop over all pools. 
7605 */ 7606 mutex_enter(&spa_namespace_lock); 7607 for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) 7608 if (ztest_opts.zo_verbose > 3) 7609 (void) printf("spa_next: found %s\n", spa_name(spa)); 7610 mutex_exit(&spa_namespace_lock); 7611 7612 /* 7613 * Verify that we can export the pool and reimport it under a 7614 * different name. 7615 */ 7616 if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { 7617 char name[ZFS_MAX_DATASET_NAME_LEN]; 7618 (void) snprintf(name, sizeof (name), "%s_import", 7619 ztest_opts.zo_pool); 7620 ztest_spa_import_export(ztest_opts.zo_pool, name); 7621 ztest_spa_import_export(name, ztest_opts.zo_pool); 7622 } 7623 7624 kernel_fini(); 7625 7626 list_destroy(&zcl.zcl_callbacks); 7627 mutex_destroy(&zcl.zcl_callbacks_lock); 7628 (void) pthread_rwlock_destroy(&ztest_name_lock); 7629 mutex_destroy(&ztest_vdev_lock); 7630 mutex_destroy(&ztest_checkpoint_lock); 7631} 7632 7633static void 7634print_time(hrtime_t t, char *timebuf) 7635{ 7636 hrtime_t s = t / NANOSEC; 7637 hrtime_t m = s / 60; 7638 hrtime_t h = m / 60; 7639 hrtime_t d = h / 24; 7640 7641 s -= m * 60; 7642 m -= h * 60; 7643 h -= d * 24; 7644 7645 timebuf[0] = '\0'; 7646 7647 if (d) 7648 (void) sprintf(timebuf, 7649 "%llud%02lluh%02llum%02llus", d, h, m, s); 7650 else if (h) 7651 (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); 7652 else if (m) 7653 (void) sprintf(timebuf, "%llum%02llus", m, s); 7654 else 7655 (void) sprintf(timebuf, "%llus", s); 7656} 7657 7658static nvlist_t * 7659make_random_props(void) 7660{ 7661 nvlist_t *props; 7662 7663 props = fnvlist_alloc(); 7664 7665 if (ztest_random(2) == 0) 7666 return (props); 7667 7668 fnvlist_add_uint64(props, 7669 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1); 7670 7671 return (props); 7672} 7673 7674/* 7675 * Create a storage pool with the given name and initial vdev size. 7676 * Then test spa_freeze() functionality. 
7677 */ 7678static void 7679ztest_init(ztest_shared_t *zs) 7680{ 7681 spa_t *spa; 7682 nvlist_t *nvroot, *props; 7683 int i; 7684 7685 mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 7686 mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); 7687 VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); 7688 7689 kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); 7690 7691 /* 7692 * Create the storage pool. 7693 */ 7694 (void) spa_destroy(ztest_opts.zo_pool); 7695 ztest_shared->zs_vdev_next_leaf = 0; 7696 zs->zs_splits = 0; 7697 zs->zs_mirrors = ztest_opts.zo_mirrors; 7698 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 7699 NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); 7700 props = make_random_props(); 7701 7702 /* 7703 * We don't expect the pool to suspend unless maxfaults == 0, 7704 * in which case ztest_fault_inject() temporarily takes away 7705 * the only valid replica. 7706 */ 7707 fnvlist_add_uint64(props, 7708 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 7709 MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT); 7710 7711 for (i = 0; i < SPA_FEATURES; i++) { 7712 char *buf; 7713 7714 if (!spa_feature_table[i].fi_zfs_mod_supported) 7715 continue; 7716 7717 /* 7718 * 75% chance of using the log space map feature. We want ztest 7719 * to exercise both the code paths that use the log space map 7720 * feature and the ones that don't. 
7721 */ 7722 if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) 7723 continue; 7724 7725 VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", 7726 spa_feature_table[i].fi_uname)); 7727 fnvlist_add_uint64(props, buf, 0); 7728 free(buf); 7729 } 7730 7731 VERIFY0(spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL)); 7732 fnvlist_free(nvroot); 7733 fnvlist_free(props); 7734 7735 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 7736 zs->zs_metaslab_sz = 7737 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 7738 spa_close(spa, FTAG); 7739 7740 kernel_fini(); 7741 7742 if (!ztest_opts.zo_mmp_test) { 7743 ztest_run_zdb(ztest_opts.zo_pool); 7744 ztest_freeze(); 7745 ztest_run_zdb(ztest_opts.zo_pool); 7746 } 7747 7748 (void) pthread_rwlock_destroy(&ztest_name_lock); 7749 mutex_destroy(&ztest_vdev_lock); 7750 mutex_destroy(&ztest_checkpoint_lock); 7751} 7752 7753static void 7754setup_data_fd(void) 7755{ 7756 static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; 7757 7758 ztest_fd_data = mkstemp(ztest_name_data); 7759 ASSERT3S(ztest_fd_data, >=, 0); 7760 (void) unlink(ztest_name_data); 7761} 7762 7763static int 7764shared_data_size(ztest_shared_hdr_t *hdr) 7765{ 7766 int size; 7767 7768 size = hdr->zh_hdr_size; 7769 size += hdr->zh_opts_size; 7770 size += hdr->zh_size; 7771 size += hdr->zh_stats_size * hdr->zh_stats_count; 7772 size += hdr->zh_ds_size * hdr->zh_ds_count; 7773 7774 return (size); 7775} 7776 7777static void 7778setup_hdr(void) 7779{ 7780 int size; 7781 ztest_shared_hdr_t *hdr; 7782 7783 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 7784 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 7785 ASSERT3P(hdr, !=, MAP_FAILED); 7786 7787 VERIFY0(ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); 7788 7789 hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); 7790 hdr->zh_opts_size = sizeof (ztest_shared_opts_t); 7791 hdr->zh_size = sizeof (ztest_shared_t); 7792 hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); 7793 
hdr->zh_stats_count = ZTEST_FUNCS; 7794 hdr->zh_ds_size = sizeof (ztest_shared_ds_t); 7795 hdr->zh_ds_count = ztest_opts.zo_datasets; 7796 7797 size = shared_data_size(hdr); 7798 VERIFY0(ftruncate(ztest_fd_data, size)); 7799 7800 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 7801} 7802 7803static void 7804setup_data(void) 7805{ 7806 int size, offset; 7807 ztest_shared_hdr_t *hdr; 7808 uint8_t *buf; 7809 7810 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 7811 PROT_READ, MAP_SHARED, ztest_fd_data, 0); 7812 ASSERT3P(hdr, !=, MAP_FAILED); 7813 7814 size = shared_data_size(hdr); 7815 7816 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 7817 hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), 7818 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 7819 ASSERT3P(hdr, !=, MAP_FAILED); 7820 buf = (uint8_t *)hdr; 7821 7822 offset = hdr->zh_hdr_size; 7823 ztest_shared_opts = (void *)&buf[offset]; 7824 offset += hdr->zh_opts_size; 7825 ztest_shared = (void *)&buf[offset]; 7826 offset += hdr->zh_size; 7827 ztest_shared_callstate = (void *)&buf[offset]; 7828 offset += hdr->zh_stats_size * hdr->zh_stats_count; 7829 ztest_shared_ds = (void *)&buf[offset]; 7830} 7831 7832static boolean_t 7833exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) 7834{ 7835 pid_t pid; 7836 int status; 7837 char *cmdbuf = NULL; 7838 7839 pid = fork(); 7840 7841 if (cmd == NULL) { 7842 cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 7843 (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); 7844 cmd = cmdbuf; 7845 } 7846 7847 if (pid == -1) 7848 fatal(1, "fork failed"); 7849 7850 if (pid == 0) { /* child */ 7851 char *emptyargv[2] = { cmd, NULL }; 7852 char fd_data_str[12]; 7853 7854 struct rlimit rl = { 1024, 1024 }; 7855 (void) setrlimit(RLIMIT_NOFILE, &rl); 7856 7857 (void) close(ztest_fd_rand); 7858 VERIFY3S(11, >=, 7859 snprintf(fd_data_str, 12, "%d", ztest_fd_data)); 7860 
VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); 7861 7862 (void) enable_extended_FILE_stdio(-1, -1); 7863 if (libpath != NULL) 7864 VERIFY0(setenv("LD_LIBRARY_PATH", libpath, 1)); 7865 (void) execv(cmd, emptyargv); 7866 ztest_dump_core = B_FALSE; 7867 fatal(B_TRUE, "exec failed: %s", cmd); 7868 } 7869 7870 if (cmdbuf != NULL) { 7871 umem_free(cmdbuf, MAXPATHLEN); 7872 cmd = NULL; 7873 } 7874 7875 while (waitpid(pid, &status, 0) != pid) 7876 continue; 7877 if (statusp != NULL) 7878 *statusp = status; 7879 7880 if (WIFEXITED(status)) { 7881 if (WEXITSTATUS(status) != 0) { 7882 (void) fprintf(stderr, "child exited with code %d\n", 7883 WEXITSTATUS(status)); 7884 exit(2); 7885 } 7886 return (B_FALSE); 7887 } else if (WIFSIGNALED(status)) { 7888 if (!ignorekill || WTERMSIG(status) != SIGKILL) { 7889 (void) fprintf(stderr, "child died with signal %d\n", 7890 WTERMSIG(status)); 7891 exit(3); 7892 } 7893 return (B_TRUE); 7894 } else { 7895 (void) fprintf(stderr, "something strange happened to child\n"); 7896 exit(4); 7897 /* NOTREACHED */ 7898 } 7899} 7900 7901static void 7902ztest_run_init(void) 7903{ 7904 int i; 7905 7906 ztest_shared_t *zs = ztest_shared; 7907 7908 /* 7909 * Blow away any existing copy of zpool.cache 7910 */ 7911 (void) remove(spa_config_path); 7912 7913 if (ztest_opts.zo_init == 0) { 7914 if (ztest_opts.zo_verbose >= 1) 7915 (void) printf("Importing pool %s\n", 7916 ztest_opts.zo_pool); 7917 ztest_import(zs); 7918 return; 7919 } 7920 7921 /* 7922 * Create and initialize our storage pool. 
7923 */ 7924 for (i = 1; i <= ztest_opts.zo_init; i++) { 7925 bzero(zs, sizeof (ztest_shared_t)); 7926 if (ztest_opts.zo_verbose >= 3 && 7927 ztest_opts.zo_init != 1) { 7928 (void) printf("ztest_init(), pass %d\n", i); 7929 } 7930 ztest_init(zs); 7931 } 7932} 7933 7934int 7935main(int argc, char **argv) 7936{ 7937 int kills = 0; 7938 int iters = 0; 7939 int older = 0; 7940 int newer = 0; 7941 ztest_shared_t *zs; 7942 ztest_info_t *zi; 7943 ztest_shared_callstate_t *zc; 7944 char timebuf[100]; 7945 char numbuf[NN_NUMBUF_SZ]; 7946 char *cmd; 7947 boolean_t hasalt; 7948 int f, err; 7949 char *fd_data_str = getenv("ZTEST_FD_DATA"); 7950 struct sigaction action; 7951 7952 (void) setvbuf(stdout, NULL, _IOLBF, 0); 7953 7954 dprintf_setup(&argc, argv); 7955 zfs_deadman_synctime_ms = 300000; 7956 zfs_deadman_checktime_ms = 30000; 7957 /* 7958 * As two-word space map entries may not come up often (especially 7959 * if pool and vdev sizes are small) we want to force at least some 7960 * of them so the feature get tested. 7961 */ 7962 zfs_force_some_double_word_sm_entries = B_TRUE; 7963 7964 /* 7965 * Verify that even extensively damaged split blocks with many 7966 * segments can be reconstructed in a reasonable amount of time 7967 * when reconstruction is known to be possible. 7968 * 7969 * Note: the lower this value is, the more damage we inflict, and 7970 * the more time ztest spends in recovering that damage. We chose 7971 * to induce damage 1/100th of the time so recovery is tested but 7972 * not so frequently that ztest doesn't get to test other code paths. 
7973 */ 7974 zfs_reconstruct_indirect_damage_fraction = 100; 7975 7976 action.sa_handler = sig_handler; 7977 sigemptyset(&action.sa_mask); 7978 action.sa_flags = 0; 7979 7980 if (sigaction(SIGSEGV, &action, NULL) < 0) { 7981 (void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n", 7982 strerror(errno)); 7983 exit(EXIT_FAILURE); 7984 } 7985 7986 if (sigaction(SIGABRT, &action, NULL) < 0) { 7987 (void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n", 7988 strerror(errno)); 7989 exit(EXIT_FAILURE); 7990 } 7991 7992 /* 7993 * Force random_get_bytes() to use /dev/urandom in order to prevent 7994 * ztest from needlessly depleting the system entropy pool. 7995 */ 7996 random_path = "/dev/urandom"; 7997 ztest_fd_rand = open(random_path, O_RDONLY); 7998 ASSERT3S(ztest_fd_rand, >=, 0); 7999 8000 if (!fd_data_str) { 8001 process_options(argc, argv); 8002 8003 setup_data_fd(); 8004 setup_hdr(); 8005 setup_data(); 8006 bcopy(&ztest_opts, ztest_shared_opts, 8007 sizeof (*ztest_shared_opts)); 8008 } else { 8009 ztest_fd_data = atoi(fd_data_str); 8010 setup_data(); 8011 bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts)); 8012 } 8013 ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); 8014 8015 err = ztest_set_global_vars(); 8016 if (err != 0 && !fd_data_str) { 8017 /* error message done by ztest_set_global_vars */ 8018 exit(EXIT_FAILURE); 8019 } else { 8020 /* children should not be spawned if setting gvars fails */ 8021 VERIFY3S(err, ==, 0); 8022 } 8023 8024 /* Override location of zpool.cache */ 8025 VERIFY3S(asprintf((char **)&spa_config_path, "%s/zpool.cache", 8026 ztest_opts.zo_dir), !=, -1); 8027 8028 ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), 8029 UMEM_NOFAIL); 8030 zs = ztest_shared; 8031 8032 if (fd_data_str) { 8033 metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging; 8034 metaslab_df_alloc_threshold = 8035 zs->zs_metaslab_df_alloc_threshold; 8036 8037 if (zs->zs_do_init) 8038 ztest_run_init(); 8039 else 
8040 ztest_run(zs); 8041 exit(0); 8042 } 8043 8044 hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); 8045 8046 if (ztest_opts.zo_verbose >= 1) { 8047 (void) printf("%llu vdevs, %d datasets, %d threads," 8048 "%d %s disks, %llu seconds...\n\n", 8049 (u_longlong_t)ztest_opts.zo_vdevs, 8050 ztest_opts.zo_datasets, 8051 ztest_opts.zo_threads, 8052 ztest_opts.zo_raid_children, 8053 ztest_opts.zo_raid_type, 8054 (u_longlong_t)ztest_opts.zo_time); 8055 } 8056 8057 cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); 8058 (void) strlcpy(cmd, getexecname(), MAXNAMELEN); 8059 8060 zs->zs_do_init = B_TRUE; 8061 if (strlen(ztest_opts.zo_alt_ztest) != 0) { 8062 if (ztest_opts.zo_verbose >= 1) { 8063 (void) printf("Executing older ztest for " 8064 "initialization: %s\n", ztest_opts.zo_alt_ztest); 8065 } 8066 VERIFY(!exec_child(ztest_opts.zo_alt_ztest, 8067 ztest_opts.zo_alt_libpath, B_FALSE, NULL)); 8068 } else { 8069 VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL)); 8070 } 8071 zs->zs_do_init = B_FALSE; 8072 8073 zs->zs_proc_start = gethrtime(); 8074 zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC; 8075 8076 for (f = 0; f < ZTEST_FUNCS; f++) { 8077 zi = &ztest_info[f]; 8078 zc = ZTEST_GET_SHARED_CALLSTATE(f); 8079 if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) 8080 zc->zc_next = UINT64_MAX; 8081 else 8082 zc->zc_next = zs->zs_proc_start + 8083 ztest_random(2 * zi->zi_interval[0] + 1); 8084 } 8085 8086 /* 8087 * Run the tests in a loop. These tests include fault injection 8088 * to verify that self-healing data works, and forced crashes 8089 * to verify that we never lose on-disk consistency. 8090 */ 8091 while (gethrtime() < zs->zs_proc_stop) { 8092 int status; 8093 boolean_t killed; 8094 8095 /* 8096 * Initialize the workload counters for each function. 
8097 */ 8098 for (f = 0; f < ZTEST_FUNCS; f++) { 8099 zc = ZTEST_GET_SHARED_CALLSTATE(f); 8100 zc->zc_count = 0; 8101 zc->zc_time = 0; 8102 } 8103 8104 /* Set the allocation switch size */ 8105 zs->zs_metaslab_df_alloc_threshold = 8106 ztest_random(zs->zs_metaslab_sz / 4) + 1; 8107 8108 if (!hasalt || ztest_random(2) == 0) { 8109 if (hasalt && ztest_opts.zo_verbose >= 1) { 8110 (void) printf("Executing newer ztest: %s\n", 8111 cmd); 8112 } 8113 newer++; 8114 killed = exec_child(cmd, NULL, B_TRUE, &status); 8115 } else { 8116 if (hasalt && ztest_opts.zo_verbose >= 1) { 8117 (void) printf("Executing older ztest: %s\n", 8118 ztest_opts.zo_alt_ztest); 8119 } 8120 older++; 8121 killed = exec_child(ztest_opts.zo_alt_ztest, 8122 ztest_opts.zo_alt_libpath, B_TRUE, &status); 8123 } 8124 8125 if (killed) 8126 kills++; 8127 iters++; 8128 8129 if (ztest_opts.zo_verbose >= 1) { 8130 hrtime_t now = gethrtime(); 8131 8132 now = MIN(now, zs->zs_proc_stop); 8133 print_time(zs->zs_proc_stop - now, timebuf); 8134 nicenum(zs->zs_space, numbuf, sizeof (numbuf)); 8135 8136 (void) printf("Pass %3d, %8s, %3llu ENOSPC, " 8137 "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", 8138 iters, 8139 WIFEXITED(status) ? 
"Complete" : "SIGKILL", 8140 (u_longlong_t)zs->zs_enospc_count, 8141 100.0 * zs->zs_alloc / zs->zs_space, 8142 numbuf, 8143 100.0 * (now - zs->zs_proc_start) / 8144 (ztest_opts.zo_time * NANOSEC), timebuf); 8145 } 8146 8147 if (ztest_opts.zo_verbose >= 2) { 8148 (void) printf("\nWorkload summary:\n\n"); 8149 (void) printf("%7s %9s %s\n", 8150 "Calls", "Time", "Function"); 8151 (void) printf("%7s %9s %s\n", 8152 "-----", "----", "--------"); 8153 for (f = 0; f < ZTEST_FUNCS; f++) { 8154 zi = &ztest_info[f]; 8155 zc = ZTEST_GET_SHARED_CALLSTATE(f); 8156 print_time(zc->zc_time, timebuf); 8157 (void) printf("%7llu %9s %s\n", 8158 (u_longlong_t)zc->zc_count, timebuf, 8159 zi->zi_funcname); 8160 } 8161 (void) printf("\n"); 8162 } 8163 8164 if (!ztest_opts.zo_mmp_test) 8165 ztest_run_zdb(ztest_opts.zo_pool); 8166 } 8167 8168 if (ztest_opts.zo_verbose >= 1) { 8169 if (hasalt) { 8170 (void) printf("%d runs of older ztest: %s\n", older, 8171 ztest_opts.zo_alt_ztest); 8172 (void) printf("%d runs of newer ztest: %s\n", newer, 8173 cmd); 8174 } 8175 (void) printf("%d killed, %d completed, %.0f%% kill rate\n", 8176 kills, iters - kills, (100.0 * kills) / MAX(1, iters)); 8177 } 8178 8179 umem_free(cmd, MAXNAMELEN); 8180 8181 return (0); 8182} 8183