spa_history.c revision 277585
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 25 */ 26 27#include <sys/spa.h> 28#include <sys/spa_impl.h> 29#include <sys/zap.h> 30#include <sys/dsl_synctask.h> 31#include <sys/dmu_tx.h> 32#include <sys/dmu_objset.h> 33#include <sys/dsl_dataset.h> 34#include <sys/dsl_dir.h> 35#include <sys/utsname.h> 36#include <sys/sunddi.h> 37#include <sys/cred.h> 38#include "zfs_comutil.h" 39#ifdef _KERNEL 40#include <sys/cmn_err.h> 41#include <sys/zone.h> 42#endif 43 44/* 45 * Routines to manage the on-disk history log. 46 * 47 * The history log is stored as a dmu object containing 48 * <packed record length, record nvlist> tuples. 49 * 50 * Where "record nvlist" is a nvlist containing uint64_ts and strings, and 51 * "packed record length" is the packed length of the "record nvlist" stored 52 * as a little endian uint64_t. 53 * 54 * The log is implemented as a ring buffer, though the original creation 55 * of the pool ('zpool create') is never overwritten. 56 * 57 * The history log is tracked as object 'spa_t::spa_history'. The bonus buffer 58 * of 'spa_history' stores the offsets for logging/retrieving history as 59 * 'spa_history_phys_t'. 'sh_pool_create_len' is the ending offset in bytes of 60 * where the 'zpool create' record is stored. This allows us to never 61 * overwrite the original creation of the pool. 'sh_phys_max_off' is the 62 * physical ending offset in bytes of the log. This tells you the length of 63 * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record 64 * is added, 'sh_eof' is incremented by the the size of the record. 65 * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes). 66 * This is where the consumer should start reading from after reading in 67 * the 'zpool create' portion of the log. 68 * 69 * 'sh_records_lost' keeps track of how many records have been overwritten 70 * and permanently lost. 71 */ 72 73/* convert a logical offset to physical */ 74static uint64_t 75spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp) 76{ 77 uint64_t phys_len; 78 79 phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len; 80 return ((log_off - shpp->sh_pool_create_len) % phys_len 81 + shpp->sh_pool_create_len); 82} 83 84void 85spa_history_create_obj(spa_t *spa, dmu_tx_t *tx) 86{ 87 dmu_buf_t *dbp; 88 spa_history_phys_t *shpp; 89 objset_t *mos = spa->spa_meta_objset; 90 91 ASSERT(spa->spa_history == 0); 92 spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY, 93 SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS, 94 sizeof (spa_history_phys_t), tx); 95 96 VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, 97 DMU_POOL_HISTORY, sizeof (uint64_t), 1, 98 &spa->spa_history, tx) == 0); 99 100 VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); 101 ASSERT(dbp->db_size >= sizeof (spa_history_phys_t)); 102 103 shpp = dbp->db_data; 104 dmu_buf_will_dirty(dbp, tx); 105 106 /* 107 * Figure out maximum size of history log. We set it at 108 * 0.1% of pool size, with a max of 1G and min of 128KB. 109 */ 110 shpp->sh_phys_max_off = 111 metaslab_class_get_dspace(spa_normal_class(spa)) / 1000; 112 shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 1<<30); 113 shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10); 114 115 dmu_buf_rele(dbp, FTAG); 116} 117 118/* 119 * Change 'sh_bof' to the beginning of the next record. 120 */ 121static int 122spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp) 123{ 124 objset_t *mos = spa->spa_meta_objset; 125 uint64_t firstread, reclen, phys_bof; 126 char buf[sizeof (reclen)]; 127 int err; 128 129 phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp); 130 firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof); 131 132 if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread, 133 buf, DMU_READ_PREFETCH)) != 0) 134 return (err); 135 if (firstread != sizeof (reclen)) { 136 if ((err = dmu_read(mos, spa->spa_history, 137 shpp->sh_pool_create_len, sizeof (reclen) - firstread, 138 buf + firstread, DMU_READ_PREFETCH)) != 0) 139 return (err); 140 } 141 142 reclen = LE_64(*((uint64_t *)buf)); 143 shpp->sh_bof += reclen + sizeof (reclen); 144 shpp->sh_records_lost++; 145 return (0); 146} 147 148static int 149spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp, 150 dmu_tx_t *tx) 151{ 152 uint64_t firstwrite, phys_eof; 153 objset_t *mos = spa->spa_meta_objset; 154 int err; 155 156 ASSERT(MUTEX_HELD(&spa->spa_history_lock)); 157 158 /* see if we need to reset logical BOF */ 159 while (shpp->sh_phys_max_off - shpp->sh_pool_create_len - 160 (shpp->sh_eof - shpp->sh_bof) <= len) { 161 if ((err = spa_history_advance_bof(spa, shpp)) != 0) { 162 return (err); 163 } 164 } 165 166 phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); 167 firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof); 168 shpp->sh_eof += len; 169 dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx); 170 171 len -= firstwrite; 172 if (len > 0) { 173 /* write out the rest at the beginning of physical file */ 174 dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len, 175 len, (char *)buf + firstwrite, tx); 176 } 177 178 return (0); 179} 180 181static char * 182spa_history_zone(void) 183{ 184#ifdef _KERNEL 185 /* XXX: pr_hostname can be changed by default from within a jail! */ 186 if (jailed(curthread->td_ucred)) 187 return (curthread->td_ucred->cr_prison->pr_hostname); 188#endif 189 return (NULL); 190} 191 192/* 193 * Write out a history event. 194 */ 195/*ARGSUSED*/ 196static void 197spa_history_log_sync(void *arg, dmu_tx_t *tx) 198{ 199 nvlist_t *nvl = arg; 200 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 201 objset_t *mos = spa->spa_meta_objset; 202 dmu_buf_t *dbp; 203 spa_history_phys_t *shpp; 204 size_t reclen; 205 uint64_t le_len; 206 char *record_packed = NULL; 207 int ret; 208 209 /* 210 * If we have an older pool that doesn't have a command 211 * history object, create it now. 212 */ 213 mutex_enter(&spa->spa_history_lock); 214 if (!spa->spa_history) 215 spa_history_create_obj(spa, tx); 216 mutex_exit(&spa->spa_history_lock); 217 218 /* 219 * Get the offset of where we need to write via the bonus buffer. 220 * Update the offset when the write completes. 221 */ 222 VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); 223 shpp = dbp->db_data; 224 225 dmu_buf_will_dirty(dbp, tx); 226 227#ifdef ZFS_DEBUG 228 { 229 dmu_object_info_t doi; 230 dmu_object_info_from_db(dbp, &doi); 231 ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); 232 } 233#endif 234 235 fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec()); 236#ifdef _KERNEL 237 fnvlist_add_string(nvl, ZPOOL_HIST_HOST, utsname.nodename); 238#endif 239 if (nvlist_exists(nvl, ZPOOL_HIST_CMD)) { 240 zfs_dbgmsg("command: %s", 241 fnvlist_lookup_string(nvl, ZPOOL_HIST_CMD)); 242 } else if (nvlist_exists(nvl, ZPOOL_HIST_INT_NAME)) { 243 if (nvlist_exists(nvl, ZPOOL_HIST_DSNAME)) { 244 zfs_dbgmsg("txg %lld %s %s (id %llu) %s", 245 fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG), 246 fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME), 247 fnvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME), 248 fnvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID), 249 fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR)); 250 } else { 251 zfs_dbgmsg("txg %lld %s %s", 252 fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG), 253 fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME), 254 fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR)); 255 } 256 } else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) { 257 zfs_dbgmsg("ioctl %s", 258 fnvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL)); 259 } 260 261 record_packed = fnvlist_pack(nvl, &reclen); 262 263 mutex_enter(&spa->spa_history_lock); 264 265 /* write out the packed length as little endian */ 266 le_len = LE_64((uint64_t)reclen); 267 ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx); 268 if (!ret) 269 ret = spa_history_write(spa, record_packed, reclen, shpp, tx); 270 271 /* The first command is the create, which we keep forever */ 272 if (ret == 0 && shpp->sh_pool_create_len == 0 && 273 nvlist_exists(nvl, ZPOOL_HIST_CMD)) { 274 shpp->sh_pool_create_len = shpp->sh_bof = shpp->sh_eof; 275 } 276 277 mutex_exit(&spa->spa_history_lock); 278 fnvlist_pack_free(record_packed, reclen); 279 dmu_buf_rele(dbp, FTAG); 280 fnvlist_free(nvl); 281} 282 283/* 284 * Write out a history event. 285 */ 286int 287spa_history_log(spa_t *spa, const char *msg) 288{ 289 int err; 290 nvlist_t *nvl = fnvlist_alloc(); 291 292 fnvlist_add_string(nvl, ZPOOL_HIST_CMD, msg); 293 err = spa_history_log_nvl(spa, nvl); 294 fnvlist_free(nvl); 295 return (err); 296} 297 298int 299spa_history_log_nvl(spa_t *spa, nvlist_t *nvl) 300{ 301 int err = 0; 302 dmu_tx_t *tx; 303 nvlist_t *nvarg; 304 305 if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) 306 return (EINVAL); 307 308 if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa)) 309 return (SET_ERROR(EINVAL)); 310 311 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 312 err = dmu_tx_assign(tx, TXG_WAIT); 313 if (err) { 314 dmu_tx_abort(tx); 315 return (err); 316 } 317 318 nvarg = fnvlist_dup(nvl); 319 if (spa_history_zone() != NULL) { 320 fnvlist_add_string(nvarg, ZPOOL_HIST_ZONE, 321 spa_history_zone()); 322 } 323 fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED())); 324 325 /* Kick this off asynchronously; errors are ignored. */ 326 dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, 327 nvarg, 0, ZFS_SPACE_CHECK_NONE, tx); 328 dmu_tx_commit(tx); 329 330 /* spa_history_log_sync will free nvl */ 331 return (err); 332 333} 334 335/* 336 * Read out the command history. 337 */ 338int 339spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) 340{ 341 objset_t *mos = spa->spa_meta_objset; 342 dmu_buf_t *dbp; 343 uint64_t read_len, phys_read_off, phys_eof; 344 uint64_t leftover = 0; 345 spa_history_phys_t *shpp; 346 int err; 347 348 /* 349 * If the command history doesn't exist (older pool), 350 * that's ok, just return ENOENT. 351 */ 352 if (!spa->spa_history) 353 return (SET_ERROR(ENOENT)); 354 355 /* 356 * The history is logged asynchronously, so when they request 357 * the first chunk of history, make sure everything has been 358 * synced to disk so that we get it. 359 */ 360 if (*offp == 0 && spa_writeable(spa)) 361 txg_wait_synced(spa_get_dsl(spa), 0); 362 363 if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0) 364 return (err); 365 shpp = dbp->db_data; 366 367#ifdef ZFS_DEBUG 368 { 369 dmu_object_info_t doi; 370 dmu_object_info_from_db(dbp, &doi); 371 ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); 372 } 373#endif 374 375 mutex_enter(&spa->spa_history_lock); 376 phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); 377 378 if (*offp < shpp->sh_pool_create_len) { 379 /* read in just the zpool create history */ 380 phys_read_off = *offp; 381 read_len = MIN(*len, shpp->sh_pool_create_len - 382 phys_read_off); 383 } else { 384 /* 385 * Need to reset passed in offset to BOF if the passed in 386 * offset has since been overwritten. 387 */ 388 *offp = MAX(*offp, shpp->sh_bof); 389 phys_read_off = spa_history_log_to_phys(*offp, shpp); 390 391 /* 392 * Read up to the minimum of what the user passed down or 393 * the EOF (physical or logical). If we hit physical EOF, 394 * use 'leftover' to read from the physical BOF. 395 */ 396 if (phys_read_off <= phys_eof) { 397 read_len = MIN(*len, phys_eof - phys_read_off); 398 } else { 399 read_len = MIN(*len, 400 shpp->sh_phys_max_off - phys_read_off); 401 if (phys_read_off + *len > shpp->sh_phys_max_off) { 402 leftover = MIN(*len - read_len, 403 phys_eof - shpp->sh_pool_create_len); 404 } 405 } 406 } 407 408 /* offset for consumer to use next */ 409 *offp += read_len + leftover; 410 411 /* tell the consumer how much you actually read */ 412 *len = read_len + leftover; 413 414 if (read_len == 0) { 415 mutex_exit(&spa->spa_history_lock); 416 dmu_buf_rele(dbp, FTAG); 417 return (0); 418 } 419 420 err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf, 421 DMU_READ_PREFETCH); 422 if (leftover && err == 0) { 423 err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, 424 leftover, buf + read_len, DMU_READ_PREFETCH); 425 } 426 mutex_exit(&spa->spa_history_lock); 427 428 dmu_buf_rele(dbp, FTAG); 429 return (err); 430} 431 432/* 433 * The nvlist will be consumed by this call. 434 */ 435static void 436log_internal(nvlist_t *nvl, const char *operation, spa_t *spa, 437 dmu_tx_t *tx, const char *fmt, va_list adx) 438{ 439 char *msg; 440 va_list adx2; 441 442 /* 443 * If this is part of creating a pool, not everything is 444 * initialized yet, so don't bother logging the internal events. 445 * Likewise if the pool is not writeable. 446 */ 447 if (tx->tx_txg == TXG_INITIAL || !spa_writeable(spa)) { 448 fnvlist_free(nvl); 449 return; 450 } 451 452 va_copy(adx2, adx); 453 454 msg = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1, KM_SLEEP); 455 (void) vsprintf(msg, fmt, adx2); 456 fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, msg); 457 strfree(msg); 458 459 va_end(adx2); 460 461 fnvlist_add_string(nvl, ZPOOL_HIST_INT_NAME, operation); 462 fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg); 463 464 if (dmu_tx_is_syncing(tx)) { 465 spa_history_log_sync(nvl, tx); 466 } else { 467 dsl_sync_task_nowait(spa_get_dsl(spa), 468 spa_history_log_sync, nvl, 0, ZFS_SPACE_CHECK_NONE, tx); 469 } 470 /* spa_history_log_sync() will free nvl */ 471} 472 473void 474spa_history_log_internal(spa_t *spa, const char *operation, 475 dmu_tx_t *tx, const char *fmt, ...) 476{ 477 dmu_tx_t *htx = tx; 478 va_list adx; 479 480 /* create a tx if we didn't get one */ 481 if (tx == NULL) { 482 htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 483 if (dmu_tx_assign(htx, TXG_WAIT) != 0) { 484 dmu_tx_abort(htx); 485 return; 486 } 487 } 488 489 va_start(adx, fmt); 490 log_internal(fnvlist_alloc(), operation, spa, htx, fmt, adx); 491 va_end(adx); 492 493 /* if we didn't get a tx from the caller, commit the one we made */ 494 if (tx == NULL) 495 dmu_tx_commit(htx); 496} 497 498void 499spa_history_log_internal_ds(dsl_dataset_t *ds, const char *operation, 500 dmu_tx_t *tx, const char *fmt, ...) 501{ 502 va_list adx; 503 char namebuf[MAXNAMELEN]; 504 nvlist_t *nvl = fnvlist_alloc(); 505 506 ASSERT(tx != NULL); 507 508 dsl_dataset_name(ds, namebuf); 509 fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf); 510 fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, ds->ds_object); 511 512 va_start(adx, fmt); 513 log_internal(nvl, operation, dsl_dataset_get_spa(ds), tx, fmt, adx); 514 va_end(adx); 515} 516 517void 518spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, 519 dmu_tx_t *tx, const char *fmt, ...) 520{ 521 va_list adx; 522 char namebuf[MAXNAMELEN]; 523 nvlist_t *nvl = fnvlist_alloc(); 524 525 ASSERT(tx != NULL); 526 527 dsl_dir_name(dd, namebuf); 528 fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf); 529 fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, 530 dsl_dir_phys(dd)->dd_head_dataset_obj); 531 532 va_start(adx, fmt); 533 log_internal(nvl, operation, dd->dd_pool->dp_spa, tx, fmt, adx); 534 va_end(adx); 535} 536 537void 538spa_history_log_version(spa_t *spa, const char *operation) 539{ 540 spa_history_log_internal(spa, operation, NULL, 541 "pool version %llu; software version %llu/%d; uts %s %s %s %s", 542 (u_longlong_t)spa_version(spa), SPA_VERSION, ZPL_VERSION, 543 utsname.nodename, utsname.release, utsname.version, 544 utsname.machine); 545} 546