spa_history.c revision 276081
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 */

#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zap.h>
#include <sys/dsl_synctask.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/utsname.h>
#include <sys/sunddi.h>
#include <sys/cred.h>
#include "zfs_comutil.h"
#ifdef _KERNEL
#include <sys/cmn_err.h>
#include <sys/zone.h>
#endif

/*
 * Routines to manage the on-disk history log.
 *
 * The history log is stored as a dmu object containing
 * <packed record length, record nvlist> tuples.
 *
 * Where "record nvlist" is a nvlist containing uint64_ts and strings, and
 * "packed record length" is the packed length of the "record nvlist" stored
 * as a little endian uint64_t.
 *
 * The log is implemented as a ring buffer, though the original creation
 * of the pool ('zpool create') is never overwritten.
 *
 * The history log is tracked as object 'spa_t::spa_history'.  The bonus buffer
 * of 'spa_history' stores the offsets for logging/retrieving history as
 * 'spa_history_phys_t'.  'sh_pool_create_len' is the ending offset in bytes of
 * where the 'zpool create' record is stored.  This allows us to never
 * overwrite the original creation of the pool.  'sh_phys_max_off' is the
 * physical ending offset in bytes of the log.  This tells you the length of
 * the buffer. 'sh_eof' is the logical EOF (in bytes).  Whenever a record
 * is added, 'sh_eof' is incremented by the size of the record.
 * 'sh_eof' is never decremented.  'sh_bof' is the logical BOF (in bytes).
 * This is where the consumer should start reading from after reading in
 * the 'zpool create' portion of the log.
 *
 * 'sh_records_lost' keeps track of how many records have been overwritten
 * and permanently lost.
 */

/* convert a logical offset to physical */
static uint64_t
spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp)
{
	uint64_t phys_len;

	phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len;
	return ((log_off - shpp->sh_pool_create_len) % phys_len
	    + shpp->sh_pool_create_len);
}
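
/*
 * Illustrative note (editorial, not from the original source): a worked
 * example of the mapping above, using made-up offsets.  Suppose
 * sh_pool_create_len is 512 and sh_phys_max_off is 131072, so the wrappable
 * region is 130560 bytes.  A logical offset of 131500 then maps to physical
 * offset 512 + ((131500 - 512) % 130560) = 512 + 428 = 940, i.e. the log has
 * wrapped past the physical end and landed just beyond the preserved
 * 'zpool create' region.
 */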

void
spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
{
	dmu_buf_t *dbp;
	spa_history_phys_t *shpp;
	objset_t *mos = spa->spa_meta_objset;

	ASSERT(spa->spa_history == 0);
	spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
	    SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
	    sizeof (spa_history_phys_t), tx);

	VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_HISTORY, sizeof (uint64_t), 1,
	    &spa->spa_history, tx) == 0);

	VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
	ASSERT(dbp->db_size >= sizeof (spa_history_phys_t));

	shpp = dbp->db_data;
	dmu_buf_will_dirty(dbp, tx);

	/*
	 * Figure out maximum size of history log.  We set it at
	 * 0.1% of pool size, with a max of 1G and min of 128KB.
	 */
	shpp->sh_phys_max_off =
	    metaslab_class_get_dspace(spa_normal_class(spa)) / 1000;
	shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 1<<30);
	shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10);
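	/*
	 * Illustrative sizing examples (editorial, not in the original): a
	 * 64GB pool gets roughly a 64MB log, a 10TB pool hits the 1GB cap,
	 * and a 100MB pool is raised to the 128KB floor.
	 */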

	dmu_buf_rele(dbp, FTAG);
}

/*
 * Change 'sh_bof' to the beginning of the next record.
 */
static int
spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp)
{
	objset_t *mos = spa->spa_meta_objset;
	uint64_t firstread, reclen, phys_bof;
	char buf[sizeof (reclen)];
	int err;

	phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp);
	firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof);

	if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread,
	    buf, DMU_READ_PREFETCH)) != 0)
		return (err);
	if (firstread != sizeof (reclen)) {
		if ((err = dmu_read(mos, spa->spa_history,
		    shpp->sh_pool_create_len, sizeof (reclen) - firstread,
		    buf + firstread, DMU_READ_PREFETCH)) != 0)
			return (err);
	}

	reclen = LE_64(*((uint64_t *)buf));
	shpp->sh_bof += reclen + sizeof (reclen);
	shpp->sh_records_lost++;
	return (0);
}

static int
spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp,
    dmu_tx_t *tx)
{
	uint64_t firstwrite, phys_eof;
	objset_t *mos = spa->spa_meta_objset;
	int err;

	ASSERT(MUTEX_HELD(&spa->spa_history_lock));

	/* see if we need to reset logical BOF */
	while (shpp->sh_phys_max_off - shpp->sh_pool_create_len -
	    (shpp->sh_eof - shpp->sh_bof) <= len) {
		if ((err = spa_history_advance_bof(spa, shpp)) != 0) {
			return (err);
		}
	}

	phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
	firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof);
	shpp->sh_eof += len;
	dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx);

	len -= firstwrite;
	if (len > 0) {
		/* write out the rest at the beginning of physical file */
		dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len,
		    len, (char *)buf + firstwrite, tx);
	}

	return (0);
}
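
/*
 * Illustrative example (editorial, not from the original source): with
 * sh_phys_max_off = 131072 and phys_eof = 131000, writing a 200-byte record
 * puts the first 72 bytes at physical offset 131000 and the remaining 128
 * bytes at sh_pool_create_len, i.e. the record wraps around the ring while
 * leaving the preserved 'zpool create' region intact.
 */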

static char *
spa_history_zone(void)
{
#ifdef _KERNEL
	/* XXX: pr_hostname can be changed by default from within a jail! */
	if (jailed(curthread->td_ucred))
		return (curthread->td_ucred->cr_prison->pr_hostname);
#endif
	return (NULL);
}

/*
 * Write out a history event (sync-task callback).
 */
/*ARGSUSED*/
static void
spa_history_log_sync(void *arg, dmu_tx_t *tx)
{
	nvlist_t	*nvl = arg;
	spa_t		*spa = dmu_tx_pool(tx)->dp_spa;
	objset_t	*mos = spa->spa_meta_objset;
	dmu_buf_t	*dbp;
	spa_history_phys_t *shpp;
	size_t		reclen;
	uint64_t	le_len;
	char		*record_packed = NULL;
	int		ret;

	/*
	 * If we have an older pool that doesn't have a command
	 * history object, create it now.
	 */
	mutex_enter(&spa->spa_history_lock);
	if (!spa->spa_history)
		spa_history_create_obj(spa, tx);
	mutex_exit(&spa->spa_history_lock);

	/*
	 * Get the offset of where we need to write via the bonus buffer.
	 * Update the offset when the write completes.
	 */
	VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
	shpp = dbp->db_data;

	dmu_buf_will_dirty(dbp, tx);

#ifdef ZFS_DEBUG
	{
		dmu_object_info_t doi;
		dmu_object_info_from_db(dbp, &doi);
		ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
	}
#endif

	fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec());
#ifdef _KERNEL
	fnvlist_add_string(nvl, ZPOOL_HIST_HOST, utsname.nodename);
#endif
	if (nvlist_exists(nvl, ZPOOL_HIST_CMD)) {
		zfs_dbgmsg("command: %s",
		    fnvlist_lookup_string(nvl, ZPOOL_HIST_CMD));
	} else if (nvlist_exists(nvl, ZPOOL_HIST_INT_NAME)) {
		if (nvlist_exists(nvl, ZPOOL_HIST_DSNAME)) {
			zfs_dbgmsg("txg %lld %s %s (id %llu) %s",
			    fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG),
			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME),
			    fnvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME),
			    fnvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID),
			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR));
		} else {
			zfs_dbgmsg("txg %lld %s %s",
			    fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG),
			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME),
			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR));
		}
	} else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) {
		zfs_dbgmsg("ioctl %s",
		    fnvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL));
	}

	record_packed = fnvlist_pack(nvl, &reclen);

	mutex_enter(&spa->spa_history_lock);

	/* write out the packed length as little endian */
	le_len = LE_64((uint64_t)reclen);
	ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx);
	if (!ret)
		ret = spa_history_write(spa, record_packed, reclen, shpp, tx);

	/* The first command is the create, which we keep forever */
	if (ret == 0 && shpp->sh_pool_create_len == 0 &&
	    nvlist_exists(nvl, ZPOOL_HIST_CMD)) {
		shpp->sh_pool_create_len = shpp->sh_bof = shpp->sh_eof;
	}

	mutex_exit(&spa->spa_history_lock);
	fnvlist_pack_free(record_packed, reclen);
	dmu_buf_rele(dbp, FTAG);
	fnvlist_free(nvl);
}

/*
 * Write out a history event.
 */
int
spa_history_log(spa_t *spa, const char *msg)
{
	int err;
	nvlist_t *nvl = fnvlist_alloc();

	fnvlist_add_string(nvl, ZPOOL_HIST_CMD, msg);
	err = spa_history_log_nvl(spa, nvl);
	fnvlist_free(nvl);
	return (err);
}

int
spa_history_log_nvl(spa_t *spa, nvlist_t *nvl)
{
	int err = 0;
	dmu_tx_t *tx;
	nvlist_t *nvarg;

	if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa))
		return (SET_ERROR(EINVAL));

	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}

	nvarg = fnvlist_dup(nvl);
	if (spa_history_zone() != NULL) {
		fnvlist_add_string(nvarg, ZPOOL_HIST_ZONE,
		    spa_history_zone());
	}
	fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED()));

	/* Kick this off asynchronously; errors are ignored. */
	dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync,
	    nvarg, 0, ZFS_SPACE_CHECK_NONE, tx);
	dmu_tx_commit(tx);

	/* spa_history_log_sync() will free nvarg */
	return (err);
}
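
/*
 * Editorial sketch (illustrative, not from the original source): a caller
 * that wants a command recorded simply builds the message and hands it off;
 * the write itself happens later in syncing context.  The pool name and
 * command string below are made up.
 *
 *	spa_t *spa;
 *
 *	if (spa_open("tank", &spa, FTAG) == 0) {
 *		(void) spa_history_log(spa, "zpool set checksum=on tank");
 *		spa_close(spa, FTAG);
 *	}
 */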

/*
 * Read out the command history.
 */
int
spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
{
	objset_t *mos = spa->spa_meta_objset;
	dmu_buf_t *dbp;
	uint64_t read_len, phys_read_off, phys_eof;
	uint64_t leftover = 0;
	spa_history_phys_t *shpp;
	int err;

	/*
	 * If the command history doesn't exist (older pool),
	 * that's ok, just return ENOENT.
	 */
	if (!spa->spa_history)
		return (SET_ERROR(ENOENT));

	/*
	 * The history is logged asynchronously, so when they request
	 * the first chunk of history, make sure everything has been
	 * synced to disk so that we get it.
	 */
	if (*offp == 0 && spa_writeable(spa))
		txg_wait_synced(spa_get_dsl(spa), 0);

	if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0)
		return (err);
	shpp = dbp->db_data;

#ifdef ZFS_DEBUG
	{
		dmu_object_info_t doi;
		dmu_object_info_from_db(dbp, &doi);
		ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
	}
#endif

	mutex_enter(&spa->spa_history_lock);
	phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);

	if (*offp < shpp->sh_pool_create_len) {
		/* read in just the zpool create history */
		phys_read_off = *offp;
		read_len = MIN(*len, shpp->sh_pool_create_len -
		    phys_read_off);
	} else {
		/*
		 * Need to reset passed in offset to BOF if the passed in
		 * offset has since been overwritten.
		 */
		*offp = MAX(*offp, shpp->sh_bof);
		phys_read_off = spa_history_log_to_phys(*offp, shpp);

		/*
		 * Read up to the minimum of what the user passed down or
		 * the EOF (physical or logical).  If we hit physical EOF,
		 * use 'leftover' to read from the physical BOF.
		 */
		if (phys_read_off <= phys_eof) {
			read_len = MIN(*len, phys_eof - phys_read_off);
		} else {
			read_len = MIN(*len,
			    shpp->sh_phys_max_off - phys_read_off);
			if (phys_read_off + *len > shpp->sh_phys_max_off) {
				leftover = MIN(*len - read_len,
				    phys_eof - shpp->sh_pool_create_len);
			}
		}
	}

	/* offset for consumer to use next */
	*offp += read_len + leftover;

	/* tell the consumer how much you actually read */
	*len = read_len + leftover;

	if (read_len == 0) {
		mutex_exit(&spa->spa_history_lock);
		dmu_buf_rele(dbp, FTAG);
		return (0);
	}

	err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf,
	    DMU_READ_PREFETCH);
	if (leftover && err == 0) {
		err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len,
		    leftover, buf + read_len, DMU_READ_PREFETCH);
	}
	mutex_exit(&spa->spa_history_lock);

	dmu_buf_rele(dbp, FTAG);
	return (err);
}
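
/*
 * Editorial sketch (illustrative, not from the original source): a consumer
 * such as the 'zpool history' ioctl path is expected to call
 * spa_history_get() in a loop, feeding the updated offset back in until no
 * more bytes are returned.  The buffer size below is arbitrary.
 *
 *	uint64_t off = 0, resid;
 *	char buf[128 * 1024];
 *
 *	do {
 *		resid = sizeof (buf);
 *		if (spa_history_get(spa, &off, &resid, buf) != 0)
 *			break;
 *		(unpack <packed record length, record nvlist> tuples
 *		from the first 'resid' bytes of 'buf')
 *	} while (resid != 0);
 */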

/*
 * The nvlist will be consumed by this call.
 */
static void
log_internal(nvlist_t *nvl, const char *operation, spa_t *spa,
    dmu_tx_t *tx, const char *fmt, va_list adx)
{
	char *msg;
	va_list adx2;

	/*
	 * If this is part of creating a pool, not everything is
	 * initialized yet, so don't bother logging the internal events.
	 * Likewise if the pool is not writeable.
	 */
	if (tx->tx_txg == TXG_INITIAL || !spa_writeable(spa)) {
		fnvlist_free(nvl);
		return;
	}

	va_copy(adx2, adx);

	msg = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1, KM_SLEEP);
	(void) vsprintf(msg, fmt, adx2);
	fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, msg);
	strfree(msg);

	va_end(adx2);

	fnvlist_add_string(nvl, ZPOOL_HIST_INT_NAME, operation);
	fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg);

	if (dmu_tx_is_syncing(tx)) {
		spa_history_log_sync(nvl, tx);
	} else {
		dsl_sync_task_nowait(spa_get_dsl(spa),
		    spa_history_log_sync, nvl, 0, ZFS_SPACE_CHECK_NONE, tx);
	}
	/* spa_history_log_sync() will free nvl */
}

void
spa_history_log_internal(spa_t *spa, const char *operation,
    dmu_tx_t *tx, const char *fmt, ...)
{
	dmu_tx_t *htx = tx;
	va_list adx;

	/* create a tx if we didn't get one */
	if (tx == NULL) {
		htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
		if (dmu_tx_assign(htx, TXG_WAIT) != 0) {
			dmu_tx_abort(htx);
			return;
		}
	}

	va_start(adx, fmt);
	log_internal(fnvlist_alloc(), operation, spa, htx, fmt, adx);
	va_end(adx);

	/* if we didn't get a tx from the caller, commit the one we made */
	if (tx == NULL)
		dmu_tx_commit(htx);
}

void
spa_history_log_internal_ds(dsl_dataset_t *ds, const char *operation,
    dmu_tx_t *tx, const char *fmt, ...)
{
	va_list adx;
	char namebuf[MAXNAMELEN];
	nvlist_t *nvl = fnvlist_alloc();

	ASSERT(tx != NULL);

	dsl_dataset_name(ds, namebuf);
	fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf);
	fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, ds->ds_object);

	va_start(adx, fmt);
	log_internal(nvl, operation, dsl_dataset_get_spa(ds), tx, fmt, adx);
	va_end(adx);
}

void
spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation,
    dmu_tx_t *tx, const char *fmt, ...)
{
	va_list adx;
	char namebuf[MAXNAMELEN];
	nvlist_t *nvl = fnvlist_alloc();

	ASSERT(tx != NULL);

	dsl_dir_name(dd, namebuf);
	fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf);
	fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID,
	    dd->dd_phys->dd_head_dataset_obj);

	va_start(adx, fmt);
	log_internal(nvl, operation, dd->dd_pool->dp_spa, tx, fmt, adx);
	va_end(adx);
}

void
spa_history_log_version(spa_t *spa, const char *operation)
{
	spa_history_log_internal(spa, operation, NULL,
	    "pool version %llu; software version %llu/%d; uts %s %s %s %s",
	    (u_longlong_t)spa_version(spa), SPA_VERSION, ZPL_VERSION,
	    utsname.nodename, utsname.release, utsname.version,
	    utsname.machine);
}
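
/*
 * Editorial sketch (illustrative, not from the original source): callers
 * typically log internal events with a short operation name and a
 * printf-style message, for example from a property-setting sync task.
 * The operation name and value below are made up.
 *
 *	spa_history_log_internal(spa, "set", tx,
 *	    "%s=%lld", "copies", (longlong_t)newval);
 */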