case_file.cc revision 331395
1/*-
2 * Copyright (c) 2011, 2012, 2013, 2014, 2016 Spectra Logic Corporation
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions, and the following disclaimer,
10 *    without modification.
11 * 2. Redistributions in binary form must reproduce at minimum a disclaimer
12 *    substantially similar to the "NO WARRANTY" disclaimer below
13 *    ("Disclaimer") and any redistribution must be conditioned upon
14 *    including a substantially similar Disclaimer requirement for further
15 *    binary redistribution.
16 *
17 * NO WARRANTY
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
26 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
27 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGES.
29 *
30 * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
31 */
32
33/**
34 * \file case_file.cc
35 *
36 * We keep case files for any leaf vdev that is not in the optimal state.
37 * However, we only serialize to disk those events that need to be preserved
38 * across reboots.  For now, this is just a log of soft errors which we
39 * accumulate in order to mark a device as degraded.
40 */
41#include <sys/cdefs.h>
42#include <sys/time.h>
43
44#include <sys/fs/zfs.h>
45
46#include <dirent.h>
47#include <iomanip>
48#include <fstream>
49#include <functional>
50#include <sstream>
51#include <syslog.h>
52#include <unistd.h>
53
54#include <libzfs.h>
55
56#include <list>
57#include <map>
58#include <string>
59
60#include <devdctl/guid.h>
61#include <devdctl/event.h>
62#include <devdctl/event_factory.h>
63#include <devdctl/exception.h>
64#include <devdctl/consumer.h>
65
66#include "callout.h"
67#include "vdev_iterator.h"
68#include "zfsd_event.h"
69#include "case_file.h"
70#include "vdev.h"
71#include "zfsd.h"
72#include "zfsd_exception.h"
73#include "zpool_list.h"
74
75__FBSDID("$FreeBSD: stable/11/cddl/usr.sbin/zfsd/case_file.cc 331395 2018-03-22 23:54:14Z mav $");
76
77/*============================ Namespace Control =============================*/
78using std::auto_ptr;
79using std::hex;
80using std::ifstream;
81using std::stringstream;
82using std::setfill;
83using std::setw;
84
85using DevdCtl::Event;
86using DevdCtl::EventFactory;
87using DevdCtl::EventList;
88using DevdCtl::Guid;
89using DevdCtl::ParseException;
90
91/*--------------------------------- CaseFile ---------------------------------*/
92//- CaseFile Static Data -------------------------------------------------------
93
94CaseFileList  CaseFile::s_activeCases;
95const string  CaseFile::s_caseFilePath = "/var/db/zfsd/cases";
96const timeval CaseFile::s_removeGracePeriod = { 60 /*sec*/, 0 /*usec*/};
97
98//- CaseFile Static Public Methods ---------------------------------------------
99CaseFile *
100CaseFile::Find(Guid poolGUID, Guid vdevGUID)
101{
102	for (CaseFileList::iterator curCase = s_activeCases.begin();
103	     curCase != s_activeCases.end(); curCase++) {
104
105		if (((*curCase)->PoolGUID() != poolGUID
106		  && Guid::InvalidGuid() != poolGUID)
107		 || (*curCase)->VdevGUID() != vdevGUID)
108			continue;
109
110		/*
111		 * We only carry one active case per-vdev.
112		 */
113		return (*curCase);
114	}
115	return (NULL);
116}
117
118CaseFile *
119CaseFile::Find(const string &physPath)
120{
121	CaseFile *result = NULL;
122
123	for (CaseFileList::iterator curCase = s_activeCases.begin();
124	     curCase != s_activeCases.end(); curCase++) {
125
126		if ((*curCase)->PhysicalPath() != physPath)
127			continue;
128
129		if (result != NULL) {
130			syslog(LOG_WARNING, "Multiple casefiles found for "
131			    "physical path %s.  "
132			    "This is most likely a bug in zfsd",
133			    physPath.c_str());
134		}
135		result = *curCase;
136	}
137	return (result);
138}
139
140
141void
142CaseFile::ReEvaluateByGuid(Guid poolGUID, const ZfsEvent &event)
143{
144	CaseFileList::iterator casefile;
145	for (casefile = s_activeCases.begin(); casefile != s_activeCases.end();){
146		CaseFileList::iterator next = casefile;
147		next++;
148		if (poolGUID == (*casefile)->PoolGUID())
149			(*casefile)->ReEvaluate(event);
150		casefile = next;
151	}
152}
153
154CaseFile &
155CaseFile::Create(Vdev &vdev)
156{
157	CaseFile *activeCase;
158
159	activeCase = Find(vdev.PoolGUID(), vdev.GUID());
160	if (activeCase == NULL)
161		activeCase = new CaseFile(vdev);
162
163	return (*activeCase);
164}
165
166void
167CaseFile::DeSerialize()
168{
169	struct dirent **caseFiles;
170
171	int numCaseFiles(scandir(s_caseFilePath.c_str(), &caseFiles,
172			 DeSerializeSelector, /*compar*/NULL));
173
174	if (numCaseFiles == -1)
175		return;
176	if (numCaseFiles == 0) {
177		free(caseFiles);
178		return;
179	}
180
181	for (int i = 0; i < numCaseFiles; i++) {
182
183		DeSerializeFile(caseFiles[i]->d_name);
184		free(caseFiles[i]);
185	}
186	free(caseFiles);
187}
188
189bool
190CaseFile::Empty()
191{
192	return (s_activeCases.empty());
193}
194
195void
196CaseFile::LogAll()
197{
198	for (CaseFileList::iterator curCase = s_activeCases.begin();
199	     curCase != s_activeCases.end(); curCase++)
200		(*curCase)->Log();
201}
202
203void
204CaseFile::PurgeAll()
205{
206	/*
207	 * Serialize casefiles before deleting them so that they can be reread
208	 * and revalidated during BuildCaseFiles.
209	 * CaseFiles remove themselves from this list on destruction.
210	 */
211	while (s_activeCases.size() != 0) {
212		CaseFile *casefile = s_activeCases.front();
213		casefile->Serialize();
214		delete casefile;
215	}
216
217}
218
219//- CaseFile Public Methods ----------------------------------------------------
220bool
221CaseFile::RefreshVdevState()
222{
223	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
224	zpool_handle_t *casePool(zpl.empty() ? NULL : zpl.front());
225	if (casePool == NULL)
226		return (false);
227
228	Vdev vd(casePool, CaseVdev(casePool));
229	if (vd.DoesNotExist())
230		return (false);
231
232	m_vdevState    = vd.State();
233	m_vdevPhysPath = vd.PhysicalPath();
234	return (true);
235}
236
237bool
238CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev)
239{
240	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
241	zpool_handle_t *pool(zpl.empty() ? NULL : zpl.front());
242	zpool_boot_label_t boot_type;
243	uint64_t boot_size;
244
245	if (pool == NULL || !RefreshVdevState()) {
246		/*
247		 * The pool or vdev for this case file is no longer
248		 * part of the configuration.  This can happen
249		 * if we process a device arrival notification
250		 * before seeing the ZFS configuration change
251		 * event.
252		 */
253		syslog(LOG_INFO,
254		       "CaseFile::ReEvaluate(%s,%s) Pool/Vdev unconfigured.  "
255		       "Closing\n",
256		       PoolGUIDString().c_str(),
257		       VdevGUIDString().c_str());
258		Close();
259
260		/*
261		 * Since this event was not used to close this
262		 * case, do not report it as consumed.
263		 */
264		return (/*consumed*/false);
265	}
266
267	if (VdevState() > VDEV_STATE_CANT_OPEN) {
268		/*
269		 * For now, newly discovered devices only help for
270		 * devices that are missing.  In the future, we might
271		 * use a newly inserted spare to replace a degraded
272		 * or faulted device.
273		 */
274		syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored",
275		    PoolGUIDString().c_str(), VdevGUIDString().c_str());
276		return (/*consumed*/false);
277	}
278
279	if (vdev != NULL
280	 && ( vdev->PoolGUID() == m_poolGUID
281	   || vdev->PoolGUID() == Guid::InvalidGuid())
282	 && vdev->GUID() == m_vdevGUID) {
283
284		zpool_vdev_online(pool, vdev->GUIDString().c_str(),
285				  ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE,
286				  &m_vdevState);
287		syslog(LOG_INFO, "Onlined vdev(%s/%s:%s).  State now %s.\n",
288		       zpool_get_name(pool), vdev->GUIDString().c_str(),
289		       devPath.c_str(),
290		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
291
292		/*
293		 * Check the vdev state post the online action to see
294		 * if we can retire this case.
295		 */
296		CloseIfSolved();
297
298		return (/*consumed*/true);
299	}
300
301	/*
302	 * If the auto-replace policy is enabled, and we have physical
303	 * path information, try a physical path replacement.
304	 */
305	if (zpool_get_prop_int(pool, ZPOOL_PROP_AUTOREPLACE, NULL) == 0) {
306		syslog(LOG_INFO,
307		       "CaseFile(%s:%s:%s): AutoReplace not set.  "
308		       "Ignoring device insertion.\n",
309		       PoolGUIDString().c_str(),
310		       VdevGUIDString().c_str(),
311		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
312		return (/*consumed*/false);
313	}
314
315	if (PhysicalPath().empty()) {
316		syslog(LOG_INFO,
317		       "CaseFile(%s:%s:%s): No physical path information.  "
318		       "Ignoring device insertion.\n",
319		       PoolGUIDString().c_str(),
320		       VdevGUIDString().c_str(),
321		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
322		return (/*consumed*/false);
323	}
324
325	if (physPath != PhysicalPath()) {
326		syslog(LOG_INFO,
327		       "CaseFile(%s:%s:%s): Physical path mismatch.  "
328		       "Ignoring device insertion.\n",
329		       PoolGUIDString().c_str(),
330		       VdevGUIDString().c_str(),
331		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
332		return (/*consumed*/false);
333	}
334
335	/* Write a label on the newly inserted disk. */
336	if (zpool_is_bootable(pool))
337		boot_type = ZPOOL_COPY_BOOT_LABEL;
338	else
339		boot_type = ZPOOL_NO_BOOT_LABEL;
340	boot_size = zpool_get_prop_int(pool, ZPOOL_PROP_BOOTSIZE, NULL);
341	if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str(),
342	    boot_type, boot_size, NULL) != 0) {
343		syslog(LOG_ERR,
344		       "Replace vdev(%s/%s) by physical path (label): %s: %s\n",
345		       zpool_get_name(pool), VdevGUIDString().c_str(),
346		       libzfs_error_action(g_zfsHandle),
347		       libzfs_error_description(g_zfsHandle));
348		return (/*consumed*/false);
349	}
350
351	syslog(LOG_INFO, "CaseFile::ReEvaluate(%s/%s): Replacing with %s",
352	    PoolGUIDString().c_str(), VdevGUIDString().c_str(),
353	    devPath.c_str());
354	return (Replace(VDEV_TYPE_DISK, devPath.c_str(), /*isspare*/false));
355}
356
357bool
358CaseFile::ReEvaluate(const ZfsEvent &event)
359{
360	bool consumed(false);
361
362	if (event.Value("type") == "misc.fs.zfs.vdev_remove") {
363		/*
364		 * The Vdev we represent has been removed from the
365		 * configuration.  This case is no longer of value.
366		 */
367		Close();
368
369		return (/*consumed*/true);
370	} else if (event.Value("type") == "misc.fs.zfs.pool_destroy") {
371		/* This Pool has been destroyed.  Discard the case */
372		Close();
373
374		return (/*consumed*/true);
375	} else if (event.Value("type") == "misc.fs.zfs.config_sync") {
376		RefreshVdevState();
377		if (VdevState() < VDEV_STATE_HEALTHY)
378			consumed = ActivateSpare();
379	}
380
381
382	if (event.Value("class") == "resource.fs.zfs.removed") {
383		bool spare_activated;
384
385		if (!RefreshVdevState()) {
386			/*
387			 * The pool or vdev for this case file is no longer
388			 * part of the configuration.  This can happen
389			 * if we process a device arrival notification
390			 * before seeing the ZFS configuration change
391			 * event.
392			 */
393			syslog(LOG_INFO,
394			       "CaseFile::ReEvaluate(%s,%s) Pool/Vdev "
395			       "unconfigured.  Closing\n",
396			       PoolGUIDString().c_str(),
397			       VdevGUIDString().c_str());
398			/*
399			 * Close the case now so we won't waste cycles in the
400			 * system rescan
401			 */
402			Close();
403
404			/*
405			 * Since this event was not used to close this
406			 * case, do not report it as consumed.
407			 */
408			return (/*consumed*/false);
409		}
410
411		/*
412		 * Discard any tentative I/O error events for
413		 * this case.  They were most likely caused by the
414		 * hot-unplug of this device.
415		 */
416		PurgeTentativeEvents();
417
418		/* Try to activate spares if they are available */
419		spare_activated = ActivateSpare();
420
421		/*
422		 * Rescan the drives in the system to see if a recent
423		 * drive arrival can be used to solve this case.
424		 */
425		ZfsDaemon::RequestSystemRescan();
426
427		/*
428		 * Consume the event if we successfully activated a spare.
429		 * Otherwise, leave it in the unconsumed events list so that the
430		 * future addition of a spare to this pool might be able to
431		 * close the case
432		 */
433		consumed = spare_activated;
434	} else if (event.Value("class") == "resource.fs.zfs.statechange") {
435		RefreshVdevState();
436		/*
437		 * If this vdev is DEGRADED, FAULTED, or UNAVAIL, try to
438		 * activate a hotspare.  Otherwise, ignore the event
439		 */
440		if (VdevState() == VDEV_STATE_FAULTED ||
441		    VdevState() == VDEV_STATE_DEGRADED ||
442		    VdevState() == VDEV_STATE_CANT_OPEN)
443			(void) ActivateSpare();
444		consumed = true;
445	}
446	else if (event.Value("class") == "ereport.fs.zfs.io" ||
447	         event.Value("class") == "ereport.fs.zfs.checksum") {
448
449		m_tentativeEvents.push_front(event.DeepCopy());
450		RegisterCallout(event);
451		consumed = true;
452	}
453
454	bool closed(CloseIfSolved());
455
456	return (consumed || closed);
457}
458
459/* Find a Vdev containing the vdev with the given GUID */
460static nvlist_t*
461find_parent(nvlist_t *pool_config, nvlist_t *config, DevdCtl::Guid child_guid)
462{
463	nvlist_t **vdevChildren;
464	int        error;
465	unsigned   ch, numChildren;
466
467	error = nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN,
468					   &vdevChildren, &numChildren);
469
470	if (error != 0 || numChildren == 0)
471		return (NULL);
472
473	for (ch = 0; ch < numChildren; ch++) {
474		nvlist *result;
475		Vdev vdev(pool_config, vdevChildren[ch]);
476
477		if (vdev.GUID() == child_guid)
478			return (config);
479
480		result = find_parent(pool_config, vdevChildren[ch], child_guid);
481		if (result != NULL)
482			return (result);
483	}
484
485	return (NULL);
486}
487
488bool
489CaseFile::ActivateSpare() {
490	nvlist_t	*config, *nvroot, *parent_config;
491	nvlist_t       **spares;
492	char		*devPath, *vdev_type;
493	const char	*poolname;
494	u_int		 nspares, i;
495	int		 error;
496
497	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
498	zpool_handle_t	*zhp(zpl.empty() ? NULL : zpl.front());
499	if (zhp == NULL) {
500		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool "
501		       "for pool_guid %" PRIu64".", (uint64_t)m_poolGUID);
502		return (false);
503	}
504	poolname = zpool_get_name(zhp);
505	config = zpool_get_config(zhp, NULL);
506	if (config == NULL) {
507		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool "
508		       "config for pool %s", poolname);
509		return (false);
510	}
511	error = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot);
512	if (error != 0){
513		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find vdev "
514		       "tree for pool %s", poolname);
515		return (false);
516	}
517
518	parent_config = find_parent(config, nvroot, m_vdevGUID);
519	if (parent_config != NULL) {
520		char *parent_type;
521
522		/*
523		 * Don't activate spares for members of a "replacing" vdev.
524		 * They're already dealt with.  Sparing them will just drag out
525		 * the resilver process.
526		 */
527		error = nvlist_lookup_string(parent_config,
528		    ZPOOL_CONFIG_TYPE, &parent_type);
529		if (error == 0 && strcmp(parent_type, VDEV_TYPE_REPLACING) == 0)
530			return (false);
531	}
532
533	nspares = 0;
534	nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
535				   &nspares);
536	if (nspares == 0) {
537		/* The pool has no spares configured */
538		syslog(LOG_INFO, "CaseFile::ActivateSpare: "
539		       "No spares available for pool %s", poolname);
540		return (false);
541	}
542	for (i = 0; i < nspares; i++) {
543		uint64_t    *nvlist_array;
544		vdev_stat_t *vs;
545		uint_t	     nstats;
546
547		if (nvlist_lookup_uint64_array(spares[i],
548		    ZPOOL_CONFIG_VDEV_STATS, &nvlist_array, &nstats) != 0) {
549			syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not "
550			       "find vdev stats for pool %s, spare %d",
551			       poolname, i);
552			return (false);
553		}
554		vs = reinterpret_cast<vdev_stat_t *>(nvlist_array);
555
556		if ((vs->vs_aux != VDEV_AUX_SPARED)
557		 && (vs->vs_state == VDEV_STATE_HEALTHY)) {
558			/* We found a usable spare */
559			break;
560		}
561	}
562
563	if (i == nspares) {
564		/* No available spares were found */
565		return (false);
566	}
567
568	error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_PATH, &devPath);
569	if (error != 0) {
570		syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine "
571		       "the path of pool %s, spare %d. Error %d",
572		       poolname, i, error);
573		return (false);
574	}
575
576	error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_TYPE, &vdev_type);
577	if (error != 0) {
578		syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine "
579		       "the vdev type of pool %s, spare %d. Error %d",
580		       poolname, i, error);
581		return (false);
582	}
583
584	return (Replace(vdev_type, devPath, /*isspare*/true));
585}
586
587void
588CaseFile::RegisterCallout(const Event &event)
589{
590	timeval now, countdown, elapsed, timestamp, zero, remaining;
591
592	gettimeofday(&now, 0);
593	timestamp = event.GetTimestamp();
594	timersub(&now, &timestamp, &elapsed);
595	timersub(&s_removeGracePeriod, &elapsed, &countdown);
596	/*
597	 * If countdown is <= zero, Reset the timer to the
598	 * smallest positive time value instead
599	 */
600	timerclear(&zero);
601	if (timercmp(&countdown, &zero, <=)) {
602		timerclear(&countdown);
603		countdown.tv_usec = 1;
604	}
605
606	remaining = m_tentativeTimer.TimeRemaining();
607
608	if (!m_tentativeTimer.IsPending()
609	 || timercmp(&countdown, &remaining, <))
610		m_tentativeTimer.Reset(countdown, OnGracePeriodEnded, this);
611}
612
613
614bool
615CaseFile::CloseIfSolved()
616{
617	if (m_events.empty()
618	 && m_tentativeEvents.empty()) {
619
620		/*
621		 * We currently do not track or take actions on
622		 * devices in the degraded or faulted state.
623		 * Once we have support for spare pools, we'll
624		 * retain these cases so that any spares added in
625		 * the future can be applied to them.
626		 */
627		switch (VdevState()) {
628		case VDEV_STATE_HEALTHY:
629			/* No need to keep cases for healthy vdevs */
630			Close();
631			return (true);
632		case VDEV_STATE_REMOVED:
633		case VDEV_STATE_CANT_OPEN:
634			/*
635			 * Keep open.  We may solve it with a newly inserted
636			 * device.
637			 */
638		case VDEV_STATE_FAULTED:
639		case VDEV_STATE_DEGRADED:
640			/*
641			 * Keep open.  We may solve it with the future
642			 * addition of a spare to the pool
643			 */
644		case VDEV_STATE_UNKNOWN:
645		case VDEV_STATE_CLOSED:
646		case VDEV_STATE_OFFLINE:
647			/*
648			 * Keep open?  This may not be the correct behavior,
649			 * but it's what we've always done
650			 */
651			;
652		}
653
654		/*
655		 * Re-serialize the case in order to remove any
656		 * previous event data.
657		 */
658		Serialize();
659	}
660
661	return (false);
662}
663
664void
665CaseFile::Log()
666{
667	syslog(LOG_INFO, "CaseFile(%s,%s,%s)\n", PoolGUIDString().c_str(),
668	       VdevGUIDString().c_str(), PhysicalPath().c_str());
669	syslog(LOG_INFO, "\tVdev State = %s\n",
670	       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
671	if (m_tentativeEvents.size() != 0) {
672		syslog(LOG_INFO, "\t=== Tentative Events ===\n");
673		for (EventList::iterator event(m_tentativeEvents.begin());
674		     event != m_tentativeEvents.end(); event++)
675			(*event)->Log(LOG_INFO);
676	}
677	if (m_events.size() != 0) {
678		syslog(LOG_INFO, "\t=== Events ===\n");
679		for (EventList::iterator event(m_events.begin());
680		     event != m_events.end(); event++)
681			(*event)->Log(LOG_INFO);
682	}
683}
684
685//- CaseFile Static Protected Methods ------------------------------------------
686void
687CaseFile::OnGracePeriodEnded(void *arg)
688{
689	CaseFile &casefile(*static_cast<CaseFile *>(arg));
690
691	casefile.OnGracePeriodEnded();
692}
693
694int
695CaseFile::DeSerializeSelector(const struct dirent *dirEntry)
696{
697	uint64_t poolGUID;
698	uint64_t vdevGUID;
699
700	if (dirEntry->d_type == DT_REG
701	 && sscanf(dirEntry->d_name, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case",
702		   &poolGUID, &vdevGUID) == 2)
703		return (1);
704	return (0);
705}
706
707void
708CaseFile::DeSerializeFile(const char *fileName)
709{
710	string	  fullName(s_caseFilePath + '/' + fileName);
711	CaseFile *existingCaseFile(NULL);
712	CaseFile *caseFile(NULL);
713
714	try {
715		uint64_t poolGUID;
716		uint64_t vdevGUID;
717		nvlist_t *vdevConf;
718
719		if (sscanf(fileName, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case",
720		       &poolGUID, &vdevGUID) != 2) {
721			throw ZfsdException("CaseFile::DeSerialize: "
722			    "Unintelligible CaseFile filename %s.\n", fileName);
723		}
724		existingCaseFile = Find(Guid(poolGUID), Guid(vdevGUID));
725		if (existingCaseFile != NULL) {
726			/*
727			 * If the vdev is already degraded or faulted,
728			 * there's no point in keeping the state around
729			 * that we use to put a drive into the degraded
730			 * state.  However, if the vdev is simply missing,
731			 * preserve the case data in the hopes that it will
732			 * return.
733			 */
734			caseFile = existingCaseFile;
735			vdev_state curState(caseFile->VdevState());
736			if (curState > VDEV_STATE_CANT_OPEN
737			 && curState < VDEV_STATE_HEALTHY) {
738				unlink(fileName);
739				return;
740			}
741		} else {
742			ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID);
743			if (zpl.empty()
744			 || (vdevConf = VdevIterator(zpl.front())
745						    .Find(vdevGUID)) == NULL) {
746				/*
747				 * Either the pool no longer exists
748				 * or this vdev is no longer a member of
749				 * the pool.
750				 */
751				unlink(fullName.c_str());
752				return;
753			}
754
755			/*
756			 * Any vdev we find that does not have a case file
757			 * must be in the healthy state and thus worthy of
758			 * continued SERD data tracking.
759			 */
760			caseFile = new CaseFile(Vdev(zpl.front(), vdevConf));
761		}
762
763		ifstream caseStream(fullName.c_str());
764		if (!caseStream)
765			throw ZfsdException("CaseFile::DeSerialize: Unable to "
766					    "read %s.\n", fileName);
767
768		caseFile->DeSerialize(caseStream);
769	} catch (const ParseException &exp) {
770
771		exp.Log();
772		if (caseFile != existingCaseFile)
773			delete caseFile;
774
775		/*
776		 * Since we can't parse the file, unlink it so we don't
777		 * trip over it again.
778		 */
779		unlink(fileName);
780	} catch (const ZfsdException &zfsException) {
781
782		zfsException.Log();
783		if (caseFile != existingCaseFile)
784			delete caseFile;
785	}
786}
787
788//- CaseFile Protected Methods -------------------------------------------------
789CaseFile::CaseFile(const Vdev &vdev)
790 : m_poolGUID(vdev.PoolGUID()),
791   m_vdevGUID(vdev.GUID()),
792   m_vdevState(vdev.State()),
793   m_vdevPhysPath(vdev.PhysicalPath())
794{
795	stringstream guidString;
796
797	guidString << m_vdevGUID;
798	m_vdevGUIDString = guidString.str();
799	guidString.str("");
800	guidString << m_poolGUID;
801	m_poolGUIDString = guidString.str();
802
803	s_activeCases.push_back(this);
804
805	syslog(LOG_INFO, "Creating new CaseFile:\n");
806	Log();
807}
808
809CaseFile::~CaseFile()
810{
811	PurgeEvents();
812	PurgeTentativeEvents();
813	m_tentativeTimer.Stop();
814	s_activeCases.remove(this);
815}
816
817void
818CaseFile::PurgeEvents()
819{
820	for (EventList::iterator event(m_events.begin());
821	     event != m_events.end(); event++)
822		delete *event;
823
824	m_events.clear();
825}
826
827void
828CaseFile::PurgeTentativeEvents()
829{
830	for (EventList::iterator event(m_tentativeEvents.begin());
831	     event != m_tentativeEvents.end(); event++)
832		delete *event;
833
834	m_tentativeEvents.clear();
835}
836
837void
838CaseFile::SerializeEvList(const EventList events, int fd,
839		const char* prefix) const
840{
841	if (events.empty())
842		return;
843	for (EventList::const_iterator curEvent = events.begin();
844	     curEvent != events.end(); curEvent++) {
845		const string &eventString((*curEvent)->GetEventString());
846
847		// TODO: replace many write(2) calls with a single writev(2)
848		if (prefix)
849			write(fd, prefix, strlen(prefix));
850		write(fd, eventString.c_str(), eventString.length());
851	}
852}
853
854void
855CaseFile::Serialize()
856{
857	stringstream saveFile;
858
859	saveFile << setfill('0')
860		 << s_caseFilePath << "/"
861		 << "pool_" << PoolGUIDString()
862		 << "_vdev_" << VdevGUIDString()
863		 << ".case";
864
865	if (m_events.empty() && m_tentativeEvents.empty()) {
866		unlink(saveFile.str().c_str());
867		return;
868	}
869
870	int fd(open(saveFile.str().c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0644));
871	if (fd == -1) {
872		syslog(LOG_ERR, "CaseFile::Serialize: Unable to open %s.\n",
873		       saveFile.str().c_str());
874		return;
875	}
876	SerializeEvList(m_events, fd);
877	SerializeEvList(m_tentativeEvents, fd, "tentative ");
878	close(fd);
879}
880
881/*
882 * XXX: This method assumes that events may not contain embedded newlines.  If
883 * ever events can contain embedded newlines, then CaseFile must switch
884 * serialization formats
885 */
886void
887CaseFile::DeSerialize(ifstream &caseStream)
888{
889	string	      evString;
890	const EventFactory &factory(ZfsDaemon::Get().GetFactory());
891
892	caseStream >> std::noskipws >> std::ws;
893	while (caseStream.good()) {
894		/*
895		 * Outline:
896		 * read the beginning of a line and check it for
897		 * "tentative".  If found, discard "tentative".
898		 * Create a new event
899		 * continue
900		 */
901		EventList* destEvents;
902		const string tentFlag("tentative ");
903		string line;
904		std::stringbuf lineBuf;
905
906		caseStream.get(lineBuf);
907		caseStream.ignore();  /*discard the newline character*/
908		line = lineBuf.str();
909		if (line.compare(0, tentFlag.size(), tentFlag) == 0) {
910			/* Discard "tentative" */
911			line.erase(0, tentFlag.size());
912			destEvents = &m_tentativeEvents;
913		} else {
914			destEvents = &m_events;
915		}
916		Event *event(Event::CreateEvent(factory, line));
917		if (event != NULL) {
918			destEvents->push_back(event);
919			RegisterCallout(*event);
920		}
921	}
922}
923
924void
925CaseFile::Close()
926{
927	/*
928	 * This case is no longer relevant.  Clean up our
929	 * serialization file, and delete the case.
930	 */
931	syslog(LOG_INFO, "CaseFile(%s,%s) closed - State %s\n",
932	       PoolGUIDString().c_str(), VdevGUIDString().c_str(),
933	       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
934
935	/*
936	 * Serialization of a Case with no event data, clears the
937	 * Serialization data for that event.
938	 */
939	PurgeEvents();
940	Serialize();
941
942	delete this;
943}
944
945void
946CaseFile::OnGracePeriodEnded()
947{
948	bool should_fault, should_degrade;
949	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
950	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
951
952	m_events.splice(m_events.begin(), m_tentativeEvents);
953	should_fault = ShouldFault();
954	should_degrade = ShouldDegrade();
955
956	if (should_fault || should_degrade) {
957		if (zhp == NULL
958		 || (VdevIterator(zhp).Find(m_vdevGUID)) == NULL) {
959			/*
960			 * Either the pool no longer exists
961			 * or this vdev is no longer a member of
962			 * the pool.
963			 */
964			Close();
965			return;
966		}
967
968	}
969
970	/* A fault condition has priority over a degrade condition */
971	if (ShouldFault()) {
972		/* Fault the vdev and close the case. */
973		if (zpool_vdev_fault(zhp, (uint64_t)m_vdevGUID,
974				       VDEV_AUX_ERR_EXCEEDED) == 0) {
975			syslog(LOG_INFO, "Faulting vdev(%s/%s)",
976			       PoolGUIDString().c_str(),
977			       VdevGUIDString().c_str());
978			Close();
979			return;
980		}
981		else {
982			syslog(LOG_ERR, "Fault vdev(%s/%s): %s: %s\n",
983			       PoolGUIDString().c_str(),
984			       VdevGUIDString().c_str(),
985			       libzfs_error_action(g_zfsHandle),
986			       libzfs_error_description(g_zfsHandle));
987		}
988	}
989	else if (ShouldDegrade()) {
990		/* Degrade the vdev and close the case. */
991		if (zpool_vdev_degrade(zhp, (uint64_t)m_vdevGUID,
992				       VDEV_AUX_ERR_EXCEEDED) == 0) {
993			syslog(LOG_INFO, "Degrading vdev(%s/%s)",
994			       PoolGUIDString().c_str(),
995			       VdevGUIDString().c_str());
996			Close();
997			return;
998		}
999		else {
1000			syslog(LOG_ERR, "Degrade vdev(%s/%s): %s: %s\n",
1001			       PoolGUIDString().c_str(),
1002			       VdevGUIDString().c_str(),
1003			       libzfs_error_action(g_zfsHandle),
1004			       libzfs_error_description(g_zfsHandle));
1005		}
1006	}
1007	Serialize();
1008}
1009
1010Vdev
1011CaseFile::BeingReplacedBy(zpool_handle_t *zhp) {
1012	Vdev vd(zhp, CaseVdev(zhp));
1013	std::list<Vdev> children;
1014	std::list<Vdev>::iterator children_it;
1015
1016	Vdev parent(vd.Parent());
1017	Vdev replacing(NonexistentVdev);
1018
1019	/*
1020	 * To determine whether we are being replaced by another spare that
1021	 * is still working, then make sure that it is currently spared and
1022	 * that the spare is either resilvering or healthy.  If any of these
1023	 * conditions fail, then we are not being replaced by a spare.
1024	 *
1025	 * If the spare is healthy, then the case file should be closed very
1026	 * soon after this check.
1027	 */
1028	if (parent.DoesNotExist()
1029	 || parent.Name(zhp, /*verbose*/false) != "spare")
1030		return (NonexistentVdev);
1031
1032	children = parent.Children();
1033	children_it = children.begin();
1034	for (;children_it != children.end(); children_it++) {
1035		Vdev child = *children_it;
1036
1037		/* Skip our vdev. */
1038		if (child.GUID() == VdevGUID())
1039			continue;
1040		/*
1041		 * Accept the first child that doesn't match our GUID, or
1042		 * any resilvering/healthy device if one exists.
1043		 */
1044		if (replacing.DoesNotExist() || child.IsResilvering()
1045		 || child.State() == VDEV_STATE_HEALTHY)
1046			replacing = child;
1047	}
1048
1049	return (replacing);
1050}
1051
1052bool
1053CaseFile::Replace(const char* vdev_type, const char* path, bool isspare) {
1054	nvlist_t *nvroot, *newvd;
1055	const char *poolname;
1056	string oldstr(VdevGUIDString());
1057	bool retval = true;
1058
1059	/* Figure out what pool we're working on */
1060	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
1061	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
1062	if (zhp == NULL) {
1063		syslog(LOG_ERR, "CaseFile::Replace: could not find pool for "
1064		       "pool_guid %" PRIu64 ".", (uint64_t)m_poolGUID);
1065		return (false);
1066	}
1067	poolname = zpool_get_name(zhp);
1068	Vdev vd(zhp, CaseVdev(zhp));
1069	Vdev replaced(BeingReplacedBy(zhp));
1070
1071	if (isspare && !vd.IsSpare() && !replaced.DoesNotExist()) {
1072		/* If we are already being replaced by a working spare, pass. */
1073		if (replaced.IsResilvering()
1074		 || replaced.State() == VDEV_STATE_HEALTHY) {
1075			syslog(LOG_INFO, "CaseFile::Replace(%s->%s): already "
1076			    "replaced", VdevGUIDString().c_str(), path);
1077			return (/*consumed*/false);
1078		}
1079		/*
1080		 * If we have already been replaced by a spare, but that spare
1081		 * is broken, we must spare the spare, not the original device.
1082		 */
1083		oldstr = replaced.GUIDString();
1084		syslog(LOG_INFO, "CaseFile::Replace(%s->%s): sparing "
1085		    "broken spare %s instead", VdevGUIDString().c_str(),
1086		    path, oldstr.c_str());
1087	}
1088
1089	/*
1090	 * Build a root vdev/leaf vdev configuration suitable for
1091	 * zpool_vdev_attach. Only enough data for the kernel to find
1092	 * the device (i.e. type and disk device node path) are needed.
1093	 */
1094	nvroot = NULL;
1095	newvd = NULL;
1096
1097	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0
1098	 || nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
1099		syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to allocate "
1100		    "configuration data.", poolname, oldstr.c_str());
1101		if (nvroot != NULL)
1102			nvlist_free(nvroot);
1103		return (false);
1104	}
1105	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, vdev_type) != 0
1106	 || nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0
1107	 || nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0
1108	 || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1109				    &newvd, 1) != 0) {
1110		syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to initialize "
1111		    "configuration data.", poolname, oldstr.c_str());
1112		nvlist_free(newvd);
1113		nvlist_free(nvroot);
1114		return (true);
1115	}
1116
1117	/* Data was copied when added to the root vdev. */
1118	nvlist_free(newvd);
1119
1120	retval = (zpool_vdev_attach(zhp, oldstr.c_str(), path, nvroot,
1121	    /*replace*/B_TRUE) == 0);
1122	if (retval)
1123		syslog(LOG_INFO, "Replacing vdev(%s/%s) with %s\n",
1124		    poolname, oldstr.c_str(), path);
1125	else
1126		syslog(LOG_ERR, "Replace vdev(%s/%s): %s: %s\n",
1127		    poolname, oldstr.c_str(), libzfs_error_action(g_zfsHandle),
1128		    libzfs_error_description(g_zfsHandle));
1129	nvlist_free(nvroot);
1130
1131	return (retval);
1132}
1133
1134/* Does the argument event refer to a checksum error? */
1135static bool
1136IsChecksumEvent(const Event* const event)
1137{
1138	return ("ereport.fs.zfs.checksum" == event->Value("type"));
1139}
1140
1141/* Does the argument event refer to an IO error? */
1142static bool
1143IsIOEvent(const Event* const event)
1144{
1145	return ("ereport.fs.zfs.io" == event->Value("type"));
1146}
1147
1148bool
1149CaseFile::ShouldDegrade() const
1150{
1151	return (std::count_if(m_events.begin(), m_events.end(),
1152			      IsChecksumEvent) > ZFS_DEGRADE_IO_COUNT);
1153}
1154
1155bool
1156CaseFile::ShouldFault() const
1157{
1158	return (std::count_if(m_events.begin(), m_events.end(),
1159			      IsIOEvent) > ZFS_DEGRADE_IO_COUNT);
1160}
1161
1162nvlist_t *
1163CaseFile::CaseVdev(zpool_handle_t *zhp) const
1164{
1165	return (VdevIterator(zhp).Find(VdevGUID()));
1166}
1167