1/*-
2 * Copyright (c) 2011, 2012, 2013, 2014, 2016 Spectra Logic Corporation
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions, and the following disclaimer,
10 *    without modification.
11 * 2. Redistributions in binary form must reproduce at minimum a disclaimer
12 *    substantially similar to the "NO WARRANTY" disclaimer below
13 *    ("Disclaimer") and any redistribution must be conditioned upon
14 *    including a substantially similar Disclaimer requirement for further
15 *    binary redistribution.
16 *
17 * NO WARRANTY
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
26 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
27 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGES.
29 *
30 * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
31 */
32
33/**
34 * \file case_file.cc
35 *
36 * We keep case files for any leaf vdev that is not in the optimal state.
37 * However, we only serialize to disk those events that need to be preserved
38 * across reboots.  For now, this is just a log of soft errors which we
39 * accumulate in order to mark a device as degraded.
40 */
41#include <sys/cdefs.h>
42#include <sys/byteorder.h>
43#include <sys/time.h>
44
45#include <sys/fs/zfs.h>
46
47#include <dirent.h>
48#include <fcntl.h>
49#include <iomanip>
50#include <fstream>
51#include <functional>
52#include <sstream>
53#include <syslog.h>
54#include <unistd.h>
55
56#include <libzutil.h>
57#include <libzfs.h>
58
59#include <list>
60#include <map>
61#include <string>
62
63#include <devdctl/guid.h>
64#include <devdctl/event.h>
65#include <devdctl/event_factory.h>
66#include <devdctl/exception.h>
67#include <devdctl/consumer.h>
68
69#include "callout.h"
70#include "vdev_iterator.h"
71#include "zfsd_event.h"
72#include "case_file.h"
73#include "vdev.h"
74#include "zfsd.h"
75#include "zfsd_exception.h"
76#include "zpool_list.h"
77/*============================ Namespace Control =============================*/
78using std::hex;
79using std::ifstream;
80using std::stringstream;
81using std::setfill;
82using std::setw;
83
84using DevdCtl::Event;
85using DevdCtl::EventFactory;
86using DevdCtl::EventList;
87using DevdCtl::Guid;
88using DevdCtl::ParseException;
89
90/*--------------------------------- CaseFile ---------------------------------*/
91//- CaseFile Static Data -------------------------------------------------------
92
93CaseFileList  CaseFile::s_activeCases;
94const string  CaseFile::s_caseFilePath = "/var/db/zfsd/cases";
95
96//- CaseFile Static Public Methods ---------------------------------------------
97CaseFile *
98CaseFile::Find(Guid poolGUID, Guid vdevGUID)
99{
100	for (CaseFileList::iterator curCase = s_activeCases.begin();
101	     curCase != s_activeCases.end(); curCase++) {
102
103		if (((*curCase)->PoolGUID() != poolGUID
104		  && Guid::InvalidGuid() != poolGUID)
105		 || (*curCase)->VdevGUID() != vdevGUID)
106			continue;
107
108		/*
109		 * We only carry one active case per-vdev.
110		 */
111		return (*curCase);
112	}
113	return (NULL);
114}
115
116void
117CaseFile::Find(Guid poolGUID, Guid vdevGUID, CaseFileList &cases)
118{
119	for (CaseFileList::iterator curCase = s_activeCases.begin();
120	    curCase != s_activeCases.end(); curCase++) {
121		if (((*curCase)->PoolGUID() != poolGUID &&
122		    Guid::InvalidGuid() != poolGUID) ||
123		    (*curCase)->VdevGUID() != vdevGUID)
124			continue;
125
126		/*
127		 * We can have multiple cases for spare vdevs
128		 */
129		cases.push_back(*curCase);
130		if (!(*curCase)->IsSpare()) {
131			return;
132		}
133	}
134}
135
136CaseFile *
137CaseFile::Find(const string &physPath)
138{
139	CaseFile *result = NULL;
140
141	for (CaseFileList::iterator curCase = s_activeCases.begin();
142	     curCase != s_activeCases.end(); curCase++) {
143
144		if ((*curCase)->PhysicalPath() != physPath)
145			continue;
146
147		if (result != NULL) {
148			syslog(LOG_WARNING, "Multiple casefiles found for "
149			    "physical path %s.  "
150			    "This is most likely a bug in zfsd",
151			    physPath.c_str());
152		}
153		result = *curCase;
154	}
155	return (result);
156}
157
158
159void
160CaseFile::ReEvaluateByGuid(Guid poolGUID, const ZfsEvent &event)
161{
162	CaseFileList::iterator casefile;
163	for (casefile = s_activeCases.begin(); casefile != s_activeCases.end();){
164		CaseFileList::iterator next = casefile;
165		next++;
166		if (poolGUID == (*casefile)->PoolGUID())
167			(*casefile)->ReEvaluate(event);
168		casefile = next;
169	}
170}
171
172CaseFile &
173CaseFile::Create(Vdev &vdev)
174{
175	CaseFile *activeCase;
176
177	activeCase = Find(vdev.PoolGUID(), vdev.GUID());
178	if (activeCase == NULL)
179		activeCase = new CaseFile(vdev);
180
181	return (*activeCase);
182}
183
184void
185CaseFile::DeSerialize()
186{
187	struct dirent **caseFiles;
188
189	int numCaseFiles(scandir(s_caseFilePath.c_str(), &caseFiles,
190			 DeSerializeSelector, /*compar*/NULL));
191
192	if (numCaseFiles == -1)
193		return;
194	if (numCaseFiles == 0) {
195		free(caseFiles);
196		return;
197	}
198
199	for (int i = 0; i < numCaseFiles; i++) {
200
201		DeSerializeFile(caseFiles[i]->d_name);
202		free(caseFiles[i]);
203	}
204	free(caseFiles);
205}
206
207bool
208CaseFile::Empty()
209{
210	return (s_activeCases.empty());
211}
212
213void
214CaseFile::LogAll()
215{
216	for (CaseFileList::iterator curCase = s_activeCases.begin();
217	     curCase != s_activeCases.end(); curCase++)
218		(*curCase)->Log();
219}
220
221void
222CaseFile::PurgeAll()
223{
224	/*
225	 * Serialize casefiles before deleting them so that they can be reread
226	 * and revalidated during BuildCaseFiles.
227	 * CaseFiles remove themselves from this list on destruction.
228	 */
229	while (s_activeCases.size() != 0) {
230		CaseFile *casefile = s_activeCases.front();
231		casefile->Serialize();
232		delete casefile;
233	}
234
235}
236
237int
238CaseFile::IsSpare()
239{
240	return (m_is_spare);
241}
242
243//- CaseFile Public Methods ----------------------------------------------------
244bool
245CaseFile::RefreshVdevState()
246{
247	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
248	zpool_handle_t *casePool(zpl.empty() ? NULL : zpl.front());
249	if (casePool == NULL)
250		return (false);
251
252	Vdev vd(casePool, CaseVdev(casePool));
253	if (vd.DoesNotExist())
254		return (false);
255
256	m_vdevState    = vd.State();
257	m_vdevPhysPath = vd.PhysicalPath();
258	m_vdevName = vd.Name(casePool, false);
259	return (true);
260}
261
262bool
263CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev)
264{
265	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
266	zpool_handle_t *pool(zpl.empty() ? NULL : zpl.front());
267	int flags = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE;
268
269	if (pool == NULL || !RefreshVdevState()) {
270		/*
271		 * The pool or vdev for this case file is no longer
272		 * part of the configuration.  This can happen
273		 * if we process a device arrival notification
274		 * before seeing the ZFS configuration change
275		 * event.
276		 */
277		syslog(LOG_INFO,
278		       "CaseFile::ReEvaluate(%s,%s) Pool/Vdev unconfigured.  "
279		       "Closing\n",
280		       PoolGUIDString().c_str(),
281		       VdevGUIDString().c_str());
282		Close();
283
284		/*
285		 * Since this event was not used to close this
286		 * case, do not report it as consumed.
287		 */
288		return (/*consumed*/false);
289	}
290
291	if (VdevState() > VDEV_STATE_CANT_OPEN) {
292		/*
293		 * For now, newly discovered devices only help for
294		 * devices that are missing.  In the future, we might
295		 * use a newly inserted spare to replace a degraded
296		 * or faulted device.
297		 */
298		syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored",
299		    PoolGUIDString().c_str(), VdevGUIDString().c_str());
300		return (/*consumed*/false);
301	}
302
303	if (vdev != NULL
304	 && ( vdev->PoolGUID() == m_poolGUID
305	   || vdev->PoolGUID() == Guid::InvalidGuid())
306	 && vdev->GUID() == m_vdevGUID) {
307
308		if (IsSpare())
309			flags |= ZFS_ONLINE_SPARE;
310		if (zpool_vdev_online(pool, vdev->GUIDString().c_str(),
311		    flags, &m_vdevState) != 0) {
312			syslog(LOG_ERR,
313			    "Failed to online vdev(%s/%s:%s): %s: %s\n",
314			    zpool_get_name(pool), vdev->GUIDString().c_str(),
315			    devPath.c_str(), libzfs_error_action(g_zfsHandle),
316			    libzfs_error_description(g_zfsHandle));
317			return (/*consumed*/false);
318		}
319
320		syslog(LOG_INFO, "Onlined vdev(%s/%s:%s).  State now %s.\n",
321		       zpool_get_name(pool), vdev->GUIDString().c_str(),
322		       devPath.c_str(),
323		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
324
325		/*
326		 * Check the vdev state post the online action to see
327		 * if we can retire this case.
328		 */
329		CloseIfSolved();
330
331		return (/*consumed*/true);
332	}
333
334	/*
335	 * If the auto-replace policy is enabled, and we have physical
336	 * path information, try a physical path replacement.
337	 */
338	if (zpool_get_prop_int(pool, ZPOOL_PROP_AUTOREPLACE, NULL) == 0) {
339		syslog(LOG_INFO,
340		       "CaseFile(%s:%s:%s): AutoReplace not set.  "
341		       "Ignoring device insertion.\n",
342		       PoolGUIDString().c_str(),
343		       VdevGUIDString().c_str(),
344		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
345		return (/*consumed*/false);
346	}
347
348	if (PhysicalPath().empty()) {
349		syslog(LOG_INFO,
350		       "CaseFile(%s:%s:%s): No physical path information.  "
351		       "Ignoring device insertion.\n",
352		       PoolGUIDString().c_str(),
353		       VdevGUIDString().c_str(),
354		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
355		return (/*consumed*/false);
356	}
357
358	if (physPath != PhysicalPath()) {
359		syslog(LOG_INFO,
360		       "CaseFile(%s:%s:%s): Physical path mismatch.  "
361		       "Ignoring device insertion.\n",
362		       PoolGUIDString().c_str(),
363		       VdevGUIDString().c_str(),
364		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
365		return (/*consumed*/false);
366	}
367
368	/* Write a label on the newly inserted disk. */
369	if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str()) != 0) {
370		syslog(LOG_ERR,
371		       "Replace vdev(%s/%s) by physical path (label): %s: %s\n",
372		       zpool_get_name(pool), VdevGUIDString().c_str(),
373		       libzfs_error_action(g_zfsHandle),
374		       libzfs_error_description(g_zfsHandle));
375		return (/*consumed*/false);
376	}
377
378	syslog(LOG_INFO, "CaseFile::ReEvaluate(%s/%s): Replacing with %s",
379	    PoolGUIDString().c_str(), VdevGUIDString().c_str(),
380	    devPath.c_str());
381	return (Replace(VDEV_TYPE_DISK, devPath.c_str(), /*isspare*/false));
382}
383
384bool
385CaseFile::ReEvaluate(const ZfsEvent &event)
386{
387	bool consumed(false);
388
389	if (event.Value("type") == "sysevent.fs.zfs.vdev_remove") {
390		/*
391		 * The Vdev we represent has been removed from the
392		 * configuration.  This case is no longer of value.
393		 */
394		Close();
395
396		return (/*consumed*/true);
397	} else if (event.Value("type") == "sysevent.fs.zfs.pool_destroy") {
398		/* This Pool has been destroyed.  Discard the case */
399		Close();
400
401		return (/*consumed*/true);
402	} else if (event.Value("type") == "sysevent.fs.zfs.config_sync") {
403		RefreshVdevState();
404		if (VdevState() < VDEV_STATE_HEALTHY)
405			consumed = ActivateSpare();
406	}
407
408
409	if (event.Value("class") == "resource.fs.zfs.removed") {
410		bool spare_activated;
411
412		if (!RefreshVdevState()) {
413			/*
414			 * The pool or vdev for this case file is no longer
415			 * part of the configuration.  This can happen
416			 * if we process a device arrival notification
417			 * before seeing the ZFS configuration change
418			 * event.
419			 */
420			syslog(LOG_INFO,
421			       "CaseFile::ReEvaluate(%s,%s) Pool/Vdev "
422			       "unconfigured.  Closing\n",
423			       PoolGUIDString().c_str(),
424			       VdevGUIDString().c_str());
425			/*
426			 * Close the case now so we won't waste cycles in the
427			 * system rescan
428			 */
429			Close();
430
431			/*
432			 * Since this event was not used to close this
433			 * case, do not report it as consumed.
434			 */
435			return (/*consumed*/false);
436		}
437
438		/*
439		 * Discard any tentative I/O error events for
440		 * this case.  They were most likely caused by the
441		 * hot-unplug of this device.
442		 */
443		PurgeTentativeEvents();
444
445		/* Try to activate spares if they are available */
446		spare_activated = ActivateSpare();
447
448		/*
449		 * Rescan the drives in the system to see if a recent
450		 * drive arrival can be used to solve this case.
451		 */
452		ZfsDaemon::RequestSystemRescan();
453
454		/*
455		 * Consume the event if we successfully activated a spare.
456		 * Otherwise, leave it in the unconsumed events list so that the
457		 * future addition of a spare to this pool might be able to
458		 * close the case
459		 */
460		consumed = spare_activated;
461	} else if (event.Value("class") == "resource.fs.zfs.statechange") {
462		RefreshVdevState();
463		/*
464		 * If this vdev is DEGRADED, FAULTED, or UNAVAIL, try to
465		 * activate a hotspare.  Otherwise, ignore the event
466		 */
467		if (VdevState() == VDEV_STATE_FAULTED ||
468		    VdevState() == VDEV_STATE_DEGRADED ||
469		    VdevState() == VDEV_STATE_CANT_OPEN)
470			(void) ActivateSpare();
471		consumed = true;
472	}
473	else if (event.Value("class") == "ereport.fs.zfs.io" ||
474	         event.Value("class") == "ereport.fs.zfs.checksum" ||
475		 event.Value("class") == "ereport.fs.zfs.delay") {
476
477		m_tentativeEvents.push_front(event.DeepCopy());
478		RegisterCallout(event);
479		consumed = true;
480	}
481
482	bool closed(CloseIfSolved());
483
484	return (consumed || closed);
485}
486
487/* Find a Vdev containing the vdev with the given GUID */
488static nvlist_t*
489find_parent(nvlist_t *pool_config, nvlist_t *config, DevdCtl::Guid child_guid)
490{
491	nvlist_t **vdevChildren;
492	int        error;
493	unsigned   ch, numChildren;
494
495	error = nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN,
496					   &vdevChildren, &numChildren);
497
498	if (error != 0 || numChildren == 0)
499		return (NULL);
500
501	for (ch = 0; ch < numChildren; ch++) {
502		nvlist *result;
503		Vdev vdev(pool_config, vdevChildren[ch]);
504
505		if (vdev.GUID() == child_guid)
506			return (config);
507
508		result = find_parent(pool_config, vdevChildren[ch], child_guid);
509		if (result != NULL)
510			return (result);
511	}
512
513	return (NULL);
514}
515
516bool
517CaseFile::ActivateSpare() {
518	nvlist_t	*config, *nvroot, *parent_config;
519	nvlist_t       **spares;
520	const char	*devPath, *poolname, *vdev_type;
521	u_int		 nspares, i;
522	int		 error;
523
524	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
525	zpool_handle_t	*zhp(zpl.empty() ? NULL : zpl.front());
526	if (zhp == NULL) {
527		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool "
528		       "for pool_guid %" PRIu64".", (uint64_t)m_poolGUID);
529		return (false);
530	}
531	poolname = zpool_get_name(zhp);
532	config = zpool_get_config(zhp, NULL);
533	if (config == NULL) {
534		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool "
535		       "config for pool %s", poolname);
536		return (false);
537	}
538	error = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot);
539	if (error != 0){
540		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find vdev "
541		       "tree for pool %s", poolname);
542		return (false);
543	}
544
545	parent_config = find_parent(config, nvroot, m_vdevGUID);
546	if (parent_config != NULL) {
547		const char *parent_type;
548
549		/*
550		 * Don't activate spares for members of a "replacing" vdev.
551		 * They're already dealt with.  Sparing them will just drag out
552		 * the resilver process.
553		 */
554		error = nvlist_lookup_string(parent_config,
555		    ZPOOL_CONFIG_TYPE, &parent_type);
556		if (error == 0 && strcmp(parent_type, VDEV_TYPE_REPLACING) == 0)
557			return (false);
558	}
559
560	nspares = 0;
561	nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
562				   &nspares);
563	if (nspares == 0) {
564		/* The pool has no spares configured */
565		syslog(LOG_INFO, "CaseFile::ActivateSpare: "
566		       "No spares available for pool %s", poolname);
567		return (false);
568	}
569	for (i = 0; i < nspares; i++) {
570		uint64_t    *nvlist_array;
571		vdev_stat_t *vs;
572		uint_t	     nstats;
573
574		if (nvlist_lookup_uint64_array(spares[i],
575		    ZPOOL_CONFIG_VDEV_STATS, &nvlist_array, &nstats) != 0) {
576			syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not "
577			       "find vdev stats for pool %s, spare %d",
578			       poolname, i);
579			return (false);
580		}
581		vs = reinterpret_cast<vdev_stat_t *>(nvlist_array);
582
583		if ((vs->vs_aux != VDEV_AUX_SPARED)
584		 && (vs->vs_state == VDEV_STATE_HEALTHY)) {
585			/* We found a usable spare */
586			break;
587		}
588	}
589
590	if (i == nspares) {
591		/* No available spares were found */
592		return (false);
593	}
594
595	error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_PATH, &devPath);
596	if (error != 0) {
597		syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine "
598		       "the path of pool %s, spare %d. Error %d",
599		       poolname, i, error);
600		return (false);
601	}
602
603	error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_TYPE, &vdev_type);
604	if (error != 0) {
605		syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine "
606		       "the vdev type of pool %s, spare %d. Error %d",
607		       poolname, i, error);
608		return (false);
609	}
610
611	return (Replace(vdev_type, devPath, /*isspare*/true));
612}
613
614/* Does the argument event refer to a checksum error? */
615static bool
616IsChecksumEvent(const Event* const event)
617{
618	return ("ereport.fs.zfs.checksum" == event->Value("type"));
619}
620
621/* Does the argument event refer to an IO error? */
622static bool
623IsIOEvent(const Event* const event)
624{
625	return ("ereport.fs.zfs.io" == event->Value("type"));
626}
627
628/* Does the argument event refer to an IO delay? */
629static bool
630IsDelayEvent(const Event* const event)
631{
632	return ("ereport.fs.zfs.delay" == event->Value("type"));
633}
634
635void
636CaseFile::RegisterCallout(const Event &event)
637{
638	timeval now, countdown, elapsed, timestamp, zero, remaining;
639	/**
640	 * The time ZFSD waits before promoting a tentative event
641	 * into a permanent event.
642	 */
643	int sec = -1;
644	if (IsChecksumEvent(&event))
645		sec = CaseFile::GetVdevProp(VDEV_PROP_CHECKSUM_T);
646	else if (IsIOEvent(&event))
647		sec = CaseFile::GetVdevProp(VDEV_PROP_IO_T);
648	else if (IsDelayEvent(&event))
649		sec = CaseFile::GetVdevProp(VDEV_PROP_SLOW_IO_T);
650
651	if (sec == -1)
652		sec = 60; /* default */
653
654	timeval removeGracePeriod = {
655	    sec, /*sec*/
656	    0 /*usec*/
657	};
658
659	gettimeofday(&now, 0);
660	timestamp = event.GetTimestamp();
661	timersub(&now, &timestamp, &elapsed);
662	timersub(&removeGracePeriod, &elapsed, &countdown);
663	/*
664	 * If countdown is <= zero, Reset the timer to the
665	 * smallest positive time value instead
666	 */
667	timerclear(&zero);
668	if (timercmp(&countdown, &zero, <=)) {
669		timerclear(&countdown);
670		countdown.tv_usec = 1;
671	}
672
673	remaining = m_tentativeTimer.TimeRemaining();
674
675	if (!m_tentativeTimer.IsPending()
676	 || timercmp(&countdown, &remaining, <))
677		m_tentativeTimer.Reset(countdown, OnGracePeriodEnded, this);
678}
679
680
681bool
682CaseFile::CloseIfSolved()
683{
684	if (m_events.empty()
685	 && m_tentativeEvents.empty()) {
686
687		/*
688		 * We currently do not track or take actions on
689		 * devices in the degraded or faulted state.
690		 * Once we have support for spare pools, we'll
691		 * retain these cases so that any spares added in
692		 * the future can be applied to them.
693		 */
694		switch (VdevState()) {
695		case VDEV_STATE_HEALTHY:
696			/* No need to keep cases for healthy vdevs */
697			Close();
698			return (true);
699		case VDEV_STATE_REMOVED:
700		case VDEV_STATE_CANT_OPEN:
701			/*
702			 * Keep open.  We may solve it with a newly inserted
703			 * device.
704			 */
705		case VDEV_STATE_FAULTED:
706		case VDEV_STATE_DEGRADED:
707			/*
708			 * Keep open.  We may solve it with the future
709			 * addition of a spare to the pool
710			 */
711		case VDEV_STATE_UNKNOWN:
712		case VDEV_STATE_CLOSED:
713		case VDEV_STATE_OFFLINE:
714			/*
715			 * Keep open?  This may not be the correct behavior,
716			 * but it's what we've always done
717			 */
718			;
719		}
720
721		/*
722		 * Re-serialize the case in order to remove any
723		 * previous event data.
724		 */
725		Serialize();
726	}
727
728	return (false);
729}
730
731void
732CaseFile::Log()
733{
734	syslog(LOG_INFO, "CaseFile(%s,%s,%s)\n", PoolGUIDString().c_str(),
735	       VdevGUIDString().c_str(), PhysicalPath().c_str());
736	syslog(LOG_INFO, "\tVdev State = %s\n",
737	       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
738	if (m_tentativeEvents.size() != 0) {
739		syslog(LOG_INFO, "\t=== Tentative Events ===\n");
740		for (EventList::iterator event(m_tentativeEvents.begin());
741		     event != m_tentativeEvents.end(); event++)
742			(*event)->Log(LOG_INFO);
743	}
744	if (m_events.size() != 0) {
745		syslog(LOG_INFO, "\t=== Events ===\n");
746		for (EventList::iterator event(m_events.begin());
747		     event != m_events.end(); event++)
748			(*event)->Log(LOG_INFO);
749	}
750}
751
752//- CaseFile Static Protected Methods ------------------------------------------
753void
754CaseFile::OnGracePeriodEnded(void *arg)
755{
756	CaseFile &casefile(*static_cast<CaseFile *>(arg));
757
758	casefile.OnGracePeriodEnded();
759}
760
761int
762CaseFile::DeSerializeSelector(const struct dirent *dirEntry)
763{
764	uint64_t poolGUID;
765	uint64_t vdevGUID;
766
767	if (dirEntry->d_type == DT_REG
768	 && sscanf(dirEntry->d_name, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case",
769		   &poolGUID, &vdevGUID) == 2)
770		return (1);
771	return (0);
772}
773
774void
775CaseFile::DeSerializeFile(const char *fileName)
776{
777	string	  fullName(s_caseFilePath + '/' + fileName);
778	CaseFile *existingCaseFile(NULL);
779	CaseFile *caseFile(NULL);
780
781	try {
782		uint64_t poolGUID;
783		uint64_t vdevGUID;
784		nvlist_t *vdevConf;
785
786		if (sscanf(fileName, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case",
787		       &poolGUID, &vdevGUID) != 2) {
788			throw ZfsdException("CaseFile::DeSerialize: "
789			    "Unintelligible CaseFile filename %s.\n", fileName);
790		}
791		existingCaseFile = Find(Guid(poolGUID), Guid(vdevGUID));
792		if (existingCaseFile != NULL) {
793			/*
794			 * If the vdev is already degraded or faulted,
795			 * there's no point in keeping the state around
796			 * that we use to put a drive into the degraded
797			 * state.  However, if the vdev is simply missing,
798			 * preserve the case data in the hopes that it will
799			 * return.
800			 */
801			caseFile = existingCaseFile;
802			vdev_state curState(caseFile->VdevState());
803			if (curState > VDEV_STATE_CANT_OPEN
804			 && curState < VDEV_STATE_HEALTHY) {
805				unlink(fileName);
806				return;
807			}
808		} else {
809			ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID);
810			if (zpl.empty()
811			 || (vdevConf = VdevIterator(zpl.front())
812						    .Find(vdevGUID)) == NULL) {
813				/*
814				 * Either the pool no longer exists
815				 * or this vdev is no longer a member of
816				 * the pool.
817				 */
818				unlink(fullName.c_str());
819				return;
820			}
821
822			/*
823			 * Any vdev we find that does not have a case file
824			 * must be in the healthy state and thus worthy of
825			 * continued SERD data tracking.
826			 */
827			caseFile = new CaseFile(Vdev(zpl.front(), vdevConf));
828		}
829
830		ifstream caseStream(fullName.c_str());
831		if (!caseStream)
832			throw ZfsdException("CaseFile::DeSerialize: Unable to "
833					    "read %s.\n", fileName);
834
835		caseFile->DeSerialize(caseStream);
836	} catch (const ParseException &exp) {
837
838		exp.Log();
839		if (caseFile != existingCaseFile)
840			delete caseFile;
841
842		/*
843		 * Since we can't parse the file, unlink it so we don't
844		 * trip over it again.
845		 */
846		unlink(fileName);
847	} catch (const ZfsdException &zfsException) {
848
849		zfsException.Log();
850		if (caseFile != existingCaseFile)
851			delete caseFile;
852	}
853}
854
855//- CaseFile Protected Methods -------------------------------------------------
856CaseFile::CaseFile(const Vdev &vdev)
857 : m_poolGUID(vdev.PoolGUID()),
858   m_vdevGUID(vdev.GUID()),
859   m_vdevState(vdev.State()),
860   m_vdevPhysPath(vdev.PhysicalPath()),
861   m_is_spare(vdev.IsSpare())
862{
863	stringstream guidString;
864
865	guidString << m_vdevGUID;
866	m_vdevGUIDString = guidString.str();
867	guidString.str("");
868	guidString << m_poolGUID;
869	m_poolGUIDString = guidString.str();
870
871	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
872	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
873	m_vdevName = vdev.Name(zhp, false);
874
875	s_activeCases.push_back(this);
876
877	syslog(LOG_INFO, "Creating new CaseFile:\n");
878	Log();
879}
880
881CaseFile::~CaseFile()
882{
883	PurgeEvents();
884	PurgeTentativeEvents();
885	m_tentativeTimer.Stop();
886	s_activeCases.remove(this);
887}
888
889void
890CaseFile::PurgeEvents()
891{
892	for (EventList::iterator event(m_events.begin());
893	     event != m_events.end(); event++)
894		delete *event;
895
896	m_events.clear();
897}
898
899void
900CaseFile::PurgeTentativeEvents()
901{
902	for (EventList::iterator event(m_tentativeEvents.begin());
903	     event != m_tentativeEvents.end(); event++)
904		delete *event;
905
906	m_tentativeEvents.clear();
907}
908
909void
910CaseFile::SerializeEvList(const EventList events, int fd,
911		const char* prefix) const
912{
913	if (events.empty())
914		return;
915	for (EventList::const_iterator curEvent = events.begin();
916	     curEvent != events.end(); curEvent++) {
917		const string &eventString((*curEvent)->GetEventString());
918
919		// TODO: replace many write(2) calls with a single writev(2)
920		if (prefix)
921			write(fd, prefix, strlen(prefix));
922		write(fd, eventString.c_str(), eventString.length());
923	}
924}
925
926void
927CaseFile::Serialize()
928{
929	stringstream saveFile;
930
931	saveFile << setfill('0')
932		 << s_caseFilePath << "/"
933		 << "pool_" << PoolGUIDString()
934		 << "_vdev_" << VdevGUIDString()
935		 << ".case";
936
937	if (m_events.empty() && m_tentativeEvents.empty()) {
938		unlink(saveFile.str().c_str());
939		return;
940	}
941
942	int fd(open(saveFile.str().c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0644));
943	if (fd == -1) {
944		syslog(LOG_ERR, "CaseFile::Serialize: Unable to open %s.\n",
945		       saveFile.str().c_str());
946		return;
947	}
948	SerializeEvList(m_events, fd);
949	SerializeEvList(m_tentativeEvents, fd, "tentative ");
950	close(fd);
951}
952
953/*
954 * XXX: This method assumes that events may not contain embedded newlines.  If
955 * ever events can contain embedded newlines, then CaseFile must switch
956 * serialization formats
957 */
958void
959CaseFile::DeSerialize(ifstream &caseStream)
960{
961	string	      evString;
962	const EventFactory &factory(ZfsDaemon::Get().GetFactory());
963
964	caseStream >> std::noskipws >> std::ws;
965	while (caseStream.good()) {
966		/*
967		 * Outline:
968		 * read the beginning of a line and check it for
969		 * "tentative".  If found, discard "tentative".
970		 * Create a new event
971		 * continue
972		 */
973		EventList* destEvents;
974		const string tentFlag("tentative ");
975		string line;
976		std::stringbuf lineBuf;
977
978		caseStream.get(lineBuf);
979		caseStream.ignore();  /*discard the newline character*/
980		line = lineBuf.str();
981		if (line.compare(0, tentFlag.size(), tentFlag) == 0) {
982			/* Discard "tentative" */
983			line.erase(0, tentFlag.size());
984			destEvents = &m_tentativeEvents;
985		} else {
986			destEvents = &m_events;
987		}
988		Event *event(Event::CreateEvent(factory, line));
989		if (event != NULL) {
990			destEvents->push_back(event);
991			RegisterCallout(*event);
992		}
993	}
994}
995
996void
997CaseFile::Close()
998{
999	/*
1000	 * This case is no longer relevant.  Clean up our
1001	 * serialization file, and delete the case.
1002	 */
1003	syslog(LOG_INFO, "CaseFile(%s,%s) closed - State %s\n",
1004	       PoolGUIDString().c_str(), VdevGUIDString().c_str(),
1005	       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
1006
1007	/*
1008	 * Serialization of a Case with no event data, clears the
1009	 * Serialization data for that event.
1010	 */
1011	PurgeEvents();
1012	Serialize();
1013
1014	delete this;
1015}
1016
1017void
1018CaseFile::OnGracePeriodEnded()
1019{
1020	bool should_fault, should_degrade;
1021	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
1022	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
1023
1024	m_events.splice(m_events.begin(), m_tentativeEvents);
1025	should_fault = ShouldFault();
1026	should_degrade = ShouldDegrade();
1027
1028	if (should_fault || should_degrade) {
1029		if (zhp == NULL
1030		 || (VdevIterator(zhp).Find(m_vdevGUID)) == NULL) {
1031			/*
1032			 * Either the pool no longer exists
1033			 * or this vdev is no longer a member of
1034			 * the pool.
1035			 */
1036			Close();
1037			return;
1038		}
1039
1040	}
1041
1042	/* A fault condition has priority over a degrade condition */
1043	if (ShouldFault()) {
1044		/* Fault the vdev and close the case. */
1045		if (zpool_vdev_fault(zhp, (uint64_t)m_vdevGUID,
1046				       VDEV_AUX_ERR_EXCEEDED) == 0) {
1047			syslog(LOG_INFO, "Faulting vdev(%s/%s)",
1048			       PoolGUIDString().c_str(),
1049			       VdevGUIDString().c_str());
1050			Close();
1051			return;
1052		}
1053		else {
1054			syslog(LOG_ERR, "Fault vdev(%s/%s): %s: %s\n",
1055			       PoolGUIDString().c_str(),
1056			       VdevGUIDString().c_str(),
1057			       libzfs_error_action(g_zfsHandle),
1058			       libzfs_error_description(g_zfsHandle));
1059		}
1060	}
1061	else if (ShouldDegrade()) {
1062		/* Degrade the vdev and close the case. */
1063		if (zpool_vdev_degrade(zhp, (uint64_t)m_vdevGUID,
1064				       VDEV_AUX_ERR_EXCEEDED) == 0) {
1065			syslog(LOG_INFO, "Degrading vdev(%s/%s)",
1066			       PoolGUIDString().c_str(),
1067			       VdevGUIDString().c_str());
1068			Close();
1069			return;
1070		}
1071		else {
1072			syslog(LOG_ERR, "Degrade vdev(%s/%s): %s: %s\n",
1073			       PoolGUIDString().c_str(),
1074			       VdevGUIDString().c_str(),
1075			       libzfs_error_action(g_zfsHandle),
1076			       libzfs_error_description(g_zfsHandle));
1077		}
1078	}
1079	Serialize();
1080}
1081
1082Vdev
1083CaseFile::BeingReplacedBy(zpool_handle_t *zhp) {
1084	Vdev vd(zhp, CaseVdev(zhp));
1085	std::list<Vdev> children;
1086	std::list<Vdev>::iterator children_it;
1087
1088	Vdev parent(vd.Parent());
1089	Vdev replacing(NonexistentVdev);
1090
1091	/*
1092	 * To determine whether we are being replaced by another spare that
1093	 * is still working, then make sure that it is currently spared and
1094	 * that the spare is either resilvering or healthy.  If any of these
1095	 * conditions fail, then we are not being replaced by a spare.
1096	 *
1097	 * If the spare is healthy, then the case file should be closed very
1098	 * soon after this check.
1099	 */
1100	if (parent.DoesNotExist()
1101	 || parent.Name(zhp, /*verbose*/false) != "spare")
1102		return (NonexistentVdev);
1103
1104	children = parent.Children();
1105	children_it = children.begin();
1106	for (;children_it != children.end(); children_it++) {
1107		Vdev child = *children_it;
1108
1109		/* Skip our vdev. */
1110		if (child.GUID() == VdevGUID())
1111			continue;
1112		/*
1113		 * Accept the first child that doesn't match our GUID, or
1114		 * any resilvering/healthy device if one exists.
1115		 */
1116		if (replacing.DoesNotExist() || child.IsResilvering()
1117		 || child.State() == VDEV_STATE_HEALTHY)
1118			replacing = child;
1119	}
1120
1121	return (replacing);
1122}
1123
1124bool
1125CaseFile::Replace(const char* vdev_type, const char* path, bool isspare) {
1126	nvlist_t *nvroot, *newvd;
1127	const char *poolname;
1128	string oldstr(VdevGUIDString());
1129	bool retval = true;
1130
1131	/* Figure out what pool we're working on */
1132	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
1133	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
1134	if (zhp == NULL) {
1135		syslog(LOG_ERR, "CaseFile::Replace: could not find pool for "
1136		       "pool_guid %" PRIu64 ".", (uint64_t)m_poolGUID);
1137		return (false);
1138	}
1139	poolname = zpool_get_name(zhp);
1140	Vdev vd(zhp, CaseVdev(zhp));
1141	Vdev replaced(BeingReplacedBy(zhp));
1142
1143	if (isspare && !vd.IsSpare() && !replaced.DoesNotExist()) {
1144		/* If we are already being replaced by a working spare, pass. */
1145		if (replaced.IsResilvering()
1146		 || replaced.State() == VDEV_STATE_HEALTHY) {
1147			syslog(LOG_INFO, "CaseFile::Replace(%s->%s): already "
1148			    "replaced", VdevGUIDString().c_str(), path);
1149			return (/*consumed*/false);
1150		}
1151		/*
1152		 * If we have already been replaced by a spare, but that spare
1153		 * is broken, we must spare the spare, not the original device.
1154		 */
1155		oldstr = replaced.GUIDString();
1156		syslog(LOG_INFO, "CaseFile::Replace(%s->%s): sparing "
1157		    "broken spare %s instead", VdevGUIDString().c_str(),
1158		    path, oldstr.c_str());
1159	}
1160
1161	/*
1162	 * Build a root vdev/leaf vdev configuration suitable for
1163	 * zpool_vdev_attach. Only enough data for the kernel to find
1164	 * the device (i.e. type and disk device node path) are needed.
1165	 */
1166	nvroot = NULL;
1167	newvd = NULL;
1168
1169	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0
1170	 || nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
1171		syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to allocate "
1172		    "configuration data.", poolname, oldstr.c_str());
1173		if (nvroot != NULL)
1174			nvlist_free(nvroot);
1175		return (false);
1176	}
1177	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, vdev_type) != 0
1178	 || nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0
1179	 || nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0
1180	 || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1181				    &newvd, 1) != 0) {
1182		syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to initialize "
1183		    "configuration data.", poolname, oldstr.c_str());
1184		nvlist_free(newvd);
1185		nvlist_free(nvroot);
1186		return (true);
1187	}
1188
1189	/* Data was copied when added to the root vdev. */
1190	nvlist_free(newvd);
1191
1192	retval = (zpool_vdev_attach(zhp, oldstr.c_str(), path, nvroot,
1193       /*replace*/B_TRUE, /*rebuild*/ B_FALSE) == 0);
1194	if (retval)
1195		syslog(LOG_INFO, "Replacing vdev(%s/%s) with %s\n",
1196		    poolname, oldstr.c_str(), path);
1197	else
1198		syslog(LOG_ERR, "Replace vdev(%s/%s): %s: %s\n",
1199		    poolname, oldstr.c_str(), libzfs_error_action(g_zfsHandle),
1200		    libzfs_error_description(g_zfsHandle));
1201	nvlist_free(nvroot);
1202
1203	return (retval);
1204}
1205
1206/* Lookup the vdev prop. Used for checksum, IO, or slow IO props */
1207int
1208CaseFile::GetVdevProp(vdev_prop_t vdev_prop) const
1209{
1210	char val[ZFS_MAXPROPLEN];
1211	zprop_source_t srctype;
1212	DevdCtl::Guid poolGUID = PoolGUID();
1213	ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID);
1214	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
1215
1216	char *prop_str = (char *) vdev_prop_to_name(vdev_prop);
1217	if (zhp == NULL || zpool_get_vdev_prop(zhp, m_vdevName.c_str(),
1218	    vdev_prop, prop_str, val, sizeof (val), &srctype, B_FALSE) != 0)
1219		return (-1);
1220
1221	/* we'll get "-" from libzfs for a prop that is not set */
1222	if (zfs_isnumber(val) == B_FALSE)
1223		return (-1);
1224
1225	return (atoi(val));
1226}
1227
1228bool
1229CaseFile::ShouldDegrade() const
1230{
1231	int checksum_n = GetVdevProp(VDEV_PROP_CHECKSUM_N);
1232	if (checksum_n == -1)
1233		checksum_n = DEFAULT_ZFS_DEGRADE_IO_COUNT;
1234	return (std::count_if(m_events.begin(), m_events.end(),
1235			      IsChecksumEvent) > checksum_n);
1236}
1237
1238bool
1239CaseFile::ShouldFault() const
1240{
1241	bool should_fault_for_io, should_fault_for_delay;
1242	int io_n = GetVdevProp(VDEV_PROP_IO_N);
1243	int slow_io_n = GetVdevProp(VDEV_PROP_SLOW_IO_N);
1244
1245	if (io_n == -1)
1246		io_n = DEFAULT_ZFS_DEGRADE_IO_COUNT;
1247	if (slow_io_n == -1)
1248		slow_io_n = DEFAULT_ZFS_FAULT_SLOW_IO_COUNT;
1249
1250	should_fault_for_io = std::count_if(m_events.begin(), m_events.end(),
1251			      IsIOEvent) > io_n;
1252	should_fault_for_delay = std::count_if(m_events.begin(), m_events.end(),
1253			      IsDelayEvent) > slow_io_n;
1254
1255	return (should_fault_for_io || should_fault_for_delay);
1256}
1257
1258nvlist_t *
1259CaseFile::CaseVdev(zpool_handle_t *zhp) const
1260{
1261	return (VdevIterator(zhp).Find(VdevGUID()));
1262}
1263