/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
 * You can obtain a copy of the license from the top-level file
 * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
 * You may not use this file except in compliance with the license.
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2016, Intel Corporation.
 * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>
 * Copyright (c) 2021 Hewlett Packard Enterprise Development LP
 */

#include <libnvpair.h>
#include <libzfs.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <sys/list.h>
#include <sys/time.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/dev.h>
#include <sys/fm/protocol.h>
#include <sys/fm/fs/zfs.h>
#include <pthread.h>
#include <unistd.h>

#include "zfs_agents.h"
#include "fmd_api.h"
#include "../zed_log.h"

/*
 * agent dispatch code
 */

static pthread_mutex_t	agent_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t	agent_cond = PTHREAD_COND_INITIALIZER;
static list_t		agent_events;	/* list of pending events */
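/* set nonzero by zfs_agent_fini() to stop the consumer thread */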
static int		agent_exiting;

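/* a single event queued for delivery to the consumer thread */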
typedef struct agent_event {
	char		ae_class[64];
	char		ae_subclass[32];
	nvlist_t	*ae_nvl;
	list_node_t	ae_node;
} agent_event_t;

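/* the event consumer thread, created by zfs_agent_init() */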
pthread_t g_agents_tid;

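/* libzfs handle supplied by zed at initialization */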
libzfs_handle_t *g_zfs_hdl;

/* guid search data */
typedef enum device_type {
	DEVICE_TYPE_L2ARC,	/* l2arc device */
	DEVICE_TYPE_SPARE,	/* spare device */
	DEVICE_TYPE_PRIMARY	/* any primary pool storage device */
} device_type_t;

typedef struct guid_search {
	uint64_t	gs_pool_guid;
	uint64_t	gs_vdev_guid;
	const char	*gs_devid;
	device_type_t	gs_vdev_type;
	uint64_t	gs_vdev_expandtime;	/* vdev expansion time */
} guid_search_t;

/*
 * Walks the vdev tree recursively looking for a matching devid.
 * Returns B_TRUE as soon as a matching device is found, B_FALSE otherwise.
 */
static boolean_t
zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg)
{
	guid_search_t *gsp = arg;
	const char *path = NULL;
	uint_t c, children;
	nvlist_t **child;
	uint64_t vdev_guid;

	/*
	 * First iterate over any children.
	 */
	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) == 0) {
		for (c = 0; c < children; c++) {
			if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
				gsp->gs_vdev_type = DEVICE_TYPE_PRIMARY;
				return (B_TRUE);
			}
		}
	}
	/*
	 * Iterate over any spares and cache devices.
	 */
	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES,
	    &child, &children) == 0) {
		for (c = 0; c < children; c++) {
			if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
				gsp->gs_vdev_type = DEVICE_TYPE_SPARE;
				return (B_TRUE);
			}
		}
	}
	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE,
	    &child, &children) == 0) {
		for (c = 0; c < children; c++) {
			if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
				gsp->gs_vdev_type = DEVICE_TYPE_L2ARC;
				return (B_TRUE);
			}
		}
	}
	/*
	 * On a devid match, grab the vdev guid and expansion time, if any.
	 */
	if (gsp->gs_devid != NULL &&
	    (nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, &path) == 0) &&
	    (strcmp(gsp->gs_devid, path) == 0)) {
		(void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
		    &gsp->gs_vdev_guid);
		(void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME,
		    &gsp->gs_vdev_expandtime);
		return (B_TRUE);
	}
	/*
	 * Otherwise, on a vdev guid match, grab the devid and expansion
	 * time. The devid might be missing on removal since it is not part
	 * of the blkid cache, and an L2ARC vdev does not record the pool
	 * guid in its blkid, so this is a special case for L2ARC vdevs.
	 */
	else if (gsp->gs_vdev_guid != 0 && gsp->gs_devid == NULL &&
	    nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, &vdev_guid) == 0 &&
	    gsp->gs_vdev_guid == vdev_guid) {
		(void) nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID,
		    &gsp->gs_devid);
		(void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME,
		    &gsp->gs_vdev_expandtime);
		return (B_TRUE);
	}

	return (B_FALSE);
}

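/*
 * zpool_iter() callback: search one pool for the devid and/or vdev guid
 * recorded in the guid_search_t.  Closes the pool handle before returning,
 * and returns nonzero (which stops further zpool_iter() iteration) once
 * both the devid and the vdev guid have been resolved.
 */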
static int
zfs_agent_iter_pool(zpool_handle_t *zhp, void *arg)
{
	guid_search_t *gsp = arg;
	nvlist_t *config, *nvl;

	/*
	 * For each vdev in this pool, look for a match by devid.
	 */
	if ((config = zpool_get_config(zhp, NULL)) != NULL) {
		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
		    &nvl) == 0) {
			(void) zfs_agent_iter_vdev(zhp, nvl, gsp);
		}
	}
	/*
	 * If a match was found, grab the pool guid.
	 */
	if (gsp->gs_vdev_guid && gsp->gs_devid) {
		(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
		    &gsp->gs_pool_guid);
	}

	zpool_close(zhp);
	return (gsp->gs_devid != NULL && gsp->gs_vdev_guid != 0);
}

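/*
 * Queue an event for delivery to the agents.  The nvlist is duplicated,
 * so the caller retains ownership of nvl.  Linux disk-removal events are
 * remapped to resource.fs.zfs.removed ereports here before being queued.
 */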
void
zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl)
{
	agent_event_t *event;

	if (subclass == NULL)
		subclass = "";

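	/* if the event cannot be allocated or duplicated, drop it */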
	event = malloc(sizeof (agent_event_t));
	if (event == NULL || nvlist_dup(nvl, &event->ae_nvl, 0) != 0) {
		if (event)
			free(event);
		return;
	}

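	/* remap vdev_check sysevents to the EC_ZFS class the SLM consumes */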
	if (strcmp(class, "sysevent.fs.zfs.vdev_check") == 0) {
		class = EC_ZFS;
		subclass = ESC_ZFS_VDEV_CHECK;
	}

	/*
	 * On Linux, we don't get the expected FM_RESOURCE_REMOVED ereport
	 * from the vdev_disk layer after a hot unplug. Fortunately we do
	 * get an EC_DEV_REMOVE from our disk monitor and it is a suitable
	 * proxy so we remap it here for the benefit of the diagnosis engine.
	 * Starting in OpenZFS 2.0, we do get FM_RESOURCE_REMOVED from the spa
	 * layer. Processing multiple FM_RESOURCE_REMOVED events is not harmful.
	 */
	if ((strcmp(class, EC_DEV_REMOVE) == 0) &&
	    (strcmp(subclass, ESC_DISK) == 0) &&
	    (nvlist_exists(nvl, ZFS_EV_VDEV_GUID) ||
	    nvlist_exists(nvl, DEV_IDENTIFIER))) {
		nvlist_t *payload = event->ae_nvl;
		struct timeval tv;
		int64_t tod[2];
		uint64_t pool_guid = 0, vdev_guid = 0;
		guid_search_t search = { 0 };
		device_type_t devtype = DEVICE_TYPE_PRIMARY;
		const char *devid = NULL;

		class = "resource.fs.zfs.removed";
		subclass = "";

		(void) nvlist_add_string(payload, FM_CLASS, class);
		(void) nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid);
		(void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid);
		(void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid);

		(void) gettimeofday(&tv, NULL);
		tod[0] = tv.tv_sec;
		tod[1] = tv.tv_usec;
		(void) nvlist_add_int64_array(payload, FM_EREPORT_TIME, tod, 2);

		/*
		 * If the devid is missing but the vdev guid is available,
		 * use the vdev guid to look up the devid and pool guid.
		 * For multipath, spare, and l2arc devices, ZFS_EV_VDEV_GUID
		 * or ZFS_EV_POOL_GUID may be missing, so look them up here
		 * as well.
		 */
		if (devid == NULL || pool_guid == 0 || vdev_guid == 0) {
			if (devid == NULL)
				search.gs_vdev_guid = vdev_guid;
			else
				search.gs_devid = devid;
			zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search);
			if (devid == NULL)
				devid = search.gs_devid;
			if (pool_guid == 0)
				pool_guid = search.gs_pool_guid;
			if (vdev_guid == 0)
				vdev_guid = search.gs_vdev_guid;
			devtype = search.gs_vdev_type;
		}

		/*
		 * We want to avoid reporting "remove" events coming from
		 * libudev for VDEVs which were expanded recently (10s) and
		 * avoid activating spares in response to partitions being
		 * deleted and created in rapid succession.
		 */
		if (search.gs_vdev_expandtime != 0 &&
		    search.gs_vdev_expandtime + 10 > tv.tv_sec) {
			zed_log_msg(LOG_INFO, "agent post event: ignoring '%s' "
			    "for recently expanded device '%s'", EC_DEV_REMOVE,
			    devid);
			fnvlist_free(payload);
			free(event);
			goto out;
		}

		(void) nvlist_add_uint64(payload,
		    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, pool_guid);
		(void) nvlist_add_uint64(payload,
		    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vdev_guid);
		switch (devtype) {
		case DEVICE_TYPE_L2ARC:
			(void) nvlist_add_string(payload,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
			    VDEV_TYPE_L2CACHE);
			break;
		case DEVICE_TYPE_SPARE:
			(void) nvlist_add_string(payload,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_SPARE);
			break;
		case DEVICE_TYPE_PRIMARY:
			(void) nvlist_add_string(payload,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_DISK);
			break;
		}

		zed_log_msg(LOG_INFO, "agent post event: mapping '%s' to '%s'",
		    EC_DEV_REMOVE, class);
	}

	(void) strlcpy(event->ae_class, class, sizeof (event->ae_class));
	(void) strlcpy(event->ae_subclass, subclass,
	    sizeof (event->ae_subclass));

	(void) pthread_mutex_lock(&agent_lock);
	list_insert_tail(&agent_events, event);
	(void) pthread_mutex_unlock(&agent_lock);

out:
	(void) pthread_cond_signal(&agent_cond);
}

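/*
 * Deliver one event to every interested agent.  Called from the consumer
 * thread with agent_lock dropped, so an agent is free to post follow-up
 * events from its handler.
 */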
static void
zfs_agent_dispatch(const char *class, const char *subclass, nvlist_t *nvl)
{
	/*
	 * The diagnosis engine subscribes to the following events.
	 * On illumos these subscriptions reside in:
	 * 	/usr/lib/fm/fmd/plugins/zfs-diagnosis.conf
	 */
	if (strstr(class, "ereport.fs.zfs.") != NULL ||
	    strstr(class, "resource.fs.zfs.") != NULL ||
	    strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0 ||
	    strcmp(class, "sysevent.fs.zfs.vdev_remove_dev") == 0 ||
	    strcmp(class, "sysevent.fs.zfs.pool_destroy") == 0) {
		fmd_module_recv(fmd_module_hdl("zfs-diagnosis"), nvl, class);
	}

	/*
	 * The retire agent subscribes to the following events.
	 * On illumos these subscriptions reside in:
	 * 	/usr/lib/fm/fmd/plugins/zfs-retire.conf
	 *
	 * NOTE: fault events come directly from our diagnosis engine
	 * and will not pass through the zfs kernel module.
	 */
	if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 ||
	    strcmp(class, "resource.fs.zfs.removed") == 0 ||
	    strcmp(class, "resource.fs.zfs.statechange") == 0 ||
	    strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) {
		fmd_module_recv(fmd_module_hdl("zfs-retire"), nvl, class);
	}

	/*
	 * The SLM module only consumes disk events and vdev check events.
	 *
	 * NOTE: disk events come directly from the disk monitor and will
	 * not pass through the zfs kernel module.
	 */
	if (strstr(class, "EC_dev_") != NULL ||
	    strcmp(class, EC_ZFS) == 0) {
		(void) zfs_slm_event(class, subclass, nvl);
	}
}

/*
 * Events are consumed and dispatched from this thread.  An agent can
 * also post an event, so the event list lock is not held when calling
 * an agent.  One event is consumed at a time.
 */
static void *
zfs_agent_consumer_thread(void *arg)
{
	(void) arg;

	for (;;) {
		agent_event_t *event;

		(void) pthread_mutex_lock(&agent_lock);

		/* wait for an event to show up */
		while (!agent_exiting && list_is_empty(&agent_events))
			(void) pthread_cond_wait(&agent_cond, &agent_lock);

		if (agent_exiting) {
			(void) pthread_mutex_unlock(&agent_lock);
			zed_log_msg(LOG_INFO, "zfs_agent_consumer_thread: "
			    "exiting");
			return (NULL);
		}

		if ((event = list_remove_head(&agent_events)) != NULL) {
			(void) pthread_mutex_unlock(&agent_lock);

			/* dispatch to all event subscribers */
			zfs_agent_dispatch(event->ae_class, event->ae_subclass,
			    event->ae_nvl);

			nvlist_free(event->ae_nvl);
			free(event);
			continue;
		}

		(void) pthread_mutex_unlock(&agent_lock);
	}

	return (NULL);
}

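/*
 * Initialize the SLM, diagnosis, and retire agents, then start the
 * consumer thread that delivers events to them.
 */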
void
zfs_agent_init(libzfs_handle_t *zfs_hdl)
{
	fmd_hdl_t *hdl;

	g_zfs_hdl = zfs_hdl;

	if (zfs_slm_init() != 0)
		zed_log_die("Failed to initialize zfs slm");
	zed_log_msg(LOG_INFO, "Add Agent: init");

	hdl = fmd_module_hdl("zfs-diagnosis");
	_zfs_diagnosis_init(hdl);
	if (!fmd_module_initialized(hdl))
		zed_log_die("Failed to initialize zfs diagnosis");

	hdl = fmd_module_hdl("zfs-retire");
	_zfs_retire_init(hdl);
	if (!fmd_module_initialized(hdl))
		zed_log_die("Failed to initialize zfs retire");

	list_create(&agent_events, sizeof (agent_event_t),
	    offsetof(struct agent_event, ae_node));

	if (pthread_create(&g_agents_tid, NULL, zfs_agent_consumer_thread,
	    NULL) != 0) {
		list_destroy(&agent_events);
		zed_log_die("Failed to initialize agents");
	}
	pthread_setname_np(g_agents_tid, "agents");
}

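/*
 * Tear down the agents: stop and join the consumer thread, drain any
 * events that were never dispatched, and unregister the modules in the
 * reverse order of their initialization.
 */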
void
zfs_agent_fini(void)
{
	fmd_hdl_t *hdl;
	agent_event_t *event;

	agent_exiting = 1;
	(void) pthread_cond_signal(&agent_cond);

	/* wait for the agent consumer thread to exit */
	(void) pthread_join(g_agents_tid, NULL);

	/* drain any pending events */
	while ((event = list_remove_head(&agent_events)) != NULL) {
		nvlist_free(event->ae_nvl);
		free(event);
	}

	list_destroy(&agent_events);

	if ((hdl = fmd_module_hdl("zfs-retire")) != NULL) {
		_zfs_retire_fini(hdl);
		fmd_hdl_unregister(hdl);
	}
	if ((hdl = fmd_module_hdl("zfs-diagnosis")) != NULL) {
		_zfs_diagnosis_fini(hdl);
		fmd_hdl_unregister(hdl);
	}

	zed_log_msg(LOG_INFO, "Add Agent: fini");
	zfs_slm_fini();

	g_zfs_hdl = NULL;
}