1240868Spjd/*
2240868Spjd * CDDL HEADER START
3240868Spjd *
4240868Spjd * The contents of this file are subject to the terms of the
5240868Spjd * Common Development and Distribution License (the "License").
6240868Spjd * You may not use this file except in compliance with the License.
7240868Spjd *
8240868Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9240868Spjd * or http://www.opensolaris.org/os/licensing.
10240868Spjd * See the License for the specific language governing permissions
11240868Spjd * and limitations under the License.
12240868Spjd *
13240868Spjd * When distributing Covered Code, include this CDDL HEADER in each
14240868Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15240868Spjd * If applicable, add the following below this CDDL HEADER, with the
16240868Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17240868Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18240868Spjd *
19240868Spjd * CDDL HEADER END
20240868Spjd */
21240868Spjd/*
22240868Spjd * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
23240868Spjd * All rights reserved.
24240868Spjd */
25240868Spjd
26240868Spjd#include <sys/zfs_context.h>
27240868Spjd#include <sys/spa_impl.h>
28240868Spjd#include <sys/vdev_impl.h>
29240868Spjd#include <sys/trim_map.h>
30248575Ssmh#include <sys/time.h>
31240868Spjd
/*
 * Calculate the end offset of a zio, rounding its size up based on the
 * ashift of the top-level vdev, as would be done by zio_vdev_io_start.
 *
 * This makes free range consolidation much more effective
 * than it would otherwise be as well as ensuring that entire
 * blocks are invalidated by writes.
 *
 * Every argument is parenthesized so that arbitrary expressions
 * (e.g. "&vdev" or a conditional) can be passed safely.
 */
#define	TRIM_ZIO_END(vd, offset, size)	((offset) +		\
	P2ROUNDUP((size), 1ULL << (vd)->vdev_top->vdev_ashift))

/* Add "size" bytes to the map's running total of queued TRIM bytes. */
#define	TRIM_MAP_SINC(tm, size)					\
	atomic_add_64(&(tm)->tm_bytes, (size))

/* Subtract "size" bytes from the map's running total of queued TRIM bytes. */
#define	TRIM_MAP_SDEC(tm, size)					\
	atomic_add_64(&(tm)->tm_bytes, -(size))

/*
 * Bump / drop the count of queued TRIM segments.  These expand to a single
 * expression without a trailing semicolon so they behave like ordinary
 * function calls in any statement context.
 */
#define	TRIM_MAP_QINC(tm)					\
	atomic_inc_64(&(tm)->tm_pending)

#define	TRIM_MAP_QDEC(tm)					\
	atomic_dec_64(&(tm)->tm_pending)

54248577Ssmh
55240868Spjdtypedef struct trim_map {
56240868Spjd	list_t		tm_head;		/* List of segments sorted by txg. */
57240868Spjd	avl_tree_t	tm_queued_frees;	/* AVL tree of segments waiting for TRIM. */
58240868Spjd	avl_tree_t	tm_inflight_frees;	/* AVL tree of in-flight TRIMs. */
59240868Spjd	avl_tree_t	tm_inflight_writes;	/* AVL tree of in-flight writes. */
60240868Spjd	list_t		tm_pending_writes;	/* Writes blocked on in-flight frees. */
61240868Spjd	kmutex_t	tm_lock;
62248577Ssmh	uint64_t	tm_pending;		/* Count of pending TRIMs. */
63248577Ssmh	uint64_t	tm_bytes;		/* Total size in bytes of queued TRIMs. */
64240868Spjd} trim_map_t;
65240868Spjd
66240868Spjdtypedef struct trim_seg {
67240868Spjd	avl_node_t	ts_node;	/* AVL node. */
68240868Spjd	list_node_t	ts_next;	/* List element. */
69240868Spjd	uint64_t	ts_start;	/* Starting offset of this segment. */
70240868Spjd	uint64_t	ts_end;		/* Ending offset (non-inclusive). */
71240868Spjd	uint64_t	ts_txg;		/* Segment creation txg. */
72248575Ssmh	hrtime_t	ts_time;	/* Segment creation time. */
73240868Spjd} trim_seg_t;
74240868Spjd
75249921Ssmhextern boolean_t zfs_trim_enabled;
76240868Spjd
77248577Ssmhstatic u_int trim_txg_delay = 32;
78248577Ssmhstatic u_int trim_timeout = 30;
79248577Ssmhstatic u_int trim_max_interval = 1;
80248577Ssmh/* Limit outstanding TRIMs to 2G (max size for a single TRIM request) */
81248577Ssmhstatic uint64_t trim_vdev_max_bytes = 2147483648;
82248577Ssmh/* Limit outstanding TRIMs to 64 (max ranges for a single TRIM request) */
83248577Ssmhstatic u_int trim_vdev_max_pending = 64;
84248577Ssmh
85240868SpjdSYSCTL_DECL(_vfs_zfs);
86248577SsmhSYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RD, 0, "ZFS TRIM");
87240868Spjd
88248577SsmhTUNABLE_INT("vfs.zfs.trim.txg_delay", &trim_txg_delay);
89248577SsmhSYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, txg_delay, CTLFLAG_RWTUN, &trim_txg_delay,
90248577Ssmh    0, "Delay TRIMs by up to this many TXGs");
91248575Ssmh
92248577SsmhTUNABLE_INT("vfs.zfs.trim.timeout", &trim_timeout);
93248577SsmhSYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, timeout, CTLFLAG_RWTUN, &trim_timeout, 0,
94248577Ssmh    "Delay TRIMs by up to this many seconds");
95248577Ssmh
96248577SsmhTUNABLE_INT("vfs.zfs.trim.max_interval", &trim_max_interval);
97248577SsmhSYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, max_interval, CTLFLAG_RWTUN,
98248577Ssmh    &trim_max_interval, 0,
99248577Ssmh    "Maximum interval between TRIM queue processing (seconds)");
100248577Ssmh
101248577SsmhSYSCTL_DECL(_vfs_zfs_vdev);
102248577SsmhTUNABLE_QUAD("vfs.zfs.vdev.trim_max_bytes", &trim_vdev_max_bytes);
103248577SsmhSYSCTL_QUAD(_vfs_zfs_vdev, OID_AUTO, trim_max_bytes, CTLFLAG_RWTUN,
104248577Ssmh    &trim_vdev_max_bytes, 0,
105248577Ssmh    "Maximum pending TRIM bytes for a vdev");
106248577Ssmh
107248577SsmhTUNABLE_INT("vfs.zfs.vdev.trim_max_pending", &trim_vdev_max_pending);
108248577SsmhSYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, trim_max_pending, CTLFLAG_RWTUN,
109248577Ssmh    &trim_vdev_max_pending, 0,
110248577Ssmh    "Maximum pending TRIM segments for a vdev");
111248577Ssmh
112248577Ssmh
113240868Spjdstatic void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd);
114240868Spjd
115240868Spjdstatic int
116240868Spjdtrim_map_seg_compare(const void *x1, const void *x2)
117240868Spjd{
118240868Spjd	const trim_seg_t *s1 = x1;
119240868Spjd	const trim_seg_t *s2 = x2;
120240868Spjd
121240868Spjd	if (s1->ts_start < s2->ts_start) {
122240868Spjd		if (s1->ts_end > s2->ts_start)
123240868Spjd			return (0);
124240868Spjd		return (-1);
125240868Spjd	}
126240868Spjd	if (s1->ts_start > s2->ts_start) {
127240868Spjd		if (s1->ts_start < s2->ts_end)
128240868Spjd			return (0);
129240868Spjd		return (1);
130240868Spjd	}
131240868Spjd	return (0);
132240868Spjd}
133240868Spjd
134240868Spjdstatic int
135240868Spjdtrim_map_zio_compare(const void *x1, const void *x2)
136240868Spjd{
137240868Spjd	const zio_t *z1 = x1;
138240868Spjd	const zio_t *z2 = x2;
139240868Spjd
140240868Spjd	if (z1->io_offset < z2->io_offset) {
141240868Spjd		if (z1->io_offset + z1->io_size > z2->io_offset)
142240868Spjd			return (0);
143240868Spjd		return (-1);
144240868Spjd	}
145240868Spjd	if (z1->io_offset > z2->io_offset) {
146240868Spjd		if (z1->io_offset < z2->io_offset + z2->io_size)
147240868Spjd			return (0);
148240868Spjd		return (1);
149240868Spjd	}
150240868Spjd	return (0);
151240868Spjd}
152240868Spjd
153240868Spjdvoid
154240868Spjdtrim_map_create(vdev_t *vd)
155240868Spjd{
156240868Spjd	trim_map_t *tm;
157240868Spjd
158284193Sdelphij	ASSERT(zfs_trim_enabled && !vd->vdev_notrim &&
159284193Sdelphij		vd->vdev_ops->vdev_op_leaf);
160240868Spjd
161240868Spjd	tm = kmem_zalloc(sizeof (*tm), KM_SLEEP);
162240868Spjd	mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL);
163240868Spjd	list_create(&tm->tm_head, sizeof (trim_seg_t),
164240868Spjd	    offsetof(trim_seg_t, ts_next));
165240868Spjd	list_create(&tm->tm_pending_writes, sizeof (zio_t),
166240868Spjd	    offsetof(zio_t, io_trim_link));
167240868Spjd	avl_create(&tm->tm_queued_frees, trim_map_seg_compare,
168240868Spjd	    sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
169240868Spjd	avl_create(&tm->tm_inflight_frees, trim_map_seg_compare,
170240868Spjd	    sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
171240868Spjd	avl_create(&tm->tm_inflight_writes, trim_map_zio_compare,
172240868Spjd	    sizeof (zio_t), offsetof(zio_t, io_trim_node));
173240868Spjd	vd->vdev_trimmap = tm;
174240868Spjd}
175240868Spjd
176240868Spjdvoid
177240868Spjdtrim_map_destroy(vdev_t *vd)
178240868Spjd{
179240868Spjd	trim_map_t *tm;
180240868Spjd	trim_seg_t *ts;
181240868Spjd
182240868Spjd	ASSERT(vd->vdev_ops->vdev_op_leaf);
183240868Spjd
184249921Ssmh	if (!zfs_trim_enabled)
185240868Spjd		return;
186240868Spjd
187240868Spjd	tm = vd->vdev_trimmap;
188240868Spjd	if (tm == NULL)
189240868Spjd		return;
190240868Spjd
191240868Spjd	/*
192240868Spjd	 * We may have been called before trim_map_vdev_commit_done()
193240868Spjd	 * had a chance to run, so do it now to prune the remaining
194240868Spjd	 * inflight frees.
195240868Spjd	 */
196240868Spjd	trim_map_vdev_commit_done(vd->vdev_spa, vd);
197240868Spjd
198240868Spjd	mutex_enter(&tm->tm_lock);
199240868Spjd	while ((ts = list_head(&tm->tm_head)) != NULL) {
200240868Spjd		avl_remove(&tm->tm_queued_frees, ts);
201240868Spjd		list_remove(&tm->tm_head, ts);
202240868Spjd		kmem_free(ts, sizeof (*ts));
203248577Ssmh		TRIM_MAP_SDEC(tm, ts->ts_end - ts->ts_start);
204248577Ssmh		TRIM_MAP_QDEC(tm);
205240868Spjd	}
206240868Spjd	mutex_exit(&tm->tm_lock);
207240868Spjd
208240868Spjd	avl_destroy(&tm->tm_queued_frees);
209240868Spjd	avl_destroy(&tm->tm_inflight_frees);
210240868Spjd	avl_destroy(&tm->tm_inflight_writes);
211240868Spjd	list_destroy(&tm->tm_pending_writes);
212240868Spjd	list_destroy(&tm->tm_head);
213240868Spjd	mutex_destroy(&tm->tm_lock);
214240868Spjd	kmem_free(tm, sizeof (*tm));
215240868Spjd	vd->vdev_trimmap = NULL;
216240868Spjd}
217240868Spjd
218240868Spjdstatic void
219240868Spjdtrim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
220240868Spjd{
221240868Spjd	avl_index_t where;
222240868Spjd	trim_seg_t tsearch, *ts_before, *ts_after, *ts;
223240868Spjd	boolean_t merge_before, merge_after;
224248575Ssmh	hrtime_t time;
225240868Spjd
226240868Spjd	ASSERT(MUTEX_HELD(&tm->tm_lock));
227240868Spjd	VERIFY(start < end);
228240868Spjd
229248575Ssmh	time = gethrtime();
230240868Spjd	tsearch.ts_start = start;
231240868Spjd	tsearch.ts_end = end;
232240868Spjd
233240868Spjd	ts = avl_find(&tm->tm_queued_frees, &tsearch, &where);
234240868Spjd	if (ts != NULL) {
235240868Spjd		if (start < ts->ts_start)
236240868Spjd			trim_map_segment_add(tm, start, ts->ts_start, txg);
237240868Spjd		if (end > ts->ts_end)
238240868Spjd			trim_map_segment_add(tm, ts->ts_end, end, txg);
239240868Spjd		return;
240240868Spjd	}
241240868Spjd
242240868Spjd	ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE);
243240868Spjd	ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER);
244240868Spjd
245248577Ssmh	merge_before = (ts_before != NULL && ts_before->ts_end == start);
246248577Ssmh	merge_after = (ts_after != NULL && ts_after->ts_start == end);
247240868Spjd
248240868Spjd	if (merge_before && merge_after) {
249248577Ssmh		TRIM_MAP_SINC(tm, ts_after->ts_start - ts_before->ts_end);
250248577Ssmh		TRIM_MAP_QDEC(tm);
251240868Spjd		avl_remove(&tm->tm_queued_frees, ts_before);
252240868Spjd		list_remove(&tm->tm_head, ts_before);
253240868Spjd		ts_after->ts_start = ts_before->ts_start;
254248577Ssmh		ts_after->ts_txg = txg;
255248577Ssmh		ts_after->ts_time = time;
256240868Spjd		kmem_free(ts_before, sizeof (*ts_before));
257240868Spjd	} else if (merge_before) {
258248577Ssmh		TRIM_MAP_SINC(tm, end - ts_before->ts_end);
259240868Spjd		ts_before->ts_end = end;
260248577Ssmh		ts_before->ts_txg = txg;
261248577Ssmh		ts_before->ts_time = time;
262240868Spjd	} else if (merge_after) {
263248577Ssmh		TRIM_MAP_SINC(tm, ts_after->ts_start - start);
264240868Spjd		ts_after->ts_start = start;
265248577Ssmh		ts_after->ts_txg = txg;
266248577Ssmh		ts_after->ts_time = time;
267240868Spjd	} else {
268248577Ssmh		TRIM_MAP_SINC(tm, end - start);
269248577Ssmh		TRIM_MAP_QINC(tm);
270240868Spjd		ts = kmem_alloc(sizeof (*ts), KM_SLEEP);
271240868Spjd		ts->ts_start = start;
272240868Spjd		ts->ts_end = end;
273240868Spjd		ts->ts_txg = txg;
274248575Ssmh		ts->ts_time = time;
275240868Spjd		avl_insert(&tm->tm_queued_frees, ts, where);
276240868Spjd		list_insert_tail(&tm->tm_head, ts);
277240868Spjd	}
278240868Spjd}
279240868Spjd
280240868Spjdstatic void
281240868Spjdtrim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start,
282240868Spjd    uint64_t end)
283240868Spjd{
284240868Spjd	trim_seg_t *nts;
285240868Spjd	boolean_t left_over, right_over;
286240868Spjd
287240868Spjd	ASSERT(MUTEX_HELD(&tm->tm_lock));
288240868Spjd
289240868Spjd	left_over = (ts->ts_start < start);
290240868Spjd	right_over = (ts->ts_end > end);
291240868Spjd
292248577Ssmh	TRIM_MAP_SDEC(tm, end - start);
293240868Spjd	if (left_over && right_over) {
294240868Spjd		nts = kmem_alloc(sizeof (*nts), KM_SLEEP);
295240868Spjd		nts->ts_start = end;
296240868Spjd		nts->ts_end = ts->ts_end;
297240868Spjd		nts->ts_txg = ts->ts_txg;
298248575Ssmh		nts->ts_time = ts->ts_time;
299240868Spjd		ts->ts_end = start;
300240868Spjd		avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER);
301240868Spjd		list_insert_after(&tm->tm_head, ts, nts);
302248577Ssmh		TRIM_MAP_QINC(tm);
303240868Spjd	} else if (left_over) {
304240868Spjd		ts->ts_end = start;
305240868Spjd	} else if (right_over) {
306240868Spjd		ts->ts_start = end;
307240868Spjd	} else {
308240868Spjd		avl_remove(&tm->tm_queued_frees, ts);
309240868Spjd		list_remove(&tm->tm_head, ts);
310248577Ssmh		TRIM_MAP_QDEC(tm);
311240868Spjd		kmem_free(ts, sizeof (*ts));
312240868Spjd	}
313240868Spjd}
314240868Spjd
315240868Spjdstatic void
316240868Spjdtrim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
317240868Spjd{
318240868Spjd	zio_t zsearch, *zs;
319240868Spjd
320240868Spjd	ASSERT(MUTEX_HELD(&tm->tm_lock));
321240868Spjd
322240868Spjd	zsearch.io_offset = start;
323240868Spjd	zsearch.io_size = end - start;
324240868Spjd
325240868Spjd	zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL);
326240868Spjd	if (zs == NULL) {
327240868Spjd		trim_map_segment_add(tm, start, end, txg);
328240868Spjd		return;
329240868Spjd	}
330240868Spjd	if (start < zs->io_offset)
331240868Spjd		trim_map_free_locked(tm, start, zs->io_offset, txg);
332240868Spjd	if (zs->io_offset + zs->io_size < end)
333240868Spjd		trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg);
334240868Spjd}
335240868Spjd
336240868Spjdvoid
337248574Ssmhtrim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
338240868Spjd{
339240868Spjd	trim_map_t *tm = vd->vdev_trimmap;
340240868Spjd
341249921Ssmh	if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL)
342240868Spjd		return;
343240868Spjd
344240868Spjd	mutex_enter(&tm->tm_lock);
345248574Ssmh	trim_map_free_locked(tm, offset, TRIM_ZIO_END(vd, offset, size), txg);
346240868Spjd	mutex_exit(&tm->tm_lock);
347240868Spjd}
348240868Spjd
349240868Spjdboolean_t
350240868Spjdtrim_map_write_start(zio_t *zio)
351240868Spjd{
352240868Spjd	vdev_t *vd = zio->io_vd;
353240868Spjd	trim_map_t *tm = vd->vdev_trimmap;
354240868Spjd	trim_seg_t tsearch, *ts;
355240868Spjd	boolean_t left_over, right_over;
356240868Spjd	uint64_t start, end;
357240868Spjd
358249921Ssmh	if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL)
359240868Spjd		return (B_TRUE);
360240868Spjd
361240868Spjd	start = zio->io_offset;
362248572Ssmh	end = TRIM_ZIO_END(zio->io_vd, start, zio->io_size);
363240868Spjd	tsearch.ts_start = start;
364240868Spjd	tsearch.ts_end = end;
365240868Spjd
366240868Spjd	mutex_enter(&tm->tm_lock);
367240868Spjd
368240868Spjd	/*
369240868Spjd	 * Checking for colliding in-flight frees.
370240868Spjd	 */
371240868Spjd	ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL);
372240868Spjd	if (ts != NULL) {
373240868Spjd		list_insert_tail(&tm->tm_pending_writes, zio);
374240868Spjd		mutex_exit(&tm->tm_lock);
375240868Spjd		return (B_FALSE);
376240868Spjd	}
377240868Spjd
378240868Spjd	ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
379240868Spjd	if (ts != NULL) {
380240868Spjd		/*
381240868Spjd		 * Loop until all overlapping segments are removed.
382240868Spjd		 */
383240868Spjd		do {
384240868Spjd			trim_map_segment_remove(tm, ts, start, end);
385240868Spjd			ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
386240868Spjd		} while (ts != NULL);
387240868Spjd	}
388240868Spjd	avl_add(&tm->tm_inflight_writes, zio);
389240868Spjd
390240868Spjd	mutex_exit(&tm->tm_lock);
391240868Spjd
392240868Spjd	return (B_TRUE);
393240868Spjd}
394240868Spjd
395240868Spjdvoid
396240868Spjdtrim_map_write_done(zio_t *zio)
397240868Spjd{
398240868Spjd	vdev_t *vd = zio->io_vd;
399240868Spjd	trim_map_t *tm = vd->vdev_trimmap;
400240868Spjd
401240868Spjd	/*
402240868Spjd	 * Don't check for vdev_notrim, since the write could have
403240868Spjd	 * started before vdev_notrim was set.
404240868Spjd	 */
405249921Ssmh	if (!zfs_trim_enabled || tm == NULL)
406240868Spjd		return;
407240868Spjd
408240868Spjd	mutex_enter(&tm->tm_lock);
409240868Spjd	/*
410240868Spjd	 * Don't fail if the write isn't in the tree, since the write
411240868Spjd	 * could have started after vdev_notrim was set.
412240868Spjd	 */
413240868Spjd	if (zio->io_trim_node.avl_child[0] ||
414240868Spjd	    zio->io_trim_node.avl_child[1] ||
415240868Spjd	    AVL_XPARENT(&zio->io_trim_node) ||
416240868Spjd	    tm->tm_inflight_writes.avl_root == &zio->io_trim_node)
417240868Spjd		avl_remove(&tm->tm_inflight_writes, zio);
418240868Spjd	mutex_exit(&tm->tm_lock);
419240868Spjd}
420240868Spjd
421240868Spjd/*
422248577Ssmh * Return the oldest segment (the one with the lowest txg / time) or NULL if:
423248577Ssmh * 1. The list is empty
424248577Ssmh * 2. The first element's txg is greater than txgsafe
425248577Ssmh * 3. The first element's txg is not greater than the txg argument and the
426248577Ssmh *    the first element's time is not greater than time argument
427240868Spjd */
428240868Spjdstatic trim_seg_t *
429248577Ssmhtrim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time)
430240868Spjd{
431240868Spjd	trim_seg_t *ts;
432240868Spjd
433240868Spjd	ASSERT(MUTEX_HELD(&tm->tm_lock));
434248577Ssmh	VERIFY(txgsafe >= txg);
435240868Spjd
436240868Spjd	ts = list_head(&tm->tm_head);
437248577Ssmh	if (ts != NULL && ts->ts_txg <= txgsafe &&
438248577Ssmh	    (ts->ts_txg <= txg || ts->ts_time <= time ||
439248577Ssmh	    tm->tm_bytes > trim_vdev_max_bytes ||
440248577Ssmh	    tm->tm_pending > trim_vdev_max_pending))
441240868Spjd		return (ts);
442240868Spjd	return (NULL);
443240868Spjd}
444240868Spjd
445240868Spjdstatic void
446240868Spjdtrim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
447240868Spjd{
448240868Spjd	trim_map_t *tm = vd->vdev_trimmap;
449240868Spjd	trim_seg_t *ts;
450270312Ssmh	uint64_t size, offset, txgtarget, txgsafe;
451248575Ssmh	hrtime_t timelimit;
452240868Spjd
453240868Spjd	ASSERT(vd->vdev_ops->vdev_op_leaf);
454240868Spjd
455240868Spjd	if (tm == NULL)
456240868Spjd		return;
457240868Spjd
458248577Ssmh	timelimit = gethrtime() - trim_timeout * NANOSEC;
459248575Ssmh	if (vd->vdev_isl2cache) {
460248577Ssmh		txgsafe = UINT64_MAX;
461248577Ssmh		txgtarget = UINT64_MAX;
462248575Ssmh	} else {
463248577Ssmh		txgsafe = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa));
464248577Ssmh		if (txgsafe > trim_txg_delay)
465248577Ssmh			txgtarget = txgsafe - trim_txg_delay;
466248577Ssmh		else
467248577Ssmh			txgtarget = 0;
468248575Ssmh	}
469240868Spjd
470240868Spjd	mutex_enter(&tm->tm_lock);
471248577Ssmh	/* Loop until we have sent all outstanding free's */
472248577Ssmh	while ((ts = trim_map_first(tm, txgtarget, txgsafe, timelimit))
473248577Ssmh	    != NULL) {
474240868Spjd		list_remove(&tm->tm_head, ts);
475240868Spjd		avl_remove(&tm->tm_queued_frees, ts);
476240868Spjd		avl_add(&tm->tm_inflight_frees, ts);
477248577Ssmh		size = ts->ts_end - ts->ts_start;
478270312Ssmh		offset = ts->ts_start;
479248577Ssmh		TRIM_MAP_SDEC(tm, size);
480248577Ssmh		TRIM_MAP_QDEC(tm);
481270312Ssmh		/*
482270312Ssmh		 * We drop the lock while we call zio_nowait as the IO
483270312Ssmh		 * scheduler can result in a different IO being run e.g.
484270312Ssmh		 * a write which would result in a recursive lock.
485270312Ssmh		 */
486270312Ssmh		mutex_exit(&tm->tm_lock);
487270312Ssmh
488270312Ssmh		zio_nowait(zio_trim(zio, spa, vd, offset, size));
489270312Ssmh
490270312Ssmh		mutex_enter(&tm->tm_lock);
491270312Ssmh		ts = trim_map_first(tm, txgtarget, txgsafe, timelimit);
492240868Spjd	}
493240868Spjd	mutex_exit(&tm->tm_lock);
494240868Spjd}
495240868Spjd
496240868Spjdstatic void
497240868Spjdtrim_map_vdev_commit_done(spa_t *spa, vdev_t *vd)
498240868Spjd{
499240868Spjd	trim_map_t *tm = vd->vdev_trimmap;
500240868Spjd	trim_seg_t *ts;
501240868Spjd	list_t pending_writes;
502240868Spjd	zio_t *zio;
503240868Spjd	uint64_t start, size;
504240868Spjd	void *cookie;
505240868Spjd
506240868Spjd	ASSERT(vd->vdev_ops->vdev_op_leaf);
507240868Spjd
508240868Spjd	if (tm == NULL)
509240868Spjd		return;
510240868Spjd
511240868Spjd	mutex_enter(&tm->tm_lock);
512240868Spjd	if (!avl_is_empty(&tm->tm_inflight_frees)) {
513240868Spjd		cookie = NULL;
514240868Spjd		while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees,
515240868Spjd		    &cookie)) != NULL) {
516240868Spjd			kmem_free(ts, sizeof (*ts));
517240868Spjd		}
518240868Spjd	}
519240868Spjd	list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t,
520240868Spjd	    io_trim_link));
521240868Spjd	list_move_tail(&pending_writes, &tm->tm_pending_writes);
522240868Spjd	mutex_exit(&tm->tm_lock);
523240868Spjd
524240868Spjd	while ((zio = list_remove_head(&pending_writes)) != NULL) {
525240868Spjd		zio_vdev_io_reissue(zio);
526240868Spjd		zio_execute(zio);
527240868Spjd	}
528240868Spjd	list_destroy(&pending_writes);
529240868Spjd}
530240868Spjd
531240868Spjdstatic void
532240868Spjdtrim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
533240868Spjd{
534240868Spjd	int c;
535240868Spjd
536248577Ssmh	if (vd == NULL)
537240868Spjd		return;
538240868Spjd
539240868Spjd	if (vd->vdev_ops->vdev_op_leaf) {
540240868Spjd		trim_map_vdev_commit(spa, zio, vd);
541240868Spjd	} else {
542240868Spjd		for (c = 0; c < vd->vdev_children; c++)
543240868Spjd			trim_map_commit(spa, zio, vd->vdev_child[c]);
544240868Spjd	}
545240868Spjd}
546240868Spjd
547240868Spjdstatic void
548240868Spjdtrim_map_commit_done(spa_t *spa, vdev_t *vd)
549240868Spjd{
550240868Spjd	int c;
551240868Spjd
552240868Spjd	if (vd == NULL)
553240868Spjd		return;
554240868Spjd
555240868Spjd	if (vd->vdev_ops->vdev_op_leaf) {
556240868Spjd		trim_map_vdev_commit_done(spa, vd);
557240868Spjd	} else {
558240868Spjd		for (c = 0; c < vd->vdev_children; c++)
559240868Spjd			trim_map_commit_done(spa, vd->vdev_child[c]);
560240868Spjd	}
561240868Spjd}
562240868Spjd
563240868Spjdstatic void
564240868Spjdtrim_thread(void *arg)
565240868Spjd{
566240868Spjd	spa_t *spa = arg;
567240868Spjd	zio_t *zio;
568240868Spjd
569248576Ssmh#ifdef _KERNEL
570248576Ssmh	(void) snprintf(curthread->td_name, sizeof(curthread->td_name),
571248576Ssmh	    "trim %s", spa_name(spa));
572248576Ssmh#endif
573248576Ssmh
574240868Spjd	for (;;) {
575240868Spjd		mutex_enter(&spa->spa_trim_lock);
576240868Spjd		if (spa->spa_trim_thread == NULL) {
577240868Spjd			spa->spa_trim_thread = curthread;
578240868Spjd			cv_signal(&spa->spa_trim_cv);
579240868Spjd			mutex_exit(&spa->spa_trim_lock);
580240868Spjd			thread_exit();
581240868Spjd		}
582248577Ssmh
583248577Ssmh		(void) cv_timedwait(&spa->spa_trim_cv, &spa->spa_trim_lock,
584248577Ssmh		    hz * trim_max_interval);
585240868Spjd		mutex_exit(&spa->spa_trim_lock);
586240868Spjd
587240868Spjd		zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
588240868Spjd
589240868Spjd		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
590240868Spjd		trim_map_commit(spa, zio, spa->spa_root_vdev);
591240868Spjd		(void) zio_wait(zio);
592240868Spjd		trim_map_commit_done(spa, spa->spa_root_vdev);
593240868Spjd		spa_config_exit(spa, SCL_STATE, FTAG);
594240868Spjd	}
595240868Spjd}
596240868Spjd
597240868Spjdvoid
598240868Spjdtrim_thread_create(spa_t *spa)
599240868Spjd{
600240868Spjd
601249921Ssmh	if (!zfs_trim_enabled)
602240868Spjd		return;
603240868Spjd
604240868Spjd	mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL);
605240868Spjd	cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL);
606240868Spjd	mutex_enter(&spa->spa_trim_lock);
607240868Spjd	spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0,
608240868Spjd	    TS_RUN, minclsyspri);
609240868Spjd	mutex_exit(&spa->spa_trim_lock);
610240868Spjd}
611240868Spjd
612240868Spjdvoid
613240868Spjdtrim_thread_destroy(spa_t *spa)
614240868Spjd{
615240868Spjd
616249921Ssmh	if (!zfs_trim_enabled)
617240868Spjd		return;
618240868Spjd	if (spa->spa_trim_thread == NULL)
619240868Spjd		return;
620240868Spjd
621240868Spjd	mutex_enter(&spa->spa_trim_lock);
622240868Spjd	/* Setting spa_trim_thread to NULL tells the thread to stop. */
623240868Spjd	spa->spa_trim_thread = NULL;
624240868Spjd	cv_signal(&spa->spa_trim_cv);
625240868Spjd	/* The thread will set it back to != NULL on exit. */
626240868Spjd	while (spa->spa_trim_thread == NULL)
627240868Spjd		cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
628240868Spjd	spa->spa_trim_thread = NULL;
629240868Spjd	mutex_exit(&spa->spa_trim_lock);
630240868Spjd
631240868Spjd	cv_destroy(&spa->spa_trim_cv);
632240868Spjd	mutex_destroy(&spa->spa_trim_lock);
633240868Spjd}
634240868Spjd
635240868Spjdvoid
636240868Spjdtrim_thread_wakeup(spa_t *spa)
637240868Spjd{
638240868Spjd
639249921Ssmh	if (!zfs_trim_enabled)
640240868Spjd		return;
641240868Spjd	if (spa->spa_trim_thread == NULL)
642240868Spjd		return;
643240868Spjd
644240868Spjd	mutex_enter(&spa->spa_trim_lock);
645240868Spjd	cv_signal(&spa->spa_trim_cv);
646240868Spjd	mutex_exit(&spa->spa_trim_lock);
647240868Spjd}
648