range_tree.c revision 262093
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25/*
26 * Copyright (c) 2013 by Delphix. All rights reserved.
27 */
28
29#include <sys/zfs_context.h>
30#include <sys/spa.h>
31#include <sys/dmu.h>
32#include <sys/dnode.h>
33#include <sys/zio.h>
34#include <sys/range_tree.h>
35
36static kmem_cache_t *range_seg_cache;
37
38void
39range_tree_init(void)
40{
41	ASSERT(range_seg_cache == NULL);
42	range_seg_cache = kmem_cache_create("range_seg_cache",
43	    sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
44}
45
46void
47range_tree_fini(void)
48{
49	kmem_cache_destroy(range_seg_cache);
50	range_seg_cache = NULL;
51}
52
53void
54range_tree_stat_verify(range_tree_t *rt)
55{
56	range_seg_t *rs;
57	uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
58	int i;
59
60	for (rs = avl_first(&rt->rt_root); rs != NULL;
61	    rs = AVL_NEXT(&rt->rt_root, rs)) {
62		uint64_t size = rs->rs_end - rs->rs_start;
63		int idx	= highbit(size) - 1;
64
65		hist[idx]++;
66		ASSERT3U(hist[idx], !=, 0);
67	}
68
69	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
70		if (hist[i] != rt->rt_histogram[i]) {
71			zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu",
72			    i, hist, hist[i], rt->rt_histogram[i]);
73		}
74		VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
75	}
76}
77
78static void
79range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
80{
81	uint64_t size = rs->rs_end - rs->rs_start;
82	int idx = highbit(size) - 1;
83
84	ASSERT3U(idx, <,
85	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
86
87	ASSERT(MUTEX_HELD(rt->rt_lock));
88	rt->rt_histogram[idx]++;
89	ASSERT3U(rt->rt_histogram[idx], !=, 0);
90}
91
92static void
93range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
94{
95	uint64_t size = rs->rs_end - rs->rs_start;
96	int idx = highbit(size) - 1;
97
98	ASSERT3U(idx, <,
99	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
100
101	ASSERT(MUTEX_HELD(rt->rt_lock));
102	ASSERT3U(rt->rt_histogram[idx], !=, 0);
103	rt->rt_histogram[idx]--;
104}
105
106/*
107 * NOTE: caller is responsible for all locking.
108 */
109static int
110range_tree_seg_compare(const void *x1, const void *x2)
111{
112	const range_seg_t *r1 = x1;
113	const range_seg_t *r2 = x2;
114
115	if (r1->rs_start < r2->rs_start) {
116		if (r1->rs_end > r2->rs_start)
117			return (0);
118		return (-1);
119	}
120	if (r1->rs_start > r2->rs_start) {
121		if (r1->rs_start < r2->rs_end)
122			return (0);
123		return (1);
124	}
125	return (0);
126}
127
128range_tree_t *
129range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
130{
131	range_tree_t *rt;
132
133	rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
134
135	avl_create(&rt->rt_root, range_tree_seg_compare,
136	    sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
137
138	rt->rt_lock = lp;
139	rt->rt_ops = ops;
140	rt->rt_arg = arg;
141
142	if (rt->rt_ops != NULL)
143		rt->rt_ops->rtop_create(rt, rt->rt_arg);
144
145	return (rt);
146}
147
148void
149range_tree_destroy(range_tree_t *rt)
150{
151	VERIFY0(rt->rt_space);
152
153	if (rt->rt_ops != NULL)
154		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
155
156	avl_destroy(&rt->rt_root);
157	kmem_free(rt, sizeof (*rt));
158}
159
160void
161range_tree_add(void *arg, uint64_t start, uint64_t size)
162{
163	range_tree_t *rt = arg;
164	avl_index_t where;
165	range_seg_t rsearch, *rs_before, *rs_after, *rs;
166	uint64_t end = start + size;
167	boolean_t merge_before, merge_after;
168
169	ASSERT(MUTEX_HELD(rt->rt_lock));
170	VERIFY(size != 0);
171
172	rsearch.rs_start = start;
173	rsearch.rs_end = end;
174	rs = avl_find(&rt->rt_root, &rsearch, &where);
175
176	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
177		zfs_panic_recover("zfs: allocating allocated segment"
178		    "(offset=%llu size=%llu)\n",
179		    (longlong_t)start, (longlong_t)size);
180		return;
181	}
182
183	/* Make sure we don't overlap with either of our neighbors */
184	VERIFY(rs == NULL);
185
186	rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
187	rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);
188
189	merge_before = (rs_before != NULL && rs_before->rs_end == start);
190	merge_after = (rs_after != NULL && rs_after->rs_start == end);
191
192	if (merge_before && merge_after) {
193		avl_remove(&rt->rt_root, rs_before);
194		if (rt->rt_ops != NULL) {
195			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
196			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
197		}
198
199		range_tree_stat_decr(rt, rs_before);
200		range_tree_stat_decr(rt, rs_after);
201
202		rs_after->rs_start = rs_before->rs_start;
203		kmem_cache_free(range_seg_cache, rs_before);
204		rs = rs_after;
205	} else if (merge_before) {
206		if (rt->rt_ops != NULL)
207			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
208
209		range_tree_stat_decr(rt, rs_before);
210
211		rs_before->rs_end = end;
212		rs = rs_before;
213	} else if (merge_after) {
214		if (rt->rt_ops != NULL)
215			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
216
217		range_tree_stat_decr(rt, rs_after);
218
219		rs_after->rs_start = start;
220		rs = rs_after;
221	} else {
222		rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
223		rs->rs_start = start;
224		rs->rs_end = end;
225		avl_insert(&rt->rt_root, rs, where);
226	}
227
228	if (rt->rt_ops != NULL)
229		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
230
231	range_tree_stat_incr(rt, rs);
232	rt->rt_space += size;
233}
234
235void
236range_tree_remove(void *arg, uint64_t start, uint64_t size)
237{
238	range_tree_t *rt = arg;
239	avl_index_t where;
240	range_seg_t rsearch, *rs, *newseg;
241	uint64_t end = start + size;
242	boolean_t left_over, right_over;
243
244	ASSERT(MUTEX_HELD(rt->rt_lock));
245	VERIFY3U(size, !=, 0);
246	VERIFY3U(size, <=, rt->rt_space);
247
248	rsearch.rs_start = start;
249	rsearch.rs_end = end;
250	rs = avl_find(&rt->rt_root, &rsearch, &where);
251
252	/* Make sure we completely overlap with someone */
253	if (rs == NULL) {
254		zfs_panic_recover("zfs: freeing free segment "
255		    "(offset=%llu size=%llu)",
256		    (longlong_t)start, (longlong_t)size);
257		return;
258	}
259	VERIFY3U(rs->rs_start, <=, start);
260	VERIFY3U(rs->rs_end, >=, end);
261
262	left_over = (rs->rs_start != start);
263	right_over = (rs->rs_end != end);
264
265	range_tree_stat_decr(rt, rs);
266
267	if (rt->rt_ops != NULL)
268		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
269
270	if (left_over && right_over) {
271		newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
272		newseg->rs_start = end;
273		newseg->rs_end = rs->rs_end;
274		range_tree_stat_incr(rt, newseg);
275
276		rs->rs_end = start;
277
278		avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
279		if (rt->rt_ops != NULL)
280			rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
281	} else if (left_over) {
282		rs->rs_end = start;
283	} else if (right_over) {
284		rs->rs_start = end;
285	} else {
286		avl_remove(&rt->rt_root, rs);
287		kmem_cache_free(range_seg_cache, rs);
288		rs = NULL;
289	}
290
291	if (rs != NULL) {
292		range_tree_stat_incr(rt, rs);
293
294		if (rt->rt_ops != NULL)
295			rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
296	}
297
298	rt->rt_space -= size;
299}
300
301static range_seg_t *
302range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size,
303    avl_index_t *wherep)
304{
305	range_seg_t rsearch, *rs;
306	uint64_t end = start + size;
307
308	ASSERT(MUTEX_HELD(rt->rt_lock));
309	VERIFY(size != 0);
310
311	rsearch.rs_start = start;
312	rsearch.rs_end = end;
313	rs = avl_find(&rt->rt_root, &rsearch, wherep);
314
315	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end)
316		return (rs);
317	return (NULL);
318}
319
320void
321range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
322{
323	range_seg_t *rs;
324	avl_index_t where;
325
326	mutex_enter(rt->rt_lock);
327	rs = range_tree_find(rt, off, size, &where);
328	if (rs != NULL)
329		panic("freeing free block; rs=%p", (void *)rs);
330	mutex_exit(rt->rt_lock);
331}
332
333boolean_t
334range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
335{
336	avl_index_t where;
337
338	return (range_tree_find(rt, start, size, &where) != NULL);
339}
340
341void
342range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
343{
344	range_tree_t *rt;
345
346	ASSERT(MUTEX_HELD((*rtsrc)->rt_lock));
347	ASSERT0(range_tree_space(*rtdst));
348	ASSERT0(avl_numnodes(&(*rtdst)->rt_root));
349
350	rt = *rtsrc;
351	*rtsrc = *rtdst;
352	*rtdst = rt;
353}
354
355void
356range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
357{
358	range_seg_t *rs;
359	void *cookie = NULL;
360
361	ASSERT(MUTEX_HELD(rt->rt_lock));
362
363	if (rt->rt_ops != NULL)
364		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
365
366	while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
367		if (func != NULL)
368			func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
369		kmem_cache_free(range_seg_cache, rs);
370	}
371
372	bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
373	rt->rt_space = 0;
374}
375
376void
377range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
378{
379	range_seg_t *rs;
380
381	ASSERT(MUTEX_HELD(rt->rt_lock));
382
383	for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
384		func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
385}
386
387uint64_t
388range_tree_space(range_tree_t *rt)
389{
390	return (rt->rt_space);
391}
392