range_tree.c revision 269773
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25/*
26 * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
27 */
28
29#include <sys/zfs_context.h>
30#include <sys/spa.h>
31#include <sys/dmu.h>
32#include <sys/dnode.h>
33#include <sys/zio.h>
34#include <sys/range_tree.h>
35
36static kmem_cache_t *range_seg_cache;
37
38void
39range_tree_init(void)
40{
41	ASSERT(range_seg_cache == NULL);
42	range_seg_cache = kmem_cache_create("range_seg_cache",
43	    sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
44}
45
46void
47range_tree_fini(void)
48{
49	kmem_cache_destroy(range_seg_cache);
50	range_seg_cache = NULL;
51}
52
53void
54range_tree_stat_verify(range_tree_t *rt)
55{
56	range_seg_t *rs;
57	uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
58	int i;
59
60	for (rs = avl_first(&rt->rt_root); rs != NULL;
61	    rs = AVL_NEXT(&rt->rt_root, rs)) {
62		uint64_t size = rs->rs_end - rs->rs_start;
63		int idx	= highbit64(size) - 1;
64
65		hist[idx]++;
66		ASSERT3U(hist[idx], !=, 0);
67	}
68
69	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
70		if (hist[i] != rt->rt_histogram[i]) {
71			zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu",
72			    i, hist, hist[i], rt->rt_histogram[i]);
73		}
74		VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
75	}
76}
77
78static void
79range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
80{
81	uint64_t size = rs->rs_end - rs->rs_start;
82	int idx = highbit64(size) - 1;
83
84	ASSERT(size != 0);
85	ASSERT3U(idx, <,
86	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
87
88	ASSERT(MUTEX_HELD(rt->rt_lock));
89	rt->rt_histogram[idx]++;
90	ASSERT3U(rt->rt_histogram[idx], !=, 0);
91}
92
93static void
94range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
95{
96	uint64_t size = rs->rs_end - rs->rs_start;
97	int idx = highbit64(size) - 1;
98
99	ASSERT(size != 0);
100	ASSERT3U(idx, <,
101	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
102
103	ASSERT(MUTEX_HELD(rt->rt_lock));
104	ASSERT3U(rt->rt_histogram[idx], !=, 0);
105	rt->rt_histogram[idx]--;
106}
107
108/*
109 * NOTE: caller is responsible for all locking.
110 */
111static int
112range_tree_seg_compare(const void *x1, const void *x2)
113{
114	const range_seg_t *r1 = x1;
115	const range_seg_t *r2 = x2;
116
117	if (r1->rs_start < r2->rs_start) {
118		if (r1->rs_end > r2->rs_start)
119			return (0);
120		return (-1);
121	}
122	if (r1->rs_start > r2->rs_start) {
123		if (r1->rs_start < r2->rs_end)
124			return (0);
125		return (1);
126	}
127	return (0);
128}
129
130range_tree_t *
131range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
132{
133	range_tree_t *rt;
134
135	rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
136
137	avl_create(&rt->rt_root, range_tree_seg_compare,
138	    sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
139
140	rt->rt_lock = lp;
141	rt->rt_ops = ops;
142	rt->rt_arg = arg;
143
144	if (rt->rt_ops != NULL)
145		rt->rt_ops->rtop_create(rt, rt->rt_arg);
146
147	return (rt);
148}
149
150void
151range_tree_destroy(range_tree_t *rt)
152{
153	VERIFY0(rt->rt_space);
154
155	if (rt->rt_ops != NULL)
156		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
157
158	avl_destroy(&rt->rt_root);
159	kmem_free(rt, sizeof (*rt));
160}
161
162void
163range_tree_add(void *arg, uint64_t start, uint64_t size)
164{
165	range_tree_t *rt = arg;
166	avl_index_t where;
167	range_seg_t rsearch, *rs_before, *rs_after, *rs;
168	uint64_t end = start + size;
169	boolean_t merge_before, merge_after;
170
171	ASSERT(MUTEX_HELD(rt->rt_lock));
172	VERIFY(size != 0);
173
174	rsearch.rs_start = start;
175	rsearch.rs_end = end;
176	rs = avl_find(&rt->rt_root, &rsearch, &where);
177
178	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
179		zfs_panic_recover("zfs: allocating allocated segment"
180		    "(offset=%llu size=%llu)\n",
181		    (longlong_t)start, (longlong_t)size);
182		return;
183	}
184
185	/* Make sure we don't overlap with either of our neighbors */
186	VERIFY(rs == NULL);
187
188	rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
189	rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);
190
191	merge_before = (rs_before != NULL && rs_before->rs_end == start);
192	merge_after = (rs_after != NULL && rs_after->rs_start == end);
193
194	if (merge_before && merge_after) {
195		avl_remove(&rt->rt_root, rs_before);
196		if (rt->rt_ops != NULL) {
197			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
198			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
199		}
200
201		range_tree_stat_decr(rt, rs_before);
202		range_tree_stat_decr(rt, rs_after);
203
204		rs_after->rs_start = rs_before->rs_start;
205		kmem_cache_free(range_seg_cache, rs_before);
206		rs = rs_after;
207	} else if (merge_before) {
208		if (rt->rt_ops != NULL)
209			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
210
211		range_tree_stat_decr(rt, rs_before);
212
213		rs_before->rs_end = end;
214		rs = rs_before;
215	} else if (merge_after) {
216		if (rt->rt_ops != NULL)
217			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
218
219		range_tree_stat_decr(rt, rs_after);
220
221		rs_after->rs_start = start;
222		rs = rs_after;
223	} else {
224		rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
225		rs->rs_start = start;
226		rs->rs_end = end;
227		avl_insert(&rt->rt_root, rs, where);
228	}
229
230	if (rt->rt_ops != NULL)
231		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
232
233	range_tree_stat_incr(rt, rs);
234	rt->rt_space += size;
235}
236
237void
238range_tree_remove(void *arg, uint64_t start, uint64_t size)
239{
240	range_tree_t *rt = arg;
241	avl_index_t where;
242	range_seg_t rsearch, *rs, *newseg;
243	uint64_t end = start + size;
244	boolean_t left_over, right_over;
245
246	ASSERT(MUTEX_HELD(rt->rt_lock));
247	VERIFY3U(size, !=, 0);
248	VERIFY3U(size, <=, rt->rt_space);
249
250	rsearch.rs_start = start;
251	rsearch.rs_end = end;
252	rs = avl_find(&rt->rt_root, &rsearch, &where);
253
254	/* Make sure we completely overlap with someone */
255	if (rs == NULL) {
256		zfs_panic_recover("zfs: freeing free segment "
257		    "(offset=%llu size=%llu)",
258		    (longlong_t)start, (longlong_t)size);
259		return;
260	}
261	VERIFY3U(rs->rs_start, <=, start);
262	VERIFY3U(rs->rs_end, >=, end);
263
264	left_over = (rs->rs_start != start);
265	right_over = (rs->rs_end != end);
266
267	range_tree_stat_decr(rt, rs);
268
269	if (rt->rt_ops != NULL)
270		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
271
272	if (left_over && right_over) {
273		newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
274		newseg->rs_start = end;
275		newseg->rs_end = rs->rs_end;
276		range_tree_stat_incr(rt, newseg);
277
278		rs->rs_end = start;
279
280		avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
281		if (rt->rt_ops != NULL)
282			rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
283	} else if (left_over) {
284		rs->rs_end = start;
285	} else if (right_over) {
286		rs->rs_start = end;
287	} else {
288		avl_remove(&rt->rt_root, rs);
289		kmem_cache_free(range_seg_cache, rs);
290		rs = NULL;
291	}
292
293	if (rs != NULL) {
294		range_tree_stat_incr(rt, rs);
295
296		if (rt->rt_ops != NULL)
297			rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
298	}
299
300	rt->rt_space -= size;
301}
302
303static range_seg_t *
304range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
305{
306	avl_index_t where;
307	range_seg_t rsearch;
308	uint64_t end = start + size;
309
310	ASSERT(MUTEX_HELD(rt->rt_lock));
311	VERIFY(size != 0);
312
313	rsearch.rs_start = start;
314	rsearch.rs_end = end;
315	return (avl_find(&rt->rt_root, &rsearch, &where));
316}
317
318static range_seg_t *
319range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
320{
321	range_seg_t *rs = range_tree_find_impl(rt, start, size);
322	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size)
323		return (rs);
324	return (NULL);
325}
326
327void
328range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
329{
330	range_seg_t *rs;
331
332	mutex_enter(rt->rt_lock);
333	rs = range_tree_find(rt, off, size);
334	if (rs != NULL)
335		panic("freeing free block; rs=%p", (void *)rs);
336	mutex_exit(rt->rt_lock);
337}
338
339boolean_t
340range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
341{
342	return (range_tree_find(rt, start, size) != NULL);
343}
344
345/*
346 * Ensure that this range is not in the tree, regardless of whether
347 * it is currently in the tree.
348 */
349void
350range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size)
351{
352	range_seg_t *rs;
353
354	while ((rs = range_tree_find_impl(rt, start, size)) != NULL) {
355		uint64_t free_start = MAX(rs->rs_start, start);
356		uint64_t free_end = MIN(rs->rs_end, start + size);
357		range_tree_remove(rt, free_start, free_end - free_start);
358	}
359}
360
361void
362range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
363{
364	range_tree_t *rt;
365
366	ASSERT(MUTEX_HELD((*rtsrc)->rt_lock));
367	ASSERT0(range_tree_space(*rtdst));
368	ASSERT0(avl_numnodes(&(*rtdst)->rt_root));
369
370	rt = *rtsrc;
371	*rtsrc = *rtdst;
372	*rtdst = rt;
373}
374
375void
376range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
377{
378	range_seg_t *rs;
379	void *cookie = NULL;
380
381	ASSERT(MUTEX_HELD(rt->rt_lock));
382
383	if (rt->rt_ops != NULL)
384		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
385
386	while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
387		if (func != NULL)
388			func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
389		kmem_cache_free(range_seg_cache, rs);
390	}
391
392	bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
393	rt->rt_space = 0;
394}
395
396void
397range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
398{
399	range_seg_t *rs;
400
401	ASSERT(MUTEX_HELD(rt->rt_lock));
402
403	for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
404		func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
405}
406
407uint64_t
408range_tree_space(range_tree_t *rt)
409{
410	return (rt->rt_space);
411}
412