dmu_zfetch.c revision 262167
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2013 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dnode.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_zfetch.h>
#include <sys/dmu.h>
#include <sys/dbuf.h>
#include <sys/kstat.h>

/*
 * I'm against tune-ables, but these should probably exist as tweakable globals
 * until we can get this working the way we want it to.
 */

int zfs_prefetch_disable = 0;

/* max # of streams per zfetch */
uint32_t	zfetch_max_streams = 8;
/* min time before stream reclaim */
uint32_t	zfetch_min_sec_reap = 2;
/* max number of blocks to fetch at a time */
uint32_t	zfetch_block_cap = 256;
/* number of bytes in an array_read at which we stop prefetching (1MB) */
uint64_t	zfetch_array_rd_sz = 1024 * 1024;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RW,
    &zfs_prefetch_disable, 0, "Disable prefetch");
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH");
TUNABLE_INT("vfs.zfs.zfetch.max_streams", &zfetch_max_streams);
SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RW,
    &zfetch_max_streams, 0, "Max # of streams per zfetch");
TUNABLE_INT("vfs.zfs.zfetch.min_sec_reap", &zfetch_min_sec_reap);
SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RDTUN,
    &zfetch_min_sec_reap, 0, "Min time before stream reclaim");
TUNABLE_INT("vfs.zfs.zfetch.block_cap", &zfetch_block_cap);
SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, block_cap, CTLFLAG_RDTUN,
    &zfetch_block_cap, 0, "Max number of blocks to fetch at a time");
TUNABLE_QUAD("vfs.zfs.zfetch.array_rd_sz", &zfetch_array_rd_sz);
SYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RDTUN,
    &zfetch_array_rd_sz, 0,
    "Number of bytes in an array_read at which we stop prefetching");
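
/*
 * Illustrative usage (hypothetical values): the CTLFLAG_RW knobs above can
 * be changed at runtime, e.g. "sysctl vfs.zfs.prefetch_disable=1", while the
 * knobs registered with TUNABLE_INT/TUNABLE_QUAD can be set at boot from
 * /boot/loader.conf, e.g. vfs.zfs.zfetch.block_cap="256".
 */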

/* forward decls for static routines */
static boolean_t	dmu_zfetch_colinear(zfetch_t *, zstream_t *);
static void		dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
static uint64_t		dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
static uint64_t		dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
static boolean_t	dmu_zfetch_find(zfetch_t *, zstream_t *, int);
static int		dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
static zstream_t	*dmu_zfetch_stream_reclaim(zfetch_t *);
static void		dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
static int		dmu_zfetch_streams_equal(zstream_t *, zstream_t *);

typedef struct zfetch_stats {
	kstat_named_t zfetchstat_hits;
	kstat_named_t zfetchstat_misses;
	kstat_named_t zfetchstat_colinear_hits;
	kstat_named_t zfetchstat_colinear_misses;
	kstat_named_t zfetchstat_stride_hits;
	kstat_named_t zfetchstat_stride_misses;
	kstat_named_t zfetchstat_reclaim_successes;
	kstat_named_t zfetchstat_reclaim_failures;
	kstat_named_t zfetchstat_stream_resets;
	kstat_named_t zfetchstat_stream_noresets;
	kstat_named_t zfetchstat_bogus_streams;
} zfetch_stats_t;

static zfetch_stats_t zfetch_stats = {
	{ "hits",			KSTAT_DATA_UINT64 },
	{ "misses",			KSTAT_DATA_UINT64 },
	{ "colinear_hits",		KSTAT_DATA_UINT64 },
	{ "colinear_misses",		KSTAT_DATA_UINT64 },
	{ "stride_hits",		KSTAT_DATA_UINT64 },
	{ "stride_misses",		KSTAT_DATA_UINT64 },
	{ "reclaim_successes",		KSTAT_DATA_UINT64 },
	{ "reclaim_failures",		KSTAT_DATA_UINT64 },
	{ "streams_resets",		KSTAT_DATA_UINT64 },
	{ "streams_noresets",		KSTAT_DATA_UINT64 },
	{ "bogus_streams",		KSTAT_DATA_UINT64 },
};

#define	ZFETCHSTAT_INCR(stat, val) \
	atomic_add_64(&zfetch_stats.stat.value.ui64, (val));

#define	ZFETCHSTAT_BUMP(stat)		ZFETCHSTAT_INCR(stat, 1);

kstat_t		*zfetch_ksp;

/*
 * Given a zfetch structure and a zstream structure, determine whether the
 * blocks to be read are part of a co-linear pair of existing prefetch
 * streams.  If a set is found, coalesce the streams, removing one, and
 * configure the prefetch so it looks for a strided access pattern.
 *
 * In other words: if we find two sequential access streams that are
 * the same length and distance N apart, and this read is N from the
 * last stream, then we are probably in a strided access pattern.  So
 * combine the two sequential streams into a single strided stream.
 *
 * Returns whether co-linear streams were found.
 */
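/*
 * Illustrative example (hypothetical numbers): given two sequential streams
 * of 4 blocks each (zst_len == zst_stride == 4) starting at blocks 0 and
 * 100, a new read at block 200 causes them to be merged into a single
 * forward stream with zst_offset = 200 and zst_stride = 100.
 */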
static boolean_t
dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
{
	zstream_t	*z_walk;
	zstream_t	*z_comp;

	if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
		return (0);

	if (zh == NULL) {
		rw_exit(&zf->zf_rwlock);
		return (0);
	}

	for (z_walk = list_head(&zf->zf_stream); z_walk;
	    z_walk = list_next(&zf->zf_stream, z_walk)) {
		for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
		    z_comp = list_next(&zf->zf_stream, z_comp)) {
			int64_t		diff;

			if (z_walk->zst_len != z_walk->zst_stride ||
			    z_comp->zst_len != z_comp->zst_stride) {
				continue;
			}

			diff = z_comp->zst_offset - z_walk->zst_offset;
			if (z_comp->zst_offset + diff == zh->zst_offset) {
				z_walk->zst_offset = zh->zst_offset;
				z_walk->zst_direction = diff < 0 ? -1 : 1;
				z_walk->zst_stride =
				    diff * z_walk->zst_direction;
				z_walk->zst_ph_offset =
				    zh->zst_offset + z_walk->zst_stride;
				dmu_zfetch_stream_remove(zf, z_comp);
				mutex_destroy(&z_comp->zst_lock);
				kmem_free(z_comp, sizeof (zstream_t));

				dmu_zfetch_dofetch(zf, z_walk);

				rw_exit(&zf->zf_rwlock);
				return (1);
			}

			diff = z_walk->zst_offset - z_comp->zst_offset;
			if (z_walk->zst_offset + diff == zh->zst_offset) {
				z_walk->zst_offset = zh->zst_offset;
				z_walk->zst_direction = diff < 0 ? -1 : 1;
				z_walk->zst_stride =
				    diff * z_walk->zst_direction;
				z_walk->zst_ph_offset =
				    zh->zst_offset + z_walk->zst_stride;
				dmu_zfetch_stream_remove(zf, z_comp);
				mutex_destroy(&z_comp->zst_lock);
				kmem_free(z_comp, sizeof (zstream_t));

				dmu_zfetch_dofetch(zf, z_walk);

				rw_exit(&zf->zf_rwlock);
				return (1);
			}
		}
	}

	rw_exit(&zf->zf_rwlock);
	return (0);
}

/*
 * Given a zstream_t, determine the bounds of the prefetch.  Then call the
 * routine that actually prefetches the individual blocks.
 */
static void
dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
{
	uint64_t	prefetch_tail;
	uint64_t	prefetch_limit;
	uint64_t	prefetch_ofst;
	uint64_t	prefetch_len;
	uint64_t	blocks_fetched;

	zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
	zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);

	prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
	    (int64_t)(zs->zst_offset + zs->zst_stride));
	/*
	 * XXX: use a faster division method?
	 */
	prefetch_limit = zs->zst_offset + zs->zst_len +
	    (zs->zst_cap * zs->zst_stride) / zs->zst_len;
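	/*
	 * The limit computed above lets the stream run roughly
	 * zst_cap / zst_len segments (of zst_len blocks each, spaced
	 * zst_stride blocks apart) ahead of the current offset.
	 */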

	while (prefetch_tail < prefetch_limit) {
		prefetch_ofst = zs->zst_offset + zs->zst_direction *
		    (prefetch_tail - zs->zst_offset);

		prefetch_len = zs->zst_len;

		/*
		 * Don't prefetch past the beginning of the file (block 0)
		 * when working backwards.
		 */
		if ((zs->zst_direction == ZFETCH_BACKWARD) &&
		    (prefetch_ofst > prefetch_tail)) {
			prefetch_len += prefetch_ofst;
			prefetch_ofst = 0;
		}

		/* don't prefetch more than we're supposed to */
		if (prefetch_len > zs->zst_len)
			break;

		blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
		    prefetch_ofst, zs->zst_len);

		prefetch_tail += zs->zst_stride;
		/* stop if we've run out of stuff to prefetch */
		if (blocks_fetched < zs->zst_len)
			break;
	}
	zs->zst_ph_offset = prefetch_tail;
	zs->zst_last = ddi_get_lbolt();
}

void
zfetch_init(void)
{

	zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (zfetch_ksp != NULL) {
		zfetch_ksp->ks_data = &zfetch_stats;
		kstat_install(zfetch_ksp);
	}
}

void
zfetch_fini(void)
{
	if (zfetch_ksp != NULL) {
		kstat_delete(zfetch_ksp);
		zfetch_ksp = NULL;
	}
}

/*
 * This takes a pointer to a zfetch structure and a dnode.  It performs the
 * necessary setup for the zfetch structure, grokking data from the
 * associated dnode.
 */
void
dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
{
	if (zf == NULL) {
		return;
	}

	zf->zf_dnode = dno;
	zf->zf_stream_cnt = 0;
	zf->zf_alloc_fail = 0;

	list_create(&zf->zf_stream, sizeof (zstream_t),
	    offsetof(zstream_t, zst_node));

	rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
}

/*
 * This function computes the actual size, in blocks, that can be prefetched,
 * and fetches it.
 */
static uint64_t
dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
{
	uint64_t	fetchsz;
	uint64_t	i;

	fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);

	for (i = 0; i < fetchsz; i++) {
		dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_ASYNC_READ);
	}

	return (fetchsz);
}

/*
 * This function returns the number of blocks that would be prefetched, based
 * upon the supplied dnode, blockid, and nblks.  This is used so that we can
 * update streams in place, and then prefetch with their old value after the
 * fact.  This way, we can delay the prefetch, but subsequent accesses to the
 * stream won't result in the same data being prefetched multiple times.
 */
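/*
 * For example, with dn_maxblkid == 10, a request for 5 blocks starting at
 * blkid 8 is clamped to 3 blocks (8 through 10), and any request starting
 * beyond block 10 returns 0.
 */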
static uint64_t
dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
{
	uint64_t	fetchsz;

	if (blkid > dn->dn_maxblkid) {
		return (0);
	}

	/* compute fetch size */
	if (blkid + nblks + 1 > dn->dn_maxblkid) {
		fetchsz = (dn->dn_maxblkid - blkid) + 1;
		ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid);
	} else {
		fetchsz = nblks;
	}


	return (fetchsz);
}

/*
 * Given a zfetch and a zstream structure, see if there is an associated zstream
 * for this block read.  If so, it starts a prefetch for the stream it
 * located and returns true; otherwise it returns false.
 */
static boolean_t
dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
{
	zstream_t	*zs;
	int64_t		diff;
	int		reset = !prefetched;
	int		rc = 0;

	if (zh == NULL)
		return (0);

	/*
	 * XXX: This locking strategy is a bit coarse; however, its impact has
	 * yet to be tested.  If this turns out to be an issue, it can be
	 * modified in a number of different ways.
	 */

	rw_enter(&zf->zf_rwlock, RW_READER);
top:

	for (zs = list_head(&zf->zf_stream); zs;
	    zs = list_next(&zf->zf_stream, zs)) {

		/*
		 * XXX - should this be an assert?
		 */
		if (zs->zst_len == 0) {
			/* bogus stream */
			ZFETCHSTAT_BUMP(zfetchstat_bogus_streams);
			continue;
		}

		/*
		 * We hit this case when we are in a strided prefetch stream:
		 * we will read "len" blocks before "striding".
		 */
		if (zh->zst_offset >= zs->zst_offset &&
		    zh->zst_offset < zs->zst_offset + zs->zst_len) {
			if (prefetched) {
				/* already fetched */
				ZFETCHSTAT_BUMP(zfetchstat_stride_hits);
				rc = 1;
				goto out;
			} else {
				ZFETCHSTAT_BUMP(zfetchstat_stride_misses);
			}
		}

		/*
		 * This is the forward sequential read case: we increment
		 * len by one each time we hit here, so we will enter this
		 * case on every read.
		 */
		if (zh->zst_offset == zs->zst_offset + zs->zst_len) {

			reset = !prefetched && zs->zst_len > 1;

			if (mutex_tryenter(&zs->zst_lock) == 0) {
				rc = 1;
				goto out;
			}

			if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}
			zs->zst_len += zh->zst_len;
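			/*
			 * If the stream has grown past zfetch_block_cap
			 * blocks, slide its window forward so that at most
			 * zfetch_block_cap blocks remain tracked.
			 */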
			diff = zs->zst_len - zfetch_block_cap;
			if (diff > 0) {
				zs->zst_offset += diff;
				zs->zst_len = zs->zst_len > diff ?
				    zs->zst_len - diff : 0;
			}
			zs->zst_direction = ZFETCH_FORWARD;

			break;

		/*
		 * Same as above, but reading backwards through the file.
		 */
		} else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
			/* backwards sequential access */

			reset = !prefetched && zs->zst_len > 1;

			if (mutex_tryenter(&zs->zst_lock) == 0) {
				rc = 1;
				goto out;
			}

			if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			zs->zst_offset = zs->zst_offset > zh->zst_len ?
			    zs->zst_offset - zh->zst_len : 0;
			zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
			    zs->zst_ph_offset - zh->zst_len : 0;
			zs->zst_len += zh->zst_len;

			diff = zs->zst_len - zfetch_block_cap;
			if (diff > 0) {
				zs->zst_ph_offset = zs->zst_ph_offset > diff ?
				    zs->zst_ph_offset - diff : 0;
				zs->zst_len = zs->zst_len > diff ?
				    zs->zst_len - diff : zs->zst_len;
			}
			zs->zst_direction = ZFETCH_BACKWARD;

			break;

		} else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
			/* strided forward access */
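			/*
			 * The unsigned comparison above accepts reads that
			 * land in the next expected segment, i.e. within
			 * zst_len blocks of zst_offset + zst_stride; earlier
			 * offsets wrap around and fail the test.
			 */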

			if (mutex_tryenter(&zs->zst_lock) == 0) {
				rc = 1;
				goto out;
			}

			if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			zs->zst_offset += zs->zst_stride;
			zs->zst_direction = ZFETCH_FORWARD;

			break;

		} else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
			/* strided reverse access */

			if (mutex_tryenter(&zs->zst_lock) == 0) {
				rc = 1;
				goto out;
			}

			if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			zs->zst_offset = zs->zst_offset > zs->zst_stride ?
			    zs->zst_offset - zs->zst_stride : 0;
			zs->zst_ph_offset = (zs->zst_ph_offset >
			    (2 * zs->zst_stride)) ?
			    (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
			zs->zst_direction = ZFETCH_BACKWARD;

			break;
		}
	}

	if (zs) {
		if (reset) {
			zstream_t *remove = zs;

			ZFETCHSTAT_BUMP(zfetchstat_stream_resets);
			rc = 0;
			mutex_exit(&zs->zst_lock);
			rw_exit(&zf->zf_rwlock);
			rw_enter(&zf->zf_rwlock, RW_WRITER);
			/*
			 * Relocate the stream, in case someone removes
			 * it while we were acquiring the WRITER lock.
			 */
			for (zs = list_head(&zf->zf_stream); zs;
			    zs = list_next(&zf->zf_stream, zs)) {
				if (zs == remove) {
					dmu_zfetch_stream_remove(zf, zs);
					mutex_destroy(&zs->zst_lock);
					kmem_free(zs, sizeof (zstream_t));
					break;
				}
			}
		} else {
			ZFETCHSTAT_BUMP(zfetchstat_stream_noresets);
			rc = 1;
			dmu_zfetch_dofetch(zf, zs);
			mutex_exit(&zs->zst_lock);
		}
	}
out:
	rw_exit(&zf->zf_rwlock);
	return (rc);
}

/*
 * Clean-up state associated with a zfetch structure.  This frees allocated
 * structure members, empties the zf_stream list, and generally makes things
 * nice.  This doesn't free the zfetch_t itself; that's left to the caller.
 */
void
dmu_zfetch_rele(zfetch_t *zf)
{
	zstream_t	*zs;
	zstream_t	*zs_next;

	ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));

	for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) {
		zs_next = list_next(&zf->zf_stream, zs);

		list_remove(&zf->zf_stream, zs);
		mutex_destroy(&zs->zst_lock);
		kmem_free(zs, sizeof (zstream_t));
	}
	list_destroy(&zf->zf_stream);
	rw_destroy(&zf->zf_rwlock);

	zf->zf_dnode = NULL;
}

/*
 * Given a zfetch and zstream structure, insert the zstream structure into the
 * stream list contained within the zfetch structure.  Perform the appropriate
 * book-keeping.  It is possible that another thread has inserted a stream which
 * matches one that we are about to insert, so we must be sure to check for this
 * case.  If one is found, return failure, and let the caller clean up the
 * duplicates.
 */
static int
dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
{
	zstream_t	*zs_walk;
	zstream_t	*zs_next;

	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));

	for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) {
		zs_next = list_next(&zf->zf_stream, zs_walk);

		if (dmu_zfetch_streams_equal(zs_walk, zs)) {
			return (0);
		}
	}

	list_insert_head(&zf->zf_stream, zs);
	zf->zf_stream_cnt++;
	return (1);
}


/*
 * Walk the list of zstreams in the given zfetch, find an old one (by time), and
 * reclaim it for use by the caller.
 */
static zstream_t *
dmu_zfetch_stream_reclaim(zfetch_t *zf)
{
	zstream_t	*zs;
	clock_t		ticks;

	ticks = zfetch_min_sec_reap * hz;
	if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
		return (0);

	for (zs = list_head(&zf->zf_stream); zs;
	    zs = list_next(&zf->zf_stream, zs)) {

		if (ddi_get_lbolt() - zs->zst_last > ticks)
			break;
	}

	if (zs) {
		dmu_zfetch_stream_remove(zf, zs);
		mutex_destroy(&zs->zst_lock);
		bzero(zs, sizeof (zstream_t));
	} else {
		zf->zf_alloc_fail++;
	}
	rw_exit(&zf->zf_rwlock);

	return (zs);
}

/*
 * Given a zfetch and zstream structure, remove the zstream structure from its
 * container in the zfetch structure.  Perform the appropriate book-keeping.
 */
static void
dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
{
	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));

	list_remove(&zf->zf_stream, zs);
	zf->zf_stream_cnt--;
}

static int
dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
{
	if (zs1->zst_offset != zs2->zst_offset)
		return (0);

	if (zs1->zst_len != zs2->zst_len)
		return (0);

	if (zs1->zst_stride != zs2->zst_stride)
		return (0);

	if (zs1->zst_ph_offset != zs2->zst_ph_offset)
		return (0);

	if (zs1->zst_cap != zs2->zst_cap)
		return (0);

	if (zs1->zst_direction != zs2->zst_direction)
		return (0);

	return (1);
}

/*
 * This is the prefetch entry point.  It calls all of the other dmu_zfetch
 * routines to create, delete, find, or operate upon prefetch streams.
 */
void
dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
{
	zstream_t	zst;
	zstream_t	*newstream;
	boolean_t	fetched;
	int		inserted;
	unsigned int	blkshft;
	uint64_t	blksz;

	if (zfs_prefetch_disable)
		return;

	/* files that aren't ln2 blocksz are only one block -- nothing to do */
	if (!zf->zf_dnode->dn_datablkshift)
		return;

	/* convert offset and size, into blockid and nblocks */
	blkshft = zf->zf_dnode->dn_datablkshift;
	blksz = (1 << blkshft);

	bzero(&zst, sizeof (zstream_t));
	zst.zst_offset = offset >> blkshft;
	zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
	    P2ALIGN(offset, blksz)) >> blkshft;
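	/*
	 * For example, with a 128KB block size (blkshft == 17), a 64KB read
	 * at file offset 96KB covers blocks 0 and 1, so zst_offset == 0 and
	 * zst_len == 2.
	 */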

	fetched = dmu_zfetch_find(zf, &zst, prefetched);
	if (fetched) {
		ZFETCHSTAT_BUMP(zfetchstat_hits);
	} else {
		ZFETCHSTAT_BUMP(zfetchstat_misses);
		fetched = dmu_zfetch_colinear(zf, &zst);
		if (fetched) {
			ZFETCHSTAT_BUMP(zfetchstat_colinear_hits);
		} else {
			ZFETCHSTAT_BUMP(zfetchstat_colinear_misses);
		}
	}

	if (!fetched) {
		newstream = dmu_zfetch_stream_reclaim(zf);

		/*
		 * we still couldn't find a stream, drop the lock, and allocate
		 * one if possible.  Otherwise, give up and go home.
		 */
		if (newstream) {
			ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes);
		} else {
			uint64_t	maxblocks;
			uint32_t	max_streams;
			uint32_t	cur_streams;

			ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures);
			cur_streams = zf->zf_stream_cnt;
			maxblocks = zf->zf_dnode->dn_maxblkid;

			max_streams = MIN(zfetch_max_streams,
			    (maxblocks / zfetch_block_cap));
			if (max_streams == 0) {
				max_streams++;
			}

			if (cur_streams >= max_streams) {
				return;
			}
			newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
		}

		newstream->zst_offset = zst.zst_offset;
		newstream->zst_len = zst.zst_len;
		newstream->zst_stride = zst.zst_len;
		newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
		newstream->zst_cap = zst.zst_len;
		newstream->zst_direction = ZFETCH_FORWARD;
		newstream->zst_last = ddi_get_lbolt();

		mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);

		rw_enter(&zf->zf_rwlock, RW_WRITER);
		inserted = dmu_zfetch_stream_insert(zf, newstream);
		rw_exit(&zf->zf_rwlock);

		if (!inserted) {
			mutex_destroy(&newstream->zst_lock);
			kmem_free(newstream, sizeof (zstream_t));
		}
	}
}