nfs_fha.c revision 261048
/*-
 * Copyright (c) 2008 Isilon Inc http://www.isilon.com/
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/nfs/nfs_fha.c 261048 2014-01-22 23:47:29Z mav $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mbuf.h>
#include <sys/sbuf.h>

#include <rpc/rpc.h>
#include <nfs/nfs_fha.h>

static MALLOC_DEFINE(M_NFS_FHA, "NFS FHA", "NFS FHA");

/*
 * XXX need to commonize definitions between old and new NFS code.  Define
 * this here so we don't include one nfsproto.h over the other.
 */
#define	NFS_PROG		100003

void
fha_init(struct fha_params *softc)
{
	char tmpstr[128];

	/*
	 * A small hash table to map filehandles to fha_hash_entry
	 * structures.
	 */
	softc->g_fha.hashtable = hashinit(256, M_NFS_FHA,
	    &softc->g_fha.hashmask);

	/*
	 * Set the default tuning parameters.
	 */
	softc->ctls.enable = FHA_DEF_ENABLE;
	softc->ctls.bin_shift = FHA_DEF_BIN_SHIFT;
	softc->ctls.max_nfsds_per_fh = FHA_DEF_MAX_NFSDS_PER_FH;
	softc->ctls.max_reqs_per_nfsd = FHA_DEF_MAX_REQS_PER_NFSD;

	/*
	 * Allow the user to override the defaults at boot time with
	 * tunables.
	 */
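	/*
	 * The tunable names are derived from server_name; for example
	 * (illustrative, assuming a server_name of "nfsd") they would be
	 * vfs.nfsd.fha.enable, vfs.nfsd.fha.bin_shift,
	 * vfs.nfsd.fha.max_nfsds_per_fh and vfs.nfsd.fha.max_reqs_per_nfsd,
	 * settable from loader.conf(5).  The sysctls added below expose the
	 * same names for runtime adjustment.
	 */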
	snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.enable",
	    softc->server_name);
	TUNABLE_INT_FETCH(tmpstr, &softc->ctls.enable);
	snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.bin_shift",
	    softc->server_name);
	TUNABLE_INT_FETCH(tmpstr, &softc->ctls.bin_shift);
	snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.max_nfsds_per_fh",
	    softc->server_name);
	TUNABLE_INT_FETCH(tmpstr, &softc->ctls.max_nfsds_per_fh);
	snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.max_reqs_per_nfsd",
	    softc->server_name);
	TUNABLE_INT_FETCH(tmpstr, &softc->ctls.max_reqs_per_nfsd);

	/*
	 * Add sysctls so the user can change the tuning parameters at
	 * runtime.
	 */
	SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
	    OID_AUTO, "enable", CTLFLAG_RW,
	    &softc->ctls.enable, 0, "Enable NFS File Handle Affinity (FHA)");

	SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
	    OID_AUTO, "bin_shift", CTLFLAG_RW,
	    &softc->ctls.bin_shift, 0, "For FHA reads, no two requests will "
	    "contend if they're 2^(bin_shift) bytes apart");

	SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
	    OID_AUTO, "max_nfsds_per_fh", CTLFLAG_RW,
	    &softc->ctls.max_nfsds_per_fh, 0, "Maximum nfsd threads that "
	    "should be working on requests for the same file handle");

	SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
	    OID_AUTO, "max_reqs_per_nfsd", CTLFLAG_RW,
	    &softc->ctls.max_reqs_per_nfsd, 0, "Maximum requests that "
	    "a single nfsd thread should be working on at any time");

	SYSCTL_ADD_OID(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
	    OID_AUTO, "fhe_stats", CTLTYPE_STRING | CTLFLAG_RD, 0, 0,
	    softc->callbacks.fhe_stats_sysctl, "A", "");

}

void
fha_uninit(struct fha_params *softc)
{
	sysctl_ctx_free(&softc->sysctl_ctx);
	hashdestroy(softc->g_fha.hashtable, M_NFS_FHA, softc->g_fha.hashmask);
}

/*
 * This specifies that offsets should obey affinity when they fall within
 * the same 2^bin_shift byte chunk of the file (see the bin_shift sysctl).
 * Offsets are currently extracted for reads and writes only.
 */
static void
fha_extract_info(struct svc_req *req, struct fha_info *i,
    struct fha_callbacks *cb)
{
	struct mbuf *md;
	fhandle_t fh;
	caddr_t dpos;
	static u_int64_t random_fh = 0;
	int error;
	int v3 = (req->rq_vers == 3);
	rpcproc_t procnum;

	/*
	 * We start off with a random fh.  If we get a reasonable
	 * procnum, we set the fh.  If there's a concept of offset
	 * that we're interested in, we set that.
	 */
	i->fh = ++random_fh;
	i->offset = 0;
	i->locktype = LK_EXCLUSIVE;

	/*
	 * Extract the procnum and convert to v3 form if necessary,
	 * taking care to deal with out-of-range procnums.  Caller will
	 * ensure that rq_vers is either 2 or 3.
	 */
	procnum = req->rq_proc;
	if (!v3) {
		rpcproc_t tmp_procnum;

		tmp_procnum = cb->get_procnum(procnum);
		if (tmp_procnum == -1)
			goto out;
		procnum = tmp_procnum;
	}

	/*
	 * We do affinity for most operations.  However, we divide a realm
	 * of affinity by file offset so as to allow for concurrent random
	 * access.  Today this is done for reads and writes only.
	 */
	if (cb->no_offset(procnum))
		goto out;

	error = cb->realign(&req->rq_args, M_NOWAIT);
	if (error)
		goto out;
	md = req->rq_args;
	dpos = mtod(md, caddr_t);

	/* Grab the filehandle. */
	error = cb->get_fh(&fh, v3, &md, &dpos);
	if (error)
		goto out;

	bcopy(fh.fh_fid.fid_data, &i->fh, sizeof(i->fh));

	/* Content ourselves with zero offset for all but reads and writes. */
	if (cb->is_read(procnum) || cb->is_write(procnum))
		cb->get_offset(&md, &dpos, v3, i);

out:
	cb->set_locktype(procnum, i);
}

static struct fha_hash_entry *
fha_hash_entry_new(u_int64_t fh)
{
	struct fha_hash_entry *e;

	e = malloc(sizeof(*e), M_NFS_FHA, M_WAITOK);
	e->fh = fh;
	e->num_rw = 0;
	e->num_exclusive = 0;
	e->num_threads = 0;
	LIST_INIT(&e->threads);

	return (e);
}

static void
fha_hash_entry_destroy(struct fha_hash_entry *e)
{

	if (e->num_rw + e->num_exclusive)
		panic("nonempty fhe");
	free(e, M_NFS_FHA);
}

static void
fha_hash_entry_remove(struct fha_hash_entry *e)
{

	LIST_REMOVE(e, link);
	fha_hash_entry_destroy(e);
}

static struct fha_hash_entry *
fha_hash_entry_lookup(struct fha_params *softc, u_int64_t fh)
{
	SVCPOOL *pool;
	struct fha_hash_entry *fhe, *new_fhe;

	pool = *softc->pool;

	LIST_FOREACH(fhe, &softc->g_fha.hashtable[fh % softc->g_fha.hashmask],
	    link)
		if (fhe->fh == fh)
			break;

	if (!fhe) {
		/* Allocate a new entry. */
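		/*
		 * The M_WAITOK allocation in fha_hash_entry_new() may sleep,
		 * so the pool mutex is dropped around it; the lookup is then
		 * redone below in case another thread raced us and inserted
		 * the same entry while the lock was released.
		 */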
		mtx_unlock(&pool->sp_lock);
		new_fhe = fha_hash_entry_new(fh);
		mtx_lock(&pool->sp_lock);

		/* Double-check to make sure we still need the new entry. */
		LIST_FOREACH(fhe,
		    &softc->g_fha.hashtable[fh % softc->g_fha.hashmask], link)
			if (fhe->fh == fh)
				break;
		if (!fhe) {
			fhe = new_fhe;
			LIST_INSERT_HEAD(
			    &softc->g_fha.hashtable[fh % softc->g_fha.hashmask],
			    fhe, link);
		} else
			fha_hash_entry_destroy(new_fhe);
	}

	return (fhe);
}

static void
fha_hash_entry_add_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
{

	LIST_INSERT_HEAD(&fhe->threads, thread, st_alink);
	fhe->num_threads++;
}

static void
fha_hash_entry_remove_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
{

	LIST_REMOVE(thread, st_alink);
	fhe->num_threads--;
}

/*
 * Account for an ongoing operation associated with this file.
 */
static void
fha_hash_entry_add_op(struct fha_hash_entry *fhe, int locktype, int count)
{

	if (LK_EXCLUSIVE == locktype)
		fhe->num_exclusive += count;
	else
		fhe->num_rw += count;
}

/*
 * Get the service thread currently associated with the fhe that is
 * appropriate to handle this operation.
 */
SVCTHREAD *
fha_hash_entry_choose_thread(struct fha_params *softc,
    struct fha_hash_entry *fhe, struct fha_info *i, SVCTHREAD *this_thread);

SVCTHREAD *
fha_hash_entry_choose_thread(struct fha_params *softc,
    struct fha_hash_entry *fhe, struct fha_info *i, SVCTHREAD *this_thread)
{
	SVCTHREAD *thread, *min_thread = NULL;
	SVCPOOL *pool;
	int req_count, min_count = 0;
	off_t offset1, offset2;

	pool = *softc->pool;

	LIST_FOREACH(thread, &fhe->threads, st_alink) {
		req_count = thread->st_reqcount;

		/* If there are any writes in progress, use the first thread. */
		if (fhe->num_exclusive) {
#if 0
			ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
			    "fha: %p(%d)w", thread, req_count);
#endif
			return (thread);
		}

		/*
		 * Check for read locality, making sure that we won't
		 * exceed our per-thread load limit in the process.
		 */
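		/*
		 * Compare this request's offset (from fha_info) with the
		 * offset of the thread's first queued request, which
		 * fha_assign() stashed in rq_p3.
		 */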
		offset1 = i->offset;
		offset2 = STAILQ_FIRST(&thread->st_reqs)->rq_p3;

		if (((offset1 >= offset2)
		  && ((offset1 - offset2) < (1 << softc->ctls.bin_shift)))
		 || ((offset2 > offset1)
		  && ((offset2 - offset1) < (1 << softc->ctls.bin_shift)))) {
			if ((softc->ctls.max_reqs_per_nfsd == 0) ||
			    (req_count < softc->ctls.max_reqs_per_nfsd)) {
#if 0
				ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
				    "fha: %p(%d)r", thread, req_count);
#endif
				return (thread);
			}
		}

		/*
		 * We don't have a locality match, so skip this thread,
		 * but keep track of the most attractive thread in case
		 * we need to come back to it later.
		 */
#if 0
		ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
		    "fha: %p(%d)s off1 %llu off2 %llu", thread,
		    req_count, offset1, offset2);
#endif
		if ((min_thread == NULL) || (req_count < min_count)) {
			min_count = req_count;
			min_thread = thread;
		}
	}

	/*
	 * We didn't find a good match yet.  See if we can add
	 * a new thread to this file handle entry's thread list.
	 */
	if ((softc->ctls.max_nfsds_per_fh == 0) ||
	    (fhe->num_threads < softc->ctls.max_nfsds_per_fh)) {
		/*
		 * We can add a new thread, so use this_thread if it is
		 * idle; otherwise take an idle thread from the pool, and
		 * fall back to this_thread if none are idle.
		 */
		if (STAILQ_EMPTY(&this_thread->st_reqs)) {
			thread = this_thread;
#if 0
			ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
			    "fha: %p(%d)t", thread, thread->st_reqcount);
#endif
		} else if ((thread = LIST_FIRST(&pool->sp_idlethreads))) {
#if 0
			ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
			    "fha: %p(%d)i", thread, thread->st_reqcount);
#endif
		} else {
			thread = this_thread;
#if 0
			ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
			    "fha: %p(%d)b", thread, thread->st_reqcount);
#endif
		}
		fha_hash_entry_add_thread(fhe, thread);
	} else {
		/*
		 * We don't want to use any more threads for this file, so
		 * go back to the most attractive nfsd we're already using.
		 */
		thread = min_thread;
	}

	return (thread);
}

/*
 * After getting a request, try to assign it to some thread.  Usually we
 * handle it ourselves.
 */
SVCTHREAD *
fha_assign(SVCTHREAD *this_thread, struct svc_req *req,
    struct fha_params *softc)
{
	SVCTHREAD *thread;
	struct fha_info i;
	struct fha_hash_entry *fhe;
	struct fha_callbacks *cb;

	cb = &softc->callbacks;

	/* Check to see whether we're enabled. */
	if (softc->ctls.enable == 0)
		return (this_thread);

	/*
	 * Only do placement if this is an NFS request.
	 */
	if (req->rq_prog != NFS_PROG)
		return (this_thread);

	if (req->rq_vers != 2 && req->rq_vers != 3)
		return (this_thread);

	fha_extract_info(req, &i, cb);

	/*
	 * We save the offset associated with this request for later
	 * nfsd matching.
	 */
	fhe = fha_hash_entry_lookup(softc, i.fh);
	req->rq_p1 = fhe;
	req->rq_p2 = i.locktype;
	req->rq_p3 = i.offset;

	/*
	 * Choose a thread, taking into consideration locality, thread load,
	 * and the number of threads already working on this file.
	 */
	thread = fha_hash_entry_choose_thread(softc, fhe, &i, this_thread);
	KASSERT(thread, ("fha_assign: NULL thread!"));
	fha_hash_entry_add_op(fhe, i.locktype, 1);

	return (thread);
}

/*
 * Called when we're done with an operation.  The request has already
 * been de-queued.
 */
void
fha_nd_complete(SVCTHREAD *thread, struct svc_req *req)
{
	struct fha_hash_entry *fhe = req->rq_p1;

	/*
	 * This may be called for reqs that didn't go through
	 * fha_assign (e.g. extra NULL ops used for RPCSEC_GSS).
	 */
	if (!fhe)
		return;

	fha_hash_entry_add_op(fhe, req->rq_p2, -1);

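	/*
	 * If that was this thread's last outstanding request, drop its
	 * affinity to the file handle, and free the entry once no
	 * operations remain on it.
	 */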
	if (thread->st_reqcount == 0) {
		fha_hash_entry_remove_thread(fhe, thread);
		if (0 == fhe->num_rw + fhe->num_exclusive)
			fha_hash_entry_remove(fhe);
	}
}

int
fhe_stats_sysctl(SYSCTL_HANDLER_ARGS, struct fha_params *softc)
{
	int error, count, i;
	struct sbuf sb;
	struct fha_hash_entry *fhe;
	bool_t first = TRUE;
	SVCTHREAD *thread;
	SVCPOOL *pool;

	sbuf_new(&sb, NULL, 4096, SBUF_FIXEDLEN);

	pool = NULL;

	if (!*softc->pool) {
		sbuf_printf(&sb, "NFSD not running\n");
		goto out;
	}
	pool = *softc->pool;

	mtx_lock(&pool->sp_lock);
	count = 0;
	for (i = 0; i <= softc->g_fha.hashmask; i++)
		if (!LIST_EMPTY(&softc->g_fha.hashtable[i]))
			count++;

	if (count == 0) {
		sbuf_printf(&sb, "No file handle entries.\n");
		goto out;
	}

	for (i = 0; i <= softc->g_fha.hashmask; i++) {
		LIST_FOREACH(fhe, &softc->g_fha.hashtable[i], link) {
			sbuf_printf(&sb, "%sfhe %p: {\n", first ? "" : ", ", fhe);

			sbuf_printf(&sb, "    fh: %ju\n", (uintmax_t) fhe->fh);
			sbuf_printf(&sb, "    num_rw: %d\n", fhe->num_rw);
			sbuf_printf(&sb, "    num_exclusive: %d\n", fhe->num_exclusive);
			sbuf_printf(&sb, "    num_threads: %d\n", fhe->num_threads);

			LIST_FOREACH(thread, &fhe->threads, st_alink) {
				sbuf_printf(&sb, "    thread %p offset %ju "
				    "(count %d)\n", thread,
				    (uintmax_t) STAILQ_FIRST(&thread->st_reqs)->rq_p3,
				    thread->st_reqcount);
			}

			sbuf_printf(&sb, "}");
			first = FALSE;

			/* Limit the output. */
			if (++count > 128) {
				sbuf_printf(&sb, "...");
				break;
			}
		}
	}

 out:
	if (pool)
		mtx_unlock(&pool->sp_lock);
	sbuf_trim(&sb);
	sbuf_finish(&sb);
	error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
	sbuf_delete(&sb);
	return (error);
}