nfs_fha.c revision 261049
/*-
 * Copyright (c) 2008 Isilon Inc http://www.isilon.com/
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
25146998Sdes
26146998Sdes#include <sys/cdefs.h>
27146998Sdes__FBSDID("$FreeBSD: stable/10/sys/nfs/nfs_fha.c 261049 2014-01-22 23:48:15Z mav $");
28146998Sdes
29146998Sdes#include <sys/param.h>
30146998Sdes#include <sys/systm.h>
31146998Sdes#include <sys/sysproto.h>
32146998Sdes#include <sys/kernel.h>
33146998Sdes#include <sys/sysctl.h>
34146998Sdes#include <sys/vnode.h>
35146998Sdes#include <sys/malloc.h>
36146998Sdes#include <sys/mount.h>
37146998Sdes#include <sys/mbuf.h>
38146998Sdes#include <sys/sbuf.h>
39146998Sdes
40146998Sdes#include <rpc/rpc.h>
41146998Sdes#include <nfs/nfs_fha.h>
42146998Sdes
43146998Sdesstatic MALLOC_DEFINE(M_NFS_FHA, "NFS FHA", "NFS FHA");
44146998Sdes
45146998Sdes/*
46146998Sdes * XXX need to commonize definitions between old and new NFS code.  Define
47146998Sdes * this here so we don't include one nfsproto.h over the other.
48146998Sdes */
49146998Sdes#define	NFS_PROG		100003
50146998Sdes
51146998Sdesvoid
52146998Sdesfha_init(struct fha_params *softc)
53146998Sdes{
54146998Sdes	char tmpstr[128];
55146998Sdes
56146998Sdes	/*
57146998Sdes	 * A small hash table to map filehandles to fha_hash_entry
58146998Sdes	 * structures.
59146998Sdes	 */
60146998Sdes	softc->g_fha.hashtable = hashinit(256, M_NFS_FHA,
61146998Sdes	    &softc->g_fha.hashmask);
62146998Sdes
63146998Sdes	/*
64146998Sdes	 * Set the default tuning parameters.
65146998Sdes	 */
66146998Sdes	softc->ctls.enable = FHA_DEF_ENABLE;
67146998Sdes	softc->ctls.bin_shift = FHA_DEF_BIN_SHIFT;
68146998Sdes	softc->ctls.max_nfsds_per_fh = FHA_DEF_MAX_NFSDS_PER_FH;
69146998Sdes	softc->ctls.max_reqs_per_nfsd = FHA_DEF_MAX_REQS_PER_NFSD;
70
71	/*
72	 * Allow the user to override the defaults at boot time with
73	 * tunables.
74	 */
75	snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.enable",
76	    softc->server_name);
77	TUNABLE_INT_FETCH(tmpstr, &softc->ctls.enable);
78	snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.bin_shift",
79	    softc->server_name);
80	TUNABLE_INT_FETCH(tmpstr, &softc->ctls.bin_shift);
81	snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.max_nfsds_per_fh",
82	    softc->server_name);
83	TUNABLE_INT_FETCH(tmpstr, &softc->ctls.max_nfsds_per_fh);
84	snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.max_reqs_per_nfsd",
85	    softc->server_name);
86	TUNABLE_INT_FETCH(tmpstr, &softc->ctls.max_reqs_per_nfsd);
87
88	/*
89	 * Add sysctls so the user can change the tuning parameters at
90	 * runtime.
91	 */
92	SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
93	    OID_AUTO, "enable", CTLFLAG_RW,
94	    &softc->ctls.enable, 0, "Enable NFS File Handle Affinity (FHA)");
95
96	SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
97	    OID_AUTO, "bin_shift", CTLFLAG_RW,
98	    &softc->ctls.bin_shift, 0, "For FHA reads, no two requests will "
99	    "contend if they're 2^(bin_shift) bytes apart");
100
101	SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
102	    OID_AUTO, "max_nfsds_per_fh", CTLFLAG_RW,
103	    &softc->ctls.max_nfsds_per_fh, 0, "Maximum nfsd threads that "
104	    "should be working on requests for the same file handle");
105
106	SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
107	    OID_AUTO, "max_reqs_per_nfsd", CTLFLAG_RW,
108	    &softc->ctls.max_reqs_per_nfsd, 0, "Maximum requests that "
109	    "single nfsd thread should be working on at any time");
110
111	SYSCTL_ADD_OID(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
112	    OID_AUTO, "fhe_stats", CTLTYPE_STRING | CTLFLAG_RD, 0, 0,
113	    softc->callbacks.fhe_stats_sysctl, "A", "");
114
115}
116
117void
118fha_uninit(struct fha_params *softc)
119{
120	sysctl_ctx_free(&softc->sysctl_ctx);
121	hashdestroy(softc->g_fha.hashtable, M_NFS_FHA, softc->g_fha.hashmask);
122}
123
124/*
125 * This just specifies that offsets should obey affinity when within
126 * the same 1Mbyte (1<<20) chunk for the file (reads only for now).
127 */
128static void
129fha_extract_info(struct svc_req *req, struct fha_info *i,
130    struct fha_callbacks *cb)
131{
132	struct mbuf *md;
133	caddr_t dpos;
134	static u_int64_t random_fh = 0;
135	int error;
136	int v3 = (req->rq_vers == 3);
137	rpcproc_t procnum;
138
139	/*
140	 * We start off with a random fh.  If we get a reasonable
141	 * procnum, we set the fh.  If there's a concept of offset
142	 * that we're interested in, we set that.
143	 */
144	i->fh = ++random_fh;
145	i->offset = 0;
146	i->locktype = LK_EXCLUSIVE;
147
148	/*
149	 * Extract the procnum and convert to v3 form if necessary,
150	 * taking care to deal with out-of-range procnums.  Caller will
151	 * ensure that rq_vers is either 2 or 3.
152	 */
153	procnum = req->rq_proc;
154	if (!v3) {
155		rpcproc_t tmp_procnum;
156
157		tmp_procnum = cb->get_procnum(procnum);
158		if (tmp_procnum == -1)
159			goto out;
160		procnum = tmp_procnum;
161	}
162
163	/*
164	 * We do affinity for most.  However, we divide a realm of affinity
165	 * by file offset so as to allow for concurrent random access.  We
166	 * only do this for reads today, but this may change when IFS supports
167	 * efficient concurrent writes.
168	 */
169	if (cb->no_offset(procnum))
170		goto out;
171
172	error = cb->realign(&req->rq_args, M_NOWAIT);
173	if (error)
174		goto out;
175	md = req->rq_args;
176	dpos = mtod(md, caddr_t);
177
178	/* Grab the filehandle. */
179	error = cb->get_fh(&i->fh, v3, &md, &dpos);
180	if (error)
181		goto out;
182
183	/* Content ourselves with zero offset for all but reads. */
184	if (cb->is_read(procnum) || cb->is_write(procnum))
185		cb->get_offset(&md, &dpos, v3, i);
186
187out:
188	cb->set_locktype(procnum, i);
189}
190
191static struct fha_hash_entry *
192fha_hash_entry_new(u_int64_t fh)
193{
194	struct fha_hash_entry *e;
195
196	e = malloc(sizeof(*e), M_NFS_FHA, M_WAITOK);
197	e->fh = fh;
198	e->num_rw = 0;
199	e->num_exclusive = 0;
200	e->num_threads = 0;
201	LIST_INIT(&e->threads);
202
203	return (e);
204}
205
206static void
207fha_hash_entry_destroy(struct fha_hash_entry *e)
208{
209
210	if (e->num_rw + e->num_exclusive)
211		panic("nonempty fhe");
212	free(e, M_NFS_FHA);
213}
214
215static void
216fha_hash_entry_remove(struct fha_hash_entry *e)
217{
218
219	LIST_REMOVE(e, link);
220	fha_hash_entry_destroy(e);
221}
222
223static struct fha_hash_entry *
224fha_hash_entry_lookup(struct fha_params *softc, u_int64_t fh)
225{
226	SVCPOOL *pool;
227
228	pool = *softc->pool;
229
230	struct fha_hash_entry *fhe, *new_fhe;
231
232	LIST_FOREACH(fhe, &softc->g_fha.hashtable[fh % softc->g_fha.hashmask],
233	    link)
234		if (fhe->fh == fh)
235			break;
236
237	if (!fhe) {
238		/* Allocate a new entry. */
239		mtx_unlock(&pool->sp_lock);
240		new_fhe = fha_hash_entry_new(fh);
241		mtx_lock(&pool->sp_lock);
242
243		/* Double-check to make sure we still need the new entry. */
244		LIST_FOREACH(fhe,
245		    &softc->g_fha.hashtable[fh % softc->g_fha.hashmask], link)
246			if (fhe->fh == fh)
247				break;
248		if (!fhe) {
249			fhe = new_fhe;
250			LIST_INSERT_HEAD(
251			    &softc->g_fha.hashtable[fh % softc->g_fha.hashmask],
252			    fhe, link);
253		} else
254			fha_hash_entry_destroy(new_fhe);
255	}
256
257	return (fhe);
258}
259
260static void
261fha_hash_entry_add_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
262{
263
264	LIST_INSERT_HEAD(&fhe->threads, thread, st_alink);
265	fhe->num_threads++;
266}
267
268static void
269fha_hash_entry_remove_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
270{
271
272	LIST_REMOVE(thread, st_alink);
273	fhe->num_threads--;
274}
275
276/*
277 * Account for an ongoing operation associated with this file.
278 */
279static void
280fha_hash_entry_add_op(struct fha_hash_entry *fhe, int locktype, int count)
281{
282
283	if (LK_EXCLUSIVE == locktype)
284		fhe->num_exclusive += count;
285	else
286		fhe->num_rw += count;
287}
288
289/*
290 * Get the service thread currently associated with the fhe that is
291 * appropriate to handle this operation.
292 */
293SVCTHREAD *
294fha_hash_entry_choose_thread(struct fha_params *softc,
295    struct fha_hash_entry *fhe, struct fha_info *i, SVCTHREAD *this_thread);
296
297SVCTHREAD *
298fha_hash_entry_choose_thread(struct fha_params *softc,
299    struct fha_hash_entry *fhe, struct fha_info *i, SVCTHREAD *this_thread)
300{
301	SVCTHREAD *thread, *min_thread = NULL;
302	SVCPOOL *pool;
303	int req_count, min_count = 0;
304	off_t offset1, offset2;
305
306	pool = *softc->pool;
307
308	LIST_FOREACH(thread, &fhe->threads, st_alink) {
309		req_count = thread->st_reqcount;
310
311		/* If there are any writes in progress, use the first thread. */
312		if (fhe->num_exclusive) {
313#if 0
314			ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
315			    "fha: %p(%d)w", thread, req_count);
316#endif
317			return (thread);
318		}
319
320		/*
321		 * Check for read locality, making sure that we won't
322		 * exceed our per-thread load limit in the process.
323		 */
324		offset1 = i->offset;
325		offset2 = STAILQ_FIRST(&thread->st_reqs)->rq_p3;
326
327		if (((offset1 >= offset2)
328		  && ((offset1 - offset2) < (1 << softc->ctls.bin_shift)))
329		 || ((offset2 > offset1)
330		  && ((offset2 - offset1) < (1 << softc->ctls.bin_shift)))) {
331			if ((softc->ctls.max_reqs_per_nfsd == 0) ||
332			    (req_count < softc->ctls.max_reqs_per_nfsd)) {
333#if 0
334				ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
335				    "fha: %p(%d)r", thread, req_count);
336#endif
337				return (thread);
338			}
339		}
340
341		/*
342		 * We don't have a locality match, so skip this thread,
343		 * but keep track of the most attractive thread in case
344		 * we need to come back to it later.
345		 */
346#if 0
347		ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
348		    "fha: %p(%d)s off1 %llu off2 %llu", thread,
349		    req_count, offset1, offset2);
350#endif
351		if ((min_thread == NULL) || (req_count < min_count)) {
352			min_count = req_count;
353			min_thread = thread;
354		}
355	}
356
357	/*
358	 * We didn't find a good match yet.  See if we can add
359	 * a new thread to this file handle entry's thread list.
360	 */
361	if ((softc->ctls.max_nfsds_per_fh == 0) ||
362	    (fhe->num_threads < softc->ctls.max_nfsds_per_fh)) {
363		/*
364		 * We can add a new thread, so try for an idle thread
365		 * first, and fall back to this_thread if none are idle.
366		 */
367		if (STAILQ_EMPTY(&this_thread->st_reqs)) {
368			thread = this_thread;
369#if 0
370			ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
371			    "fha: %p(%d)t", thread, thread->st_reqcount);
372#endif
373		} else if ((thread = LIST_FIRST(&pool->sp_idlethreads))) {
374#if 0
375			ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
376			    "fha: %p(%d)i", thread, thread->st_reqcount);
377#endif
378		} else {
379			thread = this_thread;
380#if 0
381			ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
382			    "fha: %p(%d)b", thread, thread->st_reqcount);
383#endif
384		}
385		fha_hash_entry_add_thread(fhe, thread);
386	} else {
387		/*
388		 * We don't want to use any more threads for this file, so
389		 * go back to the most attractive nfsd we're already using.
390		 */
391		thread = min_thread;
392	}
393
394	return (thread);
395}
396
397/*
398 * After getting a request, try to assign it to some thread.  Usually we
399 * handle it ourselves.
400 */
401SVCTHREAD *
402fha_assign(SVCTHREAD *this_thread, struct svc_req *req,
403    struct fha_params *softc)
404{
405	SVCTHREAD *thread;
406	struct fha_info i;
407	struct fha_hash_entry *fhe;
408	struct fha_callbacks *cb;
409
410	cb = &softc->callbacks;
411
412	/* Check to see whether we're enabled. */
413	if (softc->ctls.enable == 0)
414		return (this_thread);
415
416	/*
417	 * Only do placement if this is an NFS request.
418	 */
419	if (req->rq_prog != NFS_PROG)
420		return (this_thread);
421
422	if (req->rq_vers != 2 && req->rq_vers != 3)
423		return (this_thread);
424
425	fha_extract_info(req, &i, cb);
426
427	/*
428	 * We save the offset associated with this request for later
429	 * nfsd matching.
430	 */
431	fhe = fha_hash_entry_lookup(softc, i.fh);
432	req->rq_p1 = fhe;
433	req->rq_p2 = i.locktype;
434	req->rq_p3 = i.offset;
435
436	/*
437	 * Choose a thread, taking into consideration locality, thread load,
438	 * and the number of threads already working on this file.
439	 */
440	thread = fha_hash_entry_choose_thread(softc, fhe, &i, this_thread);
441	KASSERT(thread, ("fha_assign: NULL thread!"));
442	fha_hash_entry_add_op(fhe, i.locktype, 1);
443
444	return (thread);
445}
446
447/*
448 * Called when we're done with an operation.  The request has already
449 * been de-queued.
450 */
451void
452fha_nd_complete(SVCTHREAD *thread, struct svc_req *req)
453{
454	struct fha_hash_entry *fhe = req->rq_p1;
455
456	/*
457	 * This may be called for reqs that didn't go through
458	 * fha_assign (e.g. extra NULL ops used for RPCSEC_GSS.
459	 */
460	if (!fhe)
461		return;
462
463	fha_hash_entry_add_op(fhe, req->rq_p2, -1);
464
465	if (thread->st_reqcount == 0) {
466		fha_hash_entry_remove_thread(fhe, thread);
467		if (0 == fhe->num_rw + fhe->num_exclusive)
468			fha_hash_entry_remove(fhe);
469	}
470}
471
472int
473fhe_stats_sysctl(SYSCTL_HANDLER_ARGS, struct fha_params *softc)
474{
475	int error, count, i;
476	struct sbuf sb;
477	struct fha_hash_entry *fhe;
478	bool_t first = TRUE;
479	SVCTHREAD *thread;
480	SVCPOOL *pool;
481
482	sbuf_new(&sb, NULL, 4096, SBUF_FIXEDLEN);
483
484	pool = NULL;
485
486	if (!*softc->pool) {
487		sbuf_printf(&sb, "NFSD not running\n");
488		goto out;
489	}
490	pool = *softc->pool;
491
492	mtx_lock(&pool->sp_lock);
493	count = 0;
494	for (i = 0; i <= softc->g_fha.hashmask; i++)
495		if (!LIST_EMPTY(&softc->g_fha.hashtable[i]))
496			count++;
497
498	if (count == 0) {
499		sbuf_printf(&sb, "No file handle entries.\n");
500		goto out;
501	}
502
503	for (i = 0; i <= softc->g_fha.hashmask; i++) {
504		LIST_FOREACH(fhe, &softc->g_fha.hashtable[i], link) {
505			sbuf_printf(&sb, "%sfhe %p: {\n", first ? "" : ", ", fhe);
506
507			sbuf_printf(&sb, "    fh: %ju\n", (uintmax_t) fhe->fh);
508			sbuf_printf(&sb, "    num_rw: %d\n", fhe->num_rw);
509			sbuf_printf(&sb, "    num_exclusive: %d\n", fhe->num_exclusive);
510			sbuf_printf(&sb, "    num_threads: %d\n", fhe->num_threads);
511
512			LIST_FOREACH(thread, &fhe->threads, st_alink) {
513				sbuf_printf(&sb, "    thread %p offset %ju "
514				    "(count %d)\n", thread,
515				    STAILQ_FIRST(&thread->st_reqs)->rq_p3,
516				    thread->st_reqcount);
517			}
518
519			sbuf_printf(&sb, "}");
520			first = FALSE;
521
522			/* Limit the output. */
523			if (++count > 128) {
524				sbuf_printf(&sb, "...");
525				break;
526			}
527		}
528	}
529
530 out:
531	if (pool)
532		mtx_unlock(&pool->sp_lock);
533	sbuf_trim(&sb);
534	sbuf_finish(&sb);
535	error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
536	sbuf_delete(&sb);
537	return (error);
538}
539