1/*- 2 * Copyright (c) 2008 Isilon Inc http://www.isilon.com/ 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 */ 25 26#include <sys/cdefs.h> 27__FBSDID("$FreeBSD: stable/10/sys/nfs/nfs_fha.c 322138 2017-08-07 07:40:00Z mav $"); 28 29#include <sys/param.h> 30#include <sys/systm.h> 31#include <sys/sysproto.h> 32#include <sys/kernel.h> 33#include <sys/sysctl.h> 34#include <sys/vnode.h> 35#include <sys/malloc.h> 36#include <sys/mount.h> 37#include <sys/mbuf.h> 38#include <sys/sbuf.h> 39 40#include <rpc/rpc.h> 41#include <nfs/nfs_fha.h> 42 43static MALLOC_DEFINE(M_NFS_FHA, "NFS FHA", "NFS FHA"); 44 45/* 46 * XXX need to commonize definitions between old and new NFS code. Define 47 * this here so we don't include one nfsproto.h over the other. 48 */ 49#define NFS_PROG 100003 50 51void 52fha_init(struct fha_params *softc) 53{ 54 int i; 55 56 for (i = 0; i < FHA_HASH_SIZE; i++) 57 mtx_init(&softc->fha_hash[i].mtx, "fhalock", NULL, MTX_DEF); 58 59 /* 60 * Set the default tuning parameters. 61 */ 62 softc->ctls.enable = FHA_DEF_ENABLE; 63 softc->ctls.read = FHA_DEF_READ; 64 softc->ctls.write = FHA_DEF_WRITE; 65 softc->ctls.bin_shift = FHA_DEF_BIN_SHIFT; 66 softc->ctls.max_nfsds_per_fh = FHA_DEF_MAX_NFSDS_PER_FH; 67 softc->ctls.max_reqs_per_nfsd = FHA_DEF_MAX_REQS_PER_NFSD; 68 69 /* 70 * Add sysctls so the user can change the tuning parameters. 71 */ 72 SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), 73 OID_AUTO, "enable", CTLFLAG_RWTUN, 74 &softc->ctls.enable, 0, "Enable NFS File Handle Affinity (FHA)"); 75 76 SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), 77 OID_AUTO, "read", CTLFLAG_RWTUN, 78 &softc->ctls.read, 0, "Enable NFS FHA read locality"); 79 80 SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), 81 OID_AUTO, "write", CTLFLAG_RWTUN, 82 &softc->ctls.write, 0, "Enable NFS FHA write locality"); 83 84 SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), 85 OID_AUTO, "bin_shift", CTLFLAG_RWTUN, 86 &softc->ctls.bin_shift, 0, "Maximum locality distance 2^(bin_shift) bytes"); 87 88 SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), 89 OID_AUTO, "max_nfsds_per_fh", CTLFLAG_RWTUN, 90 &softc->ctls.max_nfsds_per_fh, 0, "Maximum nfsd threads that " 91 "should be working on requests for the same file handle"); 92 93 SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), 94 OID_AUTO, "max_reqs_per_nfsd", CTLFLAG_RWTUN, 95 &softc->ctls.max_reqs_per_nfsd, 0, "Maximum requests that " 96 "single nfsd thread should be working on at any time"); 97 98 SYSCTL_ADD_OID(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), 99 OID_AUTO, "fhe_stats", CTLTYPE_STRING | CTLFLAG_RD, 0, 0, 100 softc->callbacks.fhe_stats_sysctl, "A", ""); 101 102} 103 104void 105fha_uninit(struct fha_params *softc) 106{ 107 int i; 108 109 sysctl_ctx_free(&softc->sysctl_ctx); 110 for (i = 0; i < FHA_HASH_SIZE; i++) 111 mtx_destroy(&softc->fha_hash[i].mtx); 112} 113 114/* 115 * This just specifies that offsets should obey affinity when within 116 * the same 1Mbyte (1<<20) chunk for the file (reads only for now). 117 */ 118static void 119fha_extract_info(struct svc_req *req, struct fha_info *i, 120 struct fha_callbacks *cb) 121{ 122 struct mbuf *md; 123 caddr_t dpos; 124 static u_int64_t random_fh = 0; 125 int error; 126 int v3 = (req->rq_vers == 3); 127 rpcproc_t procnum; 128 129 /* 130 * We start off with a random fh. If we get a reasonable 131 * procnum, we set the fh. If there's a concept of offset 132 * that we're interested in, we set that. 133 */ 134 i->fh = ++random_fh; 135 i->offset = 0; 136 i->locktype = LK_EXCLUSIVE; 137 i->read = i->write = 0; 138 139 /* 140 * Extract the procnum and convert to v3 form if necessary, 141 * taking care to deal with out-of-range procnums. Caller will 142 * ensure that rq_vers is either 2 or 3. 143 */ 144 procnum = req->rq_proc; 145 if (!v3) { 146 rpcproc_t tmp_procnum; 147 148 tmp_procnum = cb->get_procnum(procnum); 149 if (tmp_procnum == -1) 150 goto out; 151 procnum = tmp_procnum; 152 } 153 154 /* 155 * We do affinity for most. However, we divide a realm of affinity 156 * by file offset so as to allow for concurrent random access. We 157 * only do this for reads today, but this may change when IFS supports 158 * efficient concurrent writes. 159 */ 160 if (cb->no_offset(procnum)) 161 goto out; 162 163 i->read = cb->is_read(procnum); 164 i->write = cb->is_write(procnum); 165 166 error = cb->realign(&req->rq_args, M_NOWAIT); 167 if (error) 168 goto out; 169 md = req->rq_args; 170 dpos = mtod(md, caddr_t); 171 172 /* Grab the filehandle. */ 173 error = cb->get_fh(&i->fh, v3, &md, &dpos); 174 if (error) 175 goto out; 176 177 /* Content ourselves with zero offset for all but reads. */ 178 if (i->read || i->write) 179 cb->get_offset(&md, &dpos, v3, i); 180 181out: 182 cb->set_locktype(procnum, i); 183} 184 185static struct fha_hash_entry * 186fha_hash_entry_new(u_int64_t fh) 187{ 188 struct fha_hash_entry *e; 189 190 e = malloc(sizeof(*e), M_NFS_FHA, M_WAITOK); 191 e->fh = fh; 192 e->num_rw = 0; 193 e->num_exclusive = 0; 194 e->num_threads = 0; 195 LIST_INIT(&e->threads); 196 197 return (e); 198} 199 200static void 201fha_hash_entry_destroy(struct fha_hash_entry *e) 202{ 203 204 mtx_assert(e->mtx, MA_OWNED); 205 KASSERT(e->num_rw == 0, 206 ("%d reqs on destroyed fhe %p", e->num_rw, e)); 207 KASSERT(e->num_exclusive == 0, 208 ("%d exclusive reqs on destroyed fhe %p", e->num_exclusive, e)); 209 KASSERT(e->num_threads == 0, 210 ("%d threads on destroyed fhe %p", e->num_threads, e)); 211 free(e, M_NFS_FHA); 212} 213 214static void 215fha_hash_entry_remove(struct fha_hash_entry *e) 216{ 217 218 mtx_assert(e->mtx, MA_OWNED); 219 LIST_REMOVE(e, link); 220 fha_hash_entry_destroy(e); 221} 222 223static struct fha_hash_entry * 224fha_hash_entry_lookup(struct fha_params *softc, u_int64_t fh) 225{ 226 SVCPOOL *pool; 227 struct fha_hash_slot *fhs; 228 struct fha_hash_entry *fhe, *new_fhe; 229 230 pool = *softc->pool; 231 fhs = &softc->fha_hash[fh % FHA_HASH_SIZE]; 232 new_fhe = fha_hash_entry_new(fh); 233 new_fhe->mtx = &fhs->mtx; 234 mtx_lock(&fhs->mtx); 235 LIST_FOREACH(fhe, &fhs->list, link) 236 if (fhe->fh == fh) 237 break; 238 if (!fhe) { 239 fhe = new_fhe; 240 LIST_INSERT_HEAD(&fhs->list, fhe, link); 241 } else 242 fha_hash_entry_destroy(new_fhe); 243 return (fhe); 244} 245 246static void 247fha_hash_entry_add_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread) 248{ 249 250 mtx_assert(fhe->mtx, MA_OWNED); 251 thread->st_p2 = 0; 252 LIST_INSERT_HEAD(&fhe->threads, thread, st_alink); 253 fhe->num_threads++; 254} 255 256static void 257fha_hash_entry_remove_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread) 258{ 259 260 mtx_assert(fhe->mtx, MA_OWNED); 261 KASSERT(thread->st_p2 == 0, 262 ("%d reqs on removed thread %p", thread->st_p2, thread)); 263 LIST_REMOVE(thread, st_alink); 264 fhe->num_threads--; 265} 266 267/* 268 * Account for an ongoing operation associated with this file. 269 */ 270static void 271fha_hash_entry_add_op(struct fha_hash_entry *fhe, int locktype, int count) 272{ 273 274 mtx_assert(fhe->mtx, MA_OWNED); 275 if (LK_EXCLUSIVE == locktype) 276 fhe->num_exclusive += count; 277 else 278 fhe->num_rw += count; 279} 280 281/* 282 * Get the service thread currently associated with the fhe that is 283 * appropriate to handle this operation. 284 */ 285static SVCTHREAD * 286fha_hash_entry_choose_thread(struct fha_params *softc, 287 struct fha_hash_entry *fhe, struct fha_info *i, SVCTHREAD *this_thread) 288{ 289 SVCTHREAD *thread, *min_thread = NULL; 290 SVCPOOL *pool; 291 int req_count, min_count = 0; 292 off_t offset1, offset2; 293 294 pool = *softc->pool; 295 296 LIST_FOREACH(thread, &fhe->threads, st_alink) { 297 req_count = thread->st_p2; 298 299 /* If there are any writes in progress, use the first thread. */ 300 if (fhe->num_exclusive) { 301#if 0 302 ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO, 303 "fha: %p(%d)w", thread, req_count); 304#endif 305 return (thread); 306 } 307 308 /* Check whether we should consider locality. */ 309 if ((i->read && !softc->ctls.read) || 310 (i->write && !softc->ctls.write)) 311 goto noloc; 312 313 /* 314 * Check for locality, making sure that we won't 315 * exceed our per-thread load limit in the process. 316 */ 317 offset1 = i->offset; 318 offset2 = thread->st_p3; 319 320 if (((offset1 >= offset2) 321 && ((offset1 - offset2) < (1 << softc->ctls.bin_shift))) 322 || ((offset2 > offset1) 323 && ((offset2 - offset1) < (1 << softc->ctls.bin_shift)))) { 324 if ((softc->ctls.max_reqs_per_nfsd == 0) || 325 (req_count < softc->ctls.max_reqs_per_nfsd)) { 326#if 0 327 ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO, 328 "fha: %p(%d)r", thread, req_count); 329#endif 330 return (thread); 331 } 332 } 333 334noloc: 335 /* 336 * We don't have a locality match, so skip this thread, 337 * but keep track of the most attractive thread in case 338 * we need to come back to it later. 339 */ 340#if 0 341 ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO, 342 "fha: %p(%d)s off1 %llu off2 %llu", thread, 343 req_count, offset1, offset2); 344#endif 345 if ((min_thread == NULL) || (req_count < min_count)) { 346 min_count = req_count; 347 min_thread = thread; 348 } 349 } 350 351 /* 352 * We didn't find a good match yet. See if we can add 353 * a new thread to this file handle entry's thread list. 354 */ 355 if ((softc->ctls.max_nfsds_per_fh == 0) || 356 (fhe->num_threads < softc->ctls.max_nfsds_per_fh)) { 357 thread = this_thread; 358#if 0 359 ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO, 360 "fha: %p(%d)t", thread, thread->st_p2); 361#endif 362 fha_hash_entry_add_thread(fhe, thread); 363 } else { 364 /* 365 * We don't want to use any more threads for this file, so 366 * go back to the most attractive nfsd we're already using. 367 */ 368 thread = min_thread; 369 } 370 371 return (thread); 372} 373 374/* 375 * After getting a request, try to assign it to some thread. Usually we 376 * handle it ourselves. 377 */ 378SVCTHREAD * 379fha_assign(SVCTHREAD *this_thread, struct svc_req *req, 380 struct fha_params *softc) 381{ 382 SVCTHREAD *thread; 383 struct fha_info i; 384 struct fha_hash_entry *fhe; 385 struct fha_callbacks *cb; 386 387 cb = &softc->callbacks; 388 389 /* Check to see whether we're enabled. */ 390 if (softc->ctls.enable == 0) 391 goto thist; 392 393 /* 394 * Only do placement if this is an NFS request. 395 */ 396 if (req->rq_prog != NFS_PROG) 397 goto thist; 398 399 if (req->rq_vers != 2 && req->rq_vers != 3) 400 goto thist; 401 402 fha_extract_info(req, &i, cb); 403 404 /* 405 * We save the offset associated with this request for later 406 * nfsd matching. 407 */ 408 fhe = fha_hash_entry_lookup(softc, i.fh); 409 req->rq_p1 = fhe; 410 req->rq_p2 = i.locktype; 411 req->rq_p3 = i.offset; 412 413 /* 414 * Choose a thread, taking into consideration locality, thread load, 415 * and the number of threads already working on this file. 416 */ 417 thread = fha_hash_entry_choose_thread(softc, fhe, &i, this_thread); 418 KASSERT(thread, ("fha_assign: NULL thread!")); 419 fha_hash_entry_add_op(fhe, i.locktype, 1); 420 thread->st_p2++; 421 thread->st_p3 = i.offset; 422 423 /* 424 * Grab the pool lock here to not let chosen thread go away before 425 * the new request inserted to its queue while we drop fhe lock. 426 */ 427 mtx_lock(&thread->st_lock); 428 mtx_unlock(fhe->mtx); 429 430 return (thread); 431thist: 432 req->rq_p1 = NULL; 433 mtx_lock(&this_thread->st_lock); 434 return (this_thread); 435} 436 437/* 438 * Called when we're done with an operation. The request has already 439 * been de-queued. 440 */ 441void 442fha_nd_complete(SVCTHREAD *thread, struct svc_req *req) 443{ 444 struct fha_hash_entry *fhe = req->rq_p1; 445 struct mtx *mtx; 446 447 /* 448 * This may be called for reqs that didn't go through 449 * fha_assign (e.g. extra NULL ops used for RPCSEC_GSS. 450 */ 451 if (!fhe) 452 return; 453 454 mtx = fhe->mtx; 455 mtx_lock(mtx); 456 fha_hash_entry_add_op(fhe, req->rq_p2, -1); 457 thread->st_p2--; 458 KASSERT(thread->st_p2 >= 0, ("Negative request count %d on %p", 459 thread->st_p2, thread)); 460 if (thread->st_p2 == 0) { 461 fha_hash_entry_remove_thread(fhe, thread); 462 if (0 == fhe->num_rw + fhe->num_exclusive) 463 fha_hash_entry_remove(fhe); 464 } 465 mtx_unlock(mtx); 466} 467 468int 469fhe_stats_sysctl(SYSCTL_HANDLER_ARGS, struct fha_params *softc) 470{ 471 int error, i; 472 struct sbuf sb; 473 struct fha_hash_entry *fhe; 474 bool_t first, hfirst; 475 SVCTHREAD *thread; 476 SVCPOOL *pool; 477 478 sbuf_new(&sb, NULL, 65536, SBUF_FIXEDLEN); 479 480 pool = NULL; 481 482 if (!*softc->pool) { 483 sbuf_printf(&sb, "NFSD not running\n"); 484 goto out; 485 } 486 pool = *softc->pool; 487 488 for (i = 0; i < FHA_HASH_SIZE; i++) 489 if (!LIST_EMPTY(&softc->fha_hash[i].list)) 490 break; 491 492 if (i == FHA_HASH_SIZE) { 493 sbuf_printf(&sb, "No file handle entries.\n"); 494 goto out; 495 } 496 497 hfirst = TRUE; 498 for (; i < FHA_HASH_SIZE; i++) { 499 mtx_lock(&softc->fha_hash[i].mtx); 500 if (LIST_EMPTY(&softc->fha_hash[i].list)) { 501 mtx_unlock(&softc->fha_hash[i].mtx); 502 continue; 503 } 504 sbuf_printf(&sb, "%shash %d: {\n", hfirst ? "" : ", ", i); 505 first = TRUE; 506 LIST_FOREACH(fhe, &softc->fha_hash[i].list, link) { 507 sbuf_printf(&sb, "%sfhe %p: {\n", first ? " " : ", ", fhe); 508 509 sbuf_printf(&sb, " fh: %ju\n", (uintmax_t) fhe->fh); 510 sbuf_printf(&sb, " num_rw/exclusive: %d/%d\n", 511 fhe->num_rw, fhe->num_exclusive); 512 sbuf_printf(&sb, " num_threads: %d\n", fhe->num_threads); 513 514 LIST_FOREACH(thread, &fhe->threads, st_alink) { 515 sbuf_printf(&sb, " thread %p offset %ju " 516 "reqs %d\n", thread, 517 thread->st_p3, thread->st_p2); 518 } 519 520 sbuf_printf(&sb, " }"); 521 first = FALSE; 522 } 523 sbuf_printf(&sb, "\n}"); 524 mtx_unlock(&softc->fha_hash[i].mtx); 525 hfirst = FALSE; 526 } 527 528 out: 529 sbuf_trim(&sb); 530 sbuf_finish(&sb); 531 error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); 532 sbuf_delete(&sb); 533 return (error); 534} 535