g_sched.c revision 223921
1/*- 2 * Copyright (c) 2009-2010 Fabio Checconi 3 * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28/* 29 * $Id$ 30 * $FreeBSD: head/sys/geom/sched/g_sched.c 223921 2011-07-11 05:22:31Z ae $ 31 * 32 * Main control module for geom-based disk schedulers ('sched'). 33 * 34 * USER VIEW 35 * A 'sched' node is typically inserted transparently between 36 * an existing provider pp and its original geom gp 37 * 38 * [pp --> gp ..] 39 * 40 * using the command "geom sched insert <provider>" and 41 * resulting in the following topology 42 * 43 * [pp --> sched_gp --> cp] [new_pp --> gp ... ] 44 * 45 * Deletion "geom sched destroy <provider>.sched." restores the 46 * original chain. The normal "geom sched create <provide>" 47 * is also supported. 48 * 49 * INTERNALS 50 * Internally, the 'sched' uses the following data structures 51 * 52 * geom{} g_sched_softc{} g_gsched{} 53 * +----------+ +---------------+ +-------------+ 54 * | softc *-|--->| sc_gsched *-|-->| gs_init | 55 * | ... | | | | gs_fini | 56 * | | | [ hash table] | | gs_start | 57 * +----------+ | | | ... | 58 * | | +-------------+ 59 * | | 60 * | | g_*_softc{} 61 * | | +-------------+ 62 * | sc_data *-|-->| | 63 * +---------------+ | algorithm- | 64 * | specific | 65 * +-------------+ 66 * 67 * A g_sched_softc{} is created with a "geom sched insert" call. 68 * In turn this instantiates a specific scheduling algorithm, 69 * which sets sc_gsched to point to the algorithm callbacks, 70 * and calls gs_init() to create the g_*_softc{} . 71 * The other callbacks (gs_start, gs_next, ...) are invoked 72 * as needed 73 * 74 * g_sched_softc{} is defined in g_sched.h and mostly used here; 75 * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h; 76 * g_*_softc{} is defined/implemented by each algorithm (gs_*.c) 77 * 78 * DATA MOVING 79 * When a bio is received on the provider, it goes to the 80 * g_sched_start() which calls gs_start() to initially queue it; 81 * then we call g_sched_dispatch() that loops around gs_next() 82 * to select zero or more bio's to be sent downstream. 83 * 84 * g_sched_dispatch() can also be called as a result of a timeout, 85 * e.g. when doing anticipation or pacing requests. 86 * 87 * When a bio comes back, it goes to g_sched_done() which in turn 88 * calls gs_done(). The latter does any necessary housekeeping in 89 * the scheduling algorithm, and may decide to call g_sched_dispatch() 90 * to send more bio's downstream. 91 * 92 * If an algorithm needs per-flow queues, these are created 93 * calling gs_init_class() and destroyed with gs_fini_class(), 94 * and they are also inserted in the hash table implemented in 95 * the g_sched_softc{} 96 * 97 * If an algorithm is replaced, or a transparently-inserted node is 98 * removed with "geom sched destroy", we need to remove all references 99 * to the g_*_softc{} and g_sched_softc from the bio's still in 100 * the scheduler. g_sched_forced_dispatch() helps doing this. 101 * XXX need to explain better. 102 */ 103 104#include <sys/cdefs.h> 105#include <sys/param.h> 106#include <sys/systm.h> 107#include <sys/kernel.h> 108#include <sys/module.h> 109#include <sys/lock.h> 110#include <sys/mutex.h> 111#include <sys/bio.h> 112#include <sys/limits.h> 113#include <sys/hash.h> 114#include <sys/sbuf.h> 115#include <sys/sysctl.h> 116#include <sys/malloc.h> 117#include <sys/proc.h> /* we access curthread */ 118#include <geom/geom.h> 119#include "gs_scheduler.h" 120#include "g_sched.h" /* geom hooks */ 121 122/* 123 * Size of the per-geom hash table storing traffic classes. 124 * We may decide to change it at a later time, it has no ABI 125 * implications as it is only used for run-time allocations. 126 */ 127#define G_SCHED_HASH_SIZE 32 128 129static int g_sched_destroy(struct g_geom *gp, boolean_t force); 130static int g_sched_destroy_geom(struct gctl_req *req, 131 struct g_class *mp, struct g_geom *gp); 132static void g_sched_config(struct gctl_req *req, struct g_class *mp, 133 const char *verb); 134static struct g_geom *g_sched_taste(struct g_class *mp, 135 struct g_provider *pp, int flags __unused); 136static void g_sched_dumpconf(struct sbuf *sb, const char *indent, 137 struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); 138static void g_sched_init(struct g_class *mp); 139static void g_sched_fini(struct g_class *mp); 140static int g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, 141 int fflag, struct thread *td); 142 143struct g_class g_sched_class = { 144 .name = G_SCHED_CLASS_NAME, 145 .version = G_VERSION, 146 .ctlreq = g_sched_config, 147 .taste = g_sched_taste, 148 .destroy_geom = g_sched_destroy_geom, 149 .init = g_sched_init, 150 .ioctl = g_sched_ioctl, 151 .fini = g_sched_fini 152}; 153 154MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures"); 155 156/* 157 * Global variables describing the state of the geom_sched module. 158 * There is only one static instance of this structure. 159 */ 160LIST_HEAD(gs_list, g_gsched); /* type, link field */ 161struct geom_sched_vars { 162 struct mtx gs_mtx; 163 struct gs_list gs_scheds; /* list of algorithms */ 164 u_int gs_debug; 165 u_int gs_sched_count; /* how many algorithms ? */ 166 u_int gs_patched; /* g_io_request was patched */ 167 168 u_int gs_initialized; 169 u_int gs_expire_secs; /* expiration of hash entries */ 170 171 struct bio_queue_head gs_pending; 172 u_int gs_npending; 173 174 /* The following are for stats, usually protected by gs_mtx. */ 175 u_long gs_requests; /* total requests */ 176 u_long gs_done; /* total done */ 177 u_int gs_in_flight; /* requests in flight */ 178 u_int gs_writes_in_flight; 179 u_int gs_bytes_in_flight; 180 u_int gs_write_bytes_in_flight; 181 182 char gs_names[256]; /* names of schedulers */ 183}; 184 185static struct geom_sched_vars me = { 186 .gs_expire_secs = 10, 187}; 188 189SYSCTL_DECL(_kern_geom); 190SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0, 191 "GEOM_SCHED stuff"); 192 193SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD, 194 &me.gs_write_bytes_in_flight, 0, "Write bytes in flight"); 195 196SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD, 197 &me.gs_bytes_in_flight, 0, "Bytes in flight"); 198 199SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD, 200 &me.gs_writes_in_flight, 0, "Write Requests in flight"); 201 202SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD, 203 &me.gs_in_flight, 0, "Requests in flight"); 204 205SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD, 206 &me.gs_done, 0, "Total done"); 207 208SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD, 209 &me.gs_requests, 0, "Total requests"); 210 211SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD, 212 &me.gs_names, 0, "Algorithm names"); 213 214SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD, 215 &me.gs_sched_count, 0, "Number of algorithms"); 216 217SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW, 218 &me.gs_debug, 0, "Debug level"); 219 220SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW, 221 &me.gs_expire_secs, 0, "Expire time in seconds"); 222 223/* 224 * g_sched calls the scheduler algorithms with this lock held. 225 * The locking functions are exposed so the scheduler algorithms can also 226 * protect themselves e.g. when running a callout handler. 227 */ 228void 229g_sched_lock(struct g_geom *gp) 230{ 231 struct g_sched_softc *sc = gp->softc; 232 233 mtx_lock(&sc->sc_mtx); 234} 235 236void 237g_sched_unlock(struct g_geom *gp) 238{ 239 struct g_sched_softc *sc = gp->softc; 240 241 mtx_unlock(&sc->sc_mtx); 242} 243 244/* 245 * Support functions to handle references to the module, 246 * which are coming from devices using this scheduler. 247 */ 248static inline void 249g_gsched_ref(struct g_gsched *gsp) 250{ 251 252 atomic_add_int(&gsp->gs_refs, 1); 253} 254 255static inline void 256g_gsched_unref(struct g_gsched *gsp) 257{ 258 259 atomic_add_int(&gsp->gs_refs, -1); 260} 261 262/* 263 * Update the stats when this request is done. 264 */ 265static void 266g_sched_update_stats(struct bio *bio) 267{ 268 269 me.gs_done++; 270 me.gs_in_flight--; 271 me.gs_bytes_in_flight -= bio->bio_length; 272 if (bio->bio_cmd & BIO_WRITE) { 273 me.gs_writes_in_flight--; 274 me.gs_write_bytes_in_flight -= bio->bio_length; 275 } 276} 277 278/* 279 * Dispatch any pending request. 280 */ 281static void 282g_sched_forced_dispatch(struct g_geom *gp) 283{ 284 struct g_sched_softc *sc = gp->softc; 285 struct g_gsched *gsp = sc->sc_gsched; 286 struct bio *bp; 287 288 KASSERT(mtx_owned(&sc->sc_mtx), 289 ("sc_mtx not owned during forced dispatch")); 290 291 while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL) 292 g_io_request(bp, LIST_FIRST(&gp->consumer)); 293} 294 295/* 296 * The main dispatch loop, called either here after the start 297 * routine, or by scheduling algorithms when they receive a timeout 298 * or a 'done' notification. Does not share code with the forced 299 * dispatch path, since the gs_done() callback can call us. 300 */ 301void 302g_sched_dispatch(struct g_geom *gp) 303{ 304 struct g_sched_softc *sc = gp->softc; 305 struct g_gsched *gsp = sc->sc_gsched; 306 struct bio *bp; 307 308 KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch")); 309 310 if ((sc->sc_flags & G_SCHED_FLUSHING)) 311 return; 312 313 while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL) 314 g_io_request(bp, LIST_FIRST(&gp->consumer)); 315} 316 317/* 318 * Recent (8.0 and above) versions of FreeBSD have support to 319 * register classifiers of disk requests. The classifier is 320 * invoked by g_io_request(), and stores the information into 321 * bp->bio_classifier1. 322 * 323 * Support for older versions, which is left here only for 324 * documentation purposes, relies on two hacks: 325 * 1. classification info is written into the bio_caller1 326 * field of the topmost node in the bio chain. This field 327 * is rarely used, but this module is incompatible with 328 * those that use bio_caller1 for other purposes, 329 * such as ZFS and gjournal; 330 * 2. g_io_request() is patched in-memory when the module is 331 * loaded, so that the function calls a classifier as its 332 * first thing. g_io_request() is restored when the module 333 * is unloaded. This functionality is only supported for 334 * x86 and amd64, other architectures need source code changes. 335 */ 336 337/* 338 * Lookup the identity of the issuer of the original request. 339 * In the current implementation we use the curthread of the 340 * issuer, but different mechanisms may be implemented later 341 * so we do not make assumptions on the return value which for 342 * us is just an opaque identifier. 343 */ 344 345static inline u_long 346g_sched_classify(struct bio *bp) 347{ 348 349#if __FreeBSD_version > 800098 350 /* we have classifier fields in the struct bio */ 351#define HAVE_BIO_CLASSIFIER 352 return ((u_long)bp->bio_classifier1); 353#else 354#warning old version!!! 355 while (bp->bio_parent != NULL) 356 bp = bp->bio_parent; 357 358 return ((u_long)bp->bio_caller1); 359#endif 360} 361 362/* Return the hash chain for the given key. */ 363static inline struct g_hash * 364g_sched_hash(struct g_sched_softc *sc, u_long key) 365{ 366 367 return (&sc->sc_hash[key & sc->sc_mask]); 368} 369 370/* 371 * Helper function for the children classes, which takes 372 * a geom and a bio and returns the private descriptor 373 * associated to the request. This involves fetching 374 * the classification field and [al]locating the 375 * corresponding entry in the hash table. 376 */ 377void * 378g_sched_get_class(struct g_geom *gp, struct bio *bp) 379{ 380 struct g_sched_softc *sc; 381 struct g_sched_class *gsc; 382 struct g_gsched *gsp; 383 struct g_hash *bucket; 384 u_long key; 385 386 sc = gp->softc; 387 key = g_sched_classify(bp); 388 bucket = g_sched_hash(sc, key); 389 LIST_FOREACH(gsc, bucket, gsc_clist) { 390 if (key == gsc->gsc_key) { 391 gsc->gsc_refs++; 392 return (gsc->gsc_priv); 393 } 394 } 395 396 gsp = sc->sc_gsched; 397 gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size, 398 M_GEOM_SCHED, M_NOWAIT | M_ZERO); 399 if (!gsc) 400 return (NULL); 401 402 if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) { 403 free(gsc, M_GEOM_SCHED); 404 return (NULL); 405 } 406 407 gsc->gsc_refs = 2; /* 1 for the hash table, 1 for the caller. */ 408 gsc->gsc_key = key; 409 LIST_INSERT_HEAD(bucket, gsc, gsc_clist); 410 411 gsc->gsc_expire = ticks + me.gs_expire_secs * hz; 412 413 return (gsc->gsc_priv); 414} 415 416/* 417 * Release a reference to the per-client descriptor, 418 */ 419void 420g_sched_put_class(struct g_geom *gp, void *priv) 421{ 422 struct g_sched_class *gsc; 423 struct g_sched_softc *sc; 424 425 gsc = g_sched_priv2class(priv); 426 gsc->gsc_expire = ticks + me.gs_expire_secs * hz; 427 428 if (--gsc->gsc_refs > 0) 429 return; 430 431 sc = gp->softc; 432 sc->sc_gsched->gs_fini_class(sc->sc_data, priv); 433 434 LIST_REMOVE(gsc, gsc_clist); 435 free(gsc, M_GEOM_SCHED); 436} 437 438static void 439g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask, 440 struct g_gsched *gsp, void *data) 441{ 442 struct g_sched_class *cp, *cp2; 443 int i; 444 445 if (!hp) 446 return; 447 448 if (data && gsp->gs_hash_unref) 449 gsp->gs_hash_unref(data); 450 451 for (i = 0; i < G_SCHED_HASH_SIZE; i++) { 452 LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2) 453 g_sched_put_class(gp, cp->gsc_priv); 454 } 455 456 hashdestroy(hp, M_GEOM_SCHED, mask); 457} 458 459static struct g_hash * 460g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags) 461{ 462 struct g_hash *hash; 463 464 if (gsp->gs_priv_size == 0) 465 return (NULL); 466 467 hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags); 468 469 return (hash); 470} 471 472static void 473g_sched_flush_classes(struct g_geom *gp) 474{ 475 struct g_sched_softc *sc; 476 struct g_sched_class *cp, *cp2; 477 int i; 478 479 sc = gp->softc; 480 481 if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0) 482 return; 483 484 for (i = 0; i < G_SCHED_HASH_SIZE; i++) { 485 LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) { 486 if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0) 487 g_sched_put_class(gp, cp->gsc_priv); 488 } 489 } 490 491 sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz; 492} 493 494/* 495 * Wait for the completion of any outstanding request. To ensure 496 * that this does not take forever the caller has to make sure that 497 * no new request enter the scehduler before calling us. 498 * 499 * Must be called with the gp mutex held and topology locked. 500 */ 501static int 502g_sched_wait_pending(struct g_geom *gp) 503{ 504 struct g_sched_softc *sc = gp->softc; 505 int endticks = ticks + hz; 506 507 g_topology_assert(); 508 509 while (sc->sc_pending && endticks - ticks >= 0) 510 msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4); 511 512 return (sc->sc_pending ? ETIMEDOUT : 0); 513} 514 515static int 516g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp) 517{ 518 struct g_sched_softc *sc = gp->softc; 519 int error; 520 521 /* Set the flushing flag: new bios will not enter the scheduler. */ 522 sc->sc_flags |= G_SCHED_FLUSHING; 523 524 g_sched_forced_dispatch(gp); 525 error = g_sched_wait_pending(gp); 526 if (error) 527 goto failed; 528 529 /* No more requests pending or in flight from the old gsp. */ 530 531 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data); 532 sc->sc_hash = NULL; 533 534 /* 535 * Avoid deadlock here by releasing the gp mutex and reacquiring 536 * it once done. It should be safe, since no reconfiguration or 537 * destruction can take place due to the geom topology lock; no 538 * new request can use the current sc_data since we flagged the 539 * geom as being flushed. 540 */ 541 g_sched_unlock(gp); 542 gsp->gs_fini(sc->sc_data); 543 g_sched_lock(gp); 544 545 sc->sc_gsched = NULL; 546 sc->sc_data = NULL; 547 g_gsched_unref(gsp); 548 549failed: 550 sc->sc_flags &= ~G_SCHED_FLUSHING; 551 552 return (error); 553} 554 555static int 556g_sched_remove(struct g_geom *gp, struct g_gsched *gsp) 557{ 558 int error; 559 560 g_sched_lock(gp); 561 error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */ 562 g_sched_unlock(gp); 563 564 return (error); 565} 566 567/* 568 * Support function for create/taste -- locate the desired 569 * algorithm and grab a reference to it. 570 */ 571static struct g_gsched * 572g_gsched_find(const char *name) 573{ 574 struct g_gsched *gsp = NULL; 575 576 mtx_lock(&me.gs_mtx); 577 LIST_FOREACH(gsp, &me.gs_scheds, glist) { 578 if (strcmp(name, gsp->gs_name) == 0) { 579 g_gsched_ref(gsp); 580 break; 581 } 582 } 583 mtx_unlock(&me.gs_mtx); 584 585 return (gsp); 586} 587 588/* 589 * Rebuild the list of scheduler names. 590 * To be called with me.gs_mtx lock held. 591 */ 592static void 593g_gsched_build_names(struct g_gsched *gsp) 594{ 595 int pos, l; 596 struct g_gsched *cur; 597 598 pos = 0; 599 LIST_FOREACH(cur, &me.gs_scheds, glist) { 600 l = strlen(cur->gs_name); 601 if (l + pos + 1 + 1 < sizeof(me.gs_names)) { 602 if (pos != 0) 603 me.gs_names[pos++] = ' '; 604 strcpy(me.gs_names + pos, cur->gs_name); 605 pos += l; 606 } 607 } 608 me.gs_names[pos] = '\0'; 609} 610 611/* 612 * Register or unregister individual scheduling algorithms. 613 */ 614static int 615g_gsched_register(struct g_gsched *gsp) 616{ 617 struct g_gsched *cur; 618 int error = 0; 619 620 mtx_lock(&me.gs_mtx); 621 LIST_FOREACH(cur, &me.gs_scheds, glist) { 622 if (strcmp(gsp->gs_name, cur->gs_name) == 0) 623 break; 624 } 625 if (cur != NULL) { 626 G_SCHED_DEBUG(0, "A scheduler named %s already" 627 "exists.", gsp->gs_name); 628 error = EEXIST; 629 } else { 630 LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist); 631 gsp->gs_refs = 1; 632 me.gs_sched_count++; 633 g_gsched_build_names(gsp); 634 } 635 mtx_unlock(&me.gs_mtx); 636 637 return (error); 638} 639 640struct g_gsched_unregparm { 641 struct g_gsched *gup_gsp; 642 int gup_error; 643}; 644 645static void 646g_gsched_unregister(void *arg, int flag) 647{ 648 struct g_gsched_unregparm *parm = arg; 649 struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp; 650 struct g_sched_softc *sc; 651 struct g_geom *gp, *gp_tmp; 652 int error; 653 654 parm->gup_error = 0; 655 656 g_topology_assert(); 657 658 if (flag == EV_CANCEL) 659 return; 660 661 mtx_lock(&me.gs_mtx); 662 663 LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) { 664 if (gp->class != &g_sched_class) 665 continue; /* Should not happen. */ 666 667 sc = gp->softc; 668 if (sc->sc_gsched == gsp) { 669 error = g_sched_remove(gp, gsp); 670 if (error) 671 goto failed; 672 } 673 } 674 675 LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) { 676 if (cur != gsp) 677 continue; 678 679 if (gsp->gs_refs != 1) { 680 G_SCHED_DEBUG(0, "%s still in use.", 681 gsp->gs_name); 682 parm->gup_error = EBUSY; 683 } else { 684 LIST_REMOVE(gsp, glist); 685 me.gs_sched_count--; 686 g_gsched_build_names(gsp); 687 } 688 break; 689 } 690 691 if (cur == NULL) { 692 G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name); 693 parm->gup_error = ENOENT; 694 } 695 696failed: 697 mtx_unlock(&me.gs_mtx); 698} 699 700static inline void 701g_gsched_global_init(void) 702{ 703 704 if (!me.gs_initialized) { 705 G_SCHED_DEBUG(0, "Initializing global data."); 706 mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF); 707 LIST_INIT(&me.gs_scheds); 708 gs_bioq_init(&me.gs_pending); 709 me.gs_initialized = 1; 710 } 711} 712 713/* 714 * Module event called when a scheduling algorithm module is loaded or 715 * unloaded. 716 */ 717int 718g_gsched_modevent(module_t mod, int cmd, void *arg) 719{ 720 struct g_gsched *gsp = arg; 721 struct g_gsched_unregparm parm; 722 int error; 723 724 G_SCHED_DEBUG(0, "Modevent %d.", cmd); 725 726 /* 727 * If the module is loaded at boot, the geom thread that calls 728 * g_sched_init() might actually run after g_gsched_modevent(), 729 * so make sure that the module is properly initialized. 730 */ 731 g_gsched_global_init(); 732 733 error = EOPNOTSUPP; 734 switch (cmd) { 735 case MOD_LOAD: 736 error = g_gsched_register(gsp); 737 G_SCHED_DEBUG(0, "Loaded module %s error %d.", 738 gsp->gs_name, error); 739 if (error == 0) 740 g_retaste(&g_sched_class); 741 break; 742 743 case MOD_UNLOAD: 744 parm.gup_gsp = gsp; 745 parm.gup_error = 0; 746 747 error = g_waitfor_event(g_gsched_unregister, 748 &parm, M_WAITOK, NULL); 749 if (error == 0) 750 error = parm.gup_error; 751 G_SCHED_DEBUG(0, "Unloaded module %s error %d.", 752 gsp->gs_name, error); 753 break; 754 }; 755 756 return (error); 757} 758 759#ifdef KTR 760#define TRC_BIO_EVENT(e, bp) g_sched_trace_bio_ ## e (bp) 761 762static inline char 763g_sched_type(struct bio *bp) 764{ 765 766 if (0 != (bp->bio_cmd & BIO_READ)) 767 return ('R'); 768 else if (0 != (bp->bio_cmd & BIO_WRITE)) 769 return ('W'); 770 return ('U'); 771} 772 773static inline void 774g_sched_trace_bio_START(struct bio *bp) 775{ 776 777 CTR5(KTR_GSCHED, "S %lu %c %lu/%lu %lu", g_sched_classify(bp), 778 g_sched_type(bp), bp->bio_offset / ULONG_MAX, 779 bp->bio_offset, bp->bio_length); 780} 781 782static inline void 783g_sched_trace_bio_DONE(struct bio *bp) 784{ 785 786 CTR5(KTR_GSCHED, "D %lu %c %lu/%lu %lu", g_sched_classify(bp), 787 g_sched_type(bp), bp->bio_offset / ULONG_MAX, 788 bp->bio_offset, bp->bio_length); 789} 790#else /* !KTR */ 791#define TRC_BIO_EVENT(e, bp) 792#endif /* !KTR */ 793 794/* 795 * g_sched_done() and g_sched_start() dispatch the geom requests to 796 * the scheduling algorithm in use. 797 */ 798static void 799g_sched_done(struct bio *bio) 800{ 801 struct g_geom *gp = bio->bio_caller2; 802 struct g_sched_softc *sc = gp->softc; 803 804 TRC_BIO_EVENT(DONE, bio); 805 806 KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done")); 807 808 g_sched_lock(gp); 809 810 g_sched_update_stats(bio); 811 sc->sc_gsched->gs_done(sc->sc_data, bio); 812 if (!--sc->sc_pending) 813 wakeup(gp); 814 815 g_sched_flush_classes(gp); 816 g_sched_unlock(gp); 817 818 g_std_done(bio); 819} 820 821static void 822g_sched_start(struct bio *bp) 823{ 824 struct g_geom *gp = bp->bio_to->geom; 825 struct g_sched_softc *sc = gp->softc; 826 struct bio *cbp; 827 828 TRC_BIO_EVENT(START, bp); 829 G_SCHED_LOGREQ(bp, "Request received."); 830 831 cbp = g_clone_bio(bp); 832 if (cbp == NULL) { 833 g_io_deliver(bp, ENOMEM); 834 return; 835 } 836 cbp->bio_done = g_sched_done; 837 cbp->bio_to = LIST_FIRST(&gp->provider); 838 KASSERT(cbp->bio_to != NULL, ("NULL provider")); 839 840 /* We only schedule reads and writes. */ 841 if (0 == (bp->bio_cmd & (BIO_READ | BIO_WRITE))) 842 goto bypass; 843 844 G_SCHED_LOGREQ(cbp, "Sending request."); 845 846 g_sched_lock(gp); 847 /* 848 * Call the algorithm's gs_start to queue the request in the 849 * scheduler. If gs_start fails then pass the request down, 850 * otherwise call g_sched_dispatch() which tries to push 851 * one or more requests down. 852 */ 853 if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) || 854 sc->sc_gsched->gs_start(sc->sc_data, cbp)) { 855 g_sched_unlock(gp); 856 goto bypass; 857 } 858 /* 859 * We use bio_caller1 to mark requests that are scheduled 860 * so make sure it is not NULL. 861 */ 862 if (cbp->bio_caller1 == NULL) 863 cbp->bio_caller1 = &me; /* anything not NULL */ 864 865 cbp->bio_caller2 = gp; 866 sc->sc_pending++; 867 868 /* Update general stats. */ 869 me.gs_in_flight++; 870 me.gs_requests++; 871 me.gs_bytes_in_flight += bp->bio_length; 872 if (bp->bio_cmd & BIO_WRITE) { 873 me.gs_writes_in_flight++; 874 me.gs_write_bytes_in_flight += bp->bio_length; 875 } 876 g_sched_dispatch(gp); 877 g_sched_unlock(gp); 878 return; 879 880bypass: 881 cbp->bio_done = g_std_done; 882 cbp->bio_caller1 = NULL; /* not scheduled */ 883 g_io_request(cbp, LIST_FIRST(&gp->consumer)); 884} 885 886/* 887 * The next few functions are the geom glue. 888 */ 889static void 890g_sched_orphan(struct g_consumer *cp) 891{ 892 893 g_topology_assert(); 894 g_sched_destroy(cp->geom, 1); 895} 896 897static int 898g_sched_access(struct g_provider *pp, int dr, int dw, int de) 899{ 900 struct g_geom *gp; 901 struct g_consumer *cp; 902 int error; 903 904 gp = pp->geom; 905 cp = LIST_FIRST(&gp->consumer); 906 error = g_access(cp, dr, dw, de); 907 908 return (error); 909} 910 911static void 912g_sched_temporary_start(struct bio *bio) 913{ 914 915 mtx_lock(&me.gs_mtx); 916 me.gs_npending++; 917 gs_bioq_disksort(&me.gs_pending, bio); 918 mtx_unlock(&me.gs_mtx); 919} 920 921static void 922g_sched_flush_pending(g_start_t *start) 923{ 924 struct bio *bp; 925 926 while ((bp = gs_bioq_takefirst(&me.gs_pending))) 927 start(bp); 928} 929 930static int 931g_insert_proxy(struct g_geom *gp, struct g_provider *newpp, 932 struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp) 933{ 934 struct g_sched_softc *sc = gp->softc; 935 g_start_t *saved_start, *flush = g_sched_start; 936 int error = 0, endticks = ticks + hz; 937 938 g_cancel_event(newpp); /* prevent taste() */ 939 /* copy private fields */ 940 newpp->private = pp->private; 941 newpp->index = pp->index; 942 943 /* Queue all the early requests coming for us. */ 944 me.gs_npending = 0; 945 saved_start = pp->geom->start; 946 dstgp->start = g_sched_temporary_start; 947 948 while (pp->nstart - pp->nend != me.gs_npending && 949 endticks - ticks >= 0) 950 tsleep(pp, PRIBIO, "-", hz/10); 951 952 if (pp->nstart - pp->nend != me.gs_npending) { 953 flush = saved_start; 954 error = ETIMEDOUT; 955 goto fail; 956 } 957 958 /* link pp to this geom */ 959 LIST_REMOVE(pp, provider); 960 pp->geom = gp; 961 LIST_INSERT_HEAD(&gp->provider, pp, provider); 962 963 /* 964 * replicate the counts from the parent in the 965 * new provider and consumer nodes 966 */ 967 cp->acr = newpp->acr = pp->acr; 968 cp->acw = newpp->acw = pp->acw; 969 cp->ace = newpp->ace = pp->ace; 970 sc->sc_flags |= G_SCHED_PROXYING; 971 972fail: 973 dstgp->start = saved_start; 974 975 g_sched_flush_pending(flush); 976 977 return (error); 978} 979 980/* 981 * Create a geom node for the device passed as *pp. 982 * If successful, add a reference to this gsp. 983 */ 984static int 985g_sched_create(struct gctl_req *req, struct g_class *mp, 986 struct g_provider *pp, struct g_gsched *gsp, int proxy) 987{ 988 struct g_sched_softc *sc = NULL; 989 struct g_geom *gp, *dstgp; 990 struct g_provider *newpp = NULL; 991 struct g_consumer *cp = NULL; 992 char name[64]; 993 int error; 994 995 g_topology_assert(); 996 997 snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX); 998 LIST_FOREACH(gp, &mp->geom, geom) { 999 if (strcmp(gp->name, name) == 0) { 1000 gctl_error(req, "Geom %s already exists.", 1001 name); 1002 return (EEXIST); 1003 } 1004 } 1005 1006 gp = g_new_geomf(mp, name); 1007 dstgp = proxy ? pp->geom : gp; /* where do we link the provider */ 1008 1009 sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); 1010 sc->sc_gsched = gsp; 1011 sc->sc_data = gsp->gs_init(gp); 1012 if (sc->sc_data == NULL) { 1013 error = ENOMEM; 1014 goto fail; 1015 } 1016 1017 sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK); 1018 1019 /* 1020 * Do not initialize the flush mechanism, will be initialized 1021 * on the first insertion on the hash table. 1022 */ 1023 1024 mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF); 1025 1026 gp->softc = sc; 1027 gp->start = g_sched_start; 1028 gp->orphan = g_sched_orphan; 1029 gp->access = g_sched_access; 1030 gp->dumpconf = g_sched_dumpconf; 1031 1032 newpp = g_new_providerf(dstgp, gp->name); 1033 newpp->mediasize = pp->mediasize; 1034 newpp->sectorsize = pp->sectorsize; 1035 1036 cp = g_new_consumer(gp); 1037 error = g_attach(cp, proxy ? newpp : pp); 1038 if (error != 0) { 1039 gctl_error(req, "Cannot attach to provider %s.", 1040 pp->name); 1041 goto fail; 1042 } 1043 1044 g_error_provider(newpp, 0); 1045 if (proxy) { 1046 error = g_insert_proxy(gp, newpp, dstgp, pp, cp); 1047 if (error) 1048 goto fail; 1049 } 1050 G_SCHED_DEBUG(0, "Device %s created.", gp->name); 1051 1052 g_gsched_ref(gsp); 1053 1054 return (0); 1055 1056fail: 1057 if (cp != NULL) { 1058 if (cp->provider != NULL) 1059 g_detach(cp); 1060 g_destroy_consumer(cp); 1061 } 1062 if (newpp != NULL) 1063 g_destroy_provider(newpp); 1064 if (sc->sc_hash) 1065 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, 1066 gsp, sc->sc_data); 1067 if (sc->sc_data) 1068 gsp->gs_fini(sc->sc_data); 1069 g_free(gp->softc); 1070 g_destroy_geom(gp); 1071 1072 return (error); 1073} 1074 1075/* 1076 * Support for dynamic switching of scheduling algorithms. 1077 * First initialize the data structures for the new algorithm, 1078 * then call g_sched_remove_locked() to flush all references 1079 * to the old one, finally link the new algorithm. 1080 */ 1081static int 1082g_sched_change_algo(struct gctl_req *req, struct g_class *mp, 1083 struct g_provider *pp, struct g_gsched *gsp) 1084{ 1085 struct g_sched_softc *sc; 1086 struct g_geom *gp; 1087 struct g_hash *newh; 1088 void *data; 1089 u_long mask; 1090 int error = 0; 1091 1092 gp = pp->geom; 1093 sc = gp->softc; 1094 1095 data = gsp->gs_init(gp); 1096 if (data == NULL) 1097 return (ENOMEM); 1098 1099 newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK); 1100 if (gsp->gs_priv_size && !newh) { 1101 error = ENOMEM; 1102 goto fail; 1103 } 1104 1105 g_sched_lock(gp); 1106 if (sc->sc_gsched) { /* can be NULL in some cases */ 1107 error = g_sched_remove_locked(gp, sc->sc_gsched); 1108 if (error) 1109 goto fail; 1110 } 1111 1112 g_gsched_ref(gsp); 1113 sc->sc_gsched = gsp; 1114 sc->sc_data = data; 1115 sc->sc_hash = newh; 1116 sc->sc_mask = mask; 1117 1118 g_sched_unlock(gp); 1119 1120 return (0); 1121 1122fail: 1123 if (newh) 1124 g_sched_hash_fini(gp, newh, mask, gsp, data); 1125 1126 if (data) 1127 gsp->gs_fini(data); 1128 1129 g_sched_unlock(gp); 1130 1131 return (error); 1132} 1133 1134/* 1135 * Stop the request flow directed to the proxy, redirecting the new 1136 * requests to the me.gs_pending queue. 1137 */ 1138static struct g_provider * 1139g_detach_proxy(struct g_geom *gp) 1140{ 1141 struct g_consumer *cp; 1142 struct g_provider *pp, *newpp; 1143 1144 do { 1145 pp = LIST_FIRST(&gp->provider); 1146 if (pp == NULL) 1147 break; 1148 cp = LIST_FIRST(&gp->consumer); 1149 if (cp == NULL) 1150 break; 1151 newpp = cp->provider; 1152 if (newpp == NULL) 1153 break; 1154 1155 me.gs_npending = 0; 1156 pp->geom->start = g_sched_temporary_start; 1157 1158 return (pp); 1159 } while (0); 1160 printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name); 1161 1162 return (NULL); 1163} 1164 1165static void 1166g_sched_blackhole(struct bio *bp) 1167{ 1168 1169 g_io_deliver(bp, ENXIO); 1170} 1171 1172static inline void 1173g_reparent_provider(struct g_provider *pp, struct g_geom *gp, 1174 struct g_provider *newpp) 1175{ 1176 1177 LIST_REMOVE(pp, provider); 1178 if (newpp) { 1179 pp->private = newpp->private; 1180 pp->index = newpp->index; 1181 } 1182 pp->geom = gp; 1183 LIST_INSERT_HEAD(&gp->provider, pp, provider); 1184} 1185 1186static inline void 1187g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp) 1188{ 1189 struct g_geom *gp = oldpp->geom; 1190 1191 g_reparent_provider(oldpp, newpp->geom, newpp); 1192 1193 /* 1194 * Hackish: let the system destroy the old provider for us, just 1195 * in case someone attached a consumer to it, in which case a 1196 * direct call to g_destroy_provider() would not work. 1197 */ 1198 g_reparent_provider(newpp, gp, NULL); 1199} 1200 1201/* 1202 * Complete the proxy destruction, linking the old provider to its 1203 * original geom, and destroying the proxy provider. Also take care 1204 * of issuing the pending requests collected in me.gs_pending (if any). 1205 */ 1206static int 1207g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp) 1208{ 1209 struct g_consumer *cp; 1210 struct g_provider *newpp; 1211 1212 do { 1213 cp = LIST_FIRST(&gp->consumer); 1214 if (cp == NULL) 1215 break; 1216 newpp = cp->provider; 1217 if (newpp == NULL) 1218 break; 1219 1220 /* Relink the provider to its original geom. */ 1221 g_unproxy_provider(oldpp, newpp); 1222 1223 /* Detach consumer from provider, and destroy provider. */ 1224 cp->acr = newpp->acr = 0; 1225 cp->acw = newpp->acw = 0; 1226 cp->ace = newpp->ace = 0; 1227 g_detach(cp); 1228 1229 /* Send the pending bios through the right start function. */ 1230 g_sched_flush_pending(oldpp->geom->start); 1231 1232 return (0); 1233 } while (0); 1234 printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name); 1235 1236 /* We cannot send the pending bios anywhere... */ 1237 g_sched_flush_pending(g_sched_blackhole); 1238 1239 return (EINVAL); 1240} 1241 1242static int 1243g_sched_destroy(struct g_geom *gp, boolean_t force) 1244{ 1245 struct g_provider *pp, *oldpp = NULL; 1246 struct g_sched_softc *sc; 1247 struct g_gsched *gsp; 1248 int error; 1249 1250 g_topology_assert(); 1251 sc = gp->softc; 1252 if (sc == NULL) 1253 return (ENXIO); 1254 if (!(sc->sc_flags & G_SCHED_PROXYING)) { 1255 pp = LIST_FIRST(&gp->provider); 1256 if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { 1257 const char *msg = force ? 1258 "but we force removal" : "cannot remove"; 1259 1260 G_SCHED_DEBUG(!force, 1261 "Device %s is still open (r%dw%de%d), %s.", 1262 pp->name, pp->acr, pp->acw, pp->ace, msg); 1263 if (!force) 1264 return (EBUSY); 1265 } else { 1266 G_SCHED_DEBUG(0, "Device %s removed.", gp->name); 1267 } 1268 } else 1269 oldpp = g_detach_proxy(gp); 1270 1271 gsp = sc->sc_gsched; 1272 if (gsp) { 1273 /* 1274 * XXX bad hack here: force a dispatch to release 1275 * any reference to the hash table still held by 1276 * the scheduler. 1277 */ 1278 g_sched_lock(gp); 1279 /* 1280 * We are dying here, no new requests should enter 1281 * the scheduler. This is granted by the topolgy, 1282 * either in case we were proxying (new bios are 1283 * being redirected) or not (see the access check 1284 * above). 1285 */ 1286 g_sched_forced_dispatch(gp); 1287 error = g_sched_wait_pending(gp); 1288 1289 if (error) { 1290 /* 1291 * Not all the requests came home: this might happen 1292 * under heavy load, or if we were waiting for any 1293 * bio which is served in the event path (see 1294 * geom_slice.c for an example of how this can 1295 * happen). Try to restore a working configuration 1296 * if we can fail. 1297 */ 1298 if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { 1299 g_sched_flush_pending(force ? 1300 g_sched_blackhole : g_sched_start); 1301 } 1302 1303 /* 1304 * In the forced destroy case there is not so much 1305 * we can do, we have pending bios that will call 1306 * g_sched_done() somehow, and we don't want them 1307 * to crash the system using freed memory. We tell 1308 * the user that something went wrong, and leak some 1309 * memory here. 1310 * Note: the callers using force = 1 ignore the 1311 * return value. 1312 */ 1313 if (force) { 1314 G_SCHED_DEBUG(0, "Pending requests while " 1315 " destroying geom, some memory leaked."); 1316 } 1317 1318 return (error); 1319 } 1320 1321 g_sched_unlock(gp); 1322 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, 1323 gsp, sc->sc_data); 1324 sc->sc_hash = NULL; 1325 gsp->gs_fini(sc->sc_data); 1326 g_gsched_unref(gsp); 1327 sc->sc_gsched = NULL; 1328 } 1329 1330 if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { 1331 error = g_destroy_proxy(gp, oldpp); 1332 1333 if (error) { 1334 if (force) { 1335 G_SCHED_DEBUG(0, "Unrecoverable error while " 1336 "destroying a proxy geom, leaking some " 1337 " memory."); 1338 } 1339 1340 return (error); 1341 } 1342 } 1343 1344 mtx_destroy(&sc->sc_mtx); 1345 1346 g_free(gp->softc); 1347 gp->softc = NULL; 1348 g_wither_geom(gp, ENXIO); 1349 1350 return (error); 1351} 1352 1353static int 1354g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp, 1355 struct g_geom *gp) 1356{ 1357 1358 return (g_sched_destroy(gp, 0)); 1359} 1360 1361/* 1362 * Functions related to the classification of requests. 1363 * 1364 * On recent FreeBSD versions (8.0 and above), we store a reference 1365 * to the issuer of a request in bp->bio_classifier1 as soon 1366 * as the bio is posted to the geom queue (and not later, because 1367 * requests are managed by the g_down thread afterwards). 1368 * 1369 * On older versions of the system (but this code is not used 1370 * in any existing release), we [ab]use the caller1 field in the 1371 * root element of the bio tree to store the classification info. 1372 * The marking is done at the beginning of g_io_request() 1373 * and only if we find that the field is NULL. 1374 * 1375 * To avoid rebuilding the kernel, this module will patch the 1376 * initial part of g_io_request() so it jumps to some hand-coded 1377 * assembly that does the marking and then executes the original 1378 * body of g_io_request(). 1379 * 1380 * fake_ioreq[] is architecture-specific machine code 1381 * that implements the above. CODE_SIZE, STORE_SIZE etc. 1382 * are constants used in the patching routine. Look at the 1383 * code in g_ioreq_patch() for the details. 1384 */ 1385 1386#ifndef HAVE_BIO_CLASSIFIER 1387/* 1388 * Support for old FreeBSD versions 1389 */ 1390#if defined(__i386__) 1391#define CODE_SIZE 29 1392#define STORE_SIZE 5 1393#define EPILOGUE 5 1394#define SIZE (CODE_SIZE + STORE_SIZE + EPILOGUE) 1395 1396static u_char fake_ioreq[SIZE] = { 1397 0x8b, 0x44, 0x24, 0x04, /* mov bp, %eax */ 1398 /* 1: */ 1399 0x89, 0xc2, /* mov %eax, %edx # edx = bp */ 1400 0x8b, 0x40, 0x64, /* mov bp->bio_parent, %eax */ 1401 0x85, 0xc0, /* test %eax, %eax */ 1402 0x75, 0xf7, /* jne 1b */ 1403 0x8b, 0x42, 0x30, /* mov bp->bp_caller1, %eax */ 1404 0x85, 0xc0, /* test %eax, %eax */ 1405 0x75, 0x09, /* jne 2f */ 1406 0x64, 0xa1, 0x00, 0x00, /* mov %fs:0, %eax */ 1407 0x00, 0x00, 1408 0x89, 0x42, 0x30, /* mov %eax, bp->bio_caller1 */ 1409 /* 2: */ 1410 0x55, 0x89, 0xe5, 0x57, 0x56, 1411 0xe9, 0x00, 0x00, 0x00, 0x00, /* jmp back... */ 1412}; 1413#elif defined(__amd64) 1414#define CODE_SIZE 38 1415#define STORE_SIZE 6 1416#define EPILOGUE 5 1417#define SIZE (CODE_SIZE + STORE_SIZE + EPILOGUE) 1418 1419static u_char fake_ioreq[SIZE] = { 1420 0x48, 0x89, 0xf8, /* mov bp, %rax */ 1421 /* 1: */ 1422 0x48, 0x89, 0xc2, /* mov %rax, %rdx # rdx = bp */ 1423 0x48, 0x8b, 0x82, 0xa8, /* mov bp->bio_parent, %rax */ 1424 0x00, 0x00, 0x00, 1425 0x48, 0x85, 0xc0, /* test %rax, %rax */ 1426 0x75, 0xf1, /* jne 1b */ 1427 0x48, 0x83, 0x7a, 0x58, /* cmp $0, bp->bp_caller1 */ 1428 0x00, 1429 0x75, 0x0d, /* jne 2f */ 1430 0x65, 0x48, 0x8b, 0x04, /* mov %gs:0, %rax */ 1431 0x25, 0x00, 0x00, 0x00, 1432 0x00, 1433 0x48, 0x89, 0x42, 0x58, /* mov %rax, bp->bio_caller1 */ 1434 /* 2: */ 1435 0x55, 0x48, 0x89, 0xe5, 0x41, 0x56, 1436 0xe9, 0x00, 0x00, 0x00, 0x00, /* jmp back... */ 1437}; 1438#else /* neither x86 nor amd64 */ 1439static void 1440g_new_io_request(struct bio *bp, struct g_consumer *cp) 1441{ 1442 struct bio *top = bp; 1443 1444 /* 1445 * bio classification: if bio_caller1 is available in the 1446 * root of the 'struct bio' tree, store there the thread id 1447 * of the thread that originated the request. 1448 * More sophisticated classification schemes can be used. 1449 */ 1450 while (top->bio_parent) 1451 top = top->bio_parent; 1452 1453 if (top->bio_caller1 == NULL) 1454 top->bio_caller1 = curthread; 1455} 1456 1457#error please add the code above in g_new_io_request() to the beginning of \ 1458 /sys/geom/geom_io.c::g_io_request(), and remove this line. 1459#endif /* end of arch-specific code */ 1460 1461static int 1462g_ioreq_patch(void) 1463{ 1464 u_char *original; 1465 u_long ofs; 1466 int found; 1467 1468 if (me.gs_patched) 1469 return (-1); 1470 1471 original = (u_char *)g_io_request; 1472 1473 found = !bcmp(original, fake_ioreq + CODE_SIZE, STORE_SIZE); 1474 if (!found) 1475 return (-1); 1476 1477 /* Jump back to the original + STORE_SIZE. */ 1478 ofs = (original + STORE_SIZE) - (fake_ioreq + SIZE); 1479 bcopy(&ofs, fake_ioreq + CODE_SIZE + STORE_SIZE + 1, 4); 1480 1481 /* Patch the original address with a jump to the trampoline. */ 1482 *original = 0xe9; /* jump opcode */ 1483 ofs = fake_ioreq - (original + 5); 1484 bcopy(&ofs, original + 1, 4); 1485 1486 me.gs_patched = 1; 1487 1488 return (0); 1489} 1490 1491/* 1492 * Restore the original code, this is easy. 1493 */ 1494static void 1495g_ioreq_restore(void) 1496{ 1497 u_char *original; 1498 1499 if (me.gs_patched) { 1500 original = (u_char *)g_io_request; 1501 bcopy(fake_ioreq + CODE_SIZE, original, STORE_SIZE); 1502 me.gs_patched = 0; 1503 } 1504} 1505 1506static inline void 1507g_classifier_ini(void) 1508{ 1509 1510 g_ioreq_patch(); 1511} 1512 1513static inline void 1514g_classifier_fini(void) 1515{ 1516 1517 g_ioreq_restore(); 1518} 1519 1520/*--- end of support code for older FreeBSD versions */ 1521 1522#else /* HAVE_BIO_CLASSIFIER */ 1523 1524/* 1525 * Classifier support for recent FreeBSD versions: we use 1526 * a very simple classifier, only use curthread to tag a request. 1527 * The classifier is registered at module load, and unregistered 1528 * at module unload. 1529 */ 1530static int 1531g_sched_tag(void *arg, struct bio *bp) 1532{ 1533 1534 bp->bio_classifier1 = curthread; 1535 return (1); 1536} 1537 1538static struct g_classifier_hook g_sched_classifier = { 1539 .func = g_sched_tag, 1540}; 1541 1542static inline void 1543g_classifier_ini(void) 1544{ 1545 1546 g_register_classifier(&g_sched_classifier); 1547} 1548 1549static inline void 1550g_classifier_fini(void) 1551{ 1552 1553 g_unregister_classifier(&g_sched_classifier); 1554} 1555#endif /* HAVE_BIO_CLASSIFIER */ 1556 1557static void 1558g_sched_init(struct g_class *mp) 1559{ 1560 1561 g_gsched_global_init(); 1562 1563 G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.", 1564 mp, &g_sched_class); 1565 1566 /* Patch g_io_request to store classification info in the bio. */ 1567 g_classifier_ini(); 1568} 1569 1570static void 1571g_sched_fini(struct g_class *mp) 1572{ 1573 1574 g_classifier_fini(); 1575 1576 G_SCHED_DEBUG(0, "Unloading..."); 1577 1578 KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers")); 1579 mtx_destroy(&me.gs_mtx); 1580} 1581 1582static int 1583g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag, 1584 struct thread *td) 1585{ 1586 struct g_consumer *cp; 1587 struct g_geom *gp; 1588 1589 cp = LIST_FIRST(&pp->geom->consumer); 1590 if (cp == NULL) 1591 return (ENOIOCTL); 1592 gp = cp->provider->geom; 1593 if (gp->ioctl == NULL) 1594 return (ENOIOCTL); 1595 return (gp->ioctl(cp->provider, cmd, data, fflag, td)); 1596} 1597 1598/* 1599 * Read the i-th argument for a request, skipping the /dev/ 1600 * prefix if present. 1601 */ 1602static const char * 1603g_sched_argi(struct gctl_req *req, int i) 1604{ 1605 static const char *dev_prefix = "/dev/"; 1606 const char *name; 1607 char param[16]; 1608 int l = strlen(dev_prefix); 1609 1610 snprintf(param, sizeof(param), "arg%d", i); 1611 name = gctl_get_asciiparam(req, param); 1612 if (name == NULL) 1613 gctl_error(req, "No 'arg%d' argument", i); 1614 else if (strncmp(name, dev_prefix, l) == 0) 1615 name += l; 1616 return (name); 1617} 1618 1619/* 1620 * Fetch nargs and do appropriate checks. 1621 */ 1622static int 1623g_sched_get_nargs(struct gctl_req *req) 1624{ 1625 int *nargs; 1626 1627 nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); 1628 if (nargs == NULL) { 1629 gctl_error(req, "No 'nargs' argument"); 1630 return (0); 1631 } 1632 if (*nargs <= 0) 1633 gctl_error(req, "Missing device(s)."); 1634 return (*nargs); 1635} 1636 1637/* 1638 * Check whether we should add the class on certain volumes when 1639 * this geom is created. Right now this is under control of a kenv 1640 * variable containing the names of all devices that we care about. 1641 * Probably we should only support transparent insertion as the 1642 * preferred mode of operation. 1643 */ 1644static struct g_geom * 1645g_sched_taste(struct g_class *mp, struct g_provider *pp, 1646 int flags __unused) 1647{ 1648 struct g_gsched *gsp = NULL; /* the . algorithm we want */ 1649 const char *s; /* generic string pointer */ 1650 const char *taste_names; /* devices we like */ 1651 int l; 1652 1653 g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, 1654 mp->name, pp->name); 1655 g_topology_assert(); 1656 1657 G_SCHED_DEBUG(2, "Tasting %s.", pp->name); 1658 1659 do { 1660 /* do not taste on ourselves */ 1661 if (pp->geom->class == mp) 1662 break; 1663 1664 taste_names = getenv("geom.sched.taste"); 1665 if (taste_names == NULL) 1666 break; 1667 1668 l = strlen(pp->name); 1669 for (s = taste_names; *s && 1670 (s = strstr(s, pp->name)); s++) { 1671 /* further checks for an exact match */ 1672 if ( (s == taste_names || s[-1] == ' ') && 1673 (s[l] == '\0' || s[l] == ' ') ) 1674 break; 1675 } 1676 if (s == NULL) 1677 break; 1678 G_SCHED_DEBUG(0, "Attach device %s match [%s]\n", 1679 pp->name, s); 1680 1681 /* look up the provider name in the list */ 1682 s = getenv("geom.sched.algo"); 1683 if (s == NULL) 1684 s = "rr"; 1685 1686 gsp = g_gsched_find(s); /* also get a reference */ 1687 if (gsp == NULL) { 1688 G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s); 1689 break; 1690 } 1691 1692 /* XXX create with 1 as last argument ? */ 1693 g_sched_create(NULL, mp, pp, gsp, 0); 1694 g_gsched_unref(gsp); 1695 } while (0); 1696 return NULL; 1697} 1698 1699static void 1700g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy) 1701{ 1702 struct g_provider *pp; 1703 struct g_gsched *gsp; 1704 const char *name; 1705 int i, nargs; 1706 1707 g_topology_assert(); 1708 1709 name = gctl_get_asciiparam(req, "algo"); 1710 if (name == NULL) { 1711 gctl_error(req, "No '%s' argument", "algo"); 1712 return; 1713 } 1714 1715 gsp = g_gsched_find(name); /* also get a reference */ 1716 if (gsp == NULL) { 1717 gctl_error(req, "Bad algorithm '%s'", name); 1718 return; 1719 } 1720 1721 nargs = g_sched_get_nargs(req); 1722 1723 /* 1724 * Run on the arguments, and break on any error. 1725 * We look for a device name, but skip the /dev/ prefix if any. 1726 */ 1727 for (i = 0; i < nargs; i++) { 1728 name = g_sched_argi(req, i); 1729 if (name == NULL) 1730 break; 1731 pp = g_provider_by_name(name); 1732 if (pp == NULL) { 1733 G_SCHED_DEBUG(1, "Provider %s is invalid.", name); 1734 gctl_error(req, "Provider %s is invalid.", name); 1735 break; 1736 } 1737 if (g_sched_create(req, mp, pp, gsp, proxy) != 0) 1738 break; 1739 } 1740 1741 g_gsched_unref(gsp); 1742} 1743 1744static void 1745g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp) 1746{ 1747 struct g_provider *pp; 1748 struct g_gsched *gsp; 1749 const char *name; 1750 int i, nargs; 1751 1752 g_topology_assert(); 1753 1754 name = gctl_get_asciiparam(req, "algo"); 1755 if (name == NULL) { 1756 gctl_error(req, "No '%s' argument", "algo"); 1757 return; 1758 } 1759 1760 gsp = g_gsched_find(name); /* also get a reference */ 1761 if (gsp == NULL) { 1762 gctl_error(req, "Bad algorithm '%s'", name); 1763 return; 1764 } 1765 1766 nargs = g_sched_get_nargs(req); 1767 1768 /* 1769 * Run on the arguments, and break on any error. 1770 * We look for a device name, but skip the /dev/ prefix if any. 1771 */ 1772 for (i = 0; i < nargs; i++) { 1773 name = g_sched_argi(req, i); 1774 if (name == NULL) 1775 break; 1776 pp = g_provider_by_name(name); 1777 if (pp == NULL || pp->geom->class != mp) { 1778 G_SCHED_DEBUG(1, "Provider %s is invalid.", name); 1779 gctl_error(req, "Provider %s is invalid.", name); 1780 break; 1781 } 1782 if (g_sched_change_algo(req, mp, pp, gsp) != 0) 1783 break; 1784 } 1785 1786 g_gsched_unref(gsp); 1787} 1788 1789static struct g_geom * 1790g_sched_find_geom(struct g_class *mp, const char *name) 1791{ 1792 struct g_geom *gp; 1793 1794 LIST_FOREACH(gp, &mp->geom, geom) { 1795 if (strcmp(gp->name, name) == 0) 1796 return (gp); 1797 } 1798 return (NULL); 1799} 1800 1801static void 1802g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp) 1803{ 1804 int nargs, *force, error, i; 1805 struct g_geom *gp; 1806 const char *name; 1807 1808 g_topology_assert(); 1809 1810 nargs = g_sched_get_nargs(req); 1811 1812 force = gctl_get_paraml(req, "force", sizeof(*force)); 1813 if (force == NULL) { 1814 gctl_error(req, "No 'force' argument"); 1815 return; 1816 } 1817 1818 for (i = 0; i < nargs; i++) { 1819 name = g_sched_argi(req, i); 1820 if (name == NULL) 1821 break; 1822 1823 gp = g_sched_find_geom(mp, name); 1824 if (gp == NULL) { 1825 G_SCHED_DEBUG(1, "Device %s is invalid.", name); 1826 gctl_error(req, "Device %s is invalid.", name); 1827 break; 1828 } 1829 1830 error = g_sched_destroy(gp, *force); 1831 if (error != 0) { 1832 gctl_error(req, "Cannot destroy device %s (error=%d).", 1833 gp->name, error); 1834 break; 1835 } 1836 } 1837} 1838 1839static void 1840g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb) 1841{ 1842 uint32_t *version; 1843 1844 g_topology_assert(); 1845 1846 version = gctl_get_paraml(req, "version", sizeof(*version)); 1847 if (version == NULL) { 1848 gctl_error(req, "No '%s' argument.", "version"); 1849 return; 1850 } 1851 1852 if (*version != G_SCHED_VERSION) { 1853 gctl_error(req, "Userland and kernel parts are " 1854 "out of sync."); 1855 return; 1856 } 1857 1858 if (strcmp(verb, "create") == 0) { 1859 g_sched_ctl_create(req, mp, 0); 1860 return; 1861 } else if (strcmp(verb, "insert") == 0) { 1862 g_sched_ctl_create(req, mp, 1); 1863 return; 1864 } else if (strcmp(verb, "configure") == 0) { 1865 g_sched_ctl_configure(req, mp); 1866 return; 1867 } else if (strcmp(verb, "destroy") == 0) { 1868 g_sched_ctl_destroy(req, mp); 1869 return; 1870 } 1871 1872 gctl_error(req, "Unknown verb."); 1873} 1874 1875static void 1876g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, 1877 struct g_consumer *cp, struct g_provider *pp) 1878{ 1879 struct g_sched_softc *sc = gp->softc; 1880 struct g_gsched *gsp = sc->sc_gsched; 1881 if (indent == NULL) { /* plaintext */ 1882 sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--"); 1883 } 1884 if (gsp != NULL && gsp->gs_dumpconf) 1885 gsp->gs_dumpconf(sb, indent, gp, cp, pp); 1886} 1887 1888DECLARE_GEOM_CLASS(g_sched_class, g_sched); 1889MODULE_VERSION(geom_sched, 0); 1890