g_sched.c revision 206497
1162674Spiso/*- 2162674Spiso * Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo 3162674Spiso * All rights reserved. 4162674Spiso * 5162674Spiso * Redistribution and use in source and binary forms, with or without 6162674Spiso * modification, are permitted provided that the following conditions 7162674Spiso * are met: 8162674Spiso * 1. Redistributions of source code must retain the above copyright 9162674Spiso * notice, this list of conditions and the following disclaimer. 10162674Spiso * 2. Redistributions in binary form must reproduce the above copyright 11162674Spiso * notice, this list of conditions and the following disclaimer in the 12162674Spiso * documentation and/or other materials provided with the distribution. 13162674Spiso * 14162674Spiso * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 15162674Spiso * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16162674Spiso * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17162674Spiso * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 18162674Spiso * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19162674Spiso * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20162674Spiso * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21162674Spiso * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22162674Spiso * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23162674Spiso * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24162674Spiso * SUCH DAMAGE. 25162674Spiso */ 26162674Spiso 27162674Spiso/* 28162674Spiso * $Id$ 29162674Spiso * $FreeBSD: head/sys/geom/sched/g_sched.c 206497 2010-04-12 16:37:45Z luigi $ 30162674Spiso * 31162674Spiso * Main control module for geom-based disk schedulers ('sched'). 32162674Spiso * 33162674Spiso * USER VIEW 34162674Spiso * A 'sched' node is typically inserted transparently between 35162674Spiso * an existing provider pp and its original geom gp 36162674Spiso * 37162674Spiso * [pp --> gp ..] 38162674Spiso * 39162674Spiso * using the command "geom sched insert <provider>" and 40162674Spiso * resulting in the following topology 41162674Spiso * 42162674Spiso * [pp --> sched_gp --> cp] [new_pp --> gp ... ] 43162674Spiso * 44162674Spiso * Deletion "geom sched destroy <provider>.sched." restores the 45162674Spiso * original chain. The normal "geom sched create <provide>" 46162674Spiso * is also supported. 47162674Spiso * 48162674Spiso * INTERNALS 49162674Spiso * Internally, the 'sched' uses the following data structures 50162674Spiso * 51162674Spiso * geom{} g_sched_softc{} g_gsched{} 52162674Spiso * +----------+ +---------------+ +-------------+ 53162674Spiso * | softc *-|--->| sc_gsched *-|-->| gs_init | 54162674Spiso * | ... | | | | gs_fini | 55162674Spiso * | | | [ hash table] | | gs_start | 56162674Spiso * +----------+ | | | ... | 57162674Spiso * | | +-------------+ 58162674Spiso * | | 59162674Spiso * | | g_*_softc{} 60162674Spiso * | | +-------------+ 61162674Spiso * | sc_data *-|-->| | 62162674Spiso * +---------------+ | algorithm- | 63162674Spiso * | specific | 64162674Spiso * +-------------+ 65162674Spiso * 66162674Spiso * A g_sched_softc{} is created with a "geom sched insert" call. 67162674Spiso * In turn this instantiates a specific scheduling algorithm, 68162674Spiso * which sets sc_gsched to point to the algorithm callbacks, 69162674Spiso * and calls gs_init() to create the g_*_softc{} . 70162674Spiso * The other callbacks (gs_start, gs_next, ...) are invoked 71162674Spiso * as needed 72162674Spiso * 73162674Spiso * g_sched_softc{} is defined in g_sched.h and mostly used here; 74162674Spiso * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h; 75162674Spiso * g_*_softc{} is defined/implemented by each algorithm (gs_*.c) 76162674Spiso * 77162674Spiso * DATA MOVING 78162674Spiso * When a bio is received on the provider, it goes to the 79162674Spiso * g_sched_start() which calls gs_start() to initially queue it; 80162674Spiso * then we call g_sched_dispatch() that loops around gs_next() 81162674Spiso * to select zero or more bio's to be sent downstream. 82162674Spiso * 83162674Spiso * g_sched_dispatch() can also be called as a result of a timeout, 84162674Spiso * e.g. when doing anticipation or pacing requests. 85162674Spiso * 86162674Spiso * When a bio comes back, it goes to g_sched_done() which in turn 87162674Spiso * calls gs_done(). The latter does any necessary housekeeping in 88162674Spiso * the scheduling algorithm, and may decide to call g_sched_dispatch() 89162674Spiso * to send more bio's downstream. 90162674Spiso * 91162674Spiso * If an algorithm needs per-flow queues, these are created 92162674Spiso * calling gs_init_class() and destroyed with gs_fini_class(), 93162674Spiso * and they are also inserted in the hash table implemented in 94162674Spiso * the g_sched_softc{} 95162674Spiso * 96162674Spiso * If an algorithm is replaced, or a transparently-inserted node is 97162674Spiso * removed with "geom sched destroy", we need to remove all references 98162674Spiso * to the g_*_softc{} and g_sched_softc from the bio's still in 99162674Spiso * the scheduler. g_sched_forced_dispatch() helps doing this. 100162674Spiso * XXX need to explain better. 101162674Spiso */ 102162674Spiso 103162674Spiso#include <sys/cdefs.h> 104162674Spiso#include <sys/param.h> 105162674Spiso#include <sys/systm.h> 106162674Spiso#include <sys/kernel.h> 107162674Spiso#include <sys/module.h> 108162674Spiso#include <sys/lock.h> 109162674Spiso#include <sys/mutex.h> 110162674Spiso#include <sys/bio.h> 111162674Spiso#include <sys/limits.h> 112162674Spiso#include <sys/hash.h> 113162674Spiso#include <sys/sysctl.h> 114162674Spiso#include <sys/malloc.h> 115162674Spiso#include <sys/proc.h> /* we access curthread */ 116162674Spiso#include <geom/geom.h> 117162674Spiso#include "gs_scheduler.h" 118162674Spiso#include "g_sched.h" /* geom hooks */ 119162674Spiso 120162674Spiso/* 121162674Spiso * Size of the per-geom hash table storing traffic classes. 122162674Spiso * We may decide to change it at a later time, it has no ABI 123162674Spiso * implications as it is only used for run-time allocations. 124162674Spiso */ 125162674Spiso#define G_SCHED_HASH_SIZE 32 126162674Spiso 127162674Spisostatic int g_sched_destroy(struct g_geom *gp, boolean_t force); 128162674Spisostatic int g_sched_destroy_geom(struct gctl_req *req, 129162674Spiso struct g_class *mp, struct g_geom *gp); 130162674Spisostatic void g_sched_config(struct gctl_req *req, struct g_class *mp, 131162674Spiso const char *verb); 132162674Spisostatic struct g_geom *g_sched_taste(struct g_class *mp, 133162674Spiso struct g_provider *pp, int flags __unused); 134162674Spisostatic void g_sched_dumpconf(struct sbuf *sb, const char *indent, 135162674Spiso struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); 136162674Spisostatic void g_sched_init(struct g_class *mp); 137162674Spisostatic void g_sched_fini(struct g_class *mp); 138162674Spiso 139162674Spisostruct g_class g_sched_class = { 140162674Spiso .name = G_SCHED_CLASS_NAME, 141162674Spiso .version = G_VERSION, 142162674Spiso .ctlreq = g_sched_config, 143162674Spiso .taste = g_sched_taste, 144162674Spiso .destroy_geom = g_sched_destroy_geom, 145162674Spiso .init = g_sched_init, 146162674Spiso .fini = g_sched_fini 147162674Spiso}; 148162674Spiso 149162674SpisoMALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures"); 150162674Spiso 151162674Spiso/* 152162674Spiso * Global variables describing the state of the geom_sched module. 153162674Spiso * There is only one static instance of this structure. 154 */ 155LIST_HEAD(gs_list, g_gsched); /* type, link field */ 156struct geom_sched_vars { 157 struct mtx gs_mtx; 158 struct gs_list gs_scheds; /* list of algorithms */ 159 u_int gs_debug; 160 u_int gs_sched_count; /* how many algorithms ? */ 161 u_int gs_patched; /* g_io_request was patched */ 162 163 u_int gs_initialized; 164 u_int gs_expire_secs; /* expiration of hash entries */ 165 166 struct bio_queue_head gs_pending; 167 u_int gs_npending; 168 169 /* The following are for stats, usually protected by gs_mtx. */ 170 u_long gs_requests; /* total requests */ 171 u_long gs_done; /* total done */ 172 u_int gs_in_flight; /* requests in flight */ 173 u_int gs_writes_in_flight; 174 u_int gs_bytes_in_flight; 175 u_int gs_write_bytes_in_flight; 176 177 char gs_names[256]; /* names of schedulers */ 178}; 179 180static struct geom_sched_vars me = { 181 .gs_expire_secs = 10, 182}; 183 184SYSCTL_DECL(_kern_geom); 185SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0, 186 "GEOM_SCHED stuff"); 187 188SYSCTL_INT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD, 189 &me.gs_write_bytes_in_flight, 0, "Write bytes in flight"); 190 191SYSCTL_INT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD, 192 &me.gs_bytes_in_flight, 0, "Bytes in flight"); 193 194SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD, 195 &me.gs_writes_in_flight, 0, "Write Requests in flight"); 196 197SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD, 198 &me.gs_in_flight, 0, "Requests in flight"); 199 200SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD, 201 &me.gs_done, 0, "Total done"); 202 203SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD, 204 &me.gs_requests, 0, "Total requests"); 205 206SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD, 207 &me.gs_names, 0, "Algorithm names"); 208 209SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD, 210 &me.gs_sched_count, 0, "Number of algorithms"); 211 212SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW, 213 &me.gs_debug, 0, "Debug level"); 214 215SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW, 216 &me.gs_expire_secs, 0, "Expire time in seconds"); 217 218/* 219 * g_sched calls the scheduler algorithms with this lock held. 220 * The locking functions are exposed so the scheduler algorithms can also 221 * protect themselves e.g. when running a callout handler. 222 */ 223void 224g_sched_lock(struct g_geom *gp) 225{ 226 struct g_sched_softc *sc = gp->softc; 227 228 mtx_lock(&sc->sc_mtx); 229} 230 231void 232g_sched_unlock(struct g_geom *gp) 233{ 234 struct g_sched_softc *sc = gp->softc; 235 236 mtx_unlock(&sc->sc_mtx); 237} 238 239/* 240 * Support functions to handle references to the module, 241 * which are coming from devices using this scheduler. 242 */ 243static inline void 244g_gsched_ref(struct g_gsched *gsp) 245{ 246 247 atomic_add_int(&gsp->gs_refs, 1); 248} 249 250static inline void 251g_gsched_unref(struct g_gsched *gsp) 252{ 253 254 atomic_add_int(&gsp->gs_refs, -1); 255} 256 257/* 258 * Update the stats when this request is done. 259 */ 260static void 261g_sched_update_stats(struct bio *bio) 262{ 263 264 me.gs_done++; 265 me.gs_in_flight--; 266 me.gs_bytes_in_flight -= bio->bio_length; 267 if (bio->bio_cmd & BIO_WRITE) { 268 me.gs_writes_in_flight--; 269 me.gs_write_bytes_in_flight -= bio->bio_length; 270 } 271} 272 273/* 274 * Dispatch any pending request. 275 */ 276static void 277g_sched_forced_dispatch(struct g_geom *gp) 278{ 279 struct g_sched_softc *sc = gp->softc; 280 struct g_gsched *gsp = sc->sc_gsched; 281 struct bio *bp; 282 283 KASSERT(mtx_owned(&sc->sc_mtx), 284 ("sc_mtx not owned during forced dispatch")); 285 286 while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL) 287 g_io_request(bp, LIST_FIRST(&gp->consumer)); 288} 289 290/* 291 * The main dispatch loop, called either here after the start 292 * routine, or by scheduling algorithms when they receive a timeout 293 * or a 'done' notification. Does not share code with the forced 294 * dispatch path, since the gs_done() callback can call us. 295 */ 296void 297g_sched_dispatch(struct g_geom *gp) 298{ 299 struct g_sched_softc *sc = gp->softc; 300 struct g_gsched *gsp = sc->sc_gsched; 301 struct bio *bp; 302 303 KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch")); 304 305 if ((sc->sc_flags & G_SCHED_FLUSHING)) 306 return; 307 308 while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL) 309 g_io_request(bp, LIST_FIRST(&gp->consumer)); 310} 311 312/* 313 * Recent (8.0 and above) versions of FreeBSD have support to 314 * register classifiers of disk requests. The classifier is 315 * invoked by g_io_request(), and stores the information into 316 * bp->bio_classifier1. 317 * 318 * Support for older versions, which is left here only for 319 * documentation purposes, relies on two hacks: 320 * 1. classification info is written into the bio_caller1 321 * field of the topmost node in the bio chain. This field 322 * is rarely used, but this module is incompatible with 323 * those that use bio_caller1 for other purposes, 324 * such as ZFS and gjournal; 325 * 2. g_io_request() is patched in-memory when the module is 326 * loaded, so that the function calls a classifier as its 327 * first thing. g_io_request() is restored when the module 328 * is unloaded. This functionality is only supported for 329 * x86 and amd64, other architectures need source code changes. 330 */ 331 332/* 333 * Lookup the identity of the issuer of the original request. 334 * In the current implementation we use the curthread of the 335 * issuer, but different mechanisms may be implemented later 336 * so we do not make assumptions on the return value which for 337 * us is just an opaque identifier. 338 */ 339 340static inline u_long 341g_sched_classify(struct bio *bp) 342{ 343 344#if __FreeBSD_version > 800098 345 /* we have classifier fields in the struct bio */ 346#define HAVE_BIO_CLASSIFIER 347 return ((u_long)bp->bio_classifier1); 348#else 349#warning old version!!! 350 while (bp->bio_parent != NULL) 351 bp = bp->bio_parent; 352 353 return ((u_long)bp->bio_caller1); 354#endif 355} 356 357/* Return the hash chain for the given key. */ 358static inline struct g_hash * 359g_sched_hash(struct g_sched_softc *sc, u_long key) 360{ 361 362 return (&sc->sc_hash[key & sc->sc_mask]); 363} 364 365/* 366 * Helper function for the children classes, which takes 367 * a geom and a bio and returns the private descriptor 368 * associated to the request. This involves fetching 369 * the classification field and [al]locating the 370 * corresponding entry in the hash table. 371 */ 372void * 373g_sched_get_class(struct g_geom *gp, struct bio *bp) 374{ 375 struct g_sched_softc *sc; 376 struct g_sched_class *gsc; 377 struct g_gsched *gsp; 378 struct g_hash *bucket; 379 u_long key; 380 381 sc = gp->softc; 382 key = g_sched_classify(bp); 383 bucket = g_sched_hash(sc, key); 384 LIST_FOREACH(gsc, bucket, gsc_clist) { 385 if (key == gsc->gsc_key) { 386 gsc->gsc_refs++; 387 return (gsc->gsc_priv); 388 } 389 } 390 391 gsp = sc->sc_gsched; 392 gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size, 393 M_GEOM_SCHED, M_NOWAIT | M_ZERO); 394 if (!gsc) 395 return (NULL); 396 397 if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) { 398 free(gsc, M_GEOM_SCHED); 399 return (NULL); 400 } 401 402 gsc->gsc_refs = 2; /* 1 for the hash table, 1 for the caller. */ 403 gsc->gsc_key = key; 404 LIST_INSERT_HEAD(bucket, gsc, gsc_clist); 405 406 gsc->gsc_expire = ticks + me.gs_expire_secs * hz; 407 408 return (gsc->gsc_priv); 409} 410 411/* 412 * Release a reference to the per-client descriptor, 413 */ 414void 415g_sched_put_class(struct g_geom *gp, void *priv) 416{ 417 struct g_sched_class *gsc; 418 struct g_sched_softc *sc; 419 420 gsc = g_sched_priv2class(priv); 421 gsc->gsc_expire = ticks + me.gs_expire_secs * hz; 422 423 if (--gsc->gsc_refs > 0) 424 return; 425 426 sc = gp->softc; 427 sc->sc_gsched->gs_fini_class(sc->sc_data, priv); 428 429 LIST_REMOVE(gsc, gsc_clist); 430 free(gsc, M_GEOM_SCHED); 431} 432 433static void 434g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask, 435 struct g_gsched *gsp, void *data) 436{ 437 struct g_sched_class *cp, *cp2; 438 int i; 439 440 if (!hp) 441 return; 442 443 if (data && gsp->gs_hash_unref) 444 gsp->gs_hash_unref(data); 445 446 for (i = 0; i < G_SCHED_HASH_SIZE; i++) { 447 LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2) 448 g_sched_put_class(gp, cp->gsc_priv); 449 } 450 451 hashdestroy(hp, M_GEOM_SCHED, mask); 452} 453 454static struct g_hash * 455g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags) 456{ 457 struct g_hash *hash; 458 459 if (gsp->gs_priv_size == 0) 460 return (NULL); 461 462 hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags); 463 464 return (hash); 465} 466 467static void 468g_sched_flush_classes(struct g_geom *gp) 469{ 470 struct g_sched_softc *sc; 471 struct g_sched_class *cp, *cp2; 472 int i; 473 474 sc = gp->softc; 475 476 if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0) 477 return; 478 479 for (i = 0; i < G_SCHED_HASH_SIZE; i++) { 480 LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) { 481 if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0) 482 g_sched_put_class(gp, cp->gsc_priv); 483 } 484 } 485 486 sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz; 487} 488 489/* 490 * Wait for the completion of any outstanding request. To ensure 491 * that this does not take forever the caller has to make sure that 492 * no new request enter the scehduler before calling us. 493 * 494 * Must be called with the gp mutex held and topology locked. 495 */ 496static int 497g_sched_wait_pending(struct g_geom *gp) 498{ 499 struct g_sched_softc *sc = gp->softc; 500 int endticks = ticks + hz; 501 502 g_topology_assert(); 503 504 while (sc->sc_pending && endticks - ticks >= 0) 505 msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4); 506 507 return (sc->sc_pending ? ETIMEDOUT : 0); 508} 509 510static int 511g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp) 512{ 513 struct g_sched_softc *sc = gp->softc; 514 int error; 515 516 /* Set the flushing flag: new bios will not enter the scheduler. */ 517 sc->sc_flags |= G_SCHED_FLUSHING; 518 519 g_sched_forced_dispatch(gp); 520 error = g_sched_wait_pending(gp); 521 if (error) 522 goto failed; 523 524 /* No more requests pending or in flight from the old gsp. */ 525 526 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data); 527 sc->sc_hash = NULL; 528 529 /* 530 * Avoid deadlock here by releasing the gp mutex and reacquiring 531 * it once done. It should be safe, since no reconfiguration or 532 * destruction can take place due to the geom topology lock; no 533 * new request can use the current sc_data since we flagged the 534 * geom as being flushed. 535 */ 536 g_sched_unlock(gp); 537 gsp->gs_fini(sc->sc_data); 538 g_sched_lock(gp); 539 540 sc->sc_gsched = NULL; 541 sc->sc_data = NULL; 542 g_gsched_unref(gsp); 543 544failed: 545 sc->sc_flags &= ~G_SCHED_FLUSHING; 546 547 return (error); 548} 549 550static int 551g_sched_remove(struct g_geom *gp, struct g_gsched *gsp) 552{ 553 int error; 554 555 g_sched_lock(gp); 556 error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */ 557 g_sched_unlock(gp); 558 559 return (error); 560} 561 562/* 563 * Support function for create/taste -- locate the desired 564 * algorithm and grab a reference to it. 565 */ 566static struct g_gsched * 567g_gsched_find(const char *name) 568{ 569 struct g_gsched *gsp = NULL; 570 571 mtx_lock(&me.gs_mtx); 572 LIST_FOREACH(gsp, &me.gs_scheds, glist) { 573 if (strcmp(name, gsp->gs_name) == 0) { 574 g_gsched_ref(gsp); 575 break; 576 } 577 } 578 mtx_unlock(&me.gs_mtx); 579 580 return (gsp); 581} 582 583/* 584 * Rebuild the list of scheduler names. 585 * To be called with me.gs_mtx lock held. 586 */ 587static void 588g_gsched_build_names(struct g_gsched *gsp) 589{ 590 int pos, l; 591 struct g_gsched *cur; 592 593 pos = 0; 594 LIST_FOREACH(cur, &me.gs_scheds, glist) { 595 l = strlen(cur->gs_name); 596 if (l + pos + 1 + 1 < sizeof(me.gs_names)) { 597 if (pos != 0) 598 me.gs_names[pos++] = ' '; 599 strcpy(me.gs_names + pos, cur->gs_name); 600 pos += l; 601 } 602 } 603 me.gs_names[pos] = '\0'; 604} 605 606/* 607 * Register or unregister individual scheduling algorithms. 608 */ 609static int 610g_gsched_register(struct g_gsched *gsp) 611{ 612 struct g_gsched *cur; 613 int error = 0; 614 615 mtx_lock(&me.gs_mtx); 616 LIST_FOREACH(cur, &me.gs_scheds, glist) { 617 if (strcmp(gsp->gs_name, cur->gs_name) == 0) 618 break; 619 } 620 if (cur != NULL) { 621 G_SCHED_DEBUG(0, "A scheduler named %s already" 622 "exists.", gsp->gs_name); 623 error = EEXIST; 624 } else { 625 LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist); 626 gsp->gs_refs = 1; 627 me.gs_sched_count++; 628 g_gsched_build_names(gsp); 629 } 630 mtx_unlock(&me.gs_mtx); 631 632 return (error); 633} 634 635struct g_gsched_unregparm { 636 struct g_gsched *gup_gsp; 637 int gup_error; 638}; 639 640static void 641g_gsched_unregister(void *arg, int flag) 642{ 643 struct g_gsched_unregparm *parm = arg; 644 struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp; 645 struct g_sched_softc *sc; 646 struct g_geom *gp, *gp_tmp; 647 int error; 648 649 parm->gup_error = 0; 650 651 g_topology_assert(); 652 653 if (flag == EV_CANCEL) 654 return; 655 656 mtx_lock(&me.gs_mtx); 657 658 LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) { 659 if (gp->class != &g_sched_class) 660 continue; /* Should not happen. */ 661 662 sc = gp->softc; 663 if (sc->sc_gsched == gsp) { 664 error = g_sched_remove(gp, gsp); 665 if (error) 666 goto failed; 667 } 668 } 669 670 LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) { 671 if (cur != gsp) 672 continue; 673 674 if (gsp->gs_refs != 1) { 675 G_SCHED_DEBUG(0, "%s still in use.", 676 gsp->gs_name); 677 parm->gup_error = EBUSY; 678 } else { 679 LIST_REMOVE(gsp, glist); 680 me.gs_sched_count--; 681 g_gsched_build_names(gsp); 682 } 683 break; 684 } 685 686 if (cur == NULL) { 687 G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name); 688 parm->gup_error = ENOENT; 689 } 690 691failed: 692 mtx_unlock(&me.gs_mtx); 693} 694 695static inline void 696g_gsched_global_init(void) 697{ 698 699 if (!me.gs_initialized) { 700 G_SCHED_DEBUG(0, "Initializing global data."); 701 mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF); 702 LIST_INIT(&me.gs_scheds); 703 gs_bioq_init(&me.gs_pending); 704 me.gs_initialized = 1; 705 } 706} 707 708/* 709 * Module event called when a scheduling algorithm module is loaded or 710 * unloaded. 711 */ 712int 713g_gsched_modevent(module_t mod, int cmd, void *arg) 714{ 715 struct g_gsched *gsp = arg; 716 struct g_gsched_unregparm parm; 717 int error; 718 719 G_SCHED_DEBUG(0, "Modevent %d.", cmd); 720 721 /* 722 * If the module is loaded at boot, the geom thread that calls 723 * g_sched_init() might actually run after g_gsched_modevent(), 724 * so make sure that the module is properly initialized. 725 */ 726 g_gsched_global_init(); 727 728 error = EOPNOTSUPP; 729 switch (cmd) { 730 case MOD_LOAD: 731 error = g_gsched_register(gsp); 732 G_SCHED_DEBUG(0, "Loaded module %s error %d.", 733 gsp->gs_name, error); 734 if (error == 0) 735 g_retaste(&g_sched_class); 736 break; 737 738 case MOD_UNLOAD: 739 parm.gup_gsp = gsp; 740 parm.gup_error = 0; 741 742 error = g_waitfor_event(g_gsched_unregister, 743 &parm, M_WAITOK, NULL); 744 if (error == 0) 745 error = parm.gup_error; 746 G_SCHED_DEBUG(0, "Unloaded module %s error %d.", 747 gsp->gs_name, error); 748 break; 749 }; 750 751 return (error); 752} 753 754#ifdef KTR 755#define TRC_BIO_EVENT(e, bp) g_sched_trace_bio_ ## e (bp) 756static inline int 757g_sched_issuer_pid(struct bio *bp) 758{ 759 struct thread *thread = g_sched_issuer(bp); 760 761 return (thread->td_tid); 762} 763 764static inline char 765g_sched_type(struct bio *bp) 766{ 767 768 if (0 != (bp->bio_cmd & BIO_READ)) 769 return ('R'); 770 else if (0 != (bp->bio_cmd & BIO_WRITE)) 771 return ('W'); 772 return ('U'); 773} 774 775static inline void 776g_sched_trace_bio_START(struct bio *bp) 777{ 778 779 CTR5(KTR_GSCHED, "S %d %c %lu/%lu %lu", g_sched_issuer_pid(bp), 780 g_sched_type(bp), bp->bio_offset / ULONG_MAX, 781 bp->bio_offset, bp->bio_length); 782} 783 784static inline void 785g_sched_trace_bio_DONE(struct bio *bp) 786{ 787 788 CTR5(KTR_GSCHED, "D %d %c %lu/%lu %lu", g_sched_issuer_pid(bp), 789 g_sched_type(bp), bp->bio_offset / ULONG_MAX, 790 bp->bio_offset, bp->bio_length); 791} 792#else 793#define TRC_BIO_EVENT(e, bp) 794#endif 795 796/* 797 * g_sched_done() and g_sched_start() dispatch the geom requests to 798 * the scheduling algorithm in use. 799 */ 800static void 801g_sched_done(struct bio *bio) 802{ 803 struct g_geom *gp = bio->bio_caller2; 804 struct g_sched_softc *sc = gp->softc; 805 806 TRC_BIO_EVENT(DONE, bio); 807 808 KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done")); 809 810 g_sched_lock(gp); 811 812 g_sched_update_stats(bio); 813 sc->sc_gsched->gs_done(sc->sc_data, bio); 814 if (!--sc->sc_pending) 815 wakeup(gp); 816 817 g_sched_flush_classes(gp); 818 g_sched_unlock(gp); 819 820 g_std_done(bio); 821} 822 823static void 824g_sched_start(struct bio *bp) 825{ 826 struct g_geom *gp = bp->bio_to->geom; 827 struct g_sched_softc *sc = gp->softc; 828 struct bio *cbp; 829 830 TRC_BIO_EVENT(START, bp); 831 G_SCHED_LOGREQ(bp, "Request received."); 832 833 cbp = g_clone_bio(bp); 834 if (cbp == NULL) { 835 g_io_deliver(bp, ENOMEM); 836 return; 837 } 838 cbp->bio_done = g_sched_done; 839 cbp->bio_to = LIST_FIRST(&gp->provider); 840 KASSERT(cbp->bio_to != NULL, ("NULL provider")); 841 842 /* We only schedule reads and writes. */ 843 if (0 == (bp->bio_cmd & (BIO_READ | BIO_WRITE))) 844 goto bypass; 845 846 G_SCHED_LOGREQ(cbp, "Sending request."); 847 848 g_sched_lock(gp); 849 /* 850 * Call the algorithm's gs_start to queue the request in the 851 * scheduler. If gs_start fails then pass the request down, 852 * otherwise call g_sched_dispatch() which tries to push 853 * one or more requests down. 854 */ 855 if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) || 856 sc->sc_gsched->gs_start(sc->sc_data, cbp)) { 857 g_sched_unlock(gp); 858 goto bypass; 859 } 860 /* 861 * We use bio_caller1 to mark requests that are scheduled 862 * so make sure it is not NULL. 863 */ 864 if (cbp->bio_caller1 == NULL) 865 cbp->bio_caller1 = &me; /* anything not NULL */ 866 867 cbp->bio_caller2 = gp; 868 sc->sc_pending++; 869 870 /* Update general stats. */ 871 me.gs_in_flight++; 872 me.gs_requests++; 873 me.gs_bytes_in_flight += bp->bio_length; 874 if (bp->bio_cmd & BIO_WRITE) { 875 me.gs_writes_in_flight++; 876 me.gs_write_bytes_in_flight += bp->bio_length; 877 } 878 g_sched_dispatch(gp); 879 g_sched_unlock(gp); 880 return; 881 882bypass: 883 cbp->bio_done = g_std_done; 884 cbp->bio_caller1 = NULL; /* not scheduled */ 885 g_io_request(cbp, LIST_FIRST(&gp->consumer)); 886} 887 888/* 889 * The next few functions are the geom glue. 890 */ 891static void 892g_sched_orphan(struct g_consumer *cp) 893{ 894 895 g_topology_assert(); 896 g_sched_destroy(cp->geom, 1); 897} 898 899static int 900g_sched_access(struct g_provider *pp, int dr, int dw, int de) 901{ 902 struct g_geom *gp; 903 struct g_consumer *cp; 904 int error; 905 906 gp = pp->geom; 907 cp = LIST_FIRST(&gp->consumer); 908 error = g_access(cp, dr, dw, de); 909 910 return (error); 911} 912 913static void 914g_sched_temporary_start(struct bio *bio) 915{ 916 917 mtx_lock(&me.gs_mtx); 918 me.gs_npending++; 919 gs_bioq_disksort(&me.gs_pending, bio); 920 mtx_unlock(&me.gs_mtx); 921} 922 923static void 924g_sched_flush_pending(g_start_t *start) 925{ 926 struct bio *bp; 927 928 while ((bp = gs_bioq_takefirst(&me.gs_pending))) 929 start(bp); 930} 931 932static int 933g_insert_proxy(struct g_geom *gp, struct g_provider *newpp, 934 struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp) 935{ 936 struct g_sched_softc *sc = gp->softc; 937 g_start_t *saved_start, *flush = g_sched_start; 938 int error = 0, endticks = ticks + hz; 939 940 g_cancel_event(newpp); /* prevent taste() */ 941 /* copy private fields */ 942 newpp->private = pp->private; 943 newpp->index = pp->index; 944 945 /* Queue all the early requests coming for us. */ 946 me.gs_npending = 0; 947 saved_start = pp->geom->start; 948 dstgp->start = g_sched_temporary_start; 949 950 while (pp->nstart - pp->nend != me.gs_npending && 951 endticks - ticks >= 0) 952 tsleep(pp, PRIBIO, "-", hz/10); 953 954 if (pp->nstart - pp->nend != me.gs_npending) { 955 flush = saved_start; 956 error = ETIMEDOUT; 957 goto fail; 958 } 959 960 /* link pp to this geom */ 961 LIST_REMOVE(pp, provider); 962 pp->geom = gp; 963 LIST_INSERT_HEAD(&gp->provider, pp, provider); 964 965 /* 966 * replicate the counts from the parent in the 967 * new provider and consumer nodes 968 */ 969 cp->acr = newpp->acr = pp->acr; 970 cp->acw = newpp->acw = pp->acw; 971 cp->ace = newpp->ace = pp->ace; 972 sc->sc_flags |= G_SCHED_PROXYING; 973 974fail: 975 dstgp->start = saved_start; 976 977 g_sched_flush_pending(flush); 978 979 return (error); 980} 981 982/* 983 * Create a geom node for the device passed as *pp. 984 * If successful, add a reference to this gsp. 985 */ 986static int 987g_sched_create(struct gctl_req *req, struct g_class *mp, 988 struct g_provider *pp, struct g_gsched *gsp, int proxy) 989{ 990 struct g_sched_softc *sc = NULL; 991 struct g_geom *gp, *dstgp; 992 struct g_provider *newpp = NULL; 993 struct g_consumer *cp = NULL; 994 char name[64]; 995 int error; 996 997 g_topology_assert(); 998 999 snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX); 1000 LIST_FOREACH(gp, &mp->geom, geom) { 1001 if (strcmp(gp->name, name) == 0) { 1002 gctl_error(req, "Geom %s already exists.", 1003 name); 1004 return (EEXIST); 1005 } 1006 } 1007 1008 gp = g_new_geomf(mp, name); 1009 dstgp = proxy ? pp->geom : gp; /* where do we link the provider */ 1010 if (gp == NULL) { 1011 gctl_error(req, "Cannot create geom %s.", name); 1012 error = ENOMEM; 1013 goto fail; 1014 } 1015 1016 sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); 1017 sc->sc_gsched = gsp; 1018 sc->sc_data = gsp->gs_init(gp); 1019 if (sc->sc_data == NULL) { 1020 error = ENOMEM; 1021 goto fail; 1022 } 1023 1024 sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK); 1025 1026 /* 1027 * Do not initialize the flush mechanism, will be initialized 1028 * on the first insertion on the hash table. 1029 */ 1030 1031 mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF); 1032 1033 gp->softc = sc; 1034 gp->start = g_sched_start; 1035 gp->orphan = g_sched_orphan; 1036 gp->access = g_sched_access; 1037 gp->dumpconf = g_sched_dumpconf; 1038 1039 newpp = g_new_providerf(dstgp, gp->name); 1040 if (newpp == NULL) { 1041 gctl_error(req, "Cannot create provider %s.", name); 1042 error = ENOMEM; 1043 goto fail; 1044 } 1045 1046 newpp->mediasize = pp->mediasize; 1047 newpp->sectorsize = pp->sectorsize; 1048 1049 cp = g_new_consumer(gp); 1050 if (cp == NULL) { 1051 gctl_error(req, "Cannot create consumer for %s.", 1052 gp->name); 1053 error = ENOMEM; 1054 goto fail; 1055 } 1056 1057 error = g_attach(cp, proxy ? newpp : pp); 1058 if (error != 0) { 1059 gctl_error(req, "Cannot attach to provider %s.", 1060 pp->name); 1061 goto fail; 1062 } 1063 1064 g_error_provider(newpp, 0); 1065 if (proxy) { 1066 error = g_insert_proxy(gp, newpp, dstgp, pp, cp); 1067 if (error) 1068 goto fail; 1069 } 1070 G_SCHED_DEBUG(0, "Device %s created.", gp->name); 1071 1072 g_gsched_ref(gsp); 1073 1074 return (0); 1075 1076fail: 1077 if (cp != NULL) { 1078 if (cp->provider != NULL) 1079 g_detach(cp); 1080 g_destroy_consumer(cp); 1081 } 1082 1083 if (newpp != NULL) 1084 g_destroy_provider(newpp); 1085 1086 if (sc && sc->sc_hash) { 1087 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, 1088 gsp, sc->sc_data); 1089 } 1090 1091 if (sc && sc->sc_data) 1092 gsp->gs_fini(sc->sc_data); 1093 1094 if (gp != NULL) { 1095 if (gp->softc != NULL) 1096 g_free(gp->softc); 1097 g_destroy_geom(gp); 1098 } 1099 1100 return (error); 1101} 1102 1103/* 1104 * Support for dynamic switching of scheduling algorithms. 1105 * First initialize the data structures for the new algorithm, 1106 * then call g_sched_remove_locked() to flush all references 1107 * to the old one, finally link the new algorithm. 1108 */ 1109static int 1110g_sched_change_algo(struct gctl_req *req, struct g_class *mp, 1111 struct g_provider *pp, struct g_gsched *gsp) 1112{ 1113 struct g_sched_softc *sc; 1114 struct g_geom *gp; 1115 struct g_hash *newh; 1116 void *data; 1117 u_long mask; 1118 int error = 0; 1119 1120 gp = pp->geom; 1121 sc = gp->softc; 1122 1123 data = gsp->gs_init(gp); 1124 if (data == NULL) 1125 return (ENOMEM); 1126 1127 newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK); 1128 if (gsp->gs_priv_size && !newh) { 1129 error = ENOMEM; 1130 goto fail; 1131 } 1132 1133 g_sched_lock(gp); 1134 if (sc->sc_gsched) { /* can be NULL in some cases */ 1135 error = g_sched_remove_locked(gp, sc->sc_gsched); 1136 if (error) 1137 goto fail; 1138 } 1139 1140 g_gsched_ref(gsp); 1141 sc->sc_gsched = gsp; 1142 sc->sc_data = data; 1143 sc->sc_hash = newh; 1144 sc->sc_mask = mask; 1145 1146 g_sched_unlock(gp); 1147 1148 return (0); 1149 1150fail: 1151 if (newh) 1152 g_sched_hash_fini(gp, newh, mask, gsp, data); 1153 1154 if (data) 1155 gsp->gs_fini(data); 1156 1157 g_sched_unlock(gp); 1158 1159 return (error); 1160} 1161 1162/* 1163 * Stop the request flow directed to the proxy, redirecting the new 1164 * requests to the me.gs_pending queue. 1165 */ 1166static struct g_provider * 1167g_detach_proxy(struct g_geom *gp) 1168{ 1169 struct g_consumer *cp; 1170 struct g_provider *pp, *newpp; 1171 1172 do { 1173 pp = LIST_FIRST(&gp->provider); 1174 if (pp == NULL) 1175 break; 1176 cp = LIST_FIRST(&gp->consumer); 1177 if (cp == NULL) 1178 break; 1179 newpp = cp->provider; 1180 if (newpp == NULL) 1181 break; 1182 1183 me.gs_npending = 0; 1184 pp->geom->start = g_sched_temporary_start; 1185 1186 return (pp); 1187 } while (0); 1188 printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name); 1189 1190 return (NULL); 1191} 1192 1193static void 1194g_sched_blackhole(struct bio *bp) 1195{ 1196 1197 g_io_deliver(bp, ENXIO); 1198} 1199 1200static inline void 1201g_reparent_provider(struct g_provider *pp, struct g_geom *gp, 1202 struct g_provider *newpp) 1203{ 1204 1205 LIST_REMOVE(pp, provider); 1206 if (newpp) { 1207 pp->private = newpp->private; 1208 pp->index = newpp->index; 1209 } 1210 pp->geom = gp; 1211 LIST_INSERT_HEAD(&gp->provider, pp, provider); 1212} 1213 1214static inline void 1215g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp) 1216{ 1217 struct g_geom *gp = oldpp->geom; 1218 1219 g_reparent_provider(oldpp, newpp->geom, newpp); 1220 1221 /* 1222 * Hackish: let the system destroy the old provider for us, just 1223 * in case someone attached a consumer to it, in which case a 1224 * direct call to g_destroy_provider() would not work. 1225 */ 1226 g_reparent_provider(newpp, gp, NULL); 1227} 1228 1229/* 1230 * Complete the proxy destruction, linking the old provider to its 1231 * original geom, and destroying the proxy provider. Also take care 1232 * of issuing the pending requests collected in me.gs_pending (if any). 1233 */ 1234static int 1235g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp) 1236{ 1237 struct g_consumer *cp; 1238 struct g_provider *newpp; 1239 1240 do { 1241 cp = LIST_FIRST(&gp->consumer); 1242 if (cp == NULL) 1243 break; 1244 newpp = cp->provider; 1245 if (newpp == NULL) 1246 break; 1247 1248 /* Relink the provider to its original geom. */ 1249 g_unproxy_provider(oldpp, newpp); 1250 1251 /* Detach consumer from provider, and destroy provider. */ 1252 cp->acr = newpp->acr = 0; 1253 cp->acw = newpp->acw = 0; 1254 cp->ace = newpp->ace = 0; 1255 g_detach(cp); 1256 1257 /* Send the pending bios through the right start function. */ 1258 g_sched_flush_pending(oldpp->geom->start); 1259 1260 return (0); 1261 } while (0); 1262 printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name); 1263 1264 /* We cannot send the pending bios anywhere... */ 1265 g_sched_flush_pending(g_sched_blackhole); 1266 1267 return (EINVAL); 1268} 1269 1270static int 1271g_sched_destroy(struct g_geom *gp, boolean_t force) 1272{ 1273 struct g_provider *pp, *oldpp = NULL; 1274 struct g_sched_softc *sc; 1275 struct g_gsched *gsp; 1276 int error; 1277 1278 g_topology_assert(); 1279 sc = gp->softc; 1280 if (sc == NULL) 1281 return (ENXIO); 1282 if (!(sc->sc_flags & G_SCHED_PROXYING)) { 1283 pp = LIST_FIRST(&gp->provider); 1284 if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { 1285 const char *msg = force ? 1286 "but we force removal" : "cannot remove"; 1287 1288 G_SCHED_DEBUG(!force, 1289 "Device %s is still open (r%dw%de%d), %s.", 1290 pp->name, pp->acr, pp->acw, pp->ace, msg); 1291 if (!force) 1292 return (EBUSY); 1293 } else { 1294 G_SCHED_DEBUG(0, "Device %s removed.", gp->name); 1295 } 1296 } else 1297 oldpp = g_detach_proxy(gp); 1298 1299 gsp = sc->sc_gsched; 1300 if (gsp) { 1301 /* 1302 * XXX bad hack here: force a dispatch to release 1303 * any reference to the hash table still held by 1304 * the scheduler. 1305 */ 1306 g_sched_lock(gp); 1307 /* 1308 * We are dying here, no new requests should enter 1309 * the scheduler. This is granted by the topolgy, 1310 * either in case we were proxying (new bios are 1311 * being redirected) or not (see the access check 1312 * above). 1313 */ 1314 g_sched_forced_dispatch(gp); 1315 error = g_sched_wait_pending(gp); 1316 1317 if (error) { 1318 /* 1319 * Not all the requests came home: this might happen 1320 * under heavy load, or if we were waiting for any 1321 * bio which is served in the event path (see 1322 * geom_slice.c for an example of how this can 1323 * happen). Try to restore a working configuration 1324 * if we can fail. 1325 */ 1326 if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { 1327 g_sched_flush_pending(force ? 1328 g_sched_blackhole : g_sched_start); 1329 } 1330 1331 /* 1332 * In the forced destroy case there is not so much 1333 * we can do, we have pending bios that will call 1334 * g_sched_done() somehow, and we don't want them 1335 * to crash the system using freed memory. We tell 1336 * the user that something went wrong, and leak some 1337 * memory here. 1338 * Note: the callers using force = 1 ignore the 1339 * return value. 1340 */ 1341 if (force) { 1342 G_SCHED_DEBUG(0, "Pending requests while " 1343 " destroying geom, some memory leaked."); 1344 } 1345 1346 return (error); 1347 } 1348 1349 g_sched_unlock(gp); 1350 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, 1351 gsp, sc->sc_data); 1352 sc->sc_hash = NULL; 1353 gsp->gs_fini(sc->sc_data); 1354 g_gsched_unref(gsp); 1355 sc->sc_gsched = NULL; 1356 } 1357 1358 if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { 1359 error = g_destroy_proxy(gp, oldpp); 1360 1361 if (error) { 1362 if (force) { 1363 G_SCHED_DEBUG(0, "Unrecoverable error while " 1364 "destroying a proxy geom, leaking some " 1365 " memory."); 1366 } 1367 1368 return (error); 1369 } 1370 } 1371 1372 mtx_destroy(&sc->sc_mtx); 1373 1374 g_free(gp->softc); 1375 gp->softc = NULL; 1376 g_wither_geom(gp, ENXIO); 1377 1378 return (error); 1379} 1380 1381static int 1382g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp, 1383 struct g_geom *gp) 1384{ 1385 1386 return (g_sched_destroy(gp, 0)); 1387} 1388 1389/* 1390 * Functions related to the classification of requests. 1391 * 1392 * On recent FreeBSD versions (8.0 and above), we store a reference 1393 * to the issuer of a request in bp->bio_classifier1 as soon 1394 * as the bio is posted to the geom queue (and not later, because 1395 * requests are managed by the g_down thread afterwards). 1396 * 1397 * On older versions of the system (but this code is not used 1398 * in any existing release), we [ab]use the caller1 field in the 1399 * root element of the bio tree to store the classification info. 1400 * The marking is done at the beginning of g_io_request() 1401 * and only if we find that the field is NULL. 1402 * 1403 * To avoid rebuilding the kernel, this module will patch the 1404 * initial part of g_io_request() so it jumps to some hand-coded 1405 * assembly that does the marking and then executes the original 1406 * body of g_io_request(). 1407 * 1408 * fake_ioreq[] is architecture-specific machine code 1409 * that implements the above. CODE_SIZE, STORE_SIZE etc. 1410 * are constants used in the patching routine. Look at the 1411 * code in g_ioreq_patch() for the details. 1412 */ 1413 1414#ifndef HAVE_BIO_CLASSIFIER 1415/* 1416 * Support for old FreeBSD versions 1417 */ 1418#if defined(__i386__) 1419#define CODE_SIZE 29 1420#define STORE_SIZE 5 1421#define EPILOGUE 5 1422#define SIZE (CODE_SIZE + STORE_SIZE + EPILOGUE) 1423 1424static u_char fake_ioreq[SIZE] = { 1425 0x8b, 0x44, 0x24, 0x04, /* mov bp, %eax */ 1426 /* 1: */ 1427 0x89, 0xc2, /* mov %eax, %edx # edx = bp */ 1428 0x8b, 0x40, 0x64, /* mov bp->bio_parent, %eax */ 1429 0x85, 0xc0, /* test %eax, %eax */ 1430 0x75, 0xf7, /* jne 1b */ 1431 0x8b, 0x42, 0x30, /* mov bp->bp_caller1, %eax */ 1432 0x85, 0xc0, /* test %eax, %eax */ 1433 0x75, 0x09, /* jne 2f */ 1434 0x64, 0xa1, 0x00, 0x00, /* mov %fs:0, %eax */ 1435 0x00, 0x00, 1436 0x89, 0x42, 0x30, /* mov %eax, bp->bio_caller1 */ 1437 /* 2: */ 1438 0x55, 0x89, 0xe5, 0x57, 0x56, 1439 0xe9, 0x00, 0x00, 0x00, 0x00, /* jmp back... */ 1440}; 1441#elif defined(__amd64) 1442#define CODE_SIZE 38 1443#define STORE_SIZE 6 1444#define EPILOGUE 5 1445#define SIZE (CODE_SIZE + STORE_SIZE + EPILOGUE) 1446 1447static u_char fake_ioreq[SIZE] = { 1448 0x48, 0x89, 0xf8, /* mov bp, %rax */ 1449 /* 1: */ 1450 0x48, 0x89, 0xc2, /* mov %rax, %rdx # rdx = bp */ 1451 0x48, 0x8b, 0x82, 0xa8, /* mov bp->bio_parent, %rax */ 1452 0x00, 0x00, 0x00, 1453 0x48, 0x85, 0xc0, /* test %rax, %rax */ 1454 0x75, 0xf1, /* jne 1b */ 1455 0x48, 0x83, 0x7a, 0x58, /* cmp $0, bp->bp_caller1 */ 1456 0x00, 1457 0x75, 0x0d, /* jne 2f */ 1458 0x65, 0x48, 0x8b, 0x04, /* mov %gs:0, %rax */ 1459 0x25, 0x00, 0x00, 0x00, 1460 0x00, 1461 0x48, 0x89, 0x42, 0x58, /* mov %rax, bp->bio_caller1 */ 1462 /* 2: */ 1463 0x55, 0x48, 0x89, 0xe5, 0x41, 0x56, 1464 0xe9, 0x00, 0x00, 0x00, 0x00, /* jmp back... */ 1465}; 1466#else /* neither x86 nor amd64 */ 1467static void 1468g_new_io_request(struct bio *bp, struct g_consumer *cp) 1469{ 1470 struct bio *top = bp; 1471 1472 /* 1473 * bio classification: if bio_caller1 is available in the 1474 * root of the 'struct bio' tree, store there the thread id 1475 * of the thread that originated the request. 1476 * More sophisticated classification schemes can be used. 1477 */ 1478 while (top->bio_parent) 1479 top = top->bio_parent; 1480 1481 if (top->bio_caller1 == NULL) 1482 top->bio_caller1 = curthread; 1483} 1484 1485#error please add the code above in g_new_io_request() to the beginning of \ 1486 /sys/geom/geom_io.c::g_io_request(), and remove this line. 1487#endif /* end of arch-specific code */ 1488 1489static int 1490g_ioreq_patch(void) 1491{ 1492 u_char *original; 1493 u_long ofs; 1494 int found; 1495 1496 if (me.gs_patched) 1497 return (-1); 1498 1499 original = (u_char *)g_io_request; 1500 1501 found = !bcmp(original, fake_ioreq + CODE_SIZE, STORE_SIZE); 1502 if (!found) 1503 return (-1); 1504 1505 /* Jump back to the original + STORE_SIZE. */ 1506 ofs = (original + STORE_SIZE) - (fake_ioreq + SIZE); 1507 bcopy(&ofs, fake_ioreq + CODE_SIZE + STORE_SIZE + 1, 4); 1508 1509 /* Patch the original address with a jump to the trampoline. */ 1510 *original = 0xe9; /* jump opcode */ 1511 ofs = fake_ioreq - (original + 5); 1512 bcopy(&ofs, original + 1, 4); 1513 1514 me.gs_patched = 1; 1515 1516 return (0); 1517} 1518 1519/* 1520 * Restore the original code, this is easy. 1521 */ 1522static void 1523g_ioreq_restore(void) 1524{ 1525 u_char *original; 1526 1527 if (me.gs_patched) { 1528 original = (u_char *)g_io_request; 1529 bcopy(fake_ioreq + CODE_SIZE, original, STORE_SIZE); 1530 me.gs_patched = 0; 1531 } 1532} 1533 1534static inline void 1535g_classifier_ini(void) 1536{ 1537 1538 g_ioreq_patch(); 1539} 1540 1541static inline void 1542g_classifier_fini(void) 1543{ 1544 1545 g_ioreq_restore(); 1546} 1547 1548/*--- end of support code for older FreeBSD versions */ 1549 1550#else /* HAVE_BIO_CLASSIFIER */ 1551 1552/* 1553 * Classifier support for recent FreeBSD versions: we use 1554 * a very simple classifier, only use curthread to tag a request. 1555 * The classifier is registered at module load, and unregistered 1556 * at module unload. 1557 */ 1558static int 1559g_sched_tag(void *arg, struct bio *bp) 1560{ 1561 1562 bp->bio_classifier1 = curthread; 1563 return (1); 1564} 1565 1566static struct g_classifier_hook g_sched_classifier = { 1567 .func = g_sched_tag, 1568}; 1569 1570static inline void 1571g_classifier_ini(void) 1572{ 1573 1574 g_register_classifier(&g_sched_classifier); 1575} 1576 1577static inline void 1578g_classifier_fini(void) 1579{ 1580 1581 g_unregister_classifier(&g_sched_classifier); 1582} 1583#endif /* HAVE_BIO_CLASSIFIER */ 1584 1585static void 1586g_sched_init(struct g_class *mp) 1587{ 1588 1589 g_gsched_global_init(); 1590 1591 G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.", 1592 mp, &g_sched_class); 1593 1594 /* Patch g_io_request to store classification info in the bio. */ 1595 g_classifier_ini(); 1596} 1597 1598static void 1599g_sched_fini(struct g_class *mp) 1600{ 1601 1602 g_classifier_fini(); 1603 1604 G_SCHED_DEBUG(0, "Unloading..."); 1605 1606 KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers")); 1607 mtx_destroy(&me.gs_mtx); 1608} 1609 1610/* 1611 * Read the i-th argument for a request, skipping the /dev/ 1612 * prefix if present. 1613 */ 1614static const char * 1615g_sched_argi(struct gctl_req *req, int i) 1616{ 1617 static const char *dev_prefix = "/dev/"; 1618 const char *name; 1619 char param[16]; 1620 int l = strlen(dev_prefix); 1621 1622 snprintf(param, sizeof(param), "arg%d", i); 1623 name = gctl_get_asciiparam(req, param); 1624 if (name == NULL) 1625 gctl_error(req, "No 'arg%d' argument", i); 1626 else if (strncmp(name, dev_prefix, l) == 0) 1627 name += l; 1628 return (name); 1629} 1630 1631/* 1632 * Fetch nargs and do appropriate checks. 1633 */ 1634static int 1635g_sched_get_nargs(struct gctl_req *req) 1636{ 1637 int *nargs; 1638 1639 nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); 1640 if (nargs == NULL) { 1641 gctl_error(req, "No 'nargs' argument"); 1642 return (0); 1643 } 1644 if (*nargs <= 0) 1645 gctl_error(req, "Missing device(s)."); 1646 return (*nargs); 1647} 1648 1649/* 1650 * Check whether we should add the class on certain volumes when 1651 * this geom is created. Right now this is under control of a kenv 1652 * variable containing the names of all devices that we care about. 1653 * Probably we should only support transparent insertion as the 1654 * preferred mode of operation. 1655 */ 1656static struct g_geom * 1657g_sched_taste(struct g_class *mp, struct g_provider *pp, 1658 int flags __unused) 1659{ 1660 struct g_gsched *gsp = NULL; /* the . algorithm we want */ 1661 const char *s; /* generic string pointer */ 1662 const char *taste_names; /* devices we like */ 1663 int l; 1664 1665 g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, 1666 mp->name, pp->name); 1667 g_topology_assert(); 1668 1669 G_SCHED_DEBUG(2, "Tasting %s.", pp->name); 1670 1671 do { 1672 /* do not taste on ourselves */ 1673 if (pp->geom->class == mp) 1674 break; 1675 1676 taste_names = getenv("geom.sched.taste"); 1677 if (taste_names == NULL) 1678 break; 1679 1680 l = strlen(pp->name); 1681 for (s = taste_names; *s && 1682 (s = strstr(s, pp->name)); s++) { 1683 /* further checks for an exact match */ 1684 if ( (s == taste_names || s[-1] == ' ') && 1685 (s[l] == '\0' || s[l] == ' ') ) 1686 break; 1687 } 1688 if (s == NULL) 1689 break; 1690 G_SCHED_DEBUG(0, "Attach device %s match [%s]\n", 1691 pp->name, s); 1692 1693 /* look up the provider name in the list */ 1694 s = getenv("geom.sched.algo"); 1695 if (s == NULL) 1696 s = "rr"; 1697 1698 gsp = g_gsched_find(s); /* also get a reference */ 1699 if (gsp == NULL) { 1700 G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s); 1701 break; 1702 } 1703 1704 /* XXX create with 1 as last argument ? */ 1705 g_sched_create(NULL, mp, pp, gsp, 0); 1706 g_gsched_unref(gsp); 1707 } while (0); 1708 return NULL; 1709} 1710 1711static void 1712g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy) 1713{ 1714 struct g_provider *pp; 1715 struct g_gsched *gsp; 1716 const char *name; 1717 int i, nargs; 1718 1719 g_topology_assert(); 1720 1721 name = gctl_get_asciiparam(req, "algo"); 1722 if (name == NULL) { 1723 gctl_error(req, "No '%s' argument", "algo"); 1724 return; 1725 } 1726 1727 gsp = g_gsched_find(name); /* also get a reference */ 1728 if (gsp == NULL) { 1729 gctl_error(req, "Bad algorithm '%s'", name); 1730 return; 1731 } 1732 1733 nargs = g_sched_get_nargs(req); 1734 1735 /* 1736 * Run on the arguments, and break on any error. 1737 * We look for a device name, but skip the /dev/ prefix if any. 1738 */ 1739 for (i = 0; i < nargs; i++) { 1740 name = g_sched_argi(req, i); 1741 if (name == NULL) 1742 break; 1743 pp = g_provider_by_name(name); 1744 if (pp == NULL) { 1745 G_SCHED_DEBUG(1, "Provider %s is invalid.", name); 1746 gctl_error(req, "Provider %s is invalid.", name); 1747 break; 1748 } 1749 if (g_sched_create(req, mp, pp, gsp, proxy) != 0) 1750 break; 1751 } 1752 1753 g_gsched_unref(gsp); 1754} 1755 1756static void 1757g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp) 1758{ 1759 struct g_provider *pp; 1760 struct g_gsched *gsp; 1761 const char *name; 1762 int i, nargs; 1763 1764 g_topology_assert(); 1765 1766 name = gctl_get_asciiparam(req, "algo"); 1767 if (name == NULL) { 1768 gctl_error(req, "No '%s' argument", "algo"); 1769 return; 1770 } 1771 1772 gsp = g_gsched_find(name); /* also get a reference */ 1773 if (gsp == NULL) { 1774 gctl_error(req, "Bad algorithm '%s'", name); 1775 return; 1776 } 1777 1778 nargs = g_sched_get_nargs(req); 1779 1780 /* 1781 * Run on the arguments, and break on any error. 1782 * We look for a device name, but skip the /dev/ prefix if any. 1783 */ 1784 for (i = 0; i < nargs; i++) { 1785 name = g_sched_argi(req, i); 1786 if (name == NULL) 1787 break; 1788 pp = g_provider_by_name(name); 1789 if (pp == NULL || pp->geom->class != mp) { 1790 G_SCHED_DEBUG(1, "Provider %s is invalid.", name); 1791 gctl_error(req, "Provider %s is invalid.", name); 1792 break; 1793 } 1794 if (g_sched_change_algo(req, mp, pp, gsp) != 0) 1795 break; 1796 } 1797 1798 g_gsched_unref(gsp); 1799} 1800 1801static struct g_geom * 1802g_sched_find_geom(struct g_class *mp, const char *name) 1803{ 1804 struct g_geom *gp; 1805 1806 LIST_FOREACH(gp, &mp->geom, geom) { 1807 if (strcmp(gp->name, name) == 0) 1808 return (gp); 1809 } 1810 return (NULL); 1811} 1812 1813static void 1814g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp) 1815{ 1816 int nargs, *force, error, i; 1817 struct g_geom *gp; 1818 const char *name; 1819 1820 g_topology_assert(); 1821 1822 nargs = g_sched_get_nargs(req); 1823 1824 force = gctl_get_paraml(req, "force", sizeof(*force)); 1825 if (force == NULL) { 1826 gctl_error(req, "No 'force' argument"); 1827 return; 1828 } 1829 1830 for (i = 0; i < nargs; i++) { 1831 name = g_sched_argi(req, i); 1832 if (name == NULL) 1833 break; 1834 1835 gp = g_sched_find_geom(mp, name); 1836 if (gp == NULL) { 1837 G_SCHED_DEBUG(1, "Device %s is invalid.", name); 1838 gctl_error(req, "Device %s is invalid.", name); 1839 break; 1840 } 1841 1842 error = g_sched_destroy(gp, *force); 1843 if (error != 0) { 1844 gctl_error(req, "Cannot destroy device %s (error=%d).", 1845 gp->name, error); 1846 break; 1847 } 1848 } 1849} 1850 1851static void 1852g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb) 1853{ 1854 uint32_t *version; 1855 1856 g_topology_assert(); 1857 1858 version = gctl_get_paraml(req, "version", sizeof(*version)); 1859 if (version == NULL) { 1860 gctl_error(req, "No '%s' argument.", "version"); 1861 return; 1862 } 1863 1864 if (*version != G_SCHED_VERSION) { 1865 gctl_error(req, "Userland and kernel parts are " 1866 "out of sync."); 1867 return; 1868 } 1869 1870 if (strcmp(verb, "create") == 0) { 1871 g_sched_ctl_create(req, mp, 0); 1872 return; 1873 } else if (strcmp(verb, "insert") == 0) { 1874 g_sched_ctl_create(req, mp, 1); 1875 return; 1876 } else if (strcmp(verb, "configure") == 0) { 1877 g_sched_ctl_configure(req, mp); 1878 return; 1879 } else if (strcmp(verb, "destroy") == 0) { 1880 g_sched_ctl_destroy(req, mp); 1881 return; 1882 } 1883 1884 gctl_error(req, "Unknown verb."); 1885} 1886 1887static void 1888g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, 1889 struct g_consumer *cp, struct g_provider *pp) 1890{ 1891 struct g_sched_softc *sc = gp->softc; 1892 struct g_gsched *gsp = sc->sc_gsched; 1893 if (indent == NULL) { /* plaintext */ 1894 sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--"); 1895 } 1896 if (gsp->gs_dumpconf) 1897 gsp->gs_dumpconf(sb, indent, gp, cp, pp); 1898} 1899 1900DECLARE_GEOM_CLASS(g_sched_class, g_sched); 1901MODULE_VERSION(geom_sched, 0); 1902