g_sched.c revision 221453
/*-
 * Copyright (c) 2009-2010 Fabio Checconi
 * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id$
 * $FreeBSD: head/sys/geom/sched/g_sched.c 221453 2011-05-04 18:41:26Z ae $
 *
 * Main control module for geom-based disk schedulers ('sched').
 *
 * USER VIEW
 * A 'sched' node is typically inserted transparently between
 * an existing provider pp and its original geom gp
 *
 *	[pp --> gp ..]
 *
 * using the command "geom sched insert <provider>" and
 * resulting in the following topology
 *
 *	[pp --> sched_gp --> cp]   [new_pp --> gp ... ]
 *
 * Deletion "geom sched destroy <provider>.sched." restores the
 * original chain.  The normal "geom sched create <provider>"
 * is also supported.
 *
 * INTERNALS
 * Internally, the 'sched' uses the following data structures
 *
 *   geom{}         g_sched_softc{}      g_gsched{}
 * +----------+    +---------------+   +-------------+
 * |  softc *-|--->| sc_gsched   *-|-->|  gs_init    |
 * | ...      |    |               |   |  gs_fini    |
 * |          |    | [ hash table] |   |  gs_start   |
 * +----------+    |               |   |  ...        |
 *                 |               |   +-------------+
 *                 |               |
 *                 |               |     g_*_softc{}
 *                 |               |   +-------------+
 *                 | sc_data     *-|-->|             |
 *                 +---------------+   | algorithm-  |
 *                                     | specific    |
 *                                     +-------------+
 *
 * A g_sched_softc{} is created with a "geom sched insert" call.
 * In turn this instantiates a specific scheduling algorithm,
 * which sets sc_gsched to point to the algorithm callbacks,
 * and calls gs_init() to create the g_*_softc{} .
 * The other callbacks (gs_start, gs_next, ...) are invoked
 * as needed.
 *
 * g_sched_softc{} is defined in g_sched.h and mostly used here;
 * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h;
 * g_*_softc{} is defined/implemented by each algorithm (gs_*.c)
 *
 * DATA MOVING
 * When a bio is received on the provider, it goes to
 * g_sched_start(), which calls gs_start() to initially queue it;
 * then we call g_sched_dispatch(), which loops around gs_next()
 * to select zero or more bio's to be sent downstream.
 *
 * g_sched_dispatch() can also be called as a result of a timeout,
 * e.g. when doing anticipation or pacing requests.
 *
 * When a bio comes back, it goes to g_sched_done() which in turn
 * calls gs_done(). The latter does any necessary housekeeping in
 * the scheduling algorithm, and may decide to call g_sched_dispatch()
 * to send more bio's downstream.
 *
 * If an algorithm needs per-flow queues, these are created
 * calling gs_init_class() and destroyed with gs_fini_class(),
 * and they are also inserted in the hash table implemented in
 * the g_sched_softc{}.
 *
 * If an algorithm is replaced, or a transparently-inserted node is
 * removed with "geom sched destroy", we need to remove all references
 * to the g_*_softc{} and g_sched_softc from the bio's still in
 * the scheduler. g_sched_forced_dispatch() helps doing this.
 * XXX need to explain better.
 */
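
/*
 * Illustrative sketch (not compiled, kept under #if 0): a scheduling
 * algorithm plugs into this framework by filling a g_gsched{} descriptor
 * with the callbacks mentioned above and registering it through
 * g_gsched_modevent() at module load time.  Only fields that this file
 * itself references are shown; the authoritative definition lives in
 * gs_scheduler.h, and the bundled "rr" algorithm is a complete example.
 * All g_example_* names below are hypothetical.
 */
#if 0
static struct g_gsched g_example_gsched = {
        .gs_name        = "example",
        .gs_priv_size   = sizeof(struct g_example_class),  /* per-flow state */
        .gs_init        = g_example_init,       /* create the algorithm softc */
        .gs_fini        = g_example_fini,
        .gs_start       = g_example_start,      /* queue an incoming bio */
        .gs_next        = g_example_next,       /* pick the next bio to push down */
        .gs_done        = g_example_done,       /* housekeeping on completion */
        .gs_init_class  = g_example_init_class,
        .gs_fini_class  = g_example_fini_class,
};
#endif
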
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bio.h>
#include <sys/limits.h>
#include <sys/hash.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/proc.h>           /* we access curthread */
#include <geom/geom.h>
#include "gs_scheduler.h"
#include "g_sched.h"            /* geom hooks */

/*
 * Size of the per-geom hash table storing traffic classes.
 * We may decide to change it at a later time, it has no ABI
 * implications as it is only used for run-time allocations.
 */
#define G_SCHED_HASH_SIZE       32

static int      g_sched_destroy(struct g_geom *gp, boolean_t force);
static int      g_sched_destroy_geom(struct gctl_req *req,
    struct g_class *mp, struct g_geom *gp);
static void     g_sched_config(struct gctl_req *req, struct g_class *mp,
    const char *verb);
static struct g_geom *g_sched_taste(struct g_class *mp,
    struct g_provider *pp, int flags __unused);
static void     g_sched_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void     g_sched_init(struct g_class *mp);
static void     g_sched_fini(struct g_class *mp);
static int      g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data,
    int fflag, struct thread *td);

struct g_class g_sched_class = {
        .name = G_SCHED_CLASS_NAME,
        .version = G_VERSION,
        .ctlreq = g_sched_config,
        .taste = g_sched_taste,
        .destroy_geom = g_sched_destroy_geom,
        .init = g_sched_init,
        .ioctl = g_sched_ioctl,
        .fini = g_sched_fini
};

MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures");

/*
 * Global variables describing the state of the geom_sched module.
 * There is only one static instance of this structure.
 */
LIST_HEAD(gs_list, g_gsched);   /* type, link field */
struct geom_sched_vars {
        struct mtx      gs_mtx;
        struct gs_list  gs_scheds;      /* list of algorithms */
        u_int           gs_debug;
        u_int           gs_sched_count; /* how many algorithms ? */
        u_int           gs_patched;     /* g_io_request was patched */

        u_int           gs_initialized;
        u_int           gs_expire_secs; /* expiration of hash entries */

        struct bio_queue_head gs_pending;
        u_int           gs_npending;

        /* The following are for stats, usually protected by gs_mtx. */
        u_long          gs_requests;    /* total requests */
        u_long          gs_done;        /* total done */
        u_int           gs_in_flight;   /* requests in flight */
        u_int           gs_writes_in_flight;
        u_int           gs_bytes_in_flight;
        u_int           gs_write_bytes_in_flight;

        char            gs_names[256];  /* names of schedulers */
};

static struct geom_sched_vars me = {
        .gs_expire_secs = 10,
};

SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0,
    "GEOM_SCHED stuff");

SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD,
    &me.gs_write_bytes_in_flight, 0, "Write bytes in flight");

SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD,
    &me.gs_bytes_in_flight, 0, "Bytes in flight");

SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD,
    &me.gs_writes_in_flight, 0, "Write Requests in flight");

SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD,
    &me.gs_in_flight, 0, "Requests in flight");

SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD,
    &me.gs_done, 0, "Total done");

SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD,
    &me.gs_requests, 0, "Total requests");

SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD,
    &me.gs_names, 0, "Algorithm names");

SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD,
    &me.gs_sched_count, 0, "Number of algorithms");

SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW,
    &me.gs_debug, 0, "Debug level");

SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW,
    &me.gs_expire_secs, 0, "Expire time in seconds");

/*
 * g_sched calls the scheduler algorithms with this lock held.
 * The locking functions are exposed so the scheduler algorithms can also
 * protect themselves e.g. when running a callout handler.
 */
void
g_sched_lock(struct g_geom *gp)
{
        struct g_sched_softc *sc = gp->softc;

        mtx_lock(&sc->sc_mtx);
}

void
g_sched_unlock(struct g_geom *gp)
{
        struct g_sched_softc *sc = gp->softc;

        mtx_unlock(&sc->sc_mtx);
}

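/*
 * Illustrative sketch (not compiled, kept under #if 0): a scheduling
 * algorithm would use the exported lock as follows from a callout
 * handler, e.g. when an anticipation or pacing timeout fires and queued
 * bio's may now be pushed downstream.  The g_example_softc type and its
 * fields are hypothetical.
 */
#if 0
static void
g_example_timeout(void *arg)
{
        struct g_example_softc *es = arg;

        g_sched_lock(es->es_geom);
        es->es_waiting = 0;                     /* stop anticipating */
        g_sched_dispatch(es->es_geom);          /* push out queued bio's */
        g_sched_unlock(es->es_geom);
}
#endif
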
/*
 * Support functions to handle references to the module,
 * which are coming from devices using this scheduler.
 */
static inline void
g_gsched_ref(struct g_gsched *gsp)
{

        atomic_add_int(&gsp->gs_refs, 1);
}

static inline void
g_gsched_unref(struct g_gsched *gsp)
{

        atomic_add_int(&gsp->gs_refs, -1);
}

/*
 * Update the stats when this request is done.
 */
static void
g_sched_update_stats(struct bio *bio)
{

        me.gs_done++;
        me.gs_in_flight--;
        me.gs_bytes_in_flight -= bio->bio_length;
        if (bio->bio_cmd & BIO_WRITE) {
                me.gs_writes_in_flight--;
                me.gs_write_bytes_in_flight -= bio->bio_length;
        }
}

/*
 * Dispatch any pending request.
 */
static void
g_sched_forced_dispatch(struct g_geom *gp)
{
        struct g_sched_softc *sc = gp->softc;
        struct g_gsched *gsp = sc->sc_gsched;
        struct bio *bp;

        KASSERT(mtx_owned(&sc->sc_mtx),
            ("sc_mtx not owned during forced dispatch"));

        while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL)
                g_io_request(bp, LIST_FIRST(&gp->consumer));
}

/*
 * The main dispatch loop, called either here after the start
 * routine, or by scheduling algorithms when they receive a timeout
 * or a 'done' notification.  Does not share code with the forced
 * dispatch path, since the gs_done() callback can call us.
 */
void
g_sched_dispatch(struct g_geom *gp)
{
        struct g_sched_softc *sc = gp->softc;
        struct g_gsched *gsp = sc->sc_gsched;
        struct bio *bp;

        KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch"));

        if ((sc->sc_flags & G_SCHED_FLUSHING))
                return;

        while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL)
                g_io_request(bp, LIST_FIRST(&gp->consumer));
}

/*
 * Recent (8.0 and above) versions of FreeBSD have support to
 * register classifiers of disk requests. The classifier is
 * invoked by g_io_request(), and stores the information into
 * bp->bio_classifier1.
 *
 * Support for older versions, which is left here only for
 * documentation purposes, relies on two hacks:
 * 1. classification info is written into the bio_caller1
 *    field of the topmost node in the bio chain. This field
 *    is rarely used, but this module is incompatible with
 *    those that use bio_caller1 for other purposes,
 *    such as ZFS and gjournal;
 * 2. g_io_request() is patched in-memory when the module is
 *    loaded, so that the function calls a classifier as its
 *    first thing. g_io_request() is restored when the module
 *    is unloaded.  This functionality is only supported for
 *    x86 and amd64, other architectures need source code changes.
 */

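/*
 * Illustrative sketch (not compiled, kept under #if 0): seen from a
 * scheduling algorithm, the classification key maintained here is
 * normally consumed through the g_sched_get_class()/g_sched_put_class()
 * helpers defined below, which return the per-flow descriptor allocated
 * via gs_init_class().  A gs_start() callback might use them roughly as
 * follows; the g_example_* names and fields are hypothetical, and
 * returning a non-zero value tells g_sched_start() to bypass the
 * scheduler for this bio.
 */
#if 0
static int
g_example_start(void *data, struct bio *bp)
{
        struct g_example_softc *es = data;
        struct g_example_class *ec;

        ec = g_sched_get_class(es->es_geom, bp);
        if (ec == NULL)
                return (-1);            /* no per-flow state, bypass */

        bp->bio_caller1 = ec;           /* remember the flow for gs_done() */
        gs_bioq_disksort(&ec->ec_queue, bp);

        return (0);
}
#endif
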
/*
 * Lookup the identity of the issuer of the original request.
 * In the current implementation we use the curthread of the
 * issuer, but different mechanisms may be implemented later
 * so we do not make assumptions on the return value which for
 * us is just an opaque identifier.
 */

static inline u_long
g_sched_classify(struct bio *bp)
{

#if __FreeBSD_version > 800098
        /* we have classifier fields in the struct bio */
#define HAVE_BIO_CLASSIFIER
        return ((u_long)bp->bio_classifier1);
#else
#warning old version!!!
        while (bp->bio_parent != NULL)
                bp = bp->bio_parent;

        return ((u_long)bp->bio_caller1);
#endif
}

/* Return the hash chain for the given key. */
static inline struct g_hash *
g_sched_hash(struct g_sched_softc *sc, u_long key)
{

        return (&sc->sc_hash[key & sc->sc_mask]);
}

/*
 * Helper function for the children classes, which takes
 * a geom and a bio and returns the private descriptor
 * associated to the request.  This involves fetching
 * the classification field and [al]locating the
 * corresponding entry in the hash table.
 */
void *
g_sched_get_class(struct g_geom *gp, struct bio *bp)
{
        struct g_sched_softc *sc;
        struct g_sched_class *gsc;
        struct g_gsched *gsp;
        struct g_hash *bucket;
        u_long key;

        sc = gp->softc;
        key = g_sched_classify(bp);
        bucket = g_sched_hash(sc, key);
        LIST_FOREACH(gsc, bucket, gsc_clist) {
                if (key == gsc->gsc_key) {
                        gsc->gsc_refs++;
                        return (gsc->gsc_priv);
                }
        }

        gsp = sc->sc_gsched;
        gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size,
            M_GEOM_SCHED, M_NOWAIT | M_ZERO);
        if (!gsc)
                return (NULL);

        if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) {
                free(gsc, M_GEOM_SCHED);
                return (NULL);
        }

        gsc->gsc_refs = 2;      /* 1 for the hash table, 1 for the caller.
*/ 407 gsc->gsc_key = key; 408 LIST_INSERT_HEAD(bucket, gsc, gsc_clist); 409 410 gsc->gsc_expire = ticks + me.gs_expire_secs * hz; 411 412 return (gsc->gsc_priv); 413} 414 415/* 416 * Release a reference to the per-client descriptor, 417 */ 418void 419g_sched_put_class(struct g_geom *gp, void *priv) 420{ 421 struct g_sched_class *gsc; 422 struct g_sched_softc *sc; 423 424 gsc = g_sched_priv2class(priv); 425 gsc->gsc_expire = ticks + me.gs_expire_secs * hz; 426 427 if (--gsc->gsc_refs > 0) 428 return; 429 430 sc = gp->softc; 431 sc->sc_gsched->gs_fini_class(sc->sc_data, priv); 432 433 LIST_REMOVE(gsc, gsc_clist); 434 free(gsc, M_GEOM_SCHED); 435} 436 437static void 438g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask, 439 struct g_gsched *gsp, void *data) 440{ 441 struct g_sched_class *cp, *cp2; 442 int i; 443 444 if (!hp) 445 return; 446 447 if (data && gsp->gs_hash_unref) 448 gsp->gs_hash_unref(data); 449 450 for (i = 0; i < G_SCHED_HASH_SIZE; i++) { 451 LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2) 452 g_sched_put_class(gp, cp->gsc_priv); 453 } 454 455 hashdestroy(hp, M_GEOM_SCHED, mask); 456} 457 458static struct g_hash * 459g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags) 460{ 461 struct g_hash *hash; 462 463 if (gsp->gs_priv_size == 0) 464 return (NULL); 465 466 hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags); 467 468 return (hash); 469} 470 471static void 472g_sched_flush_classes(struct g_geom *gp) 473{ 474 struct g_sched_softc *sc; 475 struct g_sched_class *cp, *cp2; 476 int i; 477 478 sc = gp->softc; 479 480 if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0) 481 return; 482 483 for (i = 0; i < G_SCHED_HASH_SIZE; i++) { 484 LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) { 485 if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0) 486 g_sched_put_class(gp, cp->gsc_priv); 487 } 488 } 489 490 sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz; 491} 492 493/* 494 * Wait for the completion of any outstanding request. To ensure 495 * that this does not take forever the caller has to make sure that 496 * no new request enter the scehduler before calling us. 497 * 498 * Must be called with the gp mutex held and topology locked. 499 */ 500static int 501g_sched_wait_pending(struct g_geom *gp) 502{ 503 struct g_sched_softc *sc = gp->softc; 504 int endticks = ticks + hz; 505 506 g_topology_assert(); 507 508 while (sc->sc_pending && endticks - ticks >= 0) 509 msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4); 510 511 return (sc->sc_pending ? ETIMEDOUT : 0); 512} 513 514static int 515g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp) 516{ 517 struct g_sched_softc *sc = gp->softc; 518 int error; 519 520 /* Set the flushing flag: new bios will not enter the scheduler. */ 521 sc->sc_flags |= G_SCHED_FLUSHING; 522 523 g_sched_forced_dispatch(gp); 524 error = g_sched_wait_pending(gp); 525 if (error) 526 goto failed; 527 528 /* No more requests pending or in flight from the old gsp. */ 529 530 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data); 531 sc->sc_hash = NULL; 532 533 /* 534 * Avoid deadlock here by releasing the gp mutex and reacquiring 535 * it once done. It should be safe, since no reconfiguration or 536 * destruction can take place due to the geom topology lock; no 537 * new request can use the current sc_data since we flagged the 538 * geom as being flushed. 
539 */ 540 g_sched_unlock(gp); 541 gsp->gs_fini(sc->sc_data); 542 g_sched_lock(gp); 543 544 sc->sc_gsched = NULL; 545 sc->sc_data = NULL; 546 g_gsched_unref(gsp); 547 548failed: 549 sc->sc_flags &= ~G_SCHED_FLUSHING; 550 551 return (error); 552} 553 554static int 555g_sched_remove(struct g_geom *gp, struct g_gsched *gsp) 556{ 557 int error; 558 559 g_sched_lock(gp); 560 error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */ 561 g_sched_unlock(gp); 562 563 return (error); 564} 565 566/* 567 * Support function for create/taste -- locate the desired 568 * algorithm and grab a reference to it. 569 */ 570static struct g_gsched * 571g_gsched_find(const char *name) 572{ 573 struct g_gsched *gsp = NULL; 574 575 mtx_lock(&me.gs_mtx); 576 LIST_FOREACH(gsp, &me.gs_scheds, glist) { 577 if (strcmp(name, gsp->gs_name) == 0) { 578 g_gsched_ref(gsp); 579 break; 580 } 581 } 582 mtx_unlock(&me.gs_mtx); 583 584 return (gsp); 585} 586 587/* 588 * Rebuild the list of scheduler names. 589 * To be called with me.gs_mtx lock held. 590 */ 591static void 592g_gsched_build_names(struct g_gsched *gsp) 593{ 594 int pos, l; 595 struct g_gsched *cur; 596 597 pos = 0; 598 LIST_FOREACH(cur, &me.gs_scheds, glist) { 599 l = strlen(cur->gs_name); 600 if (l + pos + 1 + 1 < sizeof(me.gs_names)) { 601 if (pos != 0) 602 me.gs_names[pos++] = ' '; 603 strcpy(me.gs_names + pos, cur->gs_name); 604 pos += l; 605 } 606 } 607 me.gs_names[pos] = '\0'; 608} 609 610/* 611 * Register or unregister individual scheduling algorithms. 612 */ 613static int 614g_gsched_register(struct g_gsched *gsp) 615{ 616 struct g_gsched *cur; 617 int error = 0; 618 619 mtx_lock(&me.gs_mtx); 620 LIST_FOREACH(cur, &me.gs_scheds, glist) { 621 if (strcmp(gsp->gs_name, cur->gs_name) == 0) 622 break; 623 } 624 if (cur != NULL) { 625 G_SCHED_DEBUG(0, "A scheduler named %s already" 626 "exists.", gsp->gs_name); 627 error = EEXIST; 628 } else { 629 LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist); 630 gsp->gs_refs = 1; 631 me.gs_sched_count++; 632 g_gsched_build_names(gsp); 633 } 634 mtx_unlock(&me.gs_mtx); 635 636 return (error); 637} 638 639struct g_gsched_unregparm { 640 struct g_gsched *gup_gsp; 641 int gup_error; 642}; 643 644static void 645g_gsched_unregister(void *arg, int flag) 646{ 647 struct g_gsched_unregparm *parm = arg; 648 struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp; 649 struct g_sched_softc *sc; 650 struct g_geom *gp, *gp_tmp; 651 int error; 652 653 parm->gup_error = 0; 654 655 g_topology_assert(); 656 657 if (flag == EV_CANCEL) 658 return; 659 660 mtx_lock(&me.gs_mtx); 661 662 LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) { 663 if (gp->class != &g_sched_class) 664 continue; /* Should not happen. 
*/ 665 666 sc = gp->softc; 667 if (sc->sc_gsched == gsp) { 668 error = g_sched_remove(gp, gsp); 669 if (error) 670 goto failed; 671 } 672 } 673 674 LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) { 675 if (cur != gsp) 676 continue; 677 678 if (gsp->gs_refs != 1) { 679 G_SCHED_DEBUG(0, "%s still in use.", 680 gsp->gs_name); 681 parm->gup_error = EBUSY; 682 } else { 683 LIST_REMOVE(gsp, glist); 684 me.gs_sched_count--; 685 g_gsched_build_names(gsp); 686 } 687 break; 688 } 689 690 if (cur == NULL) { 691 G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name); 692 parm->gup_error = ENOENT; 693 } 694 695failed: 696 mtx_unlock(&me.gs_mtx); 697} 698 699static inline void 700g_gsched_global_init(void) 701{ 702 703 if (!me.gs_initialized) { 704 G_SCHED_DEBUG(0, "Initializing global data."); 705 mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF); 706 LIST_INIT(&me.gs_scheds); 707 gs_bioq_init(&me.gs_pending); 708 me.gs_initialized = 1; 709 } 710} 711 712/* 713 * Module event called when a scheduling algorithm module is loaded or 714 * unloaded. 715 */ 716int 717g_gsched_modevent(module_t mod, int cmd, void *arg) 718{ 719 struct g_gsched *gsp = arg; 720 struct g_gsched_unregparm parm; 721 int error; 722 723 G_SCHED_DEBUG(0, "Modevent %d.", cmd); 724 725 /* 726 * If the module is loaded at boot, the geom thread that calls 727 * g_sched_init() might actually run after g_gsched_modevent(), 728 * so make sure that the module is properly initialized. 729 */ 730 g_gsched_global_init(); 731 732 error = EOPNOTSUPP; 733 switch (cmd) { 734 case MOD_LOAD: 735 error = g_gsched_register(gsp); 736 G_SCHED_DEBUG(0, "Loaded module %s error %d.", 737 gsp->gs_name, error); 738 if (error == 0) 739 g_retaste(&g_sched_class); 740 break; 741 742 case MOD_UNLOAD: 743 parm.gup_gsp = gsp; 744 parm.gup_error = 0; 745 746 error = g_waitfor_event(g_gsched_unregister, 747 &parm, M_WAITOK, NULL); 748 if (error == 0) 749 error = parm.gup_error; 750 G_SCHED_DEBUG(0, "Unloaded module %s error %d.", 751 gsp->gs_name, error); 752 break; 753 }; 754 755 return (error); 756} 757 758#ifdef KTR 759#define TRC_BIO_EVENT(e, bp) g_sched_trace_bio_ ## e (bp) 760 761static inline char 762g_sched_type(struct bio *bp) 763{ 764 765 if (0 != (bp->bio_cmd & BIO_READ)) 766 return ('R'); 767 else if (0 != (bp->bio_cmd & BIO_WRITE)) 768 return ('W'); 769 return ('U'); 770} 771 772static inline void 773g_sched_trace_bio_START(struct bio *bp) 774{ 775 776 CTR5(KTR_GSCHED, "S %lu %c %lu/%lu %lu", g_sched_classify(bp), 777 g_sched_type(bp), bp->bio_offset / ULONG_MAX, 778 bp->bio_offset, bp->bio_length); 779} 780 781static inline void 782g_sched_trace_bio_DONE(struct bio *bp) 783{ 784 785 CTR5(KTR_GSCHED, "D %lu %c %lu/%lu %lu", g_sched_classify(bp), 786 g_sched_type(bp), bp->bio_offset / ULONG_MAX, 787 bp->bio_offset, bp->bio_length); 788} 789#else /* !KTR */ 790#define TRC_BIO_EVENT(e, bp) 791#endif /* !KTR */ 792 793/* 794 * g_sched_done() and g_sched_start() dispatch the geom requests to 795 * the scheduling algorithm in use. 
796 */ 797static void 798g_sched_done(struct bio *bio) 799{ 800 struct g_geom *gp = bio->bio_caller2; 801 struct g_sched_softc *sc = gp->softc; 802 803 TRC_BIO_EVENT(DONE, bio); 804 805 KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done")); 806 807 g_sched_lock(gp); 808 809 g_sched_update_stats(bio); 810 sc->sc_gsched->gs_done(sc->sc_data, bio); 811 if (!--sc->sc_pending) 812 wakeup(gp); 813 814 g_sched_flush_classes(gp); 815 g_sched_unlock(gp); 816 817 g_std_done(bio); 818} 819 820static void 821g_sched_start(struct bio *bp) 822{ 823 struct g_geom *gp = bp->bio_to->geom; 824 struct g_sched_softc *sc = gp->softc; 825 struct bio *cbp; 826 827 TRC_BIO_EVENT(START, bp); 828 G_SCHED_LOGREQ(bp, "Request received."); 829 830 cbp = g_clone_bio(bp); 831 if (cbp == NULL) { 832 g_io_deliver(bp, ENOMEM); 833 return; 834 } 835 cbp->bio_done = g_sched_done; 836 cbp->bio_to = LIST_FIRST(&gp->provider); 837 KASSERT(cbp->bio_to != NULL, ("NULL provider")); 838 839 /* We only schedule reads and writes. */ 840 if (0 == (bp->bio_cmd & (BIO_READ | BIO_WRITE))) 841 goto bypass; 842 843 G_SCHED_LOGREQ(cbp, "Sending request."); 844 845 g_sched_lock(gp); 846 /* 847 * Call the algorithm's gs_start to queue the request in the 848 * scheduler. If gs_start fails then pass the request down, 849 * otherwise call g_sched_dispatch() which tries to push 850 * one or more requests down. 851 */ 852 if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) || 853 sc->sc_gsched->gs_start(sc->sc_data, cbp)) { 854 g_sched_unlock(gp); 855 goto bypass; 856 } 857 /* 858 * We use bio_caller1 to mark requests that are scheduled 859 * so make sure it is not NULL. 860 */ 861 if (cbp->bio_caller1 == NULL) 862 cbp->bio_caller1 = &me; /* anything not NULL */ 863 864 cbp->bio_caller2 = gp; 865 sc->sc_pending++; 866 867 /* Update general stats. */ 868 me.gs_in_flight++; 869 me.gs_requests++; 870 me.gs_bytes_in_flight += bp->bio_length; 871 if (bp->bio_cmd & BIO_WRITE) { 872 me.gs_writes_in_flight++; 873 me.gs_write_bytes_in_flight += bp->bio_length; 874 } 875 g_sched_dispatch(gp); 876 g_sched_unlock(gp); 877 return; 878 879bypass: 880 cbp->bio_done = g_std_done; 881 cbp->bio_caller1 = NULL; /* not scheduled */ 882 g_io_request(cbp, LIST_FIRST(&gp->consumer)); 883} 884 885/* 886 * The next few functions are the geom glue. 
887 */ 888static void 889g_sched_orphan(struct g_consumer *cp) 890{ 891 892 g_topology_assert(); 893 g_sched_destroy(cp->geom, 1); 894} 895 896static int 897g_sched_access(struct g_provider *pp, int dr, int dw, int de) 898{ 899 struct g_geom *gp; 900 struct g_consumer *cp; 901 int error; 902 903 gp = pp->geom; 904 cp = LIST_FIRST(&gp->consumer); 905 error = g_access(cp, dr, dw, de); 906 907 return (error); 908} 909 910static void 911g_sched_temporary_start(struct bio *bio) 912{ 913 914 mtx_lock(&me.gs_mtx); 915 me.gs_npending++; 916 gs_bioq_disksort(&me.gs_pending, bio); 917 mtx_unlock(&me.gs_mtx); 918} 919 920static void 921g_sched_flush_pending(g_start_t *start) 922{ 923 struct bio *bp; 924 925 while ((bp = gs_bioq_takefirst(&me.gs_pending))) 926 start(bp); 927} 928 929static int 930g_insert_proxy(struct g_geom *gp, struct g_provider *newpp, 931 struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp) 932{ 933 struct g_sched_softc *sc = gp->softc; 934 g_start_t *saved_start, *flush = g_sched_start; 935 int error = 0, endticks = ticks + hz; 936 937 g_cancel_event(newpp); /* prevent taste() */ 938 /* copy private fields */ 939 newpp->private = pp->private; 940 newpp->index = pp->index; 941 942 /* Queue all the early requests coming for us. */ 943 me.gs_npending = 0; 944 saved_start = pp->geom->start; 945 dstgp->start = g_sched_temporary_start; 946 947 while (pp->nstart - pp->nend != me.gs_npending && 948 endticks - ticks >= 0) 949 tsleep(pp, PRIBIO, "-", hz/10); 950 951 if (pp->nstart - pp->nend != me.gs_npending) { 952 flush = saved_start; 953 error = ETIMEDOUT; 954 goto fail; 955 } 956 957 /* link pp to this geom */ 958 LIST_REMOVE(pp, provider); 959 pp->geom = gp; 960 LIST_INSERT_HEAD(&gp->provider, pp, provider); 961 962 /* 963 * replicate the counts from the parent in the 964 * new provider and consumer nodes 965 */ 966 cp->acr = newpp->acr = pp->acr; 967 cp->acw = newpp->acw = pp->acw; 968 cp->ace = newpp->ace = pp->ace; 969 sc->sc_flags |= G_SCHED_PROXYING; 970 971fail: 972 dstgp->start = saved_start; 973 974 g_sched_flush_pending(flush); 975 976 return (error); 977} 978 979/* 980 * Create a geom node for the device passed as *pp. 981 * If successful, add a reference to this gsp. 982 */ 983static int 984g_sched_create(struct gctl_req *req, struct g_class *mp, 985 struct g_provider *pp, struct g_gsched *gsp, int proxy) 986{ 987 struct g_sched_softc *sc = NULL; 988 struct g_geom *gp, *dstgp; 989 struct g_provider *newpp = NULL; 990 struct g_consumer *cp = NULL; 991 char name[64]; 992 int error; 993 994 g_topology_assert(); 995 996 snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX); 997 LIST_FOREACH(gp, &mp->geom, geom) { 998 if (strcmp(gp->name, name) == 0) { 999 gctl_error(req, "Geom %s already exists.", 1000 name); 1001 return (EEXIST); 1002 } 1003 } 1004 1005 gp = g_new_geomf(mp, name); 1006 dstgp = proxy ? pp->geom : gp; /* where do we link the provider */ 1007 1008 sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); 1009 sc->sc_gsched = gsp; 1010 sc->sc_data = gsp->gs_init(gp); 1011 if (sc->sc_data == NULL) { 1012 error = ENOMEM; 1013 goto fail; 1014 } 1015 1016 sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK); 1017 1018 /* 1019 * Do not initialize the flush mechanism, will be initialized 1020 * on the first insertion on the hash table. 
1021 */ 1022 1023 mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF); 1024 1025 gp->softc = sc; 1026 gp->start = g_sched_start; 1027 gp->orphan = g_sched_orphan; 1028 gp->access = g_sched_access; 1029 gp->dumpconf = g_sched_dumpconf; 1030 1031 newpp = g_new_providerf(dstgp, gp->name); 1032 newpp->mediasize = pp->mediasize; 1033 newpp->sectorsize = pp->sectorsize; 1034 1035 cp = g_new_consumer(gp); 1036 error = g_attach(cp, proxy ? newpp : pp); 1037 if (error != 0) { 1038 gctl_error(req, "Cannot attach to provider %s.", 1039 pp->name); 1040 goto fail; 1041 } 1042 1043 g_error_provider(newpp, 0); 1044 if (proxy) { 1045 error = g_insert_proxy(gp, newpp, dstgp, pp, cp); 1046 if (error) 1047 goto fail; 1048 } 1049 G_SCHED_DEBUG(0, "Device %s created.", gp->name); 1050 1051 g_gsched_ref(gsp); 1052 1053 return (0); 1054 1055fail: 1056 if (cp != NULL) { 1057 if (cp->provider != NULL) 1058 g_detach(cp); 1059 g_destroy_consumer(cp); 1060 } 1061 if (newpp != NULL) 1062 g_destroy_provider(newpp); 1063 if (sc->sc_hash) 1064 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, 1065 gsp, sc->sc_data); 1066 if (sc->sc_data) 1067 gsp->gs_fini(sc->sc_data); 1068 g_free(gp->softc); 1069 g_destroy_geom(gp); 1070 1071 return (error); 1072} 1073 1074/* 1075 * Support for dynamic switching of scheduling algorithms. 1076 * First initialize the data structures for the new algorithm, 1077 * then call g_sched_remove_locked() to flush all references 1078 * to the old one, finally link the new algorithm. 1079 */ 1080static int 1081g_sched_change_algo(struct gctl_req *req, struct g_class *mp, 1082 struct g_provider *pp, struct g_gsched *gsp) 1083{ 1084 struct g_sched_softc *sc; 1085 struct g_geom *gp; 1086 struct g_hash *newh; 1087 void *data; 1088 u_long mask; 1089 int error = 0; 1090 1091 gp = pp->geom; 1092 sc = gp->softc; 1093 1094 data = gsp->gs_init(gp); 1095 if (data == NULL) 1096 return (ENOMEM); 1097 1098 newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK); 1099 if (gsp->gs_priv_size && !newh) { 1100 error = ENOMEM; 1101 goto fail; 1102 } 1103 1104 g_sched_lock(gp); 1105 if (sc->sc_gsched) { /* can be NULL in some cases */ 1106 error = g_sched_remove_locked(gp, sc->sc_gsched); 1107 if (error) 1108 goto fail; 1109 } 1110 1111 g_gsched_ref(gsp); 1112 sc->sc_gsched = gsp; 1113 sc->sc_data = data; 1114 sc->sc_hash = newh; 1115 sc->sc_mask = mask; 1116 1117 g_sched_unlock(gp); 1118 1119 return (0); 1120 1121fail: 1122 if (newh) 1123 g_sched_hash_fini(gp, newh, mask, gsp, data); 1124 1125 if (data) 1126 gsp->gs_fini(data); 1127 1128 g_sched_unlock(gp); 1129 1130 return (error); 1131} 1132 1133/* 1134 * Stop the request flow directed to the proxy, redirecting the new 1135 * requests to the me.gs_pending queue. 
1136 */ 1137static struct g_provider * 1138g_detach_proxy(struct g_geom *gp) 1139{ 1140 struct g_consumer *cp; 1141 struct g_provider *pp, *newpp; 1142 1143 do { 1144 pp = LIST_FIRST(&gp->provider); 1145 if (pp == NULL) 1146 break; 1147 cp = LIST_FIRST(&gp->consumer); 1148 if (cp == NULL) 1149 break; 1150 newpp = cp->provider; 1151 if (newpp == NULL) 1152 break; 1153 1154 me.gs_npending = 0; 1155 pp->geom->start = g_sched_temporary_start; 1156 1157 return (pp); 1158 } while (0); 1159 printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name); 1160 1161 return (NULL); 1162} 1163 1164static void 1165g_sched_blackhole(struct bio *bp) 1166{ 1167 1168 g_io_deliver(bp, ENXIO); 1169} 1170 1171static inline void 1172g_reparent_provider(struct g_provider *pp, struct g_geom *gp, 1173 struct g_provider *newpp) 1174{ 1175 1176 LIST_REMOVE(pp, provider); 1177 if (newpp) { 1178 pp->private = newpp->private; 1179 pp->index = newpp->index; 1180 } 1181 pp->geom = gp; 1182 LIST_INSERT_HEAD(&gp->provider, pp, provider); 1183} 1184 1185static inline void 1186g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp) 1187{ 1188 struct g_geom *gp = oldpp->geom; 1189 1190 g_reparent_provider(oldpp, newpp->geom, newpp); 1191 1192 /* 1193 * Hackish: let the system destroy the old provider for us, just 1194 * in case someone attached a consumer to it, in which case a 1195 * direct call to g_destroy_provider() would not work. 1196 */ 1197 g_reparent_provider(newpp, gp, NULL); 1198} 1199 1200/* 1201 * Complete the proxy destruction, linking the old provider to its 1202 * original geom, and destroying the proxy provider. Also take care 1203 * of issuing the pending requests collected in me.gs_pending (if any). 1204 */ 1205static int 1206g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp) 1207{ 1208 struct g_consumer *cp; 1209 struct g_provider *newpp; 1210 1211 do { 1212 cp = LIST_FIRST(&gp->consumer); 1213 if (cp == NULL) 1214 break; 1215 newpp = cp->provider; 1216 if (newpp == NULL) 1217 break; 1218 1219 /* Relink the provider to its original geom. */ 1220 g_unproxy_provider(oldpp, newpp); 1221 1222 /* Detach consumer from provider, and destroy provider. */ 1223 cp->acr = newpp->acr = 0; 1224 cp->acw = newpp->acw = 0; 1225 cp->ace = newpp->ace = 0; 1226 g_detach(cp); 1227 1228 /* Send the pending bios through the right start function. */ 1229 g_sched_flush_pending(oldpp->geom->start); 1230 1231 return (0); 1232 } while (0); 1233 printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name); 1234 1235 /* We cannot send the pending bios anywhere... */ 1236 g_sched_flush_pending(g_sched_blackhole); 1237 1238 return (EINVAL); 1239} 1240 1241static int 1242g_sched_destroy(struct g_geom *gp, boolean_t force) 1243{ 1244 struct g_provider *pp, *oldpp = NULL; 1245 struct g_sched_softc *sc; 1246 struct g_gsched *gsp; 1247 int error; 1248 1249 g_topology_assert(); 1250 sc = gp->softc; 1251 if (sc == NULL) 1252 return (ENXIO); 1253 if (!(sc->sc_flags & G_SCHED_PROXYING)) { 1254 pp = LIST_FIRST(&gp->provider); 1255 if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { 1256 const char *msg = force ? 
1257 "but we force removal" : "cannot remove"; 1258 1259 G_SCHED_DEBUG(!force, 1260 "Device %s is still open (r%dw%de%d), %s.", 1261 pp->name, pp->acr, pp->acw, pp->ace, msg); 1262 if (!force) 1263 return (EBUSY); 1264 } else { 1265 G_SCHED_DEBUG(0, "Device %s removed.", gp->name); 1266 } 1267 } else 1268 oldpp = g_detach_proxy(gp); 1269 1270 gsp = sc->sc_gsched; 1271 if (gsp) { 1272 /* 1273 * XXX bad hack here: force a dispatch to release 1274 * any reference to the hash table still held by 1275 * the scheduler. 1276 */ 1277 g_sched_lock(gp); 1278 /* 1279 * We are dying here, no new requests should enter 1280 * the scheduler. This is granted by the topolgy, 1281 * either in case we were proxying (new bios are 1282 * being redirected) or not (see the access check 1283 * above). 1284 */ 1285 g_sched_forced_dispatch(gp); 1286 error = g_sched_wait_pending(gp); 1287 1288 if (error) { 1289 /* 1290 * Not all the requests came home: this might happen 1291 * under heavy load, or if we were waiting for any 1292 * bio which is served in the event path (see 1293 * geom_slice.c for an example of how this can 1294 * happen). Try to restore a working configuration 1295 * if we can fail. 1296 */ 1297 if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { 1298 g_sched_flush_pending(force ? 1299 g_sched_blackhole : g_sched_start); 1300 } 1301 1302 /* 1303 * In the forced destroy case there is not so much 1304 * we can do, we have pending bios that will call 1305 * g_sched_done() somehow, and we don't want them 1306 * to crash the system using freed memory. We tell 1307 * the user that something went wrong, and leak some 1308 * memory here. 1309 * Note: the callers using force = 1 ignore the 1310 * return value. 1311 */ 1312 if (force) { 1313 G_SCHED_DEBUG(0, "Pending requests while " 1314 " destroying geom, some memory leaked."); 1315 } 1316 1317 return (error); 1318 } 1319 1320 g_sched_unlock(gp); 1321 g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, 1322 gsp, sc->sc_data); 1323 sc->sc_hash = NULL; 1324 gsp->gs_fini(sc->sc_data); 1325 g_gsched_unref(gsp); 1326 sc->sc_gsched = NULL; 1327 } 1328 1329 if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { 1330 error = g_destroy_proxy(gp, oldpp); 1331 1332 if (error) { 1333 if (force) { 1334 G_SCHED_DEBUG(0, "Unrecoverable error while " 1335 "destroying a proxy geom, leaking some " 1336 " memory."); 1337 } 1338 1339 return (error); 1340 } 1341 } 1342 1343 mtx_destroy(&sc->sc_mtx); 1344 1345 g_free(gp->softc); 1346 gp->softc = NULL; 1347 g_wither_geom(gp, ENXIO); 1348 1349 return (error); 1350} 1351 1352static int 1353g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp, 1354 struct g_geom *gp) 1355{ 1356 1357 return (g_sched_destroy(gp, 0)); 1358} 1359 1360/* 1361 * Functions related to the classification of requests. 1362 * 1363 * On recent FreeBSD versions (8.0 and above), we store a reference 1364 * to the issuer of a request in bp->bio_classifier1 as soon 1365 * as the bio is posted to the geom queue (and not later, because 1366 * requests are managed by the g_down thread afterwards). 1367 * 1368 * On older versions of the system (but this code is not used 1369 * in any existing release), we [ab]use the caller1 field in the 1370 * root element of the bio tree to store the classification info. 1371 * The marking is done at the beginning of g_io_request() 1372 * and only if we find that the field is NULL. 
1373 * 1374 * To avoid rebuilding the kernel, this module will patch the 1375 * initial part of g_io_request() so it jumps to some hand-coded 1376 * assembly that does the marking and then executes the original 1377 * body of g_io_request(). 1378 * 1379 * fake_ioreq[] is architecture-specific machine code 1380 * that implements the above. CODE_SIZE, STORE_SIZE etc. 1381 * are constants used in the patching routine. Look at the 1382 * code in g_ioreq_patch() for the details. 1383 */ 1384 1385#ifndef HAVE_BIO_CLASSIFIER 1386/* 1387 * Support for old FreeBSD versions 1388 */ 1389#if defined(__i386__) 1390#define CODE_SIZE 29 1391#define STORE_SIZE 5 1392#define EPILOGUE 5 1393#define SIZE (CODE_SIZE + STORE_SIZE + EPILOGUE) 1394 1395static u_char fake_ioreq[SIZE] = { 1396 0x8b, 0x44, 0x24, 0x04, /* mov bp, %eax */ 1397 /* 1: */ 1398 0x89, 0xc2, /* mov %eax, %edx # edx = bp */ 1399 0x8b, 0x40, 0x64, /* mov bp->bio_parent, %eax */ 1400 0x85, 0xc0, /* test %eax, %eax */ 1401 0x75, 0xf7, /* jne 1b */ 1402 0x8b, 0x42, 0x30, /* mov bp->bp_caller1, %eax */ 1403 0x85, 0xc0, /* test %eax, %eax */ 1404 0x75, 0x09, /* jne 2f */ 1405 0x64, 0xa1, 0x00, 0x00, /* mov %fs:0, %eax */ 1406 0x00, 0x00, 1407 0x89, 0x42, 0x30, /* mov %eax, bp->bio_caller1 */ 1408 /* 2: */ 1409 0x55, 0x89, 0xe5, 0x57, 0x56, 1410 0xe9, 0x00, 0x00, 0x00, 0x00, /* jmp back... */ 1411}; 1412#elif defined(__amd64) 1413#define CODE_SIZE 38 1414#define STORE_SIZE 6 1415#define EPILOGUE 5 1416#define SIZE (CODE_SIZE + STORE_SIZE + EPILOGUE) 1417 1418static u_char fake_ioreq[SIZE] = { 1419 0x48, 0x89, 0xf8, /* mov bp, %rax */ 1420 /* 1: */ 1421 0x48, 0x89, 0xc2, /* mov %rax, %rdx # rdx = bp */ 1422 0x48, 0x8b, 0x82, 0xa8, /* mov bp->bio_parent, %rax */ 1423 0x00, 0x00, 0x00, 1424 0x48, 0x85, 0xc0, /* test %rax, %rax */ 1425 0x75, 0xf1, /* jne 1b */ 1426 0x48, 0x83, 0x7a, 0x58, /* cmp $0, bp->bp_caller1 */ 1427 0x00, 1428 0x75, 0x0d, /* jne 2f */ 1429 0x65, 0x48, 0x8b, 0x04, /* mov %gs:0, %rax */ 1430 0x25, 0x00, 0x00, 0x00, 1431 0x00, 1432 0x48, 0x89, 0x42, 0x58, /* mov %rax, bp->bio_caller1 */ 1433 /* 2: */ 1434 0x55, 0x48, 0x89, 0xe5, 0x41, 0x56, 1435 0xe9, 0x00, 0x00, 0x00, 0x00, /* jmp back... */ 1436}; 1437#else /* neither x86 nor amd64 */ 1438static void 1439g_new_io_request(struct bio *bp, struct g_consumer *cp) 1440{ 1441 struct bio *top = bp; 1442 1443 /* 1444 * bio classification: if bio_caller1 is available in the 1445 * root of the 'struct bio' tree, store there the thread id 1446 * of the thread that originated the request. 1447 * More sophisticated classification schemes can be used. 1448 */ 1449 while (top->bio_parent) 1450 top = top->bio_parent; 1451 1452 if (top->bio_caller1 == NULL) 1453 top->bio_caller1 = curthread; 1454} 1455 1456#error please add the code above in g_new_io_request() to the beginning of \ 1457 /sys/geom/geom_io.c::g_io_request(), and remove this line. 1458#endif /* end of arch-specific code */ 1459 1460static int 1461g_ioreq_patch(void) 1462{ 1463 u_char *original; 1464 u_long ofs; 1465 int found; 1466 1467 if (me.gs_patched) 1468 return (-1); 1469 1470 original = (u_char *)g_io_request; 1471 1472 found = !bcmp(original, fake_ioreq + CODE_SIZE, STORE_SIZE); 1473 if (!found) 1474 return (-1); 1475 1476 /* Jump back to the original + STORE_SIZE. */ 1477 ofs = (original + STORE_SIZE) - (fake_ioreq + SIZE); 1478 bcopy(&ofs, fake_ioreq + CODE_SIZE + STORE_SIZE + 1, 4); 1479 1480 /* Patch the original address with a jump to the trampoline. 
*/ 1481 *original = 0xe9; /* jump opcode */ 1482 ofs = fake_ioreq - (original + 5); 1483 bcopy(&ofs, original + 1, 4); 1484 1485 me.gs_patched = 1; 1486 1487 return (0); 1488} 1489 1490/* 1491 * Restore the original code, this is easy. 1492 */ 1493static void 1494g_ioreq_restore(void) 1495{ 1496 u_char *original; 1497 1498 if (me.gs_patched) { 1499 original = (u_char *)g_io_request; 1500 bcopy(fake_ioreq + CODE_SIZE, original, STORE_SIZE); 1501 me.gs_patched = 0; 1502 } 1503} 1504 1505static inline void 1506g_classifier_ini(void) 1507{ 1508 1509 g_ioreq_patch(); 1510} 1511 1512static inline void 1513g_classifier_fini(void) 1514{ 1515 1516 g_ioreq_restore(); 1517} 1518 1519/*--- end of support code for older FreeBSD versions */ 1520 1521#else /* HAVE_BIO_CLASSIFIER */ 1522 1523/* 1524 * Classifier support for recent FreeBSD versions: we use 1525 * a very simple classifier, only use curthread to tag a request. 1526 * The classifier is registered at module load, and unregistered 1527 * at module unload. 1528 */ 1529static int 1530g_sched_tag(void *arg, struct bio *bp) 1531{ 1532 1533 bp->bio_classifier1 = curthread; 1534 return (1); 1535} 1536 1537static struct g_classifier_hook g_sched_classifier = { 1538 .func = g_sched_tag, 1539}; 1540 1541static inline void 1542g_classifier_ini(void) 1543{ 1544 1545 g_register_classifier(&g_sched_classifier); 1546} 1547 1548static inline void 1549g_classifier_fini(void) 1550{ 1551 1552 g_unregister_classifier(&g_sched_classifier); 1553} 1554#endif /* HAVE_BIO_CLASSIFIER */ 1555 1556static void 1557g_sched_init(struct g_class *mp) 1558{ 1559 1560 g_gsched_global_init(); 1561 1562 G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.", 1563 mp, &g_sched_class); 1564 1565 /* Patch g_io_request to store classification info in the bio. */ 1566 g_classifier_ini(); 1567} 1568 1569static void 1570g_sched_fini(struct g_class *mp) 1571{ 1572 1573 g_classifier_fini(); 1574 1575 G_SCHED_DEBUG(0, "Unloading..."); 1576 1577 KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers")); 1578 mtx_destroy(&me.gs_mtx); 1579} 1580 1581static int 1582g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag, 1583 struct thread *td) 1584{ 1585 struct g_consumer *cp; 1586 struct g_geom *gp; 1587 1588 cp = LIST_FIRST(&pp->geom->consumer); 1589 if (cp == NULL) 1590 return (ENOIOCTL); 1591 gp = cp->provider->geom; 1592 if (gp->ioctl == NULL) 1593 return (ENOIOCTL); 1594 return (gp->ioctl(cp->provider, cmd, data, fflag, td)); 1595} 1596 1597/* 1598 * Read the i-th argument for a request, skipping the /dev/ 1599 * prefix if present. 1600 */ 1601static const char * 1602g_sched_argi(struct gctl_req *req, int i) 1603{ 1604 static const char *dev_prefix = "/dev/"; 1605 const char *name; 1606 char param[16]; 1607 int l = strlen(dev_prefix); 1608 1609 snprintf(param, sizeof(param), "arg%d", i); 1610 name = gctl_get_asciiparam(req, param); 1611 if (name == NULL) 1612 gctl_error(req, "No 'arg%d' argument", i); 1613 else if (strncmp(name, dev_prefix, l) == 0) 1614 name += l; 1615 return (name); 1616} 1617 1618/* 1619 * Fetch nargs and do appropriate checks. 
1620 */ 1621static int 1622g_sched_get_nargs(struct gctl_req *req) 1623{ 1624 int *nargs; 1625 1626 nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); 1627 if (nargs == NULL) { 1628 gctl_error(req, "No 'nargs' argument"); 1629 return (0); 1630 } 1631 if (*nargs <= 0) 1632 gctl_error(req, "Missing device(s)."); 1633 return (*nargs); 1634} 1635 1636/* 1637 * Check whether we should add the class on certain volumes when 1638 * this geom is created. Right now this is under control of a kenv 1639 * variable containing the names of all devices that we care about. 1640 * Probably we should only support transparent insertion as the 1641 * preferred mode of operation. 1642 */ 1643static struct g_geom * 1644g_sched_taste(struct g_class *mp, struct g_provider *pp, 1645 int flags __unused) 1646{ 1647 struct g_gsched *gsp = NULL; /* the . algorithm we want */ 1648 const char *s; /* generic string pointer */ 1649 const char *taste_names; /* devices we like */ 1650 int l; 1651 1652 g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, 1653 mp->name, pp->name); 1654 g_topology_assert(); 1655 1656 G_SCHED_DEBUG(2, "Tasting %s.", pp->name); 1657 1658 do { 1659 /* do not taste on ourselves */ 1660 if (pp->geom->class == mp) 1661 break; 1662 1663 taste_names = getenv("geom.sched.taste"); 1664 if (taste_names == NULL) 1665 break; 1666 1667 l = strlen(pp->name); 1668 for (s = taste_names; *s && 1669 (s = strstr(s, pp->name)); s++) { 1670 /* further checks for an exact match */ 1671 if ( (s == taste_names || s[-1] == ' ') && 1672 (s[l] == '\0' || s[l] == ' ') ) 1673 break; 1674 } 1675 if (s == NULL) 1676 break; 1677 G_SCHED_DEBUG(0, "Attach device %s match [%s]\n", 1678 pp->name, s); 1679 1680 /* look up the provider name in the list */ 1681 s = getenv("geom.sched.algo"); 1682 if (s == NULL) 1683 s = "rr"; 1684 1685 gsp = g_gsched_find(s); /* also get a reference */ 1686 if (gsp == NULL) { 1687 G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s); 1688 break; 1689 } 1690 1691 /* XXX create with 1 as last argument ? */ 1692 g_sched_create(NULL, mp, pp, gsp, 0); 1693 g_gsched_unref(gsp); 1694 } while (0); 1695 return NULL; 1696} 1697 1698static void 1699g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy) 1700{ 1701 struct g_provider *pp; 1702 struct g_gsched *gsp; 1703 const char *name; 1704 int i, nargs; 1705 1706 g_topology_assert(); 1707 1708 name = gctl_get_asciiparam(req, "algo"); 1709 if (name == NULL) { 1710 gctl_error(req, "No '%s' argument", "algo"); 1711 return; 1712 } 1713 1714 gsp = g_gsched_find(name); /* also get a reference */ 1715 if (gsp == NULL) { 1716 gctl_error(req, "Bad algorithm '%s'", name); 1717 return; 1718 } 1719 1720 nargs = g_sched_get_nargs(req); 1721 1722 /* 1723 * Run on the arguments, and break on any error. 1724 * We look for a device name, but skip the /dev/ prefix if any. 
1725 */ 1726 for (i = 0; i < nargs; i++) { 1727 name = g_sched_argi(req, i); 1728 if (name == NULL) 1729 break; 1730 pp = g_provider_by_name(name); 1731 if (pp == NULL) { 1732 G_SCHED_DEBUG(1, "Provider %s is invalid.", name); 1733 gctl_error(req, "Provider %s is invalid.", name); 1734 break; 1735 } 1736 if (g_sched_create(req, mp, pp, gsp, proxy) != 0) 1737 break; 1738 } 1739 1740 g_gsched_unref(gsp); 1741} 1742 1743static void 1744g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp) 1745{ 1746 struct g_provider *pp; 1747 struct g_gsched *gsp; 1748 const char *name; 1749 int i, nargs; 1750 1751 g_topology_assert(); 1752 1753 name = gctl_get_asciiparam(req, "algo"); 1754 if (name == NULL) { 1755 gctl_error(req, "No '%s' argument", "algo"); 1756 return; 1757 } 1758 1759 gsp = g_gsched_find(name); /* also get a reference */ 1760 if (gsp == NULL) { 1761 gctl_error(req, "Bad algorithm '%s'", name); 1762 return; 1763 } 1764 1765 nargs = g_sched_get_nargs(req); 1766 1767 /* 1768 * Run on the arguments, and break on any error. 1769 * We look for a device name, but skip the /dev/ prefix if any. 1770 */ 1771 for (i = 0; i < nargs; i++) { 1772 name = g_sched_argi(req, i); 1773 if (name == NULL) 1774 break; 1775 pp = g_provider_by_name(name); 1776 if (pp == NULL || pp->geom->class != mp) { 1777 G_SCHED_DEBUG(1, "Provider %s is invalid.", name); 1778 gctl_error(req, "Provider %s is invalid.", name); 1779 break; 1780 } 1781 if (g_sched_change_algo(req, mp, pp, gsp) != 0) 1782 break; 1783 } 1784 1785 g_gsched_unref(gsp); 1786} 1787 1788static struct g_geom * 1789g_sched_find_geom(struct g_class *mp, const char *name) 1790{ 1791 struct g_geom *gp; 1792 1793 LIST_FOREACH(gp, &mp->geom, geom) { 1794 if (strcmp(gp->name, name) == 0) 1795 return (gp); 1796 } 1797 return (NULL); 1798} 1799 1800static void 1801g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp) 1802{ 1803 int nargs, *force, error, i; 1804 struct g_geom *gp; 1805 const char *name; 1806 1807 g_topology_assert(); 1808 1809 nargs = g_sched_get_nargs(req); 1810 1811 force = gctl_get_paraml(req, "force", sizeof(*force)); 1812 if (force == NULL) { 1813 gctl_error(req, "No 'force' argument"); 1814 return; 1815 } 1816 1817 for (i = 0; i < nargs; i++) { 1818 name = g_sched_argi(req, i); 1819 if (name == NULL) 1820 break; 1821 1822 gp = g_sched_find_geom(mp, name); 1823 if (gp == NULL) { 1824 G_SCHED_DEBUG(1, "Device %s is invalid.", name); 1825 gctl_error(req, "Device %s is invalid.", name); 1826 break; 1827 } 1828 1829 error = g_sched_destroy(gp, *force); 1830 if (error != 0) { 1831 gctl_error(req, "Cannot destroy device %s (error=%d).", 1832 gp->name, error); 1833 break; 1834 } 1835 } 1836} 1837 1838static void 1839g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb) 1840{ 1841 uint32_t *version; 1842 1843 g_topology_assert(); 1844 1845 version = gctl_get_paraml(req, "version", sizeof(*version)); 1846 if (version == NULL) { 1847 gctl_error(req, "No '%s' argument.", "version"); 1848 return; 1849 } 1850 1851 if (*version != G_SCHED_VERSION) { 1852 gctl_error(req, "Userland and kernel parts are " 1853 "out of sync."); 1854 return; 1855 } 1856 1857 if (strcmp(verb, "create") == 0) { 1858 g_sched_ctl_create(req, mp, 0); 1859 return; 1860 } else if (strcmp(verb, "insert") == 0) { 1861 g_sched_ctl_create(req, mp, 1); 1862 return; 1863 } else if (strcmp(verb, "configure") == 0) { 1864 g_sched_ctl_configure(req, mp); 1865 return; 1866 } else if (strcmp(verb, "destroy") == 0) { 1867 g_sched_ctl_destroy(req, mp); 1868 
return; 1869 } 1870 1871 gctl_error(req, "Unknown verb."); 1872} 1873 1874static void 1875g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, 1876 struct g_consumer *cp, struct g_provider *pp) 1877{ 1878 struct g_sched_softc *sc = gp->softc; 1879 struct g_gsched *gsp = sc->sc_gsched; 1880 if (indent == NULL) { /* plaintext */ 1881 sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--"); 1882 } 1883 if (gsp != NULL && gsp->gs_dumpconf) 1884 gsp->gs_dumpconf(sb, indent, gp, cp, pp); 1885} 1886 1887DECLARE_GEOM_CLASS(g_sched_class, g_sched); 1888MODULE_VERSION(geom_sched, 0); 1889