/*-
 * Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id$
 * $FreeBSD: head/sys/geom/sched/g_sched.c 206497 2010-04-12 16:37:45Z luigi $
 *
 * Main control module for geom-based disk schedulers ('sched').
 *
 * USER VIEW
 * A 'sched' node is typically inserted transparently between
 * an existing provider pp and its original geom gp
 *
 *	[pp --> gp  ..]
 *
 * using the command "geom sched insert <provider>" and
 * resulting in the following topology
 *
 *	[pp --> sched_gp --> cp]   [new_pp --> gp ... ]
 *
 * Deletion "geom sched destroy <provider>.sched." restores the
 * original chain. The normal "geom sched create <provider>"
 * is also supported.
 *
 * INTERNALS
 * Internally, the 'sched' node uses the following data structures
 *
 *   geom{}         g_sched_softc{}      g_gsched{}
 * +----------+    +---------------+   +-------------+
 * |  softc *-|--->| sc_gsched   *-|-->|  gs_init    |
 * |  ...     |    |               |   |  gs_fini    |
 * |          |    | [ hash table] |   |  gs_start   |
 * +----------+    |               |   |  ...        |
 *                 |               |   +-------------+
 *                 |               |
 *                 |               |     g_*_softc{}
 *                 |               |   +-------------+
 *                 | sc_data     *-|-->|             |
 *                 +---------------+   |  algorithm- |
 *                                     |  specific   |
 *                                     +-------------+
 *
 * A g_sched_softc{} is created with a "geom sched insert" call.
 * In turn this instantiates a specific scheduling algorithm,
 * which sets sc_gsched to point to the algorithm callbacks,
 * and calls gs_init() to create the g_*_softc{}.
 * The other callbacks (gs_start, gs_next, ...) are invoked
 * as needed.
 *
 * g_sched_softc{} is defined in g_sched.h and mostly used here;
 * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h;
 * g_*_softc{} is defined/implemented by each algorithm (gs_*.c)
 *
 * DATA MOVING
 * When a bio is received on the provider, it goes to
 * g_sched_start(), which calls gs_start() to initially queue it;
 * then we call g_sched_dispatch(), which loops around gs_next()
 * to select zero or more bio's to be sent downstream.
 *
 * g_sched_dispatch() can also be called as a result of a timeout,
 * e.g. when doing anticipation or pacing requests.
 *
 * When a bio comes back, it goes to g_sched_done() which in turn
 * calls gs_done(). The latter does any necessary housekeeping in
 * the scheduling algorithm, and may decide to call g_sched_dispatch()
 * to send more bio's downstream.
 *
 * If an algorithm needs per-flow queues, these are created by
 * calling gs_init_class() and destroyed with gs_fini_class(),
 * and they are also inserted in the hash table implemented in
 * the g_sched_softc{}.
 *
 * If an algorithm is replaced, or a transparently-inserted node is
 * removed with "geom sched destroy", we need to remove all references
 * to the g_*_softc{} and g_sched_softc{} from the bio's still in
 * the scheduler. g_sched_forced_dispatch() helps with this.
 * XXX need to explain better.
 */
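
/*
 * Illustrative sketch (not compiled): what a minimal scheduling algorithm
 * looks like from the point of view of this control module.  The callback
 * signatures are inferred from how they are invoked later in this file;
 * the gs_example_* names and the trivial FIFO policy are hypothetical,
 * real algorithms live in gs_*.c and are documented in gs_scheduler.h.
 */
#if 0
struct gs_example_softc {
	struct bio_queue_head	ex_queue;	/* single global queue */
};

static void *
gs_example_init(struct g_geom *gp)
{
	struct gs_example_softc *sc;

	/* This becomes sc_data, passed back to every other callback. */
	sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
	gs_bioq_init(&sc->ex_queue);
	return (sc);
}

static void
gs_example_fini(void *data)
{

	g_free(data);
}

static int
gs_example_start(void *data, struct bio *bp)
{
	struct gs_example_softc *sc = data;

	/* Queue the request; returning != 0 makes g_sched_start() bypass. */
	gs_bioq_disksort(&sc->ex_queue, bp);
	return (0);
}

static struct bio *
gs_example_next(void *data, int force)
{
	struct gs_example_softc *sc = data;

	/* A plain FIFO has nothing to hold back, so 'force' is ignored. */
	return (gs_bioq_takefirst(&sc->ex_queue));
}

static void
gs_example_done(void *data, struct bio *bp)
{

	/* Nothing to account for in this trivial policy. */
}

static struct g_gsched gs_example = {
	.gs_name = "example",
	.gs_priv_size = 0,		/* no per-flow classes */
	.gs_init = gs_example_init,
	.gs_fini = gs_example_fini,
	.gs_start = gs_example_start,
	.gs_next = gs_example_next,
	.gs_done = gs_example_done,
};

/* Hook the algorithm into the module system through g_gsched_modevent(). */
static moduledata_t gs_example_mod = {
	"gsched_example",
	g_gsched_modevent,
	&gs_example,
};
DECLARE_MODULE(gsched_example, gs_example_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
MODULE_DEPEND(gsched_example, geom_sched, 0, 0, 0);
#endif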

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bio.h>
#include <sys/limits.h>
#include <sys/hash.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/proc.h>		/* we access curthread */
#include <geom/geom.h>
#include "gs_scheduler.h"
#include "g_sched.h"		/* geom hooks */

/*
 * Size of the per-geom hash table storing traffic classes.
 * We may decide to change it at a later time; it has no ABI
 * implications as it is only used for run-time allocations.
 */
#define G_SCHED_HASH_SIZE	32

static int g_sched_destroy(struct g_geom *gp, boolean_t force);
static int g_sched_destroy_geom(struct gctl_req *req,
    struct g_class *mp, struct g_geom *gp);
static void g_sched_config(struct gctl_req *req, struct g_class *mp,
    const char *verb);
static struct g_geom *g_sched_taste(struct g_class *mp,
    struct g_provider *pp, int flags __unused);
static void g_sched_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void g_sched_init(struct g_class *mp);
static void g_sched_fini(struct g_class *mp);

struct g_class g_sched_class = {
	.name = G_SCHED_CLASS_NAME,
	.version = G_VERSION,
	.ctlreq = g_sched_config,
	.taste = g_sched_taste,
	.destroy_geom = g_sched_destroy_geom,
	.init = g_sched_init,
	.fini = g_sched_fini
};

MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures");

/*
 * Global variables describing the state of the geom_sched module.
 * There is only one static instance of this structure.
 */
LIST_HEAD(gs_list, g_gsched);	/* type, link field */
struct geom_sched_vars {
	struct mtx	gs_mtx;
	struct gs_list	gs_scheds;	/* list of algorithms */
	u_int		gs_debug;
	u_int		gs_sched_count;	/* how many algorithms ? */
	u_int		gs_patched;	/* g_io_request was patched */

	u_int		gs_initialized;
	u_int		gs_expire_secs;	/* expiration of hash entries */

	struct bio_queue_head gs_pending;
	u_int		gs_npending;

	/* The following are for stats, usually protected by gs_mtx. */
	u_long		gs_requests;	/* total requests */
	u_long		gs_done;	/* total done */
	u_int		gs_in_flight;	/* requests in flight */
	u_int		gs_writes_in_flight;
	u_int		gs_bytes_in_flight;
	u_int		gs_write_bytes_in_flight;

	char		gs_names[256];	/* names of schedulers */
};

static struct geom_sched_vars me = {
	.gs_expire_secs = 10,
};

SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0,
    "GEOM_SCHED stuff");

SYSCTL_INT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD,
    &me.gs_write_bytes_in_flight, 0, "Write bytes in flight");

SYSCTL_INT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD,
    &me.gs_bytes_in_flight, 0, "Bytes in flight");

SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD,
    &me.gs_writes_in_flight, 0, "Write requests in flight");

SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD,
    &me.gs_in_flight, 0, "Requests in flight");

SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD,
    &me.gs_done, 0, "Total done");

SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD,
    &me.gs_requests, 0, "Total requests");

SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD,
    &me.gs_names, 0, "Algorithm names");

SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD,
    &me.gs_sched_count, 0, "Number of algorithms");

SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW,
    &me.gs_debug, 0, "Debug level");

SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW,
    &me.gs_expire_secs, 0, "Expire time in seconds");
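
/*
 * All of the knobs and counters above show up under the kern.geom.sched
 * sysctl tree, so e.g. "sysctl kern.geom.sched" from userland lists the
 * registered algorithms, the debug level and the in-flight statistics.
 */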

/*
 * g_sched calls the scheduler algorithms with this lock held.
 * The locking functions are exposed so the scheduler algorithms can also
 * protect themselves e.g. when running a callout handler.
 */
void
g_sched_lock(struct g_geom *gp)
{
	struct g_sched_softc *sc = gp->softc;

	mtx_lock(&sc->sc_mtx);
}

void
g_sched_unlock(struct g_geom *gp)
{
	struct g_sched_softc *sc = gp->softc;

	mtx_unlock(&sc->sc_mtx);
}

/*
 * Support functions to handle references to the module,
 * which come from devices using this scheduler.
 */
static inline void
g_gsched_ref(struct g_gsched *gsp)
{

	atomic_add_int(&gsp->gs_refs, 1);
}

static inline void
g_gsched_unref(struct g_gsched *gsp)
{

	atomic_add_int(&gsp->gs_refs, -1);
}

/*
 * Update the stats when this request is done.
 */
static void
g_sched_update_stats(struct bio *bio)
{

	me.gs_done++;
	me.gs_in_flight--;
	me.gs_bytes_in_flight -= bio->bio_length;
	if (bio->bio_cmd & BIO_WRITE) {
		me.gs_writes_in_flight--;
		me.gs_write_bytes_in_flight -= bio->bio_length;
	}
}

/*
 * Dispatch any pending request.
 */
static void
g_sched_forced_dispatch(struct g_geom *gp)
{
	struct g_sched_softc *sc = gp->softc;
	struct g_gsched *gsp = sc->sc_gsched;
	struct bio *bp;

	KASSERT(mtx_owned(&sc->sc_mtx),
	    ("sc_mtx not owned during forced dispatch"));

	while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL)
		g_io_request(bp, LIST_FIRST(&gp->consumer));
}

/*
 * The main dispatch loop, called either here after the start
 * routine, or by scheduling algorithms when they receive a timeout
 * or a 'done' notification.  Does not share code with the forced
 * dispatch path, since the gs_done() callback can call us.
 */
void
g_sched_dispatch(struct g_geom *gp)
{
	struct g_sched_softc *sc = gp->softc;
	struct g_gsched *gsp = sc->sc_gsched;
	struct bio *bp;

	KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch"));

	if ((sc->sc_flags & G_SCHED_FLUSHING))
		return;

	while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL)
		g_io_request(bp, LIST_FIRST(&gp->consumer));
}
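
/*
 * Illustrative sketch (not compiled): since g_sched_lock()/g_sched_unlock()
 * and g_sched_dispatch() are exported, a scheduling algorithm doing
 * anticipation or pacing can drive the dispatch loop from its own callout
 * handler.  The gs_example_timeout name and the way 'gp' reaches the
 * handler are hypothetical; a real algorithm would arm the callout with
 * callout_reset() from its gs_start()/gs_done() callbacks.
 */
#if 0
static void
gs_example_timeout(void *arg)
{
	struct g_geom *gp = arg;	/* stashed by the algorithm at gs_init() */

	g_sched_lock(gp);
	g_sched_dispatch(gp);		/* push out whatever is now eligible */
	g_sched_unlock(gp);
}
#endif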

/*
 * Recent (8.0 and above) versions of FreeBSD have support to
 * register classifiers of disk requests. The classifier is
 * invoked by g_io_request(), and stores the information into
 * bp->bio_classifier1.
 *
 * Support for older versions, which is left here only for
 * documentation purposes, relies on two hacks:
 * 1. classification info is written into the bio_caller1
 *    field of the topmost node in the bio chain. This field
 *    is rarely used, but this module is incompatible with
 *    those that use bio_caller1 for other purposes,
 *    such as ZFS and gjournal;
 * 2. g_io_request() is patched in-memory when the module is
 *    loaded, so that the function calls a classifier as its
 *    first thing. g_io_request() is restored when the module
 *    is unloaded. This functionality is only supported for
 *    x86 and amd64, other architectures need source code changes.
 */

/*
 * Look up the identity of the issuer of the original request.
 * In the current implementation we use the curthread of the
 * issuer, but different mechanisms may be implemented later
 * so we do not make assumptions on the return value, which for
 * us is just an opaque identifier.
 */

static inline u_long
g_sched_classify(struct bio *bp)
{

#if __FreeBSD_version > 800098
	/* we have classifier fields in the struct bio */
#define HAVE_BIO_CLASSIFIER
	return ((u_long)bp->bio_classifier1);
#else
#warning old version!!!
	while (bp->bio_parent != NULL)
		bp = bp->bio_parent;

	return ((u_long)bp->bio_caller1);
#endif
}

/* Return the hash chain for the given key. */
static inline struct g_hash *
g_sched_hash(struct g_sched_softc *sc, u_long key)
{

	return (&sc->sc_hash[key & sc->sc_mask]);
}

/*
 * Helper function for the child classes, which takes
 * a geom and a bio and returns the private descriptor
 * associated with the request.  This involves fetching
 * the classification field and [al]locating the
 * corresponding entry in the hash table.
 */
void *
g_sched_get_class(struct g_geom *gp, struct bio *bp)
{
	struct g_sched_softc *sc;
	struct g_sched_class *gsc;
	struct g_gsched *gsp;
	struct g_hash *bucket;
	u_long key;

	sc = gp->softc;
	key = g_sched_classify(bp);
	bucket = g_sched_hash(sc, key);
	LIST_FOREACH(gsc, bucket, gsc_clist) {
		if (key == gsc->gsc_key) {
			gsc->gsc_refs++;
			return (gsc->gsc_priv);
		}
	}

	gsp = sc->sc_gsched;
	gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size,
	    M_GEOM_SCHED, M_NOWAIT | M_ZERO);
	if (!gsc)
		return (NULL);

	if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) {
		free(gsc, M_GEOM_SCHED);
		return (NULL);
	}

	gsc->gsc_refs = 2;	/* 1 for the hash table, 1 for the caller. */
	gsc->gsc_key = key;
	LIST_INSERT_HEAD(bucket, gsc, gsc_clist);

	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;

	return (gsc->gsc_priv);
}

/*
 * Release a reference to the per-client descriptor.
 */
void
g_sched_put_class(struct g_geom *gp, void *priv)
{
	struct g_sched_class *gsc;
	struct g_sched_softc *sc;

	gsc = g_sched_priv2class(priv);
	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;

	if (--gsc->gsc_refs > 0)
		return;

	sc = gp->softc;
	sc->sc_gsched->gs_fini_class(sc->sc_data, priv);

	LIST_REMOVE(gsc, gsc_clist);
	free(gsc, M_GEOM_SCHED);
}
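
/*
 * Illustrative sketch (not compiled): how an algorithm with per-flow state
 * (gs_priv_size > 0) might pair the two helpers above.  The gs_flowex_*
 * names and the use of bio_caller1 to remember the class between gs_start()
 * and gs_done() are hypothetical; see the gs_*.c algorithms for real users.
 */
#if 0
struct gs_flowex_softc {
	struct g_geom	*fx_geom;	/* remembered at gs_init() time */
	/* ... per-device state ... */
};

static int
gs_flowex_start(void *data, struct bio *bp)
{
	struct gs_flowex_softc *sc = data;
	void *flow;

	flow = g_sched_get_class(sc->fx_geom, bp);	/* ref++, may allocate */
	if (flow == NULL)
		return (-1);	/* not queued: g_sched_start() will bypass */
	bp->bio_caller1 = flow;
	/* ... enqueue bp on the per-flow queue ... */
	return (0);
}

static void
gs_flowex_done(void *data, struct bio *bp)
{
	struct gs_flowex_softc *sc = data;

	/* ... per-flow accounting ... */
	g_sched_put_class(sc->fx_geom, bp->bio_caller1);	/* drop the ref */
}
#endif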

static void
g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask,
    struct g_gsched *gsp, void *data)
{
	struct g_sched_class *cp, *cp2;
	int i;

	if (!hp)
		return;

	if (data && gsp->gs_hash_unref)
		gsp->gs_hash_unref(data);

	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
		LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2)
			g_sched_put_class(gp, cp->gsc_priv);
	}

	hashdestroy(hp, M_GEOM_SCHED, mask);
}

static struct g_hash *
g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags)
{
	struct g_hash *hash;

	if (gsp->gs_priv_size == 0)
		return (NULL);

	hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags);

	return (hash);
}

static void
g_sched_flush_classes(struct g_geom *gp)
{
	struct g_sched_softc *sc;
	struct g_sched_class *cp, *cp2;
	int i;

	sc = gp->softc;

	if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0)
		return;

	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
		LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) {
			if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0)
				g_sched_put_class(gp, cp->gsc_priv);
		}
	}

	sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz;
}

/*
 * Wait for the completion of any outstanding request.  To ensure
 * that this does not take forever, the caller has to make sure that
 * no new requests enter the scheduler before calling us.
 *
 * Must be called with the gp mutex held and topology locked.
 */
static int
g_sched_wait_pending(struct g_geom *gp)
{
	struct g_sched_softc *sc = gp->softc;
	int endticks = ticks + hz;

	g_topology_assert();

	while (sc->sc_pending && endticks - ticks >= 0)
		msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4);

	return (sc->sc_pending ? ETIMEDOUT : 0);
}

static int
g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp)
{
	struct g_sched_softc *sc = gp->softc;
	int error;

	/* Set the flushing flag: new bios will not enter the scheduler. */
	sc->sc_flags |= G_SCHED_FLUSHING;

	g_sched_forced_dispatch(gp);
	error = g_sched_wait_pending(gp);
	if (error)
		goto failed;

	/* No more requests pending or in flight from the old gsp. */

	g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data);
	sc->sc_hash = NULL;

	/*
	 * Avoid deadlock here by releasing the gp mutex and reacquiring
	 * it once done.  It should be safe, since no reconfiguration or
	 * destruction can take place due to the geom topology lock; no
	 * new request can use the current sc_data since we flagged the
	 * geom as being flushed.
	 */
	g_sched_unlock(gp);
	gsp->gs_fini(sc->sc_data);
	g_sched_lock(gp);

	sc->sc_gsched = NULL;
	sc->sc_data = NULL;
	g_gsched_unref(gsp);

failed:
	sc->sc_flags &= ~G_SCHED_FLUSHING;

	return (error);
}

static int
g_sched_remove(struct g_geom *gp, struct g_gsched *gsp)
{
	int error;

	g_sched_lock(gp);
	error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */
	g_sched_unlock(gp);

	return (error);
}

/*
 * Support function for create/taste -- locate the desired
 * algorithm and grab a reference to it.
 */
static struct g_gsched *
g_gsched_find(const char *name)
{
	struct g_gsched *gsp = NULL;

	mtx_lock(&me.gs_mtx);
	LIST_FOREACH(gsp, &me.gs_scheds, glist) {
		if (strcmp(name, gsp->gs_name) == 0) {
			g_gsched_ref(gsp);
			break;
		}
	}
	mtx_unlock(&me.gs_mtx);

	return (gsp);
}

/*
 * Rebuild the list of scheduler names.
 * To be called with me.gs_mtx lock held.
 */
static void
g_gsched_build_names(struct g_gsched *gsp)
{
	int pos, l;
	struct g_gsched *cur;

	pos = 0;
	LIST_FOREACH(cur, &me.gs_scheds, glist) {
		l = strlen(cur->gs_name);
		if (l + pos + 1 + 1 < sizeof(me.gs_names)) {
			if (pos != 0)
				me.gs_names[pos++] = ' ';
			strcpy(me.gs_names + pos, cur->gs_name);
			pos += l;
		}
	}
	me.gs_names[pos] = '\0';
}

/*
 * Register or unregister individual scheduling algorithms.
 */
static int
g_gsched_register(struct g_gsched *gsp)
{
	struct g_gsched *cur;
	int error = 0;

	mtx_lock(&me.gs_mtx);
	LIST_FOREACH(cur, &me.gs_scheds, glist) {
		if (strcmp(gsp->gs_name, cur->gs_name) == 0)
			break;
	}
	if (cur != NULL) {
		G_SCHED_DEBUG(0, "A scheduler named %s already "
		    "exists.", gsp->gs_name);
		error = EEXIST;
	} else {
		LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist);
		gsp->gs_refs = 1;
		me.gs_sched_count++;
		g_gsched_build_names(gsp);
	}
	mtx_unlock(&me.gs_mtx);

	return (error);
}

struct g_gsched_unregparm {
	struct g_gsched *gup_gsp;
	int		gup_error;
};

static void
g_gsched_unregister(void *arg, int flag)
{
	struct g_gsched_unregparm *parm = arg;
	struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp;
	struct g_sched_softc *sc;
	struct g_geom *gp, *gp_tmp;
	int error;

	parm->gup_error = 0;

	g_topology_assert();

	if (flag == EV_CANCEL)
		return;

	mtx_lock(&me.gs_mtx);

	LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) {
		if (gp->class != &g_sched_class)
			continue;	/* Should not happen. */

		sc = gp->softc;
		if (sc->sc_gsched == gsp) {
			error = g_sched_remove(gp, gsp);
			if (error)
				goto failed;
		}
	}

	LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) {
		if (cur != gsp)
			continue;

		if (gsp->gs_refs != 1) {
			G_SCHED_DEBUG(0, "%s still in use.",
			    gsp->gs_name);
			parm->gup_error = EBUSY;
		} else {
			LIST_REMOVE(gsp, glist);
			me.gs_sched_count--;
			g_gsched_build_names(gsp);
		}
		break;
	}

	if (cur == NULL) {
		G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name);
		parm->gup_error = ENOENT;
	}

failed:
	mtx_unlock(&me.gs_mtx);
}

static inline void
g_gsched_global_init(void)
{

	if (!me.gs_initialized) {
		G_SCHED_DEBUG(0, "Initializing global data.");
		mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF);
		LIST_INIT(&me.gs_scheds);
		gs_bioq_init(&me.gs_pending);
		me.gs_initialized = 1;
	}
}

/*
 * Module event called when a scheduling algorithm module is loaded or
 * unloaded.
 */
int
g_gsched_modevent(module_t mod, int cmd, void *arg)
{
	struct g_gsched *gsp = arg;
	struct g_gsched_unregparm parm;
	int error;

	G_SCHED_DEBUG(0, "Modevent %d.", cmd);

	/*
	 * If the module is loaded at boot, the geom thread that calls
	 * g_sched_init() might actually run after g_gsched_modevent(),
	 * so make sure that the module is properly initialized.
	 */
	g_gsched_global_init();

	error = EOPNOTSUPP;
	switch (cmd) {
	case MOD_LOAD:
		error = g_gsched_register(gsp);
		G_SCHED_DEBUG(0, "Loaded module %s error %d.",
		    gsp->gs_name, error);
		if (error == 0)
			g_retaste(&g_sched_class);
		break;

	case MOD_UNLOAD:
		parm.gup_gsp = gsp;
		parm.gup_error = 0;

		error = g_waitfor_event(g_gsched_unregister,
		    &parm, M_WAITOK, NULL);
		if (error == 0)
			error = parm.gup_error;
		G_SCHED_DEBUG(0, "Unloaded module %s error %d.",
		    gsp->gs_name, error);
		break;
	}

	return (error);
}

#ifdef KTR
#define	TRC_BIO_EVENT(e, bp)	g_sched_trace_bio_ ## e (bp)
static inline int
g_sched_issuer_pid(struct bio *bp)
{
	struct thread *thread = g_sched_issuer(bp);

	return (thread->td_tid);
}

static inline char
g_sched_type(struct bio *bp)
{

	if (0 != (bp->bio_cmd & BIO_READ))
		return ('R');
	else if (0 != (bp->bio_cmd & BIO_WRITE))
		return ('W');
	return ('U');
}

static inline void
g_sched_trace_bio_START(struct bio *bp)
{

	CTR5(KTR_GSCHED, "S %d %c %lu/%lu %lu", g_sched_issuer_pid(bp),
	    g_sched_type(bp), bp->bio_offset / ULONG_MAX,
	    bp->bio_offset, bp->bio_length);
}

static inline void
g_sched_trace_bio_DONE(struct bio *bp)
{

	CTR5(KTR_GSCHED, "D %d %c %lu/%lu %lu", g_sched_issuer_pid(bp),
	    g_sched_type(bp), bp->bio_offset / ULONG_MAX,
	    bp->bio_offset, bp->bio_length);
}
#else
#define	TRC_BIO_EVENT(e, bp)
#endif

/*
 * g_sched_done() and g_sched_start() dispatch the geom requests to
 * the scheduling algorithm in use.
 */
static void
g_sched_done(struct bio *bio)
{
	struct g_geom *gp = bio->bio_caller2;
	struct g_sched_softc *sc = gp->softc;

	TRC_BIO_EVENT(DONE, bio);

	KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done"));

	g_sched_lock(gp);

	g_sched_update_stats(bio);
	sc->sc_gsched->gs_done(sc->sc_data, bio);
	if (!--sc->sc_pending)
		wakeup(gp);

	g_sched_flush_classes(gp);
	g_sched_unlock(gp);

	g_std_done(bio);
}

static void
g_sched_start(struct bio *bp)
{
	struct g_geom *gp = bp->bio_to->geom;
	struct g_sched_softc *sc = gp->softc;
	struct bio *cbp;

	TRC_BIO_EVENT(START, bp);
	G_SCHED_LOGREQ(bp, "Request received.");

	cbp = g_clone_bio(bp);
	if (cbp == NULL) {
		g_io_deliver(bp, ENOMEM);
		return;
	}
	cbp->bio_done = g_sched_done;
	cbp->bio_to = LIST_FIRST(&gp->provider);
	KASSERT(cbp->bio_to != NULL, ("NULL provider"));

	/* We only schedule reads and writes. */
	if (0 == (bp->bio_cmd & (BIO_READ | BIO_WRITE)))
		goto bypass;

	G_SCHED_LOGREQ(cbp, "Sending request.");

	g_sched_lock(gp);
	/*
	 * Call the algorithm's gs_start to queue the request in the
	 * scheduler. If gs_start fails then pass the request down,
	 * otherwise call g_sched_dispatch() which tries to push
	 * one or more requests down.
	 */
	if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) ||
	    sc->sc_gsched->gs_start(sc->sc_data, cbp)) {
		g_sched_unlock(gp);
		goto bypass;
	}
	/*
	 * We use bio_caller1 to mark requests that are scheduled
	 * so make sure it is not NULL.
	 */
	if (cbp->bio_caller1 == NULL)
		cbp->bio_caller1 = &me;	/* anything not NULL */

	cbp->bio_caller2 = gp;
	sc->sc_pending++;

	/* Update general stats. */
	me.gs_in_flight++;
	me.gs_requests++;
	me.gs_bytes_in_flight += bp->bio_length;
	if (bp->bio_cmd & BIO_WRITE) {
		me.gs_writes_in_flight++;
		me.gs_write_bytes_in_flight += bp->bio_length;
	}
	g_sched_dispatch(gp);
	g_sched_unlock(gp);
	return;

bypass:
	cbp->bio_done = g_std_done;
	cbp->bio_caller1 = NULL; /* not scheduled */
	g_io_request(cbp, LIST_FIRST(&gp->consumer));
}

/*
 * The next few functions are the geom glue.
 */
static void
g_sched_orphan(struct g_consumer *cp)
{

	g_topology_assert();
	g_sched_destroy(cp->geom, 1);
}

static int
g_sched_access(struct g_provider *pp, int dr, int dw, int de)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	int error;

	gp = pp->geom;
	cp = LIST_FIRST(&gp->consumer);
	error = g_access(cp, dr, dw, de);

	return (error);
}

static void
g_sched_temporary_start(struct bio *bio)
{

	mtx_lock(&me.gs_mtx);
	me.gs_npending++;
	gs_bioq_disksort(&me.gs_pending, bio);
	mtx_unlock(&me.gs_mtx);
}

static void
g_sched_flush_pending(g_start_t *start)
{
	struct bio *bp;

	while ((bp = gs_bioq_takefirst(&me.gs_pending)))
		start(bp);
}

static int
g_insert_proxy(struct g_geom *gp, struct g_provider *newpp,
    struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp)
{
	struct g_sched_softc *sc = gp->softc;
	g_start_t *saved_start, *flush = g_sched_start;
	int error = 0, endticks = ticks + hz;

	g_cancel_event(newpp);	/* prevent taste() */
	/* copy private fields */
	newpp->private = pp->private;
	newpp->index = pp->index;

	/* Queue all the early requests coming for us. */
	me.gs_npending = 0;
	saved_start = pp->geom->start;
	dstgp->start = g_sched_temporary_start;

	while (pp->nstart - pp->nend != me.gs_npending &&
	    endticks - ticks >= 0)
		tsleep(pp, PRIBIO, "-", hz/10);

	if (pp->nstart - pp->nend != me.gs_npending) {
		flush = saved_start;
		error = ETIMEDOUT;
		goto fail;
	}

	/* link pp to this geom */
	LIST_REMOVE(pp, provider);
	pp->geom = gp;
	LIST_INSERT_HEAD(&gp->provider, pp, provider);

	/*
	 * replicate the counts from the parent in the
	 * new provider and consumer nodes
	 */
	cp->acr = newpp->acr = pp->acr;
	cp->acw = newpp->acw = pp->acw;
	cp->ace = newpp->ace = pp->ace;
	sc->sc_flags |= G_SCHED_PROXYING;

fail:
	dstgp->start = saved_start;

	g_sched_flush_pending(flush);

	return (error);
}

/*
 * Create a geom node for the device passed as *pp.
 * If successful, add a reference to this gsp.
 */
static int
g_sched_create(struct gctl_req *req, struct g_class *mp,
    struct g_provider *pp, struct g_gsched *gsp, int proxy)
{
	struct g_sched_softc *sc = NULL;
	struct g_geom *gp, *dstgp;
	struct g_provider *newpp = NULL;
	struct g_consumer *cp = NULL;
	char name[64];
	int error;

	g_topology_assert();

	snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX);
	LIST_FOREACH(gp, &mp->geom, geom) {
		if (strcmp(gp->name, name) == 0) {
			gctl_error(req, "Geom %s already exists.",
			    name);
			return (EEXIST);
		}
	}

	gp = g_new_geomf(mp, name);
	dstgp = proxy ? pp->geom : gp; /* where do we link the provider */
	if (gp == NULL) {
		gctl_error(req, "Cannot create geom %s.", name);
		error = ENOMEM;
		goto fail;
	}

	sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
	sc->sc_gsched = gsp;
	sc->sc_data = gsp->gs_init(gp);
	if (sc->sc_data == NULL) {
		error = ENOMEM;
		goto fail;
	}

	sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK);

	/*
	 * Do not initialize the flush mechanism here; it will be
	 * initialized on the first insertion into the hash table.
	 */

	mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF);

	gp->softc = sc;
	gp->start = g_sched_start;
	gp->orphan = g_sched_orphan;
	gp->access = g_sched_access;
	gp->dumpconf = g_sched_dumpconf;

	newpp = g_new_providerf(dstgp, gp->name);
	if (newpp == NULL) {
		gctl_error(req, "Cannot create provider %s.", name);
		error = ENOMEM;
		goto fail;
	}

	newpp->mediasize = pp->mediasize;
	newpp->sectorsize = pp->sectorsize;

	cp = g_new_consumer(gp);
	if (cp == NULL) {
		gctl_error(req, "Cannot create consumer for %s.",
		    gp->name);
		error = ENOMEM;
		goto fail;
	}

	error = g_attach(cp, proxy ? newpp : pp);
	if (error != 0) {
		gctl_error(req, "Cannot attach to provider %s.",
		    pp->name);
		goto fail;
	}

	g_error_provider(newpp, 0);
	if (proxy) {
		error = g_insert_proxy(gp, newpp, dstgp, pp, cp);
		if (error)
			goto fail;
	}
	G_SCHED_DEBUG(0, "Device %s created.", gp->name);

	g_gsched_ref(gsp);

	return (0);

fail:
	if (cp != NULL) {
		if (cp->provider != NULL)
			g_detach(cp);
		g_destroy_consumer(cp);
	}

	if (newpp != NULL)
		g_destroy_provider(newpp);

	if (sc && sc->sc_hash) {
		g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
		    gsp, sc->sc_data);
	}

	if (sc && sc->sc_data)
		gsp->gs_fini(sc->sc_data);

	if (gp != NULL) {
		if (gp->softc != NULL)
			g_free(gp->softc);
		g_destroy_geom(gp);
	}

	return (error);
}

/*
 * Support for dynamic switching of scheduling algorithms.
 * First initialize the data structures for the new algorithm,
 * then call g_sched_remove_locked() to flush all references
 * to the old one, finally link the new algorithm.
 */
static int
g_sched_change_algo(struct gctl_req *req, struct g_class *mp,
    struct g_provider *pp, struct g_gsched *gsp)
{
	struct g_sched_softc *sc;
	struct g_geom *gp;
	struct g_hash *newh;
	void *data;
	u_long mask;
	int error = 0;

	gp = pp->geom;
	sc = gp->softc;

	data = gsp->gs_init(gp);
	if (data == NULL)
		return (ENOMEM);

	newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK);
	if (gsp->gs_priv_size && !newh) {
		error = ENOMEM;
		goto fail;
	}

	g_sched_lock(gp);
	if (sc->sc_gsched) {	/* can be NULL in some cases */
		error = g_sched_remove_locked(gp, sc->sc_gsched);
		if (error)
			goto fail;
	}

	g_gsched_ref(gsp);
	sc->sc_gsched = gsp;
	sc->sc_data = data;
	sc->sc_hash = newh;
	sc->sc_mask = mask;

	g_sched_unlock(gp);

	return (0);

fail:
	if (newh)
		g_sched_hash_fini(gp, newh, mask, gsp, data);

	if (data)
		gsp->gs_fini(data);

	g_sched_unlock(gp);

	return (error);
}

/*
 * Stop the request flow directed to the proxy, redirecting the new
 * requests to the me.gs_pending queue.
 */
static struct g_provider *
g_detach_proxy(struct g_geom *gp)
{
	struct g_consumer *cp;
	struct g_provider *pp, *newpp;

	do {
		pp = LIST_FIRST(&gp->provider);
		if (pp == NULL)
			break;
		cp = LIST_FIRST(&gp->consumer);
		if (cp == NULL)
			break;
		newpp = cp->provider;
		if (newpp == NULL)
			break;

		me.gs_npending = 0;
		pp->geom->start = g_sched_temporary_start;

		return (pp);
	} while (0);
	printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name);

	return (NULL);
}

static void
g_sched_blackhole(struct bio *bp)
{

	g_io_deliver(bp, ENXIO);
}

static inline void
g_reparent_provider(struct g_provider *pp, struct g_geom *gp,
    struct g_provider *newpp)
{

	LIST_REMOVE(pp, provider);
	if (newpp) {
		pp->private = newpp->private;
		pp->index = newpp->index;
	}
	pp->geom = gp;
	LIST_INSERT_HEAD(&gp->provider, pp, provider);
}

static inline void
g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp)
{
	struct g_geom *gp = oldpp->geom;

	g_reparent_provider(oldpp, newpp->geom, newpp);

	/*
	 * Hackish: let the system destroy the old provider for us, just
	 * in case someone attached a consumer to it, in which case a
	 * direct call to g_destroy_provider() would not work.
	 */
	g_reparent_provider(newpp, gp, NULL);
}

/*
 * Complete the proxy destruction, linking the old provider to its
 * original geom, and destroying the proxy provider.  Also take care
 * of issuing the pending requests collected in me.gs_pending (if any).
 */
static int
g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp)
{
	struct g_consumer *cp;
	struct g_provider *newpp;

	do {
		cp = LIST_FIRST(&gp->consumer);
		if (cp == NULL)
			break;
		newpp = cp->provider;
		if (newpp == NULL)
			break;

		/* Relink the provider to its original geom. */
		g_unproxy_provider(oldpp, newpp);

		/* Detach consumer from provider, and destroy provider. */
		cp->acr = newpp->acr = 0;
		cp->acw = newpp->acw = 0;
		cp->ace = newpp->ace = 0;
		g_detach(cp);

		/* Send the pending bios through the right start function. */
		g_sched_flush_pending(oldpp->geom->start);

		return (0);
	} while (0);
	printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name);

	/* We cannot send the pending bios anywhere... */
	g_sched_flush_pending(g_sched_blackhole);

	return (EINVAL);
}

static int
g_sched_destroy(struct g_geom *gp, boolean_t force)
{
	struct g_provider *pp, *oldpp = NULL;
	struct g_sched_softc *sc;
	struct g_gsched *gsp;
	int error = 0;

	g_topology_assert();
	sc = gp->softc;
	if (sc == NULL)
		return (ENXIO);
	if (!(sc->sc_flags & G_SCHED_PROXYING)) {
		pp = LIST_FIRST(&gp->provider);
		if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
			const char *msg = force ?
				"but we force removal" : "cannot remove";

			G_SCHED_DEBUG(!force,
			    "Device %s is still open (r%dw%de%d), %s.",
			    pp->name, pp->acr, pp->acw, pp->ace, msg);
			if (!force)
				return (EBUSY);
		} else {
			G_SCHED_DEBUG(0, "Device %s removed.", gp->name);
		}
	} else
		oldpp = g_detach_proxy(gp);

	gsp = sc->sc_gsched;
	if (gsp) {
		/*
		 * XXX bad hack here: force a dispatch to release
		 * any reference to the hash table still held by
		 * the scheduler.
		 */
		g_sched_lock(gp);
		/*
		 * We are dying here, no new requests should enter
		 * the scheduler.  This is guaranteed by the topology,
		 * either in case we were proxying (new bios are
		 * being redirected) or not (see the access check
		 * above).
		 */
		g_sched_forced_dispatch(gp);
		error = g_sched_wait_pending(gp);

		if (error) {
			/*
			 * Not all the requests came home: this might happen
			 * under heavy load, or if we were waiting for any
			 * bio which is served in the event path (see
			 * geom_slice.c for an example of how this can
			 * happen).  Try to restore a working configuration
			 * before we fail.
			 */
			if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
				g_sched_flush_pending(force ?
				    g_sched_blackhole : g_sched_start);
			}

			/*
			 * In the forced destroy case there is not much
			 * we can do: we have pending bios that will call
			 * g_sched_done() somehow, and we don't want them
			 * to crash the system using freed memory.  We tell
			 * the user that something went wrong, and leak some
			 * memory here.
			 * Note: the callers using force = 1 ignore the
			 * return value.
			 */
			if (force) {
				G_SCHED_DEBUG(0, "Pending requests while "
				    "destroying geom, some memory leaked.");
			}

			return (error);
		}

		g_sched_unlock(gp);
		g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
		    gsp, sc->sc_data);
		sc->sc_hash = NULL;
		gsp->gs_fini(sc->sc_data);
		g_gsched_unref(gsp);
		sc->sc_gsched = NULL;
	}

	if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
		error = g_destroy_proxy(gp, oldpp);

		if (error) {
			if (force) {
				G_SCHED_DEBUG(0, "Unrecoverable error while "
				    "destroying a proxy geom, leaking some "
				    "memory.");
			}

			return (error);
		}
	}

	mtx_destroy(&sc->sc_mtx);

	g_free(gp->softc);
	gp->softc = NULL;
	g_wither_geom(gp, ENXIO);

	return (error);
}

static int
g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp)
{

	return (g_sched_destroy(gp, 0));
}

/*
 * Functions related to the classification of requests.
 *
 * On recent FreeBSD versions (8.0 and above), we store a reference
 * to the issuer of a request in bp->bio_classifier1 as soon
 * as the bio is posted to the geom queue (and not later, because
 * requests are managed by the g_down thread afterwards).
 *
 * On older versions of the system (but this code is not used
 * in any existing release), we [ab]use the caller1 field in the
 * root element of the bio tree to store the classification info.
 * The marking is done at the beginning of g_io_request()
 * and only if we find that the field is NULL.
 *
 * To avoid rebuilding the kernel, this module will patch the
 * initial part of g_io_request() so it jumps to some hand-coded
 * assembly that does the marking and then executes the original
 * body of g_io_request().
 *
 * fake_ioreq[] is architecture-specific machine code
 * that implements the above. CODE_SIZE, STORE_SIZE etc.
 * are constants used in the patching routine. Look at the
 * code in g_ioreq_patch() for the details.
 */

#ifndef HAVE_BIO_CLASSIFIER
/*
 * Support for old FreeBSD versions
 */
#if defined(__i386__)
#define	CODE_SIZE	29
#define	STORE_SIZE	5
#define	EPILOGUE	5
#define	SIZE		(CODE_SIZE + STORE_SIZE + EPILOGUE)

static u_char fake_ioreq[SIZE] = {
	0x8b, 0x44, 0x24, 0x04,		/* mov bp, %eax */
	/* 1: */
	0x89, 0xc2,			/* mov %eax, %edx # edx = bp */
	0x8b, 0x40, 0x64,		/* mov bp->bio_parent, %eax */
	0x85, 0xc0,			/* test %eax, %eax */
	0x75, 0xf7,			/* jne 1b */
	0x8b, 0x42, 0x30,		/* mov bp->bp_caller1, %eax */
	0x85, 0xc0,			/* test %eax, %eax */
	0x75, 0x09,			/* jne 2f */
	0x64, 0xa1, 0x00, 0x00,		/* mov %fs:0, %eax */
	0x00, 0x00,
	0x89, 0x42, 0x30,		/* mov %eax, bp->bio_caller1 */
	/* 2: */
	0x55, 0x89, 0xe5, 0x57, 0x56,
	0xe9, 0x00, 0x00, 0x00, 0x00,	/* jmp back... */
};
#elif defined(__amd64)
#define	CODE_SIZE	38
#define	STORE_SIZE	6
#define	EPILOGUE	5
#define	SIZE		(CODE_SIZE + STORE_SIZE + EPILOGUE)

static u_char fake_ioreq[SIZE] = {
	0x48, 0x89, 0xf8,		/* mov bp, %rax */
	/* 1: */
	0x48, 0x89, 0xc2,		/* mov %rax, %rdx # rdx = bp */
	0x48, 0x8b, 0x82, 0xa8,		/* mov bp->bio_parent, %rax */
	0x00, 0x00, 0x00,
	0x48, 0x85, 0xc0,		/* test %rax, %rax */
	0x75, 0xf1,			/* jne 1b */
	0x48, 0x83, 0x7a, 0x58,		/* cmp $0, bp->bp_caller1 */
	0x00,
	0x75, 0x0d,			/* jne 2f */
	0x65, 0x48, 0x8b, 0x04,		/* mov %gs:0, %rax */
	0x25, 0x00, 0x00, 0x00,
	0x00,
	0x48, 0x89, 0x42, 0x58,		/* mov %rax, bp->bio_caller1 */
	/* 2: */
	0x55, 0x48, 0x89, 0xe5, 0x41, 0x56,
	0xe9, 0x00, 0x00, 0x00, 0x00,	/* jmp back... */
};
#else /* neither x86 nor amd64 */
static void
g_new_io_request(struct bio *bp, struct g_consumer *cp)
{
	struct bio *top = bp;

	/*
	 * bio classification: if bio_caller1 is available in the
	 * root of the 'struct bio' tree, store there the thread id
	 * of the thread that originated the request.
	 * More sophisticated classification schemes can be used.
	 */
	while (top->bio_parent)
		top = top->bio_parent;

	if (top->bio_caller1 == NULL)
		top->bio_caller1 = curthread;
}

#error please add the code above in g_new_io_request() to the beginning of \
	/sys/geom/geom_io.c::g_io_request(), and remove this line.
#endif /* end of arch-specific code */

static int
g_ioreq_patch(void)
{
	u_char *original;
	u_long ofs;
	int found;

	if (me.gs_patched)
		return (-1);

	original = (u_char *)g_io_request;

	found = !bcmp(original, fake_ioreq + CODE_SIZE, STORE_SIZE);
	if (!found)
		return (-1);

	/* Jump back to the original + STORE_SIZE. */
	ofs = (original + STORE_SIZE) - (fake_ioreq + SIZE);
	bcopy(&ofs, fake_ioreq + CODE_SIZE + STORE_SIZE + 1, 4);

	/* Patch the original address with a jump to the trampoline. */
	*original = 0xe9;     /* jump opcode */
	ofs = fake_ioreq - (original + 5);
	bcopy(&ofs, original + 1, 4);

	me.gs_patched = 1;

	return (0);
}

/*
 * Restore the original code, this is easy.
 */
static void
g_ioreq_restore(void)
{
	u_char *original;

	if (me.gs_patched) {
		original = (u_char *)g_io_request;
		bcopy(fake_ioreq + CODE_SIZE, original, STORE_SIZE);
		me.gs_patched = 0;
	}
}

static inline void
g_classifier_ini(void)
{

	g_ioreq_patch();
}

static inline void
g_classifier_fini(void)
{

	g_ioreq_restore();
}

/*--- end of support code for older FreeBSD versions */

#else /* HAVE_BIO_CLASSIFIER */

/*
 * Classifier support for recent FreeBSD versions: we use
 * a very simple classifier that only uses curthread to tag a request.
 * The classifier is registered at module load, and unregistered
 * at module unload.
 */
static int
g_sched_tag(void *arg, struct bio *bp)
{

	bp->bio_classifier1 = curthread;
	return (1);
}

static struct g_classifier_hook g_sched_classifier = {
	.func =	g_sched_tag,
};

static inline void
g_classifier_ini(void)
{

	g_register_classifier(&g_sched_classifier);
}

static inline void
g_classifier_fini(void)
{

	g_unregister_classifier(&g_sched_classifier);
}
#endif /* HAVE_BIO_CLASSIFIER */

static void
g_sched_init(struct g_class *mp)
{

	g_gsched_global_init();

	G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.",
	    mp, &g_sched_class);

	/* Arrange for g_io_request() to store classification info in the bio. */
	g_classifier_ini();
}

static void
g_sched_fini(struct g_class *mp)
{

	g_classifier_fini();

	G_SCHED_DEBUG(0, "Unloading...");

	KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers"));
	mtx_destroy(&me.gs_mtx);
}

/*
 * Read the i-th argument for a request, skipping the /dev/
 * prefix if present.
 */
static const char *
g_sched_argi(struct gctl_req *req, int i)
{
	static const char *dev_prefix = "/dev/";
	const char *name;
	char param[16];
	int l = strlen(dev_prefix);

	snprintf(param, sizeof(param), "arg%d", i);
	name = gctl_get_asciiparam(req, param);
	if (name == NULL)
		gctl_error(req, "No 'arg%d' argument", i);
	else if (strncmp(name, dev_prefix, l) == 0)
		name += l;
	return (name);
}

/*
 * Fetch nargs and do appropriate checks.
 */
static int
g_sched_get_nargs(struct gctl_req *req)
{
	int *nargs;

	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
	if (nargs == NULL) {
		gctl_error(req, "No 'nargs' argument");
		return (0);
	}
	if (*nargs <= 0)
		gctl_error(req, "Missing device(s).");
	return (*nargs);
}

/*
 * Check whether we should add the class on certain volumes when
 * this geom is created. Right now this is under control of a kenv
 * variable containing the names of all devices that we care about.
 * (Example settings are shown right after this function.)
 * Probably we should only support transparent insertion as the
 * preferred mode of operation.
 */
static struct g_geom *
g_sched_taste(struct g_class *mp, struct g_provider *pp,
		int flags __unused)
{
	struct g_gsched *gsp = NULL;	/* the scheduling algorithm we want */
	const char *s;			/* generic string pointer */
	const char *taste_names;	/* devices we like */
	int l;

	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__,
	    mp->name, pp->name);
	g_topology_assert();

	G_SCHED_DEBUG(2, "Tasting %s.", pp->name);

	do {
		/* do not taste on ourselves */
		if (pp->geom->class == mp)
			break;

		taste_names = getenv("geom.sched.taste");
		if (taste_names == NULL)
			break;

		l = strlen(pp->name);
		for (s = taste_names; *s &&
		    (s = strstr(s, pp->name)); s++) {
			/* further checks for an exact match */
			if ((s == taste_names || s[-1] == ' ') &&
			    (s[l] == '\0' || s[l] == ' '))
				break;
		}
		if (s == NULL)
			break;
		G_SCHED_DEBUG(0, "Attach device %s match [%s]\n",
		    pp->name, s);

		/* look up the provider name in the list */
		s = getenv("geom.sched.algo");
		if (s == NULL)
			s = "rr";

		gsp = g_gsched_find(s);	/* also get a reference */
		if (gsp == NULL) {
			G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s);
			break;
		}

		/* XXX create with 1 as last argument ? */
		g_sched_create(NULL, mp, pp, gsp, 0);
		g_gsched_unref(gsp);
	} while (0);
	return (NULL);
}
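
/*
 * Example loader.conf settings for the tasting path above (illustrative;
 * the device names are placeholders).  Both values end up as kernel
 * environment variables read with getenv() by g_sched_taste():
 *
 *	geom.sched.taste="ada0 da0"
 *	geom.sched.algo="rr"
 */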

static void
g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy)
{
	struct g_provider *pp;
	struct g_gsched *gsp;
	const char *name;
	int i, nargs;

	g_topology_assert();

	name = gctl_get_asciiparam(req, "algo");
	if (name == NULL) {
		gctl_error(req, "No '%s' argument", "algo");
		return;
	}

	gsp = g_gsched_find(name);	/* also get a reference */
	if (gsp == NULL) {
		gctl_error(req, "Bad algorithm '%s'", name);
		return;
	}

	nargs = g_sched_get_nargs(req);

	/*
	 * Run over the arguments, and break on any error.
	 * We look for a device name, but skip the /dev/ prefix if any.
	 */
	for (i = 0; i < nargs; i++) {
		name = g_sched_argi(req, i);
		if (name == NULL)
			break;
		pp = g_provider_by_name(name);
		if (pp == NULL) {
			G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
			gctl_error(req, "Provider %s is invalid.", name);
			break;
		}
		if (g_sched_create(req, mp, pp, gsp, proxy) != 0)
			break;
	}

	g_gsched_unref(gsp);
}

static void
g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp)
{
	struct g_provider *pp;
	struct g_gsched *gsp;
	const char *name;
	int i, nargs;

	g_topology_assert();

	name = gctl_get_asciiparam(req, "algo");
	if (name == NULL) {
		gctl_error(req, "No '%s' argument", "algo");
		return;
	}

	gsp = g_gsched_find(name);	/* also get a reference */
	if (gsp == NULL) {
		gctl_error(req, "Bad algorithm '%s'", name);
		return;
	}

	nargs = g_sched_get_nargs(req);

	/*
	 * Run over the arguments, and break on any error.
	 * We look for a device name, but skip the /dev/ prefix if any.
	 */
	for (i = 0; i < nargs; i++) {
		name = g_sched_argi(req, i);
		if (name == NULL)
			break;
		pp = g_provider_by_name(name);
		if (pp == NULL || pp->geom->class != mp) {
			G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
			gctl_error(req, "Provider %s is invalid.", name);
			break;
		}
		if (g_sched_change_algo(req, mp, pp, gsp) != 0)
			break;
	}

	g_gsched_unref(gsp);
}

static struct g_geom *
g_sched_find_geom(struct g_class *mp, const char *name)
{
	struct g_geom *gp;

	LIST_FOREACH(gp, &mp->geom, geom) {
		if (strcmp(gp->name, name) == 0)
			return (gp);
	}
	return (NULL);
}

static void
g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp)
{
	int nargs, *force, error, i;
	struct g_geom *gp;
	const char *name;

	g_topology_assert();

	nargs = g_sched_get_nargs(req);

	force = gctl_get_paraml(req, "force", sizeof(*force));
	if (force == NULL) {
		gctl_error(req, "No 'force' argument");
		return;
	}

	for (i = 0; i < nargs; i++) {
		name = g_sched_argi(req, i);
		if (name == NULL)
			break;

		gp = g_sched_find_geom(mp, name);
		if (gp == NULL) {
			G_SCHED_DEBUG(1, "Device %s is invalid.", name);
			gctl_error(req, "Device %s is invalid.", name);
			break;
		}

		error = g_sched_destroy(gp, *force);
		if (error != 0) {
			gctl_error(req, "Cannot destroy device %s (error=%d).",
			    gp->name, error);
			break;
		}
	}
}

static void
g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb)
{
	uint32_t *version;

	g_topology_assert();

	version = gctl_get_paraml(req, "version", sizeof(*version));
	if (version == NULL) {
		gctl_error(req, "No '%s' argument.", "version");
		return;
	}

	if (*version != G_SCHED_VERSION) {
		gctl_error(req, "Userland and kernel parts are "
		    "out of sync.");
		return;
	}

	if (strcmp(verb, "create") == 0) {
		g_sched_ctl_create(req, mp, 0);
		return;
	} else if (strcmp(verb, "insert") == 0) {
		g_sched_ctl_create(req, mp, 1);
		return;
	} else if (strcmp(verb, "configure") == 0) {
		g_sched_ctl_configure(req, mp);
		return;
	} else if (strcmp(verb, "destroy") == 0) {
		g_sched_ctl_destroy(req, mp);
		return;
	}

	gctl_error(req, "Unknown verb.");
}

static void
g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	struct g_sched_softc *sc = gp->softc;
	struct g_gsched *gsp = sc->sc_gsched;

	if (indent == NULL) {	/* plaintext */
		sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--");
	}
	if (gsp != NULL && gsp->gs_dumpconf)
		gsp->gs_dumpconf(sb, indent, gp, cp, pp);
}

DECLARE_GEOM_CLASS(g_sched_class, g_sched);
MODULE_VERSION(geom_sched, 0);