/*-
 * Copyright (c) 2009-2010 Fabio Checconi
 * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id$
 * $FreeBSD$
 *
 * Main control module for geom-based disk schedulers ('sched').
 *
 * USER VIEW
 * A 'sched' node is typically inserted transparently between
 * an existing provider pp and its original geom gp
 *
 *	[pp --> gp  ..]
 *
 * using the command "geom sched insert <provider>" and
 * resulting in the following topology
 *
 *	[pp --> sched_gp --> cp]   [new_pp --> gp ... ]
 *
 * The command "geom sched destroy <provider>.sched" restores the
 * original chain. The normal "geom sched create <provider>"
 * is also supported.
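 *
 * Example (an illustrative sketch: the device name and the exact
 * geom(8) flag spellings are assumptions; "rr" is the default
 * algorithm name used in the taste routine below):
 *
 *	geom sched insert ada0			# [ada0 --> sched --> ...]
 *	geom sched configure -a rr ada0.sched	# switch algorithm
 *	geom sched destroy ada0.sched		# restore original chain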
 *
 * INTERNALS
 * Internally, the 'sched' node uses the following data structures:
 *
 *   geom{}         g_sched_softc{}      g_gsched{}
 * +----------+    +---------------+   +-------------+
 * |  softc *-|--->| sc_gsched   *-|-->|  gs_init    |
 * |  ...     |    |               |   |  gs_fini    |
 * |          |    | [ hash table] |   |  gs_start   |
 * +----------+    |               |   |  ...        |
 *                 |               |   +-------------+
 *                 |               |
 *                 |               |     g_*_softc{}
 *                 |               |   +-------------+
 *                 | sc_data     *-|-->|             |
 *                 +---------------+   |  algorithm- |
 *                                     |  specific   |
 *                                     +-------------+
 *
 * A g_sched_softc{} is created with a "geom sched insert" call.
 * In turn, this instantiates a specific scheduling algorithm,
 * which sets sc_gsched to point to the algorithm callbacks,
 * and calls gs_init() to create the g_*_softc{}.
 * The other callbacks (gs_start, gs_next, ...) are invoked
 * as needed.
 *
 * g_sched_softc{} is defined in g_sched.h and mostly used here;
 * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h;
 * g_*_softc{} is defined/implemented by each algorithm (gs_*.c).
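 *
 * As a sketch (field names follow their use in this file; the
 * authoritative definition of g_gsched{} is in gs_scheduler.h, and
 * all g_example_* names are hypothetical), an algorithm module
 * basically fills in
 *
 *	static struct g_gsched g_example_gsched = {
 *		.gs_name	= "example",
 *		.gs_priv_size	= sizeof(struct g_example_queue),
 *		.gs_init	= g_example_init,	-- create the g_*_softc{}
 *		.gs_fini	= g_example_fini,
 *		.gs_start	= g_example_start,	-- queue one bio
 *		.gs_next	= g_example_next,	-- pick next bio, or NULL
 *		.gs_done	= g_example_done,
 *		.gs_init_class	= g_example_init_class,
 *		.gs_fini_class	= g_example_fini_class,
 *	};
 *
 * and registers it by routing its module events to
 * g_gsched_modevent() (see below).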
 *
 * DATA MOVING
 * When a bio is received on the provider, it goes to
 * g_sched_start(), which calls gs_start() to initially queue it;
 * then we call g_sched_dispatch(), which loops around gs_next()
 * to select zero or more bios to be sent downstream.
 *
 * g_sched_dispatch() can also be called as a result of a timeout,
 * e.g. when doing anticipatory scheduling or when pacing requests.
 *
 * When a bio comes back, it goes to g_sched_done(), which in turn
 * calls gs_done(). The latter does any necessary housekeeping in
 * the scheduling algorithm, and may decide to call g_sched_dispatch()
 * to send more bios downstream.
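 *
 * In short, the data path implemented below is (sketch):
 *
 *	g_sched_start(bp)		-- bio received on the provider
 *	    cbp = g_clone_bio(bp);
 *	    gs_start(sc->sc_data, cbp);	-- queue it in the algorithm
 *	    g_sched_dispatch(gp);
 *	g_sched_dispatch(gp)
 *	    while ((bp = gs_next(sc->sc_data, 0)) != NULL)
 *		g_io_request(bp, cp);	-- push downstream
 *	g_sched_done(cbp)		-- completion from below
 *	    gs_done(sc->sc_data, cbp);	-- may trigger another dispatch
 *	    g_std_done(cbp);		-- complete the original bio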
 *
 * If an algorithm needs per-flow queues, these are created by
 * calling gs_init_class() and destroyed with gs_fini_class(),
 * and they are also inserted in the hash table implemented in
 * the g_sched_softc{}.
 *
 * If an algorithm is replaced, or a transparently-inserted node is
 * removed with "geom sched destroy", we need to remove all references
 * to the g_*_softc{} and g_sched_softc{} from the bios still in
 * the scheduler. g_sched_forced_dispatch() helps with this.
 * XXX need to explain better.
 */

104#include <sys/cdefs.h>
105#include <sys/param.h>
106#include <sys/systm.h>
107#include <sys/kernel.h>
108#include <sys/module.h>
109#include <sys/lock.h>
110#include <sys/mutex.h>
111#include <sys/bio.h>
112#include <sys/limits.h>
113#include <sys/hash.h>
114#include <sys/sbuf.h>
115#include <sys/sysctl.h>
116#include <sys/malloc.h>
117#include <sys/proc.h>		/* we access curthread */
118#include <geom/geom.h>
119#include "gs_scheduler.h"
120#include "g_sched.h"		/* geom hooks */
121
122/*
123 * Size of the per-geom hash table storing traffic classes.
124 * We may decide to change it at a later time, it has no ABI
125 * implications as it is only used for run-time allocations.
126 */
127#define G_SCHED_HASH_SIZE	32
128
129static int g_sched_destroy(struct g_geom *gp, boolean_t force);
130static int g_sched_destroy_geom(struct gctl_req *req,
131    struct g_class *mp, struct g_geom *gp);
132static void g_sched_config(struct gctl_req *req, struct g_class *mp,
133    const char *verb);
134static struct g_geom *g_sched_taste(struct g_class *mp,
135    struct g_provider *pp, int flags __unused);
136static void g_sched_dumpconf(struct sbuf *sb, const char *indent,
137    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
138static void g_sched_init(struct g_class *mp);
139static void g_sched_fini(struct g_class *mp);
140static int g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data,
141    int fflag, struct thread *td);
142
143struct g_class g_sched_class = {
144	.name = G_SCHED_CLASS_NAME,
145	.version = G_VERSION,
146	.ctlreq = g_sched_config,
147	.taste = g_sched_taste,
148	.destroy_geom = g_sched_destroy_geom,
149	.init = g_sched_init,
150	.ioctl = g_sched_ioctl,
151	.fini = g_sched_fini
152};
153
154MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures");
155
156/*
157 * Global variables describing the state of the geom_sched module.
158 * There is only one static instance of this structure.
159 */
160LIST_HEAD(gs_list, g_gsched);	/* type, link field */
161struct geom_sched_vars {
162	struct mtx	gs_mtx;
163	struct gs_list	gs_scheds;	/* list of algorithms */
164	u_int		gs_debug;
165	u_int		gs_sched_count;	/* how many algorithms ? */
166	u_int 		gs_patched;	/* g_io_request was patched */
167
168	u_int		gs_initialized;
169	u_int		gs_expire_secs;	/* expiration of hash entries */
170
171	struct bio_queue_head gs_pending;
172	u_int		gs_npending;
173
174	/* The following are for stats, usually protected by gs_mtx. */
175	u_long		gs_requests;	/* total requests */
176	u_long		gs_done;	/* total done */
177	u_int 		gs_in_flight;	/* requests in flight */
178	u_int 		gs_writes_in_flight;
179	u_int 		gs_bytes_in_flight;
180	u_int 		gs_write_bytes_in_flight;
181
182	char		gs_names[256];	/* names of schedulers */
183};
184
185static struct geom_sched_vars me = {
186	.gs_expire_secs = 10,
187};
188
189SYSCTL_DECL(_kern_geom);
190SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0,
191    "GEOM_SCHED stuff");
192
193SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD,
194    &me.gs_write_bytes_in_flight, 0, "Write bytes in flight");
195
196SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD,
197    &me.gs_bytes_in_flight, 0, "Bytes in flight");
198
199SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD,
200    &me.gs_writes_in_flight, 0, "Write Requests in flight");
201
202SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD,
203    &me.gs_in_flight, 0, "Requests in flight");
204
205SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD,
206    &me.gs_done, 0, "Total done");
207
208SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD,
209    &me.gs_requests, 0, "Total requests");
210
211SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD,
212    &me.gs_names, 0, "Algorithm names");
213
214SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD,
215    &me.gs_sched_count, 0, "Number of algorithms");
216
217SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW,
218    &me.gs_debug, 0, "Debug level");
219
220SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW,
221    &me.gs_expire_secs, 0, "Expire time in seconds");
222
/*
 * g_sched calls the scheduler algorithms with this lock held.
 * The locking functions are exposed so the scheduler algorithms can
 * also protect themselves, e.g., when running a callout handler.
 */
228void
229g_sched_lock(struct g_geom *gp)
230{
231	struct g_sched_softc *sc = gp->softc;
232
233	mtx_lock(&sc->sc_mtx);
234}
235
236void
237g_sched_unlock(struct g_geom *gp)
238{
239	struct g_sched_softc *sc = gp->softc;
240
241	mtx_unlock(&sc->sc_mtx);
242}
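
/*
 * Example (an illustrative sketch, not code used by this module):
 * a timeout handler inside a scheduling algorithm can use the
 * wrappers above to serialize against g_sched_start()/g_sched_done()
 * before pushing more requests down.  The g_example_* names and the
 * sc_geom field are hypothetical:
 *
 *	static void
 *	g_example_timeout(void *data)
 *	{
 *		struct g_example_softc *sc = data;
 *
 *		g_sched_lock(sc->sc_geom);
 *		g_sched_dispatch(sc->sc_geom);	-- e.g. anticipation expired
 *		g_sched_unlock(sc->sc_geom);
 *	}
 */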
243
244/*
245 * Support functions to handle references to the module,
246 * which are coming from devices using this scheduler.
247 */
248static inline void
249g_gsched_ref(struct g_gsched *gsp)
250{
251
252	atomic_add_int(&gsp->gs_refs, 1);
253}
254
255static inline void
256g_gsched_unref(struct g_gsched *gsp)
257{
258
259	atomic_add_int(&gsp->gs_refs, -1);
260}
261
262/*
263 * Update the stats when this request is done.
264 */
265static void
266g_sched_update_stats(struct bio *bio)
267{
268
269	me.gs_done++;
270	me.gs_in_flight--;
271	me.gs_bytes_in_flight -= bio->bio_length;
272	if (bio->bio_cmd & BIO_WRITE) {
273		me.gs_writes_in_flight--;
274		me.gs_write_bytes_in_flight -= bio->bio_length;
275	}
276}
277
278/*
279 * Dispatch any pending request.
280 */
281static void
282g_sched_forced_dispatch(struct g_geom *gp)
283{
284	struct g_sched_softc *sc = gp->softc;
285	struct g_gsched *gsp = sc->sc_gsched;
286	struct bio *bp;
287
288	KASSERT(mtx_owned(&sc->sc_mtx),
289	    ("sc_mtx not owned during forced dispatch"));
290
291	while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL)
292		g_io_request(bp, LIST_FIRST(&gp->consumer));
293}
294
295/*
296 * The main dispatch loop, called either here after the start
297 * routine, or by scheduling algorithms when they receive a timeout
298 * or a 'done' notification.  Does not share code with the forced
299 * dispatch path, since the gs_done() callback can call us.
300 */
301void
302g_sched_dispatch(struct g_geom *gp)
303{
304	struct g_sched_softc *sc = gp->softc;
305	struct g_gsched *gsp = sc->sc_gsched;
306	struct bio *bp;
307
308	KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch"));
309
310	if ((sc->sc_flags & G_SCHED_FLUSHING))
311		return;
312
313	while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL)
314		g_io_request(bp, LIST_FIRST(&gp->consumer));
315}
316
317/*
318 * Recent (8.0 and above) versions of FreeBSD have support to
319 * register classifiers of disk requests. The classifier is
320 * invoked by g_io_request(), and stores the information into
321 * bp->bio_classifier1.
322 *
323 * Support for older versions, which is left here only for
324 * documentation purposes, relies on two hacks:
325 * 1. classification info is written into the bio_caller1
326 *    field of the topmost node in the bio chain. This field
327 *    is rarely used, but this module is incompatible with
328 *    those that use bio_caller1 for other purposes,
329 *    such as ZFS and gjournal;
330 * 2. g_io_request() is patched in-memory when the module is
331 *    loaded, so that the function calls a classifier as its
332 *    first thing. g_io_request() is restored when the module
333 *    is unloaded. This functionality is only supported for
334 *    x86 and amd64, other architectures need source code changes.
335 */
336
337/*
338 * Lookup the identity of the issuer of the original request.
339 * In the current implementation we use the curthread of the
340 * issuer, but different mechanisms may be implemented later
341 * so we do not make assumptions on the return value which for
342 * us is just an opaque identifier.
343 */
344
345static inline u_long
346g_sched_classify(struct bio *bp)
347{
348
349#if __FreeBSD_version > 800098
350	/* we have classifier fields in the struct bio */
351#define HAVE_BIO_CLASSIFIER
352	return ((u_long)bp->bio_classifier1);
353#else
354#warning old version!!!
355	while (bp->bio_parent != NULL)
356		bp = bp->bio_parent;
357
358	return ((u_long)bp->bio_caller1);
359#endif
360}
361
362/* Return the hash chain for the given key. */
363static inline struct g_hash *
364g_sched_hash(struct g_sched_softc *sc, u_long key)
365{
366
367	return (&sc->sc_hash[key & sc->sc_mask]);
368}
369
/*
 * Helper function for the child algorithms: it takes a geom and
 * a bio and returns the private descriptor associated with the
 * request.  This involves fetching the classification field and
 * [al]locating the corresponding entry in the hash table.
 */
377void *
378g_sched_get_class(struct g_geom *gp, struct bio *bp)
379{
380	struct g_sched_softc *sc;
381	struct g_sched_class *gsc;
382	struct g_gsched *gsp;
383	struct g_hash *bucket;
384	u_long key;
385
386	sc = gp->softc;
387	key = g_sched_classify(bp);
388	bucket = g_sched_hash(sc, key);
389	LIST_FOREACH(gsc, bucket, gsc_clist) {
390		if (key == gsc->gsc_key) {
391			gsc->gsc_refs++;
392			return (gsc->gsc_priv);
393		}
394	}
395
396	gsp = sc->sc_gsched;
397	gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size,
398	    M_GEOM_SCHED, M_NOWAIT | M_ZERO);
399	if (!gsc)
400		return (NULL);
401
402	if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) {
403		free(gsc, M_GEOM_SCHED);
404		return (NULL);
405	}
406
407	gsc->gsc_refs = 2;	/* 1 for the hash table, 1 for the caller. */
408	gsc->gsc_key = key;
409	LIST_INSERT_HEAD(bucket, gsc, gsc_clist);
410
411	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
412
413	return (gsc->gsc_priv);
414}
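
/*
 * Example use from an algorithm's gs_start() callback (sketch; the
 * g_example_* names and the sc_geom field are hypothetical).  A
 * nonzero return value from gs_start() makes g_sched_start() bypass
 * the scheduler for the request:
 *
 *	static int
 *	g_example_start(void *data, struct bio *bp)
 *	{
 *		struct g_example_softc *sc = data;
 *		struct g_example_queue *qp;
 *
 *		qp = g_sched_get_class(sc->sc_geom, bp);
 *		if (qp == NULL)
 *			return (-1);	-- bypass the scheduler
 *		... enqueue bp on qp ...
 *		return (0);
 *	}
 *
 * The reference obtained here is dropped with g_sched_put_class()
 * when the algorithm no longer needs the class.
 */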
415
/*
 * Release a reference to the per-client descriptor.
 */
419void
420g_sched_put_class(struct g_geom *gp, void *priv)
421{
422	struct g_sched_class *gsc;
423	struct g_sched_softc *sc;
424
425	gsc = g_sched_priv2class(priv);
426	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
427
428	if (--gsc->gsc_refs > 0)
429		return;
430
431	sc = gp->softc;
432	sc->sc_gsched->gs_fini_class(sc->sc_data, priv);
433
434	LIST_REMOVE(gsc, gsc_clist);
435	free(gsc, M_GEOM_SCHED);
436}
437
438static void
439g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask,
440    struct g_gsched *gsp, void *data)
441{
442	struct g_sched_class *cp, *cp2;
443	int i;
444
445	if (!hp)
446		return;
447
448	if (data && gsp->gs_hash_unref)
449		gsp->gs_hash_unref(data);
450
451	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
452		LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2)
453			g_sched_put_class(gp, cp->gsc_priv);
454	}
455
456	hashdestroy(hp, M_GEOM_SCHED, mask);
457}
458
459static struct g_hash *
460g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags)
461{
462	struct g_hash *hash;
463
464	if (gsp->gs_priv_size == 0)
465		return (NULL);
466
467	hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags);
468
469	return (hash);
470}
471
472static void
473g_sched_flush_classes(struct g_geom *gp)
474{
475	struct g_sched_softc *sc;
476	struct g_sched_class *cp, *cp2;
477	int i;
478
479	sc = gp->softc;
480
481	if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0)
482		return;
483
484	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
485		LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) {
486			if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0)
487				g_sched_put_class(gp, cp->gsc_priv);
488		}
489	}
490
491	sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz;
492}
493
/*
 * Wait for the completion of any outstanding request.  To ensure
 * that this does not take forever, the caller has to make sure that
 * no new requests enter the scheduler before calling us.
 *
 * Must be called with the gp mutex held and topology locked.
 */
501static int
502g_sched_wait_pending(struct g_geom *gp)
503{
504	struct g_sched_softc *sc = gp->softc;
505	int endticks = ticks + hz;
506
507	g_topology_assert();
508
509	while (sc->sc_pending && endticks - ticks >= 0)
510		msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4);
511
512	return (sc->sc_pending ? ETIMEDOUT : 0);
513}
514
515static int
516g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp)
517{
518	struct g_sched_softc *sc = gp->softc;
519	int error;
520
521	/* Set the flushing flag: new bios will not enter the scheduler. */
522	sc->sc_flags |= G_SCHED_FLUSHING;
523
524	g_sched_forced_dispatch(gp);
525	error = g_sched_wait_pending(gp);
526	if (error)
527		goto failed;
528
529	/* No more requests pending or in flight from the old gsp. */
530
531	g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data);
532	sc->sc_hash = NULL;
533
534	/*
535	 * Avoid deadlock here by releasing the gp mutex and reacquiring
536	 * it once done.  It should be safe, since no reconfiguration or
537	 * destruction can take place due to the geom topology lock; no
538	 * new request can use the current sc_data since we flagged the
539	 * geom as being flushed.
540	 */
541	g_sched_unlock(gp);
542	gsp->gs_fini(sc->sc_data);
543	g_sched_lock(gp);
544
545	sc->sc_gsched = NULL;
546	sc->sc_data = NULL;
547	g_gsched_unref(gsp);
548
549failed:
550	sc->sc_flags &= ~G_SCHED_FLUSHING;
551
552	return (error);
553}
554
555static int
556g_sched_remove(struct g_geom *gp, struct g_gsched *gsp)
557{
558	int error;
559
560	g_sched_lock(gp);
561	error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */
562	g_sched_unlock(gp);
563
564	return (error);
565}
566
567/*
568 * Support function for create/taste -- locate the desired
569 * algorithm and grab a reference to it.
570 */
571static struct g_gsched *
572g_gsched_find(const char *name)
573{
574	struct g_gsched *gsp = NULL;
575
576	mtx_lock(&me.gs_mtx);
577	LIST_FOREACH(gsp, &me.gs_scheds, glist) {
578		if (strcmp(name, gsp->gs_name) == 0) {
579			g_gsched_ref(gsp);
580			break;
581		}
582	}
583	mtx_unlock(&me.gs_mtx);
584
585	return (gsp);
586}
587
588/*
589 * Rebuild the list of scheduler names.
590 * To be called with me.gs_mtx lock held.
591 */
592static void
593g_gsched_build_names(struct g_gsched *gsp)
594{
595	int pos, l;
596	struct g_gsched *cur;
597
598	pos = 0;
599	LIST_FOREACH(cur, &me.gs_scheds, glist) {
600		l = strlen(cur->gs_name);
601		if (l + pos + 1 + 1 < sizeof(me.gs_names)) {
602			if (pos != 0)
603				me.gs_names[pos++] = ' ';
604			strcpy(me.gs_names + pos, cur->gs_name);
605			pos += l;
606		}
607	}
608	me.gs_names[pos] = '\0';
609}
610
611/*
612 * Register or unregister individual scheduling algorithms.
613 */
614static int
615g_gsched_register(struct g_gsched *gsp)
616{
617	struct g_gsched *cur;
618	int error = 0;
619
620	mtx_lock(&me.gs_mtx);
621	LIST_FOREACH(cur, &me.gs_scheds, glist) {
622		if (strcmp(gsp->gs_name, cur->gs_name) == 0)
623			break;
624	}
625	if (cur != NULL) {
		G_SCHED_DEBUG(0, "A scheduler named %s already "
		    "exists.", gsp->gs_name);
628		error = EEXIST;
629	} else {
630		LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist);
631		gsp->gs_refs = 1;
632		me.gs_sched_count++;
633		g_gsched_build_names(gsp);
634	}
635	mtx_unlock(&me.gs_mtx);
636
637	return (error);
638}
639
640struct g_gsched_unregparm {
641	struct g_gsched *gup_gsp;
642	int		gup_error;
643};
644
645static void
646g_gsched_unregister(void *arg, int flag)
647{
648	struct g_gsched_unregparm *parm = arg;
649	struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp;
650	struct g_sched_softc *sc;
651	struct g_geom *gp, *gp_tmp;
652	int error;
653
654	parm->gup_error = 0;
655
656	g_topology_assert();
657
658	if (flag == EV_CANCEL)
659		return;
660
661	mtx_lock(&me.gs_mtx);
662
663	LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) {
664		if (gp->class != &g_sched_class)
665			continue;	/* Should not happen. */
666
667		sc = gp->softc;
668		if (sc->sc_gsched == gsp) {
669			error = g_sched_remove(gp, gsp);
670			if (error)
671				goto failed;
672		}
673	}
674
675	LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) {
676		if (cur != gsp)
677			continue;
678
679		if (gsp->gs_refs != 1) {
680			G_SCHED_DEBUG(0, "%s still in use.",
681			    gsp->gs_name);
682			parm->gup_error = EBUSY;
683		} else {
684			LIST_REMOVE(gsp, glist);
685			me.gs_sched_count--;
686			g_gsched_build_names(gsp);
687		}
688		break;
689	}
690
691	if (cur == NULL) {
692		G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name);
693		parm->gup_error = ENOENT;
694	}
695
696failed:
697	mtx_unlock(&me.gs_mtx);
698}
699
700static inline void
701g_gsched_global_init(void)
702{
703
704	if (!me.gs_initialized) {
705		G_SCHED_DEBUG(0, "Initializing global data.");
706		mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF);
707		LIST_INIT(&me.gs_scheds);
708		gs_bioq_init(&me.gs_pending);
709		me.gs_initialized = 1;
710	}
711}
712
713/*
714 * Module event called when a scheduling algorithm module is loaded or
715 * unloaded.
716 */
717int
718g_gsched_modevent(module_t mod, int cmd, void *arg)
719{
720	struct g_gsched *gsp = arg;
721	struct g_gsched_unregparm parm;
722	int error;
723
724	G_SCHED_DEBUG(0, "Modevent %d.", cmd);
725
726	/*
727	 * If the module is loaded at boot, the geom thread that calls
728	 * g_sched_init() might actually run after g_gsched_modevent(),
729	 * so make sure that the module is properly initialized.
730	 */
731	g_gsched_global_init();
732
733	error = EOPNOTSUPP;
734	switch (cmd) {
735	case MOD_LOAD:
736		error = g_gsched_register(gsp);
737		G_SCHED_DEBUG(0, "Loaded module %s error %d.",
738		    gsp->gs_name, error);
739		if (error == 0)
740			g_retaste(&g_sched_class);
741		break;
742
743	case MOD_UNLOAD:
744		parm.gup_gsp = gsp;
745		parm.gup_error = 0;
746
747		error = g_waitfor_event(g_gsched_unregister,
748		    &parm, M_WAITOK, NULL);
749		if (error == 0)
750			error = parm.gup_error;
751		G_SCHED_DEBUG(0, "Unloaded module %s error %d.",
752		    gsp->gs_name, error);
753		break;
	}
755
756	return (error);
757}
758
759#ifdef KTR
760#define	TRC_BIO_EVENT(e, bp)	g_sched_trace_bio_ ## e (bp)
761
762static inline char
763g_sched_type(struct bio *bp)
764{
765
766	if (0 != (bp->bio_cmd & BIO_READ))
767		return ('R');
768	else if (0 != (bp->bio_cmd & BIO_WRITE))
769		return ('W');
770	return ('U');
771}
772
773static inline void
774g_sched_trace_bio_START(struct bio *bp)
775{
776
777	CTR5(KTR_GSCHED, "S %lu %c %lu/%lu %lu", g_sched_classify(bp),
778	    g_sched_type(bp), bp->bio_offset / ULONG_MAX,
779	    bp->bio_offset, bp->bio_length);
780}
781
782static inline void
783g_sched_trace_bio_DONE(struct bio *bp)
784{
785
786	CTR5(KTR_GSCHED, "D %lu %c %lu/%lu %lu", g_sched_classify(bp),
787	    g_sched_type(bp), bp->bio_offset / ULONG_MAX,
788	    bp->bio_offset, bp->bio_length);
789}
790#else /* !KTR */
791#define	TRC_BIO_EVENT(e, bp)
792#endif /* !KTR */
793
794/*
795 * g_sched_done() and g_sched_start() dispatch the geom requests to
796 * the scheduling algorithm in use.
797 */
798static void
799g_sched_done(struct bio *bio)
800{
801	struct g_geom *gp = bio->bio_caller2;
802	struct g_sched_softc *sc = gp->softc;
803
804	TRC_BIO_EVENT(DONE, bio);
805
806	KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done"));
807
808	g_sched_lock(gp);
809
810	g_sched_update_stats(bio);
811	sc->sc_gsched->gs_done(sc->sc_data, bio);
812	if (!--sc->sc_pending)
813		wakeup(gp);
814
815	g_sched_flush_classes(gp);
816	g_sched_unlock(gp);
817
818	g_std_done(bio);
819}
820
821static void
822g_sched_start(struct bio *bp)
823{
824	struct g_geom *gp = bp->bio_to->geom;
825	struct g_sched_softc *sc = gp->softc;
826	struct bio *cbp;
827
828	TRC_BIO_EVENT(START, bp);
829	G_SCHED_LOGREQ(bp, "Request received.");
830
831	cbp = g_clone_bio(bp);
832	if (cbp == NULL) {
833		g_io_deliver(bp, ENOMEM);
834		return;
835	}
836	cbp->bio_done = g_sched_done;
837	cbp->bio_to = LIST_FIRST(&gp->provider);
838	KASSERT(cbp->bio_to != NULL, ("NULL provider"));
839
840	/* We only schedule reads and writes. */
841	if (0 == (bp->bio_cmd & (BIO_READ | BIO_WRITE)))
842		goto bypass;
843
844	G_SCHED_LOGREQ(cbp, "Sending request.");
845
846	g_sched_lock(gp);
847	/*
848	 * Call the algorithm's gs_start to queue the request in the
849	 * scheduler. If gs_start fails then pass the request down,
850	 * otherwise call g_sched_dispatch() which tries to push
851	 * one or more requests down.
852	 */
853	if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) ||
854	    sc->sc_gsched->gs_start(sc->sc_data, cbp)) {
855		g_sched_unlock(gp);
856		goto bypass;
857	}
858	/*
859	 * We use bio_caller1 to mark requests that are scheduled
860	 * so make sure it is not NULL.
861	 */
862	if (cbp->bio_caller1 == NULL)
863		cbp->bio_caller1 = &me;	/* anything not NULL */
864
865	cbp->bio_caller2 = gp;
866	sc->sc_pending++;
867
868	/* Update general stats. */
869	me.gs_in_flight++;
870	me.gs_requests++;
871	me.gs_bytes_in_flight += bp->bio_length;
872	if (bp->bio_cmd & BIO_WRITE) {
873		me.gs_writes_in_flight++;
874		me.gs_write_bytes_in_flight += bp->bio_length;
875	}
876	g_sched_dispatch(gp);
877	g_sched_unlock(gp);
878	return;
879
880bypass:
881	cbp->bio_done = g_std_done;
882	cbp->bio_caller1 = NULL; /* not scheduled */
883	g_io_request(cbp, LIST_FIRST(&gp->consumer));
884}
885
886/*
887 * The next few functions are the geom glue.
888 */
889static void
890g_sched_orphan(struct g_consumer *cp)
891{
892
893	g_topology_assert();
894	g_sched_destroy(cp->geom, 1);
895}
896
897static int
898g_sched_access(struct g_provider *pp, int dr, int dw, int de)
899{
900	struct g_geom *gp;
901	struct g_consumer *cp;
902	int error;
903
904	gp = pp->geom;
905	cp = LIST_FIRST(&gp->consumer);
906	error = g_access(cp, dr, dw, de);
907
908	return (error);
909}
910
911static void
912g_sched_temporary_start(struct bio *bio)
913{
914
915	mtx_lock(&me.gs_mtx);
916	me.gs_npending++;
917	gs_bioq_disksort(&me.gs_pending, bio);
918	mtx_unlock(&me.gs_mtx);
919}
920
921static void
922g_sched_flush_pending(g_start_t *start)
923{
924	struct bio *bp;
925
926	while ((bp = gs_bioq_takefirst(&me.gs_pending)))
927		start(bp);
928}
929
930static int
931g_insert_proxy(struct g_geom *gp, struct g_provider *newpp,
932    struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp)
933{
934	struct g_sched_softc *sc = gp->softc;
935	g_start_t *saved_start, *flush = g_sched_start;
936	int error = 0, endticks = ticks + hz;
937
938	g_cancel_event(newpp);	/* prevent taste() */
939	/* copy private fields */
940	newpp->private = pp->private;
941	newpp->index = pp->index;
942
943	/* Queue all the early requests coming for us. */
944	me.gs_npending = 0;
945	saved_start = pp->geom->start;
946	dstgp->start = g_sched_temporary_start;
947
948	while (pp->nstart - pp->nend != me.gs_npending &&
949	    endticks - ticks >= 0)
950		tsleep(pp, PRIBIO, "-", hz/10);
951
952	if (pp->nstart - pp->nend != me.gs_npending) {
953		flush = saved_start;
954		error = ETIMEDOUT;
955		goto fail;
956	}
957
958	/* link pp to this geom */
959	LIST_REMOVE(pp, provider);
960	pp->geom = gp;
961	LIST_INSERT_HEAD(&gp->provider, pp, provider);
962
963	/*
964	 * replicate the counts from the parent in the
965	 * new provider and consumer nodes
966	 */
967	cp->acr = newpp->acr = pp->acr;
968	cp->acw = newpp->acw = pp->acw;
969	cp->ace = newpp->ace = pp->ace;
970	sc->sc_flags |= G_SCHED_PROXYING;
971
972fail:
973	dstgp->start = saved_start;
974
975	g_sched_flush_pending(flush);
976
977	return (error);
978}
979
980/*
981 * Create a geom node for the device passed as *pp.
982 * If successful, add a reference to this gsp.
983 */
984static int
985g_sched_create(struct gctl_req *req, struct g_class *mp,
986    struct g_provider *pp, struct g_gsched *gsp, int proxy)
987{
988	struct g_sched_softc *sc = NULL;
989	struct g_geom *gp, *dstgp;
990	struct g_provider *newpp = NULL;
991	struct g_consumer *cp = NULL;
992	char name[64];
993	int error;
994
995	g_topology_assert();
996
997	snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX);
998	LIST_FOREACH(gp, &mp->geom, geom) {
999		if (strcmp(gp->name, name) == 0) {
1000			gctl_error(req, "Geom %s already exists.",
1001			    name);
1002			return (EEXIST);
1003		}
1004	}
1005
1006	gp = g_new_geomf(mp, "%s", name);
1007	dstgp = proxy ? pp->geom : gp; /* where do we link the provider */
1008
1009	sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
1010	sc->sc_gsched = gsp;
1011	sc->sc_data = gsp->gs_init(gp);
1012	if (sc->sc_data == NULL) {
1013		error = ENOMEM;
1014		goto fail;
1015	}
1016
1017	sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK);
1018
	/*
	 * Do not initialize the flush mechanism; it will be initialized
	 * on the first insertion into the hash table.
	 */
1023
1024	mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF);
1025
1026	gp->softc = sc;
1027	gp->start = g_sched_start;
1028	gp->orphan = g_sched_orphan;
1029	gp->access = g_sched_access;
1030	gp->dumpconf = g_sched_dumpconf;
1031
1032	newpp = g_new_providerf(dstgp, "%s", gp->name);
1033	newpp->mediasize = pp->mediasize;
1034	newpp->sectorsize = pp->sectorsize;
1035
1036	cp = g_new_consumer(gp);
1037	error = g_attach(cp, proxy ? newpp : pp);
1038	if (error != 0) {
1039		gctl_error(req, "Cannot attach to provider %s.",
1040		    pp->name);
1041		goto fail;
1042	}
1043
1044	g_error_provider(newpp, 0);
1045	if (proxy) {
1046		error = g_insert_proxy(gp, newpp, dstgp, pp, cp);
1047		if (error)
1048			goto fail;
1049	}
1050	G_SCHED_DEBUG(0, "Device %s created.", gp->name);
1051
1052	g_gsched_ref(gsp);
1053
1054	return (0);
1055
1056fail:
1057	if (cp != NULL) {
1058		if (cp->provider != NULL)
1059			g_detach(cp);
1060		g_destroy_consumer(cp);
1061	}
1062	if (newpp != NULL)
1063		g_destroy_provider(newpp);
1064	if (sc->sc_hash)
1065		g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
1066		    gsp, sc->sc_data);
1067	if (sc->sc_data)
1068		gsp->gs_fini(sc->sc_data);
1069	g_free(gp->softc);
1070	g_destroy_geom(gp);
1071
1072	return (error);
1073}
1074
1075/*
1076 * Support for dynamic switching of scheduling algorithms.
1077 * First initialize the data structures for the new algorithm,
1078 * then call g_sched_remove_locked() to flush all references
1079 * to the old one, finally link the new algorithm.
1080 */
1081static int
1082g_sched_change_algo(struct gctl_req *req, struct g_class *mp,
1083    struct g_provider *pp, struct g_gsched *gsp)
1084{
1085	struct g_sched_softc *sc;
1086	struct g_geom *gp;
1087	struct g_hash *newh;
1088	void *data;
1089	u_long mask;
1090	int error = 0;
1091
1092	gp = pp->geom;
1093	sc = gp->softc;
1094
1095	data = gsp->gs_init(gp);
1096	if (data == NULL)
1097		return (ENOMEM);
1098
1099	newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK);
1100	if (gsp->gs_priv_size && !newh) {
1101		error = ENOMEM;
1102		goto fail;
1103	}
1104
1105	g_sched_lock(gp);
1106	if (sc->sc_gsched) {	/* can be NULL in some cases */
1107		error = g_sched_remove_locked(gp, sc->sc_gsched);
1108		if (error)
1109			goto fail;
1110	}
1111
1112	g_gsched_ref(gsp);
1113	sc->sc_gsched = gsp;
1114	sc->sc_data = data;
1115	sc->sc_hash = newh;
1116	sc->sc_mask = mask;
1117
1118	g_sched_unlock(gp);
1119
1120	return (0);
1121
1122fail:
1123	if (newh)
1124		g_sched_hash_fini(gp, newh, mask, gsp, data);
1125
1126	if (data)
1127		gsp->gs_fini(data);
1128
1129	g_sched_unlock(gp);
1130
1131	return (error);
1132}
1133
1134/*
1135 * Stop the request flow directed to the proxy, redirecting the new
1136 * requests to the me.gs_pending queue.
1137 */
1138static struct g_provider *
1139g_detach_proxy(struct g_geom *gp)
1140{
1141	struct g_consumer *cp;
1142	struct g_provider *pp, *newpp;
1143
1144	do {
1145		pp = LIST_FIRST(&gp->provider);
1146		if (pp == NULL)
1147			break;
1148		cp = LIST_FIRST(&gp->consumer);
1149		if (cp == NULL)
1150			break;
1151		newpp = cp->provider;
1152		if (newpp == NULL)
1153			break;
1154
1155		me.gs_npending = 0;
1156		pp->geom->start = g_sched_temporary_start;
1157
1158		return (pp);
1159	} while (0);
1160	printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name);
1161
1162	return (NULL);
1163}
1164
1165static void
1166g_sched_blackhole(struct bio *bp)
1167{
1168
1169	g_io_deliver(bp, ENXIO);
1170}
1171
1172static inline void
1173g_reparent_provider(struct g_provider *pp, struct g_geom *gp,
1174    struct g_provider *newpp)
1175{
1176
1177	LIST_REMOVE(pp, provider);
1178	if (newpp) {
1179		pp->private = newpp->private;
1180		pp->index = newpp->index;
1181	}
1182	pp->geom = gp;
1183	LIST_INSERT_HEAD(&gp->provider, pp, provider);
1184}
1185
1186static inline void
1187g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp)
1188{
1189	struct g_geom *gp = oldpp->geom;
1190
1191	g_reparent_provider(oldpp, newpp->geom, newpp);
1192
1193	/*
1194	 * Hackish: let the system destroy the old provider for us, just
1195	 * in case someone attached a consumer to it, in which case a
1196	 * direct call to g_destroy_provider() would not work.
1197	 */
1198	g_reparent_provider(newpp, gp, NULL);
1199}
1200
1201/*
1202 * Complete the proxy destruction, linking the old provider to its
1203 * original geom, and destroying the proxy provider.  Also take care
1204 * of issuing the pending requests collected in me.gs_pending (if any).
1205 */
1206static int
1207g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp)
1208{
1209	struct g_consumer *cp;
1210	struct g_provider *newpp;
1211
1212	do {
1213		cp = LIST_FIRST(&gp->consumer);
1214		if (cp == NULL)
1215			break;
1216		newpp = cp->provider;
1217		if (newpp == NULL)
1218			break;
1219
1220		/* Relink the provider to its original geom. */
1221		g_unproxy_provider(oldpp, newpp);
1222
1223		/* Detach consumer from provider, and destroy provider. */
1224		cp->acr = newpp->acr = 0;
1225		cp->acw = newpp->acw = 0;
1226		cp->ace = newpp->ace = 0;
1227		g_detach(cp);
1228
1229		/* Send the pending bios through the right start function. */
1230		g_sched_flush_pending(oldpp->geom->start);
1231
1232		return (0);
1233	} while (0);
1234	printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name);
1235
1236	/* We cannot send the pending bios anywhere... */
1237	g_sched_flush_pending(g_sched_blackhole);
1238
1239	return (EINVAL);
1240}
1241
1242static int
1243g_sched_destroy(struct g_geom *gp, boolean_t force)
1244{
1245	struct g_provider *pp, *oldpp = NULL;
1246	struct g_sched_softc *sc;
1247	struct g_gsched *gsp;
1248	int error;
1249
1250	g_topology_assert();
1251	sc = gp->softc;
1252	if (sc == NULL)
1253		return (ENXIO);
1254	if (!(sc->sc_flags & G_SCHED_PROXYING)) {
1255		pp = LIST_FIRST(&gp->provider);
1256		if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
1257			const char *msg = force ?
1258				"but we force removal" : "cannot remove";
1259
1260			G_SCHED_DEBUG(!force,
1261			    "Device %s is still open (r%dw%de%d), %s.",
1262			    pp->name, pp->acr, pp->acw, pp->ace, msg);
1263			if (!force)
1264				return (EBUSY);
1265		} else {
1266			G_SCHED_DEBUG(0, "Device %s removed.", gp->name);
1267		}
1268	} else
1269		oldpp = g_detach_proxy(gp);
1270
1271	gsp = sc->sc_gsched;
1272	if (gsp) {
1273		/*
1274		 * XXX bad hack here: force a dispatch to release
1275		 * any reference to the hash table still held by
1276		 * the scheduler.
1277		 */
1278		g_sched_lock(gp);
		/*
		 * We are dying here, no new requests should enter
		 * the scheduler.  This is guaranteed by the topology,
		 * either in case we were proxying (new bios are
		 * being redirected) or not (see the access check
		 * above).
		 */
1286		g_sched_forced_dispatch(gp);
1287		error = g_sched_wait_pending(gp);
1288
1289		if (error) {
			/*
			 * Not all the requests came home: this might happen
			 * under heavy load, or if we were waiting for any
			 * bio which is served in the event path (see
			 * geom_slice.c for an example of how this can
			 * happen).  Try to restore a working configuration
			 * if we are allowed to fail.
			 */
1298			if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
1299				g_sched_flush_pending(force ?
1300				    g_sched_blackhole : g_sched_start);
1301			}
1302
1303			/*
1304			 * In the forced destroy case there is not so much
1305			 * we can do, we have pending bios that will call
1306			 * g_sched_done() somehow, and we don't want them
1307			 * to crash the system using freed memory.  We tell
1308			 * the user that something went wrong, and leak some
1309			 * memory here.
1310			 * Note: the callers using force = 1 ignore the
1311			 * return value.
1312			 */
1313			if (force) {
				G_SCHED_DEBUG(0, "Pending requests while "
				    "destroying geom, some memory leaked.");
1316			}
1317
1318			return (error);
1319		}
1320
1321		g_sched_unlock(gp);
1322		g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
1323		    gsp, sc->sc_data);
1324		sc->sc_hash = NULL;
1325		gsp->gs_fini(sc->sc_data);
1326		g_gsched_unref(gsp);
1327		sc->sc_gsched = NULL;
1328	}
1329
1330	if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
1331		error = g_destroy_proxy(gp, oldpp);
1332
1333		if (error) {
1334			if (force) {
				G_SCHED_DEBUG(0, "Unrecoverable error while "
				    "destroying a proxy geom, leaking some "
				    "memory.");
1338			}
1339
1340			return (error);
1341		}
1342	}
1343
1344	mtx_destroy(&sc->sc_mtx);
1345
1346	g_free(gp->softc);
1347	gp->softc = NULL;
1348	g_wither_geom(gp, ENXIO);
1349
1350	return (error);
1351}
1352
1353static int
1354g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp,
1355    struct g_geom *gp)
1356{
1357
1358	return (g_sched_destroy(gp, 0));
1359}
1360
1361/*
1362 * Functions related to the classification of requests.
1363 *
1364 * On recent FreeBSD versions (8.0 and above), we store a reference
1365 * to the issuer of a request in bp->bio_classifier1 as soon
1366 * as the bio is posted to the geom queue (and not later, because
1367 * requests are managed by the g_down thread afterwards).
1368 *
1369 * On older versions of the system (but this code is not used
1370 * in any existing release), we [ab]use the caller1 field in the
1371 * root element of the bio tree to store the classification info.
1372 * The marking is done at the beginning of g_io_request()
1373 * and only if we find that the field is NULL.
1374 *
1375 * To avoid rebuilding the kernel, this module will patch the
1376 * initial part of g_io_request() so it jumps to some hand-coded
1377 * assembly that does the marking and then executes the original
1378 * body of g_io_request().
1379 *
1380 * fake_ioreq[] is architecture-specific machine code
1381 * that implements the above. CODE_SIZE, STORE_SIZE etc.
1382 * are constants used in the patching routine. Look at the
1383 * code in g_ioreq_patch() for the details.
1384 */
1385
1386#ifndef HAVE_BIO_CLASSIFIER
1387/*
1388 * Support for old FreeBSD versions
1389 */
1390#if defined(__i386__)
1391#define	CODE_SIZE	29
1392#define	STORE_SIZE	5
1393#define	EPILOGUE	5
1394#define	SIZE		(CODE_SIZE + STORE_SIZE + EPILOGUE)
1395
1396static u_char fake_ioreq[SIZE] = {
1397	0x8b, 0x44, 0x24, 0x04,		/* mov bp, %eax */
1398	/* 1: */
1399	0x89, 0xc2,			/* mov %eax, %edx # edx = bp */
1400	0x8b, 0x40, 0x64,		/* mov bp->bio_parent, %eax */
1401	0x85, 0xc0,			/* test %eax, %eax */
1402	0x75, 0xf7,			/* jne 1b */
1403	0x8b, 0x42, 0x30,		/* mov bp->bp_caller1, %eax */
1404	0x85, 0xc0,			/* test %eax, %eax */
1405	0x75, 0x09,			/* jne 2f */
1406	0x64, 0xa1, 0x00, 0x00,		/* mov %fs:0, %eax */
1407	0x00, 0x00,
1408	0x89, 0x42, 0x30,		/* mov %eax, bp->bio_caller1 */
1409	/* 2: */
1410        0x55, 0x89, 0xe5, 0x57, 0x56,
1411	0xe9, 0x00, 0x00, 0x00, 0x00,	/* jmp back... */
1412};
1413#elif defined(__amd64)
1414#define	CODE_SIZE	38
1415#define	STORE_SIZE	6
1416#define	EPILOGUE	5
1417#define	SIZE		(CODE_SIZE + STORE_SIZE + EPILOGUE)
1418
1419static u_char fake_ioreq[SIZE] = {
1420	0x48, 0x89, 0xf8,		/* mov bp, %rax */
1421	/* 1: */
1422	0x48, 0x89, 0xc2,		/* mov %rax, %rdx # rdx = bp */
1423	0x48, 0x8b, 0x82, 0xa8,		/* mov bp->bio_parent, %rax */
1424	0x00, 0x00, 0x00,
1425	0x48, 0x85, 0xc0,		/* test %rax, %rax */
1426	0x75, 0xf1,			/* jne 1b */
1427	0x48, 0x83, 0x7a, 0x58,		/* cmp $0, bp->bp_caller1 */
1428	0x00,
1429	0x75, 0x0d,			/* jne 2f */
1430	0x65, 0x48, 0x8b, 0x04,		/* mov %gs:0, %rax */
1431	0x25, 0x00, 0x00, 0x00,
1432	0x00,
1433	0x48, 0x89, 0x42, 0x58,		/* mov %rax, bp->bio_caller1 */
1434	/* 2: */
1435	0x55, 0x48, 0x89, 0xe5, 0x41, 0x56,
1436	0xe9, 0x00, 0x00, 0x00, 0x00,	/* jmp back... */
1437};
1438#else /* neither x86 nor amd64 */
1439static void
1440g_new_io_request(struct bio *bp, struct g_consumer *cp)
1441{
1442	struct bio *top = bp;
1443
1444        /*
1445         * bio classification: if bio_caller1 is available in the
1446         * root of the 'struct bio' tree, store there the thread id
1447         * of the thread that originated the request.
1448         * More sophisticated classification schemes can be used.
1449         */
1450	while (top->bio_parent)
1451		top = top->bio_parent;
1452
1453	if (top->bio_caller1 == NULL)
1454		top->bio_caller1 = curthread;
1455}
1456
1457#error please add the code above in g_new_io_request() to the beginning of \
1458	/sys/geom/geom_io.c::g_io_request(), and remove this line.
1459#endif /* end of arch-specific code */
1460
1461static int
1462g_ioreq_patch(void)
1463{
1464	u_char *original;
1465	u_long ofs;
1466	int found;
1467
1468	if (me.gs_patched)
1469		return (-1);
1470
1471	original = (u_char *)g_io_request;
1472
1473	found = !bcmp(original, fake_ioreq + CODE_SIZE, STORE_SIZE);
1474	if (!found)
1475		return (-1);
1476
1477	/* Jump back to the original + STORE_SIZE. */
1478	ofs = (original + STORE_SIZE) - (fake_ioreq + SIZE);
1479	bcopy(&ofs, fake_ioreq + CODE_SIZE + STORE_SIZE + 1, 4);
1480
1481	/* Patch the original address with a jump to the trampoline. */
1482	*original = 0xe9;     /* jump opcode */
1483	ofs = fake_ioreq - (original + 5);
1484	bcopy(&ofs, original + 1, 4);
1485
1486	me.gs_patched = 1;
1487
1488	return (0);
1489}
1490
1491/*
1492 * Restore the original code, this is easy.
1493 */
1494static void
1495g_ioreq_restore(void)
1496{
1497	u_char *original;
1498
1499	if (me.gs_patched) {
1500		original = (u_char *)g_io_request;
1501		bcopy(fake_ioreq + CODE_SIZE, original, STORE_SIZE);
1502		me.gs_patched = 0;
1503	}
1504}
1505
1506static inline void
1507g_classifier_ini(void)
1508{
1509
1510	g_ioreq_patch();
1511}
1512
1513static inline void
1514g_classifier_fini(void)
1515{
1516
1517	g_ioreq_restore();
1518}
1519
1520/*--- end of support code for older FreeBSD versions */
1521
1522#else /* HAVE_BIO_CLASSIFIER */
1523
/*
 * Classifier support for recent FreeBSD versions: we use
 * a very simple classifier that only uses curthread to tag a request.
 * The classifier is registered at module load, and unregistered
 * at module unload.
 */
1530static int
1531g_sched_tag(void *arg, struct bio *bp)
1532{
1533
1534	bp->bio_classifier1 = curthread;
1535	return (1);
1536}
1537
1538static struct g_classifier_hook g_sched_classifier = {
1539	.func =	g_sched_tag,
1540};
1541
1542static inline void
1543g_classifier_ini(void)
1544{
1545
1546	g_register_classifier(&g_sched_classifier);
1547}
1548
1549static inline void
1550g_classifier_fini(void)
1551{
1552
1553	g_unregister_classifier(&g_sched_classifier);
1554}
1555#endif /* HAVE_BIO_CLASSIFIER */
1556
1557static void
1558g_sched_init(struct g_class *mp)
1559{
1560
1561	g_gsched_global_init();
1562
1563	G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.",
1564	    mp, &g_sched_class);
1565
1566	/* Patch g_io_request to store classification info in the bio. */
1567	g_classifier_ini();
1568}
1569
1570static void
1571g_sched_fini(struct g_class *mp)
1572{
1573
1574	g_classifier_fini();
1575
1576	G_SCHED_DEBUG(0, "Unloading...");
1577
1578	KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers"));
1579	mtx_destroy(&me.gs_mtx);
1580}
1581
1582static int
1583g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag,
1584    struct thread *td)
1585{
1586	struct g_consumer *cp;
1587	struct g_geom *gp;
1588
1589	cp = LIST_FIRST(&pp->geom->consumer);
1590	if (cp == NULL)
1591		return (ENOIOCTL);
1592	gp = cp->provider->geom;
1593	if (gp->ioctl == NULL)
1594		return (ENOIOCTL);
1595	return (gp->ioctl(cp->provider, cmd, data, fflag, td));
1596}
1597
1598/*
1599 * Read the i-th argument for a request, skipping the /dev/
1600 * prefix if present.
1601 */
1602static const char *
1603g_sched_argi(struct gctl_req *req, int i)
1604{
1605	static const char *dev_prefix = "/dev/";
1606	const char *name;
1607	char param[16];
1608	int l = strlen(dev_prefix);
1609
1610	snprintf(param, sizeof(param), "arg%d", i);
1611	name = gctl_get_asciiparam(req, param);
1612	if (name == NULL)
1613		gctl_error(req, "No 'arg%d' argument", i);
1614	else if (strncmp(name, dev_prefix, l) == 0)
1615		name += l;
1616	return (name);
1617}
1618
1619/*
1620 * Fetch nargs and do appropriate checks.
1621 */
1622static int
1623g_sched_get_nargs(struct gctl_req *req)
1624{
1625	int *nargs;
1626
1627	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
1628	if (nargs == NULL) {
1629		gctl_error(req, "No 'nargs' argument");
1630		return (0);
1631	}
1632	if (*nargs <= 0)
1633		gctl_error(req, "Missing device(s).");
1634	return (*nargs);
1635}
1636
1637/*
1638 * Check whether we should add the class on certain volumes when
1639 * this geom is created. Right now this is under control of a kenv
1640 * variable containing the names of all devices that we care about.
1641 * Probably we should only support transparent insertion as the
1642 * preferred mode of operation.
1643 */
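/*
 * Example settings (illustrative; the device names are placeholders),
 * set in loader.conf(5) or with kenv(1) before the module loads:
 *
 *	geom.sched.taste="ada0 ada1"	# providers we want to attach to
 *	geom.sched.algo="rr"		# algorithm to use (default "rr")
 */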
1644static struct g_geom *
1645g_sched_taste(struct g_class *mp, struct g_provider *pp,
1646		int flags __unused)
1647{
	struct g_gsched *gsp = NULL;	/* the scheduling algorithm we want */
1649	const char *s;			/* generic string pointer */
1650	const char *taste_names;	/* devices we like */
1651	int l;
1652
1653        g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__,
1654	    mp->name, pp->name);
1655        g_topology_assert();
1656
1657        G_SCHED_DEBUG(2, "Tasting %s.", pp->name);
1658
1659	do {
1660		/* do not taste on ourselves */
1661		if (pp->geom->class == mp)
1662                	break;
1663
1664		taste_names = getenv("geom.sched.taste");
1665		if (taste_names == NULL)
1666			break;
1667
1668		l = strlen(pp->name);
1669		for (s = taste_names; *s &&
1670		    (s = strstr(s, pp->name)); s++) {
1671			/* further checks for an exact match */
1672			if ( (s == taste_names || s[-1] == ' ') &&
1673			     (s[l] == '\0' || s[l] == ' ') )
1674				break;
1675		}
1676		if (s == NULL)
1677			break;
1678		G_SCHED_DEBUG(0, "Attach device %s match [%s]\n",
1679		    pp->name, s);
1680
1681		/* look up the provider name in the list */
1682		s = getenv("geom.sched.algo");
1683		if (s == NULL)
1684			s = "rr";
1685
1686		gsp = g_gsched_find(s);	/* also get a reference */
1687		if (gsp == NULL) {
1688			G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s);
1689			break;
1690		}
1691
1692		/* XXX create with 1 as last argument ? */
1693		g_sched_create(NULL, mp, pp, gsp, 0);
1694		g_gsched_unref(gsp);
1695	} while (0);
	return (NULL);
1697}
1698
1699static void
1700g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy)
1701{
1702	struct g_provider *pp;
1703	struct g_gsched *gsp;
1704	const char *name;
1705	int i, nargs;
1706
1707	g_topology_assert();
1708
1709	name = gctl_get_asciiparam(req, "algo");
1710	if (name == NULL) {
1711		gctl_error(req, "No '%s' argument", "algo");
1712		return;
1713	}
1714
1715	gsp = g_gsched_find(name);	/* also get a reference */
1716	if (gsp == NULL) {
1717		gctl_error(req, "Bad algorithm '%s'", name);
1718		return;
1719	}
1720
1721	nargs = g_sched_get_nargs(req);
1722
1723	/*
1724	 * Run on the arguments, and break on any error.
1725	 * We look for a device name, but skip the /dev/ prefix if any.
1726	 */
1727	for (i = 0; i < nargs; i++) {
1728		name = g_sched_argi(req, i);
1729		if (name == NULL)
1730			break;
1731		pp = g_provider_by_name(name);
1732		if (pp == NULL) {
1733			G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1734			gctl_error(req, "Provider %s is invalid.", name);
1735			break;
1736		}
1737		if (g_sched_create(req, mp, pp, gsp, proxy) != 0)
1738			break;
1739	}
1740
1741	g_gsched_unref(gsp);
1742}
1743
1744static void
1745g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp)
1746{
1747	struct g_provider *pp;
1748	struct g_gsched *gsp;
1749	const char *name;
1750	int i, nargs;
1751
1752	g_topology_assert();
1753
1754	name = gctl_get_asciiparam(req, "algo");
1755	if (name == NULL) {
1756		gctl_error(req, "No '%s' argument", "algo");
1757		return;
1758	}
1759
1760	gsp = g_gsched_find(name);	/* also get a reference */
1761	if (gsp == NULL) {
1762		gctl_error(req, "Bad algorithm '%s'", name);
1763		return;
1764	}
1765
1766	nargs = g_sched_get_nargs(req);
1767
1768	/*
1769	 * Run on the arguments, and break on any error.
1770	 * We look for a device name, but skip the /dev/ prefix if any.
1771	 */
1772	for (i = 0; i < nargs; i++) {
1773		name = g_sched_argi(req, i);
1774		if (name == NULL)
1775			break;
1776		pp = g_provider_by_name(name);
1777		if (pp == NULL || pp->geom->class != mp) {
1778			G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1779			gctl_error(req, "Provider %s is invalid.", name);
1780			break;
1781		}
1782		if (g_sched_change_algo(req, mp, pp, gsp) != 0)
1783			break;
1784	}
1785
1786	g_gsched_unref(gsp);
1787}
1788
1789static struct g_geom *
1790g_sched_find_geom(struct g_class *mp, const char *name)
1791{
1792	struct g_geom *gp;
1793
1794	LIST_FOREACH(gp, &mp->geom, geom) {
1795		if (strcmp(gp->name, name) == 0)
1796			return (gp);
1797	}
1798	return (NULL);
1799}
1800
1801static void
1802g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp)
1803{
1804	int nargs, *force, error, i;
1805	struct g_geom *gp;
1806	const char *name;
1807
1808	g_topology_assert();
1809
1810	nargs = g_sched_get_nargs(req);
1811
1812	force = gctl_get_paraml(req, "force", sizeof(*force));
1813	if (force == NULL) {
1814		gctl_error(req, "No 'force' argument");
1815		return;
1816	}
1817
1818	for (i = 0; i < nargs; i++) {
1819		name = g_sched_argi(req, i);
1820		if (name == NULL)
1821			break;
1822
1823		gp = g_sched_find_geom(mp, name);
1824		if (gp == NULL) {
1825			G_SCHED_DEBUG(1, "Device %s is invalid.", name);
1826			gctl_error(req, "Device %s is invalid.", name);
1827			break;
1828		}
1829
1830		error = g_sched_destroy(gp, *force);
1831		if (error != 0) {
1832			gctl_error(req, "Cannot destroy device %s (error=%d).",
1833			    gp->name, error);
1834			break;
1835		}
1836	}
1837}
1838
1839static void
1840g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb)
1841{
1842	uint32_t *version;
1843
1844	g_topology_assert();
1845
1846	version = gctl_get_paraml(req, "version", sizeof(*version));
1847	if (version == NULL) {
1848		gctl_error(req, "No '%s' argument.", "version");
1849		return;
1850	}
1851
1852	if (*version != G_SCHED_VERSION) {
1853		gctl_error(req, "Userland and kernel parts are "
1854		    "out of sync.");
1855		return;
1856	}
1857
1858	if (strcmp(verb, "create") == 0) {
1859		g_sched_ctl_create(req, mp, 0);
1860		return;
1861	} else if (strcmp(verb, "insert") == 0) {
1862		g_sched_ctl_create(req, mp, 1);
1863		return;
1864	} else if (strcmp(verb, "configure") == 0) {
1865		g_sched_ctl_configure(req, mp);
1866		return;
1867	} else if (strcmp(verb, "destroy") == 0) {
1868		g_sched_ctl_destroy(req, mp);
1869		return;
1870	}
1871
1872	gctl_error(req, "Unknown verb.");
1873}
1874
1875static void
1876g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
1877    struct g_consumer *cp, struct g_provider *pp)
1878{
1879	struct g_sched_softc *sc = gp->softc;
1880	struct g_gsched *gsp = sc->sc_gsched;
1881	if (indent == NULL) {	/* plaintext */
1882		sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--");
1883	}
1884	if (gsp != NULL && gsp->gs_dumpconf)
1885		gsp->gs_dumpconf(sb, indent, gp, cp, pp);
1886}
1887
1888DECLARE_GEOM_CLASS(g_sched_class, g_sched);
1889MODULE_VERSION(geom_sched, 0);
1890