1206497Sluigi/*-
2206552Sluigi * Copyright (c) 2009-2010 Fabio Checconi
3206552Sluigi * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
4206497Sluigi * All rights reserved.
5206497Sluigi *
6206497Sluigi * Redistribution and use in source and binary forms, with or without
7206497Sluigi * modification, are permitted provided that the following conditions
8206497Sluigi * are met:
9206497Sluigi * 1. Redistributions of source code must retain the above copyright
10206497Sluigi *    notice, this list of conditions and the following disclaimer.
11206497Sluigi * 2. Redistributions in binary form must reproduce the above copyright
12206497Sluigi *    notice, this list of conditions and the following disclaimer in the
13206497Sluigi *    documentation and/or other materials provided with the distribution.
14206497Sluigi *
15206497Sluigi * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
16206497Sluigi * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17206497Sluigi * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18206497Sluigi * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
19206497Sluigi * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20206497Sluigi * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21206497Sluigi * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22206497Sluigi * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23206497Sluigi * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24206497Sluigi * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25206497Sluigi * SUCH DAMAGE.
26206497Sluigi */
27206497Sluigi
28206497Sluigi/*
29206497Sluigi * $Id$
30206497Sluigi * $FreeBSD$
31206497Sluigi *
32206497Sluigi * Main control module for geom-based disk schedulers ('sched').
33206497Sluigi *
34206497Sluigi * USER VIEW
35206497Sluigi * A 'sched' node is typically inserted transparently between
36206497Sluigi * an existing provider pp and its original geom gp
37206497Sluigi *
38206497Sluigi *	[pp --> gp  ..]
39206497Sluigi *
40206497Sluigi * using the command "geom sched insert <provider>" and
41206497Sluigi * resulting in the following topology
42206497Sluigi *
43206497Sluigi *	[pp --> sched_gp --> cp]   [new_pp --> gp ... ]
44206497Sluigi *
45206497Sluigi * Deletion "geom sched destroy <provider>.sched." restores the
 * original chain. The normal "geom sched create <provider>"
47206497Sluigi * is also supported.
48206497Sluigi *
49206497Sluigi * INTERNALS
50206497Sluigi * Internally, the 'sched' uses the following data structures
51206497Sluigi *
52206497Sluigi *   geom{}         g_sched_softc{}      g_gsched{}
53206497Sluigi * +----------+    +---------------+   +-------------+
54206497Sluigi * |  softc *-|--->| sc_gsched   *-|-->|  gs_init    |
55206497Sluigi * |  ...     |    |               |   |  gs_fini    |
56206497Sluigi * |          |    | [ hash table] |   |  gs_start   |
57206497Sluigi * +----------+    |               |   |  ...        |
58206497Sluigi *                 |               |   +-------------+
59206497Sluigi *                 |               |
60206497Sluigi *                 |               |     g_*_softc{}
61206497Sluigi *                 |               |   +-------------+
62206497Sluigi *                 | sc_data     *-|-->|             |
63206497Sluigi *                 +---------------+   |  algorithm- |
64206497Sluigi *                                     |  specific   |
65206497Sluigi *                                     +-------------+
66206497Sluigi *
67206497Sluigi * A g_sched_softc{} is created with a "geom sched insert" call.
68206497Sluigi * In turn this instantiates a specific scheduling algorithm,
69206497Sluigi * which sets sc_gsched to point to the algorithm callbacks,
70206497Sluigi * and calls gs_init() to create the g_*_softc{} .
71206497Sluigi * The other callbacks (gs_start, gs_next, ...) are invoked
72206497Sluigi * as needed
73206497Sluigi *
74206497Sluigi * g_sched_softc{} is defined in g_sched.h and mostly used here;
75206497Sluigi * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h;
76206497Sluigi * g_*_softc{} is defined/implemented by each algorithm (gs_*.c)
77206497Sluigi *
78206497Sluigi * DATA MOVING
79206497Sluigi * When a bio is received on the provider, it goes to the
80206497Sluigi * g_sched_start() which calls gs_start() to initially queue it;
81206497Sluigi * then we call g_sched_dispatch() that loops around gs_next()
82206497Sluigi * to select zero or more bio's to be sent downstream.
83206497Sluigi *
84206497Sluigi * g_sched_dispatch() can also be called as a result of a timeout,
85206497Sluigi * e.g. when doing anticipation or pacing requests.
86206497Sluigi *
87206497Sluigi * When a bio comes back, it goes to g_sched_done() which in turn
88206497Sluigi * calls gs_done(). The latter does any necessary housekeeping in
89206497Sluigi * the scheduling algorithm, and may decide to call g_sched_dispatch()
90206497Sluigi * to send more bio's downstream.
91206497Sluigi *
92206497Sluigi * If an algorithm needs per-flow queues, these are created
93206497Sluigi * calling gs_init_class() and destroyed with gs_fini_class(),
94206497Sluigi * and they are also inserted in the hash table implemented in
95206497Sluigi * the g_sched_softc{}
96206497Sluigi *
97206497Sluigi * If an algorithm is replaced, or a transparently-inserted node is
98206497Sluigi * removed with "geom sched destroy", we need to remove all references
99206497Sluigi * to the g_*_softc{} and g_sched_softc from the bio's still in
100206497Sluigi * the scheduler. g_sched_forced_dispatch() helps doing this.
101206497Sluigi * XXX need to explain better.
102206497Sluigi */
103206497Sluigi
104206497Sluigi#include <sys/cdefs.h>
105206497Sluigi#include <sys/param.h>
106206497Sluigi#include <sys/systm.h>
107206497Sluigi#include <sys/kernel.h>
108206497Sluigi#include <sys/module.h>
109206497Sluigi#include <sys/lock.h>
110206497Sluigi#include <sys/mutex.h>
111206497Sluigi#include <sys/bio.h>
112206497Sluigi#include <sys/limits.h>
113206497Sluigi#include <sys/hash.h>
114223921Sae#include <sys/sbuf.h>
115206497Sluigi#include <sys/sysctl.h>
116206497Sluigi#include <sys/malloc.h>
117206497Sluigi#include <sys/proc.h>		/* we access curthread */
118206497Sluigi#include <geom/geom.h>
119206497Sluigi#include "gs_scheduler.h"
120206497Sluigi#include "g_sched.h"		/* geom hooks */
121206497Sluigi
122206497Sluigi/*
123206497Sluigi * Size of the per-geom hash table storing traffic classes.
124206497Sluigi * We may decide to change it at a later time, it has no ABI
125206497Sluigi * implications as it is only used for run-time allocations.
126206497Sluigi */
127206497Sluigi#define G_SCHED_HASH_SIZE	32
128206497Sluigi
129206497Sluigistatic int g_sched_destroy(struct g_geom *gp, boolean_t force);
130206497Sluigistatic int g_sched_destroy_geom(struct gctl_req *req,
131206497Sluigi    struct g_class *mp, struct g_geom *gp);
132206497Sluigistatic void g_sched_config(struct gctl_req *req, struct g_class *mp,
133206497Sluigi    const char *verb);
134206497Sluigistatic struct g_geom *g_sched_taste(struct g_class *mp,
135206497Sluigi    struct g_provider *pp, int flags __unused);
136206497Sluigistatic void g_sched_dumpconf(struct sbuf *sb, const char *indent,
137206497Sluigi    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
138206497Sluigistatic void g_sched_init(struct g_class *mp);
139206497Sluigistatic void g_sched_fini(struct g_class *mp);
140210747Saestatic int g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data,
141210747Sae    int fflag, struct thread *td);
142206497Sluigi
/*
 * GEOM class descriptor: the method table through which the GEOM
 * framework drives this module (config verbs, taste, geom destruction,
 * class init/fini and the ioctl pass-through).
 */
struct g_class g_sched_class = {
	.name = G_SCHED_CLASS_NAME,
	.version = G_VERSION,
	.ctlreq = g_sched_config,
	.taste = g_sched_taste,
	.destroy_geom = g_sched_destroy_geom,
	.init = g_sched_init,
	.ioctl = g_sched_ioctl,
	.fini = g_sched_fini
};
153206497Sluigi
154206497SluigiMALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures");
155206497Sluigi
156206497Sluigi/*
157206497Sluigi * Global variables describing the state of the geom_sched module.
158206497Sluigi * There is only one static instance of this structure.
159206497Sluigi */
LIST_HEAD(gs_list, g_gsched);	/* type, link field */
struct geom_sched_vars {
	struct mtx	gs_mtx;		/* guards the algorithm list */
	struct gs_list	gs_scheds;	/* list of algorithms */
	u_int		gs_debug;	/* debug verbosity (sysctl ...sched.debug) */
	u_int		gs_sched_count;	/* how many algorithms ? */
	u_int 		gs_patched;	/* g_io_request was patched */

	u_int		gs_initialized;	/* one-shot init done, see g_gsched_global_init() */
	u_int		gs_expire_secs;	/* expiration of hash entries */

	struct bio_queue_head gs_pending;	/* initialized in g_gsched_global_init() */
	u_int		gs_npending;	/* presumably tracks gs_pending length -- verify users */

	/* The following are for stats, usually protected by gs_mtx. */
	u_long		gs_requests;	/* total requests */
	u_long		gs_done;	/* total done */
	u_int 		gs_in_flight;	/* requests in flight */
	u_int 		gs_writes_in_flight;	/* subset of the above that are writes */
	u_int 		gs_bytes_in_flight;
	u_int 		gs_write_bytes_in_flight;

	char		gs_names[256];	/* names of schedulers */
};

/* The single module-wide instance; only gs_expire_secs needs a default. */
static struct geom_sched_vars me = {
	.gs_expire_secs = 10,
};
188206497Sluigi
189206497SluigiSYSCTL_DECL(_kern_geom);
190206497SluigiSYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0,
191206497Sluigi    "GEOM_SCHED stuff");
192206497Sluigi
193217324SmdfSYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD,
194206497Sluigi    &me.gs_write_bytes_in_flight, 0, "Write bytes in flight");
195206497Sluigi
196217324SmdfSYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD,
197206497Sluigi    &me.gs_bytes_in_flight, 0, "Bytes in flight");
198206497Sluigi
199206497SluigiSYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD,
200206497Sluigi    &me.gs_writes_in_flight, 0, "Write Requests in flight");
201206497Sluigi
202206497SluigiSYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD,
203206497Sluigi    &me.gs_in_flight, 0, "Requests in flight");
204206497Sluigi
205206497SluigiSYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD,
206206497Sluigi    &me.gs_done, 0, "Total done");
207206497Sluigi
208206497SluigiSYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD,
209206497Sluigi    &me.gs_requests, 0, "Total requests");
210206497Sluigi
211206497SluigiSYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD,
212206497Sluigi    &me.gs_names, 0, "Algorithm names");
213206497Sluigi
214206497SluigiSYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD,
215206497Sluigi    &me.gs_sched_count, 0, "Number of algorithms");
216206497Sluigi
217206497SluigiSYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW,
218206497Sluigi    &me.gs_debug, 0, "Debug level");
219206497Sluigi
220206497SluigiSYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW,
221206497Sluigi    &me.gs_expire_secs, 0, "Expire time in seconds");
222206497Sluigi
223206497Sluigi/*
224206497Sluigi * g_sched calls the scheduler algorithms with this lock held.
225206497Sluigi * The locking functions are exposed so the scheduler algorithms can also
226206497Sluigi * protect themselves e.g. when running a callout handler.
227206497Sluigi */
228206497Sluigivoid
229206497Sluigig_sched_lock(struct g_geom *gp)
230206497Sluigi{
231206497Sluigi	struct g_sched_softc *sc = gp->softc;
232206497Sluigi
233206497Sluigi	mtx_lock(&sc->sc_mtx);
234206497Sluigi}
235206497Sluigi
236206497Sluigivoid
237206497Sluigig_sched_unlock(struct g_geom *gp)
238206497Sluigi{
239206497Sluigi	struct g_sched_softc *sc = gp->softc;
240206497Sluigi
241206497Sluigi	mtx_unlock(&sc->sc_mtx);
242206497Sluigi}
243206497Sluigi
244206497Sluigi/*
245206497Sluigi * Support functions to handle references to the module,
246206497Sluigi * which are coming from devices using this scheduler.
247206497Sluigi */
/* Take a reference on the algorithm descriptor (one per device using it). */
static inline void
g_gsched_ref(struct g_gsched *gsp)
{

	atomic_add_int(&gsp->gs_refs, 1);
}
254206497Sluigi
/*
 * Drop a reference on the algorithm descriptor.  The last reference
 * (gs_refs == 1, held since registration) is only released in
 * g_gsched_unregister(); this helper never frees anything itself.
 */
static inline void
g_gsched_unref(struct g_gsched *gsp)
{

	atomic_add_int(&gsp->gs_refs, -1);
}
261206497Sluigi
262206497Sluigi/*
263206497Sluigi * Update the stats when this request is done.
264206497Sluigi */
265206497Sluigistatic void
266206497Sluigig_sched_update_stats(struct bio *bio)
267206497Sluigi{
268206497Sluigi
269206497Sluigi	me.gs_done++;
270206497Sluigi	me.gs_in_flight--;
271206497Sluigi	me.gs_bytes_in_flight -= bio->bio_length;
272206497Sluigi	if (bio->bio_cmd & BIO_WRITE) {
273206497Sluigi		me.gs_writes_in_flight--;
274206497Sluigi		me.gs_write_bytes_in_flight -= bio->bio_length;
275206497Sluigi	}
276206497Sluigi}
277206497Sluigi
278206497Sluigi/*
279206497Sluigi * Dispatch any pending request.
280206497Sluigi */
static void
g_sched_forced_dispatch(struct g_geom *gp)
{
	struct g_sched_softc *sc = gp->softc;
	struct g_gsched *gsp = sc->sc_gsched;
	struct bio *bp;

	KASSERT(mtx_owned(&sc->sc_mtx),
	    ("sc_mtx not owned during forced dispatch"));

	/*
	 * The second gs_next() argument is 1 here (vs. 0 in
	 * g_sched_dispatch()), asking the algorithm to hand back
	 * everything it has queued; each bio goes straight to our
	 * single consumer.
	 */
	while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL)
		g_io_request(bp, LIST_FIRST(&gp->consumer));
}
294206497Sluigi
295206497Sluigi/*
296206497Sluigi * The main dispatch loop, called either here after the start
297206497Sluigi * routine, or by scheduling algorithms when they receive a timeout
298206497Sluigi * or a 'done' notification.  Does not share code with the forced
299206497Sluigi * dispatch path, since the gs_done() callback can call us.
300206497Sluigi */
void
g_sched_dispatch(struct g_geom *gp)
{
	struct g_sched_softc *sc = gp->softc;
	struct g_gsched *gsp = sc->sc_gsched;
	struct bio *bp;

	KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch"));

	/* While flushing, only g_sched_forced_dispatch() may drain the queue. */
	if ((sc->sc_flags & G_SCHED_FLUSHING))
		return;

	/* Push downstream whatever the algorithm is willing to release now. */
	while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL)
		g_io_request(bp, LIST_FIRST(&gp->consumer));
}
316206497Sluigi
317206497Sluigi/*
318206497Sluigi * Recent (8.0 and above) versions of FreeBSD have support to
319206497Sluigi * register classifiers of disk requests. The classifier is
320206497Sluigi * invoked by g_io_request(), and stores the information into
321206497Sluigi * bp->bio_classifier1.
322206497Sluigi *
323206497Sluigi * Support for older versions, which is left here only for
324206497Sluigi * documentation purposes, relies on two hacks:
325206497Sluigi * 1. classification info is written into the bio_caller1
326206497Sluigi *    field of the topmost node in the bio chain. This field
327206497Sluigi *    is rarely used, but this module is incompatible with
328206497Sluigi *    those that use bio_caller1 for other purposes,
329206497Sluigi *    such as ZFS and gjournal;
330206497Sluigi * 2. g_io_request() is patched in-memory when the module is
331206497Sluigi *    loaded, so that the function calls a classifier as its
332206497Sluigi *    first thing. g_io_request() is restored when the module
333206497Sluigi *    is unloaded. This functionality is only supported for
334206497Sluigi *    x86 and amd64, other architectures need source code changes.
335206497Sluigi */
336206497Sluigi
337206497Sluigi/*
338206497Sluigi * Lookup the identity of the issuer of the original request.
339206497Sluigi * In the current implementation we use the curthread of the
340206497Sluigi * issuer, but different mechanisms may be implemented later
341206497Sluigi * so we do not make assumptions on the return value which for
342206497Sluigi * us is just an opaque identifier.
343206497Sluigi */
344206497Sluigi
static inline u_long
g_sched_classify(struct bio *bp)
{

#if __FreeBSD_version > 800098
	/* we have classifier fields in the struct bio */
#define HAVE_BIO_CLASSIFIER
	/* Value stored by the classifier hook in g_io_request(). */
	return ((u_long)bp->bio_classifier1);
#else
#warning old version!!!
	/* Legacy path: the info lives in bio_caller1 of the topmost bio. */
	while (bp->bio_parent != NULL)
		bp = bp->bio_parent;

	return ((u_long)bp->bio_caller1);
#endif
}
361206497Sluigi
362206497Sluigi/* Return the hash chain for the given key. */
363206497Sluigistatic inline struct g_hash *
364206497Sluigig_sched_hash(struct g_sched_softc *sc, u_long key)
365206497Sluigi{
366206497Sluigi
367206497Sluigi	return (&sc->sc_hash[key & sc->sc_mask]);
368206497Sluigi}
369206497Sluigi
370206497Sluigi/*
371206497Sluigi * Helper function for the children classes, which takes
372206497Sluigi * a geom and a bio and returns the private descriptor
373206497Sluigi * associated to the request.  This involves fetching
374206497Sluigi * the classification field and [al]locating the
375206497Sluigi * corresponding entry in the hash table.
376206497Sluigi */
void *
g_sched_get_class(struct g_geom *gp, struct bio *bp)
{
	struct g_sched_softc *sc;
	struct g_sched_class *gsc;
	struct g_gsched *gsp;
	struct g_hash *bucket;
	u_long key;

	sc = gp->softc;
	key = g_sched_classify(bp);
	bucket = g_sched_hash(sc, key);
	/* Fast path: the class already exists, just take a reference. */
	LIST_FOREACH(gsc, bucket, gsc_clist) {
		if (key == gsc->gsc_key) {
			gsc->gsc_refs++;
			return (gsc->gsc_priv);
		}
	}

	/*
	 * Allocate the descriptor and the algorithm-private area in one
	 * go (gsc_priv presumably addresses the trailing gs_priv_size
	 * bytes -- see g_sched_priv2class for the reverse mapping).
	 * M_NOWAIT since we may be on the I/O path; NULL means the
	 * caller gets no per-flow class.
	 */
	gsp = sc->sc_gsched;
	gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size,
	    M_GEOM_SCHED, M_NOWAIT | M_ZERO);
	if (!gsc)
		return (NULL);

	/* Let the algorithm initialize its per-flow state; fail cleanly. */
	if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) {
		free(gsc, M_GEOM_SCHED);
		return (NULL);
	}

	gsc->gsc_refs = 2;	/* 1 for the hash table, 1 for the caller. */
	gsc->gsc_key = key;
	LIST_INSERT_HEAD(bucket, gsc, gsc_clist);

	/* Arm the idle-expiry stamp used by g_sched_flush_classes(). */
	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;

	return (gsc->gsc_priv);
}
415206497Sluigi
416206497Sluigi/*
417206497Sluigi * Release a reference to the per-client descriptor,
418206497Sluigi */
void
g_sched_put_class(struct g_geom *gp, void *priv)
{
	struct g_sched_class *gsc;
	struct g_sched_softc *sc;

	/* Recover the class descriptor from the algorithm-private area. */
	gsc = g_sched_priv2class(priv);
	/* Refresh the idle-expiry stamp on every release. */
	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;

	if (--gsc->gsc_refs > 0)
		return;

	/* Last reference: let the algorithm tear down its per-flow state. */
	sc = gp->softc;
	sc->sc_gsched->gs_fini_class(sc->sc_data, priv);

	/* Then unlink the descriptor from its hash bucket and free it. */
	LIST_REMOVE(gsc, gsc_clist);
	free(gsc, M_GEOM_SCHED);
}
437206497Sluigi
/*
 * Tear down the per-geom class hash table: release every class and
 * destroy the table itself.  A NULL hp is a no-op (the algorithm had
 * no per-flow state, see g_sched_hash_init()).
 */
static void
g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask,
    struct g_gsched *gsp, void *data)
{
	struct g_sched_class *cp, *cp2;
	int i;

	if (!hp)
		return;

	/* Give the algorithm a chance to drop its own hash references. */
	if (data && gsp->gs_hash_unref)
		gsp->gs_hash_unref(data);

	/* Drop the table's reference on each class; last ref frees it. */
	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
		LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2)
			g_sched_put_class(gp, cp->gsc_priv);
	}

	hashdestroy(hp, M_GEOM_SCHED, mask);
}
458206497Sluigi
459206497Sluigistatic struct g_hash *
460206497Sluigig_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags)
461206497Sluigi{
462206497Sluigi	struct g_hash *hash;
463206497Sluigi
464206497Sluigi	if (gsp->gs_priv_size == 0)
465206497Sluigi		return (NULL);
466206497Sluigi
467206497Sluigi	hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags);
468206497Sluigi
469206497Sluigi	return (hash);
470206497Sluigi}
471206497Sluigi
/*
 * Garbage-collect idle classes: drop any class whose only remaining
 * reference is the hash table's and whose idle timer has expired.
 * Rate-limited by sc_flush_ticks; tick comparisons use subtraction so
 * they stay correct across counter wraparound.
 */
static void
g_sched_flush_classes(struct g_geom *gp)
{
	struct g_sched_softc *sc;
	struct g_sched_class *cp, *cp2;
	int i;

	sc = gp->softc;

	/* No hash table, or not yet time for the next sweep. */
	if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0)
		return;

	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
		LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) {
			if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0)
				g_sched_put_class(gp, cp->gsc_priv);
		}
	}

	sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz;
}
493206497Sluigi
494206497Sluigi/*
495206497Sluigi * Wait for the completion of any outstanding request.  To ensure
496206497Sluigi * that this does not take forever the caller has to make sure that
 * no new request enters the scheduler before calling us.
498206497Sluigi *
499206497Sluigi * Must be called with the gp mutex held and topology locked.
500206497Sluigi */
static int
g_sched_wait_pending(struct g_geom *gp)
{
	struct g_sched_softc *sc = gp->softc;
	int endticks = ticks + hz;	/* give up after roughly one second */

	g_topology_assert();

	/*
	 * Sleep in short (hz/4) intervals on the gp channel;
	 * g_sched_done() wakes us up when sc_pending drains to zero.
	 * The subtraction keeps the deadline test wrap-safe.
	 */
	while (sc->sc_pending && endticks - ticks >= 0)
		msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4);

	return (sc->sc_pending ? ETIMEDOUT : 0);
}
514206497Sluigi
/*
 * Detach scheduling algorithm gsp from geom gp: block new entries,
 * drain everything queued, destroy the class hash and the algorithm
 * state, and drop our reference on gsp.  Returns ETIMEDOUT if the
 * in-flight requests did not drain in time (the geom is then left
 * with the algorithm still attached).
 *
 * Called with the gp mutex held and the topology locked.
 */
static int
g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp)
{
	struct g_sched_softc *sc = gp->softc;
	int error;

	/* Set the flushing flag: new bios will not enter the scheduler. */
	sc->sc_flags |= G_SCHED_FLUSHING;

	g_sched_forced_dispatch(gp);
	error = g_sched_wait_pending(gp);
	if (error)
		goto failed;

	/* No more requests pending or in flight from the old gsp. */

	g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data);
	sc->sc_hash = NULL;

	/*
	 * Avoid deadlock here by releasing the gp mutex and reacquiring
	 * it once done.  It should be safe, since no reconfiguration or
	 * destruction can take place due to the geom topology lock; no
	 * new request can use the current sc_data since we flagged the
	 * geom as being flushed.
	 */
	g_sched_unlock(gp);
	gsp->gs_fini(sc->sc_data);
	g_sched_lock(gp);

	sc->sc_gsched = NULL;
	sc->sc_data = NULL;
	g_gsched_unref(gsp);

failed:
	sc->sc_flags &= ~G_SCHED_FLUSHING;

	return (error);
}
554206497Sluigi
/*
 * Locked wrapper around g_sched_remove_locked().
 */
static int
g_sched_remove(struct g_geom *gp, struct g_gsched *gsp)
{
	int rv;

	g_sched_lock(gp);
	rv = g_sched_remove_locked(gp, gsp);	/* gsp is surely non-null */
	g_sched_unlock(gp);

	return (rv);
}
566206497Sluigi
567206497Sluigi/*
568206497Sluigi * Support function for create/taste -- locate the desired
569206497Sluigi * algorithm and grab a reference to it.
570206497Sluigi */
571206497Sluigistatic struct g_gsched *
572206497Sluigig_gsched_find(const char *name)
573206497Sluigi{
574206497Sluigi	struct g_gsched *gsp = NULL;
575206497Sluigi
576206497Sluigi	mtx_lock(&me.gs_mtx);
577206497Sluigi	LIST_FOREACH(gsp, &me.gs_scheds, glist) {
578206497Sluigi		if (strcmp(name, gsp->gs_name) == 0) {
579206497Sluigi			g_gsched_ref(gsp);
580206497Sluigi			break;
581206497Sluigi		}
582206497Sluigi	}
583206497Sluigi	mtx_unlock(&me.gs_mtx);
584206497Sluigi
585206497Sluigi	return (gsp);
586206497Sluigi}
587206497Sluigi
588206497Sluigi/*
589206497Sluigi * Rebuild the list of scheduler names.
590206497Sluigi * To be called with me.gs_mtx lock held.
591206497Sluigi */
592206497Sluigistatic void
593206497Sluigig_gsched_build_names(struct g_gsched *gsp)
594206497Sluigi{
595206497Sluigi	int pos, l;
596206497Sluigi	struct g_gsched *cur;
597206497Sluigi
598206497Sluigi	pos = 0;
599206497Sluigi	LIST_FOREACH(cur, &me.gs_scheds, glist) {
600206497Sluigi		l = strlen(cur->gs_name);
601206497Sluigi		if (l + pos + 1 + 1 < sizeof(me.gs_names)) {
602206497Sluigi			if (pos != 0)
603206497Sluigi				me.gs_names[pos++] = ' ';
604206497Sluigi			strcpy(me.gs_names + pos, cur->gs_name);
605206497Sluigi			pos += l;
606206497Sluigi		}
607206497Sluigi	}
608206497Sluigi	me.gs_names[pos] = '\0';
609206497Sluigi}
610206497Sluigi
611206497Sluigi/*
612206497Sluigi * Register or unregister individual scheduling algorithms.
613206497Sluigi */
614206497Sluigistatic int
615206497Sluigig_gsched_register(struct g_gsched *gsp)
616206497Sluigi{
617206497Sluigi	struct g_gsched *cur;
618206497Sluigi	int error = 0;
619206497Sluigi
620206497Sluigi	mtx_lock(&me.gs_mtx);
621206497Sluigi	LIST_FOREACH(cur, &me.gs_scheds, glist) {
622206497Sluigi		if (strcmp(gsp->gs_name, cur->gs_name) == 0)
623206497Sluigi			break;
624206497Sluigi	}
625206497Sluigi	if (cur != NULL) {
626206497Sluigi		G_SCHED_DEBUG(0, "A scheduler named %s already"
627206497Sluigi		    "exists.", gsp->gs_name);
628206497Sluigi		error = EEXIST;
629206497Sluigi	} else {
630206497Sluigi		LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist);
631206497Sluigi		gsp->gs_refs = 1;
632206497Sluigi		me.gs_sched_count++;
633206497Sluigi		g_gsched_build_names(gsp);
634206497Sluigi	}
635206497Sluigi	mtx_unlock(&me.gs_mtx);
636206497Sluigi
637206497Sluigi	return (error);
638206497Sluigi}
639206497Sluigi
/* Argument bundle for g_gsched_unregister(), run via g_waitfor_event(). */
struct g_gsched_unregparm {
	struct g_gsched *gup_gsp;	/* algorithm to unregister */
	int		gup_error;	/* result reported back to the caller */
};
644206497Sluigi
645206497Sluigistatic void
646206497Sluigig_gsched_unregister(void *arg, int flag)
647206497Sluigi{
648206497Sluigi	struct g_gsched_unregparm *parm = arg;
649206497Sluigi	struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp;
650206497Sluigi	struct g_sched_softc *sc;
651206497Sluigi	struct g_geom *gp, *gp_tmp;
652206497Sluigi	int error;
653206497Sluigi
654206497Sluigi	parm->gup_error = 0;
655206497Sluigi
656206497Sluigi	g_topology_assert();
657206497Sluigi
658206497Sluigi	if (flag == EV_CANCEL)
659206497Sluigi		return;
660206497Sluigi
661206497Sluigi	mtx_lock(&me.gs_mtx);
662206497Sluigi
663206497Sluigi	LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) {
664206497Sluigi		if (gp->class != &g_sched_class)
665206497Sluigi			continue;	/* Should not happen. */
666206497Sluigi
667206497Sluigi		sc = gp->softc;
668206497Sluigi		if (sc->sc_gsched == gsp) {
669206497Sluigi			error = g_sched_remove(gp, gsp);
670206497Sluigi			if (error)
671206497Sluigi				goto failed;
672206497Sluigi		}
673206497Sluigi	}
674206497Sluigi
675206497Sluigi	LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) {
676206497Sluigi		if (cur != gsp)
677206497Sluigi			continue;
678206497Sluigi
679206497Sluigi		if (gsp->gs_refs != 1) {
680206497Sluigi			G_SCHED_DEBUG(0, "%s still in use.",
681206497Sluigi			    gsp->gs_name);
682206497Sluigi			parm->gup_error = EBUSY;
683206497Sluigi		} else {
684206497Sluigi			LIST_REMOVE(gsp, glist);
685206497Sluigi			me.gs_sched_count--;
686206497Sluigi			g_gsched_build_names(gsp);
687206497Sluigi		}
688206497Sluigi		break;
689206497Sluigi	}
690206497Sluigi
691206497Sluigi	if (cur == NULL) {
692206497Sluigi		G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name);
693206497Sluigi		parm->gup_error = ENOENT;
694206497Sluigi	}
695206497Sluigi
696206497Sluigifailed:
697206497Sluigi	mtx_unlock(&me.gs_mtx);
698206497Sluigi}
699206497Sluigi
700206497Sluigistatic inline void
701206497Sluigig_gsched_global_init(void)
702206497Sluigi{
703206497Sluigi
704206497Sluigi	if (!me.gs_initialized) {
705206497Sluigi		G_SCHED_DEBUG(0, "Initializing global data.");
706206497Sluigi		mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF);
707206497Sluigi		LIST_INIT(&me.gs_scheds);
708206497Sluigi		gs_bioq_init(&me.gs_pending);
709206497Sluigi		me.gs_initialized = 1;
710206497Sluigi	}
711206497Sluigi}
712206497Sluigi
713206497Sluigi/*
714206497Sluigi * Module event called when a scheduling algorithm module is loaded or
715206497Sluigi * unloaded.
716206497Sluigi */
717206497Sluigiint
718206497Sluigig_gsched_modevent(module_t mod, int cmd, void *arg)
719206497Sluigi{
720206497Sluigi	struct g_gsched *gsp = arg;
721206497Sluigi	struct g_gsched_unregparm parm;
722206497Sluigi	int error;
723206497Sluigi
724206497Sluigi	G_SCHED_DEBUG(0, "Modevent %d.", cmd);
725206497Sluigi
726206497Sluigi	/*
727206497Sluigi	 * If the module is loaded at boot, the geom thread that calls
728206497Sluigi	 * g_sched_init() might actually run after g_gsched_modevent(),
729206497Sluigi	 * so make sure that the module is properly initialized.
730206497Sluigi	 */
731206497Sluigi	g_gsched_global_init();
732206497Sluigi
733206497Sluigi	error = EOPNOTSUPP;
734206497Sluigi	switch (cmd) {
735206497Sluigi	case MOD_LOAD:
736206497Sluigi		error = g_gsched_register(gsp);
737206497Sluigi		G_SCHED_DEBUG(0, "Loaded module %s error %d.",
738206497Sluigi		    gsp->gs_name, error);
739206497Sluigi		if (error == 0)
740206497Sluigi			g_retaste(&g_sched_class);
741206497Sluigi		break;
742206497Sluigi
743206497Sluigi	case MOD_UNLOAD:
744206497Sluigi		parm.gup_gsp = gsp;
745206497Sluigi		parm.gup_error = 0;
746206497Sluigi
747206497Sluigi		error = g_waitfor_event(g_gsched_unregister,
748206497Sluigi		    &parm, M_WAITOK, NULL);
749206497Sluigi		if (error == 0)
750206497Sluigi			error = parm.gup_error;
751206497Sluigi		G_SCHED_DEBUG(0, "Unloaded module %s error %d.",
752206497Sluigi		    gsp->gs_name, error);
753206497Sluigi		break;
754206497Sluigi	};
755206497Sluigi
756206497Sluigi	return (error);
757206497Sluigi}
758206497Sluigi
759206497Sluigi#ifdef KTR
760206497Sluigi#define	TRC_BIO_EVENT(e, bp)	g_sched_trace_bio_ ## e (bp)
761206497Sluigi
762206497Sluigistatic inline char
763206497Sluigig_sched_type(struct bio *bp)
764206497Sluigi{
765206497Sluigi
766206497Sluigi	if (0 != (bp->bio_cmd & BIO_READ))
767206497Sluigi		return ('R');
768206497Sluigi	else if (0 != (bp->bio_cmd & BIO_WRITE))
769206497Sluigi		return ('W');
770206497Sluigi	return ('U');
771206497Sluigi}
772206497Sluigi
/*
 * KTR trace of a request entering the scheduler: class, direction,
 * offset and length.
 * NOTE(review): "bio_offset / ULONG_MAX" looks odd -- it yields the
 * high part of the offset only for the x/y display; confirm the
 * intended divisor.
 */
static inline void
g_sched_trace_bio_START(struct bio *bp)
{

	CTR5(KTR_GSCHED, "S %lu %c %lu/%lu %lu", g_sched_classify(bp),
	    g_sched_type(bp), bp->bio_offset / ULONG_MAX,
	    bp->bio_offset, bp->bio_length);
}
781206497Sluigi
/* KTR trace point for a bio completing ("D" record). */
static inline void
g_sched_trace_bio_DONE(struct bio *bp)
{

	/*
	 * NOTE(review): same "/ ULONG_MAX" oddity as in
	 * g_sched_trace_bio_START() -- confirm intent.
	 */
	CTR5(KTR_GSCHED, "D %lu %c %lu/%lu %lu", g_sched_classify(bp),
	    g_sched_type(bp), bp->bio_offset / ULONG_MAX,
	    bp->bio_offset, bp->bio_length);
}
790206551Sluigi#else /* !KTR */
791206497Sluigi#define	TRC_BIO_EVENT(e, bp)
792206551Sluigi#endif /* !KTR */
793206497Sluigi
794206497Sluigi/*
795206497Sluigi * g_sched_done() and g_sched_start() dispatch the geom requests to
796206497Sluigi * the scheduling algorithm in use.
797206497Sluigi */
798206497Sluigistatic void
799206497Sluigig_sched_done(struct bio *bio)
800206497Sluigi{
801206497Sluigi	struct g_geom *gp = bio->bio_caller2;
802206497Sluigi	struct g_sched_softc *sc = gp->softc;
803206497Sluigi
804206497Sluigi	TRC_BIO_EVENT(DONE, bio);
805206497Sluigi
806206497Sluigi	KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done"));
807206497Sluigi
808206497Sluigi	g_sched_lock(gp);
809206497Sluigi
810206497Sluigi	g_sched_update_stats(bio);
811206497Sluigi	sc->sc_gsched->gs_done(sc->sc_data, bio);
812206497Sluigi	if (!--sc->sc_pending)
813206497Sluigi		wakeup(gp);
814206497Sluigi
815206497Sluigi	g_sched_flush_classes(gp);
816206497Sluigi	g_sched_unlock(gp);
817206497Sluigi
818206497Sluigi	g_std_done(bio);
819206497Sluigi}
820206497Sluigi
/*
 * Start method for the sched geom: clone the incoming bio and either
 * queue the clone through the active scheduling algorithm or, for
 * non-read/write commands (or if the scheduler refuses it), bypass
 * the scheduler and push it straight down.
 */
static void
g_sched_start(struct bio *bp)
{
	struct g_geom *gp = bp->bio_to->geom;
	struct g_sched_softc *sc = gp->softc;
	struct bio *cbp;

	TRC_BIO_EVENT(START, bp);
	G_SCHED_LOGREQ(bp, "Request received.");

	cbp = g_clone_bio(bp);
	if (cbp == NULL) {
		/* Out of bios: fail the original request. */
		g_io_deliver(bp, ENOMEM);
		return;
	}
	cbp->bio_done = g_sched_done;
	cbp->bio_to = LIST_FIRST(&gp->provider);
	KASSERT(cbp->bio_to != NULL, ("NULL provider"));

	/* We only schedule reads and writes. */
	if (0 == (bp->bio_cmd & (BIO_READ | BIO_WRITE)))
		goto bypass;

	G_SCHED_LOGREQ(cbp, "Sending request.");

	g_sched_lock(gp);
	/*
	 * Call the algorithm's gs_start to queue the request in the
	 * scheduler. If gs_start fails then pass the request down,
	 * otherwise call g_sched_dispatch() which tries to push
	 * one or more requests down.
	 */
	if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) ||
	    sc->sc_gsched->gs_start(sc->sc_data, cbp)) {
		g_sched_unlock(gp);
		goto bypass;
	}
	/*
	 * We use bio_caller1 to mark requests that are scheduled
	 * so make sure it is not NULL.
	 */
	if (cbp->bio_caller1 == NULL)
		cbp->bio_caller1 = &me;	/* anything not NULL */

	cbp->bio_caller2 = gp;
	sc->sc_pending++;	/* decremented in g_sched_done() */

	/* Update general stats. */
	me.gs_in_flight++;
	me.gs_requests++;
	me.gs_bytes_in_flight += bp->bio_length;
	if (bp->bio_cmd & BIO_WRITE) {
		me.gs_writes_in_flight++;
		me.gs_write_bytes_in_flight += bp->bio_length;
	}
	g_sched_dispatch(gp);
	g_sched_unlock(gp);
	return;

bypass:
	/* Unscheduled path: complete via g_std_done(), unmarked. */
	cbp->bio_done = g_std_done;
	cbp->bio_caller1 = NULL; /* not scheduled */
	g_io_request(cbp, LIST_FIRST(&gp->consumer));
}
885206497Sluigi
886206497Sluigi/*
887206497Sluigi * The next few functions are the geom glue.
888206497Sluigi */
/*
 * Orphan method: the underlying provider went away, so force-destroy
 * this geom.
 */
static void
g_sched_orphan(struct g_consumer *cp)
{

	g_topology_assert();
	g_sched_destroy(cp->geom, 1);
}
896206497Sluigi
897206497Sluigistatic int
898206497Sluigig_sched_access(struct g_provider *pp, int dr, int dw, int de)
899206497Sluigi{
900206497Sluigi	struct g_geom *gp;
901206497Sluigi	struct g_consumer *cp;
902206497Sluigi	int error;
903206497Sluigi
904206497Sluigi	gp = pp->geom;
905206497Sluigi	cp = LIST_FIRST(&gp->consumer);
906206497Sluigi	error = g_access(cp, dr, dw, de);
907206497Sluigi
908206497Sluigi	return (error);
909206497Sluigi}
910206497Sluigi
/*
 * Temporary start routine used while a proxy is being inserted or
 * removed: park the bio on the global pending queue (disksorted) and
 * count it in me.gs_npending.
 */
static void
g_sched_temporary_start(struct bio *bio)
{

	mtx_lock(&me.gs_mtx);
	me.gs_npending++;
	gs_bioq_disksort(&me.gs_pending, bio);
	mtx_unlock(&me.gs_mtx);
}
920206497Sluigi
921206497Sluigistatic void
922206497Sluigig_sched_flush_pending(g_start_t *start)
923206497Sluigi{
924206497Sluigi	struct bio *bp;
925206497Sluigi
926206497Sluigi	while ((bp = gs_bioq_takefirst(&me.gs_pending)))
927206497Sluigi		start(bp);
928206497Sluigi}
929206497Sluigi
/*
 * Transparently insert this geom above provider pp: redirect new
 * requests to the pending queue, wait (polling, ~1 second deadline)
 * for in-flight requests on pp to drain, then steal pp into our geom
 * and replicate its access counts on the new provider/consumer pair.
 * Returns 0 on success or ETIMEDOUT if traffic did not drain in time.
 */
static int
g_insert_proxy(struct g_geom *gp, struct g_provider *newpp,
    struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp)
{
	struct g_sched_softc *sc = gp->softc;
	g_start_t *saved_start, *flush = g_sched_start;
	int error = 0, endticks = ticks + hz;

	g_cancel_event(newpp);	/* prevent taste() */
	/* copy private fields */
	newpp->private = pp->private;
	newpp->index = pp->index;

	/* Queue all the early requests coming for us. */
	me.gs_npending = 0;
	saved_start = pp->geom->start;
	dstgp->start = g_sched_temporary_start;

	/*
	 * Poll every hz/10 ticks until every outstanding request on pp
	 * is accounted for in gs_npending, or the deadline expires.
	 */
	while (pp->nstart - pp->nend != me.gs_npending &&
	    endticks - ticks >= 0)
		tsleep(pp, PRIBIO, "-", hz/10);

	if (pp->nstart - pp->nend != me.gs_npending) {
		/* Timed out: replay the parked bios to the old start. */
		flush = saved_start;
		error = ETIMEDOUT;
		goto fail;
	}

	/* link pp to this geom */
	LIST_REMOVE(pp, provider);
	pp->geom = gp;
	LIST_INSERT_HEAD(&gp->provider, pp, provider);

	/*
	 * replicate the counts from the parent in the
	 * new provider and consumer nodes
	 */
	cp->acr = newpp->acr = pp->acr;
	cp->acw = newpp->acw = pp->acw;
	cp->ace = newpp->ace = pp->ace;
	sc->sc_flags |= G_SCHED_PROXYING;

fail:
	/* Restore the saved start routine and drain the parked bios. */
	dstgp->start = saved_start;

	g_sched_flush_pending(flush);

	return (error);
}
979206497Sluigi
980206497Sluigi/*
981206497Sluigi * Create a geom node for the device passed as *pp.
982206497Sluigi * If successful, add a reference to this gsp.
983206497Sluigi */
984206497Sluigistatic int
985206497Sluigig_sched_create(struct gctl_req *req, struct g_class *mp,
986206497Sluigi    struct g_provider *pp, struct g_gsched *gsp, int proxy)
987206497Sluigi{
988206497Sluigi	struct g_sched_softc *sc = NULL;
989206497Sluigi	struct g_geom *gp, *dstgp;
990206497Sluigi	struct g_provider *newpp = NULL;
991206497Sluigi	struct g_consumer *cp = NULL;
992206497Sluigi	char name[64];
993206497Sluigi	int error;
994206497Sluigi
995206497Sluigi	g_topology_assert();
996206497Sluigi
997206497Sluigi	snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX);
998206497Sluigi	LIST_FOREACH(gp, &mp->geom, geom) {
999206497Sluigi		if (strcmp(gp->name, name) == 0) {
1000206497Sluigi			gctl_error(req, "Geom %s already exists.",
1001206497Sluigi			    name);
1002206497Sluigi			return (EEXIST);
1003206497Sluigi		}
1004206497Sluigi	}
1005206497Sluigi
1006243333Sjh	gp = g_new_geomf(mp, "%s", name);
1007206497Sluigi	dstgp = proxy ? pp->geom : gp; /* where do we link the provider */
1008206497Sluigi
1009206497Sluigi	sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
1010206497Sluigi	sc->sc_gsched = gsp;
1011206497Sluigi	sc->sc_data = gsp->gs_init(gp);
1012206497Sluigi	if (sc->sc_data == NULL) {
1013206497Sluigi		error = ENOMEM;
1014206497Sluigi		goto fail;
1015206497Sluigi	}
1016206497Sluigi
1017206497Sluigi	sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK);
1018206497Sluigi
1019206497Sluigi	/*
1020206497Sluigi	 * Do not initialize the flush mechanism, will be initialized
1021206497Sluigi	 * on the first insertion on the hash table.
1022206497Sluigi	 */
1023206497Sluigi
1024206497Sluigi	mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF);
1025206497Sluigi
1026206497Sluigi	gp->softc = sc;
1027206497Sluigi	gp->start = g_sched_start;
1028206497Sluigi	gp->orphan = g_sched_orphan;
1029206497Sluigi	gp->access = g_sched_access;
1030206497Sluigi	gp->dumpconf = g_sched_dumpconf;
1031206497Sluigi
1032243333Sjh	newpp = g_new_providerf(dstgp, "%s", gp->name);
1033206497Sluigi	newpp->mediasize = pp->mediasize;
1034206497Sluigi	newpp->sectorsize = pp->sectorsize;
1035206497Sluigi
1036206497Sluigi	cp = g_new_consumer(gp);
1037206497Sluigi	error = g_attach(cp, proxy ? newpp : pp);
1038206497Sluigi	if (error != 0) {
1039206497Sluigi		gctl_error(req, "Cannot attach to provider %s.",
1040206497Sluigi		    pp->name);
1041206497Sluigi		goto fail;
1042206497Sluigi	}
1043206497Sluigi
1044206497Sluigi	g_error_provider(newpp, 0);
1045206497Sluigi	if (proxy) {
1046206497Sluigi		error = g_insert_proxy(gp, newpp, dstgp, pp, cp);
1047206497Sluigi		if (error)
1048206497Sluigi			goto fail;
1049206497Sluigi	}
1050206497Sluigi	G_SCHED_DEBUG(0, "Device %s created.", gp->name);
1051206497Sluigi
1052206497Sluigi	g_gsched_ref(gsp);
1053206497Sluigi
1054206497Sluigi	return (0);
1055206497Sluigi
1056206497Sluigifail:
1057206497Sluigi	if (cp != NULL) {
1058206497Sluigi		if (cp->provider != NULL)
1059206497Sluigi			g_detach(cp);
1060206497Sluigi		g_destroy_consumer(cp);
1061206497Sluigi	}
1062206497Sluigi	if (newpp != NULL)
1063206497Sluigi		g_destroy_provider(newpp);
1064221453Sae	if (sc->sc_hash)
1065206497Sluigi		g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
1066206497Sluigi		    gsp, sc->sc_data);
1067221453Sae	if (sc->sc_data)
1068206497Sluigi		gsp->gs_fini(sc->sc_data);
1069221453Sae	g_free(gp->softc);
1070221453Sae	g_destroy_geom(gp);
1071206497Sluigi
1072206497Sluigi	return (error);
1073206497Sluigi}
1074206497Sluigi
1075206497Sluigi/*
1076206497Sluigi * Support for dynamic switching of scheduling algorithms.
1077206497Sluigi * First initialize the data structures for the new algorithm,
1078206497Sluigi * then call g_sched_remove_locked() to flush all references
1079206497Sluigi * to the old one, finally link the new algorithm.
1080206497Sluigi */
1081206497Sluigistatic int
1082206497Sluigig_sched_change_algo(struct gctl_req *req, struct g_class *mp,
1083206497Sluigi    struct g_provider *pp, struct g_gsched *gsp)
1084206497Sluigi{
1085206497Sluigi	struct g_sched_softc *sc;
1086206497Sluigi	struct g_geom *gp;
1087206497Sluigi	struct g_hash *newh;
1088206497Sluigi	void *data;
1089206497Sluigi	u_long mask;
1090206497Sluigi	int error = 0;
1091206497Sluigi
1092206497Sluigi	gp = pp->geom;
1093206497Sluigi	sc = gp->softc;
1094206497Sluigi
1095206497Sluigi	data = gsp->gs_init(gp);
1096206497Sluigi	if (data == NULL)
1097206497Sluigi		return (ENOMEM);
1098206497Sluigi
1099206497Sluigi	newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK);
1100206497Sluigi	if (gsp->gs_priv_size && !newh) {
1101206497Sluigi		error = ENOMEM;
1102206497Sluigi		goto fail;
1103206497Sluigi	}
1104206497Sluigi
1105206497Sluigi	g_sched_lock(gp);
1106206497Sluigi	if (sc->sc_gsched) {	/* can be NULL in some cases */
1107206497Sluigi		error = g_sched_remove_locked(gp, sc->sc_gsched);
1108206497Sluigi		if (error)
1109206497Sluigi			goto fail;
1110206497Sluigi	}
1111206497Sluigi
1112206497Sluigi	g_gsched_ref(gsp);
1113206497Sluigi	sc->sc_gsched = gsp;
1114206497Sluigi	sc->sc_data = data;
1115206497Sluigi	sc->sc_hash = newh;
1116206497Sluigi	sc->sc_mask = mask;
1117206497Sluigi
1118206497Sluigi	g_sched_unlock(gp);
1119206497Sluigi
1120206497Sluigi	return (0);
1121206497Sluigi
1122206497Sluigifail:
1123206497Sluigi	if (newh)
1124206497Sluigi		g_sched_hash_fini(gp, newh, mask, gsp, data);
1125206497Sluigi
1126206497Sluigi	if (data)
1127206497Sluigi		gsp->gs_fini(data);
1128206497Sluigi
1129206497Sluigi	g_sched_unlock(gp);
1130206497Sluigi
1131206497Sluigi	return (error);
1132206497Sluigi}
1133206497Sluigi
1134206497Sluigi/*
1135206497Sluigi * Stop the request flow directed to the proxy, redirecting the new
1136206497Sluigi * requests to the me.gs_pending queue.
1137206497Sluigi */
1138206497Sluigistatic struct g_provider *
1139206497Sluigig_detach_proxy(struct g_geom *gp)
1140206497Sluigi{
1141206497Sluigi	struct g_consumer *cp;
1142206497Sluigi	struct g_provider *pp, *newpp;
1143206497Sluigi
1144206497Sluigi	do {
1145206497Sluigi		pp = LIST_FIRST(&gp->provider);
1146206497Sluigi		if (pp == NULL)
1147206497Sluigi			break;
1148206497Sluigi		cp = LIST_FIRST(&gp->consumer);
1149206497Sluigi		if (cp == NULL)
1150206497Sluigi			break;
1151206497Sluigi		newpp = cp->provider;
1152206497Sluigi		if (newpp == NULL)
1153206497Sluigi			break;
1154206497Sluigi
1155206497Sluigi		me.gs_npending = 0;
1156206497Sluigi		pp->geom->start = g_sched_temporary_start;
1157206497Sluigi
1158206497Sluigi		return (pp);
1159206497Sluigi	} while (0);
1160206497Sluigi	printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name);
1161206497Sluigi
1162206497Sluigi	return (NULL);
1163206497Sluigi}
1164206497Sluigi
/*
 * Fallback start routine: fail the bio with ENXIO.  Used to drain the
 * pending queue when there is no valid destination left.
 */
static void
g_sched_blackhole(struct bio *bp)
{

	g_io_deliver(bp, ENXIO);
}
1171206497Sluigi
/*
 * Relink provider pp under geom gp.  If newpp is non-NULL, also copy
 * its private/index fields into pp (used when undoing a transparent
 * insertion).
 */
static inline void
g_reparent_provider(struct g_provider *pp, struct g_geom *gp,
    struct g_provider *newpp)
{

	LIST_REMOVE(pp, provider);
	if (newpp) {
		pp->private = newpp->private;
		pp->index = newpp->index;
	}
	pp->geom = gp;
	LIST_INSERT_HEAD(&gp->provider, pp, provider);
}
1185206497Sluigi
/*
 * Swap the proxied provider back to its original geom and park the
 * proxy provider under this geom so it can be torn down later.
 */
static inline void
g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp)
{
	struct g_geom *gp = oldpp->geom;

	g_reparent_provider(oldpp, newpp->geom, newpp);

	/*
	 * Hackish: let the system destroy the old provider for us, just
	 * in case someone attached a consumer to it, in which case a
	 * direct call to g_destroy_provider() would not work.
	 */
	g_reparent_provider(newpp, gp, NULL);
}
1200206497Sluigi
1201206497Sluigi/*
1202206497Sluigi * Complete the proxy destruction, linking the old provider to its
1203206497Sluigi * original geom, and destroying the proxy provider.  Also take care
1204206497Sluigi * of issuing the pending requests collected in me.gs_pending (if any).
1205206497Sluigi */
1206206497Sluigistatic int
1207206497Sluigig_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp)
1208206497Sluigi{
1209206497Sluigi	struct g_consumer *cp;
1210206497Sluigi	struct g_provider *newpp;
1211206497Sluigi
1212206497Sluigi	do {
1213206497Sluigi		cp = LIST_FIRST(&gp->consumer);
1214206497Sluigi		if (cp == NULL)
1215206497Sluigi			break;
1216206497Sluigi		newpp = cp->provider;
1217206497Sluigi		if (newpp == NULL)
1218206497Sluigi			break;
1219206497Sluigi
1220206497Sluigi		/* Relink the provider to its original geom. */
1221206497Sluigi		g_unproxy_provider(oldpp, newpp);
1222206497Sluigi
1223206497Sluigi		/* Detach consumer from provider, and destroy provider. */
1224206497Sluigi		cp->acr = newpp->acr = 0;
1225206497Sluigi		cp->acw = newpp->acw = 0;
1226206497Sluigi		cp->ace = newpp->ace = 0;
1227206497Sluigi		g_detach(cp);
1228206497Sluigi
1229206497Sluigi		/* Send the pending bios through the right start function. */
1230206497Sluigi		g_sched_flush_pending(oldpp->geom->start);
1231206497Sluigi
1232206497Sluigi		return (0);
1233206497Sluigi	} while (0);
1234206497Sluigi	printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name);
1235206497Sluigi
1236206497Sluigi	/* We cannot send the pending bios anywhere... */
1237206497Sluigi	g_sched_flush_pending(g_sched_blackhole);
1238206497Sluigi
1239206497Sluigi	return (EINVAL);
1240206497Sluigi}
1241206497Sluigi
/*
 * Tear down a sched geom: check (or force past) open counts, detach
 * the proxy if we were proxying, drain the scheduler, release the
 * algorithm's state, and wither the geom.
 */
static int
g_sched_destroy(struct g_geom *gp, boolean_t force)
{
	struct g_provider *pp, *oldpp = NULL;
	struct g_sched_softc *sc;
	struct g_gsched *gsp;
	int error;

	g_topology_assert();
	sc = gp->softc;
	if (sc == NULL)
		return (ENXIO);
	if (!(sc->sc_flags & G_SCHED_PROXYING)) {
		/* Regular node: refuse (unless forced) while still open. */
		pp = LIST_FIRST(&gp->provider);
		if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
			const char *msg = force ?
				"but we force removal" : "cannot remove";

			G_SCHED_DEBUG(!force,
			    "Device %s is still open (r%dw%de%d), %s.",
			    pp->name, pp->acr, pp->acw, pp->ace, msg);
			if (!force)
				return (EBUSY);
		} else {
			G_SCHED_DEBUG(0, "Device %s removed.", gp->name);
		}
	} else
		oldpp = g_detach_proxy(gp);

	gsp = sc->sc_gsched;
	if (gsp) {
		/*
		 * XXX bad hack here: force a dispatch to release
		 * any reference to the hash table still held by
		 * the scheduler.
		 */
		g_sched_lock(gp);
		/*
		 * We are dying here, no new requests should enter
		 * the scheduler.  This is granted by the topology,
		 * either in case we were proxying (new bios are
		 * being redirected) or not (see the access check
		 * above).
		 */
		g_sched_forced_dispatch(gp);
		error = g_sched_wait_pending(gp);

		if (error) {
			/*
			 * Not all the requests came home: this might happen
			 * under heavy load, or if we were waiting for any
			 * bio which is served in the event path (see
			 * geom_slice.c for an example of how this can
			 * happen).  Try to restore a working configuration
			 * if we can fail.
			 */
			if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
				g_sched_flush_pending(force ?
				    g_sched_blackhole : g_sched_start);
			}

			/*
			 * In the forced destroy case there is not so much
			 * we can do, we have pending bios that will call
			 * g_sched_done() somehow, and we don't want them
			 * to crash the system using freed memory.  We tell
			 * the user that something went wrong, and leak some
			 * memory here.
			 * Note: the callers using force = 1 ignore the
			 * return value.
			 */
			if (force) {
				G_SCHED_DEBUG(0, "Pending requests while "
				    " destroying geom, some memory leaked.");
			}

			/*
			 * NOTE(review): this returns without a matching
			 * g_sched_unlock() for the g_sched_lock() taken
			 * above -- confirm whether g_sched_wait_pending()
			 * drops the lock on error.
			 */
			return (error);
		}

		g_sched_unlock(gp);
		g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
		    gsp, sc->sc_data);
		sc->sc_hash = NULL;
		gsp->gs_fini(sc->sc_data);
		g_gsched_unref(gsp);
		sc->sc_gsched = NULL;
	}

	if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
		error = g_destroy_proxy(gp, oldpp);

		if (error) {
			if (force) {
				G_SCHED_DEBUG(0, "Unrecoverable error while "
				    "destroying a proxy geom, leaking some "
				    " memory.");
			}

			return (error);
		}
	}

	mtx_destroy(&sc->sc_mtx);

	g_free(gp->softc);
	gp->softc = NULL;
	g_wither_geom(gp, ENXIO);

	return (error);
}
1352206497Sluigi
/* gctl destroy-geom method: non-forced g_sched_destroy(). */
static int
g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp)
{

	return (g_sched_destroy(gp, 0));
}
1360206497Sluigi
1361206497Sluigi/*
1362206497Sluigi * Functions related to the classification of requests.
1363206497Sluigi *
1364206497Sluigi * On recent FreeBSD versions (8.0 and above), we store a reference
1365206497Sluigi * to the issuer of a request in bp->bio_classifier1 as soon
1366206497Sluigi * as the bio is posted to the geom queue (and not later, because
1367206497Sluigi * requests are managed by the g_down thread afterwards).
1368206497Sluigi *
1369206497Sluigi * On older versions of the system (but this code is not used
1370206497Sluigi * in any existing release), we [ab]use the caller1 field in the
1371206497Sluigi * root element of the bio tree to store the classification info.
1372206497Sluigi * The marking is done at the beginning of g_io_request()
1373206497Sluigi * and only if we find that the field is NULL.
1374206497Sluigi *
1375206497Sluigi * To avoid rebuilding the kernel, this module will patch the
1376206497Sluigi * initial part of g_io_request() so it jumps to some hand-coded
1377206497Sluigi * assembly that does the marking and then executes the original
1378206497Sluigi * body of g_io_request().
1379206497Sluigi *
1380206497Sluigi * fake_ioreq[] is architecture-specific machine code
1381206497Sluigi * that implements the above. CODE_SIZE, STORE_SIZE etc.
1382206497Sluigi * are constants used in the patching routine. Look at the
1383206497Sluigi * code in g_ioreq_patch() for the details.
1384206497Sluigi */
1385206497Sluigi
1386206497Sluigi#ifndef HAVE_BIO_CLASSIFIER
1387206497Sluigi/*
1388206497Sluigi * Support for old FreeBSD versions
1389206497Sluigi */
1390206497Sluigi#if defined(__i386__)
1391206497Sluigi#define	CODE_SIZE	29
1392206497Sluigi#define	STORE_SIZE	5
1393206497Sluigi#define	EPILOGUE	5
1394206497Sluigi#define	SIZE		(CODE_SIZE + STORE_SIZE + EPILOGUE)
1395206497Sluigi
1396206497Sluigistatic u_char fake_ioreq[SIZE] = {
1397206497Sluigi	0x8b, 0x44, 0x24, 0x04,		/* mov bp, %eax */
1398206497Sluigi	/* 1: */
1399206497Sluigi	0x89, 0xc2,			/* mov %eax, %edx # edx = bp */
1400206497Sluigi	0x8b, 0x40, 0x64,		/* mov bp->bio_parent, %eax */
1401206497Sluigi	0x85, 0xc0,			/* test %eax, %eax */
1402206497Sluigi	0x75, 0xf7,			/* jne 1b */
1403206497Sluigi	0x8b, 0x42, 0x30,		/* mov bp->bp_caller1, %eax */
1404206497Sluigi	0x85, 0xc0,			/* test %eax, %eax */
1405206497Sluigi	0x75, 0x09,			/* jne 2f */
1406206497Sluigi	0x64, 0xa1, 0x00, 0x00,		/* mov %fs:0, %eax */
1407206497Sluigi	0x00, 0x00,
1408206497Sluigi	0x89, 0x42, 0x30,		/* mov %eax, bp->bio_caller1 */
1409206497Sluigi	/* 2: */
1410206497Sluigi        0x55, 0x89, 0xe5, 0x57, 0x56,
1411206497Sluigi	0xe9, 0x00, 0x00, 0x00, 0x00,	/* jmp back... */
1412206497Sluigi};
1413206497Sluigi#elif defined(__amd64)
1414206497Sluigi#define	CODE_SIZE	38
1415206497Sluigi#define	STORE_SIZE	6
1416206497Sluigi#define	EPILOGUE	5
1417206497Sluigi#define	SIZE		(CODE_SIZE + STORE_SIZE + EPILOGUE)
1418206497Sluigi
1419206497Sluigistatic u_char fake_ioreq[SIZE] = {
1420206497Sluigi	0x48, 0x89, 0xf8,		/* mov bp, %rax */
1421206497Sluigi	/* 1: */
1422206497Sluigi	0x48, 0x89, 0xc2,		/* mov %rax, %rdx # rdx = bp */
1423206497Sluigi	0x48, 0x8b, 0x82, 0xa8,		/* mov bp->bio_parent, %rax */
1424206497Sluigi	0x00, 0x00, 0x00,
1425206497Sluigi	0x48, 0x85, 0xc0,		/* test %rax, %rax */
1426206497Sluigi	0x75, 0xf1,			/* jne 1b */
1427206497Sluigi	0x48, 0x83, 0x7a, 0x58,		/* cmp $0, bp->bp_caller1 */
1428206497Sluigi	0x00,
1429206497Sluigi	0x75, 0x0d,			/* jne 2f */
1430206497Sluigi	0x65, 0x48, 0x8b, 0x04,		/* mov %gs:0, %rax */
1431206497Sluigi	0x25, 0x00, 0x00, 0x00,
1432206497Sluigi	0x00,
1433206497Sluigi	0x48, 0x89, 0x42, 0x58,		/* mov %rax, bp->bio_caller1 */
1434206497Sluigi	/* 2: */
1435206497Sluigi	0x55, 0x48, 0x89, 0xe5, 0x41, 0x56,
1436206497Sluigi	0xe9, 0x00, 0x00, 0x00, 0x00,	/* jmp back... */
1437206497Sluigi};
1438206497Sluigi#else /* neither x86 nor amd64 */
1439206497Sluigistatic void
1440206497Sluigig_new_io_request(struct bio *bp, struct g_consumer *cp)
1441206497Sluigi{
1442206497Sluigi	struct bio *top = bp;
1443206497Sluigi
1444206497Sluigi        /*
1445206497Sluigi         * bio classification: if bio_caller1 is available in the
1446206497Sluigi         * root of the 'struct bio' tree, store there the thread id
1447206497Sluigi         * of the thread that originated the request.
1448206497Sluigi         * More sophisticated classification schemes can be used.
1449206497Sluigi         */
1450206497Sluigi	while (top->bio_parent)
1451206497Sluigi		top = top->bio_parent;
1452206497Sluigi
1453206497Sluigi	if (top->bio_caller1 == NULL)
1454206497Sluigi		top->bio_caller1 = curthread;
1455206497Sluigi}
1456206497Sluigi
1457206497Sluigi#error please add the code above in g_new_io_request() to the beginning of \
1458206497Sluigi	/sys/geom/geom_io.c::g_io_request(), and remove this line.
1459206497Sluigi#endif /* end of arch-specific code */
1460206497Sluigi
/*
 * Patch the live g_io_request() so it first runs the classification
 * trampoline in fake_ioreq[].  Returns 0 on success, -1 if already
 * patched or if the function prologue does not match the bytes the
 * trampoline expects (e.g. built by a different compiler).
 */
static int
g_ioreq_patch(void)
{
	u_char *original;
	u_long ofs;
	int found;

	/* Never patch twice. */
	if (me.gs_patched)
		return (-1);

	original = (u_char *)g_io_request;

	/*
	 * The trampoline carries a copy of the prologue it replaces;
	 * only proceed if the live code matches it exactly.
	 */
	found = !bcmp(original, fake_ioreq + CODE_SIZE, STORE_SIZE);
	if (!found)
		return (-1);

	/* Jump back to the original + STORE_SIZE. */
	ofs = (original + STORE_SIZE) - (fake_ioreq + SIZE);
	bcopy(&ofs, fake_ioreq + CODE_SIZE + STORE_SIZE + 1, 4);

	/* Patch the original address with a jump to the trampoline. */
	*original = 0xe9;     /* jump opcode */
	ofs = fake_ioreq - (original + 5);
	bcopy(&ofs, original + 1, 4);

	me.gs_patched = 1;

	return (0);
}
1490206497Sluigi
1491206497Sluigi/*
1492206497Sluigi * Restore the original code, this is easy.
1493206497Sluigi */
static void
g_ioreq_restore(void)
{
	u_char *original;

	/* Undo g_ioreq_patch(): copy the saved prologue back in place. */
	if (me.gs_patched) {
		original = (u_char *)g_io_request;
		bcopy(fake_ioreq + CODE_SIZE, original, STORE_SIZE);
		me.gs_patched = 0;
	}
}
1505206497Sluigi
/*
 * Enable classification on old FreeBSD by patching g_io_request().
 * NOTE(review): the g_ioreq_patch() return value is ignored; if the
 * patch fails, requests will go unclassified -- confirm this is the
 * intended degradation.
 */
static inline void
g_classifier_ini(void)
{

	g_ioreq_patch();
}
1512206497Sluigi
/* Disable classification: restore the original g_io_request() code. */
static inline void
g_classifier_fini(void)
{

	g_ioreq_restore();
}
1519206497Sluigi
1520206497Sluigi/*--- end of support code for older FreeBSD versions */
1521206497Sluigi
1522206497Sluigi#else /* HAVE_BIO_CLASSIFIER */
1523206497Sluigi
1524206497Sluigi/*
1525206497Sluigi * Classifier support for recent FreeBSD versions: we use
1526206497Sluigi * a very simple classifier, only use curthread to tag a request.
1527206497Sluigi * The classifier is registered at module load, and unregistered
1528206497Sluigi * at module unload.
1529206497Sluigi */
/*
 * Classifier hook: tag the request with the issuing thread.
 * Always returns 1 (request classified).
 */
static int
g_sched_tag(void *arg, struct bio *bp)
{

	bp->bio_classifier1 = curthread;
	return (1);
}
1537206497Sluigi
1538206497Sluigistatic struct g_classifier_hook g_sched_classifier = {
1539206497Sluigi	.func =	g_sched_tag,
1540206497Sluigi};
1541206497Sluigi
/*
 * Register the classifier hook at module load.
 * NOTE(review): the g_register_classifier() return value is ignored;
 * if registration fails, requests go untagged -- confirm intended.
 */
static inline void
g_classifier_ini(void)
{

	g_register_classifier(&g_sched_classifier);
}
1548206497Sluigi
/* Unregister the classifier hook at module unload. */
static inline void
g_classifier_fini(void)
{

	g_unregister_classifier(&g_sched_classifier);
}
1555206497Sluigi#endif /* HAVE_BIO_CLASSIFIER */
1556206497Sluigi
/*
 * Class init method: make sure the module-global state is set up
 * (this may run before or after g_gsched_modevent at boot) and enable
 * request classification.
 */
static void
g_sched_init(struct g_class *mp)
{

	g_gsched_global_init();

	G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.",
	    mp, &g_sched_class);

	/* Patch g_io_request to store classification info in the bio. */
	g_classifier_ini();
}
1569206497Sluigi
/*
 * Class fini method: disable classification and release the global
 * state.  All algorithm modules must already be unregistered.
 */
static void
g_sched_fini(struct g_class *mp)
{

	g_classifier_fini();

	G_SCHED_DEBUG(0, "Unloading...");

	KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers"));
	mtx_destroy(&me.gs_mtx);
}
1581206497Sluigi
1582210747Saestatic int
1583210747Saeg_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag,
1584210747Sae    struct thread *td)
1585210747Sae{
1586210747Sae	struct g_consumer *cp;
1587210747Sae	struct g_geom *gp;
1588210747Sae
1589210747Sae	cp = LIST_FIRST(&pp->geom->consumer);
1590210747Sae	if (cp == NULL)
1591210747Sae		return (ENOIOCTL);
1592210747Sae	gp = cp->provider->geom;
1593210747Sae	if (gp->ioctl == NULL)
1594210747Sae		return (ENOIOCTL);
1595210747Sae	return (gp->ioctl(cp->provider, cmd, data, fflag, td));
1596210747Sae}
1597210747Sae
1598206497Sluigi/*
1599206497Sluigi * Read the i-th argument for a request, skipping the /dev/
1600206497Sluigi * prefix if present.
1601206497Sluigi */
/*
 * Fetch the i-th "argN" request argument, stripping a leading
 * "/dev/" prefix when present.  Returns NULL (with a gctl error set)
 * if the argument is missing.
 */
static const char *
g_sched_argi(struct gctl_req *req, int i)
{
	static const char *dev_prefix = "/dev/";
	const char *name;
	char param[16];
	int plen;

	plen = strlen(dev_prefix);
	snprintf(param, sizeof(param), "arg%d", i);
	name = gctl_get_asciiparam(req, param);
	if (name == NULL) {
		gctl_error(req, "No 'arg%d' argument", i);
		return (name);
	}
	if (strncmp(name, dev_prefix, plen) == 0)
		name += plen;
	return (name);
}
1618206497Sluigi
1619206497Sluigi/*
1620206497Sluigi * Fetch nargs and do appropriate checks.
1621206497Sluigi */
1622206497Sluigistatic int
1623206497Sluigig_sched_get_nargs(struct gctl_req *req)
1624206497Sluigi{
1625206497Sluigi	int *nargs;
1626206497Sluigi
1627206497Sluigi	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
1628206497Sluigi	if (nargs == NULL) {
1629206497Sluigi		gctl_error(req, "No 'nargs' argument");
1630206497Sluigi		return (0);
1631206497Sluigi	}
1632206497Sluigi	if (*nargs <= 0)
1633206497Sluigi		gctl_error(req, "Missing device(s).");
1634206497Sluigi	return (*nargs);
1635206497Sluigi}
1636206497Sluigi
1637206497Sluigi/*
1638206497Sluigi * Check whether we should add the class on certain volumes when
1639206497Sluigi * this geom is created. Right now this is under control of a kenv
1640206497Sluigi * variable containing the names of all devices that we care about.
1641206497Sluigi * Probably we should only support transparent insertion as the
1642206497Sluigi * preferred mode of operation.
1643206497Sluigi */
1644206497Sluigistatic struct g_geom *
1645206497Sluigig_sched_taste(struct g_class *mp, struct g_provider *pp,
1646206497Sluigi		int flags __unused)
1647206497Sluigi{
1648206497Sluigi	struct g_gsched *gsp = NULL;	/* the . algorithm we want */
1649206497Sluigi	const char *s;			/* generic string pointer */
1650206497Sluigi	const char *taste_names;	/* devices we like */
1651206497Sluigi	int l;
1652206497Sluigi
1653206497Sluigi        g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__,
1654206497Sluigi	    mp->name, pp->name);
1655206497Sluigi        g_topology_assert();
1656206497Sluigi
1657206497Sluigi        G_SCHED_DEBUG(2, "Tasting %s.", pp->name);
1658206497Sluigi
1659206497Sluigi	do {
1660206497Sluigi		/* do not taste on ourselves */
1661206497Sluigi		if (pp->geom->class == mp)
1662206497Sluigi                	break;
1663206497Sluigi
1664206497Sluigi		taste_names = getenv("geom.sched.taste");
1665206497Sluigi		if (taste_names == NULL)
1666206497Sluigi			break;
1667206497Sluigi
1668206497Sluigi		l = strlen(pp->name);
1669206497Sluigi		for (s = taste_names; *s &&
1670206497Sluigi		    (s = strstr(s, pp->name)); s++) {
1671206497Sluigi			/* further checks for an exact match */
1672206497Sluigi			if ( (s == taste_names || s[-1] == ' ') &&
1673206497Sluigi			     (s[l] == '\0' || s[l] == ' ') )
1674206497Sluigi				break;
1675206497Sluigi		}
1676206497Sluigi		if (s == NULL)
1677206497Sluigi			break;
1678206497Sluigi		G_SCHED_DEBUG(0, "Attach device %s match [%s]\n",
1679206497Sluigi		    pp->name, s);
1680206497Sluigi
1681206497Sluigi		/* look up the provider name in the list */
1682206497Sluigi		s = getenv("geom.sched.algo");
1683206497Sluigi		if (s == NULL)
1684206497Sluigi			s = "rr";
1685206497Sluigi
1686206497Sluigi		gsp = g_gsched_find(s);	/* also get a reference */
1687206497Sluigi		if (gsp == NULL) {
1688206497Sluigi			G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s);
1689206497Sluigi			break;
1690206497Sluigi		}
1691206497Sluigi
1692206497Sluigi		/* XXX create with 1 as last argument ? */
1693206497Sluigi		g_sched_create(NULL, mp, pp, gsp, 0);
1694206497Sluigi		g_gsched_unref(gsp);
1695206497Sluigi	} while (0);
1696206497Sluigi	return NULL;
1697206497Sluigi}
1698206497Sluigi
1699206497Sluigistatic void
1700206497Sluigig_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy)
1701206497Sluigi{
1702206497Sluigi	struct g_provider *pp;
1703206497Sluigi	struct g_gsched *gsp;
1704206497Sluigi	const char *name;
1705206497Sluigi	int i, nargs;
1706206497Sluigi
1707206497Sluigi	g_topology_assert();
1708206497Sluigi
1709206497Sluigi	name = gctl_get_asciiparam(req, "algo");
1710206497Sluigi	if (name == NULL) {
1711206497Sluigi		gctl_error(req, "No '%s' argument", "algo");
1712206497Sluigi		return;
1713206497Sluigi	}
1714206497Sluigi
1715206497Sluigi	gsp = g_gsched_find(name);	/* also get a reference */
1716206497Sluigi	if (gsp == NULL) {
1717206497Sluigi		gctl_error(req, "Bad algorithm '%s'", name);
1718206497Sluigi		return;
1719206497Sluigi	}
1720206497Sluigi
1721206497Sluigi	nargs = g_sched_get_nargs(req);
1722206497Sluigi
1723206497Sluigi	/*
1724206497Sluigi	 * Run on the arguments, and break on any error.
1725206497Sluigi	 * We look for a device name, but skip the /dev/ prefix if any.
1726206497Sluigi	 */
1727206497Sluigi	for (i = 0; i < nargs; i++) {
1728206497Sluigi		name = g_sched_argi(req, i);
1729206497Sluigi		if (name == NULL)
1730206497Sluigi			break;
1731206497Sluigi		pp = g_provider_by_name(name);
1732206497Sluigi		if (pp == NULL) {
1733206497Sluigi			G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1734206497Sluigi			gctl_error(req, "Provider %s is invalid.", name);
1735206497Sluigi			break;
1736206497Sluigi		}
1737206497Sluigi		if (g_sched_create(req, mp, pp, gsp, proxy) != 0)
1738206497Sluigi			break;
1739206497Sluigi	}
1740206497Sluigi
1741206497Sluigi	g_gsched_unref(gsp);
1742206497Sluigi}
1743206497Sluigi
1744206497Sluigistatic void
1745206497Sluigig_sched_ctl_configure(struct gctl_req *req, struct g_class *mp)
1746206497Sluigi{
1747206497Sluigi	struct g_provider *pp;
1748206497Sluigi	struct g_gsched *gsp;
1749206497Sluigi	const char *name;
1750206497Sluigi	int i, nargs;
1751206497Sluigi
1752206497Sluigi	g_topology_assert();
1753206497Sluigi
1754206497Sluigi	name = gctl_get_asciiparam(req, "algo");
1755206497Sluigi	if (name == NULL) {
1756206497Sluigi		gctl_error(req, "No '%s' argument", "algo");
1757206497Sluigi		return;
1758206497Sluigi	}
1759206497Sluigi
1760206497Sluigi	gsp = g_gsched_find(name);	/* also get a reference */
1761206497Sluigi	if (gsp == NULL) {
1762206497Sluigi		gctl_error(req, "Bad algorithm '%s'", name);
1763206497Sluigi		return;
1764206497Sluigi	}
1765206497Sluigi
1766206497Sluigi	nargs = g_sched_get_nargs(req);
1767206497Sluigi
1768206497Sluigi	/*
1769206497Sluigi	 * Run on the arguments, and break on any error.
1770206497Sluigi	 * We look for a device name, but skip the /dev/ prefix if any.
1771206497Sluigi	 */
1772206497Sluigi	for (i = 0; i < nargs; i++) {
1773206497Sluigi		name = g_sched_argi(req, i);
1774206497Sluigi		if (name == NULL)
1775206497Sluigi			break;
1776206497Sluigi		pp = g_provider_by_name(name);
1777206497Sluigi		if (pp == NULL || pp->geom->class != mp) {
1778206497Sluigi			G_SCHED_DEBUG(1, "Provider %s is invalid.", name);
1779206497Sluigi			gctl_error(req, "Provider %s is invalid.", name);
1780206497Sluigi			break;
1781206497Sluigi		}
1782206497Sluigi		if (g_sched_change_algo(req, mp, pp, gsp) != 0)
1783206497Sluigi			break;
1784206497Sluigi	}
1785206497Sluigi
1786206497Sluigi	g_gsched_unref(gsp);
1787206497Sluigi}
1788206497Sluigi
1789206497Sluigistatic struct g_geom *
1790206497Sluigig_sched_find_geom(struct g_class *mp, const char *name)
1791206497Sluigi{
1792206497Sluigi	struct g_geom *gp;
1793206497Sluigi
1794206497Sluigi	LIST_FOREACH(gp, &mp->geom, geom) {
1795206497Sluigi		if (strcmp(gp->name, name) == 0)
1796206497Sluigi			return (gp);
1797206497Sluigi	}
1798206497Sluigi	return (NULL);
1799206497Sluigi}
1800206497Sluigi
1801206497Sluigistatic void
1802206497Sluigig_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp)
1803206497Sluigi{
1804206497Sluigi	int nargs, *force, error, i;
1805206497Sluigi	struct g_geom *gp;
1806206497Sluigi	const char *name;
1807206497Sluigi
1808206497Sluigi	g_topology_assert();
1809206497Sluigi
1810206497Sluigi	nargs = g_sched_get_nargs(req);
1811206497Sluigi
1812206497Sluigi	force = gctl_get_paraml(req, "force", sizeof(*force));
1813206497Sluigi	if (force == NULL) {
1814206497Sluigi		gctl_error(req, "No 'force' argument");
1815206497Sluigi		return;
1816206497Sluigi	}
1817206497Sluigi
1818206497Sluigi	for (i = 0; i < nargs; i++) {
1819206497Sluigi		name = g_sched_argi(req, i);
1820206497Sluigi		if (name == NULL)
1821206497Sluigi			break;
1822206497Sluigi
1823206497Sluigi		gp = g_sched_find_geom(mp, name);
1824206497Sluigi		if (gp == NULL) {
1825206497Sluigi			G_SCHED_DEBUG(1, "Device %s is invalid.", name);
1826206497Sluigi			gctl_error(req, "Device %s is invalid.", name);
1827206497Sluigi			break;
1828206497Sluigi		}
1829206497Sluigi
1830206497Sluigi		error = g_sched_destroy(gp, *force);
1831206497Sluigi		if (error != 0) {
1832206497Sluigi			gctl_error(req, "Cannot destroy device %s (error=%d).",
1833206497Sluigi			    gp->name, error);
1834206497Sluigi			break;
1835206497Sluigi		}
1836206497Sluigi	}
1837206497Sluigi}
1838206497Sluigi
1839206497Sluigistatic void
1840206497Sluigig_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb)
1841206497Sluigi{
1842206497Sluigi	uint32_t *version;
1843206497Sluigi
1844206497Sluigi	g_topology_assert();
1845206497Sluigi
1846206497Sluigi	version = gctl_get_paraml(req, "version", sizeof(*version));
1847206497Sluigi	if (version == NULL) {
1848206497Sluigi		gctl_error(req, "No '%s' argument.", "version");
1849206497Sluigi		return;
1850206497Sluigi	}
1851206497Sluigi
1852206497Sluigi	if (*version != G_SCHED_VERSION) {
1853206497Sluigi		gctl_error(req, "Userland and kernel parts are "
1854206497Sluigi		    "out of sync.");
1855206497Sluigi		return;
1856206497Sluigi	}
1857206497Sluigi
1858206497Sluigi	if (strcmp(verb, "create") == 0) {
1859206497Sluigi		g_sched_ctl_create(req, mp, 0);
1860206497Sluigi		return;
1861206497Sluigi	} else if (strcmp(verb, "insert") == 0) {
1862206497Sluigi		g_sched_ctl_create(req, mp, 1);
1863206497Sluigi		return;
1864206497Sluigi	} else if (strcmp(verb, "configure") == 0) {
1865206497Sluigi		g_sched_ctl_configure(req, mp);
1866206497Sluigi		return;
1867206497Sluigi	} else if (strcmp(verb, "destroy") == 0) {
1868206497Sluigi		g_sched_ctl_destroy(req, mp);
1869206497Sluigi		return;
1870206497Sluigi	}
1871206497Sluigi
1872206497Sluigi	gctl_error(req, "Unknown verb.");
1873206497Sluigi}
1874206497Sluigi
1875206497Sluigistatic void
1876206497Sluigig_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
1877206497Sluigi    struct g_consumer *cp, struct g_provider *pp)
1878206497Sluigi{
1879206497Sluigi	struct g_sched_softc *sc = gp->softc;
1880206497Sluigi	struct g_gsched *gsp = sc->sc_gsched;
1881206497Sluigi	if (indent == NULL) {	/* plaintext */
1882206497Sluigi		sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--");
1883206497Sluigi	}
1884210795Sae	if (gsp != NULL && gsp->gs_dumpconf)
1885206497Sluigi		gsp->gs_dumpconf(sb, indent, gp, cp, pp);
1886206497Sluigi}
1887206497Sluigi
/* Register the class with GEOM and advertise the module version. */
DECLARE_GEOM_CLASS(g_sched_class, g_sched);
MODULE_VERSION(geom_sched, 0);
1890