/* gs_rr.c — FreeBSD stable/10, revision 314667 */
/*-
 * Copyright (c) 2009-2010 Fabio Checconi
 * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id$
 * $FreeBSD: stable/10/sys/geom/sched/gs_rr.c 314667 2017-03-04 13:03:31Z avg $
 *
 * A round-robin (RR) anticipatory scheduler, with per-client queues.
 *
 * The goal of this implementation is to improve throughput compared
 * to the pure elevator algorithm, and insure some fairness among
 * clients.
 *
 * Requests coming from the same client are put in the same queue.
 * We use anticipation to help reducing seeks, and each queue
 * is never served continuously for more than a given amount of
 * time or data. Queues are then served in a round-robin fashion.
 *
 * Each queue can be in any of the following states:
 *     READY	immediately serve the first pending request;
 *     BUSY	one request is under service, wait for completion;
 *     IDLING	do not serve incoming requests immediately, unless
 * 		they are "eligible" as defined later.
 *
 * Scheduling is made looking at the status of all queues,
 * and the first one in round-robin order is privileged.
 */

53204431Sraj#include <sys/param.h>
54204431Sraj#include <sys/systm.h>
55204431Sraj#include <sys/kernel.h>
56204431Sraj#include <sys/bio.h>
57204431Sraj#include <sys/callout.h>
58204431Sraj#include <sys/malloc.h>
59204431Sraj#include <sys/module.h>
60204431Sraj#include <sys/proc.h>
61204431Sraj#include <sys/queue.h>
62204431Sraj#include <sys/sbuf.h>
63204431Sraj#include <sys/sysctl.h>
64204431Sraj#include "gs_scheduler.h"
65204431Sraj
66204431Sraj/* possible states of the scheduler */
67204431Srajenum g_rr_state {
68204431Sraj	G_QUEUE_READY = 0,	/* Ready to dispatch. */
69204431Sraj	G_QUEUE_BUSY,		/* Waiting for a completion. */
70204431Sraj	G_QUEUE_IDLING		/* Waiting for a new request. */
71204431Sraj};
72204431Sraj
73204433Sraj/* possible queue flags */
74204431Srajenum g_rr_flags {
75204431Sraj	/* G_FLAG_COMPLETED means that the field q_slice_end is valid. */
76204431Sraj	G_FLAG_COMPLETED = 1,	/* Completed a req. in the current budget. */
77204431Sraj};
78204431Sraj
79204431Srajstruct g_rr_softc;
80204431Sraj
81204431Sraj/*
82204431Sraj * Queue descriptor, containing reference count, scheduling
83204431Sraj * state, a queue of pending requests, configuration parameters.
84204431Sraj * Queues with pending request(s) and not under service are also
85204433Sraj * stored in a Round Robin (RR) list.
86204431Sraj */
87204431Srajstruct g_rr_queue {
88204431Sraj	struct g_rr_softc *q_sc;	/* link to the parent */
89204431Sraj
90204431Sraj	enum g_rr_state	q_status;
91204431Sraj	unsigned int	q_service;	/* service received so far */
92204431Sraj	int		q_slice_end;	/* actual slice end time, in ticks */
93204431Sraj	enum g_rr_flags	q_flags;	/* queue flags */
94204431Sraj	struct bio_queue_head q_bioq;
95204431Sraj
96204431Sraj	/* Scheduling parameters */
97204431Sraj	unsigned int	q_budget;	/* slice size in bytes */
98204431Sraj	unsigned int	q_slice_duration; /* slice size in ticks */
99204431Sraj	unsigned int	q_wait_ticks;	/* wait time for anticipation */
100204431Sraj
101204431Sraj	/* Stats to drive the various heuristics. */
102204431Sraj	struct g_savg	q_thinktime;	/* Thinktime average. */
103204431Sraj	struct g_savg	q_seekdist;	/* Seek distance average. */
104204431Sraj
105204431Sraj	int		q_bionum;	/* Number of requests. */
106204431Sraj
107204431Sraj	off_t		q_lastoff;	/* Last submitted req. offset. */
108204431Sraj	int		q_lastsub;	/* Last submitted req. time. */
109204431Sraj
110204431Sraj	/* Expiration deadline for an empty queue. */
111204431Sraj	int		q_expire;
112204431Sraj
113204431Sraj	TAILQ_ENTRY(g_rr_queue) q_tailq; /* RR list link field */
114204431Sraj};
115204431Sraj
116204431Sraj/* List types. */
117204431SrajTAILQ_HEAD(g_rr_tailq, g_rr_queue);
118204431Sraj
119204431Sraj/* list of scheduler instances */
120204431SrajLIST_HEAD(g_scheds, g_rr_softc);
121204431Sraj
122204431Sraj/* Default quantum for RR between queues. */
123204431Sraj#define	G_RR_DEFAULT_BUDGET	0x00800000
124204431Sraj
125204431Sraj/*
126204431Sraj * Per device descriptor, holding the Round Robin list of queues
127204431Sraj * accessing the disk, a reference to the geom, and the timer.
128204431Sraj */
129204431Srajstruct g_rr_softc {
130204431Sraj	struct g_geom	*sc_geom;
131204431Sraj
132204431Sraj	/*
133204431Sraj	 * sc_active is the queue we are anticipating for.
134204431Sraj	 * It is set only in gs_rr_next(), and possibly cleared
135204431Sraj	 * only in gs_rr_next() or on a timeout.
136204431Sraj	 * The active queue is never in the Round Robin list
137204431Sraj	 * even if it has requests queued.
138204431Sraj	 */
139204431Sraj	struct g_rr_queue *sc_active;
140204431Sraj	struct callout	sc_wait;	/* timer for sc_active */
141204431Sraj
142204431Sraj	struct g_rr_tailq sc_rr_tailq;	/* the round-robin list */
143204431Sraj	int		sc_nqueues;	/* number of queues */
144204431Sraj
145204431Sraj	/* Statistics */
146204431Sraj	int		sc_in_flight;	/* requests in the driver */
147204431Sraj
148204431Sraj	LIST_ENTRY(g_rr_softc)	sc_next;
149204431Sraj};
150204431Sraj
151204431Sraj/* Descriptor for bounded values, min and max are constant. */
152204431Srajstruct x_bound {
153204431Sraj	const int	x_min;
154204431Sraj	int		x_cur;
155204431Sraj	const int	x_max;
156204431Sraj};
157204431Sraj
158204431Sraj/*
159204431Sraj * parameters, config and stats
160204431Sraj */
161204431Srajstruct g_rr_params {
162204431Sraj	int	queues;			/* total number of queues */
163204431Sraj	int	w_anticipate;		/* anticipate writes */
164204431Sraj	int	bypass;			/* bypass scheduling writes */
165204431Sraj
166204431Sraj	int	units;			/* how many instances */
167204431Sraj	/* sc_head is used for debugging */
168204431Sraj	struct g_scheds	sc_head;	/* first scheduler instance */
169204431Sraj
170204431Sraj	struct x_bound queue_depth;	/* max parallel requests */
171204431Sraj	struct x_bound wait_ms;		/* wait time, milliseconds */
172204431Sraj	struct x_bound quantum_ms;	/* quantum size, milliseconds */
173204431Sraj	struct x_bound quantum_kb;	/* quantum size, Kb (1024 bytes) */
174204431Sraj
175204431Sraj	/* statistics */
176204431Sraj	int	wait_hit;		/* success in anticipation */
177204431Sraj	int	wait_miss;		/* failure in anticipation */
178204431Sraj};
179204431Sraj
180204431Sraj/*
181204431Sraj * Default parameters for the scheduler.  The quantum sizes target
182204431Sraj * a 80MB/s disk; if the hw is faster or slower the minimum of the
183204431Sraj * two will have effect: the clients will still be isolated but
184204431Sraj * the fairness may be limited.  A complete solution would involve
185204431Sraj * the on-line measurement of the actual disk throughput to derive
186204431Sraj * these parameters.  Or we may just choose to ignore service domain
187204431Sraj * fairness and accept what can be achieved with time-only budgets.
188204431Sraj */
189204431Srajstatic struct g_rr_params me = {
190204431Sraj	.sc_head = LIST_HEAD_INITIALIZER(&me.sc_head),
191204431Sraj	.w_anticipate =	1,
192204431Sraj	.queue_depth =	{ 1,	1,	50 },
193204431Sraj	.wait_ms =	{ 1, 	10,	30 },
194204431Sraj	.quantum_ms =	{ 1, 	100,	500 },
195204431Sraj	.quantum_kb =	{ 16, 	8192,	65536 },
196204431Sraj};
197204431Sraj
198204431Srajstruct g_rr_params *gs_rr_me = &me;
199204431Sraj
200204431SrajSYSCTL_DECL(_kern_geom_sched);
201204431Srajstatic SYSCTL_NODE(_kern_geom_sched, OID_AUTO, rr, CTLFLAG_RW, 0,
202204431Sraj    "GEOM_SCHED ROUND ROBIN stuff");
203204431SrajSYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, units, CTLFLAG_RD,
204204431Sraj    &me.units, 0, "Scheduler instances");
205204431SrajSYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, queues, CTLFLAG_RD,
206204431Sraj    &me.queues, 0, "Total rr queues");
207204431SrajSYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_ms, CTLFLAG_RW,
208204431Sraj    &me.wait_ms.x_cur, 0, "Wait time milliseconds");
209204431SrajSYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, quantum_ms, CTLFLAG_RW,
210204431Sraj    &me.quantum_ms.x_cur, 0, "Quantum size milliseconds");
211204431SrajSYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, bypass, CTLFLAG_RW,
212204431Sraj    &me.bypass, 0, "Bypass scheduler");
213204431SrajSYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, w_anticipate, CTLFLAG_RW,
214204431Sraj    &me.w_anticipate, 0, "Do anticipation on writes");
215204431SrajSYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, quantum_kb, CTLFLAG_RW,
216204431Sraj    &me.quantum_kb.x_cur, 0, "Quantum size Kbytes");
217204431SrajSYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, queue_depth, CTLFLAG_RW,
218204431Sraj    &me.queue_depth.x_cur, 0, "Maximum simultaneous requests");
219204431SrajSYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_hit, CTLFLAG_RW,
220204431Sraj    &me.wait_hit, 0, "Hits in anticipation");
221204431SrajSYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_miss, CTLFLAG_RW,
222204431Sraj    &me.wait_miss, 0, "Misses in anticipation");
223204431Sraj
224204431Sraj#ifdef DEBUG_QUEUES
225204431Sraj/* print the status of a queue */
226204431Srajstatic void
227204431Srajgs_rr_dump_q(struct g_rr_queue *qp, int index)
228204431Sraj{
229204431Sraj	int l = 0;
230204431Sraj	struct bio *bp;
231204431Sraj
232204431Sraj	TAILQ_FOREACH(bp, &(qp->q_bioq.queue), bio_queue) {
233204431Sraj		l++;
234204431Sraj	}
235204431Sraj	printf("--- rr queue %d %p status %d len %d ---\n",
236204431Sraj	    index, qp, qp->q_status, l);
237204431Sraj}
238204431Sraj
239204431Sraj/*
240204433Sraj * Dump the scheduler status when writing to this sysctl variable.
241204431Sraj * XXX right now we only dump the status of the last instance created.
242204431Sraj * not a severe issue because this is only for debugging
243204431Sraj */
244204431Srajstatic int
245204431Srajgs_rr_sysctl_status(SYSCTL_HANDLER_ARGS)
246204431Sraj{
247204431Sraj        int error, val = 0;
248204431Sraj	struct g_rr_softc *sc;
249204433Sraj
250204433Sraj        error = sysctl_handle_int(oidp, &val, 0, req);
251204431Sraj        if (error || !req->newptr )
252204431Sraj                return (error);
253204431Sraj
254204431Sraj        printf("called %s\n", __FUNCTION__);
255204431Sraj
256204431Sraj	LIST_FOREACH(sc, &me.sc_head, sc_next) {
257		int i, tot = 0;
258		printf("--- sc %p active %p nqueues %d "
259		    "callout %d in_flight %d ---\n",
260		    sc, sc->sc_active, sc->sc_nqueues,
261		    callout_active(&sc->sc_wait),
262		    sc->sc_in_flight);
263		for (i = 0; i < G_RR_HASH_SIZE; i++) {
264			struct g_rr_queue *qp;
265			LIST_FOREACH(qp, &sc->sc_hash[i], q_hash) {
266				gs_rr_dump_q(qp, tot);
267				tot++;
268			}
269		}
270	}
271        return (0);
272}
273
274SYSCTL_PROC(_kern_geom_sched_rr, OID_AUTO, status,
275	CTLTYPE_UINT | CTLFLAG_RW,
276    0, sizeof(int), gs_rr_sysctl_status, "I", "status");
277
278#endif	/* DEBUG_QUEUES */
279
280/*
281 * Get a bounded value, optionally convert to a min of t_min ticks.
282 */
283static int
284get_bounded(struct x_bound *v, int t_min)
285{
286	int x;
287
288	x = v->x_cur;
289	if (x < v->x_min)
290		x = v->x_min;
291	else if (x > v->x_max)
292		x = v->x_max;
293	if (t_min) {
294		x = x * hz / 1000;	/* convert to ticks */
295		if (x < t_min)
296			x = t_min;
297	}
298	return x;
299}
300
301/*
302 * Get a reference to the queue for bp, using the generic
303 * classification mechanism.
304 */
305static struct g_rr_queue *
306g_rr_queue_get(struct g_rr_softc *sc, struct bio *bp)
307{
308
309	return (g_sched_get_class(sc->sc_geom, bp));
310}
311
312static int
313g_rr_init_class(void *data, void *priv)
314{
315	struct g_rr_softc *sc = data;
316	struct g_rr_queue *qp = priv;
317
318	gs_bioq_init(&qp->q_bioq);
319
320	/*
321	 * Set the initial parameters for the client:
322	 * slice size in bytes and ticks, and wait ticks.
323	 * Right now these are constant, but we could have
324	 * autoconfiguration code to adjust the values based on
325	 * the actual workload.
326	 */
327	qp->q_budget = 1024 * get_bounded(&me.quantum_kb, 0);
328	qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
329	qp->q_wait_ticks = get_bounded(&me.wait_ms, 2);
330
331	qp->q_sc = sc;		/* link to the parent */
332	qp->q_sc->sc_nqueues++;
333	me.queues++;
334
335	return (0);
336}
337
338/*
339 * Release a reference to the queue.
340 */
341static void
342g_rr_queue_put(struct g_rr_queue *qp)
343{
344
345	g_sched_put_class(qp->q_sc->sc_geom, qp);
346}
347
348static void
349g_rr_fini_class(void *data, void *priv)
350{
351	struct g_rr_queue *qp = priv;
352
353	KASSERT(gs_bioq_first(&qp->q_bioq) == NULL,
354			("released nonempty queue"));
355	qp->q_sc->sc_nqueues--;
356	me.queues--;
357}
358
359static inline int
360g_rr_queue_expired(struct g_rr_queue *qp)
361{
362
363	if (qp->q_service >= qp->q_budget)
364		return (1);
365
366	if ((qp->q_flags & G_FLAG_COMPLETED) &&
367	    ticks - qp->q_slice_end >= 0)
368		return (1);
369
370	return (0);
371}
372
373static inline int
374g_rr_should_anticipate(struct g_rr_queue *qp, struct bio *bp)
375{
376	int wait = get_bounded(&me.wait_ms, 2);
377
378	if (!me.w_anticipate && (bp->bio_cmd & BIO_WRITE))
379		return (0);
380
381	if (g_savg_valid(&qp->q_thinktime) &&
382	    g_savg_read(&qp->q_thinktime) > wait)
383		return (0);
384
385	if (g_savg_valid(&qp->q_seekdist) &&
386	    g_savg_read(&qp->q_seekdist) > 8192)
387		return (0);
388
389	return (1);
390}
391
392/*
393 * Called on a request arrival, timeout or completion.
394 * Try to serve a request among those queued.
395 */
396static struct bio *
397g_rr_next(void *data, int force)
398{
399	struct g_rr_softc *sc = data;
400	struct g_rr_queue *qp;
401	struct bio *bp, *next;
402	int expired;
403
404	qp = sc->sc_active;
405	if (me.bypass == 0 && !force) {
406		if (sc->sc_in_flight >= get_bounded(&me.queue_depth, 0))
407			return (NULL);
408
409		/* Try with the queue under service first. */
410		if (qp != NULL && qp->q_status != G_QUEUE_READY) {
411			/*
412			 * Queue is anticipating, ignore request.
413			 * We should check that we are not past
414			 * the timeout, but in that case the timeout
415			 * will fire immediately afterwards so we
416			 * don't bother.
417			 */
418			return (NULL);
419		}
420	} else if (qp != NULL && qp->q_status != G_QUEUE_READY) {
421		g_rr_queue_put(qp);
422		sc->sc_active = qp = NULL;
423	}
424
425	/*
426	 * No queue under service, look for the first in RR order.
427	 * If we find it, select if as sc_active, clear service
428	 * and record the end time of the slice.
429	 */
430	if (qp == NULL) {
431		qp = TAILQ_FIRST(&sc->sc_rr_tailq);
432		if (qp == NULL)
433			return (NULL); /* no queues at all, return */
434		/* otherwise select the new queue for service. */
435		TAILQ_REMOVE(&sc->sc_rr_tailq, qp, q_tailq);
436		sc->sc_active = qp;
437		qp->q_service = 0;
438		qp->q_flags &= ~G_FLAG_COMPLETED;
439	}
440
441	bp = gs_bioq_takefirst(&qp->q_bioq);	/* surely not NULL */
442	qp->q_service += bp->bio_length;	/* charge the service */
443
444	/*
445	 * The request at the head of the active queue is always
446	 * dispatched, and gs_rr_next() will be called again
447	 * immediately.
448	 * We need to prepare for what to do next:
449	 *
450	 * 1. have we reached the end of the (time or service) slice ?
451	 *    If so, clear sc_active and possibly requeue the previous
452	 *    active queue if it has more requests pending;
453	 * 2. do we have more requests in sc_active ?
454	 *    If yes, do not anticipate, as gs_rr_next() will run again;
455	 *    if no, decide whether or not to anticipate depending
456	 *    on read or writes (e.g., anticipate only on reads).
457	 */
458	expired = g_rr_queue_expired(qp);	/* are we expired ? */
459	next = gs_bioq_first(&qp->q_bioq);	/* do we have one more ? */
460 	if (expired) {
461		sc->sc_active = NULL;
462		/* Either requeue or release reference. */
463		if (next != NULL)
464			TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq);
465		else
466			g_rr_queue_put(qp);
467	} else if (next != NULL) {
468		qp->q_status = G_QUEUE_READY;
469	} else {
470		if (!force && g_rr_should_anticipate(qp, bp)) {
471			/* anticipate */
472			qp->q_status = G_QUEUE_BUSY;
473		} else {
474			/* do not anticipate, release reference */
475			g_rr_queue_put(qp);
476			sc->sc_active = NULL;
477		}
478	}
479	/* If sc_active != NULL, its q_status is always correct. */
480
481	sc->sc_in_flight++;
482
483	return (bp);
484}
485
486static inline void
487g_rr_update_thinktime(struct g_rr_queue *qp)
488{
489	int delta = ticks - qp->q_lastsub, wait = get_bounded(&me.wait_ms, 2);
490
491	if (qp->q_sc->sc_active != qp)
492		return;
493
494	qp->q_lastsub = ticks;
495	delta = (delta > 2 * wait) ? 2 * wait : delta;
496	if (qp->q_bionum > 7)
497		g_savg_add_sample(&qp->q_thinktime, delta);
498}
499
500static inline void
501g_rr_update_seekdist(struct g_rr_queue *qp, struct bio *bp)
502{
503	off_t dist;
504
505	if (qp->q_lastoff > bp->bio_offset)
506		dist = qp->q_lastoff - bp->bio_offset;
507	else
508		dist = bp->bio_offset - qp->q_lastoff;
509
510	if (dist > (8192 * 8))
511		dist = 8192 * 8;
512
513	qp->q_lastoff = bp->bio_offset + bp->bio_length;
514
515	if (qp->q_bionum > 7)
516		g_savg_add_sample(&qp->q_seekdist, dist);
517}
518
519/*
520 * Called when a real request for disk I/O arrives.
521 * Locate the queue associated with the client.
522 * If the queue is the one we are anticipating for, reset its timeout;
523 * if the queue is not in the round robin list, insert it in the list.
524 * On any error, do not queue the request and return -1, the caller
525 * will take care of this request.
526 */
527static int
528g_rr_start(void *data, struct bio *bp)
529{
530	struct g_rr_softc *sc = data;
531	struct g_rr_queue *qp;
532
533	if (me.bypass)
534		return (-1);	/* bypass the scheduler */
535
536	/* Get the queue for the request. */
537	qp = g_rr_queue_get(sc, bp);
538	if (qp == NULL)
539		return (-1); /* allocation failed, tell upstream */
540
541	if (gs_bioq_first(&qp->q_bioq) == NULL) {
542		/*
543		 * We are inserting into an empty queue.
544		 * Reset its state if it is sc_active,
545		 * otherwise insert it in the RR list.
546		 */
547		if (qp == sc->sc_active) {
548			qp->q_status = G_QUEUE_READY;
549			callout_stop(&sc->sc_wait);
550		} else {
551			g_sched_priv_ref(qp);
552			TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq);
553		}
554	}
555
556	qp->q_bionum = 1 + qp->q_bionum - (qp->q_bionum >> 3);
557
558	g_rr_update_thinktime(qp);
559	g_rr_update_seekdist(qp, bp);
560
561	/* Inherit the reference returned by g_rr_queue_get(). */
562	bp->bio_caller1 = qp;
563	gs_bioq_disksort(&qp->q_bioq, bp);
564
565	return (0);
566}
567
568/*
569 * Callout executed when a queue times out anticipating a new request.
570 */
571static void
572g_rr_wait_timeout(void *data)
573{
574	struct g_rr_softc *sc = data;
575	struct g_geom *geom = sc->sc_geom;
576
577	g_sched_lock(geom);
578	/*
579	 * We can race with other events, so check if
580	 * sc_active is still valid.
581	 */
582	if (sc->sc_active != NULL) {
583		/* Release the reference to the queue. */
584		g_rr_queue_put(sc->sc_active);
585		sc->sc_active = NULL;
586		me.wait_hit--;
587		me.wait_miss++;	/* record the miss */
588	}
589	g_sched_dispatch(geom);
590	g_sched_unlock(geom);
591}
592
593/*
594 * Module glue: allocate descriptor, initialize its fields.
595 */
596static void *
597g_rr_init(struct g_geom *geom)
598{
599	struct g_rr_softc *sc;
600
601	/* XXX check whether we can sleep */
602	sc = malloc(sizeof *sc, M_GEOM_SCHED, M_NOWAIT | M_ZERO);
603	sc->sc_geom = geom;
604	TAILQ_INIT(&sc->sc_rr_tailq);
605	callout_init(&sc->sc_wait, 1);
606	LIST_INSERT_HEAD(&me.sc_head, sc, sc_next);
607	me.units++;
608
609	return (sc);
610}
611
612/*
613 * Module glue -- drain the callout structure, destroy the
614 * hash table and its element, and free the descriptor.
615 */
616static void
617g_rr_fini(void *data)
618{
619	struct g_rr_softc *sc = data;
620
621	callout_drain(&sc->sc_wait);
622	KASSERT(sc->sc_active == NULL, ("still a queue under service"));
623	KASSERT(TAILQ_EMPTY(&sc->sc_rr_tailq), ("still scheduled queues"));
624
625	LIST_REMOVE(sc, sc_next);
626	me.units--;
627	free(sc, M_GEOM_SCHED);
628}
629
630/*
631 * Called when the request under service terminates.
632 * Start the anticipation timer if needed.
633 */
634static void
635g_rr_done(void *data, struct bio *bp)
636{
637	struct g_rr_softc *sc = data;
638	struct g_rr_queue *qp;
639
640	sc->sc_in_flight--;
641
642	qp = bp->bio_caller1;
643
644	/*
645	 * When the first request for this queue completes, update the
646	 * duration and end of the slice. We do not do it when the
647	 * slice starts to avoid charging to the queue the time for
648	 * the first seek.
649	 */
650	if (!(qp->q_flags & G_FLAG_COMPLETED)) {
651		qp->q_flags |= G_FLAG_COMPLETED;
652		/*
653		 * recompute the slice duration, in case we want
654		 * to make it adaptive. This is not used right now.
655		 * XXX should we do the same for q_quantum and q_wait_ticks ?
656		 */
657		qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
658		qp->q_slice_end = ticks + qp->q_slice_duration;
659	}
660
661	if (qp == sc->sc_active && qp->q_status == G_QUEUE_BUSY) {
662		/* The queue is trying anticipation, start the timer. */
663		qp->q_status = G_QUEUE_IDLING;
664		/* may make this adaptive */
665		qp->q_wait_ticks = get_bounded(&me.wait_ms, 2);
666		me.wait_hit++;
667		callout_reset(&sc->sc_wait, qp->q_wait_ticks,
668		    g_rr_wait_timeout, sc);
669	} else
670		g_sched_dispatch(sc->sc_geom);
671
672	/* Release a reference to the queue. */
673	g_rr_queue_put(qp);
674}
675
676static void
677g_rr_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
678    struct g_consumer *cp, struct g_provider *pp)
679{
680	if (indent == NULL) {   /* plaintext */
681		sbuf_printf(sb, " units %d queues %d",
682			me.units, me.queues);
683        }
684}
685
686static struct g_gsched g_rr = {
687	.gs_name = "rr",
688	.gs_priv_size = sizeof(struct g_rr_queue),
689	.gs_init = g_rr_init,
690	.gs_fini = g_rr_fini,
691	.gs_start = g_rr_start,
692	.gs_done = g_rr_done,
693	.gs_next = g_rr_next,
694	.gs_dumpconf = g_rr_dumpconf,
695	.gs_init_class = g_rr_init_class,
696	.gs_fini_class = g_rr_fini_class,
697};
698
699DECLARE_GSCHED_MODULE(rr, &g_rr);
700