1298010Simp/*-
2298010Simp * CAM IO Scheduler Interface
3298010Simp *
4298010Simp * Copyright (c) 2015 Netflix, Inc.
5298010Simp * All rights reserved.
6298010Simp *
7298010Simp * Redistribution and use in source and binary forms, with or without
8298010Simp * modification, are permitted provided that the following conditions
9298010Simp * are met:
10298010Simp * 1. Redistributions of source code must retain the above copyright
11298010Simp *    notice, this list of conditions, and the following disclaimer,
12298010Simp *    without modification, immediately at the beginning of the file.
13298010Simp * 2. The name of the author may not be used to endorse or promote products
14298010Simp *    derived from this software without specific prior written permission.
15298010Simp *
16298010Simp * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17298010Simp * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18298010Simp * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19298010Simp * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
20298010Simp * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21298010Simp * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22298010Simp * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23298010Simp * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24298010Simp * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25298010Simp * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26298010Simp * SUCH DAMAGE.
27298010Simp *
28298010Simp * $FreeBSD: stable/11/sys/cam/cam_iosched.c 334263 2018-05-27 23:52:41Z sbruno $
29298010Simp */
30298010Simp
31298010Simp#include "opt_cam.h"
32298010Simp#include "opt_ddb.h"
33298010Simp
34298010Simp#include <sys/cdefs.h>
35298010Simp__FBSDID("$FreeBSD: stable/11/sys/cam/cam_iosched.c 334263 2018-05-27 23:52:41Z sbruno $");
36298010Simp
37298010Simp#include <sys/param.h>
38298010Simp
39298010Simp#include <sys/systm.h>
40298010Simp#include <sys/kernel.h>
41298010Simp#include <sys/bio.h>
42298010Simp#include <sys/lock.h>
43298010Simp#include <sys/malloc.h>
44298010Simp#include <sys/mutex.h>
45298010Simp#include <sys/sysctl.h>
46298010Simp
47298010Simp#include <cam/cam.h>
48298010Simp#include <cam/cam_ccb.h>
49298010Simp#include <cam/cam_periph.h>
50298010Simp#include <cam/cam_xpt_periph.h>
51298010Simp#include <cam/cam_iosched.h>
52298010Simp
53298010Simp#include <ddb/ddb.h>
54298010Simp
55298010Simpstatic MALLOC_DEFINE(M_CAMSCHED, "CAM I/O Scheduler",
56298010Simp    "CAM I/O Scheduler buffers");
57298010Simp
58298010Simp/*
 * Default I/O scheduler for FreeBSD. This implementation is just a thin veneer
60298010Simp * over the bioq_* interface, with notions of separate calls for normal I/O and
61298010Simp * for trims.
62302396Simp *
63302396Simp * When CAM_IOSCHED_DYNAMIC is defined, the scheduler is enhanced to dynamically
 * steer the rate of one type of traffic to help other types of traffic (e.g.,
65302396Simp * limit writes when read latency deteriorates on SSDs).
66298010Simp */
67298010Simp
68302163Simp#ifdef CAM_IOSCHED_DYNAMIC
69298010Simp
70302396Simpstatic int do_dynamic_iosched = 1;
71302396SimpTUNABLE_INT("kern.cam.do_dynamic_iosched", &do_dynamic_iosched);
72302396SimpSYSCTL_INT(_kern_cam, OID_AUTO, do_dynamic_iosched, CTLFLAG_RD,
73302396Simp    &do_dynamic_iosched, 1,
74302396Simp    "Enable Dynamic I/O scheduler optimizations.");
75298010Simp
76298010Simpstatic int alpha_bits = 9;
77298010SimpTUNABLE_INT("kern.cam.iosched_alpha_bits", &alpha_bits);
78298010SimpSYSCTL_INT(_kern_cam, OID_AUTO, iosched_alpha_bits, CTLFLAG_RW,
79298010Simp    &alpha_bits, 1,
80298010Simp    "Bits in EMA's alpha.");
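/*
 * Illustrative sketch of how alpha_bits is used (see the EMA update in
 * cam_iosched_update() below for the real arithmetic): with the default
 * alpha_bits of 9, alpha is 1/512, so each new latency sample y nudges the
 * average by roughly 1/512 of the difference:
 *
 *	ema = ema + (y - ema) / 512
 *
 * Larger alpha_bits values give a smoother but slower-reacting average.
 */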
81298010Simp
82298010Simp
83298010Simp
84298010Simpstruct iop_stats;
85298010Simpstruct cam_iosched_softc;
86298010Simp
87298010Simpint iosched_debug = 0;
88298010Simp
89298010Simptypedef enum {
90298010Simp	none = 0,				/* No limits */
91298010Simp	queue_depth,			/* Limit how many ops we queue to SIM */
92298010Simp	iops,				/* Limit # of IOPS to the drive */
93298010Simp	bandwidth,			/* Limit bandwidth to the drive */
94298010Simp	limiter_max
95298010Simp} io_limiter;
96298010Simp
97298010Simpstatic const char *cam_iosched_limiter_names[] =
98298010Simp    { "none", "queue_depth", "iops", "bandwidth" };
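/*
 * The limiter for each I/O type is chosen at runtime through the sysctl
 * nodes created in cam_iosched_iop_stats_sysctl_init(). For example
 * (hypothetical da0 device; the exact path depends on the attached periph):
 *
 *	sysctl kern.cam.da.0.iosched.write.limiter=iops
 *	sysctl kern.cam.da.0.iosched.write.max=1000
 */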
99298010Simp
100298010Simp/*
101298010Simp * Called to initialize the bits of the iop_stats structure relevant to the
102298010Simp * limiter. Called just after the limiter is set.
103298010Simp */
104298010Simptypedef int l_init_t(struct iop_stats *);
105298010Simp
106298010Simp/*
107298010Simp * Called every tick.
108298010Simp */
109298010Simptypedef int l_tick_t(struct iop_stats *);
110298010Simp
111298010Simp/*
112298010Simp * Called to see if the limiter thinks this IOP can be allowed to
 * proceed. If so, the limiter assumes that the IOP did proceed
114298010Simp * and makes any accounting of it that's needed.
115298010Simp */
116298010Simptypedef int l_iop_t(struct iop_stats *, struct bio *);
117298010Simp
118298010Simp/*
 * Called when an I/O completes so the limiter can update its
120298010Simp * accounting. Pending I/Os may complete in any order (even when
121298010Simp * sent to the hardware at the same time), so the limiter may not
122298010Simp * make any assumptions other than this I/O has completed. If it
123298010Simp * returns 1, then xpt_schedule() needs to be called again.
124298010Simp */
125298010Simptypedef int l_iodone_t(struct iop_stats *, struct bio *);
126298010Simp
127298010Simpstatic l_iop_t cam_iosched_qd_iop;
128298010Simpstatic l_iop_t cam_iosched_qd_caniop;
129298010Simpstatic l_iodone_t cam_iosched_qd_iodone;
130298010Simp
131298010Simpstatic l_init_t cam_iosched_iops_init;
132298010Simpstatic l_tick_t cam_iosched_iops_tick;
133298010Simpstatic l_iop_t cam_iosched_iops_caniop;
134298010Simpstatic l_iop_t cam_iosched_iops_iop;
135298010Simp
136298010Simpstatic l_init_t cam_iosched_bw_init;
137298010Simpstatic l_tick_t cam_iosched_bw_tick;
138298010Simpstatic l_iop_t cam_iosched_bw_caniop;
139298010Simpstatic l_iop_t cam_iosched_bw_iop;
140298010Simp
141298010Simpstruct limswitch
142298010Simp{
143298010Simp	l_init_t	*l_init;
144298010Simp	l_tick_t	*l_tick;
145298010Simp	l_iop_t		*l_iop;
146298010Simp	l_iop_t		*l_caniop;
147298010Simp	l_iodone_t	*l_iodone;
148298010Simp} limsw[] =
149298010Simp{
150298010Simp	{	/* none */
151298010Simp		.l_init = NULL,
152298010Simp		.l_tick = NULL,
153298010Simp		.l_iop = NULL,
154298010Simp		.l_iodone= NULL,
155298010Simp	},
156298010Simp	{	/* queue_depth */
157298010Simp		.l_init = NULL,
158298010Simp		.l_tick = NULL,
159298010Simp		.l_caniop = cam_iosched_qd_caniop,
160298010Simp		.l_iop = cam_iosched_qd_iop,
161298010Simp		.l_iodone= cam_iosched_qd_iodone,
162298010Simp	},
163298010Simp	{	/* iops */
164298010Simp		.l_init = cam_iosched_iops_init,
165298010Simp		.l_tick = cam_iosched_iops_tick,
166298010Simp		.l_caniop = cam_iosched_iops_caniop,
167298010Simp		.l_iop = cam_iosched_iops_iop,
168298010Simp		.l_iodone= NULL,
169298010Simp	},
170298010Simp	{	/* bandwidth */
171298010Simp		.l_init = cam_iosched_bw_init,
172298010Simp		.l_tick = cam_iosched_bw_tick,
173298010Simp		.l_caniop = cam_iosched_bw_caniop,
174298010Simp		.l_iop = cam_iosched_bw_iop,
175298010Simp		.l_iodone= NULL,
176298010Simp	},
177298010Simp};
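/*
 * Rough sketch of how a new limiter would be added: add a value to
 * io_limiter before limiter_max, add its name to cam_iosched_limiter_names[],
 * and fill in a limsw[] entry with whichever callbacks it needs; NULL
 * entries are simply skipped by the cam_iosched_limiter_*() wrappers below.
 */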
178298010Simp
179298010Simpstruct iop_stats
180298010Simp{
181298010Simp	/*
182298010Simp	 * sysctl state for this subnode.
183298010Simp	 */
184298010Simp	struct sysctl_ctx_list	sysctl_ctx;
185298010Simp	struct sysctl_oid	*sysctl_tree;
186298010Simp
187298010Simp	/*
188298010Simp	 * Information about the current rate limiters, if any
189298010Simp	 */
190298010Simp	io_limiter	limiter;	/* How are I/Os being limited */
191298010Simp	int		min;		/* Low range of limit */
192298010Simp	int		max;		/* High range of limit */
193298010Simp	int		current;	/* Current rate limiter */
194298010Simp	int		l_value1;	/* per-limiter scratch value 1. */
195298010Simp	int		l_value2;	/* per-limiter scratch value 2. */
196298010Simp
197298010Simp
198298010Simp	/*
199298010Simp	 * Debug information about counts of I/Os that have gone through the
200298010Simp	 * scheduler.
201298010Simp	 */
202298010Simp	int		pending;	/* I/Os pending in the hardware */
203298010Simp	int		queued;		/* number currently in the queue */
204298010Simp	int		total;		/* Total for all time -- wraps */
205298010Simp	int		in;		/* number queued all time -- wraps */
206298010Simp	int		out;		/* number completed all time -- wraps */
207298010Simp
208298010Simp	/*
209298010Simp	 * Statistics on different bits of the process.
210298010Simp	 */
211298010Simp		/* Exp Moving Average, alpha = 1 / (1 << alpha_bits) */
212298010Simp	sbintime_t      ema;
213298010Simp	sbintime_t      emss;		/* Exp Moving sum of the squares */
214298010Simp	sbintime_t      sd;		/* Last computed sd */
215298010Simp
216298010Simp	struct cam_iosched_softc *softc;
217298010Simp};
218298010Simp
219298010Simp
220298010Simptypedef enum {
221298010Simp	set_max = 0,			/* current = max */
222298010Simp	read_latency,			/* Steer read latency by throttling writes */
223298010Simp	cl_max				/* Keep last */
224298010Simp} control_type;
225298010Simp
226298010Simpstatic const char *cam_iosched_control_type_names[] =
227298010Simp    { "set_max", "read_latency" };
228298010Simp
229298010Simpstruct control_loop
230298010Simp{
231298010Simp	/*
232298010Simp	 * sysctl state for this subnode.
233298010Simp	 */
234298010Simp	struct sysctl_ctx_list	sysctl_ctx;
235298010Simp	struct sysctl_oid	*sysctl_tree;
236298010Simp
237298010Simp	sbintime_t	next_steer;		/* Time of next steer */
238298010Simp	sbintime_t	steer_interval;		/* How often do we steer? */
239298010Simp	sbintime_t	lolat;
240298010Simp	sbintime_t	hilat;
241298010Simp	int		alpha;
242298010Simp	control_type	type;			/* What type of control? */
243298010Simp	int		last_count;		/* Last I/O count */
244298010Simp
245298010Simp	struct cam_iosched_softc *softc;
246298010Simp};
247298010Simp
248298010Simp#endif
249298010Simp
250298010Simpstruct cam_iosched_softc
251298010Simp{
252298010Simp	struct bio_queue_head bio_queue;
253298010Simp	struct bio_queue_head trim_queue;
254298010Simp				/* scheduler flags < 16, user flags >= 16 */
255298010Simp	uint32_t	flags;
256298010Simp	int		sort_io_queue;
257302163Simp#ifdef CAM_IOSCHED_DYNAMIC
258298010Simp	int		read_bias;		/* Read bias setting */
259298010Simp	int		current_read_bias;	/* Current read bias state */
260298010Simp	int		total_ticks;
261298010Simp
262298010Simp	struct bio_queue_head write_queue;
263298010Simp	struct iop_stats read_stats, write_stats, trim_stats;
264298010Simp	struct sysctl_ctx_list	sysctl_ctx;
265298010Simp	struct sysctl_oid	*sysctl_tree;
266298010Simp
267298010Simp	int		quanta;			/* Number of quanta per second */
268298010Simp	struct callout	ticker;			/* Callout for our quota system */
269298010Simp	struct cam_periph *periph;		/* cam periph associated with this device */
	uint32_t	this_frac;		/* Fraction of a second (65536ths) for this tick */
271298010Simp	sbintime_t	last_time;		/* Last time we ticked */
272298010Simp	struct control_loop cl;
273298010Simp#endif
274298010Simp};
275298010Simp
276302163Simp#ifdef CAM_IOSCHED_DYNAMIC
277298010Simp/*
278298010Simp * helper functions to call the limsw functions.
279298010Simp */
280298010Simpstatic int
281298010Simpcam_iosched_limiter_init(struct iop_stats *ios)
282298010Simp{
283298010Simp	int lim = ios->limiter;
284298010Simp
285298010Simp	/* maybe this should be a kassert */
286298010Simp	if (lim < none || lim >= limiter_max)
287298010Simp		return EINVAL;
288298010Simp
289298010Simp	if (limsw[lim].l_init)
290298010Simp		return limsw[lim].l_init(ios);
291298010Simp
292298010Simp	return 0;
293298010Simp}
294298010Simp
295298010Simpstatic int
296298010Simpcam_iosched_limiter_tick(struct iop_stats *ios)
297298010Simp{
298298010Simp	int lim = ios->limiter;
299298010Simp
300298010Simp	/* maybe this should be a kassert */
301298010Simp	if (lim < none || lim >= limiter_max)
302298010Simp		return EINVAL;
303298010Simp
304298010Simp	if (limsw[lim].l_tick)
305298010Simp		return limsw[lim].l_tick(ios);
306298010Simp
307298010Simp	return 0;
308298010Simp}
309298010Simp
310298010Simpstatic int
311298010Simpcam_iosched_limiter_iop(struct iop_stats *ios, struct bio *bp)
312298010Simp{
313298010Simp	int lim = ios->limiter;
314298010Simp
315298010Simp	/* maybe this should be a kassert */
316298010Simp	if (lim < none || lim >= limiter_max)
317298010Simp		return EINVAL;
318298010Simp
319298010Simp	if (limsw[lim].l_iop)
320298010Simp		return limsw[lim].l_iop(ios, bp);
321298010Simp
322298010Simp	return 0;
323298010Simp}
324298010Simp
325298010Simpstatic int
326298010Simpcam_iosched_limiter_caniop(struct iop_stats *ios, struct bio *bp)
327298010Simp{
328298010Simp	int lim = ios->limiter;
329298010Simp
330298010Simp	/* maybe this should be a kassert */
331298010Simp	if (lim < none || lim >= limiter_max)
332298010Simp		return EINVAL;
333298010Simp
334298010Simp	if (limsw[lim].l_caniop)
335298010Simp		return limsw[lim].l_caniop(ios, bp);
336298010Simp
337298010Simp	return 0;
338298010Simp}
339298010Simp
340298010Simpstatic int
341298010Simpcam_iosched_limiter_iodone(struct iop_stats *ios, struct bio *bp)
342298010Simp{
343298010Simp	int lim = ios->limiter;
344298010Simp
345298010Simp	/* maybe this should be a kassert */
346298010Simp	if (lim < none || lim >= limiter_max)
347298010Simp		return 0;
348298010Simp
349298010Simp	if (limsw[lim].l_iodone)
350298010Simp		return limsw[lim].l_iodone(ios, bp);
351298010Simp
352298010Simp	return 0;
353298010Simp}
354298010Simp
355298010Simp/*
356298010Simp * Functions to implement the different kinds of limiters
357298010Simp */
358298010Simp
359298010Simpstatic int
360298010Simpcam_iosched_qd_iop(struct iop_stats *ios, struct bio *bp)
361298010Simp{
362298010Simp
363298010Simp	if (ios->current <= 0 || ios->pending < ios->current)
364298010Simp		return 0;
365298010Simp
366298010Simp	return EAGAIN;
367298010Simp}
368298010Simp
369298010Simpstatic int
370298010Simpcam_iosched_qd_caniop(struct iop_stats *ios, struct bio *bp)
371298010Simp{
372298010Simp
373298010Simp	if (ios->current <= 0 || ios->pending < ios->current)
374298010Simp		return 0;
375298010Simp
376298010Simp	return EAGAIN;
377298010Simp}
378298010Simp
379298010Simpstatic int
380298010Simpcam_iosched_qd_iodone(struct iop_stats *ios, struct bio *bp)
381298010Simp{
382298010Simp
383298010Simp	if (ios->current <= 0 || ios->pending != ios->current)
384298010Simp		return 0;
385298010Simp
386298010Simp	return 1;
387298010Simp}
388298010Simp
389298010Simpstatic int
390298010Simpcam_iosched_iops_init(struct iop_stats *ios)
391298010Simp{
392298010Simp
393298010Simp	ios->l_value1 = ios->current / ios->softc->quanta;
394298010Simp	if (ios->l_value1 <= 0)
395298010Simp		ios->l_value1 = 1;
396298010Simp
397298010Simp	return 0;
398298010Simp}
399298010Simp
400298010Simpstatic int
401298010Simpcam_iosched_iops_tick(struct iop_stats *ios)
402298010Simp{
403298010Simp
404298010Simp	ios->l_value1 = (int)((ios->current * (uint64_t)ios->softc->this_frac) >> 16);
405298010Simp	if (ios->l_value1 <= 0)
406298010Simp		ios->l_value1 = 1;
407298010Simp
408298010Simp	return 0;
409298010Simp}
410298010Simp
411298010Simpstatic int
412298010Simpcam_iosched_iops_caniop(struct iop_stats *ios, struct bio *bp)
413298010Simp{
414298010Simp
	/*
	 * If we have any IOPs left in this quantum, allow the I/O;
	 * otherwise make it wait.
	 */
419298010Simp	if (ios->l_value1 <= 0)
420298010Simp		return EAGAIN;
421298010Simp	return 0;
422298010Simp}
423298010Simp
424298010Simpstatic int
425298010Simpcam_iosched_iops_iop(struct iop_stats *ios, struct bio *bp)
426298010Simp{
427298010Simp	int rv;
428298010Simp
429298010Simp	rv = cam_iosched_limiter_caniop(ios, bp);
430298010Simp	if (rv == 0)
431298010Simp		ios->l_value1--;
432298010Simp
433298010Simp	return rv;
434298010Simp}
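/*
 * Worked example for the IOPS limiter above (illustrative numbers only):
 * with limiter=iops, current=2000 and quanta=200, a full 5ms tick has a
 * this_frac of about 328 (~0.005 in 16.16 fixed point), so each tick grants
 * roughly 2000 * 328 >> 16 ~= 10 I/Os; once l_value1 reaches 0, further
 * I/Os get EAGAIN until the next tick replenishes the budget.
 */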
435298010Simp
436298010Simpstatic int
437298010Simpcam_iosched_bw_init(struct iop_stats *ios)
438298010Simp{
439298010Simp
440298010Simp	/* ios->current is in kB/s, so scale to bytes */
441298010Simp	ios->l_value1 = ios->current * 1000 / ios->softc->quanta;
442298010Simp
443298010Simp	return 0;
444298010Simp}
445298010Simp
446298010Simpstatic int
447298010Simpcam_iosched_bw_tick(struct iop_stats *ios)
448298010Simp{
449298010Simp	int bw;
450298010Simp
	/*
	 * Add this tick's quantum of bandwidth credit. If we're in
	 * the hole from last time, this digs us partway out; any
	 * credit left over carries forward, but we cap the
	 * accumulation at about 4 quanta worth to deal with
	 * burstiness. 4 is extremely arbitrary. Also, ios->current
	 * is in kB/s, so scale.
	 */
462298010Simp	bw = (int)((ios->current * 1000ull * (uint64_t)ios->softc->this_frac) >> 16);
463298010Simp	if (ios->l_value1 < bw * 4)
464298010Simp		ios->l_value1 += bw;
465298010Simp
466298010Simp	return 0;
467298010Simp}
468298010Simp
469298010Simpstatic int
470298010Simpcam_iosched_bw_caniop(struct iop_stats *ios, struct bio *bp)
471298010Simp{
	/*
	 * If we have any bandwidth quota left, allow the I/O;
	 * otherwise make it wait. Note that we may go negative,
	 * and that's OK; we'll just get a little less quota next
	 * quantum.
	 *
	 * Note on going negative: that allows us to process
	 * requests in order better, since we won't allow
	 * shorter reads to get around the long one that we
	 * don't have the quota to do just yet. It also prevents
	 * starvation by being a little more permissive about
	 * what we let through this quantum, at the cost of
	 * getting a little less next quantum.
	 */
486298010Simp	if (ios->l_value1 <= 0)
487298010Simp		return EAGAIN;
488298010Simp
489298010Simp
490298010Simp	return 0;
491298010Simp}
492298010Simp
493298010Simpstatic int
494298010Simpcam_iosched_bw_iop(struct iop_stats *ios, struct bio *bp)
495298010Simp{
496298010Simp	int rv;
497298010Simp
498298010Simp	rv = cam_iosched_limiter_caniop(ios, bp);
499298010Simp	if (rv == 0)
500298010Simp		ios->l_value1 -= bp->bio_length;
501298010Simp
502298010Simp	return rv;
503298010Simp}
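/*
 * Worked example for the bandwidth limiter above (illustrative numbers
 * only): with limiter=bandwidth, current=40000 (kB/s) and quanta=200, each
 * full tick adds roughly 40000 * 1000 / 200 = 200,000 bytes of credit,
 * capped at about four quanta (~800 kB) of accumulation; each dispatched
 * bio then subtracts its bio_length, possibly driving the credit negative.
 */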
504298010Simp
505298010Simpstatic void cam_iosched_cl_maybe_steer(struct control_loop *clp);
506298010Simp
507298010Simpstatic void
508298010Simpcam_iosched_ticker(void *arg)
509298010Simp{
510298010Simp	struct cam_iosched_softc *isc = arg;
511298010Simp	sbintime_t now, delta;
512298010Simp
513334263Ssbruno	callout_reset(&isc->ticker, hz / isc->quanta, cam_iosched_ticker, isc);
514298010Simp
515298010Simp	now = sbinuptime();
516298010Simp	delta = now - isc->last_time;
	isc->this_frac = (uint32_t)delta >> 16;		/* Note: discards seconds; should be 0, and harmless if not */
518298010Simp	isc->last_time = now;
519298010Simp
520298010Simp	cam_iosched_cl_maybe_steer(&isc->cl);
521298010Simp
522298010Simp	cam_iosched_limiter_tick(&isc->read_stats);
523298010Simp	cam_iosched_limiter_tick(&isc->write_stats);
524298010Simp	cam_iosched_limiter_tick(&isc->trim_stats);
525298010Simp
526298010Simp	cam_iosched_schedule(isc, isc->periph);
527298010Simp
528298010Simp	isc->total_ticks++;
529298010Simp}
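/*
 * Example of the this_frac computation above (illustrative): with
 * quanta=200, delta is about 5ms, i.e. roughly 0.005 * 2^32 in sbintime
 * form; taking the low 32 bits and shifting right 16 leaves about 328,
 * the elapsed fraction of a second in 65536ths. The limiter tick functions
 * multiply by this and shift right 16 to pro-rate their per-second budgets
 * to the actual tick length.
 */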
530298010Simp
531298010Simp
532298010Simpstatic void
533298010Simpcam_iosched_cl_init(struct control_loop *clp, struct cam_iosched_softc *isc)
534298010Simp{
535298010Simp
536298010Simp	clp->next_steer = sbinuptime();
537298010Simp	clp->softc = isc;
538298010Simp	clp->steer_interval = SBT_1S * 5;	/* Let's start out steering every 5s */
539298010Simp	clp->lolat = 5 * SBT_1MS;
540298010Simp	clp->hilat = 15 * SBT_1MS;
541298010Simp	clp->alpha = 20;			/* Alpha == gain. 20 = .2 */
542298010Simp	clp->type = set_max;
543298010Simp}
544298010Simp
545298010Simpstatic void
546298010Simpcam_iosched_cl_maybe_steer(struct control_loop *clp)
547298010Simp{
548298010Simp	struct cam_iosched_softc *isc;
549298010Simp	sbintime_t now, lat;
550298010Simp	int old;
551298010Simp
552298010Simp	isc = clp->softc;
553298010Simp	now = isc->last_time;
554298010Simp	if (now < clp->next_steer)
555298010Simp		return;
556298010Simp
557298010Simp	clp->next_steer = now + clp->steer_interval;
558298010Simp	switch (clp->type) {
559298010Simp	case set_max:
560298010Simp		if (isc->write_stats.current != isc->write_stats.max)
561298010Simp			printf("Steering write from %d kBps to %d kBps\n",
562298010Simp			    isc->write_stats.current, isc->write_stats.max);
563298010Simp		isc->read_stats.current = isc->read_stats.max;
564298010Simp		isc->write_stats.current = isc->write_stats.max;
565298010Simp		isc->trim_stats.current = isc->trim_stats.max;
566298010Simp		break;
567298010Simp	case read_latency:
568298010Simp		old = isc->write_stats.current;
569298010Simp		lat = isc->read_stats.ema;
		/*
		 * Simple PLL-like engine. Since we're steering to a range for
		 * the SP (set point), things are a little more complicated. In
		 * addition, we're not directly controlling our PV (process
		 * variable), the read latency, but instead are manipulating
		 * the write bandwidth limit as our MV (manipulated variable),
		 * so analysis of this code gets a bit messy. Also, the MV is a
		 * very noisy control surface for read latency since it is
		 * affected by many hidden processes inside the device which
		 * change how responsive read latency will be in reaction to
		 * changes in write bandwidth. Unlike the classic boiler
		 * control PLL, this may result in over-steering while the SSD
		 * takes its time to react to the new, lower load. This is why
		 * we use a relatively low alpha of between .1 and .25 to
		 * compensate for this effect. At .1, it takes ~22 steering
		 * intervals to back off by a factor of 10. At .2 it only takes
		 * ~10. At .25 it only takes ~8. However some preliminary data
		 * from the SSD drives suggests a response time in 10's of
		 * seconds before latency drops regardless of the new write
		 * rate. Careful observation will be required to tune this
		 * effectively.
		 *
		 * Also, when there's no read traffic, we jack up the write
		 * limit regardless of the last read latency.  10 is
		 * somewhat arbitrary.
		 */
596298010Simp		if (lat < clp->lolat || isc->read_stats.total - clp->last_count < 10)
597298010Simp			isc->write_stats.current = isc->write_stats.current *
598298010Simp			    (100 + clp->alpha) / 100;	/* Scale up */
599298010Simp		else if (lat > clp->hilat)
600298010Simp			isc->write_stats.current = isc->write_stats.current *
601298010Simp			    (100 - clp->alpha) / 100;	/* Scale down */
602298010Simp		clp->last_count = isc->read_stats.total;
603298010Simp
604298010Simp		/*
605298010Simp		 * Even if we don't steer, per se, enforce the min/max limits as
606298010Simp		 * those may have changed.
607298010Simp		 */
608298010Simp		if (isc->write_stats.current < isc->write_stats.min)
609298010Simp			isc->write_stats.current = isc->write_stats.min;
610298010Simp		if (isc->write_stats.current > isc->write_stats.max)
611298010Simp			isc->write_stats.current = isc->write_stats.max;
		if (old != isc->write_stats.current && iosched_debug)
			printf("Steering write from %d kBps to %d kBps due to latency of %juus\n",
			    old, isc->write_stats.current,
			    (uintmax_t)((uint64_t)1000000 * (uint32_t)lat) >> 32);
616298010Simp		break;
617298010Simp	case cl_max:
618298010Simp		break;
619298010Simp	}
620298010Simp}
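/*
 * Worked example of the read_latency steering above (illustrative numbers
 * only): with the defaults of alpha=20, lolat=5ms, hilat=15ms and a 5s
 * steer_interval, a read latency EMA above 15ms cuts the write limit to 80%
 * of its current value each interval, while an EMA below 5ms (or fewer than
 * 10 reads since the last steer) raises it to 120%, always clamped to the
 * configured min/max.
 */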
621298010Simp#endif
622298010Simp
623298010Simp			/* Trim or similar currently pending completion */
624298010Simp#define CAM_IOSCHED_FLAG_TRIM_ACTIVE	(1ul << 0)
625298010Simp			/* Callout active, and needs to be torn down */
626298010Simp#define CAM_IOSCHED_FLAG_CALLOUT_ACTIVE (1ul << 1)
627298010Simp
628298010Simp			/* Periph drivers set these flags to indicate work */
629298010Simp#define CAM_IOSCHED_FLAG_WORK_FLAGS	((0xffffu) << 16)
630298010Simp
631302163Simp#ifdef CAM_IOSCHED_DYNAMIC
632298010Simpstatic void
633298010Simpcam_iosched_io_metric_update(struct cam_iosched_softc *isc,
634298010Simp    sbintime_t sim_latency, int cmd, size_t size);
635298036Simp#endif
636298010Simp
637298010Simpstatic inline int
638298010Simpcam_iosched_has_flagged_work(struct cam_iosched_softc *isc)
639298010Simp{
640298010Simp	return !!(isc->flags & CAM_IOSCHED_FLAG_WORK_FLAGS);
641298010Simp}
642298010Simp
643298010Simpstatic inline int
644298010Simpcam_iosched_has_io(struct cam_iosched_softc *isc)
645298010Simp{
646302163Simp#ifdef CAM_IOSCHED_DYNAMIC
647302396Simp	if (do_dynamic_iosched) {
648298010Simp		struct bio *rbp = bioq_first(&isc->bio_queue);
649298010Simp		struct bio *wbp = bioq_first(&isc->write_queue);
650298010Simp		int can_write = wbp != NULL &&
651298010Simp		    cam_iosched_limiter_caniop(&isc->write_stats, wbp) == 0;
652298010Simp		int can_read = rbp != NULL &&
653298010Simp		    cam_iosched_limiter_caniop(&isc->read_stats, rbp) == 0;
654298010Simp		if (iosched_debug > 2) {
655298010Simp			printf("can write %d: pending_writes %d max_writes %d\n", can_write, isc->write_stats.pending, isc->write_stats.max);
656298010Simp			printf("can read %d: read_stats.pending %d max_reads %d\n", can_read, isc->read_stats.pending, isc->read_stats.max);
657298010Simp			printf("Queued reads %d writes %d\n", isc->read_stats.queued, isc->write_stats.queued);
658298010Simp		}
659298010Simp		return can_read || can_write;
660298010Simp	}
661298010Simp#endif
662298010Simp	return bioq_first(&isc->bio_queue) != NULL;
663298010Simp}
664298010Simp
665298010Simpstatic inline int
666298010Simpcam_iosched_has_more_trim(struct cam_iosched_softc *isc)
667298010Simp{
668298010Simp	return !(isc->flags & CAM_IOSCHED_FLAG_TRIM_ACTIVE) &&
669298010Simp	    bioq_first(&isc->trim_queue);
670298010Simp}
671298010Simp
672298010Simp#define cam_iosched_sort_queue(isc)	((isc)->sort_io_queue >= 0 ?	\
673298010Simp    (isc)->sort_io_queue : cam_sort_io_queues)
674298010Simp
675298010Simp
676298010Simpstatic inline int
677298010Simpcam_iosched_has_work(struct cam_iosched_softc *isc)
678298010Simp{
679302163Simp#ifdef CAM_IOSCHED_DYNAMIC
680298010Simp	if (iosched_debug > 2)
681298010Simp		printf("has work: %d %d %d\n", cam_iosched_has_io(isc),
682298010Simp		    cam_iosched_has_more_trim(isc),
683298010Simp		    cam_iosched_has_flagged_work(isc));
684298010Simp#endif
685298010Simp
686298010Simp	return cam_iosched_has_io(isc) ||
687298010Simp		cam_iosched_has_more_trim(isc) ||
688298010Simp		cam_iosched_has_flagged_work(isc);
689298010Simp}
690298010Simp
691302163Simp#ifdef CAM_IOSCHED_DYNAMIC
692298010Simpstatic void
693298010Simpcam_iosched_iop_stats_init(struct cam_iosched_softc *isc, struct iop_stats *ios)
694298010Simp{
695298010Simp
696298010Simp	ios->limiter = none;
697298010Simp	cam_iosched_limiter_init(ios);
698298010Simp	ios->in = 0;
699298010Simp	ios->max = 300000;
700298010Simp	ios->min = 1;
701298010Simp	ios->out = 0;
702298010Simp	ios->pending = 0;
703298010Simp	ios->queued = 0;
704298010Simp	ios->total = 0;
705298010Simp	ios->ema = 0;
706298010Simp	ios->emss = 0;
707298010Simp	ios->sd = 0;
708298010Simp	ios->softc = isc;
709298010Simp}
710298010Simp
711298010Simpstatic int
712298010Simpcam_iosched_limiter_sysctl(SYSCTL_HANDLER_ARGS)
713298010Simp{
714298010Simp	char buf[16];
715298010Simp	struct iop_stats *ios;
716298010Simp	struct cam_iosched_softc *isc;
717298010Simp	int value, i, error, cantick;
718298010Simp	const char *p;
719298010Simp
720298010Simp	ios = arg1;
721298010Simp	isc = ios->softc;
722298010Simp	value = ios->limiter;
723298010Simp	if (value < none || value >= limiter_max)
724298010Simp		p = "UNKNOWN";
725298010Simp	else
726298010Simp		p = cam_iosched_limiter_names[value];
727298010Simp
728298010Simp	strlcpy(buf, p, sizeof(buf));
729298010Simp	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
730298010Simp	if (error != 0 || req->newptr == NULL)
731298010Simp		return error;
732298010Simp
733298010Simp	cam_periph_lock(isc->periph);
734298010Simp
735298010Simp	for (i = none; i < limiter_max; i++) {
736298010Simp		if (strcmp(buf, cam_iosched_limiter_names[i]) != 0)
737298010Simp			continue;
738298010Simp		ios->limiter = i;
739298010Simp		error = cam_iosched_limiter_init(ios);
740298010Simp		if (error != 0) {
741298010Simp			ios->limiter = value;
742298010Simp			cam_periph_unlock(isc->periph);
743298010Simp			return error;
744298010Simp		}
745298010Simp		cantick = !!limsw[isc->read_stats.limiter].l_tick +
746298010Simp		    !!limsw[isc->write_stats.limiter].l_tick +
747298010Simp		    !!limsw[isc->trim_stats.limiter].l_tick +
748298010Simp		    1;	/* Control loop requires it */
749298010Simp		if (isc->flags & CAM_IOSCHED_FLAG_CALLOUT_ACTIVE) {
750298010Simp			if (cantick == 0) {
751298010Simp				callout_stop(&isc->ticker);
752298010Simp				isc->flags &= ~CAM_IOSCHED_FLAG_CALLOUT_ACTIVE;
753298010Simp			}
754298010Simp		} else {
755298010Simp			if (cantick != 0) {
756334229Ssbruno				callout_reset(&isc->ticker, hz / isc->quanta, cam_iosched_ticker, isc);
757298010Simp				isc->flags |= CAM_IOSCHED_FLAG_CALLOUT_ACTIVE;
758298010Simp			}
759298010Simp		}
760298010Simp
761298010Simp		cam_periph_unlock(isc->periph);
762298010Simp		return 0;
763298010Simp	}
764298010Simp
765298010Simp	cam_periph_unlock(isc->periph);
766298010Simp	return EINVAL;
767298010Simp}
768298010Simp
769298010Simpstatic int
770298010Simpcam_iosched_control_type_sysctl(SYSCTL_HANDLER_ARGS)
771298010Simp{
772298010Simp	char buf[16];
773298010Simp	struct control_loop *clp;
774298010Simp	struct cam_iosched_softc *isc;
775298010Simp	int value, i, error;
776298010Simp	const char *p;
777298010Simp
778298010Simp	clp = arg1;
779298010Simp	isc = clp->softc;
780298010Simp	value = clp->type;
781298010Simp	if (value < none || value >= cl_max)
782298010Simp		p = "UNKNOWN";
783298010Simp	else
784298010Simp		p = cam_iosched_control_type_names[value];
785298010Simp
786298010Simp	strlcpy(buf, p, sizeof(buf));
787298010Simp	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
788298010Simp	if (error != 0 || req->newptr == NULL)
789298010Simp		return error;
790298010Simp
791298010Simp	for (i = set_max; i < cl_max; i++) {
792298010Simp		if (strcmp(buf, cam_iosched_control_type_names[i]) != 0)
793298010Simp			continue;
794298010Simp		cam_periph_lock(isc->periph);
795298010Simp		clp->type = i;
796298010Simp		cam_periph_unlock(isc->periph);
797298010Simp		return 0;
798298010Simp	}
799298010Simp
800298010Simp	return EINVAL;
801298010Simp}
802298010Simp
803298010Simpstatic int
804298010Simpcam_iosched_sbintime_sysctl(SYSCTL_HANDLER_ARGS)
805298010Simp{
806298010Simp	char buf[16];
807298010Simp	sbintime_t value;
808298010Simp	int error;
809298010Simp	uint64_t us;
810298010Simp
811298010Simp	value = *(sbintime_t *)arg1;
812298010Simp	us = (uint64_t)value / SBT_1US;
	snprintf(buf, sizeof(buf), "%ju", (uintmax_t)us);
814298010Simp	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
815298010Simp	if (error != 0 || req->newptr == NULL)
816298010Simp		return error;
817298010Simp	us = strtoul(buf, NULL, 10);
818298010Simp	if (us == 0)
819298010Simp		return EINVAL;
820298010Simp	*(sbintime_t *)arg1 = us * SBT_1US;
821298010Simp	return 0;
822298010Simp}
823298010Simp
824334229Ssbrunostatic int
825334229Ssbrunocam_iosched_quanta_sysctl(SYSCTL_HANDLER_ARGS)
826334229Ssbruno{
827334229Ssbruno	int *quanta;
828334229Ssbruno	int error, value;
829334229Ssbruno
	quanta = (int *)arg1;
831334229Ssbruno	value = *quanta;
832334229Ssbruno
833334229Ssbruno	error = sysctl_handle_int(oidp, (int *)&value, 0, req);
834334229Ssbruno	if ((error != 0) || (req->newptr == NULL))
835334229Ssbruno		return (error);
836334229Ssbruno
837334229Ssbruno	if (value < 1 || value > hz)
838334229Ssbruno		return (EINVAL);
839334229Ssbruno
840334229Ssbruno	*quanta = value;
841334229Ssbruno
842334229Ssbruno	return (0);
843334229Ssbruno}
844334229Ssbruno
845298010Simpstatic void
846298010Simpcam_iosched_iop_stats_sysctl_init(struct cam_iosched_softc *isc, struct iop_stats *ios, char *name)
847298010Simp{
848298010Simp	struct sysctl_oid_list *n;
849298010Simp	struct sysctl_ctx_list *ctx;
850298010Simp
851298010Simp	ios->sysctl_tree = SYSCTL_ADD_NODE(&isc->sysctl_ctx,
852298010Simp	    SYSCTL_CHILDREN(isc->sysctl_tree), OID_AUTO, name,
853298010Simp	    CTLFLAG_RD, 0, name);
854298010Simp	n = SYSCTL_CHILDREN(ios->sysctl_tree);
855298010Simp	ctx = &ios->sysctl_ctx;
856298010Simp
857298010Simp	SYSCTL_ADD_UQUAD(ctx, n,
858298010Simp	    OID_AUTO, "ema", CTLFLAG_RD,
859298010Simp	    &ios->ema,
860298010Simp	    "Fast Exponentially Weighted Moving Average");
861298010Simp	SYSCTL_ADD_UQUAD(ctx, n,
862298010Simp	    OID_AUTO, "emss", CTLFLAG_RD,
863298010Simp	    &ios->emss,
864298010Simp	    "Fast Exponentially Weighted Moving Sum of Squares (maybe wrong)");
865298010Simp	SYSCTL_ADD_UQUAD(ctx, n,
866298010Simp	    OID_AUTO, "sd", CTLFLAG_RD,
867298010Simp	    &ios->sd,
868298010Simp	    "Estimated SD for fast ema (may be wrong)");
869298010Simp
870298010Simp	SYSCTL_ADD_INT(ctx, n,
871298010Simp	    OID_AUTO, "pending", CTLFLAG_RD,
872298010Simp	    &ios->pending, 0,
873298010Simp	    "Instantaneous # of pending transactions");
874298010Simp	SYSCTL_ADD_INT(ctx, n,
875298010Simp	    OID_AUTO, "count", CTLFLAG_RD,
876298010Simp	    &ios->total, 0,
877298010Simp	    "# of transactions submitted to hardware");
878298010Simp	SYSCTL_ADD_INT(ctx, n,
879298010Simp	    OID_AUTO, "queued", CTLFLAG_RD,
880298010Simp	    &ios->queued, 0,
881298010Simp	    "# of transactions in the queue");
882298010Simp	SYSCTL_ADD_INT(ctx, n,
883298010Simp	    OID_AUTO, "in", CTLFLAG_RD,
884298010Simp	    &ios->in, 0,
885298010Simp	    "# of transactions queued to driver");
886298010Simp	SYSCTL_ADD_INT(ctx, n,
887298010Simp	    OID_AUTO, "out", CTLFLAG_RD,
888298010Simp	    &ios->out, 0,
889298010Simp	    "# of transactions completed");
890298010Simp
891298010Simp	SYSCTL_ADD_PROC(ctx, n,
892298010Simp	    OID_AUTO, "limiter", CTLTYPE_STRING | CTLFLAG_RW,
893298010Simp	    ios, 0, cam_iosched_limiter_sysctl, "A",
894298010Simp	    "Current limiting type.");
895298010Simp	SYSCTL_ADD_INT(ctx, n,
896298010Simp	    OID_AUTO, "min", CTLFLAG_RW,
897298010Simp	    &ios->min, 0,
898298010Simp	    "min resource");
899298010Simp	SYSCTL_ADD_INT(ctx, n,
900298010Simp	    OID_AUTO, "max", CTLFLAG_RW,
901298010Simp	    &ios->max, 0,
902298010Simp	    "max resource");
903298010Simp	SYSCTL_ADD_INT(ctx, n,
904298010Simp	    OID_AUTO, "current", CTLFLAG_RW,
905298010Simp	    &ios->current, 0,
906298010Simp	    "current resource");
907298010Simp
908298010Simp}
909298010Simp
910298010Simpstatic void
911298010Simpcam_iosched_iop_stats_fini(struct iop_stats *ios)
912298010Simp{
913298010Simp	if (ios->sysctl_tree)
914298010Simp		if (sysctl_ctx_free(&ios->sysctl_ctx) != 0)
915298010Simp			printf("can't remove iosched sysctl stats context\n");
916298010Simp}
917298010Simp
918298010Simpstatic void
919298010Simpcam_iosched_cl_sysctl_init(struct cam_iosched_softc *isc)
920298010Simp{
921298010Simp	struct sysctl_oid_list *n;
922298010Simp	struct sysctl_ctx_list *ctx;
923298010Simp	struct control_loop *clp;
924298010Simp
925298010Simp	clp = &isc->cl;
926298010Simp	clp->sysctl_tree = SYSCTL_ADD_NODE(&isc->sysctl_ctx,
927298010Simp	    SYSCTL_CHILDREN(isc->sysctl_tree), OID_AUTO, "control",
928298010Simp	    CTLFLAG_RD, 0, "Control loop info");
929298010Simp	n = SYSCTL_CHILDREN(clp->sysctl_tree);
930298010Simp	ctx = &clp->sysctl_ctx;
931298010Simp
932298010Simp	SYSCTL_ADD_PROC(ctx, n,
933298010Simp	    OID_AUTO, "type", CTLTYPE_STRING | CTLFLAG_RW,
934298010Simp	    clp, 0, cam_iosched_control_type_sysctl, "A",
935298010Simp	    "Control loop algorithm");
936298010Simp	SYSCTL_ADD_PROC(ctx, n,
937298010Simp	    OID_AUTO, "steer_interval", CTLTYPE_STRING | CTLFLAG_RW,
938298010Simp	    &clp->steer_interval, 0, cam_iosched_sbintime_sysctl, "A",
939298010Simp	    "How often to steer (in us)");
940298010Simp	SYSCTL_ADD_PROC(ctx, n,
941298010Simp	    OID_AUTO, "lolat", CTLTYPE_STRING | CTLFLAG_RW,
942298010Simp	    &clp->lolat, 0, cam_iosched_sbintime_sysctl, "A",
943298010Simp	    "Low water mark for Latency (in us)");
944298010Simp	SYSCTL_ADD_PROC(ctx, n,
945298010Simp	    OID_AUTO, "hilat", CTLTYPE_STRING | CTLFLAG_RW,
946298010Simp	    &clp->hilat, 0, cam_iosched_sbintime_sysctl, "A",
947298010Simp	    "Hi water mark for Latency (in us)");
948298010Simp	SYSCTL_ADD_INT(ctx, n,
949298010Simp	    OID_AUTO, "alpha", CTLFLAG_RW,
950298010Simp	    &clp->alpha, 0,
951298010Simp	    "Alpha for PLL (x100) aka gain");
952298010Simp}
953298010Simp
954298010Simpstatic void
955298010Simpcam_iosched_cl_sysctl_fini(struct control_loop *clp)
956298010Simp{
957298010Simp	if (clp->sysctl_tree)
958298010Simp		if (sysctl_ctx_free(&clp->sysctl_ctx) != 0)
959298010Simp			printf("can't remove iosched sysctl control loop context\n");
960298010Simp}
961298010Simp#endif
962298010Simp
963298010Simp/*
964298010Simp * Allocate the iosched structure. This also insulates callers from knowing
965298010Simp * sizeof struct cam_iosched_softc.
966298010Simp */
967298010Simpint
968298010Simpcam_iosched_init(struct cam_iosched_softc **iscp, struct cam_periph *periph)
969298010Simp{
970298010Simp
971298010Simp	*iscp = malloc(sizeof(**iscp), M_CAMSCHED, M_NOWAIT | M_ZERO);
972298010Simp	if (*iscp == NULL)
973298010Simp		return ENOMEM;
974302163Simp#ifdef CAM_IOSCHED_DYNAMIC
975298010Simp	if (iosched_debug)
976298010Simp		printf("CAM IOSCHEDULER Allocating entry at %p\n", *iscp);
977298010Simp#endif
978298010Simp	(*iscp)->sort_io_queue = -1;
979298010Simp	bioq_init(&(*iscp)->bio_queue);
980298010Simp	bioq_init(&(*iscp)->trim_queue);
981302163Simp#ifdef CAM_IOSCHED_DYNAMIC
982302396Simp	if (do_dynamic_iosched) {
983298010Simp		bioq_init(&(*iscp)->write_queue);
984298010Simp		(*iscp)->read_bias = 100;
985298010Simp		(*iscp)->current_read_bias = 100;
986298010Simp		(*iscp)->quanta = 200;
987298010Simp		cam_iosched_iop_stats_init(*iscp, &(*iscp)->read_stats);
988298010Simp		cam_iosched_iop_stats_init(*iscp, &(*iscp)->write_stats);
989298010Simp		cam_iosched_iop_stats_init(*iscp, &(*iscp)->trim_stats);
990298010Simp		(*iscp)->trim_stats.max = 1;	/* Trims are special: one at a time for now */
991298010Simp		(*iscp)->last_time = sbinuptime();
992298010Simp		callout_init_mtx(&(*iscp)->ticker, cam_periph_mtx(periph), 0);
993298010Simp		(*iscp)->periph = periph;
994298010Simp		cam_iosched_cl_init(&(*iscp)->cl, *iscp);
995334229Ssbruno		callout_reset(&(*iscp)->ticker, hz / (*iscp)->quanta, cam_iosched_ticker, *iscp);
996298010Simp		(*iscp)->flags |= CAM_IOSCHED_FLAG_CALLOUT_ACTIVE;
997298010Simp	}
998298010Simp#endif
999298010Simp
1000298010Simp	return 0;
1001298010Simp}
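/*
 * Typical consumer usage, as a rough sketch only (field names are
 * hypothetical and error handling is omitted; details vary by periph
 * driver):
 *
 *	cam_iosched_init(&softc->cam_iosched, periph);		in attach
 *	cam_iosched_queue_work(softc->cam_iosched, bp);		in strategy
 *	cam_iosched_schedule(softc->cam_iosched, periph);
 *	bp = cam_iosched_next_bio(softc->cam_iosched);		in start
 *	cam_iosched_bio_complete(softc->cam_iosched, bp, ccb);	in done
 *	cam_iosched_fini(softc->cam_iosched);			in detach
 */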
1002298010Simp
1003298010Simp/*
1004298010Simp * Reclaim all used resources. This assumes that other folks have
1005298010Simp * drained the requests in the hardware. Maybe an unwise assumption.
1006298010Simp */
1007298010Simpvoid
1008298010Simpcam_iosched_fini(struct cam_iosched_softc *isc)
1009298010Simp{
1010298010Simp	if (isc) {
1011298010Simp		cam_iosched_flush(isc, NULL, ENXIO);
1012302163Simp#ifdef CAM_IOSCHED_DYNAMIC
1013298010Simp		cam_iosched_iop_stats_fini(&isc->read_stats);
1014298010Simp		cam_iosched_iop_stats_fini(&isc->write_stats);
1015298010Simp		cam_iosched_iop_stats_fini(&isc->trim_stats);
1016298010Simp		cam_iosched_cl_sysctl_fini(&isc->cl);
1017298010Simp		if (isc->sysctl_tree)
1018298010Simp			if (sysctl_ctx_free(&isc->sysctl_ctx) != 0)
1019298010Simp				printf("can't remove iosched sysctl stats context\n");
1020298010Simp		if (isc->flags & CAM_IOSCHED_FLAG_CALLOUT_ACTIVE) {
1021298010Simp			callout_drain(&isc->ticker);
1022298010Simp			isc->flags &= ~ CAM_IOSCHED_FLAG_CALLOUT_ACTIVE;
1023298010Simp		}
1024298010Simp
1025298010Simp#endif
1026298010Simp		free(isc, M_CAMSCHED);
1027298010Simp	}
1028298010Simp}
1029298010Simp
1030298010Simp/*
1031298010Simp * After we're sure we're attaching a device, go ahead and add
1032298010Simp * hooks for any sysctl we may wish to honor.
1033298010Simp */
1034298010Simpvoid cam_iosched_sysctl_init(struct cam_iosched_softc *isc,
1035298010Simp    struct sysctl_ctx_list *ctx, struct sysctl_oid *node)
1036298010Simp{
1037302163Simp#ifdef CAM_IOSCHED_DYNAMIC
1038298010Simp	struct sysctl_oid_list *n;
1039298010Simp#endif
1040298010Simp
1041298010Simp	SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(node),
1042298010Simp		OID_AUTO, "sort_io_queue", CTLFLAG_RW | CTLFLAG_MPSAFE,
1043298010Simp		&isc->sort_io_queue, 0,
1044298010Simp		"Sort IO queue to try and optimise disk access patterns");
1045298010Simp
1046302163Simp#ifdef CAM_IOSCHED_DYNAMIC
1047302396Simp	if (!do_dynamic_iosched)
1048298010Simp		return;
1049298010Simp
1050298010Simp	isc->sysctl_tree = SYSCTL_ADD_NODE(&isc->sysctl_ctx,
1051298010Simp	    SYSCTL_CHILDREN(node), OID_AUTO, "iosched",
1052298010Simp	    CTLFLAG_RD, 0, "I/O scheduler statistics");
1053298010Simp	n = SYSCTL_CHILDREN(isc->sysctl_tree);
1054298010Simp	ctx = &isc->sysctl_ctx;
1055298010Simp
1056298010Simp	cam_iosched_iop_stats_sysctl_init(isc, &isc->read_stats, "read");
1057298010Simp	cam_iosched_iop_stats_sysctl_init(isc, &isc->write_stats, "write");
1058298010Simp	cam_iosched_iop_stats_sysctl_init(isc, &isc->trim_stats, "trim");
1059298010Simp	cam_iosched_cl_sysctl_init(isc);
1060298010Simp
1061298010Simp	SYSCTL_ADD_INT(ctx, n,
1062298010Simp	    OID_AUTO, "read_bias", CTLFLAG_RW,
1063298010Simp	    &isc->read_bias, 100,
1064298010Simp	    "How biased towards read should we be independent of limits");
1065298010Simp
1066334229Ssbruno	SYSCTL_ADD_PROC(ctx, n,
1067334229Ssbruno	    OID_AUTO, "quanta", CTLTYPE_UINT | CTLFLAG_RW,
1068334229Ssbruno	    &isc->quanta, 0, cam_iosched_quanta_sysctl, "I",
1069298010Simp	    "How many quanta per second do we slice the I/O up into");
1070298010Simp
1071298010Simp	SYSCTL_ADD_INT(ctx, n,
1072298010Simp	    OID_AUTO, "total_ticks", CTLFLAG_RD,
1073298010Simp	    &isc->total_ticks, 0,
1074298010Simp	    "Total number of ticks we've done");
1075298010Simp#endif
1076298010Simp}
1077298010Simp
1078298010Simp/*
1079298010Simp * Flush outstanding I/O. Consumers of this library don't know all the
1080298010Simp * queues we may keep, so this allows all I/O to be flushed in one
1081298010Simp * convenient call.
1082298010Simp */
1083298010Simpvoid
1084298010Simpcam_iosched_flush(struct cam_iosched_softc *isc, struct devstat *stp, int err)
1085298010Simp{
1086298010Simp	bioq_flush(&isc->bio_queue, stp, err);
1087298010Simp	bioq_flush(&isc->trim_queue, stp, err);
1088302163Simp#ifdef CAM_IOSCHED_DYNAMIC
1089302396Simp	if (do_dynamic_iosched)
1090298010Simp		bioq_flush(&isc->write_queue, stp, err);
1091298010Simp#endif
1092298010Simp}
1093298010Simp
1094302163Simp#ifdef CAM_IOSCHED_DYNAMIC
1095298010Simpstatic struct bio *
1096298010Simpcam_iosched_get_write(struct cam_iosched_softc *isc)
1097298010Simp{
1098298010Simp	struct bio *bp;
1099298010Simp
1100298010Simp	/*
1101298010Simp	 * We control the write rate by controlling how many requests we send
	 * down to the drive at any one time. Fewer requests limit the
	 * effects both of starvation when the requests take a while and of
	 * write amplification when each request causes more than one write
	 * to the NAND media. Limiting the queue depth like this will also
	 * limit the write throughput, and lets reads that want to compete
	 * do so on fairer terms.
1108298010Simp	 */
1109298010Simp	bp = bioq_first(&isc->write_queue);
1110298010Simp	if (bp == NULL) {
1111298010Simp		if (iosched_debug > 3)
1112298010Simp			printf("No writes present in write_queue\n");
1113298010Simp		return NULL;
1114298010Simp	}
1115298010Simp
1116298010Simp	/*
1117298010Simp	 * If pending read, prefer that based on current read bias
1118298010Simp	 * setting.
1119298010Simp	 */
1120298010Simp	if (bioq_first(&isc->bio_queue) && isc->current_read_bias) {
1121298010Simp		if (iosched_debug)
1122298010Simp			printf("Reads present and current_read_bias is %d queued writes %d queued reads %d\n", isc->current_read_bias, isc->write_stats.queued, isc->read_stats.queued);
1123298010Simp		isc->current_read_bias--;
1124298010Simp		return NULL;
1125298010Simp	}
1126298010Simp
1127298010Simp	/*
1128298010Simp	 * See if our current limiter allows this I/O.
1129298010Simp	 */
1130298010Simp	if (cam_iosched_limiter_iop(&isc->write_stats, bp) != 0) {
1131298010Simp		if (iosched_debug)
1132298010Simp			printf("Can't write because limiter says no.\n");
1133298010Simp		return NULL;
1134298010Simp	}
1135298010Simp
1136298010Simp	/*
1137298010Simp	 * Let's do this: We've passed all the gates and we're a go
1138298010Simp	 * to schedule the I/O in the SIM.
1139298010Simp	 */
1140298010Simp	isc->current_read_bias = isc->read_bias;
1141298010Simp	bioq_remove(&isc->write_queue, bp);
1142298010Simp	if (bp->bio_cmd == BIO_WRITE) {
1143298010Simp		isc->write_stats.queued--;
1144298010Simp		isc->write_stats.total++;
1145298010Simp		isc->write_stats.pending++;
1146298010Simp	}
1147298010Simp	if (iosched_debug > 9)
1148298010Simp		printf("HWQ : %p %#x\n", bp, bp->bio_cmd);
1149298010Simp	return bp;
1150298010Simp}
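/*
 * Example of the read bias above (illustrative): with the default
 * read_bias of 100, up to 100 queued reads may be dispatched ahead of a
 * waiting write while both queues are non-empty; once current_read_bias
 * reaches zero the write is released and the bias resets to read_bias.
 */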
1151298010Simp#endif
1152298010Simp
1153298010Simp/*
1154298010Simp * Put back a trim that you weren't able to actually schedule this time.
1155298010Simp */
1156298010Simpvoid
1157298010Simpcam_iosched_put_back_trim(struct cam_iosched_softc *isc, struct bio *bp)
1158298010Simp{
1159298010Simp	bioq_insert_head(&isc->trim_queue, bp);
1160302163Simp#ifdef CAM_IOSCHED_DYNAMIC
1161298010Simp	isc->trim_stats.queued++;
1162298010Simp	isc->trim_stats.total--;		/* since we put it back, don't double count */
1163298010Simp	isc->trim_stats.pending--;
1164298010Simp#endif
1165298010Simp}
1166298010Simp
1167298010Simp/*
 * Gets the next trim from the trim queue.
 *
 * Assumes we're called with the periph lock held.  It removes this
 * trim from the queue and the device must explicitly reinsert it
1172298010Simp * should the need arise.
1173298010Simp */
1174298010Simpstruct bio *
1175298010Simpcam_iosched_next_trim(struct cam_iosched_softc *isc)
1176298010Simp{
1177298010Simp	struct bio *bp;
1178298010Simp
1179298010Simp	bp  = bioq_first(&isc->trim_queue);
1180298010Simp	if (bp == NULL)
1181298010Simp		return NULL;
1182298010Simp	bioq_remove(&isc->trim_queue, bp);
1183302163Simp#ifdef CAM_IOSCHED_DYNAMIC
1184298010Simp	isc->trim_stats.queued--;
1185298010Simp	isc->trim_stats.total++;
1186298010Simp	isc->trim_stats.pending++;
1187298010Simp#endif
1188298010Simp	return bp;
1189298010Simp}
1190298010Simp
1191298010Simp/*
 * Gets an available trim from the trim queue, if there's no trim
 * already pending. It removes this trim from the queue and the device
 * must explicitly reinsert it should the need arise.
1195298010Simp *
1196298010Simp * Assumes we're called with the periph lock held.
1197298010Simp */
1198298010Simpstruct bio *
1199298010Simpcam_iosched_get_trim(struct cam_iosched_softc *isc)
1200298010Simp{
1201298010Simp
1202298010Simp	if (!cam_iosched_has_more_trim(isc))
1203298010Simp		return NULL;
1204298010Simp
1205298010Simp	return cam_iosched_next_trim(isc);
1206298010Simp}
1207298010Simp
1208298010Simp/*
1209298010Simp * Determine what the next bit of work to do is for the periph. The
1210298010Simp * default implementation looks to see if we have trims to do, but no
1211298010Simp * trims outstanding. If so, we do that. Otherwise we see if we have
1212298010Simp * other work. If we do, then we do that. Otherwise why were we called?
1213298010Simp */
1214298010Simpstruct bio *
1215298010Simpcam_iosched_next_bio(struct cam_iosched_softc *isc)
1216298010Simp{
1217298010Simp	struct bio *bp;
1218298010Simp
1219298010Simp	/*
1220298010Simp	 * See if we have a trim that can be scheduled. We can only send one
1221298010Simp	 * at a time down, so this takes that into account.
1222298010Simp	 *
1223298010Simp	 * XXX newer TRIM commands are queueable. Revisit this when we
1224298010Simp	 * implement them.
1225298010Simp	 */
1226298010Simp	if ((bp = cam_iosched_get_trim(isc)) != NULL)
1227298010Simp		return bp;
1228298010Simp
1229302163Simp#ifdef CAM_IOSCHED_DYNAMIC
1230298010Simp	/*
1231298010Simp	 * See if we have any pending writes, and room in the queue for them,
1232298010Simp	 * and if so, those are next.
1233298010Simp	 */
1234302396Simp	if (do_dynamic_iosched) {
1235298010Simp		if ((bp = cam_iosched_get_write(isc)) != NULL)
1236298010Simp			return bp;
1237298010Simp	}
1238298010Simp#endif
1239298010Simp
1240298010Simp	/*
1241298010Simp	 * next, see if there's other, normal I/O waiting. If so return that.
1242298010Simp	 */
1243298010Simp	if ((bp = bioq_first(&isc->bio_queue)) == NULL)
1244298010Simp		return NULL;
1245298010Simp
1246302163Simp#ifdef CAM_IOSCHED_DYNAMIC
1247298010Simp	/*
	 * For the dynamic scheduler, bio_queue is only for reads, so enforce
1249298010Simp	 * the limits here. Enforce only for reads.
1250298010Simp	 */
1251302396Simp	if (do_dynamic_iosched) {
1252298010Simp		if (bp->bio_cmd == BIO_READ &&
1253298010Simp		    cam_iosched_limiter_iop(&isc->read_stats, bp) != 0)
1254298010Simp			return NULL;
1255298010Simp	}
1256298010Simp#endif
1257298010Simp	bioq_remove(&isc->bio_queue, bp);
1258302163Simp#ifdef CAM_IOSCHED_DYNAMIC
1259302396Simp	if (do_dynamic_iosched) {
1260298010Simp		if (bp->bio_cmd == BIO_READ) {
1261298010Simp			isc->read_stats.queued--;
1262298010Simp			isc->read_stats.total++;
1263298010Simp			isc->read_stats.pending++;
1264298010Simp		} else
1265298010Simp			printf("Found bio_cmd = %#x\n", bp->bio_cmd);
1266298010Simp	}
1267298010Simp	if (iosched_debug > 9)
1268298010Simp		printf("HWQ : %p %#x\n", bp, bp->bio_cmd);
1269298010Simp#endif
1270298010Simp	return bp;
1271298010Simp}
1272298010Simp
1273298010Simp/*
1274298010Simp * Driver has been given some work to do by the block layer. Tell the
1275298010Simp * scheduler about it and have it queue the work up. The scheduler module
1276298010Simp * will then return the currently most useful bit of work later, possibly
1277298010Simp * deferring work for various reasons.
1278298010Simp */
1279298010Simpvoid
1280298010Simpcam_iosched_queue_work(struct cam_iosched_softc *isc, struct bio *bp)
1281298010Simp{
1282298010Simp
1283298010Simp	/*
1284298010Simp	 * Put all trims on the trim queue sorted, since we know
1285298010Simp	 * that the collapsing code requires this. Otherwise put
1286298010Simp	 * the work on the bio queue.
1287298010Simp	 */
1288298010Simp	if (bp->bio_cmd == BIO_DELETE) {
1289298010Simp		bioq_disksort(&isc->trim_queue, bp);
1290302163Simp#ifdef CAM_IOSCHED_DYNAMIC
1291298010Simp		isc->trim_stats.in++;
1292298010Simp		isc->trim_stats.queued++;
1293298010Simp#endif
1294298010Simp	}
1295302163Simp#ifdef CAM_IOSCHED_DYNAMIC
1296302396Simp	else if (do_dynamic_iosched &&
1297298010Simp	    (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_FLUSH)) {
1298298010Simp		if (cam_iosched_sort_queue(isc))
1299298010Simp			bioq_disksort(&isc->write_queue, bp);
1300298010Simp		else
1301298010Simp			bioq_insert_tail(&isc->write_queue, bp);
1302298010Simp		if (iosched_debug > 9)
1303298010Simp			printf("Qw  : %p %#x\n", bp, bp->bio_cmd);
1304298010Simp		if (bp->bio_cmd == BIO_WRITE) {
1305298010Simp			isc->write_stats.in++;
1306298010Simp			isc->write_stats.queued++;
1307298010Simp		}
1308298010Simp	}
1309298010Simp#endif
1310298010Simp	else {
1311298010Simp		if (cam_iosched_sort_queue(isc))
1312298010Simp			bioq_disksort(&isc->bio_queue, bp);
1313298010Simp		else
1314298010Simp			bioq_insert_tail(&isc->bio_queue, bp);
1315302163Simp#ifdef CAM_IOSCHED_DYNAMIC
1316298010Simp		if (iosched_debug > 9)
1317298010Simp			printf("Qr  : %p %#x\n", bp, bp->bio_cmd);
1318298010Simp		if (bp->bio_cmd == BIO_READ) {
1319298010Simp			isc->read_stats.in++;
1320298010Simp			isc->read_stats.queued++;
1321298010Simp		} else if (bp->bio_cmd == BIO_WRITE) {
1322298010Simp			isc->write_stats.in++;
1323298010Simp			isc->write_stats.queued++;
1324298010Simp		}
1325298010Simp#endif
1326298010Simp	}
1327298010Simp}
1328298010Simp
1329298010Simp/*
1330298010Simp * If we have work, get it scheduled. Called with the periph lock held.
1331298010Simp */
1332298010Simpvoid
1333298010Simpcam_iosched_schedule(struct cam_iosched_softc *isc, struct cam_periph *periph)
1334298010Simp{
1335298010Simp
1336298010Simp	if (cam_iosched_has_work(isc))
1337298010Simp		xpt_schedule(periph, CAM_PRIORITY_NORMAL);
1338298010Simp}
1339298010Simp
1340298010Simp/*
1341298010Simp * Complete a trim request
1342298010Simp */
1343298010Simpvoid
1344298010Simpcam_iosched_trim_done(struct cam_iosched_softc *isc)
1345298010Simp{
1346298010Simp
1347298010Simp	isc->flags &= ~CAM_IOSCHED_FLAG_TRIM_ACTIVE;
1348298010Simp}
1349298010Simp
1350298010Simp/*
1351298010Simp * Complete a bio. Called before we release the ccb with xpt_release_ccb so we
1352298010Simp * might use notes in the ccb for statistics.
1353298010Simp */
1354298010Simpint
1355298010Simpcam_iosched_bio_complete(struct cam_iosched_softc *isc, struct bio *bp,
1356298010Simp    union ccb *done_ccb)
1357298010Simp{
1358298010Simp	int retval = 0;
1359302163Simp#ifdef CAM_IOSCHED_DYNAMIC
1360302396Simp	if (!do_dynamic_iosched)
1361298010Simp		return retval;
1362298010Simp
1363298010Simp	if (iosched_debug > 10)
1364298010Simp		printf("done: %p %#x\n", bp, bp->bio_cmd);
1365298010Simp	if (bp->bio_cmd == BIO_WRITE) {
1366298010Simp		retval = cam_iosched_limiter_iodone(&isc->write_stats, bp);
1367298010Simp		isc->write_stats.out++;
1368298010Simp		isc->write_stats.pending--;
1369298010Simp	} else if (bp->bio_cmd == BIO_READ) {
1370298010Simp		retval = cam_iosched_limiter_iodone(&isc->read_stats, bp);
1371298010Simp		isc->read_stats.out++;
1372298010Simp		isc->read_stats.pending--;
1373298010Simp	} else if (bp->bio_cmd == BIO_DELETE) {
1374298010Simp		isc->trim_stats.out++;
1375298010Simp		isc->trim_stats.pending--;
1376298010Simp	} else if (bp->bio_cmd != BIO_FLUSH) {
1377298010Simp		if (iosched_debug)
1378298010Simp			printf("Completing command with bio_cmd == %#x\n", bp->bio_cmd);
1379298010Simp	}
1380298010Simp
1381298010Simp	if (!(bp->bio_flags & BIO_ERROR))
1382298010Simp		cam_iosched_io_metric_update(isc, done_ccb->ccb_h.qos.sim_data,
1383298010Simp		    bp->bio_cmd, bp->bio_bcount);
1384298010Simp#endif
1385298010Simp	return retval;
1386298010Simp}
1387298010Simp
1388298010Simp/*
1389298010Simp * Tell the io scheduler that you've pushed a trim down into the sim.
1390298010Simp * xxx better place for this?
1391298010Simp */
1392298010Simpvoid
1393298010Simpcam_iosched_submit_trim(struct cam_iosched_softc *isc)
1394298010Simp{
1395298010Simp
1396298010Simp	isc->flags |= CAM_IOSCHED_FLAG_TRIM_ACTIVE;
1397298010Simp}
1398298010Simp
1399298010Simp/*
1400298010Simp * Change the sorting policy hint for I/O transactions for this device.
1401298010Simp */
1402298010Simpvoid
1403298010Simpcam_iosched_set_sort_queue(struct cam_iosched_softc *isc, int val)
1404298010Simp{
1405298010Simp
1406298010Simp	isc->sort_io_queue = val;
1407298010Simp}
1408298010Simp
1409298010Simpint
1410298010Simpcam_iosched_has_work_flags(struct cam_iosched_softc *isc, uint32_t flags)
1411298010Simp{
1412298010Simp	return isc->flags & flags;
1413298010Simp}
1414298010Simp
1415298010Simpvoid
1416298010Simpcam_iosched_set_work_flags(struct cam_iosched_softc *isc, uint32_t flags)
1417298010Simp{
1418298010Simp	isc->flags |= flags;
1419298010Simp}
1420298010Simp
1421298010Simpvoid
1422298010Simpcam_iosched_clr_work_flags(struct cam_iosched_softc *isc, uint32_t flags)
1423298010Simp{
1424298010Simp	isc->flags &= ~flags;
1425298010Simp}
1426298010Simp
1427302163Simp#ifdef CAM_IOSCHED_DYNAMIC
1428298010Simp/*
1429298010Simp * After the method presented in Jack Crenshaw's 1998 article "Integer
 * Square Roots," reprinted at
 * http://www.embedded.com/electronics-blogs/programmer-s-toolbox/4219659/Integer-Square-Roots
 * and well worth the read. Briefly, we find the largest power of 4 that's
 * no larger than val. We then check each smaller power of 4 to
 * see if val is still bigger. The right shifts at each step divide
 * the result by 2, which after successive application winds up
 * accumulating the right answer. It could also have been accumulated
 * using a separate root counter, but this code is smaller and faster
 * than that method. This method is also integer-size invariant.
 * It returns floor(sqrt((float)val)), or the largest integer less than
 * or equal to the square root.
1441298010Simp */
1442298010Simpstatic uint64_t
1443298010Simpisqrt64(uint64_t val)
1444298010Simp{
1445298010Simp	uint64_t res = 0;
1446298010Simp	uint64_t bit = 1ULL << (sizeof(uint64_t) * NBBY - 2);
1447298010Simp
1448298010Simp	/*
1449298010Simp	 * Find the largest power of 4 that does not exceed val.
1450298010Simp	 */
1451298010Simp	while (bit > val)
1452298010Simp		bit >>= 2;
1453298010Simp
1454298010Simp	/*
1455298010Simp	 * Accumulate the answer, one bit at a time (we keep moving
1456298010Simp	 * them over since 2 is the square root of 4 and we test
1457298010Simp	 * powers of 4). We accumulate where we find the bit, but
1458298010Simp	 * the successive shifts land the bit in the right place
1459298010Simp	 * by the end.
1460298010Simp	 */
1461298010Simp	while (bit != 0) {
1462298010Simp		if (val >= res + bit) {
1463298010Simp			val -= res + bit;
1464298010Simp			res = (res >> 1) + bit;
1465298010Simp		} else
1466298010Simp			res >>= 1;
1467298010Simp		bit >>= 2;
1468298010Simp	}
1469298010Simp
1470298010Simp	return res;
1471298010Simp}
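
/*
 * A worked example of the algorithm above, isqrt64(10):
 *
 *	bit settles at 4 (the largest power of 4 not exceeding 10), res = 0
 *	10 >= 0 + 4, so val = 6, res = (0 >> 1) + 4 = 4, bit = 1
 *	 6 >= 4 + 1, so val = 1, res = (4 >> 1) + 1 = 3, bit = 0
 *	result: 3 == floor(sqrt(10))
 *
 * Similarly, isqrt64(1000000) == 1000 and isqrt64(~0ULL) == 0xffffffff.
 */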
1472298010Simp
1473298010Simp/*
1474298010Simp * a and b are 32.32 fixed point stored in a 64-bit word.
1475298010Simp * Let al and bl be the fractional (.32) parts of a and b.
1476298010Simp * Let ah and bh be the integer (32) parts of a and b.
1477298010Simp * R is the radix and is 1 << 32
1478298010Simp *
1479298010Simp * a * b
1480298010Simp * (ah + al / R) * (bh + bl / R)
1481298010Simp * ah * bh + (al * bh + ah * bl) / R + al * bl / R^2
1482298010Simp *
1483298010Simp * After multiplication, we have to renormalize by multiplying by
1484298010Simp * R, so we wind up with
1485298010Simp *	ah * bh * R + al * bh + ah * bl + al * bl / R
1486298010Simp * which turns out to be a very nice way to compute this value.
1487298010Simp * As long as ah and bh are < 65536, there is no loss of high bits,
1488298010Simp * and the low order bits are below the threshold of caring for
1489298010Simp * this application.
1490298010Simp */
1491298010Simpstatic uint64_t
1492298010Simpmul(uint64_t a, uint64_t b)
1493298010Simp{
1494298010Simp	uint64_t al, ah, bl, bh;
1495298010Simp	al = a & 0xffffffff;
1496298010Simp	ah = a >> 32;
1497298010Simp	bl = b & 0xffffffff;
1498298010Simp	bh = b >> 32;
1499298010Simp	return ((ah * bh) << 32) + al * bh + ah * bl + ((al * bl) >> 32);
1500298010Simp}
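
/*
 * A worked example of the 32.32 multiply above: 1.5 * 2.25 = 3.375.
 * 1.5 is stored as 0x180000000 (ah = 1, al = 0x80000000) and 2.25 as
 * 0x240000000 (bh = 2, bl = 0x40000000), so
 *
 *	(ah * bh) << 32   = 0x200000000
 *	al * bh           = 0x100000000
 *	ah * bl           =  0x40000000
 *	(al * bl) >> 32   =  0x20000000
 *	sum               = 0x360000000 = 3 + 0x60000000 / 2^32 = 3.375
 */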
1501298010Simp
1502298010Simpstatic void
1503298010Simpcam_iosched_update(struct iop_stats *iop, sbintime_t sim_latency)
1504298010Simp{
1505298010Simp	sbintime_t y, yy;
1506298010Simp	uint64_t var;
1507298010Simp
1508298010Simp	/*
1509298010Simp	 * Classic exponentially decaying average with a tiny alpha
1510298010Simp	 * (2 ^ -alpha_bits). For more info see the NIST statistical
1511298010Simp	 * handbook.
1512298010Simp	 *
1513298010Simp	 * ema_t = y_t * alpha + ema_{t-1} * (1 - alpha)
1514298010Simp	 * alpha = 1 / (1 << alpha_bits)
1515298010Simp	 *
1516298010Simp	 * Since alpha is a power of two, we can compute this without any
1517298010Simp	 * multiplication or division.
1518298010Simp	 */
1519298010Simp	y = sim_latency;
1520298010Simp	iop->ema = (y + (iop->ema << alpha_bits) - iop->ema) >> alpha_bits;
1521298010Simp
1522298010Simp	yy = mul(y, y);
1523298010Simp	iop->emss = (yy + (iop->emss << alpha_bits) - iop->emss) >> alpha_bits;
1524298010Simp
1525298010Simp	/*
1526298010Simp	 * s_1 = sum of data
1527298010Simp	 * s_2 = sum of data * data
1528298010Simp	 * ema ~ mean (or s_1 / N)
1529298010Simp	 * emss ~ s_2 / N
1530298010Simp	 *
1531298010Simp	 * sd = sqrt((N * s_2 - s_1 ^ 2) / (N * (N - 1)))
1532298010Simp	 * sd = sqrt((N * s_2 / (N * (N - 1))) - (s_1 ^ 2 / (N * (N - 1))))
1533298010Simp	 *
1534298010Simp	 * N ~ 2 / alpha - 1
1535298010Simp	 * alpha < 1 / 16 (typically much less)
1536298010Simp	 * N > 31 --> N is large, so N * (N - 1) is approximately N * N
1537298010Simp	 *
1538298010Simp	 * substituting and rearranging:
1539298010Simp	 * sd ~ sqrt(s_2 / N - (s_1 / N) ^ 2)
1540298010Simp	 *    ~ sqrt(emss - ema ^ 2);
1541298010Simp	 * which is the formula used here to get a decent estimate of sd, which
1542298010Simp	 * we use to detect outliers. Note that when first starting up, it
1543298010Simp	 * takes a while for the emss sum-of-squares estimator to converge on a
1544298010Simp	 * good value. During this time, it can be less than ema^2. We
1545298010Simp	 * compute an sd of 0 in that case and ignore outliers.
1546298010Simp	 */
1547298010Simp	var = iop->emss - mul(iop->ema, iop->ema);
1548298010Simp	iop->sd = (int64_t)var < 0 ? 0 : isqrt64(var);
1549298010Simp}
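
/*
 * A worked example of the shift-based EMA update above, using an
 * illustrative alpha_bits of 2 (alpha = 1/4; the tunable's real value may
 * differ): with iop->ema = 100 and a new sample y = 200,
 *
 *	ema = (200 + (100 << 2) - 100) >> 2 = 500 >> 2 = 125
 *
 * which matches 200 * 1/4 + 100 * 3/4 = 125.  The same update is applied to
 * emss with y * y as the sample, and sd then follows as isqrt64(emss - ema^2)
 * per the derivation above.
 */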
1550298010Simp
1551302163Simp#ifdef CAM_IOSCHED_DYNAMIC
1552298010Simpstatic void
1553298010Simpcam_iosched_io_metric_update(struct cam_iosched_softc *isc,
1554298010Simp    sbintime_t sim_latency, int cmd, size_t size)
1555298010Simp{
1556298010Simp	/* XXX Do we need to scale based on the size of the I/O? */
1557298010Simp	switch (cmd) {
1558298010Simp	case BIO_READ:
1559298010Simp		cam_iosched_update(&isc->read_stats, sim_latency);
1560298010Simp		break;
1561298010Simp	case BIO_WRITE:
1562298010Simp		cam_iosched_update(&isc->write_stats, sim_latency);
1563298010Simp		break;
1564298010Simp	case BIO_DELETE:
1565298010Simp		cam_iosched_update(&isc->trim_stats, sim_latency);
1566298010Simp		break;
1567298010Simp	default:
1568298010Simp		break;
1569298010Simp	}
1570298010Simp}
1571298036Simp#endif
1572298010Simp
1573298010Simp#ifdef DDB
1574298010Simpstatic int biolen(struct bio_queue_head *bq)
1575298010Simp{
1576298010Simp	int i = 0;
1577298010Simp	struct bio *bp;
1578298010Simp
1579298010Simp	TAILQ_FOREACH(bp, &bq->queue, bio_queue) {
1580298010Simp		i++;
1581298010Simp	}
1582298010Simp	return i;
1583298010Simp}
1584298010Simp
1585298010Simp/*
1586298010Simp * Show the internal state of the I/O scheduler.
1587298010Simp */
1588298010SimpDB_SHOW_COMMAND(iosched, cam_iosched_db_show)
1589298010Simp{
1590298010Simp	struct cam_iosched_softc *isc;
1591298010Simp
1592298010Simp	if (!have_addr) {
1593298010Simp		db_printf("Need addr\n");
1594298010Simp		return;
1595298010Simp	}
1596298010Simp	isc = (struct cam_iosched_softc *)addr;
1597298010Simp	db_printf("pending_reads:     %d\n", isc->read_stats.pending);
1598298010Simp	db_printf("min_reads:         %d\n", isc->read_stats.min);
1599298010Simp	db_printf("max_reads:         %d\n", isc->read_stats.max);
1600298010Simp	db_printf("reads:             %d\n", isc->read_stats.total);
1601298010Simp	db_printf("in_reads:          %d\n", isc->read_stats.in);
1602298010Simp	db_printf("out_reads:         %d\n", isc->read_stats.out);
1603298010Simp	db_printf("queued_reads:      %d\n", isc->read_stats.queued);
1604298010Simp	db_printf("Current Q len      %d\n", biolen(&isc->bio_queue));
1605298010Simp	db_printf("pending_writes:    %d\n", isc->write_stats.pending);
1606298010Simp	db_printf("min_writes:        %d\n", isc->write_stats.min);
1607298010Simp	db_printf("max_writes:        %d\n", isc->write_stats.max);
1608298010Simp	db_printf("writes:            %d\n", isc->write_stats.total);
1609298010Simp	db_printf("in_writes:         %d\n", isc->write_stats.in);
1610298010Simp	db_printf("out_writes:        %d\n", isc->write_stats.out);
1611298010Simp	db_printf("queued_writes:     %d\n", isc->write_stats.queued);
1612298010Simp	db_printf("Current Q len      %d\n", biolen(&isc->write_queue));
1613298010Simp	db_printf("pending_trims:     %d\n", isc->trim_stats.pending);
1614298010Simp	db_printf("min_trims:         %d\n", isc->trim_stats.min);
1615298010Simp	db_printf("max_trims:         %d\n", isc->trim_stats.max);
1616298010Simp	db_printf("trims:             %d\n", isc->trim_stats.total);
1617298010Simp	db_printf("in_trims:          %d\n", isc->trim_stats.in);
1618298010Simp	db_printf("out_trims:         %d\n", isc->trim_stats.out);
1619298010Simp	db_printf("queued_trims:      %d\n", isc->trim_stats.queued);
1620298010Simp	db_printf("Current Q len      %d\n", biolen(&isc->trim_queue));
1621298010Simp	db_printf("read_bias:         %d\n", isc->read_bias);
1622298010Simp	db_printf("current_read_bias: %d\n", isc->current_read_bias);
1623298010Simp	db_printf("Trim active?       %s\n",
1624298010Simp	    (isc->flags & CAM_IOSCHED_FLAG_TRIM_ACTIVE) ? "yes" : "no");
1625298010Simp}
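
/*
 * Usage note: from ddb this is invoked as
 *
 *	db> show iosched <addr>
 *
 * where <addr> is the address of the relevant struct cam_iosched_softc; the
 * address must be supplied explicitly or the command just prints "Need addr".
 */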
1626298010Simp#endif
1627298010Simp#endif
1628