/*	$FreeBSD: stable/10/sys/contrib/altq/altq/altq_rmclass.c 298133 2016-04-16 22:02:32Z loos $	*/
/*	$KAME: altq_rmclass.c,v 1.19 2005/04/13 03:44:25 suz Exp $	*/

/*
 * Copyright (c) 1991-1997 Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the Network Research
 *      Group at Lawrence Berkeley Laboratory.
 * 4. Neither the name of the University nor of the Laboratory may be used
 *    to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * LBL code modified by speer@eng.sun.com, May 1997.
 * For questions and/or comments, please send mail to cbq@ee.lbl.gov
 *
 * @(#)rm_class.c  1.48     97/12/05 SMI
 */
#if defined(__FreeBSD__) || defined(__NetBSD__)
#include "opt_altq.h"
#include "opt_inet.h"
#ifdef __FreeBSD__
#include "opt_inet6.h"
#endif
#endif /* __FreeBSD__ || __NetBSD__ */
#ifdef ALTQ_CBQ	/* cbq is enabled by ALTQ_CBQ option in opt_altq.h */

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/time.h>
#ifdef ALTQ3_COMPAT
#include <sys/kernel.h>
#endif

#include <net/if.h>
#include <net/if_var.h>
#ifdef ALTQ3_COMPAT
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#endif

#include <altq/if_altq.h>
#include <altq/altq.h>
#include <altq/altq_codel.h>
#include <altq/altq_rmclass.h>
#include <altq/altq_rmclass_debug.h>
#include <altq/altq_red.h>
#include <altq/altq_rio.h>

/*
 * Local Macros
 */

#define	reset_cutoff(ifd)	{ ifd->cutoff_ = RM_MAXDEPTH; }

/*
 * Local routines.
 */

static int	rmc_satisfied(struct rm_class *, struct timeval *);
static void	rmc_wrr_set_weights(struct rm_ifdat *);
static void	rmc_depth_compute(struct rm_class *);
static void	rmc_depth_recompute(rm_class_t *);

static mbuf_t	*_rmc_wrr_dequeue_next(struct rm_ifdat *, int);
static mbuf_t	*_rmc_prr_dequeue_next(struct rm_ifdat *, int);

static int	_rmc_addq(rm_class_t *, mbuf_t *);
static void	_rmc_dropq(rm_class_t *);
static mbuf_t	*_rmc_getq(rm_class_t *);
static mbuf_t	*_rmc_pollq(rm_class_t *);

static int	rmc_under_limit(struct rm_class *, struct timeval *);
static void	rmc_tl_satisfied(struct rm_ifdat *, struct timeval *);
static void	rmc_drop_action(struct rm_class *);
static void	rmc_restart(struct rm_class *);
static void	rmc_root_overlimit(struct rm_class *, struct rm_class *);

#define	BORROW_OFFTIME
/*
 * BORROW_OFFTIME (experimental):
 * borrow the offtime of the class being borrowed from.
 * the reason is that when its own offtime is set, the class is unable
 * to borrow much, especially when cutoff is taking effect.
 * but when the borrowed class is overloaded (avgidle is close to minidle),
 * use the borrowing class's offtime to avoid overload.
 */
#define	ADJUST_CUTOFF
/*
 * ADJUST_CUTOFF (experimental):
 * if no underlimit class is found due to cutoff, increase cutoff and
 * retry the scheduling loop.
 * also, don't invoke delay_actions while cutoff is taking effect,
 * since a sleeping class won't have a chance to be scheduled in the
 * next loop.
 *
 * the heuristics for setting the top-level variable (cutoff_) become:
 *	1. if a packet arrives for a not-overlimit class, set cutoff
 *	   to the depth of the class.
 *	2. if cutoff is i, and a packet arrives for an overlimit class
 *	   with an underlimit ancestor at a lower level than i (say j),
 *	   then set cutoff to j.
 *	3. when scheduling a packet, if there is no underlimit class
 *	   due to the current cutoff level, increase cutoff by 1 and
 *	   then try to schedule again.
 */
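
/*
 * Illustrative walk-through of the cutoff_ heuristics above (an added
 * example, not from the original sources; the tree shape is hypothetical).
 * Consider a three-level tree: root (depth 2), an agency class A
 * (depth 1), and leaf classes (depth 0).  Initially cutoff_ is
 * RM_MAXDEPTH, so any class may borrow all the way up to the root.
 *   - a packet arrives for an underlimit leaf: rule 1 sets cutoff_ = 0,
 *     temporarily disabling all borrowing.
 *   - a packet arrives for an overlimit leaf whose agency A is still
 *     underlimit: rule 2 raises cutoff_ to 1, re-enabling borrowing
 *     from A (but not from the root).
 *   - the scheduler finds no underlimit class at cutoff_ = 1: rule 3
 *     (under ADJUST_CUTOFF) bumps cutoff_ to 2 and rescans, now
 *     permitting borrowing from the root.
 */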

/*
 * rm_class_t *
 * rmc_newclass(...) - Create a new resource management class at priority
 * 'pri' on the interface given by 'ifd'.
 *
 * nsecPerByte  is the data rate of the interface in nanoseconds/byte.
 *              E.g., 800 for a 10Mb/s ethernet.  If the class gets less
 *              than 100% of the bandwidth, this number should be the
 *              'effective' rate for the class.  Let f be the
 *              bandwidth fraction allocated to this class, and let
 *              nsPerByte be the data rate of the output link in
 *              nanoseconds/byte.  Then nsecPerByte is set to
 *              nsPerByte / f.  E.g., 1600 (= 800 / .5)
 *              for a class that gets 50% of an ethernet's bandwidth.
 *
 * action       the routine to call when the class is over limit.
 *
 * maxq         max allowable queue size for class (in packets).
 *
 * parent       parent class pointer.
 *
 * borrow       class to borrow from (should be either 'parent' or null).
 *
 * maxidle      max value allowed for class 'idle' time estimate (this
 *              parameter determines how large an initial burst of packets
 *              can be before overlimit action is invoked).
 *
 * offtime      how long 'delay' action will delay when class goes over
 *              limit (this parameter determines the steady-state burst
 *              size when a class is running over its limit).
 *
 * Maxidle and offtime have to be computed from the following:  If the
 * average packet size is s, the bandwidth fraction allocated to this
 * class is f, we want to allow b packet bursts, and the gain of the
 * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then:
 *
 *   ptime = s * nsPerByte * (1 - f) / f
 *   maxidle = ptime * (1 - g^b) / g^b
 *   minidle = -ptime * (1 / (f - 1))
 *   offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1))
 *
 * Operationally, it's convenient to specify maxidle & offtime in units
 * independent of the link bandwidth so the maxidle & offtime passed to
 * this routine are the above values multiplied by 8*f/(1000*nsPerByte).
 * (The constant factor is a scale factor needed to make the parameters
 * integers.  This scaling also means that the 'unscaled' values of
 * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds,
 * not nanoseconds.)  Also note that the 'idle' filter computation keeps
 * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of
 * maxidle also must be scaled upward by this value.  Thus, the passed
 * values for maxidle and offtime can be computed as follows:
 *
 * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte)
 * offtime = offtime * 8 / (1000 * nsecPerByte)
 *
 * When USE_HRTIME is employed, then maxidle and offtime become:
 * 	maxidle = maxidle * (8.0 / nsecPerByte);
 * 	offtime = offtime * (8.0 / nsecPerByte);
 */
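
/*
 * Worked example of the parameter derivation above (added illustration,
 * assuming RM_FILTER_GAIN == 5 so g = 31/32, and USE_HRTIME not
 * employed): take a 10Mb/s ethernet (nsPerByte = 800), a class with
 * f = 0.5 (nsecPerByte = 1600), average packet size s = 1000 bytes,
 * and burst b = 2 packets.
 *
 *   ptime   = 1000 * 800 * (1 - .5) / .5             = 800000 ns
 *   maxidle = 800000 * (1 - (31/32)^2) / (31/32)^2   ~  52400 ns
 *   offtime = 800000 * (1 + 32 * (1 - 31/32)/(31/32)) ~ 1630000 ns
 *
 * After the 8*f/(1000*nsPerByte) scaling described above (plus the extra
 * 2^RM_FILTER_GAIN factor for maxidle), both parameters come to roughly
 * 8, and rmc_newclass() below turns them back into internal estimates of
 * cl->maxidle_ = 1600 and cl->offtime_ = 50 (microseconds, with maxidle_
 * still carrying the 2^RM_FILTER_GAIN scaling).
 */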
struct rm_class *
rmc_newclass(int pri, struct rm_ifdat *ifd, u_int nsecPerByte,
    void (*action)(rm_class_t *, rm_class_t *), int maxq,
    struct rm_class *parent, struct rm_class *borrow, u_int maxidle,
    int minidle, u_int offtime, int pktsize, int flags)
{
	struct rm_class	*cl;
	struct rm_class	*peer;
	int		 s;

	if (pri >= RM_MAXPRIO)
		return (NULL);
#ifndef ALTQ_RED
	if (flags & RMCF_RED) {
#ifdef ALTQ_DEBUG
		printf("rmc_newclass: RED not configured for CBQ!\n");
#endif
		return (NULL);
	}
#endif
#ifndef ALTQ_RIO
	if (flags & RMCF_RIO) {
#ifdef ALTQ_DEBUG
		printf("rmc_newclass: RIO not configured for CBQ!\n");
#endif
		return (NULL);
	}
#endif
#ifndef ALTQ_CODEL
	if (flags & RMCF_CODEL) {
#ifdef ALTQ_DEBUG
		printf("rmc_newclass: CODEL not configured for CBQ!\n");
#endif
		return (NULL);
	}
#endif

	cl = malloc(sizeof(struct rm_class), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (cl == NULL)
		return (NULL);
	CALLOUT_INIT(&cl->callout_);
	cl->q_ = malloc(sizeof(class_queue_t), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (cl->q_ == NULL) {
		free(cl, M_DEVBUF);
		return (NULL);
	}

	/*
	 * Class initialization.
	 */
	cl->children_ = NULL;
	cl->parent_ = parent;
	cl->borrow_ = borrow;
	cl->leaf_ = 1;
	cl->ifdat_ = ifd;
	cl->pri_ = pri;
	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
	cl->depth_ = 0;
	cl->qthresh_ = 0;
	cl->ns_per_byte_ = nsecPerByte;

	qlimit(cl->q_) = maxq;
	qtype(cl->q_) = Q_DROPHEAD;
	qlen(cl->q_) = 0;
	cl->flags_ = flags;

#if 1 /* minidle is also scaled in ALTQ */
	cl->minidle_ = (minidle * (int)nsecPerByte) / 8;
	if (cl->minidle_ > 0)
		cl->minidle_ = 0;
#else
	cl->minidle_ = minidle;
#endif
	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
	if (cl->maxidle_ == 0)
		cl->maxidle_ = 1;
#if 1 /* offtime is also scaled in ALTQ */
	cl->avgidle_ = cl->maxidle_;
	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
	if (cl->offtime_ == 0)
		cl->offtime_ = 1;
#else
	cl->avgidle_ = 0;
	cl->offtime_ = (offtime * nsecPerByte) / 8;
#endif
	cl->overlimit = action;

#ifdef ALTQ_RED
	if (flags & (RMCF_RED|RMCF_RIO)) {
		int red_flags, red_pkttime;

		red_flags = 0;
		if (flags & RMCF_ECN)
			red_flags |= REDF_ECN;
		if (flags & RMCF_FLOWVALVE)
			red_flags |= REDF_FLOWVALVE;
#ifdef ALTQ_RIO
		if (flags & RMCF_CLEARDSCP)
			red_flags |= RIOF_CLEARDSCP;
#endif
		red_pkttime = nsecPerByte * pktsize / 1000;

		if (flags & RMCF_RED) {
			cl->red_ = red_alloc(0, 0,
			    qlimit(cl->q_) * 10/100,
			    qlimit(cl->q_) * 30/100,
			    red_flags, red_pkttime);
			if (cl->red_ != NULL)
				qtype(cl->q_) = Q_RED;
		}
#ifdef ALTQ_RIO
		else {
			cl->red_ = (red_t *)rio_alloc(0, NULL,
						      red_flags, red_pkttime);
			if (cl->red_ != NULL)
				qtype(cl->q_) = Q_RIO;
		}
#endif
	}
#endif /* ALTQ_RED */
#ifdef ALTQ_CODEL
	if (flags & RMCF_CODEL) {
		cl->codel_ = codel_alloc(5, 100, 0);
		if (cl->codel_ != NULL)
			qtype(cl->q_) = Q_CODEL;
	}
#endif

	/*
	 * put the class into the class tree
	 */
#ifdef __NetBSD__
	s = splnet();
#else
	s = splimp();
#endif
	IFQ_LOCK(ifd->ifq_);
	if ((peer = ifd->active_[pri]) != NULL) {
		/* find the last class at this pri */
		cl->peer_ = peer;
		while (peer->peer_ != ifd->active_[pri])
			peer = peer->peer_;
		peer->peer_ = cl;
	} else {
		ifd->active_[pri] = cl;
		cl->peer_ = cl;
	}

	if (cl->parent_) {
		cl->next_ = parent->children_;
		parent->children_ = cl;
		parent->leaf_ = 0;
	}

	/*
	 * Compute the depth of this class and its ancestors in the class
	 * hierarchy.
	 */
	rmc_depth_compute(cl);

	/*
	 * If CBQ's WRR is enabled, then initialize the class WRR state.
	 */
	if (ifd->wrr_) {
		ifd->num_[pri]++;
		ifd->alloc_[pri] += cl->allotment_;
		rmc_wrr_set_weights(ifd);
	}
	IFQ_UNLOCK(ifd->ifq_);
	splx(s);
	return (cl);
}

int
rmc_modclass(struct rm_class *cl, u_int nsecPerByte, int maxq, u_int maxidle,
    int minidle, u_int offtime, int pktsize)
{
	struct rm_ifdat	*ifd;
	u_int		 old_allotment;
	int		 s;

	ifd = cl->ifdat_;
	old_allotment = cl->allotment_;

#ifdef __NetBSD__
	s = splnet();
#else
	s = splimp();
#endif
	IFQ_LOCK(ifd->ifq_);
	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
	cl->qthresh_ = 0;
	cl->ns_per_byte_ = nsecPerByte;

	qlimit(cl->q_) = maxq;

#if 1 /* minidle is also scaled in ALTQ */
	cl->minidle_ = (minidle * nsecPerByte) / 8;
	if (cl->minidle_ > 0)
		cl->minidle_ = 0;
#else
	cl->minidle_ = minidle;
#endif
	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
	if (cl->maxidle_ == 0)
		cl->maxidle_ = 1;
#if 1 /* offtime is also scaled in ALTQ */
	cl->avgidle_ = cl->maxidle_;
	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
	if (cl->offtime_ == 0)
		cl->offtime_ = 1;
#else
	cl->avgidle_ = 0;
	cl->offtime_ = (offtime * nsecPerByte) / 8;
#endif

	/*
	 * If CBQ's WRR is enabled, then initialize the class WRR state.
	 */
	if (ifd->wrr_) {
		ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment;
		rmc_wrr_set_weights(ifd);
	}
	IFQ_UNLOCK(ifd->ifq_);
	splx(s);
	return (0);
}

/*
 * static void
 * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes
 *	the appropriate round-robin weights for the CBQ weighted
 *	round-robin algorithm.
 *
 *	Returns: NONE
 */

static void
rmc_wrr_set_weights(struct rm_ifdat *ifd)
{
	int		i;
	struct rm_class	*cl, *clh;

	for (i = 0; i < RM_MAXPRIO; i++) {
		/*
		 * This is inverted from that of the simulator to
		 * maintain precision.
		 */
		if (ifd->num_[i] == 0)
			ifd->M_[i] = 0;
		else
			ifd->M_[i] = ifd->alloc_[i] /
				(ifd->num_[i] * ifd->maxpkt_);
		/*
		 * Compute the weighted allotment for each class.
		 * This takes the expensive div instruction out
		 * of the main loop for the wrr scheduling path.
		 * These only get recomputed when a class comes or
		 * goes.
		 */
		if (ifd->active_[i] != NULL) {
			clh = cl = ifd->active_[i];
			do {
				/* safe-guard for slow link or alloc_ == 0 */
				if (ifd->M_[i] == 0)
					cl->w_allotment_ = 0;
				else
					cl->w_allotment_ = cl->allotment_ /
						ifd->M_[i];
				cl = cl->peer_;
			} while ((cl != NULL) && (cl != clh));
		}
	}
}
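
/*
 * Numeric sketch of the weight computation above (added illustration
 * with hypothetical values, not from the original sources).  Suppose
 * priority i has two classes with allotment_ = 400000 and 600000
 * bytes/sec and maxpkt_ = 1500:
 *
 *   M_[i]        = (400000 + 600000) / (2 * 1500) = 333
 *   w_allotment_ = 400000 / 333 ~ 1201   and   600000 / 333 ~ 1801
 *
 * The per-round byte credits keep the 2:3 allotment ratio, and their
 * mean is (alloc_/num_)/M_[i] = maxpkt_, i.e., an average class earns
 * about one MTU-sized packet of credit per WRR round.
 */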

int
rmc_get_weight(struct rm_ifdat *ifd, int pri)
{
	if ((pri >= 0) && (pri < RM_MAXPRIO))
		return (ifd->M_[pri]);
	else
		return (0);
}

/*
 * static void
 * rmc_depth_compute(struct rm_class *cl) - This function computes the
 *	appropriate depth of class 'cl' and its ancestors.
 *
 *	Returns:	NONE
 */

static void
rmc_depth_compute(struct rm_class *cl)
{
	rm_class_t	*t = cl, *p;

	/*
	 * Recompute the depth for the branch of the tree.
	 */
	while (t != NULL) {
		p = t->parent_;
		if (p && (t->depth_ >= p->depth_)) {
			p->depth_ = t->depth_ + 1;
			t = p;
		} else
			t = NULL;
	}
}

/*
 * static void
 * rmc_depth_recompute(struct rm_class *cl) - This function re-computes
 *	the depth of the tree after a class has been deleted.
 *
 *	Returns:	NONE
 */

static void
rmc_depth_recompute(rm_class_t *cl)
{
#if 1 /* ALTQ */
	rm_class_t	*p, *t;

	p = cl;
	while (p != NULL) {
		if ((t = p->children_) == NULL) {
			p->depth_ = 0;
		} else {
			int cdepth = 0;

			while (t != NULL) {
				if (t->depth_ > cdepth)
					cdepth = t->depth_;
				t = t->next_;
			}

			if (p->depth_ == cdepth + 1)
				/* no change to this parent */
				return;

			p->depth_ = cdepth + 1;
		}

		p = p->parent_;
	}
#else
	rm_class_t	*t;

	if (cl->depth_ >= 1) {
		if (cl->children_ == NULL) {
			cl->depth_ = 0;
		} else if ((t = cl->children_) != NULL) {
			while (t != NULL) {
				if (t->children_ != NULL)
					rmc_depth_recompute(t);
				t = t->next_;
			}
		} else
			rmc_depth_compute(cl);
	}
#endif
}

/*
 * void
 * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This
 *	function deletes a class from the link-sharing structure and frees
 *	all resources associated with the class.
 *
 *	Returns: NONE
 */

void
rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl)
{
	struct rm_class	*p, *head, *previous;
	int		 s;

	ASSERT(cl->children_ == NULL);

	if (cl->sleeping_)
		CALLOUT_STOP(&cl->callout_);

#ifdef __NetBSD__
	s = splnet();
#else
	s = splimp();
#endif
	IFQ_LOCK(ifd->ifq_);
	/*
	 * Free packets in the packet queue.
	 * XXX - this may not be a desired behavior.  Packets should be
	 *		re-queued.
	 */
	rmc_dropall(cl);

	/*
	 * If the class has a parent, then remove the class from the
	 * parent's children chain.
	 */
	if (cl->parent_ != NULL) {
		head = cl->parent_->children_;
		p = previous = head;
		if (head->next_ == NULL) {
			ASSERT(head == cl);
			cl->parent_->children_ = NULL;
			cl->parent_->leaf_ = 1;
		} else while (p != NULL) {
			if (p == cl) {
				if (cl == head)
					cl->parent_->children_ = cl->next_;
				else
					previous->next_ = cl->next_;
				cl->next_ = NULL;
				p = NULL;
			} else {
				previous = p;
				p = p->next_;
			}
		}
	}

	/*
	 * Delete class from class priority peer list.
	 */
	if ((p = ifd->active_[cl->pri_]) != NULL) {
		/*
		 * If there is more than one member of this priority
		 * level, then look for class(cl) in the priority level.
		 */
		if (p != p->peer_) {
			while (p->peer_ != cl)
				p = p->peer_;
			p->peer_ = cl->peer_;

			if (ifd->active_[cl->pri_] == cl)
				ifd->active_[cl->pri_] = cl->peer_;
		} else {
			ASSERT(p == cl);
			ifd->active_[cl->pri_] = NULL;
		}
	}

	/*
	 * Recompute the WRR weights.
	 */
	if (ifd->wrr_) {
		ifd->alloc_[cl->pri_] -= cl->allotment_;
		ifd->num_[cl->pri_]--;
		rmc_wrr_set_weights(ifd);
	}

	/*
	 * Re-compute the depth of the tree.
	 */
#if 1 /* ALTQ */
	rmc_depth_recompute(cl->parent_);
#else
	rmc_depth_recompute(ifd->root_);
#endif

	IFQ_UNLOCK(ifd->ifq_);
	splx(s);

	/*
	 * Free the class structure.
	 */
	if (cl->red_ != NULL) {
#ifdef ALTQ_RIO
		if (q_is_rio(cl->q_))
			rio_destroy((rio_t *)cl->red_);
#endif
#ifdef ALTQ_RED
		if (q_is_red(cl->q_))
			red_destroy(cl->red_);
#endif
#ifdef ALTQ_CODEL
		if (q_is_codel(cl->q_))
			codel_destroy(cl->codel_);
#endif
	}
	free(cl->q_, M_DEVBUF);
	free(cl, M_DEVBUF);
}


/*
 * void
 * rmc_init(...) - Initialize the resource management data structures
 *	associated with the output portion of interface 'ifp'.  'ifd' is
 *	where the structures will be built (for backwards compatibility, the
 *	structures aren't kept in the ifnet struct).  'nsecPerByte'
 *	gives the link speed (inverse of bandwidth) in nanoseconds/byte.
 *	'restart' is the driver-specific routine that the generic 'delay
 *	until under limit' action will call to restart output.  `maxq'
 *	is the queue size of the 'link' & 'default' classes.  'maxqueued'
 *	is the maximum number of packets that the resource management
 *	code will allow to be queued 'downstream' (this is typically 1).
 *
 *	Returns:	NONE
 */

void
rmc_init(struct ifaltq *ifq, struct rm_ifdat *ifd, u_int nsecPerByte,
    void (*restart)(struct ifaltq *), int maxq, int maxqueued, u_int maxidle,
    int minidle, u_int offtime, int flags)
{
	int		i, mtu;

	/*
	 * Initialize the CBQ tracing/debug facility.
	 */
	CBQTRACEINIT();

	bzero((char *)ifd, sizeof (*ifd));
	mtu = ifq->altq_ifp->if_mtu;
	ifd->ifq_ = ifq;
	ifd->restart = restart;
	ifd->maxqueued_ = maxqueued;
	ifd->ns_per_byte_ = nsecPerByte;
	ifd->maxpkt_ = mtu;
	ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0;
	ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0;
#if 1
	ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16;
	if (mtu * nsecPerByte > 10 * 1000000)
		ifd->maxiftime_ /= 4;
#endif

	reset_cutoff(ifd);
	CBQTRACE(rmc_init, 'INIT', ifd->cutoff_);

	/*
	 * Initialize the CBQ's WRR state.
	 */
	for (i = 0; i < RM_MAXPRIO; i++) {
		ifd->alloc_[i] = 0;
		ifd->M_[i] = 0;
		ifd->num_[i] = 0;
		ifd->na_[i] = 0;
		ifd->active_[i] = NULL;
	}

	/*
	 * Initialize current packet state.
	 */
	ifd->qi_ = 0;
	ifd->qo_ = 0;
	for (i = 0; i < RM_MAXQUEUED; i++) {
		ifd->class_[i] = NULL;
		ifd->curlen_[i] = 0;
		ifd->borrowed_[i] = NULL;
	}

	/*
	 * Create the root class of the link-sharing structure.
	 */
	if ((ifd->root_ = rmc_newclass(0, ifd,
				       nsecPerByte,
				       rmc_root_overlimit, maxq, 0, 0,
				       maxidle, minidle, offtime,
				       0, 0)) == NULL) {
		printf("rmc_init: root class not allocated\n");
		return;
	}
	ifd->root_->depth_ = 0;
}
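
/*
 * Hedged usage sketch (added illustration, not from the original
 * sources): a driver attaching CBQ would typically call rmc_init()
 * once and then carve out classes with rmc_newclass(), e.g. for a
 * 10Mb/s link (800 ns/byte):
 *
 *	rmc_init(ifq, &sc->ifd, 800, my_restart, 50, 1,
 *		 maxidle, minidle, offtime, RMCF_WRR);
 *	cl = rmc_newclass(1, &sc->ifd, 1600, rmc_delay_action, 50,
 *			  sc->ifd.root_, sc->ifd.root_, maxidle, minidle,
 *			  offtime, 1000, 0);
 *
 * where 'my_restart', 'sc', and the scaled maxidle/minidle/offtime
 * values are placeholders, and the leaf gets half the link bandwidth
 * (nsecPerByte = 1600) with the root as both parent and borrow class.
 */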

/*
 * void
 * rmc_queue_packet(struct rm_class *cl, mbuf_t *m) - Add packet given by
 *	mbuf 'm' to queue for resource class 'cl'.  This routine is called
 *	by a driver's if_output routine.  This routine must be called with
 *	output packet completion interrupts locked out (to avoid racing with
 *	rmc_dequeue_next).
 *
 *	Returns:	0 on successful queueing
 *			-1 when packet drop occurs
 */
int
rmc_queue_packet(struct rm_class *cl, mbuf_t *m)
{
	struct timeval	 now;
	struct rm_ifdat *ifd = cl->ifdat_;
	int		 cpri = cl->pri_;
	int		 is_empty = qempty(cl->q_);

	RM_GETTIME(now);
	if (ifd->cutoff_ > 0) {
		if (TV_LT(&cl->undertime_, &now)) {
			if (ifd->cutoff_ > cl->depth_)
				ifd->cutoff_ = cl->depth_;
			CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_);
		}
#if 1 /* ALTQ */
		else {
			/*
			 * the class is overlimit. if the class has
			 * underlimit ancestors, set cutoff to the lowest
			 * depth among them.
			 */
			struct rm_class *borrow = cl->borrow_;

			while (borrow != NULL &&
			       borrow->depth_ < ifd->cutoff_) {
				if (TV_LT(&borrow->undertime_, &now)) {
					ifd->cutoff_ = borrow->depth_;
					CBQTRACE(rmc_queue_packet, 'ffob', ifd->cutoff_);
					break;
				}
				borrow = borrow->borrow_;
			}
		}
#else /* !ALTQ */
		else if ((ifd->cutoff_ > 1) && cl->borrow_) {
			if (TV_LT(&cl->borrow_->undertime_, &now)) {
				ifd->cutoff_ = cl->borrow_->depth_;
				CBQTRACE(rmc_queue_packet, 'ffob',
					 cl->borrow_->depth_);
			}
		}
#endif /* !ALTQ */
	}

	if (_rmc_addq(cl, m) < 0)
		/* failed */
		return (-1);

	if (is_empty) {
		CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle);
		ifd->na_[cpri]++;
	}

	if (qlen(cl->q_) > qlimit(cl->q_)) {
		/* note: qlimit can be set to 0 or 1 */
		rmc_drop_action(cl);
		return (-1);
	}
	return (0);
}

/*
 * void
 * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all
 *	classes to see if they are satisfied.
 */

static void
rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now)
{
	int		 i;
	rm_class_t	*p, *bp;

	for (i = RM_MAXPRIO - 1; i >= 0; i--) {
		if ((bp = ifd->active_[i]) != NULL) {
			p = bp;
			do {
				if (!rmc_satisfied(p, now)) {
					ifd->cutoff_ = p->depth_;
					return;
				}
				p = p->peer_;
			} while (p != bp);
		}
	}

	reset_cutoff(ifd);
}

/*
 * rmc_satisfied - Return 1 if the class is satisfied; 0, otherwise.
 */

static int
rmc_satisfied(struct rm_class *cl, struct timeval *now)
{
	rm_class_t	*p;

	if (cl == NULL)
		return (1);
	if (TV_LT(now, &cl->undertime_))
		return (1);
	if (cl->depth_ == 0) {
		if (!cl->sleeping_ && (qlen(cl->q_) > cl->qthresh_))
			return (0);
		else
			return (1);
	}
	if (cl->children_ != NULL) {
		p = cl->children_;
		while (p != NULL) {
			if (!rmc_satisfied(p, now))
				return (0);
			p = p->next_;
		}
	}

	return (1);
}

/*
 * Return 1 if class 'cl' is under limit or can borrow from a parent,
 * 0 if overlimit.  As a side-effect, this routine will invoke the
 * class overlimit action if the class is overlimit.
 */

static int
rmc_under_limit(struct rm_class *cl, struct timeval *now)
{
	rm_class_t	*p = cl;
	rm_class_t	*top;
	struct rm_ifdat	*ifd = cl->ifdat_;

	ifd->borrowed_[ifd->qi_] = NULL;
	/*
	 * If cl is the root class, then always return that it is
	 * underlimit.  Otherwise, check to see if the class is underlimit.
	 */
	if (cl->parent_ == NULL)
		return (1);

	if (cl->sleeping_) {
		if (TV_LT(now, &cl->undertime_))
			return (0);

		CALLOUT_STOP(&cl->callout_);
		cl->sleeping_ = 0;
		cl->undertime_.tv_sec = 0;
		return (1);
	}

	top = NULL;
	while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) {
		if (((cl = cl->borrow_) == NULL) ||
		    (cl->depth_ > ifd->cutoff_)) {
#ifdef ADJUST_CUTOFF
			if (cl != NULL)
				/* cutoff is taking effect, just
				   return false without calling
				   the delay action. */
				return (0);
#endif
#ifdef BORROW_OFFTIME
			/*
			 * check if the class can borrow offtime too.
			 * borrow offtime from the top of the borrow
			 * chain if the top class is not overloaded.
			 */
			if (cl != NULL) {
				/* cutoff is taking effect, use this class as top. */
				top = cl;
				CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_);
			}
			if (top != NULL && top->avgidle_ == top->minidle_)
				top = NULL;
			p->overtime_ = *now;
			(p->overlimit)(p, top);
#else
			p->overtime_ = *now;
			(p->overlimit)(p, NULL);
#endif
			return (0);
		}
		top = cl;
	}

	if (cl != p)
		ifd->borrowed_[ifd->qi_] = cl;
	return (1);
}

/*
 * _rmc_wrr_dequeue_next() - This is the scheduler for WRR as opposed to
 *	packet-by-packet round robin.
 *
 * The heart of the weighted round-robin scheduler, which decides which
 * class next gets to send a packet.  Highest priority first, then
 * weighted round-robin within priorities.
 *
 * Each able-to-send class gets to send until its byte allocation is
 * exhausted.  Thus, the active pointer is only changed after a class has
 * exhausted its allocation.
 *
 * If the scheduler finds no class that is underlimit or able to borrow,
 * then the first class found that had a nonzero queue and is allowed to
 * borrow gets to send.
 */

static mbuf_t *
_rmc_wrr_dequeue_next(struct rm_ifdat *ifd, int op)
{
	struct rm_class	*cl = NULL, *first = NULL;
	u_int		 deficit;
	int		 cpri;
	mbuf_t		*m;
	struct timeval	 now;

	RM_GETTIME(now);

	/*
	 * if the driver polls the top of the queue and then removes
	 * the polled packet, we must return the same packet.
	 */
	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
		cl = ifd->pollcache_;
		cpri = cl->pri_;
		if (ifd->efficient_) {
			/* check if this class is overlimit */
			if (cl->undertime_.tv_sec != 0 &&
			    rmc_under_limit(cl, &now) == 0)
				first = cl;
		}
		ifd->pollcache_ = NULL;
		goto _wrr_out;
	} else {
		/* mode == ALTDQ_POLL || pollcache == NULL */
		ifd->pollcache_ = NULL;
		ifd->borrowed_[ifd->qi_] = NULL;
	}
#ifdef ADJUST_CUTOFF
 _again:
#endif
	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
		if (ifd->na_[cpri] == 0)
			continue;
		deficit = 0;
		/*
		 * Loop through twice for a priority level, if some class
		 * was unable to send a packet the first round because
		 * of the weighted round-robin mechanism.
		 * During the second loop at this level, deficit==2.
		 * (This second loop is not needed if for every class,
		 * "M[cl->pri_]" times "cl->allotment" is greater than
		 * the byte size for the largest packet in the class.)
		 */
 _wrr_loop:
		cl = ifd->active_[cpri];
		ASSERT(cl != NULL);
		do {
			if ((deficit < 2) && (cl->bytes_alloc_ <= 0))
				cl->bytes_alloc_ += cl->w_allotment_;
			if (!qempty(cl->q_)) {
				if ((cl->undertime_.tv_sec == 0) ||
				    rmc_under_limit(cl, &now)) {
					if (cl->bytes_alloc_ > 0 || deficit > 1)
						goto _wrr_out;

					/* underlimit but no alloc */
					deficit = 1;
#if 1
					ifd->borrowed_[ifd->qi_] = NULL;
#endif
				}
				else if (first == NULL && cl->borrow_ != NULL)
					first = cl; /* borrowing candidate */
			}

			cl->bytes_alloc_ = 0;
			cl = cl->peer_;
		} while (cl != ifd->active_[cpri]);

		if (deficit == 1) {
			/* first loop found an underlimit class with deficit */
			/* Loop on same priority level, with new deficit.  */
			deficit = 2;
			goto _wrr_loop;
		}
	}

#ifdef ADJUST_CUTOFF
	/*
	 * no underlimit class found.  if cutoff is taking effect,
	 * increase cutoff and try again.
	 */
	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
		ifd->cutoff_++;
		CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_);
		goto _again;
	}
#endif /* ADJUST_CUTOFF */
	/*
	 * If LINK_EFFICIENCY is turned on, then the first overlimit
	 * class we encounter will send a packet if all the classes
	 * of the link-sharing structure are overlimit.
	 */
	reset_cutoff(ifd);
	CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_);

	if (!ifd->efficient_ || first == NULL)
		return (NULL);

	cl = first;
	cpri = cl->pri_;
#if 0	/* too time-consuming for nothing */
	if (cl->sleeping_)
		CALLOUT_STOP(&cl->callout_);
	cl->sleeping_ = 0;
	cl->undertime_.tv_sec = 0;
#endif
	ifd->borrowed_[ifd->qi_] = cl->borrow_;
	ifd->cutoff_ = cl->borrow_->depth_;

	/*
	 * Dequeue the packet and do the bookkeeping...
	 */
 _wrr_out:
	if (op == ALTDQ_REMOVE) {
		m = _rmc_getq(cl);
		if (m == NULL)
			panic("_rmc_wrr_dequeue_next");
		if (qempty(cl->q_))
			ifd->na_[cpri]--;

		/*
		 * Update class statistics and link data.
		 */
		if (cl->bytes_alloc_ > 0)
			cl->bytes_alloc_ -= m_pktlen(m);

		if ((cl->bytes_alloc_ <= 0) || first == cl)
			ifd->active_[cl->pri_] = cl->peer_;
		else
			ifd->active_[cl->pri_] = cl;

		ifd->class_[ifd->qi_] = cl;
		ifd->curlen_[ifd->qi_] = m_pktlen(m);
		ifd->now_[ifd->qi_] = now;
		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
		ifd->queued_++;
	} else {
		/* mode == ALTDQ_POLL */
		m = _rmc_pollq(cl);
		ifd->pollcache_ = cl;
	}
	return (m);
}

/*
 * Dequeue & return next packet from the highest priority class that
 * has a packet to send & has enough allocation to send it.  This
 * routine is called by a driver whenever it needs a new packet to
 * output.
 */
static mbuf_t *
_rmc_prr_dequeue_next(struct rm_ifdat *ifd, int op)
{
	mbuf_t		*m;
	int		 cpri;
	struct rm_class	*cl, *first = NULL;
	struct timeval	 now;

	RM_GETTIME(now);

	/*
	 * if the driver polls the top of the queue and then removes
	 * the polled packet, we must return the same packet.
	 */
	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
		cl = ifd->pollcache_;
		cpri = cl->pri_;
		ifd->pollcache_ = NULL;
		goto _prr_out;
	} else {
		/* mode == ALTDQ_POLL || pollcache == NULL */
		ifd->pollcache_ = NULL;
		ifd->borrowed_[ifd->qi_] = NULL;
	}
#ifdef ADJUST_CUTOFF
 _again:
#endif
	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
		if (ifd->na_[cpri] == 0)
			continue;
		cl = ifd->active_[cpri];
		ASSERT(cl != NULL);
		do {
			if (!qempty(cl->q_)) {
				if ((cl->undertime_.tv_sec == 0) ||
				    rmc_under_limit(cl, &now))
					goto _prr_out;
				if (first == NULL && cl->borrow_ != NULL)
					first = cl;
			}
			cl = cl->peer_;
		} while (cl != ifd->active_[cpri]);
	}

#ifdef ADJUST_CUTOFF
	/*
	 * no underlimit class found.  if cutoff is taking effect, increase
	 * cutoff and try again.
	 */
	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
		ifd->cutoff_++;
		goto _again;
	}
#endif /* ADJUST_CUTOFF */
	/*
	 * If LINK_EFFICIENCY is turned on, then the first overlimit
	 * class we encounter will send a packet if all the classes
	 * of the link-sharing structure are overlimit.
	 */
	reset_cutoff(ifd);
	if (!ifd->efficient_ || first == NULL)
		return (NULL);

	cl = first;
	cpri = cl->pri_;
#if 0	/* too time-consuming for nothing */
	if (cl->sleeping_)
		CALLOUT_STOP(&cl->callout_);
	cl->sleeping_ = 0;
	cl->undertime_.tv_sec = 0;
#endif
	ifd->borrowed_[ifd->qi_] = cl->borrow_;
	ifd->cutoff_ = cl->borrow_->depth_;

	/*
	 * Dequeue the packet and do the bookkeeping...
	 */
 _prr_out:
	if (op == ALTDQ_REMOVE) {
		m = _rmc_getq(cl);
		if (m == NULL)
			panic("_rmc_prr_dequeue_next");
		if (qempty(cl->q_))
			ifd->na_[cpri]--;

		ifd->active_[cpri] = cl->peer_;

		ifd->class_[ifd->qi_] = cl;
		ifd->curlen_[ifd->qi_] = m_pktlen(m);
		ifd->now_[ifd->qi_] = now;
		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
		ifd->queued_++;
	} else {
		/* mode == ALTDQ_POLL */
		m = _rmc_pollq(cl);
		ifd->pollcache_ = cl;
	}
	return (m);
}

/*
 * mbuf_t *
 * rmc_dequeue_next(struct rm_ifdat *ifd, int mode) - this function
 *	is invoked by the packet driver to get the next packet to be
 *	dequeued and output on the link.  If WRR is enabled, then the
 *	WRR dequeue next routine will determine the next packet to be sent.
 *	Otherwise, packet-by-packet round robin is invoked.
 *
 *	Returns:	NULL, if a packet is not available or if all
 *			classes are overlimit.
 *
 *			Otherwise, Pointer to the next packet.
 */

mbuf_t *
rmc_dequeue_next(struct rm_ifdat *ifd, int mode)
{
	if (ifd->queued_ >= ifd->maxqueued_)
		return (NULL);
	else if (ifd->wrr_)
		return (_rmc_wrr_dequeue_next(ifd, mode));
	else
		return (_rmc_prr_dequeue_next(ifd, mode));
}

/*
 * Update the utilization estimate for the packet that just completed.
 * The packet's class & the parent(s) of that class all get their
 * estimators updated.  This routine is called by the driver's output-
 * packet-completion interrupt service routine.
 */

/*
 * a macro to approximate "divide by 1000" that gives 0.000999,
 * if a value has enough effective digits.
 * (on pentium, mul takes 9 cycles but div takes 46!)
 */
#define	NSEC_TO_USEC(t)	(((t) >> 10) + ((t) >> 16) + ((t) >> 17))
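/*
 * Worked check of the approximation (added illustration): 2^-10 +
 * 2^-16 + 2^-17 = 0.00099945..., so for t = 1000000 ns the macro
 * yields (1000000 >> 10) + (1000000 >> 16) + (1000000 >> 17) =
 * 976 + 15 + 7 = 998 usec versus the exact 1000 usec -- an error of
 * about 0.2% (plus truncation) using only shifts and adds.
 */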
void
rmc_update_class_util(struct rm_ifdat *ifd)
{
	int		 idle, avgidle, pktlen;
	int		 pkt_time, tidle;
	rm_class_t	*cl, *borrowed;
	rm_class_t	*borrows;
	struct timeval	*nowp;

	/*
	 * Get the most recent completed class.
	 */
	if ((cl = ifd->class_[ifd->qo_]) == NULL)
		return;

	pktlen = ifd->curlen_[ifd->qo_];
	borrowed = ifd->borrowed_[ifd->qo_];
	borrows = borrowed;

	PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);

	/*
	 * Run estimator on class and its ancestors.
	 */
	/*
	 * rmc_update_class_util is designed to be called when the
	 * transfer is completed from an xmit complete interrupt,
	 * but most drivers don't implement an upcall for that.
	 * so, just use estimated completion time.
	 * as a result, ifd->qi_ and ifd->qo_ are always synced.
	 */
	nowp = &ifd->now_[ifd->qo_];
	/* get pkt_time (for link) in usec */
#if 1  /* use approximation */
	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_;
	pkt_time = NSEC_TO_USEC(pkt_time);
#else
	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000;
#endif
#if 1 /* ALTQ4PPP */
	if (TV_LT(nowp, &ifd->ifnow_)) {
		int iftime;

		/*
		 * make sure the estimated completion time does not go
		 * too far.  it can happen when the link layer supports
		 * data compression or the interface speed is set to
		 * a much lower value.
		 */
		TV_DELTA(&ifd->ifnow_, nowp, iftime);
		if (iftime+pkt_time < ifd->maxiftime_) {
			TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
		} else {
			TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_);
		}
	} else {
		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
	}
#else
	if (TV_LT(nowp, &ifd->ifnow_)) {
		TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
	} else {
		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
	}
#endif

	while (cl != NULL) {
		TV_DELTA(&ifd->ifnow_, &cl->last_, idle);
		if (idle >= 2000000)
			/*
			 * this class is idle enough, reset avgidle.
			 * (TV_DELTA returns 2000000 us when delta is large.)
			 */
			cl->avgidle_ = cl->maxidle_;

		/* get pkt_time (for class) in usec */
#if 1  /* use approximation */
		pkt_time = pktlen * cl->ns_per_byte_;
		pkt_time = NSEC_TO_USEC(pkt_time);
#else
		pkt_time = pktlen * cl->ns_per_byte_ / 1000;
#endif
		idle -= pkt_time;

		avgidle = cl->avgidle_;
		avgidle += idle - (avgidle >> RM_FILTER_GAIN);
		cl->avgidle_ = avgidle;
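
		/*
		 * The update above is the CBQ idle-time EWMA in scaled
		 * form (added note, assuming RM_FILTER_GAIN == 5):
		 * avgidle <- avgidle * (1 - 2^-5) + idle, where avgidle
		 * carries an extra 2^5 scale factor relative to the
		 * true filtered idle time (see the scaling discussion
		 * above rmc_newclass()).  E.g., avgidle = 3200 and
		 * idle = -100 give 3200 - 100 - (3200 >> 5) = 3000.
		 */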

		/* Are we overlimit? */
		if (avgidle <= 0) {
			CBQTRACE(rmc_update_class_util, 'milo', cl->stats_.handle);
#if 1 /* ALTQ */
			/*
			 * need some lower bound for avgidle, otherwise
			 * a borrowing class gets unbounded penalty.
			 */
			if (avgidle < cl->minidle_)
				avgidle = cl->avgidle_ = cl->minidle_;
#endif
			/* set next idle to make avgidle 0 */
			tidle = pkt_time +
				(((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN);
			TV_ADD_DELTA(nowp, tidle, &cl->undertime_);
			++cl->stats_.over;
		} else {
			cl->avgidle_ =
			    (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle;
			cl->undertime_.tv_sec = 0;
			if (cl->sleeping_) {
				CALLOUT_STOP(&cl->callout_);
				cl->sleeping_ = 0;
			}
		}

		if (borrows != NULL) {
			if (borrows != cl)
				++cl->stats_.borrows;
			else
				borrows = NULL;
		}
		cl->last_ = ifd->ifnow_;
		cl->last_pkttime_ = pkt_time;

#if 1
		if (cl->parent_ == NULL) {
			/* take stats of root class */
			PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
		}
#endif

		cl = cl->parent_;
	}

	/*
	 * Check to see if cutoff needs to be set to a new level.
	 */
	cl = ifd->class_[ifd->qo_];
	if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) {
#if 1 /* ALTQ */
		if ((qlen(cl->q_) <= 0) || TV_LT(nowp, &borrowed->undertime_)) {
			rmc_tl_satisfied(ifd, nowp);
			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
		} else {
			ifd->cutoff_ = borrowed->depth_;
			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
		}
#else /* !ALTQ */
		if ((qlen(cl->q_) <= 1) || TV_LT(&now, &borrowed->undertime_)) {
			reset_cutoff(ifd);
#ifdef notdef
			rmc_tl_satisfied(ifd, &now);
#endif
			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
		} else {
			ifd->cutoff_ = borrowed->depth_;
			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
		}
#endif /* !ALTQ */
	}

	/*
	 * Release class slot
	 */
	ifd->borrowed_[ifd->qo_] = NULL;
	ifd->class_[ifd->qo_] = NULL;
	ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_;
	ifd->queued_--;
}

/*
 * void
 * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific)
 *	over-limit action routines.  These get invoked by rmc_under_limit()
 *	if a class with packets to send is over its bandwidth limit & can't
 *	borrow from a parent class.
 *
 *	Returns: NONE
 */

static void
rmc_drop_action(struct rm_class *cl)
{
	struct rm_ifdat	*ifd = cl->ifdat_;

	ASSERT(qlen(cl->q_) > 0);
	_rmc_dropq(cl);
	if (qempty(cl->q_))
		ifd->na_[cl->pri_]--;
}

void
rmc_dropall(struct rm_class *cl)
{
	struct rm_ifdat	*ifd = cl->ifdat_;

	if (!qempty(cl->q_)) {
		_flushq(cl->q_);

		ifd->na_[cl->pri_]--;
	}
}

#if (__FreeBSD_version > 300000)
/* hzto() was removed in FreeBSD-3.0 */
static int hzto(struct timeval *);

static int
hzto(struct timeval *tv)
{
	struct timeval t2;

	getmicrotime(&t2);
	t2.tv_sec = tv->tv_sec - t2.tv_sec;
	t2.tv_usec = tv->tv_usec - t2.tv_usec;
	return (tvtohz(&t2));
}
#endif /* __FreeBSD_version > 300000 */

/*
 * void
 * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ
 *	delay action routine.  It is invoked via rmc_under_limit when the
 *	packet is discovered to be overlimit.
 *
 *	If the delay action is the result of the borrow class being
 *	overlimit, then delay for the offtime of the borrowing class that
 *	is overlimit.
 *
 *	Returns: NONE
 */

void
rmc_delay_action(struct rm_class *cl, struct rm_class *borrow)
{
	int	delay, t, extradelay;

	cl->stats_.overactions++;
	TV_DELTA(&cl->undertime_, &cl->overtime_, delay);
#ifndef BORROW_OFFTIME
	delay += cl->offtime_;
#endif

	if (!cl->sleeping_) {
		CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle);
#ifdef BORROW_OFFTIME
		if (borrow != NULL)
			extradelay = borrow->offtime_;
		else
#endif
			extradelay = cl->offtime_;

#ifdef ALTQ
		/*
		 * XXX recalculate suspend time:
		 * current undertime is (tidle + pkt_time) calculated
		 * from the last transmission.
		 *	tidle: time required to bring avgidle back to 0
		 *	pkt_time: target waiting time for this class
		 * we need to replace pkt_time by offtime
		 */
		extradelay -= cl->last_pkttime_;
#endif
		if (extradelay > 0) {
			TV_ADD_DELTA(&cl->undertime_, extradelay, &cl->undertime_);
			delay += extradelay;
		}

		cl->sleeping_ = 1;
		cl->stats_.delays++;

		/*
		 * Since packets are phased randomly with respect to the
		 * clock, 1 tick (the next clock tick) can be an arbitrarily
		 * short time so we have to wait for at least two ticks.
		 * NOTE:  If there's no other traffic, we need the timer as
		 * a 'backstop' to restart this class.
		 */
		if (delay > tick * 2) {
#ifdef __FreeBSD__
			/* FreeBSD rounds up the tick */
			t = hzto(&cl->undertime_);
#else
			/* other BSDs round down the tick */
			t = hzto(&cl->undertime_) + 1;
#endif
		} else
			t = 2;
		CALLOUT_RESET(&cl->callout_, t,
			      (timeout_t *)rmc_restart, (caddr_t)cl);
	}
}

/*
 * void
 * rmc_restart() - is just a helper routine for rmc_delay_action -- it is
 *	called by the system timer code & is responsible for checking if the
 *	class is still sleeping (it might have been restarted as a side
 *	effect of the queue scan on a packet arrival) and, if so, restarting
 *	output for the class.  Inspecting the class state & restarting output
 *	require locking the class structure.  In general the driver is
 *	responsible for locking but this is the only routine that is not
 *	called directly or indirectly from the interface driver so it has
 *	to know about system locking conventions.  Under BSD, locking is done
 *	by raising IPL to splimp so that's what's implemented here.  On a
 *	different system this would probably need to be changed.
 *
 *	Returns:	NONE
 */

static void
rmc_restart(struct rm_class *cl)
{
	struct rm_ifdat	*ifd = cl->ifdat_;
	int		 s;

#ifdef __NetBSD__
	s = splnet();
#else
	s = splimp();
#endif
	IFQ_LOCK(ifd->ifq_);
	if (cl->sleeping_) {
		cl->sleeping_ = 0;
		cl->undertime_.tv_sec = 0;

		if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) {
			CBQTRACE(rmc_restart, 'trts', cl->stats_.handle);
			(ifd->restart)(ifd->ifq_);
		}
	}
	IFQ_UNLOCK(ifd->ifq_);
	splx(s);
}

/*
 * void
 * rmc_root_overlimit(struct rm_class *cl) - This is the generic overlimit
 *	handling routine for the root class of the link sharing structure.
 *
 *	Returns: NONE
 */

static void
rmc_root_overlimit(struct rm_class *cl, struct rm_class *borrow)
{
	panic("rmc_root_overlimit");
}

/*
 * Packet Queue handling routines.  Eventually, this is intended to
 *	localize the effects on the code of whether queues are RED queues
 *	or droptail queues.
 */

static int
_rmc_addq(rm_class_t *cl, mbuf_t *m)
{
#ifdef ALTQ_RIO
	if (q_is_rio(cl->q_))
		return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_);
#endif
#ifdef ALTQ_RED
	if (q_is_red(cl->q_))
		return red_addq(cl->red_, cl->q_, m, cl->pktattr_);
#endif /* ALTQ_RED */
#ifdef ALTQ_CODEL
	if (q_is_codel(cl->q_))
		return codel_addq(cl->codel_, cl->q_, m);
#endif

	if (cl->flags_ & RMCF_CLEARDSCP)
		write_dsfield(m, cl->pktattr_, 0);

	_addq(cl->q_, m);
	return (0);
}

/* note: _rmc_dropq is not called for red */
static void
_rmc_dropq(rm_class_t *cl)
{
	mbuf_t	*m;

	if ((m = _getq(cl->q_)) != NULL)
		m_freem(m);
}

static mbuf_t *
_rmc_getq(rm_class_t *cl)
{
#ifdef ALTQ_RIO
	if (q_is_rio(cl->q_))
		return rio_getq((rio_t *)cl->red_, cl->q_);
#endif
#ifdef ALTQ_RED
	if (q_is_red(cl->q_))
		return red_getq(cl->red_, cl->q_);
#endif
#ifdef ALTQ_CODEL
	if (q_is_codel(cl->q_))
		return codel_getq(cl->codel_, cl->q_);
#endif
	return _getq(cl->q_);
}

static mbuf_t *
_rmc_pollq(rm_class_t *cl)
{
	return qhead(cl->q_);
}

#ifdef CBQ_TRACE

struct cbqtrace		 cbqtrace_buffer[NCBQTRACE+1];
struct cbqtrace		*cbqtrace_ptr = NULL;
int			 cbqtrace_count;

/*
 * DDB hook to trace cbq events:
 *  the last 1024 events are held in a circular buffer.
 *  use "call cbqtrace_dump(N)" to display 20 events from the Nth event.
 */
void cbqtrace_dump(int);
static char *rmc_funcname(void *);

static struct rmc_funcs {
	void	*func;
	char	*name;
} rmc_funcs[] =
{
	rmc_init,		"rmc_init",
	rmc_queue_packet,	"rmc_queue_packet",
	rmc_under_limit,	"rmc_under_limit",
	rmc_update_class_util,	"rmc_update_class_util",
	rmc_delay_action,	"rmc_delay_action",
	rmc_restart,		"rmc_restart",
	_rmc_wrr_dequeue_next,	"_rmc_wrr_dequeue_next",
	NULL,			NULL
};

static char *rmc_funcname(void *func)
{
	struct rmc_funcs *fp;

	for (fp = rmc_funcs; fp->func != NULL; fp++)
		if (fp->func == func)
			return (fp->name);
	return ("unknown");
}

void cbqtrace_dump(int counter)
{
	int	 i, *p;
	char	*cp;

	counter = counter % NCBQTRACE;
	p = (int *)&cbqtrace_buffer[counter];

	for (i=0; i<20; i++) {
		printf("[0x%x] ", *p++);
		printf("%s: ", rmc_funcname((void *)*p++));
		cp = (char *)p++;
		printf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]);
		printf("%d\n",*p++);

		if (p >= (int *)&cbqtrace_buffer[NCBQTRACE])
			p = (int *)cbqtrace_buffer;
	}
}
#endif /* CBQ_TRACE */
#endif /* ALTQ_CBQ */

#if defined(ALTQ_CBQ) || defined(ALTQ_RED) || defined(ALTQ_RIO) || \
    defined(ALTQ_HFSC) || defined(ALTQ_PRIQ) || defined(ALTQ_CODEL)
#if !defined(__GNUC__) || defined(ALTQ_DEBUG)

void
_addq(class_queue_t *q, mbuf_t *m)
{
	mbuf_t	*m0;

	if ((m0 = qtail(q)) != NULL)
		m->m_nextpkt = m0->m_nextpkt;
	else
		m0 = m;
	m0->m_nextpkt = m;
	qtail(q) = m;
	qlen(q)++;
}
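
/*
 * Layout note (added illustration, not from the original sources):
 * class_queue_t chains packets into a circular singly-linked list
 * through m_nextpkt, with qtail(q) pointing at the most recently
 * added mbuf and the head reachable as qtail(q)->m_nextpkt:
 *
 *	qtail(q) --> mN -> m1 -> m2 -> ... -> mN    (m1 is the head)
 *
 * This is why _addq() above links the new mbuf after the tail, and
 * why _getq() below unlinks the tail's successor: both enqueue and
 * dequeue are O(1) with a single tail pointer.
 */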

mbuf_t *
_getq(class_queue_t *q)
{
	mbuf_t	*m, *m0;

	if ((m = qtail(q)) == NULL)
		return (NULL);
	if ((m0 = m->m_nextpkt) != m)
		m->m_nextpkt = m0->m_nextpkt;
	else {
		ASSERT(qlen(q) == 1);
		qtail(q) = NULL;
	}
	qlen(q)--;
	m0->m_nextpkt = NULL;
	return (m0);
}

/* drop a packet at the tail of the queue */
mbuf_t *
_getq_tail(class_queue_t *q)
{
	mbuf_t	*m, *m0, *prev;

	if ((m = m0 = qtail(q)) == NULL)
		return NULL;
	do {
		prev = m0;
		m0 = m0->m_nextpkt;
	} while (m0 != m);
	prev->m_nextpkt = m->m_nextpkt;
	if (prev == m) {
		ASSERT(qlen(q) == 1);
		qtail(q) = NULL;
	} else
		qtail(q) = prev;
	qlen(q)--;
	m->m_nextpkt = NULL;
	return (m);
}

/* randomly select a packet in the queue */
mbuf_t *
_getq_random(class_queue_t *q)
{
	struct mbuf	*m;
	int		 i, n;

	if ((m = qtail(q)) == NULL)
		return NULL;
	if (m->m_nextpkt == m) {
		ASSERT(qlen(q) == 1);
		qtail(q) = NULL;
	} else {
		struct mbuf *prev = NULL;

		n = arc4random() % qlen(q) + 1;
		for (i = 0; i < n; i++) {
			prev = m;
			m = m->m_nextpkt;
		}
		prev->m_nextpkt = m->m_nextpkt;
		if (m == qtail(q))
			qtail(q) = prev;
	}
	qlen(q)--;
	m->m_nextpkt = NULL;
	return (m);
}

void
_removeq(class_queue_t *q, mbuf_t *m)
{
	mbuf_t	*m0, *prev;

	m0 = qtail(q);
	do {
		prev = m0;
		m0 = m0->m_nextpkt;
	} while (m0 != m);
	prev->m_nextpkt = m->m_nextpkt;
	if (prev == m)
		qtail(q) = NULL;
	else if (qtail(q) == m)
		qtail(q) = prev;
	qlen(q)--;
}

void
_flushq(class_queue_t *q)
{
	mbuf_t *m;

	while ((m = _getq(q)) != NULL)
		m_freem(m);
	ASSERT(qlen(q) == 0);
}

#endif /* !__GNUC__ || ALTQ_DEBUG */
#endif /* ALTQ_CBQ || ALTQ_RED || ALTQ_RIO || ALTQ_HFSC || ALTQ_PRIQ */