/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/bitops.h>
#include <linux/hrtimer.h>

#include <net/netlink.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/system.h>

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing discipline manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in the order and at the
   times determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from the outside.
   - "schedulers", which split all packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on recursively.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some
   sanity checks and the part of the work that is common to all
   qdiscs, and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but this does not mean that the queue is empty; it only means that
   the discipline does not want to send anything at this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not
   the real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP 	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore
   NET_XMIT_POLICED	- dropped by the policer.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---requeue

   requeues a packet that was dequeued once. It is used for non-standard
   or just buggy devices, which can defer output even if dev->tbusy == 0.

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers and counters (except for statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys the resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
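
/*
 * To make the contract above concrete, the sketch below (illustrative
 * only, not compiled; the example_* names are hypothetical) shows the
 * shape of a trivial FIFO-style discipline wired into a Qdisc_ops.
 * Real disciplines (see sch_fifo.c) also supply ->init, ->reset,
 * ->destroy and the dump hooks as needed.
 */
#if 0
static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	__skb_queue_tail(&sch->q, skb);	/* keeps sch->q.qlen valid */
	sch->bstats.bytes += skb->len;
	sch->bstats.packets++;
	return NET_XMIT_SUCCESS;
}

static struct sk_buff *example_dequeue(struct Qdisc *sch)
{
	/* May return NULL even when qlen > 0, e.g. when throttled. */
	return __skb_dequeue(&sch->q);
}

static struct Qdisc_ops example_qdisc_ops = {
	.id		= "example",
	.priv_size	= 0,
	.enqueue	= example_enqueue,
	.dequeue	= example_dequeue,
	/* ->requeue left NULL: register_qdisc() fills in a default */
	.owner		= THIS_MODULE,
};
#endif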

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->requeue == NULL)
		qops->requeue = noop_qdisc_ops.requeue;
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
}

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
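
/*
 * Typical module boilerplate for a discipline using the two functions
 * above (hypothetical names, sketch only); this mirrors what in-tree
 * schedulers such as sch_prio.c do in their module init/exit paths.
 */
#if 0
static int __init example_module_init(void)
{
	return register_qdisc(&example_qdisc_ops);
}

static void __exit example_module_exit(void)
{
	unregister_qdisc(&example_qdisc_ops);
}

module_init(example_module_init);
module_exit(example_module_exit);
MODULE_LICENSE("GPL");
#endif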

/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (the root qdisc, all its children, children of children, etc.)
 */

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (rtattr_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

/* Rate tables are shared between all qdiscs that use an identical
   tc_ratespec; they are reference counted, and all callers run under
   the RTNL semaphore, which serializes access to the list. */

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, RTA_DATA(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
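
/*
 * Sketch of the usual get/put pairing (hypothetical names and attribute
 * layout): a qdisc's ->init/->change takes a reference on a rate table
 * and its ->destroy drops it.  See sch_tbf.c for a real user.
 */
#if 0
static int example_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct example_sched_data *q = qdisc_priv(sch);
	struct tc_ratespec rate;	/* filled in from opt */
	struct rtattr *rtab_attr;	/* 1024-byte table attribute from opt */

	/* ... parse opt into rate and rtab_attr ... */
	q->rtab = qdisc_get_rtab(&rate, rtab_attr);
	if (q->rtab == NULL)
		return -EINVAL;
	return 0;
}

static void example_destroy(struct Qdisc *sch)
{
	struct example_sched_data *q = qdisc_priv(sch);

	qdisc_put_rtab(q->rtab);	/* table is freed on last reference */
}
#endif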

/* The qdisc watchdog lets a throttled qdisc (one that has nothing it is
   willing to send before a given time) rearm itself: when the hrtimer
   fires, TCQ_F_THROTTLED is cleared and the device is kicked. */

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);
	struct net_device *dev = wd->qdisc->dev;

	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
	smp_wmb();
	if (spin_trylock(&dev->queue_lock)) {
		qdisc_run(dev);
		spin_unlock(&dev->queue_lock);
	} else
		netif_schedule(dev);

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
{
	ktime_t time;

	wd->qdisc->flags |= TCQ_F_THROTTLED;
	time = ktime_set(0, 0);
	time = ktime_add_ns(time, PSCHED_US2NS(expires));
	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
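
/*
 * Sketch (hypothetical names) of how a rate-limiting discipline uses
 * the watchdog: schedule it from ->dequeue when the next packet is not
 * yet due, and cancel it from ->reset/->destroy.  sch_tbf.c is a real
 * example of this pattern.
 */
#if 0
static struct sk_buff *example_dequeue(struct Qdisc *sch)
{
	struct example_sched_data *q = qdisc_priv(sch);
	psched_time_t now, next_send;

	PSCHED_GET_TIME(now);
	next_send = example_next_transmit_time(q);	/* hypothetical */
	if (next_send > now) {
		/* Nothing may be sent yet: sleep until next_send. */
		qdisc_watchdog_schedule(&q->watchdog, next_send);
		return NULL;
	}
	return __skb_dequeue(&sch->q);
}
#endif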

/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while	(qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}

/* Attach a top-level qdisc to device dev */

static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc;

	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	qdisc_lock_tree(dev);
	if (qdisc && qdisc->flags & TCQ_F_INGRESS) {
		oqdisc = dev->qdisc_ingress;
		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
			/* delete */
			qdisc_reset(oqdisc);
			dev->qdisc_ingress = NULL;
		} else {  /* new */
			dev->qdisc_ingress = qdisc;
		}

	} else {

		oqdisc = dev->qdisc_sleeping;

		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
			qdisc_reset(oqdisc);

		/* ... and graft new one */
		if (qdisc == NULL)
			qdisc = &noop_qdisc;
		dev->qdisc_sleeping = qdisc;
		dev->qdisc = &noop_qdisc;
	}

	qdisc_unlock_tree(dev);

	if (dev->flags & IFF_UP)
		dev_activate(dev);

	return oqdisc;
}

/* A qdisc that drops packets from inside (e.g. on a change of limits)
   must tell its ancestors, whose q.qlen counts include those packets:
   walk towards the root, notify classful parents so they can react
   (e.g. deactivate a now-empty class), and correct each qlen. */

void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		sch = qdisc_lookup(sch->dev, TC_H_MAJ(parentid));
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
   to device "dev".

   The old qdisc is not destroyed but returned in *old.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       u32 classid,
		       struct Qdisc *new, struct Qdisc **old)
{
	int err = 0;
	struct Qdisc *q = *old;

	if (parent == NULL) {
		if (q && q->flags & TCQ_F_INGRESS) {
			*old = dev_graft_qdisc(dev, q);
		} else {
			*old = dev_graft_qdisc(dev, new);
		}
	} else {
		struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, old);
				if (new)
					new->parent = classid;
				cops->put(parent, cl);
			}
		}
	}
	return err;
}
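
/*
 * Sketch (hypothetical class type) of the ->graft hook that a classful
 * qdisc supplies for the cops->graft call above; this mirrors the
 * pattern used by in-tree schedulers such as sch_prio.c.
 */
#if 0
static int example_graft(struct Qdisc *sch, unsigned long arg,
			 struct Qdisc *new, struct Qdisc **old)
{
	struct example_class *cl = (struct example_class *)arg;

	if (new == NULL)
		new = &noop_qdisc;
	sch_tree_lock(sch);
	*old = cl->q;
	cl->q = new;
	/* The packets of the old child leave the tree with it. */
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);
	return 0;
}
#endif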

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via the rtnetlink attribute array tca.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
{
	int err;
	struct rtattr *kind = tca[TCA_KIND-1];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try qdisc_lookup_ops again,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		sch->stats_lock = &dev->ingress_lock;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else {
		sch->stats_lock = &dev->queue_lock;
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
#ifdef CONFIG_NET_ESTIMATOR
		if (tca[TCA_RATE-1]) {
			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						sch->stats_lock,
						tca[TCA_RATE-1]);
			if (err) {
				/*
				 * Any broken qdiscs that would require
				 * a ops->reset() here? The qdisc was never
				 * in action so it shouldn't be necessary.
				 */
				if (ops->destroy)
					ops->destroy(sch);
				goto err_out3;
			}
		}
#endif
		qdisc_lock_tree(dev);
		list_add_tail(&sch->list, &dev->qdisc_list);
		qdisc_unlock_tree(dev);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}

static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
{
	if (tca[TCA_OPTIONS-1]) {
		int err;

		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
		if (err)
			return err;
	}
#ifdef CONFIG_NET_ESTIMATOR
	if (tca[TCA_RATE-1])
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
			sch->stats_lock, tca[TCA_RATE-1]);
#endif
	return 0;
}

struct check_loop_arg
{
	struct qdisc_walker 	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

/* Walk the class tree below q and fail with -ELOOP if the qdisc p is
   found there (or if the tree is deeper than 7 levels): grafting q
   under p would then create a cycle. */

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}
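
/*
 * Illustrative scenario (hypothetical handles): if qdisc 2: is attached
 * under class 1:1 of qdisc 1:, a request to graft qdisc 1: under class
 * 2:1 is rejected, because check_loop(1:, 2:, 0) finds 2: in the
 * subtree of 1: and returns -ELOOP.
 */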

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
			return err;
		if (q) {
			qdisc_notify(skb, n, clid, q, NULL);
			qdisc_lock_tree(dev);
			qdisc_destroy(q);
			qdisc_unlock_tree(dev);
		}
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}

/*
   Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm;
	struct rtattr **tca;
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	tca = arg;
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}

		/* It may be the default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child qdisc q is already
				 *   attached to this parent and we have a choice:
				 *   either to change it or to create/graft a new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if both the CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requestor wanted to say
				 *   that the qdisc tcm_handle is not expected
				 *   to exist, so we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, this is sort of a hole in the API; we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND which does not match the
				 *   existing one.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND-1] &&
				      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
	else
		q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	if (1) {
		struct Qdisc *old_q = NULL;
		err = qdisc_graft(dev, p, clid, q, &old_q);
		if (err) {
			if (q) {
				qdisc_lock_tree(dev);
				qdisc_destroy(q);
				qdisc_unlock_tree(dev);
			}
			return err;
		}
		qdisc_notify(skb, n, clid, old_q, q);
		if (old_q) {
			qdisc_lock_tree(dev);
			qdisc_destroy(old_q);
			qdisc_unlock_tree(dev);
		}
	}
	return 0;
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto rtattr_failure;
	q->qstats.qlen = q->q.qlen;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
#ifdef CONFIG_NET_ESTIMATOR
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
#endif
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	struct Qdisc *q;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	idx = 0;
	for_each_netdev(dev) {
		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;
		list_for_each_entry(q, &dev->qdisc_list, list) {
			if (q_idx < s_q_idx) {
				q_idx++;
				continue;
			}
			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
				goto done;
			q_idx++;
		}
cont:
		idx++;
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
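
/* Note on tc_dump_qdisc() above: cb->args[0] records how many devices
   have been fully dumped and cb->args[1] the index of the next qdisc
   on the current device, so a subsequent dump callback can resume
   where the previous skb ran out of room. */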



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/



static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	struct Qdisc *q = NULL;
	struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - fully specified.
	   handle == X:0	 - root class.
	 */
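	/*
	 * Illustrative example: "tc class add dev eth0 parent 1:1
	 * classid 1:10 ..." reaches this function with
	 * pid  == tcm_parent == 0x00010001 (1:1) and
	 * clid == tcm_handle == 0x00010010 (1:10), so
	 * qid == TC_H_MAJ(clid) == 0x00010000, i.e. qdisc 1:0.
	 */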

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc_sleeping->handle;

		/* Now qid is a genuine qdisc handle consistent with
		   both parent and child.

		   TC_H_MAJ(pid) may still be unspecified, complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc_sleeping->handle;
	}

	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto rtattr_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			  struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}

struct qdisc_dump_args
{
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	int t;
	int s_t;
	struct net_device *dev;
	struct Qdisc *q;
	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
	struct qdisc_dump_args arg;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (t < s_t || !q->ops->cl_ops ||
		    (tcm->tcm_parent &&
		     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
			t++;
			continue;
		}
		if (t > s_t)
			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
		arg.w.fn = qdisc_class_dump;
		arg.skb = skb;
		arg.cb = cb;
		arg.w.stop  = 0;
		arg.w.skip = cb->args[1];
		arg.w.count = 0;
		q->ops->cl_ops->walk(q, &arg.w);
		cb->args[1] = arg.w.count;
		if (arg.w.stop)
			break;
		t++;
	}

	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

/* Main classifier routine: scans the classifier chain attached
   to this qdisc, (optionally) tests for protocol, and asks
   specific classifiers.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
	struct tcf_result *res)
{
	int err = 0;
	__be16 protocol = skb->protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	for ( ; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
			tp->protocol == htons(ETH_P_ALL)) &&
			(err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (err == TC_ACT_RECLASSIFY) {
				__u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
				tp = otp;

				if (MAX_REC_LOOP < verd++) {
					printk("rule prio %d protocol %02x reclassify is buggy, packet dropped\n",
						tp->prio & 0xffff, ntohs(tp->protocol));
					return TC_ACT_SHOT;
				}
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
				goto reclassify;
			} else {
				if (skb->tc_verd)
					skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
				return err;
			}
#else
			return err;
#endif
		}
	}
	return -1;
}
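
/*
 * Sketch (hypothetical names) of how a classful qdisc's ->enqueue
 * typically drives tc_classify to select a class; compare sch_prio.c
 * or sch_htb.c for real callers.
 */
#if 0
static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct example_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;

	if (tc_classify(skb, q->filter_list, &res) >= 0) {
		/* res.classid / res.class identify the target class. */
		return example_enqueue_to_class(sch, skb, res.classid);
	}
	/* No filter matched: fall back to a default class, or drop. */
	kfree_skb(skb);
	sch->qstats.drops++;
	return NET_XMIT_DROP;
}
#endif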

/* Destroy a single classifier and drop the module reference taken when
   it was created. */
void tcf_destroy(struct tcf_proto *tp)
{
	tp->ops->destroy(tp);
	module_put(tp->ops->owner);
	kfree(tp);
}

/* Destroy a whole chain of classifiers, head first. */
void tcf_destroy_chain(struct tcf_proto *fl)
{
	struct tcf_proto *tp;

	while ((tp = fl) != NULL) {
		fl = tp->next;
		tcf_destroy(tp);
	}
}
EXPORT_SYMBOL(tcf_destroy_chain);

#ifdef CONFIG_PROC_FS
/* /proc/net/psched exports the scheduler clock parameters (in the
   traditional four-word format) from which userspace tc derives its
   time conversion factors. */
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(KTIME_MONOTONIC_RES));

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}

static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
#endif
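
/*
 * Example output (hypothetical, assuming 1 ns hrtimer resolution; all
 * four fields are hexadecimal):
 *
 *   $ cat /proc/net/psched
 *   000003e8 000003e8 000f4240 3b9aca00
 */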

static int __init pktsched_init(void)
{
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create("psched", 0, &psched_fops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

	return 0;
}

subsys_initcall(pktsched_init);

EXPORT_SYMBOL(qdisc_get_rtab);
EXPORT_SYMBOL(qdisc_put_rtab);
EXPORT_SYMBOL(register_qdisc);
EXPORT_SYMBOL(unregister_qdisc);
EXPORT_SYMBOL(tc_classify);