dn_sched_fq_codel.c revision 301772
1/*
2 * FQ_Codel - The FlowQueue-Codel scheduler/AQM
3 *
4 * $FreeBSD: stable/10/sys/netpfil/ipfw/dn_sched_fq_codel.c 301772 2016-06-10 00:00:25Z truckman $
5 *
6 * Copyright (C) 2016 Centre for Advanced Internet Architectures,
7 *  Swinburne University of Technology, Melbourne, Australia.
8 * Portions of this code were made possible in part by a gift from
9 *  The Comcast Innovation Fund.
10 * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 *    notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 *    notice, this list of conditions and the following disclaimer in the
19 *    documentation and/or other materials provided with the distribution.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 */
33
34#ifdef _KERNEL
35#include <sys/malloc.h>
36#include <sys/socket.h>
37//#include <sys/socketvar.h>
38#include <sys/kernel.h>
39#include <sys/mbuf.h>
40#include <sys/module.h>
41#include <net/if.h>	/* IFNAMSIZ */
42#include <netinet/in.h>
43#include <netinet/ip_var.h>		/* ipfw_rule_ref */
44#include <netinet/ip_fw.h>	/* flow_id */
45#include <netinet/ip_dummynet.h>
46
47#include <sys/proc.h>
48#include <sys/rwlock.h>
49
50#include <netpfil/ipfw/ip_fw_private.h>
51#include <sys/sysctl.h>
52#include <netinet/ip.h>
53#include <netinet/ip6.h>
54#include <netinet/ip_icmp.h>
55#include <netinet/tcp.h>
56#include <netinet/udp.h>
57#include <sys/queue.h>
58#include <sys/hash.h>
59
60#include <netpfil/ipfw/dn_heap.h>
61#include <netpfil/ipfw/ip_dn_private.h>
62
63#include <netpfil/ipfw/dn_aqm.h>
64#include <netpfil/ipfw/dn_aqm_codel.h>
65#include <netpfil/ipfw/dn_sched.h>
66#include <netpfil/ipfw/dn_sched_fq_codel.h>
67#include <netpfil/ipfw/dn_sched_fq_codel_helper.h>
68
69#else
70#include <dn_test.h>
71#endif
72
73/* NOTE: In fq_codel module, we reimplements CoDel AQM functions
74 * because fq_codel use different flows (sub-queues) structure and
75 * dn_queue includes many variables not needed by a flow (sub-queue
76 * )i.e. avoid extra overhead (88 bytes vs 208 bytes).
77 * Also, CoDel functions manages stats of sub-queues as well as the main queue.
78 */
79
80#define DN_SCHED_FQ_CODEL 6
81
82static struct dn_alg fq_codel_desc;
83
84/* fq_codel default parameters including codel */
85struct dn_sch_fq_codel_parms
86fq_codel_sysctl = {{5000 * AQM_TIME_1US, 100000 * AQM_TIME_1US,
87	CODEL_ECN_ENABLED}, 1024, 10240, 1514};
88
89static int
90fqcodel_sysctl_interval_handler(SYSCTL_HANDLER_ARGS)
91{
92	int error;
93	long  value;
94
95	value = fq_codel_sysctl.ccfg.interval;
96	value /= AQM_TIME_1US;
97	error = sysctl_handle_long(oidp, &value, 0, req);
98	if (error != 0 || req->newptr == NULL)
99		return (error);
100	if (value < 1 || value > 100 * AQM_TIME_1S)
101		return (EINVAL);
102	fq_codel_sysctl.ccfg.interval = value * AQM_TIME_1US ;
103
104	return (0);
105}
106
107static int
108fqcodel_sysctl_target_handler(SYSCTL_HANDLER_ARGS)
109{
110	int error;
111	long  value;
112
113	value = fq_codel_sysctl.ccfg.target;
114	value /= AQM_TIME_1US;
115	error = sysctl_handle_long(oidp, &value, 0, req);
116	if (error != 0 || req->newptr == NULL)
117		return (error);
118	if (value < 1 || value > 5 * AQM_TIME_1S)
119		return (EINVAL);
120	fq_codel_sysctl.ccfg.target = value * AQM_TIME_1US ;
121
122	return (0);
123}
124
125
126SYSBEGIN(f4)
127
128SYSCTL_DECL(_net_inet);
129SYSCTL_DECL(_net_inet_ip);
130SYSCTL_DECL(_net_inet_ip_dummynet);
131static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO, fqcodel,
132	CTLFLAG_RW, 0, "FQ_CODEL");
133
134#ifdef SYSCTL_NODE
135
136SYSCTL_PROC(_net_inet_ip_dummynet_fqcodel, OID_AUTO, target,
137	CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, fqcodel_sysctl_target_handler, "L",
138	"FQ_CoDel target in microsecond");
139SYSCTL_PROC(_net_inet_ip_dummynet_fqcodel, OID_AUTO, interval,
140	CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, fqcodel_sysctl_interval_handler, "L",
141	"FQ_CoDel interval in microsecond");
142
143SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, quantum,
144	CTLFLAG_RW, &fq_codel_sysctl.quantum, 1514, "FQ_CoDel quantum");
145SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, flows,
146	CTLFLAG_RW, &fq_codel_sysctl.flows_cnt, 1024,
147	"Number of queues for FQ_CoDel");
148SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, limit,
149	CTLFLAG_RW, &fq_codel_sysctl.limit, 10240, "FQ_CoDel queues size limit");
150#endif
151
152/* Drop a packet form the head of codel queue */
153static void
154codel_drop_head(struct fq_codel_flow *q, struct fq_codel_si *si)
155{
156	struct mbuf *m = q->mq.head;
157
158	if (m == NULL)
159		return;
160	q->mq.head = m->m_nextpkt;
161
162	fq_update_stats(q, si, -m->m_pkthdr.len, 1);
163
164	if (si->main_q.ni.length == 0) /* queue is now idle */
165			si->main_q.q_time = dn_cfg.curr_time;
166
167	FREE_PKT(m);
168}
169
170/* Enqueue a packet 'm' to a queue 'q' and add timestamp to that packet.
171 * Return 1 when unable to add timestamp, otherwise return 0
172 */
173static int
174codel_enqueue(struct fq_codel_flow *q, struct mbuf *m, struct fq_codel_si *si)
175{
176	uint64_t len;
177
178	len = m->m_pkthdr.len;
179	/* finding maximum packet size */
180	if (len > q->cst.maxpkt_size)
181		q->cst.maxpkt_size = len;
182
183	/* Add timestamp to mbuf as MTAG */
184	struct m_tag *mtag;
185	mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL);
186	if (mtag == NULL)
187		mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, sizeof(aqm_time_t),
188			M_NOWAIT);
189	if (mtag == NULL) {
190		m_freem(m);
191		goto drop;
192	}
193	*(aqm_time_t *)(mtag + 1) = AQM_UNOW;
194	m_tag_prepend(m, mtag);
195
196	mq_append(&q->mq, m);
197	fq_update_stats(q, si, len, 0);
198	return 0;
199
200drop:
201	fq_update_stats(q, si, len, 1);
202	m_freem(m);
203	return 1;
204}
205
206/*
207 * Classify a packet to queue number using Jenkins hash function.
208 * Return: queue number
209 * the input of the hash are protocol no, perturbation, src IP, dst IP,
210 * src port, dst port,
211 */
212static inline int
213fq_codel_classify_flow(struct mbuf *m, uint16_t fcount, struct fq_codel_si *si)
214{
215	struct ip *ip;
216	struct tcphdr *th;
217	struct udphdr *uh;
218	uint8_t tuple[41];
219	uint16_t hash=0;
220
221//#ifdef INET6
222	struct ip6_hdr *ip6;
223	int isip6;
224	isip6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
225
226	if(isip6) {
227		ip6 = mtod(m, struct ip6_hdr *);
228		*((uint8_t *) &tuple[0]) = ip6->ip6_nxt;
229		*((uint32_t *) &tuple[1]) = si->perturbation;
230		memcpy(&tuple[5], ip6->ip6_src.s6_addr, 16);
231		memcpy(&tuple[21], ip6->ip6_dst.s6_addr, 16);
232
233		switch (ip6->ip6_nxt) {
234		case IPPROTO_TCP:
235			th = (struct tcphdr *)(ip6 + 1);
236			*((uint16_t *) &tuple[37]) = th->th_dport;
237			*((uint16_t *) &tuple[39]) = th->th_sport;
238			break;
239
240		case IPPROTO_UDP:
241			uh = (struct udphdr *)(ip6 + 1);
242			*((uint16_t *) &tuple[37]) = uh->uh_dport;
243			*((uint16_t *) &tuple[39]) = uh->uh_sport;
244			break;
245		default:
246			memset(&tuple[37], 0, 4);
247
248		}
249
250		hash = jenkins_hash(tuple, 41, HASHINIT) %  fcount;
251		return hash;
252	}
253//#endif
254
255	/* IPv4 */
256	ip = mtod(m, struct ip *);
257	*((uint8_t *) &tuple[0]) = ip->ip_p;
258	*((uint32_t *) &tuple[1]) = si->perturbation;
259	*((uint32_t *) &tuple[5]) = ip->ip_src.s_addr;
260	*((uint32_t *) &tuple[9]) = ip->ip_dst.s_addr;
261
262	switch (ip->ip_p) {
263		case IPPROTO_TCP:
264			th = (struct tcphdr *)(ip + 1);
265			*((uint16_t *) &tuple[13]) = th->th_dport;
266			*((uint16_t *) &tuple[15]) = th->th_sport;
267			break;
268
269		case IPPROTO_UDP:
270			uh = (struct udphdr *)(ip + 1);
271			*((uint16_t *) &tuple[13]) = uh->uh_dport;
272			*((uint16_t *) &tuple[15]) = uh->uh_sport;
273			break;
274		default:
275			memset(&tuple[13], 0, 4);
276
277	}
278	hash = jenkins_hash(tuple, 17, HASHINIT) %  fcount;
279
280	return hash;
281}
282
283/*
284 * Enqueue a packet into an appropriate queue according to
285 * FQ_CODEL algorithm.
286 */
287static int
288fq_codel_enqueue(struct dn_sch_inst *_si, struct dn_queue *_q,
289	struct mbuf *m)
290{
291	struct fq_codel_si *si;
292	struct fq_codel_schk *schk;
293	struct dn_sch_fq_codel_parms *param;
294	struct dn_queue *mainq;
295	int idx, drop, i, maxidx;
296
297	mainq = (struct dn_queue *)(_si + 1);
298	si = (struct fq_codel_si *)_si;
299	schk = (struct fq_codel_schk *)(si->_si.sched+1);
300	param = &schk->cfg;
301
302	 /* classify a packet to queue number*/
303	idx = fq_codel_classify_flow(m, param->flows_cnt, si);
304	/* enqueue packet into appropriate queue using CoDel AQM.
305	 * Note: 'codel_enqueue' function returns 1 only when it unable to
306	 * add timestamp to packet (no limit check)*/
307	drop = codel_enqueue(&si->flows[idx], m, si);
308
309	/* codel unable to timestamp a packet */
310	if (drop)
311		return 1;
312
313	/* If the flow (sub-queue) is not active ,then add it to the tail of
314	 * new flows list, initialize and activate it.
315	 */
316	if (!si->flows[idx].active ) {
317		STAILQ_INSERT_TAIL(&si->newflows, &si->flows[idx], flowchain);
318		si->flows[idx].deficit = param->quantum;
319		si->flows[idx].cst.dropping = false;
320		si->flows[idx].cst.first_above_time = 0;
321		si->flows[idx].active = 1;
322		//D("activate %d",idx);
323	}
324
325	/* check the limit for all queues and remove a packet from the
326	 * largest one
327	 */
328	if (mainq->ni.length > schk->cfg.limit) { D("over limit");
329		/* find first active flow */
330		for (maxidx = 0; maxidx < schk->cfg.flows_cnt; maxidx++)
331			if (si->flows[maxidx].active)
332				break;
333		if (maxidx < schk->cfg.flows_cnt) {
334			/* find the largest sub- queue */
335			for (i = maxidx + 1; i < schk->cfg.flows_cnt; i++)
336				if (si->flows[i].active && si->flows[i].stats.length >
337					si->flows[maxidx].stats.length)
338					maxidx = i;
339			codel_drop_head(&si->flows[maxidx], si);
340			D("maxidx = %d",maxidx);
341			drop = 1;
342		}
343	}
344
345	return drop;
346}
347
348/*
349 * Dequeue a packet from an appropriate queue according to
350 * FQ_CODEL algorithm.
351 */
352static struct mbuf *
353fq_codel_dequeue(struct dn_sch_inst *_si)
354{
355	struct fq_codel_si *si;
356	struct fq_codel_schk *schk;
357	struct dn_sch_fq_codel_parms *param;
358	struct fq_codel_flow *f;
359	struct mbuf *mbuf;
360	struct fq_codel_list *fq_codel_flowlist;
361
362	si = (struct fq_codel_si *)_si;
363	schk = (struct fq_codel_schk *)(si->_si.sched+1);
364	param = &schk->cfg;
365
366	do {
367		/* select a list to start with */
368		if (STAILQ_EMPTY(&si->newflows))
369			fq_codel_flowlist = &si->oldflows;
370		else
371			fq_codel_flowlist = &si->newflows;
372
373		/* Both new and old queue lists are empty, return NULL */
374		if (STAILQ_EMPTY(fq_codel_flowlist))
375			return NULL;
376
377		f = STAILQ_FIRST(fq_codel_flowlist);
378		while (f != NULL)	{
379			/* if there is no flow(sub-queue) deficit, increase deficit
380			 * by quantum, move the flow to the tail of old flows list
381			 * and try another flow.
382			 * Otherwise, the flow will be used for dequeue.
383			 */
384			if (f->deficit < 0) {
385				 f->deficit += param->quantum;
386				 STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain);
387				 STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain);
388			 } else
389				 break;
390
391			f = STAILQ_FIRST(fq_codel_flowlist);
392		}
393
394		/* the new flows list is empty, try old flows list */
395		if (STAILQ_EMPTY(fq_codel_flowlist))
396			continue;
397
398		/* Dequeue a packet from the selected flow */
399		mbuf = fqc_codel_dequeue(f, si);
400
401		/* Codel did not return a packet */
402		if (!mbuf) {
403			/* If the selected flow belongs to new flows list, then move
404			 * it to the tail of old flows list. Otherwise, deactivate it and
405			 * remove it from the old list and
406			 */
407			if (fq_codel_flowlist == &si->newflows) {
408				STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain);
409				STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain);
410			}	else {
411				f->active = 0;
412				STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain);
413			}
414			/* start again */
415			continue;
416		}
417
418		/* we have a packet to return,
419		 * update flow deficit and return the packet*/
420		f->deficit -= mbuf->m_pkthdr.len;
421		return mbuf;
422
423	} while (1);
424
425	/* unreachable point */
426	return NULL;
427}
428
429/*
430 * Initialize fq_codel scheduler instance.
431 * also, allocate memory for flows array.
432 */
433static int
434fq_codel_new_sched(struct dn_sch_inst *_si)
435{
436	struct fq_codel_si *si;
437	struct dn_queue *q;
438	struct fq_codel_schk *schk;
439	int i;
440
441	si = (struct fq_codel_si *)_si;
442	schk = (struct fq_codel_schk *)(_si->sched+1);
443
444	if(si->flows) {
445		D("si already configured!");
446		return 0;
447	}
448
449	/* init the main queue */
450	q = &si->main_q;
451	set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q));
452	q->_si = _si;
453	q->fs = _si->sched->fs;
454
455	/* allocate memory for flows array */
456	si->flows = malloc(schk->cfg.flows_cnt * sizeof(struct fq_codel_flow),
457		 M_DUMMYNET, M_NOWAIT | M_ZERO);
458	if (si->flows == NULL) {
459		D("cannot allocate memory for fq_codel configuration parameters");
460		return ENOMEM ;
461	}
462
463	/* init perturbation for this si */
464	si->perturbation = random();
465
466	/* init the old and new flows lists */
467	STAILQ_INIT(&si->newflows);
468	STAILQ_INIT(&si->oldflows);
469
470	/* init the flows (sub-queues) */
471	for (i = 0; i < schk->cfg.flows_cnt; i++) {
472		/* init codel */
473		si->flows[i].cst.maxpkt_size = 500;
474	}
475
476	fq_codel_desc.ref_count++;
477	return 0;
478}
479
480/*
481 * Free fq_codel scheduler instance.
482 */
483static int
484fq_codel_free_sched(struct dn_sch_inst *_si)
485{
486	struct fq_codel_si *si = (struct fq_codel_si *)_si ;
487
488	/* free the flows array */
489	free(si->flows , M_DUMMYNET);
490	si->flows = NULL;
491	fq_codel_desc.ref_count--;
492
493	return 0;
494}
495
496/*
497 * Configure fq_codel scheduler.
498 * the configurations for the scheduler is passed from userland.
499 */
500static int
501fq_codel_config(struct dn_schk *_schk)
502{
503	struct fq_codel_schk *schk;
504	struct dn_extra_parms *ep;
505	struct dn_sch_fq_codel_parms *fqc_cfg;
506
507	schk = (struct fq_codel_schk *)(_schk+1);
508	ep = (struct dn_extra_parms *) _schk->cfg;
509
510	/* par array contains fq_codel configuration as follow
511	 * Codel: 0- target,1- interval, 2- flags
512	 * FQ_CODEL: 3- quantum, 4- limit, 5- flows
513	 */
514	if (ep && ep->oid.len ==sizeof(*ep) &&
515		ep->oid.subtype == DN_SCH_PARAMS) {
516
517		fqc_cfg = &schk->cfg;
518		if (ep->par[0] < 0)
519			fqc_cfg->ccfg.target = fq_codel_sysctl.ccfg.target;
520		else
521			fqc_cfg->ccfg.target = ep->par[0] * AQM_TIME_1US;
522
523		if (ep->par[1] < 0)
524			fqc_cfg->ccfg.interval = fq_codel_sysctl.ccfg.interval;
525		else
526			fqc_cfg->ccfg.interval = ep->par[1] * AQM_TIME_1US;
527
528		if (ep->par[2] < 0)
529			fqc_cfg->ccfg.flags = 0;
530		else
531			fqc_cfg->ccfg.flags = ep->par[2];
532
533		/* FQ configurations */
534		if (ep->par[3] < 0)
535			fqc_cfg->quantum = fq_codel_sysctl.quantum;
536		else
537			fqc_cfg->quantum = ep->par[3];
538
539		if (ep->par[4] < 0)
540			fqc_cfg->limit = fq_codel_sysctl.limit;
541		else
542			fqc_cfg->limit = ep->par[4];
543
544		if (ep->par[5] < 0)
545			fqc_cfg->flows_cnt = fq_codel_sysctl.flows_cnt;
546		else
547			fqc_cfg->flows_cnt = ep->par[5];
548
549		/* Bound the configurations */
550		fqc_cfg->ccfg.target = BOUND_VAR(fqc_cfg->ccfg.target, 1 ,
551			5 * AQM_TIME_1S); ;
552		fqc_cfg->ccfg.interval = BOUND_VAR(fqc_cfg->ccfg.interval, 1,
553			100 * AQM_TIME_1S);
554
555		fqc_cfg->quantum = BOUND_VAR(fqc_cfg->quantum,1, 9000);
556		fqc_cfg->limit= BOUND_VAR(fqc_cfg->limit,1,20480);
557		fqc_cfg->flows_cnt= BOUND_VAR(fqc_cfg->flows_cnt,1,65536);
558	}
559	else
560		return 1;
561
562	return 0;
563}
564
565/*
566 * Return fq_codel scheduler configurations
567 * the configurations for the scheduler is passed to userland.
568 */
569static int
570fq_codel_getconfig (struct dn_schk *_schk, struct dn_extra_parms *ep) {
571
572	struct fq_codel_schk *schk = (struct fq_codel_schk *)(_schk+1);
573	struct dn_sch_fq_codel_parms *fqc_cfg;
574
575	fqc_cfg = &schk->cfg;
576
577	strcpy(ep->name, fq_codel_desc.name);
578	ep->par[0] = fqc_cfg->ccfg.target / AQM_TIME_1US;
579	ep->par[1] = fqc_cfg->ccfg.interval / AQM_TIME_1US;
580	ep->par[2] = fqc_cfg->ccfg.flags;
581
582	ep->par[3] = fqc_cfg->quantum;
583	ep->par[4] = fqc_cfg->limit;
584	ep->par[5] = fqc_cfg->flows_cnt;
585
586	return 0;
587}
588
589/*
590 * fq_codel scheduler descriptor
591 * contains the type of the scheduler, the name, the size of extra
592 * data structures, and function pointers.
593 */
594static struct dn_alg fq_codel_desc = {
595	_SI( .type = )  DN_SCHED_FQ_CODEL,
596	_SI( .name = ) "FQ_CODEL",
597	_SI( .flags = ) 0,
598
599	_SI( .schk_datalen = ) sizeof(struct fq_codel_schk),
600	_SI( .si_datalen = ) sizeof(struct fq_codel_si) - sizeof(struct dn_sch_inst),
601	_SI( .q_datalen = ) 0,
602
603	_SI( .enqueue = ) fq_codel_enqueue,
604	_SI( .dequeue = ) fq_codel_dequeue,
605	_SI( .config = ) fq_codel_config, /* new sched i.e. sched X config ...*/
606	_SI( .destroy = ) NULL,  /*sched x delete */
607	_SI( .new_sched = ) fq_codel_new_sched, /* new schd instance */
608	_SI( .free_sched = ) fq_codel_free_sched,	/* delete schd instance */
609	_SI( .new_fsk = ) NULL,
610	_SI( .free_fsk = ) NULL,
611	_SI( .new_queue = ) NULL,
612	_SI( .free_queue = ) NULL,
613	_SI( .getconfig = )  fq_codel_getconfig,
614	_SI( .ref_count = ) 0
615};
616
617DECLARE_DNSCHED_MODULE(dn_fq_codel, &fq_codel_desc);
618