/*-
 * Copyright (c) 2014 Gleb Smirnoff <glebius@FreeBSD.org>
 * Copyright (c) 2008-2010, BitGravity Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  1. Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *  2. Neither the name of the BitGravity Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived from
 *     this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "opt_route.h"
#include "opt_mpath.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/types.h>
#include <sys/bitstring.h>
#include <sys/condvar.h>
#include <sys/callout.h>
#include <sys/hash.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <vm/uma.h>

#include <net/if.h>
#include <net/if_llatbl.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/flowtable.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#ifdef INET6
#include <netinet/ip6.h>
#endif
#ifdef FLOWTABLE_HASH_ALL
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/sctp.h>
#endif

#include <ddb/ddb.h>

#ifdef	FLOWTABLE_HASH_ALL
#define	KEY_PORTS	(sizeof(uint16_t) * 2)
#define	KEY_ADDRS	2
#else
#define	KEY_PORTS	0
#define	KEY_ADDRS	1
#endif

#ifdef	INET6
#define	KEY_ADDR_LEN	sizeof(struct in6_addr)
#else
#define	KEY_ADDR_LEN	sizeof(struct in_addr)
#endif

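/*
 * Key length in 32-bit words.  With FLOWTABLE_HASH_ALL and INET6 this
 * is two 16-byte addresses plus two 16-bit ports: (16 * 2 + 4) / 4 = 9
 * words, which matches the key[9] buffers built in the per-protocol
 * lookup routines below.
 */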
#define	KEYLEN	((KEY_ADDR_LEN * KEY_ADDRS + KEY_PORTS) / sizeof(uint32_t))

struct flentry {
	uint32_t		f_hash;		/* hash flowing forward */
	uint32_t		f_key[KEYLEN];	/* address(es) and ports */
	uint32_t		f_uptime;	/* uptime at last access */
	uint16_t		f_fibnum;	/* fib index */
#ifdef FLOWTABLE_HASH_ALL
	uint8_t			f_proto;	/* protocol */
	uint8_t			f_flags;	/* stale? */
#define FL_STALE 		1
#endif
	SLIST_ENTRY(flentry)	f_next;		/* pointer to collision entry */
	struct rtentry		*f_rt;		/* rtentry for flow */
	struct llentry		*f_lle;		/* llentry for flow */
};
#undef KEYLEN

SLIST_HEAD(flist, flentry);
/* Make sure we can use pcpu_zone_ptr for struct flist. */
CTASSERT(sizeof(struct flist) == sizeof(void *));

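/*
 * The table is lockless: each CPU has its own view of every hash bucket
 * and its own bitmask of non-empty buckets.  Readers and writers pin
 * themselves to a CPU with critical_enter() and then operate only on
 * that CPU's lists, so no cross-CPU synchronization is needed.
 */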
struct flowtable {
	counter_u64_t	*ft_stat;
	int 		ft_size;
	/*
	 * ft_table is a malloc(9)ed array of pointers.  Pointers point to
	 * memory from UMA_ZONE_PCPU zone.
	 * ft_masks is itself a per-cpu pointer.  Each instance points
	 * to a malloc(9)ed bitset that is private to the corresponding CPU.
	 */
	struct flist	**ft_table;
	bitstr_t 	**ft_masks;
	bitstr_t	*ft_tmpmask;
};

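/*
 * ft_stat points to an array of per-cpu counters with the same layout
 * as struct flowtable_stat, so a field's byte offset divided by the
 * size of one counter yields its index into the array.
 */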
#define	FLOWSTAT_ADD(ft, name, v)	\
	counter_u64_add((ft)->ft_stat[offsetof(struct flowtable_stat, name) / sizeof(uint64_t)], (v))
#define	FLOWSTAT_INC(ft, name)	FLOWSTAT_ADD(ft, name, 1)

static struct proc *flowcleanerproc;
static uint32_t flow_hashjitter;

static struct cv 	flowclean_f_cv;
static struct cv 	flowclean_c_cv;
static struct mtx	flowclean_lock;
static uint32_t		flowclean_cycles;

/*
 * TODO:
 * - add sysctls to resize && flush flow tables
 * - Add per flowtable sysctls for statistics and configuring timeouts
 * - add saturation counter to rtentry to support per-packet load-balancing
 *   add flag to indicate round-robin flow, add list lookup from head
 *   for flows
 * - add sysctl / device node / syscall to support exporting and importing
 *   of flows with flag to indicate that a flow was imported so should
 *   not be considered for auto-cleaning
 * - support explicit connection state (currently only ad-hoc for DSR)
 * - idetach() cleanup for options VIMAGE builds.
 */
#ifdef INET
static VNET_DEFINE(struct flowtable, ip4_ft);
#define	V_ip4_ft	VNET(ip4_ft)
#endif
#ifdef INET6
static VNET_DEFINE(struct flowtable, ip6_ft);
#define	V_ip6_ft	VNET(ip6_ft)
#endif

static uma_zone_t flow_zone;

static VNET_DEFINE(int, flowtable_enable) = 1;
#define	V_flowtable_enable		VNET(flowtable_enable)

static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL,
    "flowtable");
SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_RW,
    &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
SYSCTL_UMA_MAX(_net_flowtable, OID_AUTO, maxflows, CTLFLAG_RW,
    &flow_zone, "Maximum number of flows allowed");

static MALLOC_DEFINE(M_FTABLE, "flowtable", "flowtable hashes and bitstrings");

static struct flentry *
flowtable_lookup_common(struct flowtable *, uint32_t *, int, uint32_t);

#ifdef INET
static struct flentry *
flowtable_lookup_ipv4(struct mbuf *m, struct route *ro)
{
	struct flentry *fle;
	struct sockaddr_in *sin;
	struct ip *ip;
	uint32_t fibnum;
#ifdef FLOWTABLE_HASH_ALL
	uint32_t key[3];
	int iphlen;
	uint16_t sport, dport;
	uint8_t proto;
#endif

	ip = mtod(m, struct ip *);

	if (ip->ip_src.s_addr == ip->ip_dst.s_addr ||
	    (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
		return (NULL);

	fibnum = M_GETFIB(m);

#ifdef FLOWTABLE_HASH_ALL
	iphlen = ip->ip_hl << 2;
	proto = ip->ip_p;

	switch (proto) {
	case IPPROTO_TCP: {
		struct tcphdr *th;

		th = (struct tcphdr *)((char *)ip + iphlen);
		sport = th->th_sport;
		dport = th->th_dport;
		if (th->th_flags & (TH_RST|TH_FIN))
			fibnum |= (FL_STALE << 24);
		break;
	}
	case IPPROTO_UDP: {
		struct udphdr *uh;

		uh = (struct udphdr *)((char *)ip + iphlen);
		sport = uh->uh_sport;
		dport = uh->uh_dport;
		break;
	}
	case IPPROTO_SCTP: {
		struct sctphdr *sh;

		sh = (struct sctphdr *)((char *)ip + iphlen);
		sport = sh->src_port;
		dport = sh->dest_port;
		/* XXXGL: handle stale? */
		break;
	}
	default:
		sport = dport = 0;
		break;
	}

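	/*
	 * The fibnum argument to flowtable_lookup_common() is overloaded:
	 * bits 0-15 carry the fib number, bits 16-23 the IP protocol and
	 * bits 24-31 flow flags (FL_STALE).  flow_matches() and
	 * flowtable_insert() unpack it again.
	 */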
	key[0] = ip->ip_dst.s_addr;
	key[1] = ip->ip_src.s_addr;
	key[2] = (dport << 16) | sport;
	fibnum |= proto << 16;

	fle = flowtable_lookup_common(&V_ip4_ft, key, 3 * sizeof(uint32_t),
	    fibnum);

#else	/* !FLOWTABLE_HASH_ALL */

	fle = flowtable_lookup_common(&V_ip4_ft, (uint32_t *)&ip->ip_dst,
	    sizeof(struct in_addr), fibnum);

#endif	/* FLOWTABLE_HASH_ALL */

	if (fle == NULL)
		return (NULL);

	sin = (struct sockaddr_in *)&ro->ro_dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr = ip->ip_dst;

	return (fle);
}
#endif /* INET */

#ifdef INET6
/*
 * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
 * then it sets p to point at the offset "len" in the mbuf. WARNING: the
 * pointer might become stale after other pullups (but we never use it
 * this way).
 */
#define PULLUP_TO(_len, p, T)						\
do {									\
	int x = (_len) + sizeof(T);					\
	if ((m)->m_len < x)						\
		return (NULL);						\
	p = (mtod(m, char *) + (_len));					\
} while (0)
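
/*
 * Despite its name, PULLUP_TO() never calls m_pullup(); if the header
 * does not fit in the first mbuf, the lookup simply fails.
 */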

#define	TCP(p)		((struct tcphdr *)(p))
#define	SCTP(p)		((struct sctphdr *)(p))
#define	UDP(p)		((struct udphdr *)(p))

static struct flentry *
flowtable_lookup_ipv6(struct mbuf *m, struct route *ro)
{
	struct flentry *fle;
	struct sockaddr_in6 *sin6;
	struct ip6_hdr *ip6;
	uint32_t fibnum;
#ifdef FLOWTABLE_HASH_ALL
	uint32_t key[9];
	void *ulp;
	int hlen;
	uint16_t sport, dport;
	u_short offset;
	uint8_t proto;
#else
	uint32_t key[4];
#endif

	ip6 = mtod(m, struct ip6_hdr *);
	if (in6_localaddr(&ip6->ip6_dst))
		return (NULL);

	fibnum = M_GETFIB(m);

#ifdef	FLOWTABLE_HASH_ALL
	hlen = sizeof(struct ip6_hdr);
	proto = ip6->ip6_nxt;
	offset = sport = dport = 0;
	ulp = NULL;
	while (ulp == NULL) {
		switch (proto) {
		case IPPROTO_ICMPV6:
		case IPPROTO_OSPFIGP:
		case IPPROTO_PIM:
		case IPPROTO_CARP:
		case IPPROTO_ESP:
		case IPPROTO_NONE:
			ulp = ip6;
			break;
		case IPPROTO_TCP:
			PULLUP_TO(hlen, ulp, struct tcphdr);
			dport = TCP(ulp)->th_dport;
			sport = TCP(ulp)->th_sport;
			if (TCP(ulp)->th_flags & (TH_RST|TH_FIN))
				fibnum |= (FL_STALE << 24);
			break;
		case IPPROTO_SCTP:
			PULLUP_TO(hlen, ulp, struct sctphdr);
			sport = SCTP(ulp)->src_port;
			dport = SCTP(ulp)->dest_port;
			/* XXXGL: handle stale? */
			break;
		case IPPROTO_UDP:
			PULLUP_TO(hlen, ulp, struct udphdr);
			dport = UDP(ulp)->uh_dport;
			sport = UDP(ulp)->uh_sport;
			break;
		case IPPROTO_HOPOPTS:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_hbh);
			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
			ulp = NULL;
			break;
		case IPPROTO_ROUTING:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_rthdr);
			hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
			proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
			ulp = NULL;
			break;
		case IPPROTO_FRAGMENT:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_frag);
			hlen += sizeof (struct ip6_frag);
			proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
			offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
			    IP6F_OFF_MASK;
			ulp = NULL;
			break;
		case IPPROTO_DSTOPTS:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_hbh);
			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
			ulp = NULL;
			break;
		case IPPROTO_AH:	/* RFC 2402 */
			PULLUP_TO(hlen, ulp, struct ip6_ext);
			hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
			proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
			ulp = NULL;
			break;
		default:
			PULLUP_TO(hlen, ulp, struct ip6_ext);
			break;
		}
	}

	bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
	bcopy(&ip6->ip6_src, &key[4], sizeof(struct in6_addr));
	key[8] = (dport << 16) | sport;
	fibnum |= proto << 16;

	fle = flowtable_lookup_common(&V_ip6_ft, key, 9 * sizeof(uint32_t),
	    fibnum);
#else	/* !FLOWTABLE_HASH_ALL */
	bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
	fle = flowtable_lookup_common(&V_ip6_ft, key, sizeof(struct in6_addr),
	    fibnum);
#endif	/* FLOWTABLE_HASH_ALL */

	if (fle == NULL)
		return (NULL);

	sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
	sin6->sin6_family = AF_INET6;
	sin6->sin6_len = sizeof(*sin6);
	bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(struct in6_addr));

	return (fle);
}
#endif /* INET6 */

static bitstr_t *
flowtable_mask(struct flowtable *ft)
{

	/*
	 * flowtable_free_stale() calls this without a critical section,
	 * but runs under sched_bind().  Since the pointer is stable
	 * throughout the lifetime of ft, that is safe; otherwise we
	 * would need:
	 *
	 * CRITICAL_ASSERT(curthread);
	 */

	return (*(bitstr_t **)zpcpu_get(ft->ft_masks));
}

static struct flist *
flowtable_list(struct flowtable *ft, uint32_t hash)
{

	CRITICAL_ASSERT(curthread);
	return (zpcpu_get(ft->ft_table[hash % ft->ft_size]));
}

static int
flow_stale(struct flowtable *ft, struct flentry *fle, int maxidle)
{

	if (((fle->f_rt->rt_flags & RTF_HOST) &&
	    ((fle->f_rt->rt_flags & (RTF_UP)) != (RTF_UP))) ||
	    (fle->f_rt->rt_ifp == NULL) ||
	    !RT_LINK_IS_UP(fle->f_rt->rt_ifp) ||
	    (fle->f_lle->la_flags & LLE_VALID) == 0)
		return (1);

	if (time_uptime - fle->f_uptime > maxidle)
		return (1);

#ifdef FLOWTABLE_HASH_ALL
	if (fle->f_flags & FL_STALE)
		return (1);
#endif

	return (0);
}

static int
flow_full(void)
{
	int count, max;

	count = uma_zone_get_cur(flow_zone);
	max = uma_zone_get_max(flow_zone);

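	/* The zone is considered full above 7/8 of its limit. */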
	return (count > (max - (max >> 3)));
}

static int
flow_matches(struct flentry *fle, uint32_t *key, int keylen, uint32_t fibnum)
{
#ifdef FLOWTABLE_HASH_ALL
	uint8_t proto;

	proto = (fibnum >> 16) & 0xff;
	fibnum &= 0xffff;
#endif

	CRITICAL_ASSERT(curthread);

	/* Microoptimization for IPv4: don't use bcmp(). */
	if (((keylen == sizeof(uint32_t) && fle->f_key[0] == key[0]) ||
	    (keylen > sizeof(uint32_t) &&
	    bcmp(fle->f_key, key, keylen) == 0)) &&
	    fibnum == fle->f_fibnum &&
#ifdef FLOWTABLE_HASH_ALL
	    proto == fle->f_proto &&
#endif
	    (fle->f_rt->rt_flags & RTF_UP) &&
	    fle->f_rt->rt_ifp != NULL &&
	    (fle->f_lle->la_flags & LLE_VALID))
		return (1);

	return (0);
}

static struct flentry *
flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
    int keylen, uint32_t fibnum0)
{
#ifdef INET6
	struct route_in6 sro6;
#endif
#ifdef INET
	struct route sro;
#endif
	struct route *ro = NULL;
	struct rtentry *rt;
	struct lltable *lt = NULL;
	struct llentry *lle;
	struct sockaddr_storage *l3addr;
	struct ifnet *ifp;
	struct flist *flist;
	struct flentry *fle, *iter;
	bitstr_t *mask;
	uint16_t fibnum = fibnum0;
#ifdef FLOWTABLE_HASH_ALL
	uint8_t proto;

	proto = (fibnum0 >> 16) & 0xff;
	fibnum = fibnum0 & 0xffff;
#endif

	/*
	 * This bit of code ends up locking the
	 * same route 3 times (just like ip_output + ether_output)
	 * - at lookup
	 * - in rt_check when called by arpresolve
	 * - dropping the refcount for the rtentry
	 *
	 * This could be consolidated to one if we wrote a variant
	 * of arpresolve with an rt_check variant that expected to
	 * receive the route locked
	 */
#ifdef INET
	if (ft == &V_ip4_ft) {
		struct sockaddr_in *sin;

		ro = &sro;
		bzero(&sro.ro_dst, sizeof(sro.ro_dst));

		sin = (struct sockaddr_in *)&sro.ro_dst;
		sin->sin_family = AF_INET;
		sin->sin_len = sizeof(*sin);
		sin->sin_addr.s_addr = key[0];
	}
#endif
#ifdef INET6
	if (ft == &V_ip6_ft) {
		struct sockaddr_in6 *sin6;

		ro = (struct route *)&sro6;
		sin6 = &sro6.ro_dst;

		bzero(sin6, sizeof(*sin6));
		sin6->sin6_family = AF_INET6;
		sin6->sin6_len = sizeof(*sin6);
		bcopy(key, &sin6->sin6_addr, sizeof(struct in6_addr));
	}
#endif

	ro->ro_rt = NULL;
#ifdef RADIX_MPATH
	rtalloc_mpath_fib(ro, hash, fibnum);
#else
	rtalloc_ign_fib(ro, 0, fibnum);
#endif
	if (ro->ro_rt == NULL)
		return (NULL);

	rt = ro->ro_rt;
	ifp = rt->rt_ifp;

	if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
		RTFREE(rt);
		return (NULL);
	}

#ifdef INET
	if (ft == &V_ip4_ft)
		lt = LLTABLE(ifp);
#endif
#ifdef INET6
	if (ft == &V_ip6_ft)
		lt = LLTABLE6(ifp);
#endif

	if (rt->rt_flags & RTF_GATEWAY)
		l3addr = (struct sockaddr_storage *)rt->rt_gateway;
	else
		l3addr = (struct sockaddr_storage *)&ro->ro_dst;
	lle = llentry_alloc(ifp, lt, l3addr);

	if (lle == NULL) {
		RTFREE(rt);
		return (NULL);
	}

	/* Don't insert the entry if the ARP hasn't yet finished resolving. */
	if ((lle->la_flags & LLE_VALID) == 0) {
		RTFREE(rt);
		LLE_FREE(lle);
		FLOWSTAT_INC(ft, ft_fail_lle_invalid);
		return (NULL);
	}

	fle = uma_zalloc(flow_zone, M_NOWAIT | M_ZERO);
	if (fle == NULL) {
		RTFREE(rt);
		LLE_FREE(lle);
		return (NULL);
	}

	fle->f_hash = hash;
	bcopy(key, &fle->f_key, keylen);
	fle->f_rt = rt;
	fle->f_lle = lle;
	fle->f_fibnum = fibnum;
	fle->f_uptime = time_uptime;
#ifdef FLOWTABLE_HASH_ALL
	fle->f_proto = proto;
	fle->f_flags = fibnum0 >> 24;
#endif

	critical_enter();
	mask = flowtable_mask(ft);
	flist = flowtable_list(ft, hash);

	if (SLIST_EMPTY(flist)) {
		bit_set(mask, (hash % ft->ft_size));
		SLIST_INSERT_HEAD(flist, fle, f_next);
		goto skip;
	}

	/*
	 * find end of list and make sure that we were not
	 * preempted by another thread handling this flow
	 */
	SLIST_FOREACH(iter, flist, f_next) {
		KASSERT(iter->f_hash % ft->ft_size == hash % ft->ft_size,
		    ("%s: wrong hash", __func__));
		if (flow_matches(iter, key, keylen, fibnum)) {
			/*
			 * We probably migrated to another CPU after the
			 * lookup in flowtable_lookup_common() failed.
			 * It appears that this CPU already has a flow
			 * entry for this key.
			 */
			iter->f_uptime = time_uptime;
#ifdef FLOWTABLE_HASH_ALL
			iter->f_flags |= fibnum0 >> 24;
#endif
			critical_exit();
			FLOWSTAT_INC(ft, ft_collisions);
			uma_zfree(flow_zone, fle);
			return (iter);
		}
	}

	SLIST_INSERT_HEAD(flist, fle, f_next);
skip:
	critical_exit();
	FLOWSTAT_INC(ft, ft_inserts);

	return (fle);
}

int
flowtable_lookup(sa_family_t sa, struct mbuf *m, struct route *ro)
{
	struct flentry *fle;

	if (V_flowtable_enable == 0)
		return (ENXIO);

	switch (sa) {
#ifdef INET
	case AF_INET:
		fle = flowtable_lookup_ipv4(m, ro);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		fle = flowtable_lookup_ipv6(m, ro);
		break;
#endif
	default:
		panic("%s: sa %d", __func__, sa);
	}

	if (fle == NULL)
		return (EHOSTUNREACH);

	if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE) {
		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
		m->m_pkthdr.flowid = fle->f_hash;
	}

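	/*
	 * The references on the rtentry and the llentry belong to the
	 * flow entry.  RT_NORTREF tells the caller that no rtentry
	 * reference is passed and that it must not release one.
	 */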
	ro->ro_rt = fle->f_rt;
	ro->ro_lle = fle->f_lle;
	ro->ro_flags |= RT_NORTREF;

	return (0);
}
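
/*
 * A hypothetical caller sketch (illustrative only; see ip_output() for
 * the real consumer):
 *
 *	struct route ro;
 *
 *	bzero(&ro, sizeof(ro));
 *	if (flowtable_lookup(AF_INET, m, &ro) == 0) {
 *		... use ro.ro_rt and ro.ro_lle; both are borrowed from
 *		... the flow entry and must not be freed by the caller.
 *	}
 */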

static struct flentry *
flowtable_lookup_common(struct flowtable *ft, uint32_t *key, int keylen,
    uint32_t fibnum)
{
	struct flist *flist;
	struct flentry *fle;
	uint32_t hash;

	FLOWSTAT_INC(ft, ft_lookups);

	hash = jenkins_hash32(key, keylen / sizeof(uint32_t), flow_hashjitter);

	critical_enter();
	flist = flowtable_list(ft, hash);
	SLIST_FOREACH(fle, flist, f_next) {
		KASSERT(fle->f_hash % ft->ft_size == hash % ft->ft_size,
		    ("%s: wrong hash", __func__));
		if (flow_matches(fle, key, keylen, fibnum)) {
			fle->f_uptime = time_uptime;
#ifdef FLOWTABLE_HASH_ALL
			fle->f_flags |= fibnum >> 24;
#endif
			critical_exit();
			FLOWSTAT_INC(ft, ft_hits);
			return (fle);
		}
	}
	critical_exit();

	FLOWSTAT_INC(ft, ft_misses);

	return (flowtable_insert(ft, hash, key, keylen, fibnum));
}

/*
 * used by the bit_alloc macro
 */
#define calloc(count, size) malloc((count)*(size), M_FTABLE, M_WAITOK | M_ZERO)
static void
flowtable_alloc(struct flowtable *ft)
{

	ft->ft_table = malloc(ft->ft_size * sizeof(struct flist),
	    M_FTABLE, M_WAITOK);
	for (int i = 0; i < ft->ft_size; i++)
		ft->ft_table[i] = uma_zalloc(pcpu_zone_ptr, M_WAITOK | M_ZERO);

	ft->ft_masks = uma_zalloc(pcpu_zone_ptr, M_WAITOK);
	for (int i = 0; i < mp_ncpus; i++) {
		bitstr_t **b;

		b = zpcpu_get_cpu(ft->ft_masks, i);
		*b = bit_alloc(ft->ft_size);
	}
	ft->ft_tmpmask = bit_alloc(ft->ft_size);
}
#undef calloc

static void
flowtable_free_stale(struct flowtable *ft, struct rtentry *rt, int maxidle)
{
	struct flist *flist, freelist;
	struct flentry *fle, *fle1, *fleprev;
	bitstr_t *mask, *tmpmask;
	int curbit;

	SLIST_INIT(&freelist);
	mask = flowtable_mask(ft);
	tmpmask = ft->ft_tmpmask;
	memcpy(tmpmask, mask, ft->ft_size/8);
	curbit = 0;
	fleprev = NULL; /* pacify gcc */
	/*
	 * XXX Note to self, bit_ffs operates at the byte level
	 * and thus adds gratuitous overhead
	 */
	bit_ffs(tmpmask, ft->ft_size, &curbit);
	while (curbit != -1) {
		if (curbit >= ft->ft_size || curbit < -1) {
			log(LOG_ALERT,
			    "warning: bad curbit value %d\n",
			    curbit);
			break;
		}

		FLOWSTAT_INC(ft, ft_free_checks);

		critical_enter();
		flist = flowtable_list(ft, curbit);
#ifdef DIAGNOSTIC
		if (SLIST_EMPTY(flist) && curbit > 0) {
			log(LOG_ALERT,
			    "warning bit=%d set, but no fle found\n",
			    curbit);
		}
#endif
		SLIST_FOREACH_SAFE(fle, flist, f_next, fle1) {
			if (rt != NULL && fle->f_rt != rt) {
				fleprev = fle;
				continue;
			}
			if (!flow_stale(ft, fle, maxidle)) {
				fleprev = fle;
				continue;
			}

			if (fle == SLIST_FIRST(flist))
				SLIST_REMOVE_HEAD(flist, f_next);
			else
				SLIST_REMOVE_AFTER(fleprev, f_next);
			SLIST_INSERT_HEAD(&freelist, fle, f_next);
		}
		if (SLIST_EMPTY(flist))
			bit_clear(mask, curbit);
		critical_exit();

		bit_clear(tmpmask, curbit);
		bit_ffs(tmpmask, ft->ft_size, &curbit);
	}

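	/*
	 * Stale entries were unlinked inside the critical section; drop
	 * their route and L2 entry references and free them here, where
	 * taking locks is allowed.
	 */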
	SLIST_FOREACH_SAFE(fle, &freelist, f_next, fle1) {
		FLOWSTAT_INC(ft, ft_frees);
		if (fle->f_rt != NULL)
			RTFREE(fle->f_rt);
		if (fle->f_lle != NULL)
			LLE_FREE(fle->f_lle);
		uma_zfree(flow_zone, fle);
	}
}

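/*
 * Walk all CPUs, binding the calling thread to each one in turn, so
 * that flowtable_free_stale() always sees the per-cpu list and mask
 * belonging to the CPU it is running on.
 */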
static void
flowtable_clean_vnet(struct flowtable *ft, struct rtentry *rt, int maxidle)
{
	int i;

	CPU_FOREACH(i) {
		if (smp_started == 1) {
			thread_lock(curthread);
			sched_bind(curthread, i);
			thread_unlock(curthread);
		}

		flowtable_free_stale(ft, rt, maxidle);

		if (smp_started == 1) {
			thread_lock(curthread);
			sched_unbind(curthread);
			thread_unlock(curthread);
		}
	}
}

void
flowtable_route_flush(sa_family_t sa, struct rtentry *rt)
{
	struct flowtable *ft;

	switch (sa) {
#ifdef INET
	case AF_INET:
		ft = &V_ip4_ft;
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ft = &V_ip6_ft;
		break;
#endif
	default:
		panic("%s: sa %d", __func__, sa);
	}

	flowtable_clean_vnet(ft, rt, 0);
}

static void
flowtable_cleaner(void)
{
	VNET_ITERATOR_DECL(vnet_iter);
	struct thread *td;

	if (bootverbose)
		log(LOG_INFO, "flowtable cleaner started\n");
	td = curthread;
	while (1) {
		uint32_t flowclean_freq, maxidle;

		/*
		 * Both the maximum idle time and the cleaning frequency
		 * are arbitrary.
		 */
		if (flow_full())
			maxidle = 5;
		else
			maxidle = 30;

		VNET_LIST_RLOCK();
		VNET_FOREACH(vnet_iter) {
			CURVNET_SET(vnet_iter);
#ifdef INET
			flowtable_clean_vnet(&V_ip4_ft, NULL, maxidle);
#endif
#ifdef INET6
			flowtable_clean_vnet(&V_ip6_ft, NULL, maxidle);
#endif
			CURVNET_RESTORE();
		}
		VNET_LIST_RUNLOCK();

		if (flow_full())
			flowclean_freq = 4*hz;
		else
			flowclean_freq = 20*hz;
		mtx_lock(&flowclean_lock);
		thread_lock(td);
		sched_prio(td, PPAUSE);
		thread_unlock(td);
		flowclean_cycles++;
		cv_broadcast(&flowclean_f_cv);
		cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq);
		mtx_unlock(&flowclean_lock);
	}
}

static void
flowtable_flush(void *unused __unused)
{
	uint64_t start;

	mtx_lock(&flowclean_lock);
	start = flowclean_cycles;
	while (start == flowclean_cycles) {
		cv_broadcast(&flowclean_c_cv);
		cv_wait(&flowclean_f_cv, &flowclean_lock);
	}
	mtx_unlock(&flowclean_lock);
}

static struct kproc_desc flow_kp = {
	"flowcleaner",
	flowtable_cleaner,
	&flowcleanerproc
};
SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);

static int
flowtable_get_size(char *name)
{
	int size;

	if (TUNABLE_INT_FETCH(name, &size)) {
		if (size < 256)
			size = 256;
		if (!powerof2(size)) {
			printf("%s must be a power of 2\n", name);
			size = 2048;
		}
	} else {
		/*
		 * round up to the next power of 2
		 */
		size = 1 << fls((1024 + maxusers * 64) - 1);
	}

	return (size);
}

static void
flowtable_init(const void *unused __unused)
{

	flow_hashjitter = arc4random();

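	/*
	 * Align flow entries on 64-byte (cache line) boundaries and cap
	 * the zone in proportion to maxusers and the number of CPUs.
	 */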
	flow_zone = uma_zcreate("flows", sizeof(struct flentry),
	    NULL, NULL, NULL, NULL, (64-1), UMA_ZONE_MAXBUCKET);
	uma_zone_set_max(flow_zone, 1024 + maxusers * 64 * mp_ncpus);

	cv_init(&flowclean_c_cv, "c_flowcleanwait");
	cv_init(&flowclean_f_cv, "f_flowcleanwait");
	mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
	EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
	    EVENTHANDLER_PRI_ANY);
}
SYSINIT(flowtable_init, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
    flowtable_init, NULL);

#ifdef INET
static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip4, CTLFLAG_RD, NULL,
    "Flowtable for IPv4");

static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip4_ftstat);
VNET_PCPUSTAT_SYSINIT(ip4_ftstat);
VNET_PCPUSTAT_SYSUNINIT(ip4_ftstat);
SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip4, OID_AUTO, stat, struct flowtable_stat,
    ip4_ftstat, "Flowtable statistics for IPv4 "
    "(struct flowtable_stat, net/flowtable.h)");

static void
flowtable_init_vnet_v4(const void *unused __unused)
{

	V_ip4_ft.ft_size = flowtable_get_size("net.flowtable.ip4.size");
	V_ip4_ft.ft_stat = VNET(ip4_ftstat);
	flowtable_alloc(&V_ip4_ft);
}
VNET_SYSINIT(ft_vnet_v4, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
    flowtable_init_vnet_v4, NULL);
#endif /* INET */

#ifdef INET6
static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip6, CTLFLAG_RD, NULL,
    "Flowtable for IPv6");

static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip6_ftstat);
VNET_PCPUSTAT_SYSINIT(ip6_ftstat);
VNET_PCPUSTAT_SYSUNINIT(ip6_ftstat);
SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip6, OID_AUTO, stat, struct flowtable_stat,
    ip6_ftstat, "Flowtable statistics for IPv6 "
    "(struct flowtable_stat, net/flowtable.h)");

static void
flowtable_init_vnet_v6(const void *unused __unused)
{

	V_ip6_ft.ft_size = flowtable_get_size("net.flowtable.ip6.size");
	V_ip6_ft.ft_stat = VNET(ip6_ftstat);
	flowtable_alloc(&V_ip6_ft);
}
VNET_SYSINIT(flowtable_init_vnet_v6, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
    flowtable_init_vnet_v6, NULL);
#endif /* INET6 */

#ifdef DDB
static bitstr_t *
flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
{

	return (*(bitstr_t **)zpcpu_get_cpu(ft->ft_masks, cpuid));
}

static struct flist *
flowtable_list_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
{

	return (zpcpu_get_cpu(ft->ft_table[hash % ft->ft_size], cpuid));
}

static void
flow_show(struct flowtable *ft, struct flentry *fle)
{
	int idle_time;
	int rt_valid, ifp_valid;
	volatile struct rtentry *rt;
	struct ifnet *ifp = NULL;
	uint32_t *hashkey = fle->f_key;

	idle_time = (int)(time_uptime - fle->f_uptime);
	rt = fle->f_rt;
	rt_valid = rt != NULL;
	if (rt_valid)
		ifp = rt->rt_ifp;
	ifp_valid = ifp != NULL;

#ifdef INET
	if (ft == &V_ip4_ft) {
		char daddr[4*sizeof "123"];
#ifdef FLOWTABLE_HASH_ALL
		char saddr[4*sizeof "123"];
		uint16_t sport, dport;
#endif

		inet_ntoa_r(*(struct in_addr *) &hashkey[0], daddr);
#ifdef FLOWTABLE_HASH_ALL
		inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
		dport = ntohs((uint16_t)(hashkey[2] >> 16));
		sport = ntohs((uint16_t)(hashkey[2] & 0xffff));
		db_printf("%s:%d->%s:%d", saddr, sport, daddr, dport);
#else
		db_printf("%s ", daddr);
#endif
	}
#endif /* INET */
#ifdef INET6
	if (ft == &V_ip6_ft) {
#ifdef FLOWTABLE_HASH_ALL
		db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
		    hashkey[0], hashkey[1], hashkey[2],
		    hashkey[3], hashkey[4], hashkey[5],
		    hashkey[6], hashkey[7], hashkey[8]);
#else
		db_printf("\n\tkey=%08x:%08x:%08x:%08x ",
		    hashkey[0], hashkey[1], hashkey[2], hashkey[3]);
#endif
	}
#endif /* INET6 */

	db_printf("hash=%08x idle_time=%03d"
	    "\n\tfibnum=%02d rt=%p",
	    fle->f_hash, idle_time, fle->f_fibnum, fle->f_rt);

#ifdef FLOWTABLE_HASH_ALL
	if (fle->f_flags & FL_STALE)
		db_printf(" FL_STALE ");
#endif
	if (rt_valid) {
		if (rt->rt_flags & RTF_UP)
			db_printf(" RTF_UP ");
	}
	if (ifp_valid) {
		if (ifp->if_flags & IFF_LOOPBACK)
			db_printf(" IFF_LOOPBACK ");
		if (ifp->if_flags & IFF_UP)
			db_printf(" IFF_UP ");
		if (ifp->if_flags & IFF_POINTOPOINT)
			db_printf(" IFF_POINTOPOINT ");
	}
	db_printf("\n");
}

static void
flowtable_show(struct flowtable *ft, int cpuid)
{
	int curbit = 0;
	bitstr_t *mask, *tmpmask;

	if (cpuid != -1)
		db_printf("cpu: %d\n", cpuid);
	mask = flowtable_mask_pcpu(ft, cpuid);
	tmpmask = ft->ft_tmpmask;
	memcpy(tmpmask, mask, ft->ft_size/8);
	/*
	 * XXX Note to self, bit_ffs operates at the byte level
	 * and thus adds gratuitous overhead
	 */
	bit_ffs(tmpmask, ft->ft_size, &curbit);
	while (curbit != -1) {
		struct flist *flist;
		struct flentry *fle;

		if (curbit >= ft->ft_size || curbit < -1) {
			db_printf("warning: bad curbit value %d\n",
			    curbit);
			break;
		}

		flist = flowtable_list_pcpu(ft, curbit, cpuid);

		SLIST_FOREACH(fle, flist, f_next)
			flow_show(ft, fle);
		bit_clear(tmpmask, curbit);
		bit_ffs(tmpmask, ft->ft_size, &curbit);
	}
}

static void
flowtable_show_vnet(struct flowtable *ft)
{
	int i;

	CPU_FOREACH(i)
		flowtable_show(ft, i);
}

DB_SHOW_COMMAND(flowtables, db_show_flowtables)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
#ifdef VIMAGE
		db_printf("vnet %p\n", vnet_iter);
#endif
#ifdef INET
		db_printf("IPv4:\n");
		flowtable_show_vnet(&V_ip4_ft);
#endif
#ifdef INET6
		db_printf("IPv6:\n");
		flowtable_show_vnet(&V_ip6_ft);
#endif
		CURVNET_RESTORE();
	}
}
#endif
