1262028Sglebius/*-
2262030Sglebius * Copyright (c) 2014 Gleb Smirnoff <glebius@FreeBSD.org>
3262028Sglebius * Copyright (c) 2008-2010, BitGravity Inc.
4262028Sglebius * All rights reserved.
5262028Sglebius *
6262028Sglebius * Redistribution and use in source and binary forms, with or without
7262028Sglebius * modification, are permitted provided that the following conditions are met:
8262028Sglebius *
9262028Sglebius *  1. Redistributions of source code must retain the above copyright notice,
10262028Sglebius *     this list of conditions and the following disclaimer.
11262028Sglebius *
12262028Sglebius *  2. Neither the name of the BitGravity Corporation nor the names of its
13262028Sglebius *     contributors may be used to endorse or promote products derived from
14262028Sglebius *     this software without specific prior written permission.
15262028Sglebius *
16262028Sglebius * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17262028Sglebius * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18262028Sglebius * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19262028Sglebius * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20262028Sglebius * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21262028Sglebius * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22262028Sglebius * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23262028Sglebius * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24262028Sglebius * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25262028Sglebius * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26262028Sglebius * POSSIBILITY OF SUCH DAMAGE.
27262028Sglebius */
28191255Skmacy
29191255Skmacy#include "opt_route.h"
30191255Skmacy#include "opt_mpath.h"
31196368Skmacy#include "opt_ddb.h"
32205066Skmacy#include "opt_inet.h"
33205066Skmacy#include "opt_inet6.h"
34191255Skmacy
35191255Skmacy#include <sys/cdefs.h>
36191255Skmacy__FBSDID("$FreeBSD$");
37191255Skmacy
38261582Sglebius#include <sys/param.h>
39191255Skmacy#include <sys/types.h>
40191255Skmacy#include <sys/bitstring.h>
41196368Skmacy#include <sys/condvar.h>
42191255Skmacy#include <sys/callout.h>
43240086Sglebius#include <sys/hash.h>
44261582Sglebius#include <sys/kernel.h>
45191255Skmacy#include <sys/kthread.h>
46191255Skmacy#include <sys/limits.h>
47191255Skmacy#include <sys/malloc.h>
48191255Skmacy#include <sys/mbuf.h>
49261823Sglebius#include <sys/pcpu.h>
50191255Skmacy#include <sys/proc.h>
51261823Sglebius#include <sys/queue.h>
52205066Skmacy#include <sys/sbuf.h>
53191255Skmacy#include <sys/sched.h>
54191255Skmacy#include <sys/smp.h>
55191255Skmacy#include <sys/socket.h>
56191255Skmacy#include <sys/syslog.h>
57191255Skmacy#include <sys/sysctl.h>
58261823Sglebius#include <vm/uma.h>
59191255Skmacy
60191255Skmacy#include <net/if.h>
61191255Skmacy#include <net/if_llatbl.h>
62191255Skmacy#include <net/if_var.h>
63261582Sglebius#include <net/route.h>
64191255Skmacy#include <net/flowtable.h>
65195837Srwatson#include <net/vnet.h>
66191255Skmacy
67191255Skmacy#include <netinet/in.h>
68191255Skmacy#include <netinet/in_systm.h>
69191255Skmacy#include <netinet/in_var.h>
70191255Skmacy#include <netinet/if_ether.h>
71191255Skmacy#include <netinet/ip.h>
72205066Skmacy#ifdef INET6
73205066Skmacy#include <netinet/ip6.h>
74205066Skmacy#endif
75262027Sglebius#ifdef FLOWTABLE_HASH_ALL
76191255Skmacy#include <netinet/tcp.h>
77191255Skmacy#include <netinet/udp.h>
78191255Skmacy#include <netinet/sctp.h>
79262027Sglebius#endif
80191255Skmacy
81196368Skmacy#include <ddb/ddb.h>
82191255Skmacy
/*
 * Layout of the lookup key kept in every flow entry.  When
 * FLOWTABLE_HASH_ALL is defined the key holds two addresses followed by
 * a pair of 16-bit ports; otherwise it holds a single address.
 */
#ifdef	FLOWTABLE_HASH_ALL
#define	KEY_PORTS	(sizeof(uint16_t) * 2)
#define	KEY_ADDRS	2
#else
#define	KEY_PORTS	0
#define	KEY_ADDRS	1
#endif

/* Address slots are sized for IPv6 whenever INET6 is compiled in. */
#ifdef	INET6
#define	KEY_ADDR_LEN	sizeof(struct in6_addr)
#else
#define	KEY_ADDR_LEN	sizeof(struct in_addr)
#endif

/* Key length expressed in 32-bit words; only needed to size f_key[]. */
#define	KEYLEN	((KEY_ADDR_LEN * KEY_ADDRS + KEY_PORTS) / sizeof(uint32_t))

/*
 * One cached flow.  Entries are allocated from flow_zone and linked
 * into a per-CPU bucket list through f_next.  f_rt and f_lle are the
 * rtentry and llentry obtained in flowtable_insert(); they are released
 * with RTFREE()/LLE_FREE() when flowtable_free_stale() frees the entry.
 * The protocol and flags fields exist only with FLOWTABLE_HASH_ALL.
 */
struct flentry {
	uint32_t		f_hash;		/* hash flowing forward */
	uint32_t		f_key[KEYLEN];	/* address(es and ports) */
	uint32_t		f_uptime;	/* uptime at last access */
	uint16_t		f_fibnum;	/* fib index */
#ifdef FLOWTABLE_HASH_ALL
	uint8_t			f_proto;	/* protocol */
	uint8_t			f_flags;	/* stale? */
#define FL_STALE 		1
#endif
	SLIST_ENTRY(flentry)	f_next;		/* pointer to collision entry */
	struct rtentry		*f_rt;		/* rtentry for flow */
	struct llentry		*f_lle;		/* llentry for flow */
};
#undef KEYLEN
114191255Skmacy
/* Head of one hash bucket: a singly-linked list of flow entries. */
SLIST_HEAD(flist, flentry);
/* Make sure we can use pcpu_zone_ptr for struct flist. */
CTASSERT(sizeof(struct flist) == sizeof(void *));

/*
 * A flow table: ft_size hash buckets, each replicated per CPU, plus a
 * per-CPU bitmask with one bit per bucket.  flowtable_insert() sets a
 * bucket's bit when it links the first entry into an empty bucket and
 * flowtable_free_stale() clears it once the bucket is empty again.
 */
struct flowtable {
	/*
	 * Statistics counters; FLOWSTAT_ADD() indexes this array by the
	 * position of the field in struct flowtable_stat.
	 */
	counter_u64_t	*ft_stat;
	/* Number of hash buckets (and of bits in each mask). */
	int 		ft_size;
	/*
	 * ft_table is a malloc(9)ed array of pointers.  Pointers point to
	 * memory from UMA_ZONE_PCPU zone.
	 * ft_masks is per-cpu pointer itself.  Each instance points
	 * to a malloc(9)ed bitset, that is private to corresponding CPU.
	 */
	struct flist	**ft_table;
	bitstr_t 	**ft_masks;
	/* Scratch copy of a CPU's mask, used by flowtable_free_stale(). */
	bitstr_t	*ft_tmpmask;
};
132205488Skmacy
/*
 * Add v to (or increment) the counter of table ft that corresponds to
 * field "name" of struct flowtable_stat.  The counter's index in
 * ft_stat[] is the field's byte offset divided by sizeof(uint64_t).
 */
#define	FLOWSTAT_ADD(ft, name, v)	\
	counter_u64_add((ft)->ft_stat[offsetof(struct flowtable_stat, name) / sizeof(uint64_t)], (v))
#define	FLOWSTAT_INC(ft, name)	FLOWSTAT_ADD(ft, name, 1)
136261601Sglebius
/*
 * NOTE(review): flowcleanerproc and the flowclean_* objects are not
 * referenced in the part of the file visible here; presumably they
 * belong to the flow cleaner thread -- confirm against their users.
 */
static struct proc *flowcleanerproc;
/* Seed passed to jenkins_hash32() in flowtable_lookup_common(). */
static uint32_t flow_hashjitter;

static struct cv 	flowclean_f_cv;
static struct cv 	flowclean_c_cv;
static struct mtx	flowclean_lock;
static uint32_t		flowclean_cycles;
144196368Skmacy
145191255Skmacy/*
146191255Skmacy * TODO:
147261582Sglebius * - add sysctls to resize && flush flow tables
148191255Skmacy * - Add per flowtable sysctls for statistics and configuring timeouts
 * - add saturation counter to rtentry to support per-packet load-balancing,
 *   add flag to indicate round-robin flow, add list lookup from head
 *   for flows
152191255Skmacy * - add sysctl / device node / syscall to support exporting and importing
153191255Skmacy *   of flows with flag to indicate that a flow was imported so should
154191255Skmacy *   not be considered for auto-cleaning
155191255Skmacy * - support explicit connection state (currently only ad-hoc for DSR)
156194660Szec * - idetach() cleanup for options VIMAGE builds.
157191255Skmacy */
#ifdef INET
/* Per-vnet flow table used for IPv4 lookups. */
static VNET_DEFINE(struct flowtable, ip4_ft);
#define	V_ip4_ft	VNET(ip4_ft)
#endif
#ifdef INET6
/* Per-vnet flow table used for IPv6 lookups. */
static VNET_DEFINE(struct flowtable, ip6_ft);
#define	V_ip6_ft	VNET(ip6_ft)
#endif

/* UMA zone that struct flentry objects are allocated from. */
static uma_zone_t flow_zone;

/*
 * Per-vnet switch, enabled by default; flowtable_lookup() returns ENXIO
 * while it is zero.
 */
static VNET_DEFINE(int, flowtable_enable) = 1;
#define	V_flowtable_enable		VNET(flowtable_enable)

static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL,
    "flowtable");
SYSCTL_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
SYSCTL_UMA_MAX(_net_flowtable, OID_AUTO, maxflows, CTLFLAG_RW,
    &flow_zone, "Maximum number of flows allowed");

/* malloc(9) type used for the bucket pointer array and the bit masks. */
static MALLOC_DEFINE(M_FTABLE, "flowtable", "flowtable hashes and bitstrings");

/*
 * Forward declaration: the per-family lookup routines below call this
 * before its definition appears.
 */
static struct flentry *
flowtable_lookup_common(struct flowtable *, uint32_t *, int, uint32_t);
183205066Skmacy
184205066Skmacy#ifdef INET
185262027Sglebiusstatic struct flentry *
186262027Sglebiusflowtable_lookup_ipv4(struct mbuf *m, struct route *ro)
187205066Skmacy{
188262027Sglebius	struct flentry *fle;
189262027Sglebius	struct sockaddr_in *sin;
190205066Skmacy	struct ip *ip;
191262027Sglebius	uint32_t fibnum;
192262027Sglebius#ifdef FLOWTABLE_HASH_ALL
193262027Sglebius	uint32_t key[3];
194191255Skmacy	int iphlen;
195205066Skmacy	uint16_t sport, dport;
196262027Sglebius	uint8_t proto;
197262027Sglebius#endif
198191255Skmacy
199205066Skmacy	ip = mtod(m, struct ip *);
200191255Skmacy
201262027Sglebius	if (ip->ip_src.s_addr == ip->ip_dst.s_addr ||
202262027Sglebius	    (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
203262027Sglebius	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
204262027Sglebius		return (NULL);
205262027Sglebius
206262027Sglebius	fibnum = M_GETFIB(m);
207262027Sglebius
208262027Sglebius#ifdef FLOWTABLE_HASH_ALL
209262027Sglebius	iphlen = ip->ip_hl << 2;
210205066Skmacy	proto = ip->ip_p;
211191255Skmacy
212262027Sglebius	switch (proto) {
213262027Sglebius	case IPPROTO_TCP: {
214262027Sglebius		struct tcphdr *th;
215205066Skmacy
216262027Sglebius		th = (struct tcphdr *)((char *)ip + iphlen);
217205066Skmacy		sport = th->th_sport;
218205066Skmacy		dport = th->th_dport;
219262027Sglebius		if (th->th_flags & (TH_RST|TH_FIN))
220262027Sglebius			fibnum |= (FL_STALE << 24);
221261582Sglebius		break;
222262027Sglebius	}
223262027Sglebius	case IPPROTO_UDP: {
224262027Sglebius		struct udphdr *uh;
225262027Sglebius
226262027Sglebius		uh = (struct udphdr *)((char *)ip + iphlen);
227191255Skmacy		sport = uh->uh_sport;
228191255Skmacy		dport = uh->uh_dport;
229261582Sglebius		break;
230262027Sglebius	}
231262027Sglebius	case IPPROTO_SCTP: {
232262027Sglebius		struct sctphdr *sh;
233262027Sglebius
234262027Sglebius		sh = (struct sctphdr *)((char *)ip + iphlen);
235191255Skmacy		sport = sh->src_port;
236191255Skmacy		dport = sh->dest_port;
237262027Sglebius		/* XXXGL: handle stale? */
238261582Sglebius		break;
239262027Sglebius	}
240191255Skmacy	default:
241262027Sglebius		sport = dport = 0;
242201758Smbr		break;
243191255Skmacy	}
244191255Skmacy
245262027Sglebius	key[0] = ip->ip_dst.s_addr;
246262027Sglebius	key[1] = ip->ip_src.s_addr;
247262027Sglebius	key[2] = (dport << 16) | sport;
248262027Sglebius	fibnum |= proto << 16;
249191255Skmacy
250262027Sglebius	fle = flowtable_lookup_common(&V_ip4_ft, key, 3 * sizeof(uint32_t),
251262027Sglebius	    fibnum);
252191255Skmacy
253262027Sglebius#else	/* !FLOWTABLE_HASH_ALL */
254191255Skmacy
255262027Sglebius	fle = flowtable_lookup_common(&V_ip4_ft, (uint32_t *)&ip->ip_dst,
256262027Sglebius	    sizeof(struct in_addr), fibnum);
257205066Skmacy
258262027Sglebius#endif	/* FLOWTABLE_HASH_ALL */
259205066Skmacy
260262027Sglebius	if (fle == NULL)
261205066Skmacy		return (NULL);
262205066Skmacy
263205066Skmacy	sin = (struct sockaddr_in *)&ro->ro_dst;
264205066Skmacy	sin->sin_family = AF_INET;
265205066Skmacy	sin->sin_len = sizeof(*sin);
266262027Sglebius	sin->sin_addr = ip->ip_dst;
267262027Sglebius
268262027Sglebius	return (fle);
269205066Skmacy}
270205066Skmacy#endif /* INET */
271205066Skmacy
272205066Skmacy#ifdef INET6
/*
 * PULLUP_TO(len, p, T) checks that the first mbuf holds at least
 * len + sizeof(T) bytes.  If it does not, the macro returns NULL from
 * the enclosing function; no data is moved or pulled up.  Otherwise it
 * sets p to point at offset "len" in the mbuf data.  The mbuf is taken
 * from a variable named "m" in the caller's scope.
 */
#define PULLUP_TO(_len, p, T)						\
do {									\
	int x = (_len) + sizeof(T);					\
	if ((m)->m_len < x)						\
		return (NULL);						\
	p = (mtod(m, char *) + (_len));					\
} while (0)

/* Cast an upper-layer header pointer to the given header type. */
#define	TCP(p)		((struct tcphdr *)(p))
#define	SCTP(p)		((struct sctphdr *)(p))
#define	UDP(p)		((struct udphdr *)(p))
290205066Skmacy
291262027Sglebiusstatic struct flentry *
292262027Sglebiusflowtable_lookup_ipv6(struct mbuf *m, struct route *ro)
293205066Skmacy{
294262027Sglebius	struct flentry *fle;
295262027Sglebius	struct sockaddr_in6 *sin6;
296205066Skmacy	struct ip6_hdr *ip6;
297262027Sglebius	uint32_t fibnum;
298262027Sglebius#ifdef FLOWTABLE_HASH_ALL
299262027Sglebius	uint32_t key[9];
300262027Sglebius	void *ulp;
301205066Skmacy	int hlen;
302262027Sglebius	uint16_t sport, dport;
303205066Skmacy	u_short offset;
304262027Sglebius	uint8_t proto;
305262027Sglebius#else
306262027Sglebius	uint32_t key[4];
307262027Sglebius#endif
308205066Skmacy
309205066Skmacy	ip6 = mtod(m, struct ip6_hdr *);
310262027Sglebius	if (in6_localaddr(&ip6->ip6_dst))
311262027Sglebius		return (NULL);
312262027Sglebius
313262027Sglebius	fibnum = M_GETFIB(m);
314262027Sglebius
315262027Sglebius#ifdef	FLOWTABLE_HASH_ALL
316205066Skmacy	hlen = sizeof(struct ip6_hdr);
317205066Skmacy	proto = ip6->ip6_nxt;
318262027Sglebius	offset = sport = dport = 0;
319262027Sglebius	ulp = NULL;
320205066Skmacy	while (ulp == NULL) {
321205066Skmacy		switch (proto) {
322205066Skmacy		case IPPROTO_ICMPV6:
323205066Skmacy		case IPPROTO_OSPFIGP:
324205066Skmacy		case IPPROTO_PIM:
325205066Skmacy		case IPPROTO_CARP:
326205066Skmacy		case IPPROTO_ESP:
327205066Skmacy		case IPPROTO_NONE:
328205066Skmacy			ulp = ip6;
329205066Skmacy			break;
330205066Skmacy		case IPPROTO_TCP:
331205066Skmacy			PULLUP_TO(hlen, ulp, struct tcphdr);
332262027Sglebius			dport = TCP(ulp)->th_dport;
333262027Sglebius			sport = TCP(ulp)->th_sport;
334262027Sglebius			if (TCP(ulp)->th_flags & (TH_RST|TH_FIN))
335262027Sglebius				fibnum |= (FL_STALE << 24);
336205066Skmacy			break;
337205066Skmacy		case IPPROTO_SCTP:
338205066Skmacy			PULLUP_TO(hlen, ulp, struct sctphdr);
339262027Sglebius			dport = SCTP(ulp)->src_port;
340262027Sglebius			sport = SCTP(ulp)->dest_port;
341262027Sglebius			/* XXXGL: handle stale? */
342205066Skmacy			break;
343205066Skmacy		case IPPROTO_UDP:
344205066Skmacy			PULLUP_TO(hlen, ulp, struct udphdr);
345262027Sglebius			dport = UDP(ulp)->uh_dport;
346262027Sglebius			sport = UDP(ulp)->uh_sport;
347205066Skmacy			break;
348205066Skmacy		case IPPROTO_HOPOPTS:	/* RFC 2460 */
349205066Skmacy			PULLUP_TO(hlen, ulp, struct ip6_hbh);
350205066Skmacy			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
351205066Skmacy			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
352205066Skmacy			ulp = NULL;
353205066Skmacy			break;
354205066Skmacy		case IPPROTO_ROUTING:	/* RFC 2460 */
355261640Sglebius			PULLUP_TO(hlen, ulp, struct ip6_rthdr);
356205066Skmacy			hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
357205066Skmacy			proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
358205066Skmacy			ulp = NULL;
359205066Skmacy			break;
360205066Skmacy		case IPPROTO_FRAGMENT:	/* RFC 2460 */
361205066Skmacy			PULLUP_TO(hlen, ulp, struct ip6_frag);
362205066Skmacy			hlen += sizeof (struct ip6_frag);
363205066Skmacy			proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
364205066Skmacy			offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
365205066Skmacy			    IP6F_OFF_MASK;
366205066Skmacy			ulp = NULL;
367205066Skmacy			break;
368205066Skmacy		case IPPROTO_DSTOPTS:	/* RFC 2460 */
369205066Skmacy			PULLUP_TO(hlen, ulp, struct ip6_hbh);
370205066Skmacy			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
371205066Skmacy			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
372205066Skmacy			ulp = NULL;
373205066Skmacy			break;
374205066Skmacy		case IPPROTO_AH:	/* RFC 2402 */
375205066Skmacy			PULLUP_TO(hlen, ulp, struct ip6_ext);
376205066Skmacy			hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
377205066Skmacy			proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
378205066Skmacy			ulp = NULL;
379205066Skmacy			break;
380205066Skmacy		default:
381205066Skmacy			PULLUP_TO(hlen, ulp, struct ip6_ext);
382205066Skmacy			break;
383205066Skmacy		}
384205066Skmacy	}
385205066Skmacy
386262027Sglebius	bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
387262027Sglebius	bcopy(&ip6->ip6_src, &key[4], sizeof(struct in6_addr));
388262027Sglebius	key[8] = (dport << 16) | sport;
389262027Sglebius	fibnum |= proto << 16;
390205066Skmacy
391262027Sglebius	fle = flowtable_lookup_common(&V_ip6_ft, key, 9 * sizeof(uint32_t),
392262027Sglebius	    fibnum);
393262027Sglebius#else	/* !FLOWTABLE_HASH_ALL */
394262027Sglebius	bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
395262027Sglebius	fle = flowtable_lookup_common(&V_ip6_ft, key, sizeof(struct in6_addr),
396262027Sglebius	    fibnum);
397262027Sglebius#endif	/* FLOWTABLE_HASH_ALL */
398205066Skmacy
399262027Sglebius	if (fle == NULL)
400205066Skmacy		return (NULL);
401205066Skmacy
402205066Skmacy	sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
403205066Skmacy	sin6->sin6_family = AF_INET6;
404205066Skmacy	sin6->sin6_len = sizeof(*sin6);
405262027Sglebius	bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(struct in6_addr));
406262027Sglebius
407262027Sglebius	return (fle);
408205066Skmacy}
409205066Skmacy#endif /* INET6 */
410205066Skmacy
411191255Skmacystatic bitstr_t *
412191255Skmacyflowtable_mask(struct flowtable *ft)
413191255Skmacy{
414196368Skmacy
415262029Sglebius	/*
416261823Sglebius	 * flowtable_free_stale() calls w/o critical section, but
417261823Sglebius	 * with sched_bind(). Since pointer is stable throughout
418261823Sglebius	 * ft lifetime, it is safe, otherwise...
419261823Sglebius	 *
420261823Sglebius	 * CRITICAL_ASSERT(curthread);
421261823Sglebius	 */
422191255Skmacy
423261823Sglebius	return (*(bitstr_t **)zpcpu_get(ft->ft_masks));
424191255Skmacy}
425191255Skmacy
426261823Sglebiusstatic struct flist *
427261823Sglebiusflowtable_list(struct flowtable *ft, uint32_t hash)
428191255Skmacy{
429191255Skmacy
430261823Sglebius	CRITICAL_ASSERT(curthread);
431261823Sglebius	return (zpcpu_get(ft->ft_table[hash % ft->ft_size]));
432191255Skmacy}
433191255Skmacy
434191255Skmacystatic int
435262027Sglebiusflow_stale(struct flowtable *ft, struct flentry *fle, int maxidle)
436191255Skmacy{
437191255Skmacy
438290276Srrs	if (((fle->f_rt->rt_flags & RTF_UP) == 0) ||
439262027Sglebius	    (fle->f_rt->rt_ifp == NULL) ||
440262027Sglebius	    !RT_LINK_IS_UP(fle->f_rt->rt_ifp) ||
441262027Sglebius	    (fle->f_lle->la_flags & LLE_VALID) == 0)
442191255Skmacy		return (1);
443191255Skmacy
444262027Sglebius	if (time_uptime - fle->f_uptime > maxidle)
445262027Sglebius		return (1);
446191255Skmacy
447262027Sglebius#ifdef FLOWTABLE_HASH_ALL
448262027Sglebius	if (fle->f_flags & FL_STALE)
449191255Skmacy		return (1);
450262027Sglebius#endif
451191255Skmacy
452191255Skmacy	return (0);
453191255Skmacy}
454191255Skmacy
455191255Skmacystatic int
456262027Sglebiusflow_full(void)
457205488Skmacy{
458261601Sglebius	int count, max;
459261640Sglebius
460261823Sglebius	count = uma_zone_get_cur(flow_zone);
461261823Sglebius	max = uma_zone_get_max(flow_zone);
462205488Skmacy
463262027Sglebius	return (count > (max - (max >> 3)));
464205488Skmacy}
465205488Skmacy
466205488Skmacystatic int
467262027Sglebiusflow_matches(struct flentry *fle, uint32_t *key, int keylen, uint32_t fibnum)
468261883Sglebius{
469262027Sglebius#ifdef FLOWTABLE_HASH_ALL
470262027Sglebius	uint8_t proto;
471261883Sglebius
472262027Sglebius	proto = (fibnum >> 16) & 0xff;
473262027Sglebius	fibnum &= 0xffff;
474262027Sglebius#endif
475262027Sglebius
476262027Sglebius	CRITICAL_ASSERT(curthread);
477262027Sglebius
478262027Sglebius	/* Microoptimization for IPv4: don't use bcmp(). */
479290276Srrs	if (((keylen == sizeof(uint32_t) && (fle->f_key[0] == key[0])) ||
480262027Sglebius	    (bcmp(fle->f_key, key, keylen) == 0)) &&
481262027Sglebius	    fibnum == fle->f_fibnum &&
482262027Sglebius#ifdef FLOWTABLE_HASH_ALL
483262027Sglebius	    proto == fle->f_proto &&
484262027Sglebius#endif
485261883Sglebius	    (fle->f_rt->rt_flags & RTF_UP) &&
486261883Sglebius	    fle->f_rt->rt_ifp != NULL &&
487261883Sglebius	    (fle->f_lle->la_flags & LLE_VALID))
488261883Sglebius		return (1);
489261883Sglebius
490261883Sglebius	return (0);
491261883Sglebius}
492261883Sglebius
493261883Sglebiusstatic struct flentry *
494191255Skmacyflowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
495262027Sglebius    int keylen, uint32_t fibnum0)
496191255Skmacy{
497262027Sglebius#ifdef INET6
498262029Sglebius	struct route_in6 sro6;
499262027Sglebius#endif
500262027Sglebius#ifdef INET
501262029Sglebius	struct route sro;
502262027Sglebius#endif
503262027Sglebius	struct route *ro = NULL;
504262027Sglebius	struct rtentry *rt;
505262027Sglebius	struct lltable *lt = NULL;
506262027Sglebius	struct llentry *lle;
507262027Sglebius	struct sockaddr_storage *l3addr;
508262027Sglebius	struct ifnet *ifp;
509261823Sglebius	struct flist *flist;
510261823Sglebius	struct flentry *fle, *iter;
511261883Sglebius	bitstr_t *mask;
512262027Sglebius	uint16_t fibnum = fibnum0;
513262027Sglebius#ifdef FLOWTABLE_HASH_ALL
514261883Sglebius	uint8_t proto;
515191255Skmacy
516262027Sglebius	proto = (fibnum0 >> 16) & 0xff;
517262027Sglebius	fibnum = fibnum0 & 0xffff;
518262027Sglebius#endif
519262027Sglebius
520262027Sglebius	/*
521262027Sglebius	 * This bit of code ends up locking the
522262027Sglebius	 * same route 3 times (just like ip_output + ether_output)
523262027Sglebius	 * - at lookup
524262027Sglebius	 * - in rt_check when called by arpresolve
525262027Sglebius	 * - dropping the refcount for the rtentry
526262027Sglebius	 *
527262027Sglebius	 * This could be consolidated to one if we wrote a variant
528262027Sglebius	 * of arpresolve with an rt_check variant that expected to
529262027Sglebius	 * receive the route locked
530262027Sglebius	 */
531262027Sglebius#ifdef INET
532262027Sglebius	if (ft == &V_ip4_ft) {
533262027Sglebius		struct sockaddr_in *sin;
534262027Sglebius
535262027Sglebius		ro = &sro;
536262027Sglebius		bzero(&sro.ro_dst, sizeof(sro.ro_dst));
537262027Sglebius
538262027Sglebius		sin = (struct sockaddr_in *)&sro.ro_dst;
539262027Sglebius		sin->sin_family = AF_INET;
540262027Sglebius		sin->sin_len = sizeof(*sin);
541262027Sglebius		sin->sin_addr.s_addr = key[0];
542262027Sglebius	}
543262027Sglebius#endif
544262027Sglebius#ifdef INET6
545262027Sglebius	if (ft == &V_ip6_ft) {
546262027Sglebius		struct sockaddr_in6 *sin6;
547262027Sglebius
548262027Sglebius		ro = (struct route *)&sro6;
549262027Sglebius		sin6 = &sro6.ro_dst;
550262027Sglebius
551262027Sglebius		bzero(sin6, sizeof(*sin6));
552262027Sglebius		sin6->sin6_family = AF_INET6;
553262027Sglebius		sin6->sin6_len = sizeof(*sin6);
554262027Sglebius		bcopy(key, &sin6->sin6_addr, sizeof(struct in6_addr));
555262027Sglebius	}
556262027Sglebius#endif
557262027Sglebius
558262027Sglebius	ro->ro_rt = NULL;
559262027Sglebius#ifdef RADIX_MPATH
560262027Sglebius	rtalloc_mpath_fib(ro, hash, fibnum);
561262027Sglebius#else
562262027Sglebius	rtalloc_ign_fib(ro, 0, fibnum);
563262027Sglebius#endif
564262027Sglebius	if (ro->ro_rt == NULL)
565262027Sglebius		return (NULL);
566262027Sglebius
567262027Sglebius	rt = ro->ro_rt;
568262027Sglebius	ifp = rt->rt_ifp;
569262027Sglebius
570262027Sglebius	if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
571262027Sglebius		RTFREE(rt);
572262027Sglebius		return (NULL);
573262027Sglebius	}
574262027Sglebius
575262027Sglebius#ifdef INET
576262027Sglebius	if (ft == &V_ip4_ft)
577262027Sglebius		lt = LLTABLE(ifp);
578262027Sglebius#endif
579262027Sglebius#ifdef INET6
580262027Sglebius	if (ft == &V_ip6_ft)
581262027Sglebius		lt = LLTABLE6(ifp);
582262027Sglebius#endif
583262027Sglebius
584262027Sglebius	if (rt->rt_flags & RTF_GATEWAY)
585262027Sglebius		l3addr = (struct sockaddr_storage *)rt->rt_gateway;
586262027Sglebius	else
587262027Sglebius		l3addr = (struct sockaddr_storage *)&ro->ro_dst;
588262027Sglebius	lle = llentry_alloc(ifp, lt, l3addr);
589262027Sglebius
590262027Sglebius	if (lle == NULL) {
591262027Sglebius		RTFREE(rt);
592262027Sglebius		return (NULL);
593262027Sglebius	}
594262027Sglebius
595262027Sglebius	/* Don't insert the entry if the ARP hasn't yet finished resolving. */
596262027Sglebius	if ((lle->la_flags & LLE_VALID) == 0) {
597262027Sglebius		RTFREE(rt);
598262027Sglebius		LLE_FREE(lle);
599262027Sglebius		FLOWSTAT_INC(ft, ft_fail_lle_invalid);
600262027Sglebius		return (NULL);
601262027Sglebius	}
602262027Sglebius
603261823Sglebius	fle = uma_zalloc(flow_zone, M_NOWAIT | M_ZERO);
604262027Sglebius	if (fle == NULL) {
605262027Sglebius		RTFREE(rt);
606262027Sglebius		LLE_FREE(lle);
607261883Sglebius		return (NULL);
608262027Sglebius	}
609191255Skmacy
610262027Sglebius	fle->f_hash = hash;
611262027Sglebius	bcopy(key, &fle->f_key, keylen);
612262027Sglebius	fle->f_rt = rt;
613262027Sglebius	fle->f_lle = lle;
614261823Sglebius	fle->f_fibnum = fibnum;
615261823Sglebius	fle->f_uptime = time_uptime;
616262027Sglebius#ifdef FLOWTABLE_HASH_ALL
617262027Sglebius	fle->f_proto = proto;
618262027Sglebius	fle->f_flags = fibnum0 >> 24;
619262027Sglebius#endif
620205066Skmacy
621261823Sglebius	critical_enter();
622191255Skmacy	mask = flowtable_mask(ft);
623261823Sglebius	flist = flowtable_list(ft, hash);
624191255Skmacy
625261823Sglebius	if (SLIST_EMPTY(flist)) {
626261823Sglebius		bit_set(mask, (hash % ft->ft_size));
627261823Sglebius		SLIST_INSERT_HEAD(flist, fle, f_next);
628191255Skmacy		goto skip;
629261582Sglebius	}
630261640Sglebius
631191255Skmacy	/*
632191255Skmacy	 * find end of list and make sure that we were not
633191255Skmacy	 * preempted by another thread handling this flow
634191255Skmacy	 */
635261823Sglebius	SLIST_FOREACH(iter, flist, f_next) {
636262162Sglebius		KASSERT(iter->f_hash % ft->ft_size == hash % ft->ft_size,
637262162Sglebius		    ("%s: wrong hash", __func__));
638262027Sglebius		if (flow_matches(iter, key, keylen, fibnum)) {
639191255Skmacy			/*
640261883Sglebius			 * We probably migrated to an other CPU after
641261883Sglebius			 * lookup in flowtable_lookup_common() failed.
642261883Sglebius			 * It appeared that this CPU already has flow
643261883Sglebius			 * entry.
644191255Skmacy			 */
645261883Sglebius			iter->f_uptime = time_uptime;
646262027Sglebius#ifdef FLOWTABLE_HASH_ALL
647262027Sglebius			iter->f_flags |= fibnum >> 24;
648262027Sglebius#endif
649261823Sglebius			critical_exit();
650261883Sglebius			FLOWSTAT_INC(ft, ft_collisions);
651261823Sglebius			uma_zfree(flow_zone, fle);
652261883Sglebius			return (iter);
653191255Skmacy		}
654261582Sglebius	}
655191255Skmacy
656261823Sglebius	SLIST_INSERT_HEAD(flist, fle, f_next);
657191255Skmacyskip:
658261823Sglebius	critical_exit();
659261883Sglebius	FLOWSTAT_INC(ft, ft_inserts);
660191255Skmacy
661261883Sglebius	return (fle);
662191255Skmacy}
663191255Skmacy
664262027Sglebiusint
665262027Sglebiusflowtable_lookup(sa_family_t sa, struct mbuf *m, struct route *ro)
666191255Skmacy{
667262027Sglebius	struct flentry *fle;
668292978Smelifaro	struct llentry *lle;
669205066Skmacy
670262027Sglebius	if (V_flowtable_enable == 0)
671262027Sglebius		return (ENXIO);
672262027Sglebius
673261601Sglebius	switch (sa) {
674205066Skmacy#ifdef INET
675261601Sglebius	case AF_INET:
676262027Sglebius		fle = flowtable_lookup_ipv4(m, ro);
677262027Sglebius		break;
678205066Skmacy#endif
679205066Skmacy#ifdef INET6
680261601Sglebius	case AF_INET6:
681262027Sglebius		fle = flowtable_lookup_ipv6(m, ro);
682262027Sglebius		break;
683261601Sglebius#endif
684261601Sglebius	default:
685261601Sglebius		panic("%s: sa %d", __func__, sa);
686205066Skmacy	}
687262027Sglebius
688262027Sglebius	if (fle == NULL)
689262027Sglebius		return (EHOSTUNREACH);
690262027Sglebius
691275358Shselasky	if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE) {
692301538Ssephe		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE_HASH);
693262027Sglebius		m->m_pkthdr.flowid = fle->f_hash;
694262027Sglebius	}
695262027Sglebius
696262027Sglebius	ro->ro_rt = fle->f_rt;
697262027Sglebius	ro->ro_flags |= RT_NORTREF;
698292978Smelifaro	lle = fle->f_lle;
699301217Sgnn	if (lle != NULL && (lle->la_flags & LLE_VALID))
700301217Sgnn		ro->ro_lle = lle;	/* share ref with fle->f_lle */
701262027Sglebius
702262027Sglebius	return (0);
703205066Skmacy}
704261601Sglebius
705261601Sglebiusstatic struct flentry *
706262027Sglebiusflowtable_lookup_common(struct flowtable *ft, uint32_t *key, int keylen,
707262027Sglebius    uint32_t fibnum)
708205066Skmacy{
709261823Sglebius	struct flist *flist;
710191255Skmacy	struct flentry *fle;
711262027Sglebius	uint32_t hash;
712191255Skmacy
713262027Sglebius	FLOWSTAT_INC(ft, ft_lookups);
714261601Sglebius
715262027Sglebius	hash = jenkins_hash32(key, keylen / sizeof(uint32_t), flow_hashjitter);
716261601Sglebius
717261823Sglebius	critical_enter();
718261823Sglebius	flist = flowtable_list(ft, hash);
719262027Sglebius	SLIST_FOREACH(fle, flist, f_next) {
720262162Sglebius		KASSERT(fle->f_hash % ft->ft_size == hash % ft->ft_size,
721262162Sglebius		    ("%s: wrong hash", __func__));
722262027Sglebius		if (flow_matches(fle, key, keylen, fibnum)) {
723261823Sglebius			fle->f_uptime = time_uptime;
724262027Sglebius#ifdef FLOWTABLE_HASH_ALL
725262027Sglebius			fle->f_flags |= fibnum >> 24;
726262027Sglebius#endif
727261823Sglebius			critical_exit();
728261823Sglebius			FLOWSTAT_INC(ft, ft_hits);
729262027Sglebius			return (fle);
730261823Sglebius		}
731262027Sglebius	}
732261823Sglebius	critical_exit();
733261823Sglebius
734261601Sglebius	FLOWSTAT_INC(ft, ft_misses);
735191255Skmacy
736262027Sglebius	return (flowtable_insert(ft, hash, key, keylen, fibnum));
737191255Skmacy}
738191255Skmacy
739261601Sglebiusstatic void
740261601Sglebiusflowtable_alloc(struct flowtable *ft)
741191255Skmacy{
742302378Snwhitehorn	int i;
743191255Skmacy
744261823Sglebius	ft->ft_table = malloc(ft->ft_size * sizeof(struct flist),
745261823Sglebius	    M_FTABLE, M_WAITOK);
746261823Sglebius	for (int i = 0; i < ft->ft_size; i++)
747261823Sglebius		ft->ft_table[i] = uma_zalloc(pcpu_zone_ptr, M_WAITOK | M_ZERO);
748191255Skmacy
749261823Sglebius	ft->ft_masks = uma_zalloc(pcpu_zone_ptr, M_WAITOK);
750302372Snwhitehorn	CPU_FOREACH(i) {
751261823Sglebius		bitstr_t **b;
752261640Sglebius
753261823Sglebius		b = zpcpu_get_cpu(ft->ft_masks, i);
754299090Sasomers		*b = bit_alloc(ft->ft_size, M_FTABLE, M_WAITOK);
755191255Skmacy	}
756299090Sasomers	ft->ft_tmpmask = bit_alloc(ft->ft_size, M_FTABLE, M_WAITOK);
757191255Skmacy}
758191255Skmacy
759191255Skmacystatic void
760262027Sglebiusflowtable_free_stale(struct flowtable *ft, struct rtentry *rt, int maxidle)
761191255Skmacy{
762261823Sglebius	struct flist *flist, freelist;
763261823Sglebius	struct flentry *fle, *fle1, *fleprev;
764191324Skmacy	bitstr_t *mask, *tmpmask;
765261823Sglebius	int curbit, tmpsize;
766205066Skmacy
767261823Sglebius	SLIST_INIT(&freelist);
768191255Skmacy	mask = flowtable_mask(ft);
769191324Skmacy	tmpmask = ft->ft_tmpmask;
770256563Semax	tmpsize = ft->ft_size;
771191324Skmacy	memcpy(tmpmask, mask, ft->ft_size/8);
772261823Sglebius	curbit = 0;
773262770Sglebius	fleprev = NULL; /* pacify gcc */
774191324Skmacy	/*
775191324Skmacy	 * XXX Note to self, bit_ffs operates at the byte level
776191324Skmacy	 * and thus adds gratuitous overhead
777191324Skmacy	 */
778191324Skmacy	bit_ffs(tmpmask, ft->ft_size, &curbit);
779191324Skmacy	while (curbit != -1) {
780191257Skmacy		if (curbit >= ft->ft_size || curbit < -1) {
781191257Skmacy			log(LOG_ALERT,
782191257Skmacy			    "warning: bad curbit value %d \n",
783191255Skmacy			    curbit);
784191257Skmacy			break;
785191255Skmacy		}
786205066Skmacy
787261823Sglebius		FLOWSTAT_INC(ft, ft_free_checks);
788191255Skmacy
789261823Sglebius		critical_enter();
790261823Sglebius		flist = flowtable_list(ft, curbit);
791191257Skmacy#ifdef DIAGNOSTIC
792261823Sglebius		if (SLIST_EMPTY(flist) && curbit > 0) {
793191257Skmacy			log(LOG_ALERT,
794191257Skmacy			    "warning bit=%d set, but no fle found\n",
795191257Skmacy			    curbit);
796191255Skmacy		}
797261640Sglebius#endif
798261823Sglebius		SLIST_FOREACH_SAFE(fle, flist, f_next, fle1) {
799261823Sglebius			if (rt != NULL && fle->f_rt != rt) {
800191255Skmacy				fleprev = fle;
801191255Skmacy				continue;
802191255Skmacy			}
803262027Sglebius			if (!flow_stale(ft, fle, maxidle)) {
804261823Sglebius				fleprev = fle;
805261823Sglebius				continue;
806191255Skmacy			}
807205066Skmacy
808261823Sglebius			if (fle == SLIST_FIRST(flist))
809261823Sglebius				SLIST_REMOVE_HEAD(flist, f_next);
810261823Sglebius			else
811261823Sglebius				SLIST_REMOVE_AFTER(fleprev, f_next);
812261823Sglebius			SLIST_INSERT_HEAD(&freelist, fle, f_next);
813191255Skmacy		}
814261823Sglebius		if (SLIST_EMPTY(flist))
815191255Skmacy			bit_clear(mask, curbit);
816261823Sglebius		critical_exit();
817261823Sglebius
818191324Skmacy		bit_clear(tmpmask, curbit);
819256563Semax		bit_ffs(tmpmask, tmpsize, &curbit);
820191255Skmacy	}
821261823Sglebius
822261823Sglebius	SLIST_FOREACH_SAFE(fle, &freelist, f_next, fle1) {
823261601Sglebius		FLOWSTAT_INC(ft, ft_frees);
824261823Sglebius		if (fle->f_rt != NULL)
825261823Sglebius			RTFREE(fle->f_rt);
826261823Sglebius		if (fle->f_lle != NULL)
827261823Sglebius			LLE_FREE(fle->f_lle);
828261823Sglebius		uma_zfree(flow_zone, fle);
829191255Skmacy	}
830191255Skmacy}
831191255Skmacy
832261823Sglebiusstatic void
833262027Sglebiusflowtable_clean_vnet(struct flowtable *ft, struct rtentry *rt, int maxidle)
834261823Sglebius{
835261823Sglebius	int i;
836261823Sglebius
837261823Sglebius	CPU_FOREACH(i) {
838261823Sglebius		if (smp_started == 1) {
839261823Sglebius			thread_lock(curthread);
840261823Sglebius			sched_bind(curthread, i);
841261823Sglebius			thread_unlock(curthread);
842261823Sglebius		}
843261823Sglebius
844262027Sglebius		flowtable_free_stale(ft, rt, maxidle);
845261823Sglebius
846261823Sglebius		if (smp_started == 1) {
847261823Sglebius			thread_lock(curthread);
848261823Sglebius			sched_unbind(curthread);
849261823Sglebius			thread_unlock(curthread);
850261823Sglebius		}
851261823Sglebius	}
852261823Sglebius}
853261823Sglebius
854197687Sqinglivoid
855261601Sglebiusflowtable_route_flush(sa_family_t sa, struct rtentry *rt)
856197687Sqingli{
857261601Sglebius	struct flowtable *ft;
858205066Skmacy
859261601Sglebius	switch (sa) {
860261601Sglebius#ifdef INET
861261601Sglebius	case AF_INET:
862261601Sglebius		ft = &V_ip4_ft;
863261601Sglebius		break;
864261601Sglebius#endif
865261601Sglebius#ifdef INET6
866261601Sglebius	case AF_INET6:
867261601Sglebius		ft = &V_ip6_ft;
868261601Sglebius		break;
869261601Sglebius#endif
870261601Sglebius	default:
871261601Sglebius		panic("%s: sa %d", __func__, sa);
872261601Sglebius	}
873261601Sglebius
874262027Sglebius	flowtable_clean_vnet(ft, rt, 0);
875197687Sqingli}
876197687Sqingli
877191255Skmacystatic void
878194660Szecflowtable_cleaner(void)
879194660Szec{
880194660Szec	VNET_ITERATOR_DECL(vnet_iter);
881217076Sjhb	struct thread *td;
882194660Szec
883194660Szec	if (bootverbose)
884194660Szec		log(LOG_INFO, "flowtable cleaner started\n");
885217076Sjhb	td = curthread;
886194660Szec	while (1) {
887262027Sglebius		uint32_t flowclean_freq, maxidle;
888262027Sglebius
889262027Sglebius		/*
890262027Sglebius		 * The maximum idle time, as well as frequency are arbitrary.
891262027Sglebius		 */
892262027Sglebius		if (flow_full())
893262027Sglebius			maxidle = 5;
894262027Sglebius		else
895262027Sglebius			maxidle = 30;
896262027Sglebius
897194660Szec		VNET_LIST_RLOCK();
898194660Szec		VNET_FOREACH(vnet_iter) {
899194660Szec			CURVNET_SET(vnet_iter);
900261601Sglebius#ifdef INET
901262027Sglebius			flowtable_clean_vnet(&V_ip4_ft, NULL, maxidle);
902261601Sglebius#endif
903261601Sglebius#ifdef INET6
904262027Sglebius			flowtable_clean_vnet(&V_ip6_ft, NULL, maxidle);
905261601Sglebius#endif
906194660Szec			CURVNET_RESTORE();
907194660Szec		}
908194660Szec		VNET_LIST_RUNLOCK();
909194660Szec
910262027Sglebius		if (flow_full())
911262027Sglebius			flowclean_freq = 4*hz;
912262027Sglebius		else
913262027Sglebius			flowclean_freq = 20*hz;
914196368Skmacy		mtx_lock(&flowclean_lock);
915217076Sjhb		thread_lock(td);
916217076Sjhb		sched_prio(td, PPAUSE);
917217076Sjhb		thread_unlock(td);
918216855Sbz		flowclean_cycles++;
919216855Sbz		cv_broadcast(&flowclean_f_cv);
920216855Sbz		cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq);
921196368Skmacy		mtx_unlock(&flowclean_lock);
922191255Skmacy	}
923191255Skmacy}
924191255Skmacy
925196368Skmacystatic void
926196368Skmacyflowtable_flush(void *unused __unused)
927196368Skmacy{
928196368Skmacy	uint64_t start;
929205066Skmacy
930196368Skmacy	mtx_lock(&flowclean_lock);
931196368Skmacy	start = flowclean_cycles;
932196368Skmacy	while (start == flowclean_cycles) {
933216855Sbz		cv_broadcast(&flowclean_c_cv);
934216855Sbz		cv_wait(&flowclean_f_cv, &flowclean_lock);
935196368Skmacy	}
936196368Skmacy	mtx_unlock(&flowclean_lock);
937196368Skmacy}
938196368Skmacy
939191255Skmacystatic struct kproc_desc flow_kp = {
940191255Skmacy	"flowcleaner",
941191255Skmacy	flowtable_cleaner,
942191255Skmacy	&flowcleanerproc
943191255Skmacy};
944191255SkmacySYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
945193863Skmacy
946261601Sglebiusstatic int
947261601Sglebiusflowtable_get_size(char *name)
948196368Skmacy{
949261601Sglebius	int size;
950196368Skmacy
951261601Sglebius	if (TUNABLE_INT_FETCH(name, &size)) {
952261601Sglebius		if (size < 256)
953261601Sglebius			size = 256;
954261601Sglebius		if (!powerof2(size)) {
955261601Sglebius			printf("%s must be power of 2\n", name);
956261601Sglebius			size = 2048;
957261601Sglebius		}
958261601Sglebius	} else {
959261601Sglebius		/*
960261601Sglebius		 * round up to the next power of 2
961261601Sglebius		 */
962261601Sglebius		size = 1 << fls((1024 + maxusers * 64) - 1);
963261601Sglebius	}
964261601Sglebius
965261601Sglebius	return (size);
966196368Skmacy}
967196368Skmacy
968196368Skmacystatic void
969196368Skmacyflowtable_init(const void *unused __unused)
970196368Skmacy{
971196368Skmacy
972261601Sglebius	flow_hashjitter = arc4random();
973261601Sglebius
974261823Sglebius	flow_zone = uma_zcreate("flows", sizeof(struct flentry),
975262027Sglebius	    NULL, NULL, NULL, NULL, (64-1), UMA_ZONE_MAXBUCKET);
976261823Sglebius	uma_zone_set_max(flow_zone, 1024 + maxusers * 64 * mp_ncpus);
977261601Sglebius
978216855Sbz	cv_init(&flowclean_c_cv, "c_flowcleanwait");
979216855Sbz	cv_init(&flowclean_f_cv, "f_flowcleanwait");
980196368Skmacy	mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
981196368Skmacy	EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
982196368Skmacy	    EVENTHANDLER_PRI_ANY);
983196368Skmacy}
984261601SglebiusSYSINIT(flowtable_init, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
985196368Skmacy    flowtable_init, NULL);
986196368Skmacy
#ifdef INET
/* The ip4 node below _net_flowtable; the statistics are attached to it. */
static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip4, CTLFLAG_RD, NULL,
    "Flowtable for IPv4");

/* Per-vnet, per-CPU IPv4 flowtable counters and their sysctl export. */
static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip4_ftstat);
VNET_PCPUSTAT_SYSINIT(ip4_ftstat);
VNET_PCPUSTAT_SYSUNINIT(ip4_ftstat);
SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip4, OID_AUTO, stat, struct flowtable_stat,
    ip4_ftstat, "Flowtable statistics for IPv4 "
    "(struct flowtable_stat, net/flowtable.h)");

/*
 * Per-vnet initialization of the IPv4 flowtable: pick the table size,
 * attach the statistics counters and allocate the table storage.
 */
static void
flowtable_init_vnet_v4(const void *unused __unused)
{
	struct flowtable *ft4 = &V_ip4_ft;

	ft4->ft_size = flowtable_get_size("net.flowtable.ip4.size");
	ft4->ft_stat = VNET(ip4_ftstat);
	flowtable_alloc(ft4);
}
VNET_SYSINIT(ft_vnet_v4, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
    flowtable_init_vnet_v4, NULL);
#endif /* INET */
1009196368Skmacy
#ifdef INET6
/* The ip6 node below _net_flowtable; the statistics are attached to it. */
static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip6, CTLFLAG_RD, NULL,
    "Flowtable for IPv6");

/* Per-vnet, per-CPU IPv6 flowtable counters and their sysctl export. */
static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip6_ftstat);
VNET_PCPUSTAT_SYSINIT(ip6_ftstat);
VNET_PCPUSTAT_SYSUNINIT(ip6_ftstat);
SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip6, OID_AUTO, stat, struct flowtable_stat,
    ip6_ftstat, "Flowtable statistics for IPv6 "
    "(struct flowtable_stat, net/flowtable.h)");

/*
 * Per-vnet initialization of the IPv6 flowtable: pick the table size,
 * attach the statistics counters and allocate the table storage.
 */
static void
flowtable_init_vnet_v6(const void *unused __unused)
{
	struct flowtable *ft6 = &V_ip6_ft;

	ft6->ft_size = flowtable_get_size("net.flowtable.ip6.size");
	ft6->ft_stat = VNET(ip6_ftstat);
	flowtable_alloc(ft6);
}
VNET_SYSINIT(flowtable_init_vnet_v6, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
    flowtable_init_vnet_v6, NULL);
#endif /* INET6 */
1032261601Sglebius
1033196368Skmacy#ifdef DDB
1034196368Skmacystatic bitstr_t *
1035196368Skmacyflowtable_mask_pcpu(struct flowtable *ft, int cpuid)
1036196368Skmacy{
1037196368Skmacy
1038261823Sglebius	return (zpcpu_get_cpu(*ft->ft_masks, cpuid));
1039196368Skmacy}
1040196368Skmacy
1041261823Sglebiusstatic struct flist *
1042261823Sglebiusflowtable_list_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
1043196368Skmacy{
1044196368Skmacy
1045261823Sglebius	return (zpcpu_get_cpu(&ft->ft_table[hash % ft->ft_size], cpuid));
1046196368Skmacy}
1047196368Skmacy
1048196368Skmacystatic void
1049196368Skmacyflow_show(struct flowtable *ft, struct flentry *fle)
1050196368Skmacy{
1051196368Skmacy	int idle_time;
1052205066Skmacy	int rt_valid, ifp_valid;
1053205066Skmacy	volatile struct rtentry *rt;
1054205066Skmacy	struct ifnet *ifp = NULL;
1055262027Sglebius	uint32_t *hashkey = fle->f_key;
1056196368Skmacy
1057196368Skmacy	idle_time = (int)(time_uptime - fle->f_uptime);
1058205066Skmacy	rt = fle->f_rt;
1059205066Skmacy	rt_valid = rt != NULL;
1060261582Sglebius	if (rt_valid)
1061205066Skmacy		ifp = rt->rt_ifp;
1062205066Skmacy	ifp_valid = ifp != NULL;
1063205066Skmacy
1064262027Sglebius#ifdef INET
1065262027Sglebius	if (ft == &V_ip4_ft) {
1066262027Sglebius		char daddr[4*sizeof "123"];
1067262027Sglebius#ifdef FLOWTABLE_HASH_ALL
1068262027Sglebius		char saddr[4*sizeof "123"];
1069262027Sglebius		uint16_t sport, dport;
1070262027Sglebius#endif
1071262027Sglebius
1072262027Sglebius		inet_ntoa_r(*(struct in_addr *) &hashkey[0], daddr);
1073262027Sglebius#ifdef FLOWTABLE_HASH_ALL
1074261640Sglebius		inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
1075262027Sglebius		dport = ntohs((uint16_t)(hashkey[2] >> 16));
1076262027Sglebius		sport = ntohs((uint16_t)(hashkey[2] & 0xffff));
1077262027Sglebius		db_printf("%s:%d->%s:%d", saddr, sport, daddr, dport);
1078262027Sglebius#else
1079205066Skmacy		db_printf("%s ", daddr);
1080262027Sglebius#endif
1081262027Sglebius	}
1082262027Sglebius#endif /* INET */
1083262027Sglebius#ifdef INET6
1084262027Sglebius	if (ft == &V_ip6_ft) {
1085262027Sglebius#ifdef FLOWTABLE_HASH_ALL
1086262027Sglebius		db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
1087262027Sglebius		    hashkey[0], hashkey[1], hashkey[2],
1088262027Sglebius		    hashkey[3], hashkey[4], hashkey[5],
1089262027Sglebius		    hashkey[6], hashkey[7], hashkey[8]);
1090262027Sglebius#else
1091262027Sglebius		db_printf("\n\tkey=%08x:%08x:%08x ",
1092262027Sglebius		    hashkey[0], hashkey[1], hashkey[2]);
1093262027Sglebius#endif
1094262027Sglebius	}
1095262027Sglebius#endif /* INET6 */
1096261582Sglebius
1097262027Sglebius	db_printf("hash=%08x idle_time=%03d"
1098262027Sglebius	    "\n\tfibnum=%02d rt=%p",
1099262027Sglebius	    fle->f_hash, idle_time, fle->f_fibnum, fle->f_rt);
1100262027Sglebius
1101262027Sglebius#ifdef FLOWTABLE_HASH_ALL
1102196368Skmacy	if (fle->f_flags & FL_STALE)
1103196368Skmacy		db_printf(" FL_STALE ");
1104262027Sglebius#endif
1105205066Skmacy	if (rt_valid) {
1106205066Skmacy		if (rt->rt_flags & RTF_UP)
1107205066Skmacy			db_printf(" RTF_UP ");
1108205066Skmacy	}
1109205066Skmacy	if (ifp_valid) {
1110205066Skmacy		if (ifp->if_flags & IFF_LOOPBACK)
1111205066Skmacy			db_printf(" IFF_LOOPBACK ");
1112205066Skmacy		if (ifp->if_flags & IFF_UP)
1113261640Sglebius			db_printf(" IFF_UP ");
1114205066Skmacy		if (ifp->if_flags & IFF_POINTOPOINT)
1115261640Sglebius			db_printf(" IFF_POINTOPOINT ");
1116205066Skmacy	}
1117196368Skmacy	db_printf("\n");
1118196368Skmacy}
1119196368Skmacy
1120196368Skmacystatic void
1121196368Skmacyflowtable_show(struct flowtable *ft, int cpuid)
1122196368Skmacy{
1123196368Skmacy	int curbit = 0;
1124196368Skmacy	bitstr_t *mask, *tmpmask;
1125196368Skmacy
1126205066Skmacy	if (cpuid != -1)
1127205066Skmacy		db_printf("cpu: %d\n", cpuid);
1128196368Skmacy	mask = flowtable_mask_pcpu(ft, cpuid);
1129196368Skmacy	tmpmask = ft->ft_tmpmask;
1130196368Skmacy	memcpy(tmpmask, mask, ft->ft_size/8);
1131196368Skmacy	/*
1132196368Skmacy	 * XXX Note to self, bit_ffs operates at the byte level
1133196368Skmacy	 * and thus adds gratuitous overhead
1134196368Skmacy	 */
1135196368Skmacy	bit_ffs(tmpmask, ft->ft_size, &curbit);
1136196368Skmacy	while (curbit != -1) {
1137261823Sglebius		struct flist *flist;
1138261823Sglebius		struct flentry *fle;
1139261823Sglebius
1140196368Skmacy		if (curbit >= ft->ft_size || curbit < -1) {
1141196368Skmacy			db_printf("warning: bad curbit value %d \n",
1142196368Skmacy			    curbit);
1143196368Skmacy			break;
1144196368Skmacy		}
1145196368Skmacy
1146261823Sglebius		flist = flowtable_list_pcpu(ft, curbit, cpuid);
1147196368Skmacy
1148261823Sglebius		SLIST_FOREACH(fle, flist, f_next)
1149196368Skmacy			flow_show(ft, fle);
1150196368Skmacy		bit_clear(tmpmask, curbit);
1151196368Skmacy		bit_ffs(tmpmask, ft->ft_size, &curbit);
1152196368Skmacy	}
1153196368Skmacy}
1154196368Skmacy
1155196368Skmacystatic void
1156261601Sglebiusflowtable_show_vnet(struct flowtable *ft)
1157196368Skmacy{
1158196368Skmacy
1159261823Sglebius	int i;
1160261601Sglebius
1161261823Sglebius	CPU_FOREACH(i)
1162261823Sglebius		flowtable_show(ft, i);
1163196368Skmacy}
1164196368Skmacy
1165196368SkmacyDB_SHOW_COMMAND(flowtables, db_show_flowtables)
1166196368Skmacy{
1167196368Skmacy	VNET_ITERATOR_DECL(vnet_iter);
1168196368Skmacy
1169196368Skmacy	VNET_FOREACH(vnet_iter) {
1170196368Skmacy		CURVNET_SET(vnet_iter);
1171216856Sbz#ifdef VIMAGE
1172216856Sbz		db_printf("vnet %p\n", vnet_iter);
1173216856Sbz#endif
1174261601Sglebius#ifdef INET
1175261601Sglebius		printf("IPv4:\n");
1176261601Sglebius		flowtable_show_vnet(&V_ip4_ft);
1177261601Sglebius#endif
1178261601Sglebius#ifdef INET6
1179261601Sglebius		printf("IPv6:\n");
1180261601Sglebius		flowtable_show_vnet(&V_ip6_ft);
1181261601Sglebius#endif
1182196368Skmacy		CURVNET_RESTORE();
1183196368Skmacy	}
1184196368Skmacy}
1185196368Skmacy#endif
1186