ip_state.c revision 64580
1/*
2 * Copyright (C) 1995-2000 by Darren Reed.
3 *
4 * Redistribution and use in source and binary forms are permitted
5 * provided that this notice is preserved and due credit is given
6 * to the original author and the contributors.
7 */
8#if !defined(lint)
9static const char sccsid[] = "@(#)ip_state.c	1.8 6/5/96 (C) 1993-1995 Darren Reed";
10static const char rcsid[] = "@(#)$FreeBSD: head/sys/contrib/ipfilter/netinet/ip_state.c 64580 2000-08-13 04:31:06Z darrenr $";
11#endif
12
13#include <sys/errno.h>
14#include <sys/types.h>
15#include <sys/param.h>
16#include <sys/file.h>
17#if defined(__NetBSD__) && (NetBSD >= 199905) && !defined(IPFILTER_LKM) && \
18    defined(_KERNEL)
19# include "opt_ipfilter_log.h"
20#endif
21#if defined(_KERNEL) && defined(__FreeBSD_version) && \
22    (__FreeBSD_version >= 400000) && !defined(KLD_MODULE)
23#include "opt_inet6.h"
24#endif
25#if !defined(_KERNEL) && !defined(KERNEL) && !defined(__KERNEL__)
26# include <stdio.h>
27# include <stdlib.h>
28# include <string.h>
29#else
30# ifdef linux
31#  include <linux/kernel.h>
32#  include <linux/module.h>
33# endif
34#endif
35#if (defined(KERNEL) || defined(_KERNEL)) && (__FreeBSD_version >= 220000)
36# include <sys/filio.h>
37# include <sys/fcntl.h>
38# if (__FreeBSD_version >= 300000) && !defined(IPFILTER_LKM)
39#  include "opt_ipfilter.h"
40# endif
41#else
42# include <sys/ioctl.h>
43#endif
44#include <sys/time.h>
45#include <sys/uio.h>
46#ifndef linux
47# include <sys/protosw.h>
48#endif
49#include <sys/socket.h>
50#if (defined(_KERNEL) || defined(KERNEL)) && !defined(linux)
51# include <sys/systm.h>
52#endif
53#if !defined(__SVR4) && !defined(__svr4__)
54# ifndef linux
55#  include <sys/mbuf.h>
56# endif
57#else
58# include <sys/filio.h>
59# include <sys/byteorder.h>
60# ifdef _KERNEL
61#  include <sys/dditypes.h>
62# endif
63# include <sys/stream.h>
64# include <sys/kmem.h>
65#endif
66
67#include <net/if.h>
68#ifdef sun
69# include <net/af.h>
70#endif
71#include <net/route.h>
72#include <netinet/in.h>
73#include <netinet/in_systm.h>
74#include <netinet/ip.h>
75#include <netinet/tcp.h>
76#ifndef linux
77# include <netinet/ip_var.h>
78# include <netinet/tcp_fsm.h>
79#endif
80#include <netinet/udp.h>
81#include <netinet/ip_icmp.h>
82#include "netinet/ip_compat.h"
83#include <netinet/tcpip.h>
84#include "netinet/ip_fil.h"
85#include "netinet/ip_nat.h"
86#include "netinet/ip_frag.h"
87#include "netinet/ip_proxy.h"
88#include "netinet/ip_state.h"
89#ifdef	USE_INET6
90#include <netinet/icmp6.h>
91#endif
92#if (__FreeBSD_version >= 300000)
93# include <sys/malloc.h>
94# if (defined(_KERNEL) || defined(KERNEL)) && !defined(IPFILTER_LKM)
95#  include <sys/libkern.h>
96#  include <sys/systm.h>
97# endif
98#endif
99
100#ifndef	MIN
101# define	MIN(a,b)	(((a)<(b))?(a):(b))
102#endif
103
104#define	TCP_CLOSE	(TH_FIN|TH_RST)
105
106static ipstate_t **ips_table = NULL;
107static ipstate_t *ips_list = NULL;
108static int	ips_num = 0;
109static ips_stat_t ips_stats;
110#if	(SOLARIS || defined(__sgi)) && defined(_KERNEL)
111extern	KRWLOCK_T	ipf_state, ipf_mutex;
112extern	kmutex_t	ipf_rw;
113#endif
114
115#ifdef	USE_INET6
116static frentry_t *fr_checkicmp6matchingstate __P((ip6_t *, fr_info_t *));
117#endif
118static int fr_matchsrcdst __P((ipstate_t *, union i6addr, union i6addr,
119			       fr_info_t *, tcphdr_t *));
120static frentry_t *fr_checkicmpmatchingstate __P((ip_t *, fr_info_t *));
121static int fr_matchicmpqueryreply __P((int, ipstate_t *, icmphdr_t *));
122static int fr_state_flush __P((int));
123static ips_stat_t *fr_statetstats __P((void));
124static void fr_delstate __P((ipstate_t *));
125static int fr_state_remove __P((caddr_t));
126int fr_stputent __P((caddr_t));
127int fr_stgetent __P((caddr_t));
128void fr_stinsert __P((ipstate_t *));
129
130
131#define	FIVE_DAYS	(2 * 5 * 86400)	/* 5 days: half closed session */
132
133#define	TCP_MSL	240			/* 2 minutes */
134u_long	fr_tcpidletimeout = FIVE_DAYS,
135	fr_tcpclosewait = 2 * TCP_MSL,
136	fr_tcplastack = 2 * TCP_MSL,
137	fr_tcptimeout = 2 * TCP_MSL,
138	fr_tcpclosed = 1,
139	fr_udptimeout = 240,
140	fr_icmptimeout = 120;
141int	fr_statemax = IPSTATE_MAX,
142	fr_statesize = IPSTATE_SIZE;
143int	fr_state_doflush = 0,
144	fr_state_lock = 0;
145
146static 	int icmpreplytype4[ICMP_MAXTYPE + 1];
147
148int fr_stateinit()
149{
150	int i;
151
152	KMALLOCS(ips_table, ipstate_t **, fr_statesize * sizeof(ipstate_t *));
153	if (ips_table != NULL)
154		bzero((char *)ips_table, fr_statesize * sizeof(ipstate_t *));
155	else
156		return -1;
157
158	/* fill icmp reply type table */
159	for (i = 0; i <= ICMP_MAXTYPE; i++)
160		icmpreplytype4[i] = -1;
161	icmpreplytype4[ICMP_ECHO] = ICMP_ECHOREPLY;
162	icmpreplytype4[ICMP_TSTAMP] = ICMP_TSTAMPREPLY;
163	icmpreplytype4[ICMP_IREQ] = ICMP_IREQREPLY;
164	icmpreplytype4[ICMP_MASKREQ] = ICMP_MASKREPLY;
165
166	return 0;
167}
168
169
170static ips_stat_t *fr_statetstats()
171{
172	ips_stats.iss_active = ips_num;
173	ips_stats.iss_table = ips_table;
174	ips_stats.iss_list = ips_list;
175	return &ips_stats;
176}
177
178
179/*
180 * flush state tables.  two actions currently defined:
181 * which == 0 : flush all state table entries
182 * which == 1 : flush TCP connections which have started to close but are
183 *	        stuck for some reason.
184 */
185static int fr_state_flush(which)
186int which;
187{
188	register ipstate_t *is, **isp;
189#if defined(_KERNEL) && !SOLARIS
190	int s;
191#endif
192	int delete, removed = 0;
193
194	SPL_NET(s);
195	for (isp = &ips_list; (is = *isp); ) {
196		delete = 0;
197
198		switch (which)
199		{
200		case 0 :
201			delete = 1;
202			break;
203		case 1 :
204			if (is->is_p != IPPROTO_TCP)
205				break;
206			if ((is->is_state[0] != TCPS_ESTABLISHED) ||
207			    (is->is_state[1] != TCPS_ESTABLISHED))
208				delete = 1;
209			break;
210		}
211
212		if (delete) {
213			if (is->is_p == IPPROTO_TCP)
214				ips_stats.iss_fin++;
215			else
216				ips_stats.iss_expire++;
217#ifdef	IPFILTER_LOG
218			ipstate_log(is, ISL_FLUSH);
219#endif
220			fr_delstate(is);
221			removed++;
222		} else
223			isp = &is->is_next;
224	}
225	SPL_X(s);
226	return removed;
227}
228
229
230static int fr_state_remove(data)
231caddr_t data;
232{
233	ipstate_t *sp, st;
234	int error;
235
236	sp = &st;
237	error = IRCOPYPTR(data, (caddr_t)&st, sizeof(st));
238	if (error)
239		return EFAULT;
240
241	for (sp = ips_list; sp; sp = sp->is_next)
242		if ((sp->is_p == st.is_p) && (sp->is_v == st.is_v) &&
243		    !bcmp(&sp->is_src, &st.is_src, sizeof(st.is_src)) &&
244		    !bcmp(&sp->is_dst, &st.is_src, sizeof(st.is_dst)) &&
245		    !bcmp(&sp->is_ps, &st.is_ps, sizeof(st.is_ps))) {
246			WRITE_ENTER(&ipf_state);
247#ifdef	IPFILTER_LOG
248			ipstate_log(sp, ISL_REMOVE);
249#endif
250			fr_delstate(sp);
251			RWLOCK_EXIT(&ipf_state);
252			return 0;
253		}
254	return ESRCH;
255}
256
257
258int fr_state_ioctl(data, cmd, mode)
259caddr_t data;
260#if defined(__NetBSD__) || defined(__OpenBSD__)
261u_long cmd;
262#else
263int cmd;
264#endif
265int mode;
266{
267	int arg, ret, error = 0;
268
269	switch (cmd)
270	{
271	case SIOCDELST :
272		error = fr_state_remove(data);
273		break;
274	case SIOCIPFFL :
275		error = IRCOPY(data, (caddr_t)&arg, sizeof(arg));
276		if (error)
277			break;
278		if (arg == 0 || arg == 1) {
279			WRITE_ENTER(&ipf_state);
280			ret = fr_state_flush(arg);
281			RWLOCK_EXIT(&ipf_state);
282			error = IWCOPY((caddr_t)&ret, data, sizeof(ret));
283		} else
284			error = EINVAL;
285		break;
286#ifdef	IPFILTER_LOG
287	case SIOCIPFFB :
288		if (!(mode & FWRITE))
289			error = EPERM;
290		else {
291			int tmp;
292
293			tmp = ipflog_clear(IPL_LOGSTATE);
294			IWCOPY((char *)&tmp, data, sizeof(tmp));
295		}
296		break;
297#endif
298	case SIOCGETFS :
299		error = IWCOPYPTR((caddr_t)fr_statetstats(), data,
300				  sizeof(ips_stat_t));
301		break;
302	case FIONREAD :
303#ifdef	IPFILTER_LOG
304		error = IWCOPY((caddr_t)&iplused[IPL_LOGSTATE], (caddr_t)data,
305			       sizeof(iplused[IPL_LOGSTATE]));
306#endif
307		break;
308	case SIOCSTLCK :
309		error = fr_lock(data, &fr_state_lock);
310		break;
311	case SIOCSTPUT :
312		if (!fr_state_lock) {
313			error = EACCES;
314			break;
315		}
316		error = fr_stputent(data);
317		break;
318	case SIOCSTGET :
319		if (!fr_state_lock) {
320			error = EACCES;
321			break;
322		}
323		error = fr_stgetent(data);
324		break;
325	default :
326		error = EINVAL;
327		break;
328	}
329	return error;
330}
331
332
333int fr_stgetent(data)
334caddr_t data;
335{
336	register ipstate_t *is, *isn;
337	ipstate_save_t ips, *ipsp;
338	int error;
339
340	error = IRCOPY(data, (caddr_t)&ipsp, sizeof(ipsp));
341	if (error)
342		return EFAULT;
343	error = IRCOPY((caddr_t)ipsp, (caddr_t)&ips, sizeof(ips));
344	if (error)
345		return EFAULT;
346
347	isn = ips.ips_next;
348	if (!isn) {
349		isn = ips_list;
350		if (isn == NULL) {
351			if (ips.ips_next == NULL)
352				return ENOENT;
353			return 0;
354		}
355	} else {
356		/*
357		 * Make sure the pointer we're copying from exists in the
358		 * current list of entries.  Security precaution to prevent
359		 * copying of random kernel data.
360		 */
361		for (is = ips_list; is; is = is->is_next)
362			if (is == isn)
363				break;
364		if (!is)
365			return ESRCH;
366	}
367	ips.ips_next = isn->is_next;
368	bcopy((char *)isn, (char *)&ips.ips_is, sizeof(ips.ips_is));
369	if (isn->is_rule)
370		bcopy((char *)isn->is_rule, (char *)&ips.ips_fr,
371		      sizeof(ips.ips_fr));
372	error = IWCOPY((caddr_t)&ips, ipsp, sizeof(ips));
373	if (error)
374		error = EFAULT;
375	return error;
376}
377
378
379int fr_stputent(data)
380caddr_t data;
381{
382	register ipstate_t *is, *isn;
383	ipstate_save_t ips, *ipsp;
384	int error, out;
385	frentry_t *fr;
386
387	error = IRCOPY(data, (caddr_t)&ipsp, sizeof(ipsp));
388	if (error)
389		return EFAULT;
390	error = IRCOPY((caddr_t)ipsp, (caddr_t)&ips, sizeof(ips));
391	if (error)
392		return EFAULT;
393
394	KMALLOC(isn, ipstate_t *);
395	if (isn == NULL)
396		return ENOMEM;
397
398	bcopy((char *)&ips.ips_is, (char *)isn, sizeof(*isn));
399	fr = isn->is_rule;
400	if (fr != NULL) {
401		if (isn->is_flags & FI_NEWFR) {
402			KMALLOC(fr, frentry_t *);
403			if (fr == NULL) {
404				KFREE(isn);
405				return ENOMEM;
406			}
407			bcopy((char *)&ips.ips_fr, (char *)fr, sizeof(*fr));
408			out = fr->fr_flags & FR_OUTQUE ? 1 : 0;
409			isn->is_rule = fr;
410			ips.ips_is.is_rule = fr;
411			if (*fr->fr_ifname) {
412				fr->fr_ifa = GETUNIT(fr->fr_ifname, fr->fr_v);
413				if (fr->fr_ifa == NULL)
414					fr->fr_ifa = (void *)-1;
415#ifdef	_KERNEL
416				else {
417					strncpy(isn->is_ifname[out],
418						IFNAME(fr->fr_ifa), IFNAMSIZ);
419					isn->is_ifp[out] = fr->fr_ifa;
420				}
421#endif
422			} else
423				fr->fr_ifa = NULL;
424			/*
425			 * send a copy back to userland of what we ended up
426			 * to allow for verification.
427			 */
428			error = IWCOPY((caddr_t)&ips, ipsp, sizeof(ips));
429			if (error) {
430				KFREE(isn);
431				KFREE(fr);
432				return EFAULT;
433			}
434		} else {
435			for (is = ips_list; is; is = is->is_next)
436				if (is->is_rule == fr)
437					break;
438			if (!is) {
439				KFREE(isn);
440				return ESRCH;
441			}
442		}
443	}
444	fr_stinsert(isn);
445	return 0;
446}
447
448
449void fr_stinsert(is)
450register ipstate_t *is;
451{
452	register u_int hv = is->is_hv;
453
454	MUTEX_INIT(&is->is_lock, "ipf state entry", NULL);
455
456	is->is_ifname[0][sizeof(is->is_ifname[0]) - 1] = '\0';
457	if (is->is_ifname[0][0] != '\0') {
458		is->is_ifp[0] = GETUNIT(is->is_ifname[0], is->is_v);
459	}
460	is->is_ifname[1][sizeof(is->is_ifname[0]) - 1] = '\0';
461	if (is->is_ifname[1][0] != '\0') {
462		is->is_ifp[1] = GETUNIT(is->is_ifname[1], is->is_v);
463	}
464
465	/*
466	 * add into list table.
467	 */
468	if (ips_list)
469		ips_list->is_pnext = &is->is_next;
470	is->is_pnext = &ips_list;
471	is->is_next = ips_list;
472	ips_list = is;
473	if (ips_table[hv])
474		ips_table[hv]->is_phnext = &is->is_hnext;
475	else
476		ips_stats.iss_inuse++;
477	is->is_phnext = ips_table + hv;
478	is->is_hnext = ips_table[hv];
479	ips_table[hv] = is;
480	ips_num++;
481}
482
483
484/*
485 * Create a new ipstate structure and hang it off the hash table.
486 */
487ipstate_t *fr_addstate(ip, fin, flags)
488ip_t *ip;
489fr_info_t *fin;
490u_int flags;
491{
492	register tcphdr_t *tcp = NULL;
493	register ipstate_t *is;
494	register u_int hv;
495	ipstate_t ips;
496	u_int pass;
497	int out;
498
499	if (fr_state_lock || (fin->fin_off & IP_OFFMASK) ||
500	    (fin->fin_fi.fi_fl & FI_SHORT))
501		return NULL;
502	if (ips_num == fr_statemax) {
503		ips_stats.iss_max++;
504		fr_state_doflush = 1;
505		return NULL;
506	}
507	out = fin->fin_out;
508	is = &ips;
509	bzero((char *)is, sizeof(*is));
510	ips.is_age = 1;
511	ips.is_state[0] = 0;
512	ips.is_state[1] = 0;
513	/*
514	 * Copy and calculate...
515	 */
516	hv = (is->is_p = fin->fin_fi.fi_p);
517	is->is_src = fin->fin_fi.fi_src;
518	hv += is->is_saddr;
519	is->is_dst = fin->fin_fi.fi_dst;
520	hv += is->is_daddr;
521#ifdef	USE_INET6
522	if (fin->fin_v == 6) {
523		if (is->is_p == IPPROTO_ICMPV6) {
524			if (IN6_IS_ADDR_MULTICAST(&is->is_dst.in6))
525				flags |= FI_W_DADDR;
526			if (out)
527				hv -= is->is_daddr;
528			else
529				hv -= is->is_saddr;
530		}
531	}
532#endif
533
534	switch (is->is_p)
535	{
536#ifdef	USE_INET6
537	case IPPROTO_ICMPV6 :
538#endif
539	case IPPROTO_ICMP :
540	    {
541		struct icmp *ic = (struct icmp *)fin->fin_dp;
542
543#ifdef	USE_INET6
544		if ((is->is_p == IPPROTO_ICMPV6) &&
545		    ((ic->icmp_type & ICMP6_INFOMSG_MASK) == 0))
546			return NULL;
547#endif
548		switch (ic->icmp_type)
549		{
550#ifdef	USE_INET6
551		case ICMP6_ECHO_REQUEST :
552			is->is_icmp.ics_type = ICMP6_ECHO_REPLY;
553			hv += (is->is_icmp.ics_id = ic->icmp_id);
554			hv += (is->is_icmp.ics_seq = ic->icmp_seq);
555			break;
556		case ICMP6_MEMBERSHIP_QUERY :
557		case ND_ROUTER_SOLICIT :
558		case ND_NEIGHBOR_SOLICIT :
559			is->is_icmp.ics_type = ic->icmp_type + 1;
560			break;
561#endif
562		case ICMP_ECHO :
563		case ICMP_TSTAMP :
564		case ICMP_IREQ :
565		case ICMP_MASKREQ :
566			is->is_icmp.ics_type = ic->icmp_type;
567			hv += (is->is_icmp.ics_id = ic->icmp_id);
568			hv += (is->is_icmp.ics_seq = ic->icmp_seq);
569			break;
570		default :
571			return NULL;
572		}
573		ATOMIC_INCL(ips_stats.iss_icmp);
574		is->is_age = fr_icmptimeout;
575		break;
576	    }
577	case IPPROTO_TCP :
578	    {
579		tcp = (tcphdr_t *)fin->fin_dp;
580
581		if (tcp->th_flags & TH_RST)
582			return NULL;
583		/*
584		 * The endian of the ports doesn't matter, but the ack and
585		 * sequence numbers do as we do mathematics on them later.
586		 */
587		is->is_dport = tcp->th_dport;
588		is->is_sport = tcp->th_sport;
589		if ((flags & (FI_W_DPORT|FI_W_SPORT)) == 0) {
590			hv += tcp->th_dport;
591			hv += tcp->th_sport;
592		}
593		is->is_send = ntohl(tcp->th_seq) + ip->ip_len -
594			      fin->fin_hlen - (tcp->th_off << 2) +
595			      ((tcp->th_flags & TH_SYN) ? 1 : 0) +
596			      ((tcp->th_flags & TH_FIN) ? 1 : 0);
597		is->is_maxsend = is->is_send;
598		is->is_dend = 0;
599		is->is_maxdwin = 1;
600		is->is_maxswin = ntohs(tcp->th_win);
601		if (is->is_maxswin == 0)
602			is->is_maxswin = 1;
603		/*
604		 * If we're creating state for a starting connection, start the
605		 * timer on it as we'll never see an error if it fails to
606		 * connect.
607		 */
608		ATOMIC_INCL(ips_stats.iss_tcp);
609		break;
610	    }
611	case IPPROTO_UDP :
612	    {
613		tcp = (tcphdr_t *)fin->fin_dp;
614
615		is->is_dport = tcp->th_dport;
616		is->is_sport = tcp->th_sport;
617		if ((flags & (FI_W_DPORT|FI_W_SPORT)) == 0) {
618			hv += tcp->th_dport;
619			hv += tcp->th_sport;
620		}
621		ATOMIC_INCL(ips_stats.iss_udp);
622		is->is_age = fr_udptimeout;
623		break;
624	    }
625	default :
626		return NULL;
627	}
628
629	KMALLOC(is, ipstate_t *);
630	if (is == NULL) {
631		ATOMIC_INCL(ips_stats.iss_nomem);
632		return NULL;
633	}
634	bcopy((char *)&ips, (char *)is, sizeof(*is));
635	hv %= fr_statesize;
636	is->is_hv = hv;
637	is->is_rule = fin->fin_fr;
638	if (is->is_rule != NULL) {
639		ATOMIC_INC32(is->is_rule->fr_ref);
640		pass = is->is_rule->fr_flags;
641	} else
642		pass = fr_flags;
643	WRITE_ENTER(&ipf_state);
644
645	is->is_pass = pass;
646	is->is_pkts = 1;
647	is->is_bytes = fin->fin_dlen + fin->fin_hlen;
648	/*
649	 * We want to check everything that is a property of this packet,
650	 * but we don't (automatically) care about it's fragment status as
651	 * this may change.
652	 */
653	is->is_v = fin->fin_fi.fi_v;
654	is->is_opt = fin->fin_fi.fi_optmsk;
655	is->is_optmsk = 0xffffffff;
656	is->is_sec = fin->fin_fi.fi_secmsk;
657	is->is_secmsk = 0xffff;
658	is->is_auth = fin->fin_fi.fi_auth;
659	is->is_authmsk = 0xffff;
660	is->is_flags = fin->fin_fi.fi_fl & FI_CMP;
661	is->is_flags |= FI_CMP << 4;
662	is->is_flags |= flags & (FI_WILDP|FI_WILDA);
663	is->is_ifp[1 - out] = NULL;
664	is->is_ifp[out] = fin->fin_ifp;
665#ifdef	_KERNEL
666	strncpy(is->is_ifname[out], IFNAME(fin->fin_ifp), IFNAMSIZ);
667#endif
668	is->is_ifname[1 - out][0] = '\0';
669	if (pass & FR_LOGFIRST)
670		is->is_pass &= ~(FR_LOGFIRST|FR_LOG);
671	fr_stinsert(is);
672	if (is->is_p == IPPROTO_TCP) {
673		MUTEX_ENTER(&is->is_lock);
674		fr_tcp_age(&is->is_age, is->is_state, fin,
675			   0); /* 0 = packet from the source */
676		MUTEX_EXIT(&is->is_lock);
677	}
678#ifdef	IPFILTER_LOG
679	ipstate_log(is, ISL_NEW);
680#endif
681	RWLOCK_EXIT(&ipf_state);
682	fin->fin_rev = IP6NEQ(is->is_dst, fin->fin_fi.fi_dst);
683	if (fin->fin_fi.fi_fl & FI_FRAG)
684		ipfr_newfrag(ip, fin, pass ^ FR_KEEPSTATE);
685	return is;
686}
687
688
689
690/*
691 * check to see if a packet with TCP headers fits within the TCP window.
692 * change timeout depending on whether new packet is a SYN-ACK returning for a
693 * SYN or a RST or FIN which indicate time to close up shop.
694 */
695int fr_tcpstate(is, fin, ip, tcp)
696register ipstate_t *is;
697fr_info_t *fin;
698ip_t *ip;
699tcphdr_t *tcp;
700{
701	register tcp_seq seq, ack, end;
702	register int ackskew;
703	tcpdata_t  *fdata, *tdata;
704	u_short	win, maxwin;
705	int ret = 0;
706	int source;
707
708	/*
709	 * Find difference between last checked packet and this packet.
710	 */
711	source = IP6EQ(fin->fin_fi.fi_src, is->is_src);
712	fdata = &is->is_tcp.ts_data[!source];
713	tdata = &is->is_tcp.ts_data[source];
714	seq = ntohl(tcp->th_seq);
715	ack = ntohl(tcp->th_ack);
716	win = ntohs(tcp->th_win);
717	end = seq + fin->fin_dlen - (tcp->th_off << 2) +
718	       ((tcp->th_flags & TH_SYN) ? 1 : 0) +
719	       ((tcp->th_flags & TH_FIN) ? 1 : 0);
720
721	if (fdata->td_end == 0) {
722		/*
723		 * Must be a (outgoing) SYN-ACK in reply to a SYN.
724		 */
725		fdata->td_end = end;
726		fdata->td_maxwin = 1;
727		fdata->td_maxend = end + 1;
728	}
729
730	if (!(tcp->th_flags & TH_ACK)) {  /* Pretend an ack was sent */
731		ack = tdata->td_end;
732	} else if (((tcp->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) &&
733		   (ack == 0)) {
734		/* gross hack to get around certain broken tcp stacks */
735		ack = tdata->td_end;
736	}
737
738	if (seq == end)
739		seq = end = fdata->td_end;
740
741	maxwin = tdata->td_maxwin;
742	ackskew = tdata->td_end - ack;
743
744#define	SEQ_GE(a,b)	((int)((a) - (b)) >= 0)
745#define	SEQ_GT(a,b)	((int)((a) - (b)) > 0)
746	if ((SEQ_GE(fdata->td_maxend, end)) &&
747	    (SEQ_GE(seq, fdata->td_end - maxwin)) &&
748/* XXX what about big packets */
749#define MAXACKWINDOW 66000
750	    (ackskew >= -MAXACKWINDOW) &&
751	    (ackskew <= MAXACKWINDOW)) {
752		/* if ackskew < 0 then this should be due to fragented
753		 * packets. There is no way to know the length of the
754		 * total packet in advance.
755		 * We do know the total length from the fragment cache though.
756		 * Note however that there might be more sessions with
757		 * exactly the same source and destination paramters in the
758		 * state cache (and source and destination is the only stuff
759		 * that is saved in the fragment cache). Note further that
760		 * some TCP connections in the state cache are hashed with
761		 * sport and dport as well which makes it not worthwhile to
762		 * look for them.
763		 * Thus, when ackskew is negative but still seems to belong
764		 * to this session, we bump up the destinations end value.
765		 */
766		if (ackskew < 0)
767			tdata->td_end = ack;
768
769		/* update max window seen */
770		if (fdata->td_maxwin < win)
771			fdata->td_maxwin = win;
772		if (SEQ_GT(end, fdata->td_end))
773			fdata->td_end = end;
774		if (SEQ_GE(ack + win, tdata->td_maxend)) {
775			tdata->td_maxend = ack + win;
776			if (win == 0)
777				tdata->td_maxend++;
778		}
779
780		ATOMIC_INCL(ips_stats.iss_hits);
781		is->is_pkts++;
782		is->is_bytes += fin->fin_dlen + fin->fin_hlen;
783		/*
784		 * Nearing end of connection, start timeout.
785		 */
786		MUTEX_ENTER(&is->is_lock);
787		/* source ? 0 : 1 -> !source */
788		fr_tcp_age(&is->is_age, is->is_state, fin, !source);
789		MUTEX_EXIT(&is->is_lock);
790		ret = 1;
791	}
792	return ret;
793}
794
795
796static int fr_matchsrcdst(is, src, dst, fin, tcp)
797ipstate_t *is;
798union i6addr src, dst;
799fr_info_t *fin;
800tcphdr_t *tcp;
801{
802	int ret = 0, rev, out, flags;
803	u_short sp, dp;
804	void *ifp;
805
806	rev = fin->fin_rev = IP6NEQ(is->is_dst, dst);
807	ifp = fin->fin_ifp;
808	out = fin->fin_out;
809
810	if (tcp != NULL) {
811		flags = is->is_flags;
812		sp = tcp->th_sport;
813		dp = tcp->th_dport;
814	} else {
815		flags = is->is_flags & FI_WILDA;
816		sp = 0;
817		dp = 0;
818	}
819
820	if (rev == 0) {
821		if (!out) {
822			if (is->is_ifpin == NULL || is->is_ifpin == ifp)
823				ret = 1;
824		} else {
825			if (is->is_ifpout == NULL || is->is_ifpout == ifp)
826				ret = 1;
827		}
828	} else {
829		if (out) {
830			if (is->is_ifpin == NULL || is->is_ifpin == ifp)
831				ret = 1;
832		} else {
833			if (is->is_ifpout == NULL || is->is_ifpout == ifp)
834				ret = 1;
835		}
836	}
837	if (ret == 0)
838		return 0;
839	ret = 0;
840
841	if (rev == 0) {
842		if (
843		    (IP6EQ(is->is_dst, dst) || (flags & FI_W_DADDR)) &&
844		    (IP6EQ(is->is_src, src) || (flags & FI_W_SADDR)) &&
845		    (!tcp || ((sp == is->is_sport || flags & FI_W_SPORT) &&
846		     (dp == is->is_dport || flags & FI_W_DPORT)))) {
847			ret = 1;
848		}
849	} else {
850		if (
851		    (IP6EQ(is->is_dst, src) || (flags & FI_W_DADDR)) &&
852		    (IP6EQ(is->is_src, dst) || (flags & FI_W_SADDR)) &&
853		    (!tcp || ((sp == is->is_dport || flags & FI_W_DPORT) &&
854		     (dp == is->is_sport || flags & FI_W_SPORT)))) {
855			ret = 1;
856		}
857	}
858	if (ret == 0)
859		return 0;
860
861	/*
862	 * Whether or not this should be here, is questionable, but the aim
863	 * is to get this out of the main line.
864	 */
865	if (tcp == NULL)
866		flags = is->is_flags & (FI_CMP|(FI_CMP<<4));
867
868	if (((fin->fin_fi.fi_fl & (flags >> 4)) != (flags & FI_CMP)) ||
869	    ((fin->fin_fi.fi_optmsk & is->is_optmsk) != is->is_opt) ||
870	    ((fin->fin_fi.fi_secmsk & is->is_secmsk) != is->is_sec) ||
871	    ((fin->fin_fi.fi_auth & is->is_authmsk) != is->is_auth))
872		return 0;
873
874	if ((flags & (FI_W_SPORT|FI_W_DPORT))) {
875		if ((flags & FI_W_SPORT) != 0) {
876			if (rev == 0) {
877				is->is_sport = sp;
878				is->is_send = htonl(tcp->th_seq);
879			} else {
880				is->is_sport = dp;
881				is->is_send = htonl(tcp->th_ack);
882			}
883			is->is_maxsend = is->is_send + 1;
884		} else if ((flags & FI_W_DPORT) != 0) {
885			if (rev == 0) {
886				is->is_dport = dp;
887				is->is_dend = htonl(tcp->th_ack);
888			} else {
889				is->is_dport = sp;
890				is->is_dend = htonl(tcp->th_seq);
891			}
892			is->is_maxdend = is->is_dend + 1;
893		}
894		is->is_flags &= ~(FI_W_SPORT|FI_W_DPORT);
895	}
896
897	ret = -1;
898
899	if (!rev) {
900		if (out) {
901			if (!is->is_ifpout)
902				ret = 1;
903		} else {
904			if (!is->is_ifpin)
905				ret = 0;
906		}
907	} else {
908		if (out) {
909			if (!is->is_ifpin)
910				ret = 0;
911		} else {
912			if (!is->is_ifpout)
913				ret = 1;
914		}
915	}
916
917	if (ret >= 0) {
918		is->is_ifp[ret] = ifp;
919#ifdef	_KERNEL
920		strncpy(is->is_ifname[out], IFNAME(fin->fin_ifp),
921			sizeof(is->is_ifname[1]));
922#endif
923	}
924#ifdef  _KERNEL
925	if (ret >= 0) {
926		strncpy(is->is_ifname[out], IFNAME(fin->fin_ifp),
927			sizeof(is->is_ifname[1]));
928	}
929#endif
930	return 1;
931}
932
933static int fr_matchicmpqueryreply(v, is, icmp)
934int v;
935ipstate_t *is;
936icmphdr_t *icmp;
937{
938	if (v == 4) {
939		/*
940		 * If we matched its type on the way in, then when going out
941		 * it will still be the same type.
942		 */
943		if (((icmp->icmp_type == is->is_type) ||
944		     (icmpreplytype4[is->is_type] == icmp->icmp_type)) &&
945		    (icmp->icmp_id == is->is_icmp.ics_id) &&
946		    (icmp->icmp_seq == is->is_icmp.ics_seq)) {
947			return 1;
948		};
949	}
950#ifdef	USE_INET6
951	else if (is->is_v == 6) {
952		if ((is->is_type == ICMP6_ECHO_REPLY) &&
953		    (icmp->icmp_type == ICMP6_ECHO_REQUEST) &&
954		    (icmp->icmp_id == is->is_icmp.ics_id) &&
955		    (icmp->icmp_seq == is->is_icmp.ics_seq)) {
956			return 1;
957		};
958	}
959#endif
960	return 0;
961}
962
963static frentry_t *fr_checkicmpmatchingstate(ip, fin)
964ip_t *ip;
965fr_info_t *fin;
966{
967	register ipstate_t *is, **isp;
968	register u_short sport, dport;
969	register u_char	pr;
970	union i6addr dst, src;
971	struct icmp *ic;
972	u_short savelen;
973	icmphdr_t *icmp;
974	fr_info_t ofin;
975	int type, len;
976	tcphdr_t *tcp;
977	frentry_t *fr;
978	ip_t *oip;
979	u_int hv;
980
981	/*
982	 * Does it at least have the return (basic) IP header ?
983	 * Only a basic IP header (no options) should be with
984	 * an ICMP error header.
985	 */
986	if (((ip->ip_v != 4) && (ip->ip_hl != 5)) ||
987	    (fin->fin_plen < ICMPERR_MINPKTLEN))
988		return NULL;
989	ic = (struct icmp *)fin->fin_dp;
990	type = ic->icmp_type;
991	/*
992	 * If it's not an error type, then return
993	 */
994	if ((type != ICMP_UNREACH) && (type != ICMP_SOURCEQUENCH) &&
995    	    (type != ICMP_REDIRECT) && (type != ICMP_TIMXCEED) &&
996    	    (type != ICMP_PARAMPROB))
997		return NULL;
998
999	oip = (ip_t *)((char *)ic + ICMPERR_ICMPHLEN);
1000	if (fin->fin_plen < ICMPERR_MAXPKTLEN + ((oip->ip_hl - 5) << 2))
1001		return NULL;
1002
1003	/*
1004	 * Sanity checks.
1005	 */
1006	len = fin->fin_dlen - ICMPERR_ICMPHLEN;
1007	if ((len <= 0) || ((oip->ip_hl << 2) > len))
1008		return NULL;
1009
1010	/*
1011	 * Is the buffer big enough for all of it ?  It's the size of the IP
1012	 * header claimed in the encapsulated part which is of concern.  It
1013	 * may be too big to be in this buffer but not so big that it's
1014	 * outside the ICMP packet, leading to TCP deref's causing problems.
1015	 * This is possible because we don't know how big oip_hl is when we
1016	 * do the pullup early in fr_check() and thus can't gaurantee it is
1017	 * all here now.
1018	 */
1019#ifdef  _KERNEL
1020	{
1021	mb_t *m;
1022
1023# if SOLARIS
1024	m = fin->fin_qfm;
1025	if ((char *)oip + len > (char *)m->b_wptr)
1026		return NULL;
1027# else
1028	m = *(mb_t **)fin->fin_mp;
1029	if ((char *)oip + len > (char *)ip + m->m_len)
1030		return NULL;
1031# endif
1032	}
1033#endif
1034
1035	/*
1036	 * in the IPv4 case we must zero the i6addr union otherwise
1037	 * the IP6EQ and IP6NEQ macros produce the wrong results because
1038	 * of the 'junk' in the unused part of the union
1039	 */
1040	bzero(&src, sizeof(src));
1041	bzero(&dst, sizeof(dst));
1042
1043	if (oip->ip_p == IPPROTO_ICMP) {
1044		icmp = (icmphdr_t *)((char *)oip + (oip->ip_hl << 2));
1045
1046		/*
1047		 * a ICMP error can only be generated as a result of an
1048		 * ICMP query, not as the response on an ICMP error
1049		 *
1050		 * XXX theoretically ICMP_ECHOREP and the other reply's are
1051		 * ICMP query's as well, but adding them here seems strange XXX
1052		 */
1053		 if ((icmp->icmp_type != ICMP_ECHO) &&
1054		     (icmp->icmp_type != ICMP_TSTAMP) &&
1055		     (icmp->icmp_type != ICMP_IREQ) &&
1056		     (icmp->icmp_type != ICMP_MASKREQ))
1057		    	return NULL;
1058
1059		/*
1060		 * perform a lookup of the ICMP packet in the state table
1061		 */
1062		hv = (pr = oip->ip_p);
1063		src.in4 = oip->ip_src;
1064		hv += src.in4.s_addr;
1065		dst.in4 = oip->ip_dst;
1066		hv += dst.in4.s_addr;
1067		hv += icmp->icmp_id;
1068		hv += icmp->icmp_seq;
1069		hv %= fr_statesize;
1070
1071		savelen = oip->ip_len;
1072		oip->ip_len = len;
1073		ofin.fin_v = 4;
1074		fr_makefrip(oip->ip_hl << 2, oip, &ofin);
1075		oip->ip_len = savelen;
1076		ofin.fin_ifp = fin->fin_ifp;
1077		ofin.fin_out = !fin->fin_out;
1078		ofin.fin_mp = NULL; /* if dereferenced, panic XXX */
1079
1080		READ_ENTER(&ipf_state);
1081		for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_hnext)
1082			if ((is->is_p == pr) && (is->is_v == 4) &&
1083			    fr_matchsrcdst(is, src, dst, &ofin, NULL) &&
1084			    fr_matchicmpqueryreply(is->is_v, is, icmp)) {
1085				ips_stats.iss_hits++;
1086				is->is_pkts++;
1087				is->is_bytes += ip->ip_len;
1088				fr = is->is_rule;
1089				RWLOCK_EXIT(&ipf_state);
1090				return fr;
1091			}
1092		RWLOCK_EXIT(&ipf_state);
1093		return NULL;
1094	};
1095
1096	if ((oip->ip_p != IPPROTO_TCP) && (oip->ip_p != IPPROTO_UDP))
1097		return NULL;
1098
1099	tcp = (tcphdr_t *)((char *)oip + (oip->ip_hl << 2));
1100	dport = tcp->th_dport;
1101	sport = tcp->th_sport;
1102
1103	hv = (pr = oip->ip_p);
1104	src.in4 = oip->ip_src;
1105	hv += src.in4.s_addr;
1106	dst.in4 = oip->ip_dst;
1107	hv += dst.in4.s_addr;
1108	hv += dport;
1109	hv += sport;
1110	hv %= fr_statesize;
1111	/*
1112	 * we make an fin entry to be able to feed it to
1113	 * matchsrcdst note that not all fields are encessary
1114	 * but this is the cleanest way. Note further we fill
1115	 * in fin_mp such that if someone uses it we'll get
1116	 * a kernel panic. fr_matchsrcdst does not use this.
1117	 *
1118	 * watch out here, as ip is in host order and oip in network
1119	 * order. Any change we make must be undone afterwards.
1120	 */
1121	savelen = oip->ip_len;
1122	oip->ip_len = len;
1123	ofin.fin_v = 4;
1124	fr_makefrip(oip->ip_hl << 2, oip, &ofin);
1125	oip->ip_len = savelen;
1126	ofin.fin_ifp = fin->fin_ifp;
1127	ofin.fin_out = !fin->fin_out;
1128	ofin.fin_mp = NULL; /* if dereferenced, panic XXX */
1129	READ_ENTER(&ipf_state);
1130	for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_hnext) {
1131		/*
1132		 * Only allow this icmp though if the
1133		 * encapsulated packet was allowed through the
1134		 * other way around. Note that the minimal amount
1135		 * of info present does not allow for checking against
1136		 * tcp internals such as seq and ack numbers.
1137		 */
1138		if ((is->is_p == pr) && (is->is_v == 4) &&
1139		    fr_matchsrcdst(is, src, dst, &ofin, tcp)) {
1140			fr = is->is_rule;
1141			ips_stats.iss_hits++;
1142			/*
1143			 * we must swap src and dst here because the icmp
1144			 * comes the other way around
1145			 */
1146			is->is_pkts++;
1147			is->is_bytes += fin->fin_plen;
1148			/*
1149			 * we deliberately do not touch the timeouts
1150			 * for the accompanying state table entry.
1151			 * It remains to be seen if that is correct. XXX
1152			 */
1153			RWLOCK_EXIT(&ipf_state);
1154			return fr;
1155		}
1156	}
1157	RWLOCK_EXIT(&ipf_state);
1158	return NULL;
1159}
1160
1161/*
1162 * Check if a packet has a registered state.
1163 */
1164frentry_t *fr_checkstate(ip, fin)
1165ip_t *ip;
1166fr_info_t *fin;
1167{
1168	union i6addr dst, src;
1169	register ipstate_t *is, **isp;
1170	register u_char pr;
1171	u_int hv, hvm, hlen, tryagain, pass, v;
1172	struct icmp *ic;
1173	frentry_t *fr;
1174	tcphdr_t *tcp;
1175
1176	if (fr_state_lock || (fin->fin_off & IP_OFFMASK) ||
1177	    (fin->fin_fi.fi_fl & FI_SHORT))
1178		return NULL;
1179
1180	is = NULL;
1181	hlen = fin->fin_hlen;
1182	tcp = (tcphdr_t *)((char *)ip + hlen);
1183	ic = (struct icmp *)tcp;
1184	hv = (pr = fin->fin_fi.fi_p);
1185	src = fin->fin_fi.fi_src;
1186	dst = fin->fin_fi.fi_dst;
1187	hv += src.in4.s_addr;
1188	hv += dst.in4.s_addr;
1189
1190	/*
1191	 * Search the hash table for matching packet header info.
1192	 */
1193	v = fin->fin_fi.fi_v;
1194	switch (fin->fin_fi.fi_p)
1195	{
1196#ifdef	USE_INET6
1197	case IPPROTO_ICMPV6 :
1198		if (v == 6) {
1199			if (fin->fin_out)
1200				hv -= dst.in4.s_addr;
1201			else
1202				hv -= src.in4.s_addr;
1203			if ((ic->icmp_type == ICMP6_ECHO_REQUEST) ||
1204			    (ic->icmp_type == ICMP6_ECHO_REPLY)) {
1205				hv += ic->icmp_id;
1206				hv += ic->icmp_seq;
1207			}
1208		}
1209#endif
1210	case IPPROTO_ICMP :
1211		if (v == 4) {
1212			hv += ic->icmp_id;
1213			hv += ic->icmp_seq;
1214		}
1215		hv %= fr_statesize;
1216		READ_ENTER(&ipf_state);
1217		for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_hnext) {
1218			if ((is->is_p == pr) && (is->is_v == v) &&
1219			    fr_matchsrcdst(is, src, dst, fin, NULL) &&
1220			    fr_matchicmpqueryreply(v, is, ic)) {
1221				is->is_age = fr_icmptimeout;
1222				break;
1223			}
1224		}
1225		if (is != NULL)
1226			break;
1227		RWLOCK_EXIT(&ipf_state);
1228		/*
1229		 * No matching icmp state entry. Perhaps this is a
1230		 * response to another state entry.
1231		 */
1232#ifdef	USE_INET6
1233		if (v == 6)
1234			fr = fr_checkicmp6matchingstate((ip6_t *)ip, fin);
1235		else
1236#endif
1237			fr = fr_checkicmpmatchingstate(ip, fin);
1238		if (fr)
1239			return fr;
1240		break;
1241	case IPPROTO_TCP :
1242	    {
1243		register u_short dport = tcp->th_dport, sport = tcp->th_sport;
1244		register int i;
1245
1246		i = tcp->th_flags;
1247		/*
1248		 * Just plain ignore RST flag set with either FIN or SYN.
1249		 */
1250		if ((i & TH_RST) &&
1251		    ((i & (TH_FIN|TH_SYN|TH_RST)) != TH_RST))
1252			break;
1253		tryagain = 0;
1254retry_tcp:
1255		hvm = hv % fr_statesize;
1256		WRITE_ENTER(&ipf_state);
1257		for (isp = &ips_table[hvm]; (is = *isp);
1258		     isp = &is->is_hnext)
1259
1260
1261			if ((is->is_p == pr) && (is->is_v == v) &&
1262			    fr_matchsrcdst(is, src, dst, fin, tcp)) {
1263				if (fr_tcpstate(is, fin, ip, tcp))
1264					break;
1265				is = NULL;
1266				break;
1267			}
1268		if (is != NULL)
1269			break;
1270		RWLOCK_EXIT(&ipf_state);
1271		hv += dport;
1272		hv += sport;
1273		if (tryagain == 0) {
1274			tryagain = 1;
1275			goto retry_tcp;
1276		}
1277		break;
1278	    }
1279	case IPPROTO_UDP :
1280	    {
1281		register u_short dport = tcp->th_dport, sport = tcp->th_sport;
1282
1283		tryagain = 0;
1284retry_udp:
1285		hvm = hv % fr_statesize;
1286		/*
1287		 * Nothing else to match on but ports. and IP#'s
1288		 */
1289		READ_ENTER(&ipf_state);
1290		for (is = ips_table[hvm]; is; is = is->is_hnext)
1291			if ((is->is_p == pr) && (is->is_v == v) &&
1292			    fr_matchsrcdst(is, src, dst, fin, tcp)) {
1293				is->is_age = fr_udptimeout;
1294				break;
1295			}
1296		if (is != NULL)
1297			break;
1298		RWLOCK_EXIT(&ipf_state);
1299		hv += dport;
1300		hv += sport;
1301		if (tryagain == 0) {
1302			tryagain = 1;
1303			goto retry_udp;
1304		}
1305		break;
1306	    }
1307	default :
1308		break;
1309	}
1310	if (is == NULL) {
1311		ATOMIC_INCL(ips_stats.iss_miss);
1312		return NULL;
1313	}
1314	MUTEX_ENTER(&is->is_lock);
1315	is->is_bytes += fin->fin_plen;
1316	ips_stats.iss_hits++;
1317	is->is_pkts++;
1318	MUTEX_EXIT(&is->is_lock);
1319	fr = is->is_rule;
1320	fin->fin_fr = fr;
1321	pass = is->is_pass;
1322#ifndef	_KERNEL
1323	if (tcp->th_flags & TCP_CLOSE)
1324		fr_delstate(is);
1325#endif
1326	RWLOCK_EXIT(&ipf_state);
1327	if (fin->fin_fi.fi_fl & FI_FRAG)
1328		ipfr_newfrag(ip, fin, pass ^ FR_KEEPSTATE);
1329	return fr;
1330}
1331
1332
1333void ip_statesync(ifp)
1334void *ifp;
1335{
1336	register ipstate_t *is;
1337
1338	WRITE_ENTER(&ipf_state);
1339	for (is = ips_list; is; is = is->is_next) {
1340		if (is->is_ifpin == ifp) {
1341			is->is_ifpin = GETUNIT(is->is_ifname[0], is->is_v);
1342			if (!is->is_ifpin)
1343				is->is_ifpin = (void *)-1;
1344		}
1345		if (is->is_ifpout == ifp) {
1346			is->is_ifpout = GETUNIT(is->is_ifname[1], is->is_v);
1347			if (!is->is_ifpout)
1348				is->is_ifpout = (void *)-1;
1349		}
1350	}
1351	RWLOCK_EXIT(&ipf_state);
1352}
1353
1354
1355static void fr_delstate(is)
1356ipstate_t *is;
1357{
1358	frentry_t *fr;
1359
1360	if (is->is_next)
1361		is->is_next->is_pnext = is->is_pnext;
1362	*is->is_pnext = is->is_next;
1363	if (is->is_hnext)
1364		is->is_hnext->is_phnext = is->is_phnext;
1365	*is->is_phnext = is->is_hnext;
1366	if (ips_table[is->is_hv] == NULL)
1367		ips_stats.iss_inuse--;
1368
1369	fr = is->is_rule;
1370	if (fr != NULL) {
1371		ATOMIC_DEC32(fr->fr_ref);
1372		if (fr->fr_ref == 0)
1373			KFREE(fr);
1374	}
1375#ifdef	_KERNEL
1376	MUTEX_DESTROY(&is->is_lock);
1377#endif
1378	KFREE(is);
1379	ips_num--;
1380}
1381
1382
1383/*
1384 * Free memory in use by all state info. kept.
1385 */
1386void fr_stateunload()
1387{
1388	register ipstate_t *is;
1389
1390	WRITE_ENTER(&ipf_state);
1391	while ((is = ips_list))
1392		fr_delstate(is);
1393	ips_stats.iss_inuse = 0;
1394	ips_num = 0;
1395	RWLOCK_EXIT(&ipf_state);
1396	KFREES(ips_table, fr_statesize * sizeof(ipstate_t *));
1397	ips_table = NULL;
1398}
1399
1400
1401/*
1402 * Slowly expire held state for thingslike UDP and ICMP.  Timeouts are set
1403 * in expectation of this being called twice per second.
1404 */
1405void fr_timeoutstate()
1406{
1407	register ipstate_t *is, **isp;
1408#if defined(_KERNEL) && !SOLARIS
1409	int s;
1410#endif
1411
1412	SPL_NET(s);
1413	WRITE_ENTER(&ipf_state);
1414	for (isp = &ips_list; (is = *isp); )
1415		if (is->is_age && !--is->is_age) {
1416			if (is->is_p == IPPROTO_TCP)
1417				ips_stats.iss_fin++;
1418			else
1419				ips_stats.iss_expire++;
1420#ifdef	IPFILTER_LOG
1421			ipstate_log(is, ISL_EXPIRE);
1422#endif
1423			fr_delstate(is);
1424		} else
1425			isp = &is->is_next;
1426	RWLOCK_EXIT(&ipf_state);
1427	SPL_X(s);
1428	if (fr_state_doflush) {
1429		(void) fr_state_flush(1);
1430		fr_state_doflush = 0;
1431	}
1432}
1433
1434
1435/*
1436 * Original idea freom Pradeep Krishnan for use primarily with NAT code.
1437 * (pkrishna@netcom.com)
1438 *
1439 * Rewritten by Arjan de Vet <Arjan.deVet@adv.iae.nl>, 2000-07-29:
1440 *
1441 * - (try to) base state transitions on real evidence only,
1442 *   i.e. packets that are sent and have been received by ipfilter;
1443 *   diagram 18.12 of TCP/IP volume 1 by W. Richard Stevens was used.
1444 *
1445 * - deal with half-closed connections correctly;
1446 *
1447 * - store the state of the source in state[0] such that ipfstat
1448 *   displays the state as source/dest instead of dest/source; the calls
1449 *   to fr_tcp_age have been changed accordingly.
1450 *
1451 * Parameters:
1452 *
1453 *    state[0] = state of source (host that initiated connection)
1454 *    state[1] = state of dest   (host that accepted the connection)
1455 *
1456 *    dir == 0 : a packet from source to dest
1457 *    dir == 1 : a packet from dest to source
1458 *
1459 */
1460void fr_tcp_age(age, state, fin, dir)
1461u_long *age;
1462u_char *state;
1463fr_info_t *fin;
1464int dir;
1465{
1466	tcphdr_t *tcp = (tcphdr_t *)fin->fin_dp;
1467	u_char flags = tcp->th_flags;
1468	int dlen, ostate;
1469
1470	ostate = state[1 - dir];
1471
1472	dlen = fin->fin_plen - fin->fin_hlen - (tcp->th_off << 2);
1473
1474	if (flags & TH_RST) {
1475		if (!(tcp->th_flags & TH_PUSH) && !dlen) {
1476			*age = fr_tcpclosed;
1477			state[dir] = TCPS_CLOSED;
1478		} else {
1479			*age = fr_tcpclosewait;
1480			state[dir] = TCPS_CLOSE_WAIT;
1481		}
1482		return;
1483	}
1484
1485	*age = fr_tcptimeout; /* default 4 mins */
1486
1487	switch(state[dir])
1488	{
1489	case TCPS_CLOSED: /* 0 */
1490		if ((flags & TH_OPENING) == TH_OPENING) {
1491			/*
1492			 * 'dir' received an S and sends SA in response,
1493			 * CLOSED -> SYN_RECEIVED
1494			 */
1495			state[dir] = TCPS_SYN_RECEIVED;
1496			*age = fr_tcptimeout;
1497		} else if ((flags & (TH_SYN|TH_ACK)) == TH_SYN) {
1498			/* 'dir' sent S, CLOSED -> SYN_SENT */
1499			state[dir] = TCPS_SYN_SENT;
1500			*age = fr_tcptimeout;
1501		}
1502		/*
1503		 * The next piece of code makes it possible to get
1504		 * already established connections into the state table
1505		 * after a restart or reload of the filter rules; this
1506		 * does not work when a strict 'flags S keep state' is
1507		 * used for tcp connections of course
1508		 */
1509		if ((flags & (TH_FIN|TH_SYN|TH_RST|TH_ACK)) == TH_ACK) {
1510			/* we saw an A, guess 'dir' is in ESTABLISHED mode */
1511			state[dir] = TCPS_ESTABLISHED;
1512			*age = fr_tcpidletimeout;
1513		}
1514		/*
1515		 * TODO: besides regular ACK packets we can have other
1516		 * packets as well; it is yet to be determined how we
1517		 * should initialize the states in those cases
1518		 */
1519		break;
1520
1521	case TCPS_LISTEN: /* 1 */
1522		/* NOT USED */
1523		break;
1524
1525	case TCPS_SYN_SENT: /* 2 */
1526		if ((flags & (TH_SYN|TH_FIN|TH_ACK)) == TH_ACK) {
1527			/*
1528			 * We see an A from 'dir' which is in SYN_SENT
1529			 * state: 'dir' sent an A in response to an SA
1530			 * which it received, SYN_SENT -> ESTABLISHED
1531			 */
1532			state[dir] = TCPS_ESTABLISHED;
1533			*age = fr_tcpidletimeout;
1534		} else if (flags & TH_FIN) {
1535			/*
1536			 * We see an F from 'dir' which is in SYN_SENT
1537			 * state and wants to close its side of the
1538			 * connection; SYN_SENT -> FIN_WAIT_1
1539			 */
1540			state[dir] = TCPS_FIN_WAIT_1;
1541			*age = fr_tcpidletimeout; /* or fr_tcptimeout? */
1542		} else if ((flags & TH_OPENING) == TH_OPENING) {
1543			/*
1544			 * We see an SA from 'dir' which is already in
1545			 * SYN_SENT state, this means we have a
1546			 * simultaneous open; SYN_SENT -> SYN_RECEIVED
1547			 */
1548			state[dir] = TCPS_SYN_RECEIVED;
1549			*age = fr_tcptimeout;
1550		}
1551		break;
1552
1553	case TCPS_SYN_RECEIVED: /* 3 */
1554		if ((flags & (TH_SYN|TH_FIN|TH_ACK)) == TH_ACK) {
1555			/*
1556			 * We see an A from 'dir' which was in SYN_RECEIVED
1557			 * state so it must now be in established state,
1558			 * SYN_RECEIVED -> ESTABLISHED
1559			 */
1560			state[dir] = TCPS_ESTABLISHED;
1561			*age = fr_tcpidletimeout;
1562		} else if (flags & TH_FIN) {
1563			/*
1564			 * We see an F from 'dir' which is in SYN_RECEIVED
1565			 * state and wants to close its side of the connection;
1566			 * SYN_RECEIVED -> FIN_WAIT_1
1567			 */
1568			state[dir] = TCPS_FIN_WAIT_1;
1569			*age = fr_tcpidletimeout; /* or fr_tcptimeout? */
1570		}
1571		break;
1572
1573	case TCPS_ESTABLISHED: /* 4 */
1574		if (flags & TH_FIN) {
1575			/*
1576			 * 'dir' closed its side of the connection; this
1577			 * gives us a half-closed connection;
1578			 * ESTABLISHED -> FIN_WAIT_1
1579			 */
1580			state[dir] = TCPS_FIN_WAIT_1;
1581			*age = fr_tcpidletimeout;
1582		} else if (flags & TH_ACK) {
1583			/* an ACK, should we exclude other flags here? */
1584			if (ostate == TCPS_FIN_WAIT_1) {
1585				/*
1586				 * We know the other side did an active close,
1587				 * so we are ACKing the recvd FIN packet (does
1588				 * the window matching code guarantee this?)
1589				 * and go into CLOSE_WAIT state; this gives us
1590				 * a half-closed connection
1591				 */
1592				state[dir] = TCPS_CLOSE_WAIT;
1593				*age = fr_tcpidletimeout;
1594			} else if (ostate < TCPS_CLOSE_WAIT)
1595				/*
1596				 * Still a fully established connection,
1597				 * reset timeout
1598				 */
1599				*age = fr_tcpidletimeout;
1600		}
1601		break;
1602
1603	case TCPS_CLOSE_WAIT: /* 5 */
1604		if (flags & TH_FIN) {
1605			/*
1606			 * Application closed and 'dir' sent a FIN, we're now
1607			 * going into LAST_ACK state
1608			 */
1609			*age  = fr_tcplastack;
1610			state[dir] = TCPS_LAST_ACK;
1611		} else {
1612			/*
1613			 * We remain in CLOSE_WAIT because the other side has
1614			 * closed already and we did not close our side yet;
1615			 * reset timeout
1616			 */
1617			*age  = fr_tcpidletimeout;
1618		}
1619		break;
1620
1621	case TCPS_FIN_WAIT_1: /* 6 */
1622		if ((flags & TH_ACK) && ostate > TCPS_CLOSE_WAIT) {
1623			/*
1624			 * If the other side is not active anymore it has sent
1625			 * us a FIN packet that we are ack'ing now with an ACK;
1626			 * this means both sides have now closed the connection
1627			 * and we go into TIME_WAIT
1628			 */
1629			/*
1630			 * XXX: how do we know we really are ACKing the FIN
1631			 * packet here? does the window code guarantee that?
1632			 */
1633			state[dir] = TCPS_TIME_WAIT;
1634			*age = fr_tcptimeout;
1635		} else
1636			/*
1637			 * We closed our side of the connection already but the
1638			 * other side is still active (ESTABLISHED/CLOSE_WAIT);
1639			 * continue with this half-closed connection
1640			 */
1641			*age = fr_tcpidletimeout;
1642		break;
1643
1644	case TCPS_CLOSING: /* 7 */
1645		/* NOT USED */
1646		break;
1647
1648	case TCPS_LAST_ACK: /* 8 */
1649		if (flags & TH_ACK) {
1650			if ((flags & TH_PUSH) || dlen)
1651				/*
1652				 * There is still data to be delivered, reset
1653				 * timeout
1654				 */
1655				*age  = fr_tcplastack;
1656		}
1657		/*
1658		 * We cannot detect when we go out of LAST_ACK state to CLOSED
1659		 * because that is based on the reception of ACK packets;
1660		 * ipfilter can only detect that a packet has been sent by a
1661		 * host
1662		 */
1663		break;
1664
1665	case TCPS_FIN_WAIT_2: /* 9 */
1666		/* NOT USED */
1667		break;
1668
1669	case TCPS_TIME_WAIT: /* 10 */
1670		/* we're in 2MSL timeout now */
1671		break;
1672	}
1673}
1674
1675
#ifdef	IPFILTER_LOG
/*
 * Write a log record describing a state table entry.
 *
 * is   - the state entry to report on
 * type - the log record type, stored in isl_type (ISL_EXPIRE is one
 *        value passed by callers in this file)
 *
 * A struct ipslog is filled in from the entry and handed to ipllog() as a
 * single item for the IPL_LOGSTATE log.  The record lives on the stack, so
 * the protocol-specific members are cleared before the ones relevant to the
 * entry's protocol are filled in; otherwise whatever happened to be on the
 * stack would be logged in the unused members (e.g. isl_state for UDP).
 */
void ipstate_log(is, type)
struct ipstate *is;
u_int type;
{
	struct	ipslog	ipsl;
	void *items[1];
	size_t sizes[1];
	int types[1];

	ipsl.isl_type = type;
	ipsl.isl_pkts = is->is_pkts;
	ipsl.isl_bytes = is->is_bytes;
	ipsl.isl_src = is->is_src;
	ipsl.isl_dst = is->is_dst;
	ipsl.isl_p = is->is_p;
	ipsl.isl_v = is->is_v;
	ipsl.isl_flags = is->is_flags;

	/* start from a known value for everything protocol dependent */
	ipsl.isl_ps.isl_filler[0] = 0;
	ipsl.isl_ps.isl_filler[1] = 0;
	ipsl.isl_state[0] = 0;
	ipsl.isl_state[1] = 0;

	if (ipsl.isl_p == IPPROTO_TCP || ipsl.isl_p == IPPROTO_UDP) {
		ipsl.isl_sport = is->is_sport;
		ipsl.isl_dport = is->is_dport;
		if (ipsl.isl_p == IPPROTO_TCP) {
			ipsl.isl_state[0] = is->is_state[0];
			ipsl.isl_state[1] = is->is_state[1];
		}
	} else if (ipsl.isl_p == IPPROTO_ICMP)
		ipsl.isl_itype = is->is_icmp.ics_type;

	items[0] = &ipsl;
	sizes[0] = sizeof(ipsl);
	types[0] = 0;

	(void) ipllog(IPL_LOGSTATE, NULL, items, sizes, types, 1);
}
#endif
1714
1715
#ifdef	USE_INET6
/*
 * Check whether an ICMPv6 error message refers to a packet for which a
 * state entry exists.
 *
 * ip  - the IPv6 header of the ICMPv6 packet being examined
 * fin - packet information for that packet
 *
 * Only the error types destination unreachable, packet too big, time
 * exceeded and parameter problem are considered.  The IPv6 header embedded
 * in the error is used to look up the state table, with the direction
 * reversed relative to the ICMPv6 packet itself:
 *
 *  - embedded ICMPv6: must be an informational message; matched on
 *    addresses, id and seq, and on the query type that corresponds to the
 *    reply type stored in the state entry;
 *  - embedded TCP/UDP: matched on addresses and ports.
 *
 * On a match the entry's hit, packet and byte counters are updated and the
 * entry's rule is returned; in every other case NULL is returned.  The
 * ipf_state read lock taken for the search is released on every return
 * path.
 */
frentry_t *fr_checkicmp6matchingstate(ip, fin)
ip6_t *ip;
fr_info_t *fin;
{
	register ipstate_t *is, **isp;
	register u_short sport, dport;
	register u_char	pr;
	struct icmp6_hdr *ic, *oic;
	union i6addr dst, src;
	u_short savelen;
	fr_info_t ofin;
	tcphdr_t *tcp;
	frentry_t *fr;
	ip6_t *oip;
	int type;
	u_int hv;

	/*
	 * Does it at least have the return (basic) IP header ?
	 * Only a basic IP header (no options) should be with
	 * an ICMP error header.
	 */
	if ((fin->fin_v != 6) || (fin->fin_plen < ICMP6ERR_MINPKTLEN))
		return NULL;
	ic = (struct icmp6_hdr *)fin->fin_dp;
	type = ic->icmp6_type;
	/*
	 * If it's not an error type, then return
	 */
	if ((type != ICMP6_DST_UNREACH) && (type != ICMP6_PACKET_TOO_BIG) &&
	    (type != ICMP6_TIME_EXCEEDED) && (type != ICMP6_PARAM_PROB))
		return NULL;

	/* the offending packet's IPv6 header follows the ICMP error header */
	oip = (ip6_t *)((char *)ic + ICMPERR_ICMPHLEN);
	if (fin->fin_plen < sizeof(*oip))
		return NULL;

	if (oip->ip6_nxt == IPPROTO_ICMPV6) {
		oic = (struct icmp6_hdr *)(oip + 1);
		/*
		 * a ICMP error can only be generated as a result of an
		 * ICMP query, not as the response on an ICMP error
		 *
		 * XXX theoretically ICMP_ECHOREP and the other reply's are
		 * ICMP query's as well, but adding them here seems strange XXX
		 */
		if (!(oic->icmp6_type & ICMP6_INFOMSG_MASK))
			return NULL;

		/*
		 * perform a lookup of the ICMP packet in the state table
		 */
		hv = (pr = oip->ip6_nxt);
		src.in6 = oip->ip6_src;
		hv += src.in4.s_addr;
		dst.in6 = oip->ip6_dst;
		hv += dst.in4.s_addr;
		hv += oic->icmp6_id;
		hv += oic->icmp6_seq;
		hv %= fr_statesize;

		/*
		 * fr_makefrip() is given the payload length in host order;
		 * the embedded header is put back into network order
		 * straight afterwards.
		 */
		oip->ip6_plen = ntohs(oip->ip6_plen);
		ofin.fin_v = 6;
		fr_makefrip(sizeof(*oip), (ip_t *)oip, &ofin);
		oip->ip6_plen = htons(oip->ip6_plen);
		ofin.fin_ifp = fin->fin_ifp;
		ofin.fin_out = !fin->fin_out;
		ofin.fin_mp = NULL; /* if dereferenced, panic XXX */

		READ_ENTER(&ipf_state);
		for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_hnext) {
			if ((is->is_p == pr) &&
			    (oic->icmp6_id == is->is_icmp.ics_id) &&
			    (oic->icmp6_seq == is->is_icmp.ics_seq) &&
			    fr_matchsrcdst(is, src, dst, &ofin, NULL)) {
				/*
				 * in the state table ICMP query's are stored
				 * with the type of the corresponding ICMP
				 * response. Correct here
				 */
				if (((is->is_type == ICMP6_ECHO_REPLY) &&
				     (oic->icmp6_type == ICMP6_ECHO_REQUEST)) ||
				     (is->is_type - 1 == oic->icmp6_type )) {
					ips_stats.iss_hits++;
					is->is_pkts++;
					is->is_bytes += fin->fin_plen;
					/*
					 * fetch the rule before dropping
					 * the lock; the lock must not be
					 * held on return
					 */
					fr = is->is_rule;
					RWLOCK_EXIT(&ipf_state);
					return fr;
				}
			}
		}
		RWLOCK_EXIT(&ipf_state);

		return NULL;
	}

	if ((oip->ip6_nxt != IPPROTO_TCP) && (oip->ip6_nxt != IPPROTO_UDP))
		return NULL;
	/* only the port fields are read here; they are at the same place for UDP */
	tcp = (tcphdr_t *)(oip + 1);
	dport = tcp->th_dport;
	sport = tcp->th_sport;

	hv = (pr = oip->ip6_nxt);
	src.in6 = oip->ip6_src;
	hv += src.in4.s_addr;
	dst.in6 = oip->ip6_dst;
	hv += dst.in4.s_addr;
	hv += dport;
	hv += sport;
	hv %= fr_statesize;
	/*
	 * we make an fin entry to be able to feed it to
	 * matchsrcdst note that not all fields are necessary
	 * but this is the cleanest way. Note further we fill
	 * in fin_mp such that if someone uses it we'll get
	 * a kernel panic. fr_matchsrcdst does not use this.
	 *
	 * watch out here, as ip is in host order and oip in network
	 * order. Any change we make must be undone afterwards.
	 */
	savelen = oip->ip6_plen;
	oip->ip6_plen = ip->ip6_plen - sizeof(*ip) - ICMPERR_ICMPHLEN;
	ofin.fin_v = 6;
	fr_makefrip(sizeof(*oip), (ip_t *)oip, &ofin);
	oip->ip6_plen = savelen;
	ofin.fin_ifp = fin->fin_ifp;
	ofin.fin_out = !fin->fin_out;
	ofin.fin_mp = NULL; /* if dereferenced, panic XXX */
	READ_ENTER(&ipf_state);
	for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_hnext) {
		/*
		 * Only allow this icmp though if the
		 * encapsulated packet was allowed through the
		 * other way around. Note that the minimal amount
		 * of info present does not allow for checking against
		 * tcp internals such as seq and ack numbers.
		 */
		if ((is->is_p == pr) && (is->is_v == 6) &&
		    fr_matchsrcdst(is, src, dst, &ofin, tcp)) {
			fr = is->is_rule;
			ips_stats.iss_hits++;
			/*
			 * we must swap src and dst here because the icmp
			 * comes the other way around
			 */
			is->is_pkts++;
			is->is_bytes += fin->fin_plen;
			/*
			 * we deliberately do not touch the timeouts
			 * for the accompanying state table entry.
			 * It remains to be seen if that is correct. XXX
			 */
			RWLOCK_EXIT(&ipf_state);
			return fr;
		}
	}
	RWLOCK_EXIT(&ipf_state);
	return NULL;
}
#endif
1875