/*-
 * Copyright (c) 2001 Daniel Hartmeier
 * Copyright (c) 2002 - 2008 Henning Brauer
 * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *    - Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    - Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Effort sponsored in part by the Defense Advanced Research Projects
 * Agency (DARPA) and Air Force Research Laboratory, Air Force
 * Materiel Command, USAF, under agreement number F30602-01-2-0537.
 *
 *	$OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/netpfil/pf/pf.c 270574 2014-08-25 15:40:37Z glebius $");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_bpf.h"
#include "opt_pf.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/endian.h>
#include <sys/hash.h>
#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/mbuf.h>
#include <sys/md5.h>
#include <sys/random.h>
#include <sys/refcount.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/ucred.h>

#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/radix_mpath.h>
#include <net/vnet.h>

#include <net/pfvar.h>
#include <net/if_pflog.h>
#include <net/if_pfsync.h>

#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_fw.h>
#include <netinet/ip_icmp.h>
#include <netinet/icmp_var.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>

#include <netpfil/ipfw/ip_fw_private.h> /* XXX: only for DIR_IN/DIR_OUT */

#ifdef INET6
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#endif /* INET6 */

#include <machine/in_cksum.h>
#include <security/mac/mac_framework.h>

#define	DPFPRINTF(n, x)	if (V_pf_status.debug >= (n)) printf x

/*
 * Global variables
 */

/* state tables */
VNET_DEFINE(struct pf_altqqueue,	 pf_altqs[2]);
VNET_DEFINE(struct pf_palist,		 pf_pabuf);
VNET_DEFINE(struct pf_altqqueue *,	 pf_altqs_active);
VNET_DEFINE(struct pf_altqqueue *,	 pf_altqs_inactive);
VNET_DEFINE(struct pf_kstatus,		 pf_status);

VNET_DEFINE(u_int32_t,			 ticket_altqs_active);
VNET_DEFINE(u_int32_t,			 ticket_altqs_inactive);
VNET_DEFINE(int,			 altqs_inactive_open);
VNET_DEFINE(u_int32_t,			 ticket_pabuf);

VNET_DEFINE(MD5_CTX,			 pf_tcp_secret_ctx);
#define	V_pf_tcp_secret_ctx		 VNET(pf_tcp_secret_ctx)
VNET_DEFINE(u_char,			 pf_tcp_secret[16]);
#define	V_pf_tcp_secret			 VNET(pf_tcp_secret)
VNET_DEFINE(int,			 pf_tcp_secret_init);
#define	V_pf_tcp_secret_init		 VNET(pf_tcp_secret_init)
VNET_DEFINE(int,			 pf_tcp_iss_off);
#define	V_pf_tcp_iss_off		 VNET(pf_tcp_iss_off)

/*
 * Queue for pf_intr() sends.
 */
static MALLOC_DEFINE(M_PFTEMP, "pf_temp", "pf(4) temporary allocations");
struct pf_send_entry {
	STAILQ_ENTRY(pf_send_entry)	pfse_next;
	struct mbuf			*pfse_m;
	enum {
		PFSE_IP,
		PFSE_IP6,
		PFSE_ICMP,
		PFSE_ICMP6,
	}				pfse_type;
	union {
		struct route		ro;
		struct {
			int		type;
			int		code;
			int		mtu;
		} icmpopts;
	} u;
#define	pfse_ro		u.ro
#define	pfse_icmp_type	u.icmpopts.type
#define	pfse_icmp_code	u.icmpopts.code
#define	pfse_icmp_mtu	u.icmpopts.mtu
};

STAILQ_HEAD(pf_send_head, pf_send_entry);
static VNET_DEFINE(struct pf_send_head, pf_sendqueue);
#define	V_pf_sendqueue	VNET(pf_sendqueue)

static struct mtx pf_sendqueue_mtx;
#define	PF_SENDQ_LOCK()		mtx_lock(&pf_sendqueue_mtx)
#define	PF_SENDQ_UNLOCK()	mtx_unlock(&pf_sendqueue_mtx)

/*
 * Queue for pf_overload_task() tasks.
 */
struct pf_overload_entry {
	SLIST_ENTRY(pf_overload_entry)	next;
	struct pf_addr  		addr;
	sa_family_t			af;
	uint8_t				dir;
	struct pf_rule  		*rule;
};

SLIST_HEAD(pf_overload_head, pf_overload_entry);
static VNET_DEFINE(struct pf_overload_head, pf_overloadqueue);
#define V_pf_overloadqueue	VNET(pf_overloadqueue)
static VNET_DEFINE(struct task, pf_overloadtask);
#define	V_pf_overloadtask	VNET(pf_overloadtask)

static struct mtx pf_overloadqueue_mtx;
#define	PF_OVERLOADQ_LOCK()	mtx_lock(&pf_overloadqueue_mtx)
#define	PF_OVERLOADQ_UNLOCK()	mtx_unlock(&pf_overloadqueue_mtx)

VNET_DEFINE(struct pf_rulequeue, pf_unlinked_rules);
struct mtx pf_unlnkdrules_mtx;

static VNET_DEFINE(uma_zone_t,	pf_sources_z);
#define	V_pf_sources_z	VNET(pf_sources_z)
uma_zone_t		pf_mtag_z;
VNET_DEFINE(uma_zone_t,	 pf_state_z);
VNET_DEFINE(uma_zone_t,	 pf_state_key_z);

VNET_DEFINE(uint64_t, pf_stateid[MAXCPU]);
#define	PFID_CPUBITS	8
#define	PFID_CPUSHIFT	(sizeof(uint64_t) * NBBY - PFID_CPUBITS)
#define	PFID_CPUMASK	((uint64_t)((1 << PFID_CPUBITS) - 1) <<	PFID_CPUSHIFT)
#define	PFID_MAXID	(~PFID_CPUMASK)
CTASSERT((1 << PFID_CPUBITS) > MAXCPU);
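
/*
 * Illustrative note (not part of the original source): with
 * PFID_CPUBITS == 8 the 64-bit state ID is laid out as
 *
 *	[ bits 63..56: CPU number | bits 55..0: per-CPU counter ]
 *
 * so an ID with counter value 42 allocated on CPU 3 is built the way
 * pf_state_insert() below does it:
 *
 *	uint64_t id = 42;
 *	id |= (uint64_t)3 << PFID_CPUSHIFT;
 *
 * The CTASSERT above guarantees that every possible CPU number fits
 * into PFID_CPUBITS bits.
 */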

static void		 pf_src_tree_remove_state(struct pf_state *);
static void		 pf_init_threshold(struct pf_threshold *, u_int32_t,
			    u_int32_t);
static void		 pf_add_threshold(struct pf_threshold *);
static int		 pf_check_threshold(struct pf_threshold *);

static void		 pf_change_ap(struct pf_addr *, u_int16_t *,
			    u_int16_t *, u_int16_t *, struct pf_addr *,
			    u_int16_t, u_int8_t, sa_family_t);
static int		 pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *,
			    struct tcphdr *, struct pf_state_peer *);
static void		 pf_change_icmp(struct pf_addr *, u_int16_t *,
			    struct pf_addr *, struct pf_addr *, u_int16_t,
			    u_int16_t *, u_int16_t *, u_int16_t *,
			    u_int16_t *, u_int8_t, sa_family_t);
static void		 pf_send_tcp(struct mbuf *,
			    const struct pf_rule *, sa_family_t,
			    const struct pf_addr *, const struct pf_addr *,
			    u_int16_t, u_int16_t, u_int32_t, u_int32_t,
			    u_int8_t, u_int16_t, u_int16_t, u_int8_t, int,
			    u_int16_t, struct ifnet *);
static void		 pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t,
			    sa_family_t, struct pf_rule *);
static void		 pf_detach_state(struct pf_state *);
static int		 pf_state_key_attach(struct pf_state_key *,
			    struct pf_state_key *, struct pf_state *);
static void		 pf_state_key_detach(struct pf_state *, int);
static int		 pf_state_key_ctor(void *, int, void *, int);
static u_int32_t	 pf_tcp_iss(struct pf_pdesc *);
static int		 pf_test_rule(struct pf_rule **, struct pf_state **,
			    int, struct pfi_kif *, struct mbuf *, int,
			    struct pf_pdesc *, struct pf_rule **,
			    struct pf_ruleset **, struct inpcb *);
static int		 pf_create_state(struct pf_rule *, struct pf_rule *,
			    struct pf_rule *, struct pf_pdesc *,
			    struct pf_src_node *, struct pf_state_key *,
			    struct pf_state_key *, struct mbuf *, int,
			    u_int16_t, u_int16_t, int *, struct pfi_kif *,
			    struct pf_state **, int, u_int16_t, u_int16_t,
			    int);
static int		 pf_test_fragment(struct pf_rule **, int,
			    struct pfi_kif *, struct mbuf *, void *,
			    struct pf_pdesc *, struct pf_rule **,
			    struct pf_ruleset **);
static int		 pf_tcp_track_full(struct pf_state_peer *,
			    struct pf_state_peer *, struct pf_state **,
			    struct pfi_kif *, struct mbuf *, int,
			    struct pf_pdesc *, u_short *, int *);
static int		 pf_tcp_track_sloppy(struct pf_state_peer *,
			    struct pf_state_peer *, struct pf_state **,
			    struct pf_pdesc *, u_short *);
static int		 pf_test_state_tcp(struct pf_state **, int,
			    struct pfi_kif *, struct mbuf *, int,
			    void *, struct pf_pdesc *, u_short *);
static int		 pf_test_state_udp(struct pf_state **, int,
			    struct pfi_kif *, struct mbuf *, int,
			    void *, struct pf_pdesc *);
static int		 pf_test_state_icmp(struct pf_state **, int,
			    struct pfi_kif *, struct mbuf *, int,
			    void *, struct pf_pdesc *, u_short *);
static int		 pf_test_state_other(struct pf_state **, int,
			    struct pfi_kif *, struct mbuf *, struct pf_pdesc *);
static u_int8_t		 pf_get_wscale(struct mbuf *, int, u_int16_t,
			    sa_family_t);
static u_int16_t	 pf_get_mss(struct mbuf *, int, u_int16_t,
			    sa_family_t);
static u_int16_t	 pf_calc_mss(struct pf_addr *, sa_family_t,
				int, u_int16_t);
static void		 pf_set_rt_ifp(struct pf_state *,
			    struct pf_addr *);
static int		 pf_check_proto_cksum(struct mbuf *, int, int,
			    u_int8_t, sa_family_t);
static void		 pf_print_state_parts(struct pf_state *,
			    struct pf_state_key *, struct pf_state_key *);
static int		 pf_addr_wrap_neq(struct pf_addr_wrap *,
			    struct pf_addr_wrap *);
static struct pf_state	*pf_find_state(struct pfi_kif *,
			    struct pf_state_key_cmp *, u_int);
static int		 pf_src_connlimit(struct pf_state **);
static void		 pf_overload_task(void *v, int pending);
static int		 pf_insert_src_node(struct pf_src_node **,
			    struct pf_rule *, struct pf_addr *, sa_family_t);
static u_int		 pf_purge_expired_states(u_int, int);
static void		 pf_purge_unlinked_rules(void);
static int		 pf_mtag_uminit(void *, int, int);
static void		 pf_mtag_free(struct m_tag *);
#ifdef INET
static void		 pf_route(struct mbuf **, struct pf_rule *, int,
			    struct ifnet *, struct pf_state *,
			    struct pf_pdesc *);
#endif /* INET */
#ifdef INET6
static void		 pf_change_a6(struct pf_addr *, u_int16_t *,
			    struct pf_addr *, u_int8_t);
static void		 pf_route6(struct mbuf **, struct pf_rule *, int,
			    struct ifnet *, struct pf_state *,
			    struct pf_pdesc *);
#endif /* INET6 */

int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len);

VNET_DECLARE(int, pf_end_threads);

VNET_DEFINE(struct pf_limit, pf_limits[PF_LIMIT_MAX]);

#define	PACKET_LOOPED(pd)	((pd)->pf_mtag &&			\
				 (pd)->pf_mtag->flags & PF_PACKET_LOOPED)

#define	STATE_LOOKUP(i, k, d, s, pd)					\
	do {								\
		(s) = pf_find_state((i), (k), (d));			\
		if ((s) == NULL)					\
			return (PF_DROP);				\
		if (PACKET_LOOPED(pd))					\
			return (PF_PASS);				\
		if ((d) == PF_OUT &&					\
		    (((s)->rule.ptr->rt == PF_ROUTETO &&		\
		    (s)->rule.ptr->direction == PF_OUT) ||		\
		    ((s)->rule.ptr->rt == PF_REPLYTO &&			\
		    (s)->rule.ptr->direction == PF_IN)) &&		\
		    (s)->rt_kif != NULL &&				\
		    (s)->rt_kif != (i))					\
			return (PF_PASS);				\
	} while (0)

#define	BOUND_IFACE(r, k) \
	((r)->rule_flag & PFRULE_IFBOUND) ? (k) : V_pfi_all

#define	STATE_INC_COUNTERS(s)						\
	do {								\
		counter_u64_add(s->rule.ptr->states_cur, 1);		\
		counter_u64_add(s->rule.ptr->states_tot, 1);		\
		if (s->anchor.ptr != NULL) {				\
			counter_u64_add(s->anchor.ptr->states_cur, 1);	\
			counter_u64_add(s->anchor.ptr->states_tot, 1);	\
		}							\
		if (s->nat_rule.ptr != NULL) {				\
			counter_u64_add(s->nat_rule.ptr->states_cur, 1);\
			counter_u64_add(s->nat_rule.ptr->states_tot, 1);\
		}							\
	} while (0)

#define	STATE_DEC_COUNTERS(s)						\
	do {								\
		if (s->nat_rule.ptr != NULL)				\
			counter_u64_add(s->nat_rule.ptr->states_cur, -1);\
		if (s->anchor.ptr != NULL)				\
			counter_u64_add(s->anchor.ptr->states_cur, -1);	\
		counter_u64_add(s->rule.ptr->states_cur, -1);		\
	} while (0)

static MALLOC_DEFINE(M_PFHASH, "pf_hash", "pf(4) hash header structures");
VNET_DEFINE(struct pf_keyhash *, pf_keyhash);
VNET_DEFINE(struct pf_idhash *, pf_idhash);
VNET_DEFINE(u_long, pf_hashmask);
VNET_DEFINE(struct pf_srchash *, pf_srchash);
VNET_DEFINE(u_long, pf_srchashmask);

SYSCTL_NODE(_net, OID_AUTO, pf, CTLFLAG_RW, 0, "pf(4)");

VNET_DEFINE(u_long, pf_hashsize);
#define	V_pf_hashsize	VNET(pf_hashsize)
SYSCTL_VNET_UINT(_net_pf, OID_AUTO, states_hashsize, CTLFLAG_RDTUN,
    &VNET_NAME(pf_hashsize), 0, "Size of pf(4) states hashtable");

VNET_DEFINE(u_long, pf_srchashsize);
#define	V_pf_srchashsize	VNET(pf_srchashsize)
SYSCTL_VNET_UINT(_net_pf, OID_AUTO, source_nodes_hashsize, CTLFLAG_RDTUN,
    &VNET_NAME(pf_srchashsize), 0, "Size of pf(4) source nodes hashtable");

VNET_DEFINE(void *, pf_swi_cookie);

VNET_DEFINE(uint32_t, pf_hashseed);
#define	V_pf_hashseed	VNET(pf_hashseed)

static __inline uint32_t
pf_hashkey(struct pf_state_key *sk)
{
	uint32_t h;

	h = jenkins_hash32((uint32_t *)sk,
	    sizeof(struct pf_state_key_cmp)/sizeof(uint32_t),
	    V_pf_hashseed);

	return (h & V_pf_hashmask);
}
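
/*
 * Usage sketch (illustration only, not from the original source): only
 * the pf_state_key_cmp prefix of the key is hashed, so identical
 * {af, proto, addrs, ports} tuples always select the same row:
 *
 *	struct pf_keyhash *kh = &V_pf_keyhash[pf_hashkey(sk)];
 *
 * V_pf_hashseed is randomized in pf_initialize(), which keeps row
 * placement unpredictable across reboots.
 */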

static __inline uint32_t
pf_hashsrc(struct pf_addr *addr, sa_family_t af)
{
	uint32_t h;

	switch (af) {
	case AF_INET:
		h = jenkins_hash32((uint32_t *)&addr->v4,
		    sizeof(addr->v4)/sizeof(uint32_t), V_pf_hashseed);
		break;
	case AF_INET6:
		h = jenkins_hash32((uint32_t *)&addr->v6,
		    sizeof(addr->v6)/sizeof(uint32_t), V_pf_hashseed);
		break;
	default:
		panic("%s: unknown address family %u", __func__, af);
	}

	return (h & V_pf_srchashmask);
}

#ifdef INET6
void
pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af)
{
	switch (af) {
#ifdef INET
	case AF_INET:
		dst->addr32[0] = src->addr32[0];
		break;
#endif /* INET */
	case AF_INET6:
		dst->addr32[0] = src->addr32[0];
		dst->addr32[1] = src->addr32[1];
		dst->addr32[2] = src->addr32[2];
		dst->addr32[3] = src->addr32[3];
		break;
	}
}
#endif /* INET6 */

static void
pf_init_threshold(struct pf_threshold *threshold,
    u_int32_t limit, u_int32_t seconds)
{
	threshold->limit = limit * PF_THRESHOLD_MULT;
	threshold->seconds = seconds;
	threshold->count = 0;
	threshold->last = time_uptime;
}

static void
pf_add_threshold(struct pf_threshold *threshold)
{
	u_int32_t t = time_uptime, diff = t - threshold->last;

	if (diff >= threshold->seconds)
		threshold->count = 0;
	else
		threshold->count -= threshold->count * diff /
		    threshold->seconds;
	threshold->count += PF_THRESHOLD_MULT;
	threshold->last = t;
}

static int
pf_check_threshold(struct pf_threshold *threshold)
{
	return (threshold->count > threshold->limit);
}
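
/*
 * Worked example (illustration only, not from the original source):
 * with limit = 10 and seconds = 5, pf_init_threshold() stores
 * threshold->limit = 10 * PF_THRESHOLD_MULT.  Each pf_add_threshold()
 * call first decays count in proportion to the time elapsed since the
 * last update (5 or more quiet seconds reset it to zero) and then adds
 * one PF_THRESHOLD_MULT unit.  pf_check_threshold() therefore reports
 * an overlimit source once more than roughly 10 connections arrive
 * within a sliding 5 second window.
 */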

static int
pf_src_connlimit(struct pf_state **state)
{
	struct pf_overload_entry *pfoe;
	int bad = 0;

	PF_STATE_LOCK_ASSERT(*state);

	(*state)->src_node->conn++;
	(*state)->src.tcp_est = 1;
	pf_add_threshold(&(*state)->src_node->conn_rate);

	if ((*state)->rule.ptr->max_src_conn &&
	    (*state)->rule.ptr->max_src_conn <
	    (*state)->src_node->conn) {
		counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONN], 1);
		bad++;
	}

	if ((*state)->rule.ptr->max_src_conn_rate.limit &&
	    pf_check_threshold(&(*state)->src_node->conn_rate)) {
		counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONNRATE], 1);
		bad++;
	}

	if (!bad)
		return (0);

	/* Kill this state. */
	(*state)->timeout = PFTM_PURGE;
	(*state)->src.state = (*state)->dst.state = TCPS_CLOSED;

	if ((*state)->rule.ptr->overload_tbl == NULL)
		return (1);

	/* Schedule overloading and flushing task. */
	pfoe = malloc(sizeof(*pfoe), M_PFTEMP, M_NOWAIT);
	if (pfoe == NULL)
		return (1);	/* too bad :( */

	bcopy(&(*state)->src_node->addr, &pfoe->addr, sizeof(pfoe->addr));
	pfoe->af = (*state)->key[PF_SK_WIRE]->af;
	pfoe->rule = (*state)->rule.ptr;
	pfoe->dir = (*state)->direction;
	PF_OVERLOADQ_LOCK();
	SLIST_INSERT_HEAD(&V_pf_overloadqueue, pfoe, next);
	PF_OVERLOADQ_UNLOCK();
	taskqueue_enqueue(taskqueue_swi, &V_pf_overloadtask);

	return (1);
}

static void
pf_overload_task(void *v, int pending)
{
	struct pf_overload_head queue;
	struct pfr_addr p;
	struct pf_overload_entry *pfoe, *pfoe1;
	uint32_t killed = 0;

	CURVNET_SET((struct vnet *)v);

	PF_OVERLOADQ_LOCK();
	queue = V_pf_overloadqueue;
	SLIST_INIT(&V_pf_overloadqueue);
	PF_OVERLOADQ_UNLOCK();

	bzero(&p, sizeof(p));
	SLIST_FOREACH(pfoe, &queue, next) {
		counter_u64_add(V_pf_status.lcounters[LCNT_OVERLOAD_TABLE], 1);
		if (V_pf_status.debug >= PF_DEBUG_MISC) {
			printf("%s: blocking address ", __func__);
			pf_print_host(&pfoe->addr, 0, pfoe->af);
			printf("\n");
		}

		p.pfra_af = pfoe->af;
		switch (pfoe->af) {
#ifdef INET
		case AF_INET:
			p.pfra_net = 32;
			p.pfra_ip4addr = pfoe->addr.v4;
			break;
#endif
#ifdef INET6
		case AF_INET6:
			p.pfra_net = 128;
			p.pfra_ip6addr = pfoe->addr.v6;
			break;
#endif
		}

		PF_RULES_WLOCK();
		pfr_insert_kentry(pfoe->rule->overload_tbl, &p, time_second);
		PF_RULES_WUNLOCK();
	}

	/*
	 * Remove those entries that don't need flushing.
	 */
	SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1)
		if (pfoe->rule->flush == 0) {
			SLIST_REMOVE(&queue, pfoe, pf_overload_entry, next);
			free(pfoe, M_PFTEMP);
		} else
			counter_u64_add(
			    V_pf_status.lcounters[LCNT_OVERLOAD_FLUSH], 1);

	/* If nothing to flush, return. */
	if (SLIST_EMPTY(&queue)) {
		CURVNET_RESTORE();
		return;
	}

	for (int i = 0; i <= V_pf_hashmask; i++) {
		struct pf_idhash *ih = &V_pf_idhash[i];
		struct pf_state_key *sk;
		struct pf_state *s;

		PF_HASHROW_LOCK(ih);
		LIST_FOREACH(s, &ih->states, entry) {
		    sk = s->key[PF_SK_WIRE];
		    SLIST_FOREACH(pfoe, &queue, next)
			if (sk->af == pfoe->af &&
			    ((pfoe->rule->flush & PF_FLUSH_GLOBAL) ||
			    pfoe->rule == s->rule.ptr) &&
			    ((pfoe->dir == PF_OUT &&
			    PF_AEQ(&pfoe->addr, &sk->addr[1], sk->af)) ||
			    (pfoe->dir == PF_IN &&
			    PF_AEQ(&pfoe->addr, &sk->addr[0], sk->af)))) {
				s->timeout = PFTM_PURGE;
				s->src.state = s->dst.state = TCPS_CLOSED;
				killed++;
			}
		}
		PF_HASHROW_UNLOCK(ih);
	}
	SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1)
		free(pfoe, M_PFTEMP);
	if (V_pf_status.debug >= PF_DEBUG_MISC)
		printf("%s: %u states killed\n", __func__, killed);

	CURVNET_RESTORE();
}

/*
 * On failure (no matching node found) we can return with the hash row
 * locked, so that the caller can consistently allocate and insert a
 * new source node.
 */
struct pf_src_node *
pf_find_src_node(struct pf_addr *src, struct pf_rule *rule, sa_family_t af,
	int returnlocked)
{
	struct pf_srchash *sh;
	struct pf_src_node *n;

	counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_SEARCH], 1);

	sh = &V_pf_srchash[pf_hashsrc(src, af)];
	PF_HASHROW_LOCK(sh);
	LIST_FOREACH(n, &sh->nodes, entry)
		if (n->rule.ptr == rule && n->af == af &&
		    ((af == AF_INET && n->addr.v4.s_addr == src->v4.s_addr) ||
		    (af == AF_INET6 && bcmp(&n->addr, src, sizeof(*src)) == 0)))
			break;
	if (n != NULL || returnlocked == 0)
		PF_HASHROW_UNLOCK(sh);

	return (n);
}

static int
pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule,
    struct pf_addr *src, sa_family_t af)
{

	KASSERT((rule->rule_flag & PFRULE_RULESRCTRACK ||
	    rule->rpool.opts & PF_POOL_STICKYADDR),
	    ("%s for non-tracking rule %p", __func__, rule));

	if (*sn == NULL)
		*sn = pf_find_src_node(src, rule, af, 1);

	if (*sn == NULL) {
		struct pf_srchash *sh = &V_pf_srchash[pf_hashsrc(src, af)];

		PF_HASHROW_ASSERT(sh);

		if (!rule->max_src_nodes ||
		    counter_u64_fetch(rule->src_nodes) < rule->max_src_nodes)
			(*sn) = uma_zalloc(V_pf_sources_z, M_NOWAIT | M_ZERO);
		else
			counter_u64_add(V_pf_status.lcounters[LCNT_SRCNODES],
			    1);
		if ((*sn) == NULL) {
			PF_HASHROW_UNLOCK(sh);
			return (-1);
		}

		pf_init_threshold(&(*sn)->conn_rate,
		    rule->max_src_conn_rate.limit,
		    rule->max_src_conn_rate.seconds);

		(*sn)->af = af;
		(*sn)->rule.ptr = rule;
		PF_ACPY(&(*sn)->addr, src, af);
		LIST_INSERT_HEAD(&sh->nodes, *sn, entry);
		(*sn)->creation = time_uptime;
		(*sn)->ruletype = rule->action;
		if ((*sn)->rule.ptr != NULL)
			counter_u64_add((*sn)->rule.ptr->src_nodes, 1);
		PF_HASHROW_UNLOCK(sh);
		counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_INSERT], 1);
	} else {
		if (rule->max_src_states &&
		    (*sn)->states >= rule->max_src_states) {
			counter_u64_add(V_pf_status.lcounters[LCNT_SRCSTATES],
			    1);
			return (-1);
		}
	}
	return (0);
}

void
pf_unlink_src_node_locked(struct pf_src_node *src)
{
#ifdef INVARIANTS
	struct pf_srchash *sh;

	sh = &V_pf_srchash[pf_hashsrc(&src->addr, src->af)];
	PF_HASHROW_ASSERT(sh);
#endif
	LIST_REMOVE(src, entry);
	if (src->rule.ptr)
		counter_u64_add(src->rule.ptr->src_nodes, -1);
	counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1);
}

void
pf_unlink_src_node(struct pf_src_node *src)
{
	struct pf_srchash *sh;

	sh = &V_pf_srchash[pf_hashsrc(&src->addr, src->af)];
	PF_HASHROW_LOCK(sh);
	pf_unlink_src_node_locked(src);
	PF_HASHROW_UNLOCK(sh);
}

static void
pf_free_src_node(struct pf_src_node *sn)
{

	KASSERT(sn->states == 0, ("%s: %p has refs", __func__, sn));
	uma_zfree(V_pf_sources_z, sn);
}

u_int
pf_free_src_nodes(struct pf_src_node_list *head)
{
	struct pf_src_node *sn, *tmp;
	u_int count = 0;

	LIST_FOREACH_SAFE(sn, head, entry, tmp) {
		pf_free_src_node(sn);
		count++;
	}

	return (count);
}

void
pf_mtag_initialize()
{

	pf_mtag_z = uma_zcreate("pf mtags", sizeof(struct m_tag) +
	    sizeof(struct pf_mtag), NULL, NULL, pf_mtag_uminit, NULL,
	    UMA_ALIGN_PTR, 0);
}

/* Per-vnet data storage structures initialization. */
void
pf_initialize()
{
	struct pf_keyhash	*kh;
	struct pf_idhash	*ih;
	struct pf_srchash	*sh;
	u_int i;

	TUNABLE_ULONG_FETCH("net.pf.states_hashsize", &V_pf_hashsize);
	if (V_pf_hashsize == 0 || !powerof2(V_pf_hashsize))
		V_pf_hashsize = PF_HASHSIZ;
	TUNABLE_ULONG_FETCH("net.pf.source_nodes_hashsize", &V_pf_srchashsize);
	if (V_pf_srchashsize == 0 || !powerof2(V_pf_srchashsize))
		V_pf_srchashsize = PF_HASHSIZ / 4;

	V_pf_hashseed = arc4random();

	/* States and state keys storage. */
	V_pf_state_z = uma_zcreate("pf states", sizeof(struct pf_state),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	V_pf_limits[PF_LIMIT_STATES].zone = V_pf_state_z;
	uma_zone_set_max(V_pf_state_z, PFSTATE_HIWAT);
	uma_zone_set_warning(V_pf_state_z, "PF states limit reached");

	V_pf_state_key_z = uma_zcreate("pf state keys",
	    sizeof(struct pf_state_key), pf_state_key_ctor, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	V_pf_keyhash = malloc(V_pf_hashsize * sizeof(struct pf_keyhash),
	    M_PFHASH, M_WAITOK | M_ZERO);
	V_pf_idhash = malloc(V_pf_hashsize * sizeof(struct pf_idhash),
	    M_PFHASH, M_WAITOK | M_ZERO);
	V_pf_hashmask = V_pf_hashsize - 1;
	for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= V_pf_hashmask;
	    i++, kh++, ih++) {
		mtx_init(&kh->lock, "pf_keyhash", NULL, MTX_DEF | MTX_DUPOK);
		mtx_init(&ih->lock, "pf_idhash", NULL, MTX_DEF);
	}

	/* Source nodes. */
	V_pf_sources_z = uma_zcreate("pf source nodes",
	    sizeof(struct pf_src_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
	    0);
	V_pf_limits[PF_LIMIT_SRC_NODES].zone = V_pf_sources_z;
	uma_zone_set_max(V_pf_sources_z, PFSNODE_HIWAT);
	uma_zone_set_warning(V_pf_sources_z, "PF source nodes limit reached");
	V_pf_srchash = malloc(V_pf_srchashsize * sizeof(struct pf_srchash),
	  M_PFHASH, M_WAITOK|M_ZERO);
	V_pf_srchashmask = V_pf_srchashsize - 1;
	for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++)
		mtx_init(&sh->lock, "pf_srchash", NULL, MTX_DEF);

	/* ALTQ */
	TAILQ_INIT(&V_pf_altqs[0]);
	TAILQ_INIT(&V_pf_altqs[1]);
	TAILQ_INIT(&V_pf_pabuf);
	V_pf_altqs_active = &V_pf_altqs[0];
	V_pf_altqs_inactive = &V_pf_altqs[1];

	/* Send & overload+flush queues. */
	STAILQ_INIT(&V_pf_sendqueue);
	SLIST_INIT(&V_pf_overloadqueue);
	TASK_INIT(&V_pf_overloadtask, 0, pf_overload_task, curvnet);
	mtx_init(&pf_sendqueue_mtx, "pf send queue", NULL, MTX_DEF);
	mtx_init(&pf_overloadqueue_mtx, "pf overload/flush queue", NULL,
	    MTX_DEF);

	/* Rules that are unlinked, but may still be referenced. */
	TAILQ_INIT(&V_pf_unlinked_rules);
	mtx_init(&pf_unlnkdrules_mtx, "pf unlinked rules", NULL, MTX_DEF);
}

void
pf_mtag_cleanup()
{

	uma_zdestroy(pf_mtag_z);
}

void
pf_cleanup()
{
	struct pf_keyhash	*kh;
	struct pf_idhash	*ih;
	struct pf_srchash	*sh;
	struct pf_send_entry	*pfse, *next;
	u_int i;

	for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= V_pf_hashmask;
	    i++, kh++, ih++) {
		KASSERT(LIST_EMPTY(&kh->keys), ("%s: key hash not empty",
		    __func__));
		KASSERT(LIST_EMPTY(&ih->states), ("%s: id hash not empty",
		    __func__));
		mtx_destroy(&kh->lock);
		mtx_destroy(&ih->lock);
	}
	free(V_pf_keyhash, M_PFHASH);
	free(V_pf_idhash, M_PFHASH);

	for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++) {
		KASSERT(LIST_EMPTY(&sh->nodes),
		    ("%s: source node hash not empty", __func__));
		mtx_destroy(&sh->lock);
	}
	free(V_pf_srchash, M_PFHASH);

	STAILQ_FOREACH_SAFE(pfse, &V_pf_sendqueue, pfse_next, next) {
		m_freem(pfse->pfse_m);
		free(pfse, M_PFTEMP);
	}

	mtx_destroy(&pf_sendqueue_mtx);
	mtx_destroy(&pf_overloadqueue_mtx);
	mtx_destroy(&pf_unlnkdrules_mtx);

	uma_zdestroy(V_pf_sources_z);
	uma_zdestroy(V_pf_state_z);
	uma_zdestroy(V_pf_state_key_z);
}

static int
pf_mtag_uminit(void *mem, int size, int how)
{
	struct m_tag *t;

	t = (struct m_tag *)mem;
	t->m_tag_cookie = MTAG_ABI_COMPAT;
	t->m_tag_id = PACKET_TAG_PF;
	t->m_tag_len = sizeof(struct pf_mtag);
	t->m_tag_free = pf_mtag_free;

	return (0);
}

static void
pf_mtag_free(struct m_tag *t)
{

	uma_zfree(pf_mtag_z, t);
}

struct pf_mtag *
pf_get_mtag(struct mbuf *m)
{
	struct m_tag *mtag;

	if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) != NULL)
		return ((struct pf_mtag *)(mtag + 1));

	mtag = uma_zalloc(pf_mtag_z, M_NOWAIT);
	if (mtag == NULL)
		return (NULL);
	bzero(mtag + 1, sizeof(struct pf_mtag));
	m_tag_prepend(m, mtag);

	return ((struct pf_mtag *)(mtag + 1));
}

static int
pf_state_key_attach(struct pf_state_key *skw, struct pf_state_key *sks,
    struct pf_state *s)
{
	struct pf_keyhash	*khs, *khw, *kh;
	struct pf_state_key	*sk, *cur;
	struct pf_state		*si, *olds = NULL;
	int idx;

	KASSERT(s->refs == 0, ("%s: state not pristine", __func__));
	KASSERT(s->key[PF_SK_WIRE] == NULL, ("%s: state has key", __func__));
	KASSERT(s->key[PF_SK_STACK] == NULL, ("%s: state has key", __func__));

	/*
	 * We need to lock hash slots of both keys. To avoid deadlock
	 * we always lock the slot with lower address first. Unlock order
	 * isn't important.
	 *
	 * We also need to lock ID hash slot before dropping key
	 * locks. On success we return with ID hash slot locked.
	 */

	if (skw == sks) {
		khs = khw = &V_pf_keyhash[pf_hashkey(skw)];
		PF_HASHROW_LOCK(khs);
	} else {
		khs = &V_pf_keyhash[pf_hashkey(sks)];
		khw = &V_pf_keyhash[pf_hashkey(skw)];
		if (khs == khw) {
			PF_HASHROW_LOCK(khs);
		} else if (khs < khw) {
			PF_HASHROW_LOCK(khs);
			PF_HASHROW_LOCK(khw);
		} else {
			PF_HASHROW_LOCK(khw);
			PF_HASHROW_LOCK(khs);
		}
	}

#define	KEYS_UNLOCK()	do {			\
	if (khs != khw) {			\
		PF_HASHROW_UNLOCK(khs);		\
		PF_HASHROW_UNLOCK(khw);		\
	} else					\
		PF_HASHROW_UNLOCK(khs);		\
} while (0)

	/*
	 * First run: start with wire key.
	 */
	sk = skw;
	kh = khw;
	idx = PF_SK_WIRE;

keyattach:
	LIST_FOREACH(cur, &kh->keys, entry)
		if (bcmp(cur, sk, sizeof(struct pf_state_key_cmp)) == 0)
			break;

	if (cur != NULL) {
		/* Key exists. Check for same kif, if none, add to key. */
		TAILQ_FOREACH(si, &cur->states[idx], key_list[idx]) {
			struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(si)];

			PF_HASHROW_LOCK(ih);
			if (si->kif == s->kif &&
			    si->direction == s->direction) {
				if (sk->proto == IPPROTO_TCP &&
				    si->src.state >= TCPS_FIN_WAIT_2 &&
				    si->dst.state >= TCPS_FIN_WAIT_2) {
					/*
					 * New state matches an old >FIN_WAIT_2
					 * state. We can't drop key hash locks,
					 * thus we can't unlink it properly.
					 *
					 * As a workaround we drop it into
					 * TCPS_CLOSED state, schedule purge
					 * ASAP and push it into the very end
					 * of the slot TAILQ, so that it won't
					 * conflict with our new state.
					 */
					si->src.state = si->dst.state =
					    TCPS_CLOSED;
					si->timeout = PFTM_PURGE;
					olds = si;
				} else {
					if (V_pf_status.debug >= PF_DEBUG_MISC) {
						printf("pf: %s key attach "
						    "failed on %s: ",
						    (idx == PF_SK_WIRE) ?
						    "wire" : "stack",
						    s->kif->pfik_name);
						pf_print_state_parts(s,
						    (idx == PF_SK_WIRE) ?
						    sk : NULL,
						    (idx == PF_SK_STACK) ?
						    sk : NULL);
						printf(", existing: ");
						pf_print_state_parts(si,
						    (idx == PF_SK_WIRE) ?
						    sk : NULL,
						    (idx == PF_SK_STACK) ?
						    sk : NULL);
						printf("\n");
					}
					PF_HASHROW_UNLOCK(ih);
					KEYS_UNLOCK();
					uma_zfree(V_pf_state_key_z, sk);
					if (idx == PF_SK_STACK)
						pf_detach_state(s);
					return (EEXIST); /* collision! */
				}
			}
			PF_HASHROW_UNLOCK(ih);
		}
		uma_zfree(V_pf_state_key_z, sk);
		s->key[idx] = cur;
	} else {
		LIST_INSERT_HEAD(&kh->keys, sk, entry);
		s->key[idx] = sk;
	}

stateattach:
	/* List is sorted, if-bound states before floating. */
	if (s->kif == V_pfi_all)
		TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], s, key_list[idx]);
	else
		TAILQ_INSERT_HEAD(&s->key[idx]->states[idx], s, key_list[idx]);

	if (olds) {
		TAILQ_REMOVE(&s->key[idx]->states[idx], olds, key_list[idx]);
		TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], olds,
		    key_list[idx]);
		olds = NULL;
	}

	/*
	 * Attach is done. Now decide whether (and how) a second
	 * key should be attached.
	 */
	if (sks == skw) {
		s->key[PF_SK_STACK] = s->key[PF_SK_WIRE];
		idx = PF_SK_STACK;
		sks = NULL;
		goto stateattach;
	} else if (sks != NULL) {
		/*
		 * Continue attaching with stack key.
		 */
		sk = sks;
		kh = khs;
		idx = PF_SK_STACK;
		sks = NULL;
		goto keyattach;
	}

	PF_STATE_LOCK(s);
	KEYS_UNLOCK();

	KASSERT(s->key[PF_SK_WIRE] != NULL && s->key[PF_SK_STACK] != NULL,
	    ("%s failure", __func__));

	return (0);
#undef	KEYS_UNLOCK
}

static void
pf_detach_state(struct pf_state *s)
{
	struct pf_state_key *sks = s->key[PF_SK_STACK];
	struct pf_keyhash *kh;

	if (sks != NULL) {
		kh = &V_pf_keyhash[pf_hashkey(sks)];
		PF_HASHROW_LOCK(kh);
		if (s->key[PF_SK_STACK] != NULL)
			pf_state_key_detach(s, PF_SK_STACK);
		/*
		 * If both point to same key, then we are done.
		 */
		if (sks == s->key[PF_SK_WIRE]) {
			pf_state_key_detach(s, PF_SK_WIRE);
			PF_HASHROW_UNLOCK(kh);
			return;
		}
		PF_HASHROW_UNLOCK(kh);
	}

	if (s->key[PF_SK_WIRE] != NULL) {
		kh = &V_pf_keyhash[pf_hashkey(s->key[PF_SK_WIRE])];
		PF_HASHROW_LOCK(kh);
		if (s->key[PF_SK_WIRE] != NULL)
			pf_state_key_detach(s, PF_SK_WIRE);
		PF_HASHROW_UNLOCK(kh);
	}
}

static void
pf_state_key_detach(struct pf_state *s, int idx)
{
	struct pf_state_key *sk = s->key[idx];
#ifdef INVARIANTS
	struct pf_keyhash *kh = &V_pf_keyhash[pf_hashkey(sk)];

	PF_HASHROW_ASSERT(kh);
#endif
	TAILQ_REMOVE(&sk->states[idx], s, key_list[idx]);
	s->key[idx] = NULL;

	if (TAILQ_EMPTY(&sk->states[0]) && TAILQ_EMPTY(&sk->states[1])) {
		LIST_REMOVE(sk, entry);
		uma_zfree(V_pf_state_key_z, sk);
	}
}

static int
pf_state_key_ctor(void *mem, int size, void *arg, int flags)
{
	struct pf_state_key *sk = mem;

	bzero(sk, sizeof(struct pf_state_key_cmp));
	TAILQ_INIT(&sk->states[PF_SK_WIRE]);
	TAILQ_INIT(&sk->states[PF_SK_STACK]);

	return (0);
}

struct pf_state_key *
pf_state_key_setup(struct pf_pdesc *pd, struct pf_addr *saddr,
	struct pf_addr *daddr, u_int16_t sport, u_int16_t dport)
{
	struct pf_state_key *sk;

	sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
	if (sk == NULL)
		return (NULL);

	PF_ACPY(&sk->addr[pd->sidx], saddr, pd->af);
	PF_ACPY(&sk->addr[pd->didx], daddr, pd->af);
	sk->port[pd->sidx] = sport;
	sk->port[pd->didx] = dport;
	sk->proto = pd->proto;
	sk->af = pd->af;

	return (sk);
}

struct pf_state_key *
pf_state_key_clone(struct pf_state_key *orig)
{
	struct pf_state_key *sk;

	sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
	if (sk == NULL)
		return (NULL);

	bcopy(orig, sk, sizeof(struct pf_state_key_cmp));

	return (sk);
}

int
pf_state_insert(struct pfi_kif *kif, struct pf_state_key *skw,
    struct pf_state_key *sks, struct pf_state *s)
{
	struct pf_idhash *ih;
	struct pf_state *cur;
	int error;

	KASSERT(TAILQ_EMPTY(&sks->states[0]) && TAILQ_EMPTY(&sks->states[1]),
	    ("%s: sks not pristine", __func__));
	KASSERT(TAILQ_EMPTY(&skw->states[0]) && TAILQ_EMPTY(&skw->states[1]),
	    ("%s: skw not pristine", __func__));
	KASSERT(s->refs == 0, ("%s: state not pristine", __func__));

	s->kif = kif;

	if (s->id == 0 && s->creatorid == 0) {
		/* XXX: should be atomic, but probability of collision low */
		if ((s->id = V_pf_stateid[curcpu]++) == PFID_MAXID)
			V_pf_stateid[curcpu] = 1;
		s->id |= (uint64_t)curcpu << PFID_CPUSHIFT;
		s->id = htobe64(s->id);
		s->creatorid = V_pf_status.hostid;
	}

	/* Returns with ID locked on success. */
	if ((error = pf_state_key_attach(skw, sks, s)) != 0)
		return (error);

	ih = &V_pf_idhash[PF_IDHASH(s)];
	PF_HASHROW_ASSERT(ih);
	LIST_FOREACH(cur, &ih->states, entry)
		if (cur->id == s->id && cur->creatorid == s->creatorid)
			break;

	if (cur != NULL) {
		PF_HASHROW_UNLOCK(ih);
		if (V_pf_status.debug >= PF_DEBUG_MISC) {
			printf("pf: state ID collision: "
			    "id: %016llx creatorid: %08x\n",
			    (unsigned long long)be64toh(s->id),
			    ntohl(s->creatorid));
		}
		pf_detach_state(s);
		return (EEXIST);
	}
	LIST_INSERT_HEAD(&ih->states, s, entry);
	/* One for keys, one for ID hash. */
	refcount_init(&s->refs, 2);

	counter_u64_add(V_pf_status.fcounters[FCNT_STATE_INSERT], 1);
	if (pfsync_insert_state_ptr != NULL)
		pfsync_insert_state_ptr(s);

	/* Returns locked. */
	return (0);
}

/*
 * Find state by ID: returns with locked row on success.
 */
struct pf_state *
pf_find_state_byid(uint64_t id, uint32_t creatorid)
{
	struct pf_idhash *ih;
	struct pf_state *s;

	counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);

	ih = &V_pf_idhash[(be64toh(id) % (V_pf_hashmask + 1))];

	PF_HASHROW_LOCK(ih);
	LIST_FOREACH(s, &ih->states, entry)
		if (s->id == id && s->creatorid == creatorid)
			break;

	if (s == NULL)
		PF_HASHROW_UNLOCK(ih);

	return (s);
}

/*
 * Find state by key.
 * Returns with ID hash slot locked on success.
 */
static struct pf_state *
pf_find_state(struct pfi_kif *kif, struct pf_state_key_cmp *key, u_int dir)
{
	struct pf_keyhash	*kh;
	struct pf_state_key	*sk;
	struct pf_state		*s;
	int idx;

	counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);

	kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];

	PF_HASHROW_LOCK(kh);
	LIST_FOREACH(sk, &kh->keys, entry)
		if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
			break;
	if (sk == NULL) {
		PF_HASHROW_UNLOCK(kh);
		return (NULL);
	}

	idx = (dir == PF_IN ? PF_SK_WIRE : PF_SK_STACK);

	/* List is sorted, if-bound states before floating ones. */
	TAILQ_FOREACH(s, &sk->states[idx], key_list[idx])
		if (s->kif == V_pfi_all || s->kif == kif) {
			PF_STATE_LOCK(s);
			PF_HASHROW_UNLOCK(kh);
			if (s->timeout >= PFTM_MAX) {
				/*
				 * State is either being processed by
				 * pf_unlink_state() in another thread, or
				 * is scheduled for immediate expiry.
				 */
				PF_STATE_UNLOCK(s);
				return (NULL);
			}
			return (s);
		}
	PF_HASHROW_UNLOCK(kh);

	return (NULL);
}

struct pf_state *
pf_find_state_all(struct pf_state_key_cmp *key, u_int dir, int *more)
{
	struct pf_keyhash	*kh;
	struct pf_state_key	*sk;
	struct pf_state		*s, *ret = NULL;
	int			 idx, inout = 0;

	counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);

	kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];

	PF_HASHROW_LOCK(kh);
	LIST_FOREACH(sk, &kh->keys, entry)
		if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
			break;
	if (sk == NULL) {
		PF_HASHROW_UNLOCK(kh);
		return (NULL);
	}
	switch (dir) {
	case PF_IN:
		idx = PF_SK_WIRE;
		break;
	case PF_OUT:
		idx = PF_SK_STACK;
		break;
	case PF_INOUT:
		idx = PF_SK_WIRE;
		inout = 1;
		break;
	default:
		panic("%s: dir %u", __func__, dir);
	}
second_run:
	TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) {
		if (more == NULL) {
			PF_HASHROW_UNLOCK(kh);
			return (s);
		}

		if (ret)
			(*more)++;
		else
			ret = s;
	}
	if (inout == 1) {
		inout = 0;
		idx = PF_SK_STACK;
		goto second_run;
	}
	PF_HASHROW_UNLOCK(kh);

	return (ret);
}

/* END state table stuff */

static void
pf_send(struct pf_send_entry *pfse)
{

	PF_SENDQ_LOCK();
	STAILQ_INSERT_TAIL(&V_pf_sendqueue, pfse, pfse_next);
	PF_SENDQ_UNLOCK();
	swi_sched(V_pf_swi_cookie, 0);
}

void
pf_intr(void *v)
{
	struct pf_send_head queue;
	struct pf_send_entry *pfse, *next;

	CURVNET_SET((struct vnet *)v);

	PF_SENDQ_LOCK();
	queue = V_pf_sendqueue;
	STAILQ_INIT(&V_pf_sendqueue);
	PF_SENDQ_UNLOCK();

	STAILQ_FOREACH_SAFE(pfse, &queue, pfse_next, next) {
		switch (pfse->pfse_type) {
#ifdef INET
		case PFSE_IP:
			ip_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL);
			break;
		case PFSE_ICMP:
			icmp_error(pfse->pfse_m, pfse->pfse_icmp_type,
			    pfse->pfse_icmp_code, 0, pfse->pfse_icmp_mtu);
			break;
#endif /* INET */
#ifdef INET6
		case PFSE_IP6:
			ip6_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL,
			    NULL);
			break;
		case PFSE_ICMP6:
			icmp6_error(pfse->pfse_m, pfse->pfse_icmp_type,
			    pfse->pfse_icmp_code, pfse->pfse_icmp_mtu);
			break;
#endif /* INET6 */
		default:
			panic("%s: unknown type", __func__);
		}
		free(pfse, M_PFTEMP);
	}
	CURVNET_RESTORE();
}

void
pf_purge_thread(void *v)
{
	u_int idx = 0;

	CURVNET_SET((struct vnet *)v);

	for (;;) {
		PF_RULES_RLOCK();
		rw_sleep(pf_purge_thread, &pf_rules_lock, 0, "pftm", hz / 10);

		if (V_pf_end_threads) {
			/*
			 * To clean up all kifs and rules we need
			 * two runs: the first one clears reference
			 * flags, so that pf_purge_expired_states()
			 * doesn't raise them again, and the second
			 * run frees.
			 */
			PF_RULES_RUNLOCK();
			pf_purge_unlinked_rules();
			pfi_kif_purge();

			/*
			 * Now purge everything.
			 */
			pf_purge_expired_states(0, V_pf_hashmask);
			pf_purge_expired_fragments();
			pf_purge_expired_src_nodes();

			/*
			 * Now all kifs & rules should be unreferenced,
			 * thus should be successfully freed.
			 */
			pf_purge_unlinked_rules();
			pfi_kif_purge();

			/*
			 * Announce success and exit.
			 */
			PF_RULES_RLOCK();
			V_pf_end_threads++;
			PF_RULES_RUNLOCK();
			wakeup(pf_purge_thread);
			kproc_exit(0);
		}
		PF_RULES_RUNLOCK();

		/* Process 1/interval fraction of the state table every run. */
		idx = pf_purge_expired_states(idx, V_pf_hashmask /
			    (V_pf_default_rule.timeout[PFTM_INTERVAL] * 10));

		/* Purge other expired types every PFTM_INTERVAL seconds. */
		if (idx == 0) {
			/*
			 * Order is important:
			 * - states and src nodes reference rules
			 * - states and rules reference kifs
			 */
			pf_purge_expired_fragments();
			pf_purge_expired_src_nodes();
			pf_purge_unlinked_rules();
			pfi_kif_purge();
		}
	}
	/* not reached */
	CURVNET_RESTORE();
}

u_int32_t
pf_state_expires(const struct pf_state *state)
{
	u_int32_t	timeout;
	u_int32_t	start;
	u_int32_t	end;
	u_int32_t	states;

	/* handle all PFTM_* > PFTM_MAX here */
	if (state->timeout == PFTM_PURGE)
		return (time_uptime);
	KASSERT(state->timeout != PFTM_UNLINKED,
	    ("pf_state_expires: timeout == PFTM_UNLINKED"));
	KASSERT((state->timeout < PFTM_MAX),
	    ("pf_state_expires: timeout > PFTM_MAX"));
	timeout = state->rule.ptr->timeout[state->timeout];
	if (!timeout)
		timeout = V_pf_default_rule.timeout[state->timeout];
	start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START];
	if (start) {
		end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END];
		states = counter_u64_fetch(state->rule.ptr->states_cur);
	} else {
		start = V_pf_default_rule.timeout[PFTM_ADAPTIVE_START];
		end = V_pf_default_rule.timeout[PFTM_ADAPTIVE_END];
		states = V_pf_status.states;
	}
	if (end && states > start && start < end) {
		if (states < end)
			return (state->expire + timeout * (end - states) /
			    (end - start));
		else
			return (time_uptime);
	}
	return (state->expire + timeout);
}
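
/*
 * Worked example for the adaptive scaling above (illustration only,
 * not from the original source): assume timeout = 60, adaptive start
 * = 6000 and adaptive end = 12000.  With 9000 states the effective
 * timeout becomes 60 * (12000 - 9000) / (12000 - 6000) = 30 seconds;
 * at 12000 or more states the state expires immediately, and below
 * 6000 states the full 60 second timeout applies.
 */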

void
pf_purge_expired_src_nodes()
{
	struct pf_src_node_list	 freelist;
	struct pf_srchash	*sh;
	struct pf_src_node	*cur, *next;
	int i;

	LIST_INIT(&freelist);
	for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++) {
	    PF_HASHROW_LOCK(sh);
	    LIST_FOREACH_SAFE(cur, &sh->nodes, entry, next)
		if (cur->states == 0 && cur->expire <= time_uptime) {
			pf_unlink_src_node_locked(cur);
			LIST_INSERT_HEAD(&freelist, cur, entry);
		} else if (cur->rule.ptr != NULL)
			cur->rule.ptr->rule_flag |= PFRULE_REFS;
	    PF_HASHROW_UNLOCK(sh);
	}

	pf_free_src_nodes(&freelist);

	V_pf_status.src_nodes = uma_zone_get_cur(V_pf_sources_z);
}

static void
pf_src_tree_remove_state(struct pf_state *s)
{
	u_int32_t timeout;

	if (s->src_node != NULL) {
		if (s->src.tcp_est)
			--s->src_node->conn;
		if (--s->src_node->states == 0) {
			timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
			if (!timeout)
				timeout =
				    V_pf_default_rule.timeout[PFTM_SRC_NODE];
			s->src_node->expire = time_uptime + timeout;
		}
	}
	if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) {
		if (--s->nat_src_node->states == 0) {
			timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
			if (!timeout)
				timeout =
				    V_pf_default_rule.timeout[PFTM_SRC_NODE];
			s->nat_src_node->expire = time_uptime + timeout;
		}
	}
	s->src_node = s->nat_src_node = NULL;
}

/*
 * Unlink and potentially free a state. Function may be
 * called with ID hash row locked, but always returns
 * unlocked, since it needs to go through key hash locking.
 */
int
pf_unlink_state(struct pf_state *s, u_int flags)
{
	struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(s)];

	if ((flags & PF_ENTER_LOCKED) == 0)
		PF_HASHROW_LOCK(ih);
	else
		PF_HASHROW_ASSERT(ih);

	if (s->timeout == PFTM_UNLINKED) {
		/*
		 * State is being processed
		 * by pf_unlink_state() in
		 * another thread.
		 */
		PF_HASHROW_UNLOCK(ih);
		return (0);	/* XXXGL: undefined actually */
	}

	if (s->src.state == PF_TCPS_PROXY_DST) {
		/* XXX wire key the right one? */
		pf_send_tcp(NULL, s->rule.ptr, s->key[PF_SK_WIRE]->af,
		    &s->key[PF_SK_WIRE]->addr[1],
		    &s->key[PF_SK_WIRE]->addr[0],
		    s->key[PF_SK_WIRE]->port[1],
		    s->key[PF_SK_WIRE]->port[0],
		    s->src.seqhi, s->src.seqlo + 1,
		    TH_RST|TH_ACK, 0, 0, 0, 1, s->tag, NULL);
	}

	LIST_REMOVE(s, entry);
	pf_src_tree_remove_state(s);

	if (pfsync_delete_state_ptr != NULL)
		pfsync_delete_state_ptr(s);

	STATE_DEC_COUNTERS(s);

	s->timeout = PFTM_UNLINKED;

	PF_HASHROW_UNLOCK(ih);

	pf_detach_state(s);
	refcount_release(&s->refs);

	return (pf_release_state(s));
}

void
pf_free_state(struct pf_state *cur)
{

	KASSERT(cur->refs == 0, ("%s: %p has refs", __func__, cur));
	KASSERT(cur->timeout == PFTM_UNLINKED, ("%s: timeout %u", __func__,
	    cur->timeout));

	pf_normalize_tcp_cleanup(cur);
	uma_zfree(V_pf_state_z, cur);
	counter_u64_add(V_pf_status.fcounters[FCNT_STATE_REMOVALS], 1);
}

/*
 * Called only from pf_purge_thread(), thus serialized.
 */
static u_int
pf_purge_expired_states(u_int i, int maxcheck)
{
	struct pf_idhash *ih;
	struct pf_state *s;

	V_pf_status.states = uma_zone_get_cur(V_pf_state_z);

	/*
	 * Go through hash and unlink states that expire now.
	 */
	while (maxcheck > 0) {

		ih = &V_pf_idhash[i];
relock:
		PF_HASHROW_LOCK(ih);
		LIST_FOREACH(s, &ih->states, entry) {
			if (pf_state_expires(s) <= time_uptime) {
				V_pf_status.states -=
				    pf_unlink_state(s, PF_ENTER_LOCKED);
				goto relock;
			}
			s->rule.ptr->rule_flag |= PFRULE_REFS;
			if (s->nat_rule.ptr != NULL)
				s->nat_rule.ptr->rule_flag |= PFRULE_REFS;
			if (s->anchor.ptr != NULL)
				s->anchor.ptr->rule_flag |= PFRULE_REFS;
			s->kif->pfik_flags |= PFI_IFLAG_REFS;
			if (s->rt_kif)
				s->rt_kif->pfik_flags |= PFI_IFLAG_REFS;
		}
		PF_HASHROW_UNLOCK(ih);

		/* Return when we hit end of hash. */
		if (++i > V_pf_hashmask) {
			V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
			return (0);
		}

		maxcheck--;
	}

	V_pf_status.states = uma_zone_get_cur(V_pf_state_z);

	return (i);
}

static void
pf_purge_unlinked_rules()
{
	struct pf_rulequeue tmpq;
	struct pf_rule *r, *r1;

	/*
	 * If we have an overload task pending, then we'd
	 * better skip purging this time. There is a tiny
	 * probability that the overload task references
	 * an already unlinked rule.
	 */
	PF_OVERLOADQ_LOCK();
	if (!SLIST_EMPTY(&V_pf_overloadqueue)) {
		PF_OVERLOADQ_UNLOCK();
		return;
	}
	PF_OVERLOADQ_UNLOCK();

	/*
	 * Do naive mark-and-sweep garbage collecting of old rules.
	 * Reference flag is raised by pf_purge_expired_states()
	 * and pf_purge_expired_src_nodes().
	 *
	 * To avoid LOR between PF_UNLNKDRULES_LOCK/PF_RULES_WLOCK,
	 * use a temporary queue.
	 */
	TAILQ_INIT(&tmpq);
	PF_UNLNKDRULES_LOCK();
	TAILQ_FOREACH_SAFE(r, &V_pf_unlinked_rules, entries, r1) {
		if (!(r->rule_flag & PFRULE_REFS)) {
			TAILQ_REMOVE(&V_pf_unlinked_rules, r, entries);
			TAILQ_INSERT_TAIL(&tmpq, r, entries);
		} else
			r->rule_flag &= ~PFRULE_REFS;
	}
	PF_UNLNKDRULES_UNLOCK();

	if (!TAILQ_EMPTY(&tmpq)) {
		PF_RULES_WLOCK();
		TAILQ_FOREACH_SAFE(r, &tmpq, entries, r1) {
			TAILQ_REMOVE(&tmpq, r, entries);
			pf_free_rule(r);
		}
		PF_RULES_WUNLOCK();
	}
}

void
pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af)
{
	switch (af) {
#ifdef INET
	case AF_INET: {
		u_int32_t a = ntohl(addr->addr32[0]);
		printf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255,
		    (a>>8)&255, a&255);
		if (p) {
			p = ntohs(p);
			printf(":%u", p);
		}
		break;
	}
#endif /* INET */
#ifdef INET6
	case AF_INET6: {
		u_int16_t b;
		u_int8_t i, curstart, curend, maxstart, maxend;
		curstart = curend = maxstart = maxend = 255;
		for (i = 0; i < 8; i++) {
			if (!addr->addr16[i]) {
				if (curstart == 255)
					curstart = i;
				curend = i;
			} else {
				if ((curend - curstart) >
				    (maxend - maxstart)) {
					maxstart = curstart;
					maxend = curend;
				}
				curstart = curend = 255;
			}
		}
		if ((curend - curstart) >
		    (maxend - maxstart)) {
			maxstart = curstart;
			maxend = curend;
		}
		for (i = 0; i < 8; i++) {
			if (i >= maxstart && i <= maxend) {
				if (i == 0)
					printf(":");
				if (i == maxend)
					printf(":");
			} else {
				b = ntohs(addr->addr16[i]);
				printf("%x", b);
				if (i < 7)
					printf(":");
			}
		}
		if (p) {
			p = ntohs(p);
			printf("[%u]", p);
		}
		break;
	}
#endif /* INET6 */
	}
}

void
pf_print_state(struct pf_state *s)
{
	pf_print_state_parts(s, NULL, NULL);
}

static void
pf_print_state_parts(struct pf_state *s,
    struct pf_state_key *skwp, struct pf_state_key *sksp)
{
	struct pf_state_key *skw, *sks;
	u_int8_t proto, dir;

	/* Do our best to fill these, but they're skipped if NULL */
	skw = skwp ? skwp : (s ? s->key[PF_SK_WIRE] : NULL);
	sks = sksp ? sksp : (s ? s->key[PF_SK_STACK] : NULL);
	proto = skw ? skw->proto : (sks ? sks->proto : 0);
	dir = s ? s->direction : 0;

	switch (proto) {
	case IPPROTO_IPV4:
		printf("IPv4");
		break;
	case IPPROTO_IPV6:
		printf("IPv6");
		break;
	case IPPROTO_TCP:
		printf("TCP");
		break;
	case IPPROTO_UDP:
		printf("UDP");
		break;
	case IPPROTO_ICMP:
		printf("ICMP");
		break;
	case IPPROTO_ICMPV6:
		printf("ICMPv6");
		break;
	default:
		printf("%u", proto);	/* skw may be NULL here */
		break;
	}
	switch (dir) {
	case PF_IN:
		printf(" in");
		break;
	case PF_OUT:
		printf(" out");
		break;
	}
	if (skw) {
		printf(" wire: ");
		pf_print_host(&skw->addr[0], skw->port[0], skw->af);
		printf(" ");
		pf_print_host(&skw->addr[1], skw->port[1], skw->af);
	}
	if (sks) {
		printf(" stack: ");
		if (sks != skw) {
			pf_print_host(&sks->addr[0], sks->port[0], sks->af);
			printf(" ");
			pf_print_host(&sks->addr[1], sks->port[1], sks->af);
		} else
			printf("-");
	}
	if (s) {
		if (proto == IPPROTO_TCP) {
			printf(" [lo=%u high=%u win=%u modulator=%u",
			    s->src.seqlo, s->src.seqhi,
			    s->src.max_win, s->src.seqdiff);
			if (s->src.wscale && s->dst.wscale)
				printf(" wscale=%u",
				    s->src.wscale & PF_WSCALE_MASK);
			printf("]");
			printf(" [lo=%u high=%u win=%u modulator=%u",
			    s->dst.seqlo, s->dst.seqhi,
			    s->dst.max_win, s->dst.seqdiff);
			if (s->src.wscale && s->dst.wscale)
				printf(" wscale=%u",
				s->dst.wscale & PF_WSCALE_MASK);
			printf("]");
		}
		printf(" %u:%u", s->src.state, s->dst.state);
	}
}

void
pf_print_flags(u_int8_t f)
{
	if (f)
		printf(" ");
	if (f & TH_FIN)
		printf("F");
	if (f & TH_SYN)
		printf("S");
	if (f & TH_RST)
		printf("R");
	if (f & TH_PUSH)
		printf("P");
	if (f & TH_ACK)
		printf("A");
	if (f & TH_URG)
		printf("U");
	if (f & TH_ECE)
		printf("E");
	if (f & TH_CWR)
		printf("W");
}

#define	PF_SET_SKIP_STEPS(i)					\
	do {							\
		while (head[i] != cur) {			\
			head[i]->skip[i].ptr = cur;		\
			head[i] = TAILQ_NEXT(head[i], entries);	\
		}						\
	} while (0)

void
pf_calc_skip_steps(struct pf_rulequeue *rules)
{
	struct pf_rule *cur, *prev, *head[PF_SKIP_COUNT];
	int i;

	cur = TAILQ_FIRST(rules);
	prev = cur;
	for (i = 0; i < PF_SKIP_COUNT; ++i)
		head[i] = cur;
	while (cur != NULL) {

		if (cur->kif != prev->kif || cur->ifnot != prev->ifnot)
			PF_SET_SKIP_STEPS(PF_SKIP_IFP);
		if (cur->direction != prev->direction)
			PF_SET_SKIP_STEPS(PF_SKIP_DIR);
		if (cur->af != prev->af)
			PF_SET_SKIP_STEPS(PF_SKIP_AF);
		if (cur->proto != prev->proto)
			PF_SET_SKIP_STEPS(PF_SKIP_PROTO);
		if (cur->src.neg != prev->src.neg ||
		    pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr))
			PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR);
		if (cur->src.port[0] != prev->src.port[0] ||
		    cur->src.port[1] != prev->src.port[1] ||
		    cur->src.port_op != prev->src.port_op)
			PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT);
		if (cur->dst.neg != prev->dst.neg ||
		    pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr))
			PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR);
		if (cur->dst.port[0] != prev->dst.port[0] ||
		    cur->dst.port[1] != prev->dst.port[1] ||
		    cur->dst.port_op != prev->dst.port_op)
			PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT);

		prev = cur;
		cur = TAILQ_NEXT(cur, entries);
	}
	for (i = 0; i < PF_SKIP_COUNT; ++i)
		PF_SET_SKIP_STEPS(i);
}
1944
1945static int
1946pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2)
1947{
1948	if (aw1->type != aw2->type)
1949		return (1);
1950	switch (aw1->type) {
1951	case PF_ADDR_ADDRMASK:
1952	case PF_ADDR_RANGE:
1953		if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, 0))
1954			return (1);
1955		if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, 0))
1956			return (1);
1957		return (0);
1958	case PF_ADDR_DYNIFTL:
1959		return (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt);
1960	case PF_ADDR_NOROUTE:
1961	case PF_ADDR_URPFFAILED:
1962		return (0);
1963	case PF_ADDR_TABLE:
1964		return (aw1->p.tbl != aw2->p.tbl);
1965	default:
1966		printf("invalid address type: %d\n", aw1->type);
1967		return (1);
1968	}
1969}
1970
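/*
 * Incrementally update a 16-bit Internet checksum (cf. RFC 1624) when a
 * 16-bit word of the packet changes from 'old' to 'new'; the 32-bit
 * intermediate absorbs the borrow, which is then folded back into 16
 * bits.  For example, cksum 0x1234 with a word changing 0x5678 -> 0x9abc
 * yields 0xcdef.  UDP is special-cased because a checksum of zero means
 * "no checksum" (RFC 768): zero input is preserved, and a computed zero
 * is replaced by 0xffff on output.
 */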
1971u_int16_t
1972pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp)
1973{
1974	u_int32_t	l;
1975
1976	if (udp && !cksum)
1977		return (0x0000);
1978	l = cksum + old - new;
1979	l = (l >> 16) + (l & 65535);
1980	l = l & 65535;
1981	if (udp && !l)
1982		return (0xFFFF);
1983	return (l);
1984}
1985
1986static void
1987pf_change_ap(struct pf_addr *a, u_int16_t *p, u_int16_t *ic, u_int16_t *pc,
1988    struct pf_addr *an, u_int16_t pn, u_int8_t u, sa_family_t af)
1989{
1990	struct pf_addr	ao;
1991	u_int16_t	po = *p;
1992
1993	PF_ACPY(&ao, a, af);
1994	PF_ACPY(a, an, af);
1995
1996	*p = pn;
1997
1998	switch (af) {
1999#ifdef INET
2000	case AF_INET:
2001		*ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
2002		    ao.addr16[0], an->addr16[0], 0),
2003		    ao.addr16[1], an->addr16[1], 0);
2005		*pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc,
2006		    ao.addr16[0], an->addr16[0], u),
2007		    ao.addr16[1], an->addr16[1], u),
2008		    po, pn, u);
2009		break;
2010#endif /* INET */
2011#ifdef INET6
2012	case AF_INET6:
2013		*pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2014		    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2015		    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc,
2016		    ao.addr16[0], an->addr16[0], u),
2017		    ao.addr16[1], an->addr16[1], u),
2018		    ao.addr16[2], an->addr16[2], u),
2019		    ao.addr16[3], an->addr16[3], u),
2020		    ao.addr16[4], an->addr16[4], u),
2021		    ao.addr16[5], an->addr16[5], u),
2022		    ao.addr16[6], an->addr16[6], u),
2023		    ao.addr16[7], an->addr16[7], u),
2024		    po, pn, u);
2025		break;
2026#endif /* INET6 */
2027	}
2028}
2029
2030
2031/* Changes a u_int32_t.  Uses a void * so there are no alignment restrictions. */
2032void
2033pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u)
2034{
2035	u_int32_t	ao;
2036
2037	memcpy(&ao, a, sizeof(ao));
2038	memcpy(a, &an, sizeof(u_int32_t));
2039	*c = pf_cksum_fixup(pf_cksum_fixup(*c, ao / 65536, an / 65536, u),
2040	    ao % 65536, an % 65536, u);
2041}
2042
2043#ifdef INET6
2044static void
2045pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u)
2046{
2047	struct pf_addr	ao;
2048
2049	PF_ACPY(&ao, a, AF_INET6);
2050	PF_ACPY(a, an, AF_INET6);
2051
2052	*c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2053	    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2054	    pf_cksum_fixup(pf_cksum_fixup(*c,
2055	    ao.addr16[0], an->addr16[0], u),
2056	    ao.addr16[1], an->addr16[1], u),
2057	    ao.addr16[2], an->addr16[2], u),
2058	    ao.addr16[3], an->addr16[3], u),
2059	    ao.addr16[4], an->addr16[4], u),
2060	    ao.addr16[5], an->addr16[5], u),
2061	    ao.addr16[6], an->addr16[6], u),
2062	    ao.addr16[7], an->addr16[7], u);
2063}
2064#endif /* INET6 */
2065
2066static void
2067pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa,
2068    struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c,
2069    u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af)
2070{
2071	struct pf_addr	oia, ooa;
2072
2073	PF_ACPY(&oia, ia, af);
2074	if (oa)
2075		PF_ACPY(&ooa, oa, af);
2076
2077	/* Change inner protocol port, fix inner protocol checksum. */
2078	if (ip != NULL) {
2079		u_int16_t	oip = *ip;
2080		u_int32_t	opc;
2081
2082		if (pc != NULL)
2083			opc = *pc;
2084		*ip = np;
2085		if (pc != NULL)
2086			*pc = pf_cksum_fixup(*pc, oip, *ip, u);
2087		*ic = pf_cksum_fixup(*ic, oip, *ip, 0);
2088		if (pc != NULL)
2089			*ic = pf_cksum_fixup(*ic, opc, *pc, 0);
2090	}
2091	/* Change inner ip address, fix inner ip and icmp checksums. */
2092	PF_ACPY(ia, na, af);
2093	switch (af) {
2094#ifdef INET
2095	case AF_INET: {
2096		u_int32_t	 oh2c = *h2c;
2097
2098		*h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c,
2099		    oia.addr16[0], ia->addr16[0], 0),
2100		    oia.addr16[1], ia->addr16[1], 0);
2101		*ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
2102		    oia.addr16[0], ia->addr16[0], 0),
2103		    oia.addr16[1], ia->addr16[1], 0);
2104		*ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0);
2105		break;
2106	}
2107#endif /* INET */
2108#ifdef INET6
2109	case AF_INET6:
2110		*ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2111		    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2112		    pf_cksum_fixup(pf_cksum_fixup(*ic,
2113		    oia.addr16[0], ia->addr16[0], u),
2114		    oia.addr16[1], ia->addr16[1], u),
2115		    oia.addr16[2], ia->addr16[2], u),
2116		    oia.addr16[3], ia->addr16[3], u),
2117		    oia.addr16[4], ia->addr16[4], u),
2118		    oia.addr16[5], ia->addr16[5], u),
2119		    oia.addr16[6], ia->addr16[6], u),
2120		    oia.addr16[7], ia->addr16[7], u);
2121		break;
2122#endif /* INET6 */
2123	}
2124	/* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. */
2125	if (oa) {
2126		PF_ACPY(oa, na, af);
2127		switch (af) {
2128#ifdef INET
2129		case AF_INET:
2130			*hc = pf_cksum_fixup(pf_cksum_fixup(*hc,
2131			    ooa.addr16[0], oa->addr16[0], 0),
2132			    ooa.addr16[1], oa->addr16[1], 0);
2133			break;
2134#endif /* INET */
2135#ifdef INET6
2136		case AF_INET6:
2137			*ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2138			    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2139			    pf_cksum_fixup(pf_cksum_fixup(*ic,
2140			    ooa.addr16[0], oa->addr16[0], u),
2141			    ooa.addr16[1], oa->addr16[1], u),
2142			    ooa.addr16[2], oa->addr16[2], u),
2143			    ooa.addr16[3], oa->addr16[3], u),
2144			    ooa.addr16[4], oa->addr16[4], u),
2145			    ooa.addr16[5], oa->addr16[5], u),
2146			    ooa.addr16[6], oa->addr16[6], u),
2147			    ooa.addr16[7], oa->addr16[7], u);
2148			break;
2149#endif /* INET6 */
2150		}
2151	}
2152}
2153
2154
2155/*
2156 * Need to modulate the sequence numbers in the TCP SACK option
2157 * (credits to Krzysztof Pfaff for report and patch)
2158 */
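/*
 * SACK blocks reference the peer's sequence space, just like th_ack, so
 * they have to be shifted by the same dst->seqdiff that is applied to
 * th_ack, or the receiver would see SACK ranges it never sent.
 */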
2159static int
2160pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd,
2161    struct tcphdr *th, struct pf_state_peer *dst)
2162{
2163	int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen;
2164	u_int8_t opts[TCP_MAXOLEN], *opt = opts;
2165	int copyback = 0, i, olen;
2166	struct sackblk sack;
2167
2168#define	TCPOLEN_SACKLEN	(TCPOLEN_SACK + 2)
2169	if (hlen < TCPOLEN_SACKLEN ||
2170	    !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af))
2171		return (0);
2172
2173	while (hlen >= TCPOLEN_SACKLEN) {
2174		olen = opt[1];
2175		switch (*opt) {
2176		case TCPOPT_EOL:	/* FALLTHROUGH */
2177		case TCPOPT_NOP:
2178			opt++;
2179			hlen--;
2180			break;
2181		case TCPOPT_SACK:
2182			if (olen > hlen)
2183				olen = hlen;
2184			if (olen >= TCPOLEN_SACKLEN) {
2185				for (i = 2; i + TCPOLEN_SACK <= olen;
2186				    i += TCPOLEN_SACK) {
2187					memcpy(&sack, &opt[i], sizeof(sack));
2188					pf_change_a(&sack.start, &th->th_sum,
2189					    htonl(ntohl(sack.start) -
2190					    dst->seqdiff), 0);
2191					pf_change_a(&sack.end, &th->th_sum,
2192					    htonl(ntohl(sack.end) -
2193					    dst->seqdiff), 0);
2194					memcpy(&opt[i], &sack, sizeof(sack));
2195				}
2196				copyback = 1;
2197			}
2198			/* FALLTHROUGH */
2199		default:
2200			if (olen < 2)
2201				olen = 2;
2202			hlen -= olen;
2203			opt += olen;
2204		}
2205	}
2206
2207	if (copyback)
2208		m_copyback(m, off + sizeof(*th), thoptlen, (caddr_t)opts);
2209	return (copyback);
2210}
2211
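/*
 * Build a self-contained TCP segment from scratch (optionally carrying
 * an MSS option) and queue it via pf_send() instead of transmitting
 * inline; used for RST replies to blocked connections and for the
 * segments the synproxy code originates.
 */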
2212static void
2213pf_send_tcp(struct mbuf *replyto, const struct pf_rule *r, sa_family_t af,
2214    const struct pf_addr *saddr, const struct pf_addr *daddr,
2215    u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack,
2216    u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag,
2217    u_int16_t rtag, struct ifnet *ifp)
2218{
2219	struct pf_send_entry *pfse;
2220	struct mbuf	*m;
2221	int		 len, tlen;
2222#ifdef INET
2223	struct ip	*h = NULL;
2224#endif /* INET */
2225#ifdef INET6
2226	struct ip6_hdr	*h6 = NULL;
2227#endif /* INET6 */
2228	struct tcphdr	*th;
2229	char		*opt;
2230	struct pf_mtag  *pf_mtag;
2231
2232	len = 0;
2233	th = NULL;
2234
2235	/* maximum segment size tcp option */
2236	tlen = sizeof(struct tcphdr);
2237	if (mss)
2238		tlen += 4;
2239
2240	switch (af) {
2241#ifdef INET
2242	case AF_INET:
2243		len = sizeof(struct ip) + tlen;
2244		break;
2245#endif /* INET */
2246#ifdef INET6
2247	case AF_INET6:
2248		len = sizeof(struct ip6_hdr) + tlen;
2249		break;
2250#endif /* INET6 */
2251	default:
2252		panic("%s: unsupported af %d", __func__, af);
2253	}
2254
2255	/* Allocate outgoing queue entry, mbuf and mbuf tag. */
2256	pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
2257	if (pfse == NULL)
2258		return;
2259	m = m_gethdr(M_NOWAIT, MT_DATA);
2260	if (m == NULL) {
2261		free(pfse, M_PFTEMP);
2262		return;
2263	}
2264#ifdef MAC
2265	mac_netinet_firewall_send(m);
2266#endif
2267	if ((pf_mtag = pf_get_mtag(m)) == NULL) {
2268		free(pfse, M_PFTEMP);
2269		m_freem(m);
2270		return;
2271	}
2272	if (tag)
2273		m->m_flags |= M_SKIP_FIREWALL;
2274	pf_mtag->tag = rtag;
2275
2276	if (r != NULL && r->rtableid >= 0)
2277		M_SETFIB(m, r->rtableid);
2278
2279#ifdef ALTQ
2280	if (r != NULL && r->qid) {
2281		pf_mtag->qid = r->qid;
2282
2283		/* add hints for ecn */
2284		pf_mtag->hdr = mtod(m, struct ip *);
2285	}
2286#endif /* ALTQ */
2287	m->m_data += max_linkhdr;
2288	m->m_pkthdr.len = m->m_len = len;
2289	m->m_pkthdr.rcvif = NULL;
2290	bzero(m->m_data, len);
2291	switch (af) {
2292#ifdef INET
2293	case AF_INET:
2294		h = mtod(m, struct ip *);
2295
2296		/* IP header fields included in the TCP checksum */
2297		h->ip_p = IPPROTO_TCP;
2298		h->ip_len = htons(tlen);
2299		h->ip_src.s_addr = saddr->v4.s_addr;
2300		h->ip_dst.s_addr = daddr->v4.s_addr;
2301
2302		th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip));
2303		break;
2304#endif /* INET */
2305#ifdef INET6
2306	case AF_INET6:
2307		h6 = mtod(m, struct ip6_hdr *);
2308
2309		/* IP header fields included in the TCP checksum */
2310		h6->ip6_nxt = IPPROTO_TCP;
2311		h6->ip6_plen = htons(tlen);
2312		memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr));
2313		memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr));
2314
2315		th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr));
2316		break;
2317#endif /* INET6 */
2318	}
2319
2320	/* TCP header */
2321	th->th_sport = sport;
2322	th->th_dport = dport;
2323	th->th_seq = htonl(seq);
2324	th->th_ack = htonl(ack);
2325	th->th_off = tlen >> 2;
2326	th->th_flags = flags;
2327	th->th_win = htons(win);
2328
2329	if (mss) {
2330		opt = (char *)(th + 1);
2331		opt[0] = TCPOPT_MAXSEG;
2332		opt[1] = 4;
2333		HTONS(mss);
2334		bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2);
2335	}
2336
2337	switch (af) {
2338#ifdef INET
2339	case AF_INET:
2340		/* TCP checksum */
2341		th->th_sum = in_cksum(m, len);
2342
2343		/* Finish the IP header */
2344		h->ip_v = 4;
2345		h->ip_hl = sizeof(*h) >> 2;
2346		h->ip_tos = IPTOS_LOWDELAY;
2347		h->ip_off = htons(V_path_mtu_discovery ? IP_DF : 0);
2348		h->ip_len = htons(len);
2349		h->ip_ttl = ttl ? ttl : V_ip_defttl;
2350		h->ip_sum = 0;
2351
2352		pfse->pfse_type = PFSE_IP;
2353		break;
2354#endif /* INET */
2355#ifdef INET6
2356	case AF_INET6:
2357		/* TCP checksum */
2358		th->th_sum = in6_cksum(m, IPPROTO_TCP,
2359		    sizeof(struct ip6_hdr), tlen);
2360
2361		h6->ip6_vfc |= IPV6_VERSION;
2362		h6->ip6_hlim = IPV6_DEFHLIM;
2363
2364		pfse->pfse_type = PFSE_IP6;
2365		break;
2366#endif /* INET6 */
2367	}
2368	pfse->pfse_m = m;
2369	pf_send(pfse);
2370}
2371
2372static void
2373pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af,
2374    struct pf_rule *r)
2375{
2376	struct pf_send_entry *pfse;
2377	struct mbuf *m0;
2378	struct pf_mtag *pf_mtag;
2379
2380	/* Allocate outgoing queue entry, mbuf and mbuf tag. */
2381	pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
2382	if (pfse == NULL)
2383		return;
2384
2385	if ((m0 = m_copypacket(m, M_NOWAIT)) == NULL) {
2386		free(pfse, M_PFTEMP);
2387		return;
2388	}
2389
2390	if ((pf_mtag = pf_get_mtag(m0)) == NULL) {
2391		free(pfse, M_PFTEMP);
2392		return;
2393	}
2394	/* XXX: revisit */
2395	m0->m_flags |= M_SKIP_FIREWALL;
2396
2397	if (r->rtableid >= 0)
2398		M_SETFIB(m0, r->rtableid);
2399
2400#ifdef ALTQ
2401	if (r->qid) {
2402		pf_mtag->qid = r->qid;
2403		/* add hints for ecn */
2404		pf_mtag->hdr = mtod(m0, struct ip *);
2405	}
2406#endif /* ALTQ */
2407
2408	switch (af) {
2409#ifdef INET
2410	case AF_INET:
2411		pfse->pfse_type = PFSE_ICMP;
2412		break;
2413#endif /* INET */
2414#ifdef INET6
2415	case AF_INET6:
2416		pfse->pfse_type = PFSE_ICMP6;
2417		break;
2418#endif /* INET6 */
2419	}
2420	pfse->pfse_m = m0;
2421	pfse->pfse_icmp_type = type;
2422	pfse->pfse_icmp_code = code;
2423	pf_send(pfse);
2424}
2425
2426/*
2427 * Return 1 if the addresses a and b match (with mask m), otherwise return 0.
2428 * If n is zero, they match if they are equal.  If n is nonzero, the sense
2429 * is inverted: they match if they are different.
2430 */
2431int
2432pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m,
2433    struct pf_addr *b, sa_family_t af)
2434{
2435	int	match = 0;
2436
2437	switch (af) {
2438#ifdef INET
2439	case AF_INET:
2440		if ((a->addr32[0] & m->addr32[0]) ==
2441		    (b->addr32[0] & m->addr32[0]))
2442			match++;
2443		break;
2444#endif /* INET */
2445#ifdef INET6
2446	case AF_INET6:
2447		if (((a->addr32[0] & m->addr32[0]) ==
2448		     (b->addr32[0] & m->addr32[0])) &&
2449		    ((a->addr32[1] & m->addr32[1]) ==
2450		     (b->addr32[1] & m->addr32[1])) &&
2451		    ((a->addr32[2] & m->addr32[2]) ==
2452		     (b->addr32[2] & m->addr32[2])) &&
2453		    ((a->addr32[3] & m->addr32[3]) ==
2454		     (b->addr32[3] & m->addr32[3])))
2455			match++;
2456		break;
2457#endif /* INET6 */
2458	}
2459	if (match) {
2460		if (n)
2461			return (0);
2462		else
2463			return (1);
2464	} else {
2465		if (n)
2466			return (1);
2467		else
2468			return (0);
2469	}
2470}
2471
2472/*
2473 * Return 1 if b <= a <= e, otherwise return 0.
2474 */
2475int
2476pf_match_addr_range(struct pf_addr *b, struct pf_addr *e,
2477    struct pf_addr *a, sa_family_t af)
2478{
2479	switch (af) {
2480#ifdef INET
2481	case AF_INET:
2482		if ((a->addr32[0] < b->addr32[0]) ||
2483		    (a->addr32[0] > e->addr32[0]))
2484			return (0);
2485		break;
2486#endif /* INET */
2487#ifdef INET6
2488	case AF_INET6: {
2489		int	i;
2490
2491		/* check a >= b */
2492		for (i = 0; i < 4; ++i)
2493			if (a->addr32[i] > b->addr32[i])
2494				break;
2495			else if (a->addr32[i] < b->addr32[i])
2496				return (0);
2497		/* check a <= e */
2498		for (i = 0; i < 4; ++i)
2499			if (a->addr32[i] < e->addr32[i])
2500				break;
2501			else if (a->addr32[i] > e->addr32[i])
2502				return (0);
2503		break;
2504	}
2505#endif /* INET6 */
2506	}
2507	return (1);
2508}
2509
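/*
 * Evaluate one of the pf.conf comparison operators: PF_OP_RRG is the
 * inclusive range "a1:a2", PF_OP_IRG the exclusive "a1 >< a2", PF_OP_XRG
 * the outside-range "a1 <> a2", and the rest map to =, !=, <, <=, >, >=.
 */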
2510static int
2511pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p)
2512{
2513	switch (op) {
2514	case PF_OP_IRG:
2515		return ((p > a1) && (p < a2));
2516	case PF_OP_XRG:
2517		return ((p < a1) || (p > a2));
2518	case PF_OP_RRG:
2519		return ((p >= a1) && (p <= a2));
2520	case PF_OP_EQ:
2521		return (p == a1);
2522	case PF_OP_NE:
2523		return (p != a1);
2524	case PF_OP_LT:
2525		return (p < a1);
2526	case PF_OP_LE:
2527		return (p <= a1);
2528	case PF_OP_GT:
2529		return (p > a1);
2530	case PF_OP_GE:
2531		return (p >= a1);
2532	}
2533	return (0); /* never reached */
2534}
2535
2536int
2537pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p)
2538{
2539	NTOHS(a1);
2540	NTOHS(a2);
2541	NTOHS(p);
2542	return (pf_match(op, a1, a2, p));
2543}
2544
2545static int
2546pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u)
2547{
2548	if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
2549		return (0);
2550	return (pf_match(op, a1, a2, u));
2551}
2552
2553static int
2554pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g)
2555{
2556	if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
2557		return (0);
2558	return (pf_match(op, a1, a2, g));
2559}
2560
2561int
2562pf_match_tag(struct mbuf *m, struct pf_rule *r, int *tag, int mtag)
2563{
2564	if (*tag == -1)
2565		*tag = mtag;
2566
2567	return ((!r->match_tag_not && r->match_tag == *tag) ||
2568	    (r->match_tag_not && r->match_tag != *tag));
2569}
2570
2571int
2572pf_tag_packet(struct mbuf *m, struct pf_pdesc *pd, int tag)
2573{
2574
2575	KASSERT(tag > 0, ("%s: tag %d", __func__, tag));
2576
2577	if (pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(m)) == NULL))
2578		return (ENOMEM);
2579
2580	pd->pf_mtag->tag = tag;
2581
2582	return (0);
2583}
2584
2585#define	PF_ANCHOR_STACKSIZE	32
2586struct pf_anchor_stackframe {
2587	struct pf_ruleset	*rs;
2588	struct pf_rule		*r;	/* XXX: + match bit */
2589	struct pf_anchor	*child;
2590};
2591
2592/*
2593 * XXX: We rely on malloc(9) returning pointer aligned addresses.
2594 */
2595#define	PF_ANCHORSTACK_MATCH	0x00000001
2596#define	PF_ANCHORSTACK_MASK	(PF_ANCHORSTACK_MATCH)
2597
2598#define	PF_ANCHOR_MATCH(f)	((uintptr_t)(f)->r & PF_ANCHORSTACK_MATCH)
2599#define	PF_ANCHOR_RULE(f)	(struct pf_rule *)			\
2600				((uintptr_t)(f)->r & ~PF_ANCHORSTACK_MASK)
2601#define	PF_ANCHOR_SET_MATCH(f)	do { (f)->r = (void *) 			\
2602				((uintptr_t)(f)->r | PF_ANCHORSTACK_MATCH);  \
2603} while (0)
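/*
 * Rule pointers stored in a stack frame are at least word-aligned, so
 * bit 0 of f->r is free to record that something inside the anchor
 * matched; PF_ANCHOR_RULE() masks the bit off before the pointer is
 * dereferenced.
 */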
2604
2605void
2606pf_step_into_anchor(struct pf_anchor_stackframe *stack, int *depth,
2607    struct pf_ruleset **rs, int n, struct pf_rule **r, struct pf_rule **a,
2608    int *match)
2609{
2610	struct pf_anchor_stackframe	*f;
2611
2612	PF_RULES_RASSERT();
2613
2614	if (match)
2615		*match = 0;
2616	if (*depth >= PF_ANCHOR_STACKSIZE) {
2617		printf("%s: anchor stack overflow on %s\n",
2618		    __func__, (*r)->anchor->name);
2619		*r = TAILQ_NEXT(*r, entries);
2620		return;
2621	} else if (*depth == 0 && a != NULL)
2622		*a = *r;
2623	f = stack + (*depth)++;
2624	f->rs = *rs;
2625	f->r = *r;
2626	if ((*r)->anchor_wildcard) {
2627		struct pf_anchor_node *parent = &(*r)->anchor->children;
2628
2629		if ((f->child = RB_MIN(pf_anchor_node, parent)) == NULL) {
2630			*r = NULL;
2631			return;
2632		}
2633		*rs = &f->child->ruleset;
2634	} else {
2635		f->child = NULL;
2636		*rs = &(*r)->anchor->ruleset;
2637	}
2638	*r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
2639}
2640
2641int
2642pf_step_out_of_anchor(struct pf_anchor_stackframe *stack, int *depth,
2643    struct pf_ruleset **rs, int n, struct pf_rule **r, struct pf_rule **a,
2644    int *match)
2645{
2646	struct pf_anchor_stackframe	*f;
2647	struct pf_rule *fr;
2648	int quick = 0;
2649
2650	PF_RULES_RASSERT();
2651
2652	do {
2653		if (*depth <= 0)
2654			break;
2655		f = stack + *depth - 1;
2656		fr = PF_ANCHOR_RULE(f);
2657		if (f->child != NULL) {
2658			struct pf_anchor_node *parent;
2659
2660			/*
2661			 * This block traverses the children
2662			 * of a wildcard anchor.
2663			 */
2664			parent = &fr->anchor->children;
2665			if (match != NULL && *match) {
2666				/*
2667				 * If any of "*" matched, then
2668				 * "foo/ *" matched, mark frame
2669				 * appropriately.
2670				 */
2671				PF_ANCHOR_SET_MATCH(f);
2672				*match = 0;
2673			}
2674			f->child = RB_NEXT(pf_anchor_node, parent, f->child);
2675			if (f->child != NULL) {
2676				*rs = &f->child->ruleset;
2677				*r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
2678				if (*r == NULL)
2679					continue;
2680				else
2681					break;
2682			}
2683		}
2684		(*depth)--;
2685		if (*depth == 0 && a != NULL)
2686			*a = NULL;
2687		*rs = f->rs;
2688		if (PF_ANCHOR_MATCH(f) || (match != NULL && *match))
2689			quick = fr->quick;
2690		*r = TAILQ_NEXT(fr, entries);
2691	} while (*r == NULL);
2692
2693	return (quick);
2694}
2695
2696#ifdef INET6
2697void
2698pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr,
2699    struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af)
2700{
2701	switch (af) {
2702#ifdef INET
2703	case AF_INET:
2704		naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
2705		((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
2706		break;
2707#endif /* INET */
2708	case AF_INET6:
2709		naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
2710		((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
2711		naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) |
2712		((rmask->addr32[1] ^ 0xffffffff ) & saddr->addr32[1]);
2713		naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) |
2714		((rmask->addr32[2] ^ 0xffffffff ) & saddr->addr32[2]);
2715		naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) |
2716		((rmask->addr32[3] ^ 0xffffffff ) & saddr->addr32[3]);
2717		break;
2718	}
2719}
2720
2721void
2722pf_addr_inc(struct pf_addr *addr, sa_family_t af)
2723{
2724	switch (af) {
2725#ifdef INET
2726	case AF_INET:
2727		addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1);
2728		break;
2729#endif /* INET */
2730	case AF_INET6:
2731		if (addr->addr32[3] == 0xffffffff) {
2732			addr->addr32[3] = 0;
2733			if (addr->addr32[2] == 0xffffffff) {
2734				addr->addr32[2] = 0;
2735				if (addr->addr32[1] == 0xffffffff) {
2736					addr->addr32[1] = 0;
2737					addr->addr32[0] =
2738					    htonl(ntohl(addr->addr32[0]) + 1);
2739				} else
2740					addr->addr32[1] =
2741					    htonl(ntohl(addr->addr32[1]) + 1);
2742			} else
2743				addr->addr32[2] =
2744				    htonl(ntohl(addr->addr32[2]) + 1);
2745		} else
2746			addr->addr32[3] =
2747			    htonl(ntohl(addr->addr32[3]) + 1);
2748		break;
2749	}
2750}
2751#endif /* INET6 */
2752
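/*
 * Find the PCB that owns this TCP/UDP endpoint and record the socket's
 * uid/gid in pd->lookup for "user"/"group" rule matching.  The address
 * pair is swapped for outbound packets so the local end is always passed
 * as such.  Returns 1 on success, -1 if no PCB exists or the protocol
 * carries no ports.
 */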
2753int
2754pf_socket_lookup(int direction, struct pf_pdesc *pd, struct mbuf *m)
2755{
2756	struct pf_addr		*saddr, *daddr;
2757	u_int16_t		 sport, dport;
2758	struct inpcbinfo	*pi;
2759	struct inpcb		*inp;
2760
2761	pd->lookup.uid = UID_MAX;
2762	pd->lookup.gid = GID_MAX;
2763
2764	switch (pd->proto) {
2765	case IPPROTO_TCP:
2766		if (pd->hdr.tcp == NULL)
2767			return (-1);
2768		sport = pd->hdr.tcp->th_sport;
2769		dport = pd->hdr.tcp->th_dport;
2770		pi = &V_tcbinfo;
2771		break;
2772	case IPPROTO_UDP:
2773		if (pd->hdr.udp == NULL)
2774			return (-1);
2775		sport = pd->hdr.udp->uh_sport;
2776		dport = pd->hdr.udp->uh_dport;
2777		pi = &V_udbinfo;
2778		break;
2779	default:
2780		return (-1);
2781	}
2782	if (direction == PF_IN) {
2783		saddr = pd->src;
2784		daddr = pd->dst;
2785	} else {
2786		u_int16_t	p;
2787
2788		p = sport;
2789		sport = dport;
2790		dport = p;
2791		saddr = pd->dst;
2792		daddr = pd->src;
2793	}
2794	switch (pd->af) {
2795#ifdef INET
2796	case AF_INET:
2797		inp = in_pcblookup_mbuf(pi, saddr->v4, sport, daddr->v4,
2798		    dport, INPLOOKUP_RLOCKPCB, NULL, m);
2799		if (inp == NULL) {
2800			inp = in_pcblookup_mbuf(pi, saddr->v4, sport,
2801			   daddr->v4, dport, INPLOOKUP_WILDCARD |
2802			   INPLOOKUP_RLOCKPCB, NULL, m);
2803			if (inp == NULL)
2804				return (-1);
2805		}
2806		break;
2807#endif /* INET */
2808#ifdef INET6
2809	case AF_INET6:
2810		inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, &daddr->v6,
2811		    dport, INPLOOKUP_RLOCKPCB, NULL, m);
2812		if (inp == NULL) {
2813			inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport,
2814			    &daddr->v6, dport, INPLOOKUP_WILDCARD |
2815			    INPLOOKUP_RLOCKPCB, NULL, m);
2816			if (inp == NULL)
2817				return (-1);
2818		}
2819		break;
2820#endif /* INET6 */
2821
2822	default:
2823		return (-1);
2824	}
2825	INP_RLOCK_ASSERT(inp);
2826	pd->lookup.uid = inp->inp_cred->cr_uid;
2827	pd->lookup.gid = inp->inp_cred->cr_groups[0];
2828	INP_RUNLOCK(inp);
2829
2830	return (1);
2831}
2832
2833static u_int8_t
2834pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
2835{
2836	int		 hlen;
2837	u_int8_t	 hdr[60];
2838	u_int8_t	*opt, optlen;
2839	u_int8_t	 wscale = 0;
2840
2841	hlen = th_off << 2;		/* hlen <= sizeof(hdr) */
2842	if (hlen <= sizeof(struct tcphdr))
2843		return (0);
2844	if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
2845		return (0);
2846	opt = hdr + sizeof(struct tcphdr);
2847	hlen -= sizeof(struct tcphdr);
2848	while (hlen >= 3) {
2849		switch (*opt) {
2850		case TCPOPT_EOL:
2851		case TCPOPT_NOP:
2852			++opt;
2853			--hlen;
2854			break;
2855		case TCPOPT_WINDOW:
2856			wscale = opt[2];
2857			if (wscale > TCP_MAX_WINSHIFT)
2858				wscale = TCP_MAX_WINSHIFT;
2859			wscale |= PF_WSCALE_FLAG;
2860			/* FALLTHROUGH */
2861		default:
2862			optlen = opt[1];
2863			if (optlen < 2)
2864				optlen = 2;
2865			hlen -= optlen;
2866			opt += optlen;
2867			break;
2868		}
2869	}
2870	return (wscale);
2871}
2872
2873static u_int16_t
2874pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
2875{
2876	int		 hlen;
2877	u_int8_t	 hdr[60];
2878	u_int8_t	*opt, optlen;
2879	u_int16_t	 mss = V_tcp_mssdflt;
2880
2881	hlen = th_off << 2;	/* hlen <= sizeof(hdr) */
2882	if (hlen <= sizeof(struct tcphdr))
2883		return (0);
2884	if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
2885		return (0);
2886	opt = hdr + sizeof(struct tcphdr);
2887	hlen -= sizeof(struct tcphdr);
2888	while (hlen >= TCPOLEN_MAXSEG) {
2889		switch (*opt) {
2890		case TCPOPT_EOL:
2891		case TCPOPT_NOP:
2892			++opt;
2893			--hlen;
2894			break;
2895		case TCPOPT_MAXSEG:
2896			bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2);
2897			NTOHS(mss);
2898			/* FALLTHROUGH */
2899		default:
2900			optlen = opt[1];
2901			if (optlen < 2)
2902				optlen = 2;
2903			hlen -= optlen;
2904			opt += optlen;
2905			break;
2906		}
2907	}
2908	return (mss);
2909}
2910
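/*
 * Derive the MSS that synproxy announces: the MTU of the route towards
 * the destination minus IP and TCP header overhead, never below
 * tcp_mssdflt, clamped to the peer's offer and to a floor of 64.
 */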
2911static u_int16_t
2912pf_calc_mss(struct pf_addr *addr, sa_family_t af, int rtableid, u_int16_t offer)
2913{
2914#ifdef INET
2915	struct sockaddr_in	*dst;
2916	struct route		 ro;
2917#endif /* INET */
2918#ifdef INET6
2919	struct sockaddr_in6	*dst6;
2920	struct route_in6	 ro6;
2921#endif /* INET6 */
2922	struct rtentry		*rt = NULL;
2923	int			 hlen = 0;
2924	u_int16_t		 mss = V_tcp_mssdflt;
2925
2926	switch (af) {
2927#ifdef INET
2928	case AF_INET:
2929		hlen = sizeof(struct ip);
2930		bzero(&ro, sizeof(ro));
2931		dst = (struct sockaddr_in *)&ro.ro_dst;
2932		dst->sin_family = AF_INET;
2933		dst->sin_len = sizeof(*dst);
2934		dst->sin_addr = addr->v4;
2935		in_rtalloc_ign(&ro, 0, rtableid);
2936		rt = ro.ro_rt;
2937		break;
2938#endif /* INET */
2939#ifdef INET6
2940	case AF_INET6:
2941		hlen = sizeof(struct ip6_hdr);
2942		bzero(&ro6, sizeof(ro6));
2943		dst6 = (struct sockaddr_in6 *)&ro6.ro_dst;
2944		dst6->sin6_family = AF_INET6;
2945		dst6->sin6_len = sizeof(*dst6);
2946		dst6->sin6_addr = addr->v6;
2947		in6_rtalloc_ign(&ro6, 0, rtableid);
2948		rt = ro6.ro_rt;
2949		break;
2950#endif /* INET6 */
2951	}
2952
2953	if (rt && rt->rt_ifp) {
2954		mss = rt->rt_ifp->if_mtu - hlen - sizeof(struct tcphdr);
2955		mss = max(V_tcp_mssdflt, mss);
2956		RTFREE(rt);
2957	}
2958	mss = min(mss, offer);
2959	mss = max(mss, 64);		/* sanity - at least max opt space */
2960	return (mss);
2961}
2962
2963static void
2964pf_set_rt_ifp(struct pf_state *s, struct pf_addr *saddr)
2965{
2966	struct pf_rule *r = s->rule.ptr;
2967	struct pf_src_node *sn = NULL;
2968
2969	s->rt_kif = NULL;
2970	if (!r->rt || r->rt == PF_FASTROUTE)
2971		return;
2972	switch (s->key[PF_SK_WIRE]->af) {
2973#ifdef INET
2974	case AF_INET:
2975		pf_map_addr(AF_INET, r, saddr, &s->rt_addr, NULL, &sn);
2976		s->rt_kif = r->rpool.cur->kif;
2977		break;
2978#endif /* INET */
2979#ifdef INET6
2980	case AF_INET6:
2981		pf_map_addr(AF_INET6, r, saddr, &s->rt_addr, NULL, &sn);
2982		s->rt_kif = r->rpool.cur->kif;
2983		break;
2984#endif /* INET6 */
2985	}
2986}
2987
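/*
 * Generate an initial sequence number for synproxy and "modulate state",
 * in the style of RFC 1948: MD5 over a boot-time secret and the
 * connection 4-tuple, plus an offset bumped by 4096 on every call with a
 * random increment below that, so ISNs stay unpredictable across
 * connections yet increase for a given 4-tuple.
 */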
2988static u_int32_t
2989pf_tcp_iss(struct pf_pdesc *pd)
2990{
2991	MD5_CTX ctx;
2992	u_int32_t digest[4];
2993
2994	if (V_pf_tcp_secret_init == 0) {
2995		read_random(&V_pf_tcp_secret, sizeof(V_pf_tcp_secret));
2996		MD5Init(&V_pf_tcp_secret_ctx);
2997		MD5Update(&V_pf_tcp_secret_ctx, V_pf_tcp_secret,
2998		    sizeof(V_pf_tcp_secret));
2999		V_pf_tcp_secret_init = 1;
3000	}
3001
3002	ctx = V_pf_tcp_secret_ctx;
3003
3004	MD5Update(&ctx, (char *)&pd->hdr.tcp->th_sport, sizeof(u_short));
3005	MD5Update(&ctx, (char *)&pd->hdr.tcp->th_dport, sizeof(u_short));
3006	if (pd->af == AF_INET6) {
3007		MD5Update(&ctx, (char *)&pd->src->v6, sizeof(struct in6_addr));
3008		MD5Update(&ctx, (char *)&pd->dst->v6, sizeof(struct in6_addr));
3009	} else {
3010		MD5Update(&ctx, (char *)&pd->src->v4, sizeof(struct in_addr));
3011		MD5Update(&ctx, (char *)&pd->dst->v4, sizeof(struct in_addr));
3012	}
3013	MD5Final((u_char *)digest, &ctx);
3014	V_pf_tcp_iss_off += 4096;
3015#define	ISN_RANDOM_INCREMENT (4096 - 1)
3016	return (digest[0] + (arc4random() & ISN_RANDOM_INCREMENT) +
3017	    V_pf_tcp_iss_off);
3018#undef	ISN_RANDOM_INCREMENT
3019}
3020
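/*
 * Main rule matching for packets that may create state: apply the
 * BINAT/NAT/RDR translation first, walk the active filter ruleset using
 * the precomputed skip steps, honour a blocking rule's return-rst/
 * return-icmp policy, and hand matching stateful rules to
 * pf_create_state().
 */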
3021static int
3022pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction,
3023    struct pfi_kif *kif, struct mbuf *m, int off, struct pf_pdesc *pd,
3024    struct pf_rule **am, struct pf_ruleset **rsm, struct inpcb *inp)
3025{
3026	struct pf_rule		*nr = NULL;
3027	struct pf_addr		* const saddr = pd->src;
3028	struct pf_addr		* const daddr = pd->dst;
3029	sa_family_t		 af = pd->af;
3030	struct pf_rule		*r, *a = NULL;
3031	struct pf_ruleset	*ruleset = NULL;
3032	struct pf_src_node	*nsn = NULL;
3033	struct tcphdr		*th = pd->hdr.tcp;
3034	struct pf_state_key	*sk = NULL, *nk = NULL;
3035	u_short			 reason;
3036	int			 rewrite = 0, hdrlen = 0;
3037	int			 tag = -1, rtableid = -1;
3038	int			 asd = 0;
3039	int			 match = 0;
3040	int			 state_icmp = 0;
3041	u_int16_t		 sport = 0, dport = 0;
3042	u_int16_t		 bproto_sum = 0, bip_sum = 0;
3043	u_int8_t		 icmptype = 0, icmpcode = 0;
3044	struct pf_anchor_stackframe	anchor_stack[PF_ANCHOR_STACKSIZE];
3045
3046	PF_RULES_RASSERT();
3047
3048	if (inp != NULL) {
3049		INP_LOCK_ASSERT(inp);
3050		pd->lookup.uid = inp->inp_cred->cr_uid;
3051		pd->lookup.gid = inp->inp_cred->cr_groups[0];
3052		pd->lookup.done = 1;
3053	}
3054
3055	switch (pd->proto) {
3056	case IPPROTO_TCP:
3057		sport = th->th_sport;
3058		dport = th->th_dport;
3059		hdrlen = sizeof(*th);
3060		break;
3061	case IPPROTO_UDP:
3062		sport = pd->hdr.udp->uh_sport;
3063		dport = pd->hdr.udp->uh_dport;
3064		hdrlen = sizeof(*pd->hdr.udp);
3065		break;
3066#ifdef INET
3067	case IPPROTO_ICMP:
3068		if (pd->af != AF_INET)
3069			break;
3070		sport = dport = pd->hdr.icmp->icmp_id;
3071		hdrlen = sizeof(*pd->hdr.icmp);
3072		icmptype = pd->hdr.icmp->icmp_type;
3073		icmpcode = pd->hdr.icmp->icmp_code;
3074
3075		if (icmptype == ICMP_UNREACH ||
3076		    icmptype == ICMP_SOURCEQUENCH ||
3077		    icmptype == ICMP_REDIRECT ||
3078		    icmptype == ICMP_TIMXCEED ||
3079		    icmptype == ICMP_PARAMPROB)
3080			state_icmp++;
3081		break;
3082#endif /* INET */
3083#ifdef INET6
3084	case IPPROTO_ICMPV6:
3085		if (af != AF_INET6)
3086			break;
3087		sport = dport = pd->hdr.icmp6->icmp6_id;
3088		hdrlen = sizeof(*pd->hdr.icmp6);
3089		icmptype = pd->hdr.icmp6->icmp6_type;
3090		icmpcode = pd->hdr.icmp6->icmp6_code;
3091
3092		if (icmptype == ICMP6_DST_UNREACH ||
3093		    icmptype == ICMP6_PACKET_TOO_BIG ||
3094		    icmptype == ICMP6_TIME_EXCEEDED ||
3095		    icmptype == ICMP6_PARAM_PROB)
3096			state_icmp++;
3097		break;
3098#endif /* INET6 */
3099	default:
3100		sport = dport = hdrlen = 0;
3101		break;
3102	}
3103
3104	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
3105
3106	/* check packet for BINAT/NAT/RDR */
3107	if ((nr = pf_get_translation(pd, m, off, direction, kif, &nsn, &sk,
3108	    &nk, saddr, daddr, sport, dport, anchor_stack)) != NULL) {
3109		KASSERT(sk != NULL, ("%s: null sk", __func__));
3110		KASSERT(nk != NULL, ("%s: null nk", __func__));
3111
3112		if (pd->ip_sum)
3113			bip_sum = *pd->ip_sum;
3114
3115		switch (pd->proto) {
3116		case IPPROTO_TCP:
3117			bproto_sum = th->th_sum;
3118			pd->proto_sum = &th->th_sum;
3119
3120			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
3121			    nk->port[pd->sidx] != sport) {
3122				pf_change_ap(saddr, &th->th_sport, pd->ip_sum,
3123				    &th->th_sum, &nk->addr[pd->sidx],
3124				    nk->port[pd->sidx], 0, af);
3125				pd->sport = &th->th_sport;
3126				sport = th->th_sport;
3127			}
3128
3129			if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
3130			    nk->port[pd->didx] != dport) {
3131				pf_change_ap(daddr, &th->th_dport, pd->ip_sum,
3132				    &th->th_sum, &nk->addr[pd->didx],
3133				    nk->port[pd->didx], 0, af);
3134				dport = th->th_dport;
3135				pd->dport = &th->th_dport;
3136			}
3137			rewrite++;
3138			break;
3139		case IPPROTO_UDP:
3140			bproto_sum = pd->hdr.udp->uh_sum;
3141			pd->proto_sum = &pd->hdr.udp->uh_sum;
3142
3143			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
3144			    nk->port[pd->sidx] != sport) {
3145				pf_change_ap(saddr, &pd->hdr.udp->uh_sport,
3146				    pd->ip_sum, &pd->hdr.udp->uh_sum,
3147				    &nk->addr[pd->sidx],
3148				    nk->port[pd->sidx], 1, af);
3149				sport = pd->hdr.udp->uh_sport;
3150				pd->sport = &pd->hdr.udp->uh_sport;
3151			}
3152
3153			if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
3154			    nk->port[pd->didx] != dport) {
3155				pf_change_ap(daddr, &pd->hdr.udp->uh_dport,
3156				    pd->ip_sum, &pd->hdr.udp->uh_sum,
3157				    &nk->addr[pd->didx],
3158				    nk->port[pd->didx], 1, af);
3159				dport = pd->hdr.udp->uh_dport;
3160				pd->dport = &pd->hdr.udp->uh_dport;
3161			}
3162			rewrite++;
3163			break;
3164#ifdef INET
3165		case IPPROTO_ICMP:
3166			nk->port[0] = nk->port[1];
3167			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET))
3168				pf_change_a(&saddr->v4.s_addr, pd->ip_sum,
3169				    nk->addr[pd->sidx].v4.s_addr, 0);
3170
3171			if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET))
3172				pf_change_a(&daddr->v4.s_addr, pd->ip_sum,
3173				    nk->addr[pd->didx].v4.s_addr, 0);
3174
3175			if (nk->port[1] != pd->hdr.icmp->icmp_id) {
3176				pd->hdr.icmp->icmp_cksum = pf_cksum_fixup(
3177				    pd->hdr.icmp->icmp_cksum, sport,
3178				    nk->port[1], 0);
3179				pd->hdr.icmp->icmp_id = nk->port[1];
3180				pd->sport = &pd->hdr.icmp->icmp_id;
3181			}
3182			m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
3183			break;
3184#endif /* INET */
3185#ifdef INET6
3186		case IPPROTO_ICMPV6:
3187			nk->port[0] = nk->port[1];
3188			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6))
3189				pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum,
3190				    &nk->addr[pd->sidx], 0);
3191
3192			if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6))
3193				pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum,
3194				    &nk->addr[pd->didx], 0);
3195			rewrite++;
3196			break;
3197#endif /* INET6 */
3198		default:
3199			switch (af) {
3200#ifdef INET
3201			case AF_INET:
3202				if (PF_ANEQ(saddr,
3203				    &nk->addr[pd->sidx], AF_INET))
3204					pf_change_a(&saddr->v4.s_addr,
3205					    pd->ip_sum,
3206					    nk->addr[pd->sidx].v4.s_addr, 0);
3207
3208				if (PF_ANEQ(daddr,
3209				    &nk->addr[pd->didx], AF_INET))
3210					pf_change_a(&daddr->v4.s_addr,
3211					    pd->ip_sum,
3212					    nk->addr[pd->didx].v4.s_addr, 0);
3213				break;
3214#endif /* INET */
3215#ifdef INET6
3216			case AF_INET6:
3217				if (PF_ANEQ(saddr,
3218				    &nk->addr[pd->sidx], AF_INET6))
3219					PF_ACPY(saddr, &nk->addr[pd->sidx], af);
3220
3221				if (PF_ANEQ(daddr,
3222				    &nk->addr[pd->didx], AF_INET6))
3223					PF_ACPY(daddr, &nk->addr[pd->didx], af);
3224				break;
3225#endif /* INET6 */
3226			}
3227			break;
3228		}
3229		if (nr->natpass)
3230			r = NULL;
3231		pd->nat_rule = nr;
3232	}
3233
3234	while (r != NULL) {
3235		r->evaluations++;
3236		if (pfi_kif_match(r->kif, kif) == r->ifnot)
3237			r = r->skip[PF_SKIP_IFP].ptr;
3238		else if (r->direction && r->direction != direction)
3239			r = r->skip[PF_SKIP_DIR].ptr;
3240		else if (r->af && r->af != af)
3241			r = r->skip[PF_SKIP_AF].ptr;
3242		else if (r->proto && r->proto != pd->proto)
3243			r = r->skip[PF_SKIP_PROTO].ptr;
3244		else if (PF_MISMATCHAW(&r->src.addr, saddr, af,
3245		    r->src.neg, kif, M_GETFIB(m)))
3246			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
3247		/* tcp/udp only. port_op always 0 in other cases */
3248		else if (r->src.port_op && !pf_match_port(r->src.port_op,
3249		    r->src.port[0], r->src.port[1], sport))
3250			r = r->skip[PF_SKIP_SRC_PORT].ptr;
3251		else if (PF_MISMATCHAW(&r->dst.addr, daddr, af,
3252		    r->dst.neg, NULL, M_GETFIB(m)))
3253			r = r->skip[PF_SKIP_DST_ADDR].ptr;
3254		/* tcp/udp only. port_op always 0 in other cases */
3255		else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
3256		    r->dst.port[0], r->dst.port[1], dport))
3257			r = r->skip[PF_SKIP_DST_PORT].ptr;
3258		/* icmp only. type always 0 in other cases */
3259		else if (r->type && r->type != icmptype + 1)
3260			r = TAILQ_NEXT(r, entries);
3261		/* icmp only. type always 0 in other cases */
3262		else if (r->code && r->code != icmpcode + 1)
3263			r = TAILQ_NEXT(r, entries);
3264		else if (r->tos && !(r->tos == pd->tos))
3265			r = TAILQ_NEXT(r, entries);
3266		else if (r->rule_flag & PFRULE_FRAGMENT)
3267			r = TAILQ_NEXT(r, entries);
3268		else if (pd->proto == IPPROTO_TCP &&
3269		    (r->flagset & th->th_flags) != r->flags)
3270			r = TAILQ_NEXT(r, entries);
3271		/* tcp/udp only. uid.op always 0 in other cases */
3272		else if (r->uid.op && (pd->lookup.done || (pd->lookup.done =
3273		    pf_socket_lookup(direction, pd, m), 1)) &&
3274		    !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1],
3275		    pd->lookup.uid))
3276			r = TAILQ_NEXT(r, entries);
3277		/* tcp/udp only. gid.op always 0 in other cases */
3278		else if (r->gid.op && (pd->lookup.done || (pd->lookup.done =
3279		    pf_socket_lookup(direction, pd, m), 1)) &&
3280		    !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1],
3281		    pd->lookup.gid))
3282			r = TAILQ_NEXT(r, entries);
3283		else if (r->prob &&
3284		    r->prob <= arc4random())
3285			r = TAILQ_NEXT(r, entries);
3286		else if (r->match_tag && !pf_match_tag(m, r, &tag,
3287		    pd->pf_mtag ? pd->pf_mtag->tag : 0))
3288			r = TAILQ_NEXT(r, entries);
3289		else if (r->os_fingerprint != PF_OSFP_ANY &&
3290		    (pd->proto != IPPROTO_TCP || !pf_osfp_match(
3291		    pf_osfp_fingerprint(pd, m, off, th),
3292		    r->os_fingerprint)))
3293			r = TAILQ_NEXT(r, entries);
3294		else {
3295			if (r->tag)
3296				tag = r->tag;
3297			if (r->rtableid >= 0)
3298				rtableid = r->rtableid;
3299			if (r->anchor == NULL) {
3300				match = 1;
3301				*rm = r;
3302				*am = a;
3303				*rsm = ruleset;
3304				if ((*rm)->quick)
3305					break;
3306				r = TAILQ_NEXT(r, entries);
3307			} else
3308				pf_step_into_anchor(anchor_stack, &asd,
3309				    &ruleset, PF_RULESET_FILTER, &r, &a,
3310				    &match);
3311		}
3312		if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd,
3313		    &ruleset, PF_RULESET_FILTER, &r, &a, &match))
3314			break;
3315	}
3316	r = *rm;
3317	a = *am;
3318	ruleset = *rsm;
3319
3320	REASON_SET(&reason, PFRES_MATCH);
3321
3322	if (r->log || (nr != NULL && nr->log)) {
3323		if (rewrite)
3324			m_copyback(m, off, hdrlen, pd->hdr.any);
3325		PFLOG_PACKET(kif, m, af, direction, reason, r->log ? r : nr, a,
3326		    ruleset, pd, 1);
3327	}
3328
3329	if ((r->action == PF_DROP) &&
3330	    ((r->rule_flag & PFRULE_RETURNRST) ||
3331	    (r->rule_flag & PFRULE_RETURNICMP) ||
3332	    (r->rule_flag & PFRULE_RETURN))) {
3333		/* undo NAT changes, if they have taken place */
3334		if (nr != NULL) {
3335			PF_ACPY(saddr, &sk->addr[pd->sidx], af);
3336			PF_ACPY(daddr, &sk->addr[pd->didx], af);
3337			if (pd->sport)
3338				*pd->sport = sk->port[pd->sidx];
3339			if (pd->dport)
3340				*pd->dport = sk->port[pd->didx];
3341			if (pd->proto_sum)
3342				*pd->proto_sum = bproto_sum;
3343			if (pd->ip_sum)
3344				*pd->ip_sum = bip_sum;
3345			m_copyback(m, off, hdrlen, pd->hdr.any);
3346		}
3347		if (pd->proto == IPPROTO_TCP &&
3348		    ((r->rule_flag & PFRULE_RETURNRST) ||
3349		    (r->rule_flag & PFRULE_RETURN)) &&
3350		    !(th->th_flags & TH_RST)) {
3351			u_int32_t	 ack = ntohl(th->th_seq) + pd->p_len;
3352			int		 len = 0;
3353#ifdef INET
3354			struct ip	*h4;
3355#endif
3356#ifdef INET6
3357			struct ip6_hdr	*h6;
3358#endif
3359
3360			switch (af) {
3361#ifdef INET
3362			case AF_INET:
3363				h4 = mtod(m, struct ip *);
3364				len = ntohs(h4->ip_len) - off;
3365				break;
3366#endif
3367#ifdef INET6
3368			case AF_INET6:
3369				h6 = mtod(m, struct ip6_hdr *);
3370				len = ntohs(h6->ip6_plen) - (off - sizeof(*h6));
3371				break;
3372#endif
3373			}
3374
3375			if (pf_check_proto_cksum(m, off, len, IPPROTO_TCP, af))
3376				REASON_SET(&reason, PFRES_PROTCKSUM);
3377			else {
3378				if (th->th_flags & TH_SYN)
3379					ack++;
3380				if (th->th_flags & TH_FIN)
3381					ack++;
3382				pf_send_tcp(m, r, af, pd->dst,
3383				    pd->src, th->th_dport, th->th_sport,
3384				    ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0,
3385				    r->return_ttl, 1, 0, kif->pfik_ifp);
3386			}
3387		} else if (pd->proto != IPPROTO_ICMP && af == AF_INET &&
3388		    r->return_icmp)
3389			pf_send_icmp(m, r->return_icmp >> 8,
3390			    r->return_icmp & 255, af, r);
3391		else if (pd->proto != IPPROTO_ICMPV6 && af == AF_INET6 &&
3392		    r->return_icmp6)
3393			pf_send_icmp(m, r->return_icmp6 >> 8,
3394			    r->return_icmp6 & 255, af, r);
3395	}
3396
3397	if (r->action == PF_DROP)
3398		goto cleanup;
3399
3400	if (tag > 0 && pf_tag_packet(m, pd, tag)) {
3401		REASON_SET(&reason, PFRES_MEMORY);
3402		goto cleanup;
3403	}
3404	if (rtableid >= 0)
3405		M_SETFIB(m, rtableid);
3406
3407	if (!state_icmp && (r->keep_state || nr != NULL ||
3408	    (pd->flags & PFDESC_TCP_NORM))) {
3409		int action;
3410		action = pf_create_state(r, nr, a, pd, nsn, nk, sk, m, off,
3411		    sport, dport, &rewrite, kif, sm, tag, bproto_sum, bip_sum,
3412		    hdrlen);
3413		if (action != PF_PASS)
3414			return (action);
3415	} else {
3416		if (sk != NULL)
3417			uma_zfree(V_pf_state_key_z, sk);
3418		if (nk != NULL)
3419			uma_zfree(V_pf_state_key_z, nk);
3420	}
3421
3422	/* copy back packet headers if we performed NAT operations */
3423	if (rewrite)
3424		m_copyback(m, off, hdrlen, pd->hdr.any);
3425
3426	if (*sm != NULL && !((*sm)->state_flags & PFSTATE_NOSYNC) &&
3427	    direction == PF_OUT &&
3428	    pfsync_defer_ptr != NULL && pfsync_defer_ptr(*sm, m))
3429		/*
3430		 * We want the state created, but we don't
3431		 * want to send this packet yet, in case a
3432		 * partner firewall must learn the state
3433		 * first in order to allow replies through.
3434		 */
3435		return (PF_DEFER);
3436
3437	return (PF_PASS);
3438
3439cleanup:
3440	if (sk != NULL)
3441		uma_zfree(V_pf_state_key_z, sk);
3442	if (nk != NULL)
3443		uma_zfree(V_pf_state_key_z, nk);
3444	return (PF_DROP);
3445}
3446
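/*
 * Create state for a matching rule: enforce the rule's state limits,
 * attach source-tracking nodes, seed the per-peer TCP sequence windows,
 * insert the wire and stack state keys, and for "synproxy state" answer
 * the initial SYN ourselves with a SYN|ACK carrying our own ISS.
 */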
3447static int
3448pf_create_state(struct pf_rule *r, struct pf_rule *nr, struct pf_rule *a,
3449    struct pf_pdesc *pd, struct pf_src_node *nsn, struct pf_state_key *nk,
3450    struct pf_state_key *sk, struct mbuf *m, int off, u_int16_t sport,
3451    u_int16_t dport, int *rewrite, struct pfi_kif *kif, struct pf_state **sm,
3452    int tag, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen)
3453{
3454	struct pf_state		*s = NULL;
3455	struct pf_src_node	*sn = NULL;
3456	struct tcphdr		*th = pd->hdr.tcp;
3457	u_int16_t		 mss = V_tcp_mssdflt;
3458	u_short			 reason;
3459
3460	/* check maximums */
3461	if (r->max_states &&
3462	    (counter_u64_fetch(r->states_cur) >= r->max_states)) {
3463		counter_u64_add(V_pf_status.lcounters[LCNT_STATES], 1);
3464		REASON_SET(&reason, PFRES_MAXSTATES);
3465		return (PF_DROP);
3466	}
3467	/* src node for filter rule */
3468	if ((r->rule_flag & PFRULE_SRCTRACK ||
3469	    r->rpool.opts & PF_POOL_STICKYADDR) &&
3470	    pf_insert_src_node(&sn, r, pd->src, pd->af) != 0) {
3471		REASON_SET(&reason, PFRES_SRCLIMIT);
3472		goto csfailed;
3473	}
3474	/* src node for translation rule */
3475	if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) &&
3476	    pf_insert_src_node(&nsn, nr, &sk->addr[pd->sidx], pd->af)) {
3477		REASON_SET(&reason, PFRES_SRCLIMIT);
3478		goto csfailed;
3479	}
3480	s = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO);
3481	if (s == NULL) {
3482		REASON_SET(&reason, PFRES_MEMORY);
3483		goto csfailed;
3484	}
3485	s->rule.ptr = r;
3486	s->nat_rule.ptr = nr;
3487	s->anchor.ptr = a;
3488	STATE_INC_COUNTERS(s);
3489	if (r->allow_opts)
3490		s->state_flags |= PFSTATE_ALLOWOPTS;
3491	if (r->rule_flag & PFRULE_STATESLOPPY)
3492		s->state_flags |= PFSTATE_SLOPPY;
3493	s->log = r->log & PF_LOG_ALL;
3494	s->sync_state = PFSYNC_S_NONE;
3495	if (nr != NULL)
3496		s->log |= nr->log & PF_LOG_ALL;
3497	switch (pd->proto) {
3498	case IPPROTO_TCP:
3499		s->src.seqlo = ntohl(th->th_seq);
3500		s->src.seqhi = s->src.seqlo + pd->p_len + 1;
3501		if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN &&
3502		    r->keep_state == PF_STATE_MODULATE) {
3503			/* Generate sequence number modulator */
3504			if ((s->src.seqdiff = pf_tcp_iss(pd) - s->src.seqlo) ==
3505			    0)
3506				s->src.seqdiff = 1;
3507			pf_change_a(&th->th_seq, &th->th_sum,
3508			    htonl(s->src.seqlo + s->src.seqdiff), 0);
3509			*rewrite = 1;
3510		} else
3511			s->src.seqdiff = 0;
3512		if (th->th_flags & TH_SYN) {
3513			s->src.seqhi++;
3514			s->src.wscale = pf_get_wscale(m, off,
3515			    th->th_off, pd->af);
3516		}
3517		s->src.max_win = MAX(ntohs(th->th_win), 1);
3518		if (s->src.wscale & PF_WSCALE_MASK) {
3519			/* Remove scale factor from initial window */
3520			int win = s->src.max_win;
3521			win += 1 << (s->src.wscale & PF_WSCALE_MASK);
3522			s->src.max_win = (win - 1) >>
3523			    (s->src.wscale & PF_WSCALE_MASK);
3524		}
3525		if (th->th_flags & TH_FIN)
3526			s->src.seqhi++;
3527		s->dst.seqhi = 1;
3528		s->dst.max_win = 1;
3529		s->src.state = TCPS_SYN_SENT;
3530		s->dst.state = TCPS_CLOSED;
3531		s->timeout = PFTM_TCP_FIRST_PACKET;
3532		break;
3533	case IPPROTO_UDP:
3534		s->src.state = PFUDPS_SINGLE;
3535		s->dst.state = PFUDPS_NO_TRAFFIC;
3536		s->timeout = PFTM_UDP_FIRST_PACKET;
3537		break;
3538	case IPPROTO_ICMP:
3539#ifdef INET6
3540	case IPPROTO_ICMPV6:
3541#endif
3542		s->timeout = PFTM_ICMP_FIRST_PACKET;
3543		break;
3544	default:
3545		s->src.state = PFOTHERS_SINGLE;
3546		s->dst.state = PFOTHERS_NO_TRAFFIC;
3547		s->timeout = PFTM_OTHER_FIRST_PACKET;
3548	}
3549
3550	s->creation = time_uptime;
3551	s->expire = time_uptime;
3552
3553	if (sn != NULL) {
3554		s->src_node = sn;
3555		s->src_node->states++;
3556	}
3557	if (nsn != NULL) {
3558		/* XXX We only modify one side for now. */
3559		PF_ACPY(&nsn->raddr, &nk->addr[1], pd->af);
3560		s->nat_src_node = nsn;
3561		s->nat_src_node->states++;
3562	}
3563	if (pd->proto == IPPROTO_TCP) {
3564		if ((pd->flags & PFDESC_TCP_NORM) && pf_normalize_tcp_init(m,
3565		    off, pd, th, &s->src, &s->dst)) {
3566			REASON_SET(&reason, PFRES_MEMORY);
3567			pf_src_tree_remove_state(s);
3568			STATE_DEC_COUNTERS(s);
3569			uma_zfree(V_pf_state_z, s);
3570			return (PF_DROP);
3571		}
3572		if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub &&
3573		    pf_normalize_tcp_stateful(m, off, pd, &reason, th, s,
3574		    &s->src, &s->dst, rewrite)) {
3575			/* This really shouldn't happen!!! */
3576			DPFPRINTF(PF_DEBUG_URGENT,
3577			    ("pf_normalize_tcp_stateful failed on first pkt\n"));
3578			pf_normalize_tcp_cleanup(s);
3579			pf_src_tree_remove_state(s);
3580			STATE_DEC_COUNTERS(s);
3581			uma_zfree(V_pf_state_z, s);
3582			return (PF_DROP);
3583		}
3584	}
3585	s->direction = pd->dir;
3586
3587	/*
3588	 * sk/nk may already have been set up by pf_get_translation().
3589	 */
3590	if (nr == NULL) {
3591		KASSERT((sk == NULL && nk == NULL), ("%s: nr %p sk %p, nk %p",
3592		    __func__, nr, sk, nk));
3593		sk = pf_state_key_setup(pd, pd->src, pd->dst, sport, dport);
3594		if (sk == NULL)
3595			goto csfailed;
3596		nk = sk;
3597	} else
3598		KASSERT((sk != NULL && nk != NULL), ("%s: nr %p sk %p, nk %p",
3599		    __func__, nr, sk, nk));
3600
3601	/* Swap sk/nk for PF_OUT. */
3602	if (pf_state_insert(BOUND_IFACE(r, kif),
3603	    (pd->dir == PF_IN) ? sk : nk,
3604	    (pd->dir == PF_IN) ? nk : sk, s)) {
3605		if (pd->proto == IPPROTO_TCP)
3606			pf_normalize_tcp_cleanup(s);
3607		REASON_SET(&reason, PFRES_STATEINS);
3608		pf_src_tree_remove_state(s);
3609		STATE_DEC_COUNTERS(s);
3610		uma_zfree(V_pf_state_z, s);
3611		return (PF_DROP);
3612	} else
3613		*sm = s;
3614
3615	pf_set_rt_ifp(s, pd->src);	/* needs s->state_key set */
3616	if (tag > 0)
3617		s->tag = tag;
3618	if (pd->proto == IPPROTO_TCP && (th->th_flags & (TH_SYN|TH_ACK)) ==
3619	    TH_SYN && r->keep_state == PF_STATE_SYNPROXY) {
3620		s->src.state = PF_TCPS_PROXY_SRC;
3621		/* undo NAT changes, if they have taken place */
3622		if (nr != NULL) {
3623			struct pf_state_key *skt = s->key[PF_SK_WIRE];
3624			if (pd->dir == PF_OUT)
3625				skt = s->key[PF_SK_STACK];
3626			PF_ACPY(pd->src, &skt->addr[pd->sidx], pd->af);
3627			PF_ACPY(pd->dst, &skt->addr[pd->didx], pd->af);
3628			if (pd->sport)
3629				*pd->sport = skt->port[pd->sidx];
3630			if (pd->dport)
3631				*pd->dport = skt->port[pd->didx];
3632			if (pd->proto_sum)
3633				*pd->proto_sum = bproto_sum;
3634			if (pd->ip_sum)
3635				*pd->ip_sum = bip_sum;
3636			m_copyback(m, off, hdrlen, pd->hdr.any);
3637		}
3638		s->src.seqhi = htonl(arc4random());
3639		/* Find mss option */
3640		int rtid = M_GETFIB(m);
3641		mss = pf_get_mss(m, off, th->th_off, pd->af);
3642		mss = pf_calc_mss(pd->src, pd->af, rtid, mss);
3643		mss = pf_calc_mss(pd->dst, pd->af, rtid, mss);
3644		s->src.mss = mss;
3645		pf_send_tcp(NULL, r, pd->af, pd->dst, pd->src, th->th_dport,
3646		    th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1,
3647		    TH_SYN|TH_ACK, 0, s->src.mss, 0, 1, 0, NULL);
3648		REASON_SET(&reason, PFRES_SYNPROXY);
3649		return (PF_SYNPROXY_DROP);
3650	}
3651
3652	return (PF_PASS);
3653
3654csfailed:
3655	if (sk != NULL)
3656		uma_zfree(V_pf_state_key_z, sk);
3657	if (nk != NULL)
3658		uma_zfree(V_pf_state_key_z, nk);
3659
3660	if (sn != NULL && sn->states == 0 && sn->expire == 0) {
3661		pf_unlink_src_node(sn);
3662		pf_free_src_node(sn);
3663	}
3664
3665	if (nsn != sn && nsn != NULL && nsn->states == 0 && nsn->expire == 0) {
3666		pf_unlink_src_node(nsn);
3667		pf_free_src_node(nsn);
3668	}
3669
3670	return (PF_DROP);
3671}
3672
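/*
 * Rule matching for fragments that were not reassembled: ports, TCP
 * flags, ICMP type/code and OS fingerprints are unavailable here, so
 * rules that depend on them cannot match, and no state is created.
 */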
3673static int
3674pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif,
3675    struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_rule **am,
3676    struct pf_ruleset **rsm)
3677{
3678	struct pf_rule		*r, *a = NULL;
3679	struct pf_ruleset	*ruleset = NULL;
3680	sa_family_t		 af = pd->af;
3681	u_short			 reason;
3682	int			 tag = -1;
3683	int			 asd = 0;
3684	int			 match = 0;
3685	struct pf_anchor_stackframe	anchor_stack[PF_ANCHOR_STACKSIZE];
3686
3687	PF_RULES_RASSERT();
3688
3689	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
3690	while (r != NULL) {
3691		r->evaluations++;
3692		if (pfi_kif_match(r->kif, kif) == r->ifnot)
3693			r = r->skip[PF_SKIP_IFP].ptr;
3694		else if (r->direction && r->direction != direction)
3695			r = r->skip[PF_SKIP_DIR].ptr;
3696		else if (r->af && r->af != af)
3697			r = r->skip[PF_SKIP_AF].ptr;
3698		else if (r->proto && r->proto != pd->proto)
3699			r = r->skip[PF_SKIP_PROTO].ptr;
3700		else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
3701		    r->src.neg, kif, M_GETFIB(m)))
3702			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
3703		else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
3704		    r->dst.neg, NULL, M_GETFIB(m)))
3705			r = r->skip[PF_SKIP_DST_ADDR].ptr;
3706		else if (r->tos && !(r->tos == pd->tos))
3707			r = TAILQ_NEXT(r, entries);
3708		else if (r->os_fingerprint != PF_OSFP_ANY)
3709			r = TAILQ_NEXT(r, entries);
3710		else if (pd->proto == IPPROTO_UDP &&
3711		    (r->src.port_op || r->dst.port_op))
3712			r = TAILQ_NEXT(r, entries);
3713		else if (pd->proto == IPPROTO_TCP &&
3714		    (r->src.port_op || r->dst.port_op || r->flagset))
3715			r = TAILQ_NEXT(r, entries);
3716		else if ((pd->proto == IPPROTO_ICMP ||
3717		    pd->proto == IPPROTO_ICMPV6) &&
3718		    (r->type || r->code))
3719			r = TAILQ_NEXT(r, entries);
3720		else if (r->prob && r->prob <=
3721		    (arc4random() % (UINT_MAX - 1) + 1))
3722			r = TAILQ_NEXT(r, entries);
3723		else if (r->match_tag && !pf_match_tag(m, r, &tag,
3724		    pd->pf_mtag ? pd->pf_mtag->tag : 0))
3725			r = TAILQ_NEXT(r, entries);
3726		else {
3727			if (r->anchor == NULL) {
3728				match = 1;
3729				*rm = r;
3730				*am = a;
3731				*rsm = ruleset;
3732				if ((*rm)->quick)
3733					break;
3734				r = TAILQ_NEXT(r, entries);
3735			} else
3736				pf_step_into_anchor(anchor_stack, &asd,
3737				    &ruleset, PF_RULESET_FILTER, &r, &a,
3738				    &match);
3739		}
3740		if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd,
3741		    &ruleset, PF_RULESET_FILTER, &r, &a, &match))
3742			break;
3743	}
3744	r = *rm;
3745	a = *am;
3746	ruleset = *rsm;
3747
3748	REASON_SET(&reason, PFRES_MATCH);
3749
3750	if (r->log)
3751		PFLOG_PACKET(kif, m, af, direction, reason, r, a, ruleset, pd,
3752		    1);
3753
3754	if (r->action != PF_PASS)
3755		return (PF_DROP);
3756
3757	if (tag > 0 && pf_tag_packet(m, pd, tag)) {
3758		REASON_SET(&reason, PFRES_MEMORY);
3759		return (PF_DROP);
3760	}
3761
3762	return (PF_PASS);
3763}
3764
3765static int
3766pf_tcp_track_full(struct pf_state_peer *src, struct pf_state_peer *dst,
3767	struct pf_state **state, struct pfi_kif *kif, struct mbuf *m, int off,
3768	struct pf_pdesc *pd, u_short *reason, int *copyback)
3769{
3770	struct tcphdr		*th = pd->hdr.tcp;
3771	u_int16_t		 win = ntohs(th->th_win);
3772	u_int32_t		 ack, end, seq, orig_seq;
3773	u_int8_t		 sws, dws;
3774	int			 ackskew;
3775
3776	if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) {
3777		sws = src->wscale & PF_WSCALE_MASK;
3778		dws = dst->wscale & PF_WSCALE_MASK;
3779	} else
3780		sws = dws = 0;
3781
3782	/*
3783	 * Sequence tracking algorithm from Guido van Rooij's paper:
3784	 *   http://www.madison-gurkha.com/publications/tcp_filtering/
3785	 *	tcp_filtering.ps
3786	 */
3787
3788	orig_seq = seq = ntohl(th->th_seq);
3789	if (src->seqlo == 0) {
3790		/* First packet from this end. Set its state */
3791
3792		if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) &&
3793		    src->scrub == NULL) {
3794			if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) {
3795				REASON_SET(reason, PFRES_MEMORY);
3796				return (PF_DROP);
3797			}
3798		}
3799
3800		/* Deferred generation of sequence number modulator */
3801		if (dst->seqdiff && !src->seqdiff) {
3802			/* use random iss for the TCP server */
3803			while ((src->seqdiff = arc4random() - seq) == 0)
3804				;
3805			ack = ntohl(th->th_ack) - dst->seqdiff;
3806			pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
3807			    src->seqdiff), 0);
3808			pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
3809			*copyback = 1;
3810		} else {
3811			ack = ntohl(th->th_ack);
3812		}
3813
3814		end = seq + pd->p_len;
3815		if (th->th_flags & TH_SYN) {
3816			end++;
3817			if (dst->wscale & PF_WSCALE_FLAG) {
3818				src->wscale = pf_get_wscale(m, off, th->th_off,
3819				    pd->af);
3820				if (src->wscale & PF_WSCALE_FLAG) {
3821					/* Remove scale factor from initial
3822					 * window */
3823					sws = src->wscale & PF_WSCALE_MASK;
3824					win = ((u_int32_t)win + (1 << sws) - 1)
3825					    >> sws;
3826					dws = dst->wscale & PF_WSCALE_MASK;
3827				} else {
3828					/* fixup other window */
3829					dst->max_win <<= dst->wscale &
3830					    PF_WSCALE_MASK;
3831					/* in case of a retrans SYN|ACK */
3832					dst->wscale = 0;
3833				}
3834			}
3835		}
3836		if (th->th_flags & TH_FIN)
3837			end++;
3838
3839		src->seqlo = seq;
3840		if (src->state < TCPS_SYN_SENT)
3841			src->state = TCPS_SYN_SENT;
3842
3843		/*
3844		 * May need to slide the window (seqhi may have been set by
3845		 * the crappy stack check or if we picked up the connection
3846		 * after establishment)
3847		 */
3848		if (src->seqhi == 1 ||
3849		    SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi))
3850			src->seqhi = end + MAX(1, dst->max_win << dws);
3851		if (win > src->max_win)
3852			src->max_win = win;
3853
3854	} else {
3855		ack = ntohl(th->th_ack) - dst->seqdiff;
3856		if (src->seqdiff) {
3857			/* Modulate sequence numbers */
3858			pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
3859			    src->seqdiff), 0);
3860			pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
3861			*copyback = 1;
3862		}
3863		end = seq + pd->p_len;
3864		if (th->th_flags & TH_SYN)
3865			end++;
3866		if (th->th_flags & TH_FIN)
3867			end++;
3868	}
3869
3870	if ((th->th_flags & TH_ACK) == 0) {
3871		/* Let it pass through the ack skew check */
3872		ack = dst->seqlo;
3873	} else if ((ack == 0 &&
3874	    (th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) ||
3875	    /* broken tcp stacks do not set ack */
3876	    (dst->state < TCPS_SYN_SENT)) {
3877		/*
3878		 * Many stacks (ours included) will set the ACK number in an
3879		 * Many stacks (ours included) will set the ACK number in a
3880		 * FIN|ACK if the SYN times out -- no sequence to ACK.
3881		ack = dst->seqlo;
3882	}
3883
3884	if (seq == end) {
3885		/* Ease sequencing restrictions on no-data packets */
3886		seq = src->seqlo;
3887		end = seq;
3888	}
3889
3890	ackskew = dst->seqlo - ack;
3891
3892
3893	/*
3894	 * Need to demodulate the sequence numbers in any TCP SACK options
3895	 * (Selective ACK). We could optionally validate the SACK values
3896	 * against the current ACK window, either forwards or backwards, but
3897	 * I'm not confident that SACK has been implemented properly
3898	 * everywhere. It wouldn't surprise me if several stacks accidentally
3899	 * SACK too far backwards of previously ACKed data. There really aren't
3900	 * any security implications of bad SACKing unless the target stack
3901	 * doesn't validate the option length correctly. Someone trying to
3902	 * spoof into a TCP connection won't bother blindly sending SACK
3903	 * options anyway.
3904	 */
3905	if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) {
3906		if (pf_modulate_sack(m, off, pd, th, dst))
3907			*copyback = 1;
3908	}
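	/*
	 * Illustrative note (added, not upstream): pf_modulate_sack()
	 * applies the same shift to the 32-bit left/right edges inside each
	 * SACK block, so a block {5001, 6001} covering modulated data has
	 * dst->seqdiff subtracted from both edges before the stack behind
	 * pf sees it.
	 */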
3909
3910
3911#define	MAXACKWINDOW (0xffff + 1500)	/* 1500 is an arbitrary fudge factor */
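	/*
	 * Worked example (added, illustrative): MAXACKWINDOW is
	 * 0xffff + 1500 = 67035 octets.  Assuming a window scale of 7, the
	 * scaled bound below becomes 67035 << 7, roughly 8.6 million octets
	 * of ack skew tolerated before a packet falls through to the
	 * loose-match code further down.
	 */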
3912	if (SEQ_GEQ(src->seqhi, end) &&
3913	    /* Last octet inside other's window space */
3914	    SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) &&
3915	    /* Retrans: not more than one window back */
3916	    (ackskew >= -MAXACKWINDOW) &&
3917	    /* Acking not more than one reassembled fragment backwards */
3918	    (ackskew <= (MAXACKWINDOW << sws)) &&
3919	    /* Acking not more than one window forward */
3920	    ((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo ||
3921	    (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo) ||
3922	    (pd->flags & PFDESC_IP_REAS) == 0)) {
3923	    /* Require an exact/+1 sequence match on resets when possible */
3924
3925		if (dst->scrub || src->scrub) {
3926			if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
3927			    *state, src, dst, copyback))
3928				return (PF_DROP);
3929		}
3930
3931		/* update max window */
3932		if (src->max_win < win)
3933			src->max_win = win;
3934		/* synchronize sequencing */
3935		if (SEQ_GT(end, src->seqlo))
3936			src->seqlo = end;
3937		/* slide the window of what the other end can send */
3938		if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
3939			dst->seqhi = ack + MAX((win << sws), 1);
3940
3941
3942		/* update states */
3943		if (th->th_flags & TH_SYN)
3944			if (src->state < TCPS_SYN_SENT)
3945				src->state = TCPS_SYN_SENT;
3946		if (th->th_flags & TH_FIN)
3947			if (src->state < TCPS_CLOSING)
3948				src->state = TCPS_CLOSING;
3949		if (th->th_flags & TH_ACK) {
3950			if (dst->state == TCPS_SYN_SENT) {
3951				dst->state = TCPS_ESTABLISHED;
3952				if (src->state == TCPS_ESTABLISHED &&
3953				    (*state)->src_node != NULL &&
3954				    pf_src_connlimit(state)) {
3955					REASON_SET(reason, PFRES_SRCLIMIT);
3956					return (PF_DROP);
3957				}
3958			} else if (dst->state == TCPS_CLOSING)
3959				dst->state = TCPS_FIN_WAIT_2;
3960		}
3961		if (th->th_flags & TH_RST)
3962			src->state = dst->state = TCPS_TIME_WAIT;
3963
3964		/* update expire time */
3965		(*state)->expire = time_uptime;
3966		if (src->state >= TCPS_FIN_WAIT_2 &&
3967		    dst->state >= TCPS_FIN_WAIT_2)
3968			(*state)->timeout = PFTM_TCP_CLOSED;
3969		else if (src->state >= TCPS_CLOSING &&
3970		    dst->state >= TCPS_CLOSING)
3971			(*state)->timeout = PFTM_TCP_FIN_WAIT;
3972		else if (src->state < TCPS_ESTABLISHED ||
3973		    dst->state < TCPS_ESTABLISHED)
3974			(*state)->timeout = PFTM_TCP_OPENING;
3975		else if (src->state >= TCPS_CLOSING ||
3976		    dst->state >= TCPS_CLOSING)
3977			(*state)->timeout = PFTM_TCP_CLOSING;
3978		else
3979			(*state)->timeout = PFTM_TCP_ESTABLISHED;
3980
3981		/* Fall through to PASS packet */
3982
3983	} else if ((dst->state < TCPS_SYN_SENT ||
3984		dst->state >= TCPS_FIN_WAIT_2 ||
3985		src->state >= TCPS_FIN_WAIT_2) &&
3986	    SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) &&
3987	    /* Within a window forward of the originating packet */
3988	    SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
3989	    /* Within a window backward of the originating packet */
3990
3991		/*
3992		 * This currently handles three situations:
3993		 *  1) Stupid stacks will shotgun SYNs before their peer
3994		 *     replies.
3995		 *  2) When PF catches an already established stream (the
3996		 *     firewall rebooted, the state table was flushed, routes
3997		 *     changed...)
3998		 *  3) Packets get funky immediately after the connection
3999		 *     closes (this should catch Solaris spurious ACK|FINs
4000		 *     that web servers like to spew after a close)
4001		 *
4002		 * This must be a little more careful than the above code
4003		 * since packet floods will also be caught here. We don't
4004		 * update the TTL here to mitigate the damage of a packet
4005		 * flood and so the same code can handle awkward establishment
4006		 * and a loosened connection close.
4007		 * In the establishment case, a correct peer response will
4008		 * validate the connection, go through the normal state code
4009		 * and keep updating the state TTL.
4010		 */
4011
4012		if (V_pf_status.debug >= PF_DEBUG_MISC) {
4013			printf("pf: loose state match: ");
4014			pf_print_state(*state);
4015			pf_print_flags(th->th_flags);
4016			printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
4017			    "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack,
4018			    pd->p_len, ackskew, (unsigned long long)(*state)->packets[0],
4019			    (unsigned long long)(*state)->packets[1],
4020			    pd->dir == PF_IN ? "in" : "out",
4021			    pd->dir == (*state)->direction ? "fwd" : "rev");
4022		}
4023
4024		if (dst->scrub || src->scrub) {
4025			if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
4026			    *state, src, dst, copyback))
4027				return (PF_DROP);
4028		}
4029
4030		/* update max window */
4031		if (src->max_win < win)
4032			src->max_win = win;
4033		/* synchronize sequencing */
4034		if (SEQ_GT(end, src->seqlo))
4035			src->seqlo = end;
4036		/* slide the window of what the other end can send */
4037		if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
4038			dst->seqhi = ack + MAX((win << sws), 1);
4039
4040		/*
4041		 * Cannot set dst->seqhi here since this could be a shotgunned
4042		 * SYN and not an already established connection.
4043		 */
4044
4045		if (th->th_flags & TH_FIN)
4046			if (src->state < TCPS_CLOSING)
4047				src->state = TCPS_CLOSING;
4048		if (th->th_flags & TH_RST)
4049			src->state = dst->state = TCPS_TIME_WAIT;
4050
4051		/* Fall through to PASS packet */
4052
4053	} else {
4054		if ((*state)->dst.state == TCPS_SYN_SENT &&
4055		    (*state)->src.state == TCPS_SYN_SENT) {
4056			/* Send RST for state mismatches during handshake */
4057			if (!(th->th_flags & TH_RST))
4058				pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
4059				    pd->dst, pd->src, th->th_dport,
4060				    th->th_sport, ntohl(th->th_ack), 0,
4061				    TH_RST, 0, 0,
4062				    (*state)->rule.ptr->return_ttl, 1, 0,
4063				    kif->pfik_ifp);
4064			src->seqlo = 0;
4065			src->seqhi = 1;
4066			src->max_win = 1;
4067		} else if (V_pf_status.debug >= PF_DEBUG_MISC) {
4068			printf("pf: BAD state: ");
4069			pf_print_state(*state);
4070			pf_print_flags(th->th_flags);
4071			printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
4072			    "pkts=%llu:%llu dir=%s,%s\n",
4073			    seq, orig_seq, ack, pd->p_len, ackskew,
4074			    (unsigned long long)(*state)->packets[0],
4075			    (unsigned long long)(*state)->packets[1],
4076			    pd->dir == PF_IN ? "in" : "out",
4077			    pd->dir == (*state)->direction ? "fwd" : "rev");
4078			printf("pf: State failure on: %c %c %c %c | %c %c\n",
4079			    SEQ_GEQ(src->seqhi, end) ? ' ' : '1',
4080			    SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ?
4081			    ' ' : '2',
4082			    (ackskew >= -MAXACKWINDOW) ? ' ' : '3',
4083			    (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4',
4084			    SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ? ' ' : '5',
4085			    SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ? ' ' : '6');
4086		}
4087		REASON_SET(reason, PFRES_BADSTATE);
4088		return (PF_DROP);
4089	}
4090
4091	return (PF_PASS);
4092}
4093
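/*
 * Descriptive summary (added note, not in the upstream source): sloppy
 * tracking skips the sequence window checks entirely and only walks the two
 * peers through the TCP state machine, which tolerates asymmetric paths
 * where pf sees just one half of the connection.
 */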
4094static int
4095pf_tcp_track_sloppy(struct pf_state_peer *src, struct pf_state_peer *dst,
4096	struct pf_state **state, struct pf_pdesc *pd, u_short *reason)
4097{
4098	struct tcphdr		*th = pd->hdr.tcp;
4099
4100	if (th->th_flags & TH_SYN)
4101		if (src->state < TCPS_SYN_SENT)
4102			src->state = TCPS_SYN_SENT;
4103	if (th->th_flags & TH_FIN)
4104		if (src->state < TCPS_CLOSING)
4105			src->state = TCPS_CLOSING;
4106	if (th->th_flags & TH_ACK) {
4107		if (dst->state == TCPS_SYN_SENT) {
4108			dst->state = TCPS_ESTABLISHED;
4109			if (src->state == TCPS_ESTABLISHED &&
4110			    (*state)->src_node != NULL &&
4111			    pf_src_connlimit(state)) {
4112				REASON_SET(reason, PFRES_SRCLIMIT);
4113				return (PF_DROP);
4114			}
4115		} else if (dst->state == TCPS_CLOSING) {
4116			dst->state = TCPS_FIN_WAIT_2;
4117		} else if (src->state == TCPS_SYN_SENT &&
4118		    dst->state < TCPS_SYN_SENT) {
4119			/*
4120			 * Handle a special sloppy case where we only see one
4121			 * half of the connection. If there is an ACK after
4122			 * the initial SYN without ever seeing a packet from
4123			 * the destination, set the connection to established.
4124			 */
4125			dst->state = src->state = TCPS_ESTABLISHED;
4126			if ((*state)->src_node != NULL &&
4127			    pf_src_connlimit(state)) {
4128				REASON_SET(reason, PFRES_SRCLIMIT);
4129				return (PF_DROP);
4130			}
4131		} else if (src->state == TCPS_CLOSING &&
4132		    dst->state == TCPS_ESTABLISHED &&
4133		    dst->seqlo == 0) {
4134			/*
4135			 * Handle the closing of half connections where we
4136			 * don't see the full bidirectional FIN/ACK+ACK
4137			 * handshake.
4138			 */
4139			dst->state = TCPS_CLOSING;
4140		}
4141	}
4142	if (th->th_flags & TH_RST)
4143		src->state = dst->state = TCPS_TIME_WAIT;
4144
4145	/* update expire time */
4146	(*state)->expire = time_uptime;
4147	if (src->state >= TCPS_FIN_WAIT_2 &&
4148	    dst->state >= TCPS_FIN_WAIT_2)
4149		(*state)->timeout = PFTM_TCP_CLOSED;
4150	else if (src->state >= TCPS_CLOSING &&
4151	    dst->state >= TCPS_CLOSING)
4152		(*state)->timeout = PFTM_TCP_FIN_WAIT;
4153	else if (src->state < TCPS_ESTABLISHED ||
4154	    dst->state < TCPS_ESTABLISHED)
4155		(*state)->timeout = PFTM_TCP_OPENING;
4156	else if (src->state >= TCPS_CLOSING ||
4157	    dst->state >= TCPS_CLOSING)
4158		(*state)->timeout = PFTM_TCP_CLOSING;
4159	else
4160		(*state)->timeout = PFTM_TCP_ESTABLISHED;
4161
4162	return (PF_PASS);
4163}
4164
4165static int
4166pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif,
4167    struct mbuf *m, int off, void *h, struct pf_pdesc *pd,
4168    u_short *reason)
4169{
4170	struct pf_state_key_cmp	 key;
4171	struct tcphdr		*th = pd->hdr.tcp;
4172	int			 copyback = 0;
4173	struct pf_state_peer	*src, *dst;
4174	struct pf_state_key	*sk;
4175
4176	bzero(&key, sizeof(key));
4177	key.af = pd->af;
4178	key.proto = IPPROTO_TCP;
4179	if (direction == PF_IN)	{	/* wire side, straight */
4180		PF_ACPY(&key.addr[0], pd->src, key.af);
4181		PF_ACPY(&key.addr[1], pd->dst, key.af);
4182		key.port[0] = th->th_sport;
4183		key.port[1] = th->th_dport;
4184	} else {			/* stack side, reverse */
4185		PF_ACPY(&key.addr[1], pd->src, key.af);
4186		PF_ACPY(&key.addr[0], pd->dst, key.af);
4187		key.port[1] = th->th_sport;
4188		key.port[0] = th->th_dport;
4189	}
4190
4191	STATE_LOOKUP(kif, &key, direction, *state, pd);
4192
4193	if (direction == (*state)->direction) {
4194		src = &(*state)->src;
4195		dst = &(*state)->dst;
4196	} else {
4197		src = &(*state)->dst;
4198		dst = &(*state)->src;
4199	}
4200
4201	sk = (*state)->key[pd->didx];
4202
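	/*
	 * Descriptive summary (added note, not upstream): synproxy runs in
	 * two stages.  In PF_TCPS_PROXY_SRC pf itself completes the
	 * three-way handshake with the client; in PF_TCPS_PROXY_DST it opens
	 * a second handshake to the real server and splices the two,
	 * recording the offset between the ISNs in seqdiff so later packets
	 * can be translated.
	 */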
4203	if ((*state)->src.state == PF_TCPS_PROXY_SRC) {
4204		if (direction != (*state)->direction) {
4205			REASON_SET(reason, PFRES_SYNPROXY);
4206			return (PF_SYNPROXY_DROP);
4207		}
4208		if (th->th_flags & TH_SYN) {
4209			if (ntohl(th->th_seq) != (*state)->src.seqlo) {
4210				REASON_SET(reason, PFRES_SYNPROXY);
4211				return (PF_DROP);
4212			}
4213			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst,
4214			    pd->src, th->th_dport, th->th_sport,
4215			    (*state)->src.seqhi, ntohl(th->th_seq) + 1,
4216			    TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, 1, 0, NULL);
4217			REASON_SET(reason, PFRES_SYNPROXY);
4218			return (PF_SYNPROXY_DROP);
4219		} else if (!(th->th_flags & TH_ACK) ||
4220		    (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
4221		    (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
4222			REASON_SET(reason, PFRES_SYNPROXY);
4223			return (PF_DROP);
4224		} else if ((*state)->src_node != NULL &&
4225		    pf_src_connlimit(state)) {
4226			REASON_SET(reason, PFRES_SRCLIMIT);
4227			return (PF_DROP);
4228		} else
4229			(*state)->src.state = PF_TCPS_PROXY_DST;
4230	}
4231	if ((*state)->src.state == PF_TCPS_PROXY_DST) {
4232		if (direction == (*state)->direction) {
4233			if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) ||
4234			    (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
4235			    (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
4236				REASON_SET(reason, PFRES_SYNPROXY);
4237				return (PF_DROP);
4238			}
4239			(*state)->src.max_win = MAX(ntohs(th->th_win), 1);
4240			if ((*state)->dst.seqhi == 1)
4241				(*state)->dst.seqhi = htonl(arc4random());
4242			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
4243			    &sk->addr[pd->sidx], &sk->addr[pd->didx],
4244			    sk->port[pd->sidx], sk->port[pd->didx],
4245			    (*state)->dst.seqhi, 0, TH_SYN, 0,
4246			    (*state)->src.mss, 0, 0, (*state)->tag, NULL);
4247			REASON_SET(reason, PFRES_SYNPROXY);
4248			return (PF_SYNPROXY_DROP);
4249		} else if (((th->th_flags & (TH_SYN|TH_ACK)) !=
4250		    (TH_SYN|TH_ACK)) ||
4251		    (ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) {
4252			REASON_SET(reason, PFRES_SYNPROXY);
4253			return (PF_DROP);
4254		} else {
4255			(*state)->dst.max_win = MAX(ntohs(th->th_win), 1);
4256			(*state)->dst.seqlo = ntohl(th->th_seq);
4257			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst,
4258			    pd->src, th->th_dport, th->th_sport,
4259			    ntohl(th->th_ack), ntohl(th->th_seq) + 1,
4260			    TH_ACK, (*state)->src.max_win, 0, 0, 0,
4261			    (*state)->tag, NULL);
4262			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
4263			    &sk->addr[pd->sidx], &sk->addr[pd->didx],
4264			    sk->port[pd->sidx], sk->port[pd->didx],
4265			    (*state)->src.seqhi + 1, (*state)->src.seqlo + 1,
4266			    TH_ACK, (*state)->dst.max_win, 0, 0, 1, 0, NULL);
4267			(*state)->src.seqdiff = (*state)->dst.seqhi -
4268			    (*state)->src.seqlo;
4269			(*state)->dst.seqdiff = (*state)->src.seqhi -
4270			    (*state)->dst.seqlo;
4271			(*state)->src.seqhi = (*state)->src.seqlo +
4272			    (*state)->dst.max_win;
4273			(*state)->dst.seqhi = (*state)->dst.seqlo +
4274			    (*state)->src.max_win;
4275			(*state)->src.wscale = (*state)->dst.wscale = 0;
4276			(*state)->src.state = (*state)->dst.state =
4277			    TCPS_ESTABLISHED;
4278			REASON_SET(reason, PFRES_SYNPROXY);
4279			return (PF_SYNPROXY_DROP);
4280		}
4281	}
4282
4283	if (((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN) &&
4284	    dst->state >= TCPS_FIN_WAIT_2 &&
4285	    src->state >= TCPS_FIN_WAIT_2) {
4286		if (V_pf_status.debug >= PF_DEBUG_MISC) {
4287			printf("pf: state reuse ");
4288			pf_print_state(*state);
4289			pf_print_flags(th->th_flags);
4290			printf("\n");
4291		}
4292		/* XXX make sure it's the same direction ?? */
4293		(*state)->src.state = (*state)->dst.state = TCPS_CLOSED;
4294		pf_unlink_state(*state, PF_ENTER_LOCKED);
4295		*state = NULL;
4296		return (PF_DROP);
4297	}
4298
4299	if ((*state)->state_flags & PFSTATE_SLOPPY) {
4300		if (pf_tcp_track_sloppy(src, dst, state, pd, reason) == PF_DROP)
4301			return (PF_DROP);
4302	} else {
4303		if (pf_tcp_track_full(src, dst, state, kif, m, off, pd, reason,
4304		    &copyback) == PF_DROP)
4305			return (PF_DROP);
4306	}
4307
4308	/* translate source/destination address, if necessary */
4309	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4310		struct pf_state_key *nk = (*state)->key[pd->didx];
4311
4312		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
4313		    nk->port[pd->sidx] != th->th_sport)
4314			pf_change_ap(pd->src, &th->th_sport, pd->ip_sum,
4315			    &th->th_sum, &nk->addr[pd->sidx],
4316			    nk->port[pd->sidx], 0, pd->af);
4317
4318		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
4319		    nk->port[pd->didx] != th->th_dport)
4320			pf_change_ap(pd->dst, &th->th_dport, pd->ip_sum,
4321			    &th->th_sum, &nk->addr[pd->didx],
4322			    nk->port[pd->didx], 0, pd->af);
4323		copyback = 1;
4324	}
4325
4326	/* Copyback sequence modulation or stateful scrub changes if needed */
4327	if (copyback)
4328		m_copyback(m, off, sizeof(*th), (caddr_t)th);
4329
4330	return (PF_PASS);
4331}
4332
4333static int
4334pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif,
4335    struct mbuf *m, int off, void *h, struct pf_pdesc *pd)
4336{
4337	struct pf_state_peer	*src, *dst;
4338	struct pf_state_key_cmp	 key;
4339	struct udphdr		*uh = pd->hdr.udp;
4340
4341	bzero(&key, sizeof(key));
4342	key.af = pd->af;
4343	key.proto = IPPROTO_UDP;
4344	if (direction == PF_IN)	{	/* wire side, straight */
4345		PF_ACPY(&key.addr[0], pd->src, key.af);
4346		PF_ACPY(&key.addr[1], pd->dst, key.af);
4347		key.port[0] = uh->uh_sport;
4348		key.port[1] = uh->uh_dport;
4349	} else {			/* stack side, reverse */
4350		PF_ACPY(&key.addr[1], pd->src, key.af);
4351		PF_ACPY(&key.addr[0], pd->dst, key.af);
4352		key.port[1] = uh->uh_sport;
4353		key.port[0] = uh->uh_dport;
4354	}
4355
4356	STATE_LOOKUP(kif, &key, direction, *state, pd);
4357
4358	if (direction == (*state)->direction) {
4359		src = &(*state)->src;
4360		dst = &(*state)->dst;
4361	} else {
4362		src = &(*state)->dst;
4363		dst = &(*state)->src;
4364	}
4365
4366	/* update states */
4367	if (src->state < PFUDPS_SINGLE)
4368		src->state = PFUDPS_SINGLE;
4369	if (dst->state == PFUDPS_SINGLE)
4370		dst->state = PFUDPS_MULTIPLE;
4371
4372	/* update expire time */
4373	(*state)->expire = time_uptime;
4374	if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE)
4375		(*state)->timeout = PFTM_UDP_MULTIPLE;
4376	else
4377		(*state)->timeout = PFTM_UDP_SINGLE;
4378
4379	/* translate source/destination address, if necessary */
4380	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4381		struct pf_state_key *nk = (*state)->key[pd->didx];
4382
4383		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
4384		    nk->port[pd->sidx] != uh->uh_sport)
4385			pf_change_ap(pd->src, &uh->uh_sport, pd->ip_sum,
4386			    &uh->uh_sum, &nk->addr[pd->sidx],
4387			    nk->port[pd->sidx], 1, pd->af);
4388
4389		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
4390		    nk->port[pd->didx] != uh->uh_dport)
4391			pf_change_ap(pd->dst, &uh->uh_dport, pd->ip_sum,
4392			    &uh->uh_sum, &nk->addr[pd->didx],
4393			    nk->port[pd->didx], 1, pd->af);
4394		m_copyback(m, off, sizeof(*uh), (caddr_t)uh);
4395	}
4396
4397	return (PF_PASS);
4398}
4399
4400static int
4401pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif,
4402    struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason)
4403{
4404	struct pf_addr  *saddr = pd->src, *daddr = pd->dst;
4405	u_int16_t	 icmpid = 0, *icmpsum;
4406	u_int8_t	 icmptype;
4407	int		 state_icmp = 0;
4408	struct pf_state_key_cmp key;
4409
4410	bzero(&key, sizeof(key));
4411	switch (pd->proto) {
4412#ifdef INET
4413	case IPPROTO_ICMP:
4414		icmptype = pd->hdr.icmp->icmp_type;
4415		icmpid = pd->hdr.icmp->icmp_id;
4416		icmpsum = &pd->hdr.icmp->icmp_cksum;
4417
4418		if (icmptype == ICMP_UNREACH ||
4419		    icmptype == ICMP_SOURCEQUENCH ||
4420		    icmptype == ICMP_REDIRECT ||
4421		    icmptype == ICMP_TIMXCEED ||
4422		    icmptype == ICMP_PARAMPROB)
4423			state_icmp++;
4424		break;
4425#endif /* INET */
4426#ifdef INET6
4427	case IPPROTO_ICMPV6:
4428		icmptype = pd->hdr.icmp6->icmp6_type;
4429		icmpid = pd->hdr.icmp6->icmp6_id;
4430		icmpsum = &pd->hdr.icmp6->icmp6_cksum;
4431
4432		if (icmptype == ICMP6_DST_UNREACH ||
4433		    icmptype == ICMP6_PACKET_TOO_BIG ||
4434		    icmptype == ICMP6_TIME_EXCEEDED ||
4435		    icmptype == ICMP6_PARAM_PROB)
4436			state_icmp++;
4437		break;
4438#endif /* INET6 */
4439	}
4440
4441	if (!state_icmp) {
4442
4443		/*
4444		 * ICMP query/reply message not related to a TCP/UDP packet.
4445		 * Search for an ICMP state.
4446		 */
4447		key.af = pd->af;
4448		key.proto = pd->proto;
4449		key.port[0] = key.port[1] = icmpid;
4450		if (direction == PF_IN)	{	/* wire side, straight */
4451			PF_ACPY(&key.addr[0], pd->src, key.af);
4452			PF_ACPY(&key.addr[1], pd->dst, key.af);
4453		} else {			/* stack side, reverse */
4454			PF_ACPY(&key.addr[1], pd->src, key.af);
4455			PF_ACPY(&key.addr[0], pd->dst, key.af);
4456		}
4457
4458		STATE_LOOKUP(kif, &key, direction, *state, pd);
4459
4460		(*state)->expire = time_uptime;
4461		(*state)->timeout = PFTM_ICMP_ERROR_REPLY;
4462
4463		/* translate source/destination address, if necessary */
4464		if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4465			struct pf_state_key *nk = (*state)->key[pd->didx];
4466
4467			switch (pd->af) {
4468#ifdef INET
4469			case AF_INET:
4470				if (PF_ANEQ(pd->src,
4471				    &nk->addr[pd->sidx], AF_INET))
4472					pf_change_a(&saddr->v4.s_addr,
4473					    pd->ip_sum,
4474					    nk->addr[pd->sidx].v4.s_addr, 0);
4475
4476				if (PF_ANEQ(pd->dst, &nk->addr[pd->didx],
4477				    AF_INET))
4478					pf_change_a(&daddr->v4.s_addr,
4479					    pd->ip_sum,
4480					    nk->addr[pd->didx].v4.s_addr, 0);
4481
4482				if (nk->port[0] !=
4483				    pd->hdr.icmp->icmp_id) {
4484					pd->hdr.icmp->icmp_cksum =
4485					    pf_cksum_fixup(
4486					    pd->hdr.icmp->icmp_cksum, icmpid,
4487					    nk->port[pd->sidx], 0);
4488					pd->hdr.icmp->icmp_id =
4489					    nk->port[pd->sidx];
4490				}
4491
4492				m_copyback(m, off, ICMP_MINLEN,
4493				    (caddr_t )pd->hdr.icmp);
4494				break;
4495#endif /* INET */
4496#ifdef INET6
4497			case AF_INET6:
4498				if (PF_ANEQ(pd->src,
4499				    &nk->addr[pd->sidx], AF_INET6))
4500					pf_change_a6(saddr,
4501					    &pd->hdr.icmp6->icmp6_cksum,
4502					    &nk->addr[pd->sidx], 0);
4503
4504				if (PF_ANEQ(pd->dst,
4505				    &nk->addr[pd->didx], AF_INET6))
4506					pf_change_a6(daddr,
4507					    &pd->hdr.icmp6->icmp6_cksum,
4508					    &nk->addr[pd->didx], 0);
4509
4510				m_copyback(m, off, sizeof(struct icmp6_hdr),
4511				    (caddr_t )pd->hdr.icmp6);
4512				break;
4513#endif /* INET6 */
4514			}
4515		}
4516		return (PF_PASS);
4517
4518	} else {
4519		/*
4520		 * ICMP error message in response to a TCP/UDP packet.
4521		 * Extract the inner TCP/UDP header and search for that state.
4522		 */
4523
4524		struct pf_pdesc	pd2;
4525		bzero(&pd2, sizeof pd2);
4526#ifdef INET
4527		struct ip	h2;
4528#endif /* INET */
4529#ifdef INET6
4530		struct ip6_hdr	h2_6;
4531		int		terminal = 0;
4532#endif /* INET6 */
4533		int		ipoff2 = 0;
4534		int		off2 = 0;
4535
4536		pd2.af = pd->af;
4537		/* Payload packet is from the opposite direction. */
4538		pd2.sidx = (direction == PF_IN) ? 1 : 0;
4539		pd2.didx = (direction == PF_IN) ? 0 : 1;
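		/*
		 * Added note: the ICMP error quotes a packet travelling the
		 * other way, so the inner source is indexed like the outer
		 * destination and vice versa; hence the swap above.
		 */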
4540		switch (pd->af) {
4541#ifdef INET
4542		case AF_INET:
4543			/* offset of h2 in mbuf chain */
4544			ipoff2 = off + ICMP_MINLEN;
4545
4546			if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2),
4547			    NULL, reason, pd2.af)) {
4548				DPFPRINTF(PF_DEBUG_MISC,
4549				    ("pf: ICMP error message too short "
4550				    "(ip)\n"));
4551				return (PF_DROP);
4552			}
4553			/*
4554			 * ICMP error messages don't refer to non-first
4555			 * fragments
4556			 */
4557			if (h2.ip_off & htons(IP_OFFMASK)) {
4558				REASON_SET(reason, PFRES_FRAG);
4559				return (PF_DROP);
4560			}
4561
4562			/* offset of protocol header that follows h2 */
4563			off2 = ipoff2 + (h2.ip_hl << 2);
4564
4565			pd2.proto = h2.ip_p;
4566			pd2.src = (struct pf_addr *)&h2.ip_src;
4567			pd2.dst = (struct pf_addr *)&h2.ip_dst;
4568			pd2.ip_sum = &h2.ip_sum;
4569			break;
4570#endif /* INET */
4571#ifdef INET6
4572		case AF_INET6:
4573			ipoff2 = off + sizeof(struct icmp6_hdr);
4574
4575			if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6),
4576			    NULL, reason, pd2.af)) {
4577				DPFPRINTF(PF_DEBUG_MISC,
4578				    ("pf: ICMP error message too short "
4579				    "(ip6)\n"));
4580				return (PF_DROP);
4581			}
4582			pd2.proto = h2_6.ip6_nxt;
4583			pd2.src = (struct pf_addr *)&h2_6.ip6_src;
4584			pd2.dst = (struct pf_addr *)&h2_6.ip6_dst;
4585			pd2.ip_sum = NULL;
4586			off2 = ipoff2 + sizeof(h2_6);
4587			do {
4588				switch (pd2.proto) {
4589				case IPPROTO_FRAGMENT:
4590					/*
4591					 * ICMPv6 error messages don't
4592					 * refer to non-first fragments
4593					 */
4594					REASON_SET(reason, PFRES_FRAG);
4595					return (PF_DROP);
4596				case IPPROTO_AH:
4597				case IPPROTO_HOPOPTS:
4598				case IPPROTO_ROUTING:
4599				case IPPROTO_DSTOPTS: {
4600					/* get next header and header length */
4601					struct ip6_ext opt6;
4602
4603					if (!pf_pull_hdr(m, off2, &opt6,
4604					    sizeof(opt6), NULL, reason,
4605					    pd2.af)) {
4606						DPFPRINTF(PF_DEBUG_MISC,
4607						    ("pf: ICMPv6 short opt\n"));
4608						return (PF_DROP);
4609					}
4610					if (pd2.proto == IPPROTO_AH)
4611						off2 += (opt6.ip6e_len + 2) * 4;
4612					else
4613						off2 += (opt6.ip6e_len + 1) * 8;
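					/*
					 * Added note: AH counts its
					 * length in 32-bit words minus 2
					 * (RFC 4302); the other options
					 * use 8-octet units minus 1
					 * (RFC 2460), hence the two
					 * formulas above.
					 */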
4614					pd2.proto = opt6.ip6e_nxt;
4615					/* go to the next header */
4616					break;
4617				}
4618				default:
4619					terminal++;
4620					break;
4621				}
4622			} while (!terminal);
4623			break;
4624#endif /* INET6 */
4625		}
4626
4627		switch (pd2.proto) {
4628		case IPPROTO_TCP: {
4629			struct tcphdr		 th;
4630			u_int32_t		 seq;
4631			struct pf_state_peer	*src, *dst;
4632			u_int8_t		 dws;
4633			int			 copyback = 0;
4634
4635			/*
4636			 * Only the first 8 bytes of the TCP header can be
4637			 * expected. Don't access any TCP header fields after
4638			 * th_seq; an ackskew test is not possible.
4639			 */
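			/*
			 * Added note: RFC 792 only guarantees the IP header
			 * plus the first 64 bits of the original datagram,
			 * which for TCP is exactly the ports and th_seq
			 * quoted here.
			 */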
4640			if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason,
4641			    pd2.af)) {
4642				DPFPRINTF(PF_DEBUG_MISC,
4643				    ("pf: ICMP error message too short "
4644				    "(tcp)\n"));
4645				return (PF_DROP);
4646			}
4647
4648			key.af = pd2.af;
4649			key.proto = IPPROTO_TCP;
4650			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4651			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4652			key.port[pd2.sidx] = th.th_sport;
4653			key.port[pd2.didx] = th.th_dport;
4654
4655			STATE_LOOKUP(kif, &key, direction, *state, pd);
4656
4657			if (direction == (*state)->direction) {
4658				src = &(*state)->dst;
4659				dst = &(*state)->src;
4660			} else {
4661				src = &(*state)->src;
4662				dst = &(*state)->dst;
4663			}
4664
4665			if (src->wscale && dst->wscale)
4666				dws = dst->wscale & PF_WSCALE_MASK;
4667			else
4668				dws = 0;
4669
4670			/* Demodulate sequence number */
4671			seq = ntohl(th.th_seq) - src->seqdiff;
4672			if (src->seqdiff) {
4673				pf_change_a(&th.th_seq, icmpsum,
4674				    htonl(seq), 0);
4675				copyback = 1;
4676			}
4677
4678			if (!((*state)->state_flags & PFSTATE_SLOPPY) &&
4679			    (!SEQ_GEQ(src->seqhi, seq) ||
4680			    !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) {
4681				if (V_pf_status.debug >= PF_DEBUG_MISC) {
4682					printf("pf: BAD ICMP %d:%d ",
4683					    icmptype, pd->hdr.icmp->icmp_code);
4684					pf_print_host(pd->src, 0, pd->af);
4685					printf(" -> ");
4686					pf_print_host(pd->dst, 0, pd->af);
4687					printf(" state: ");
4688					pf_print_state(*state);
4689					printf(" seq=%u\n", seq);
4690				}
4691				REASON_SET(reason, PFRES_BADSTATE);
4692				return (PF_DROP);
4693			} else {
4694				if (V_pf_status.debug >= PF_DEBUG_MISC) {
4695					printf("pf: OK ICMP %d:%d ",
4696					    icmptype, pd->hdr.icmp->icmp_code);
4697					pf_print_host(pd->src, 0, pd->af);
4698					printf(" -> ");
4699					pf_print_host(pd->dst, 0, pd->af);
4700					printf(" state: ");
4701					pf_print_state(*state);
4702					printf(" seq=%u\n", seq);
4703				}
4704			}
4705
4706			/* translate source/destination address, if necessary */
4707			if ((*state)->key[PF_SK_WIRE] !=
4708			    (*state)->key[PF_SK_STACK]) {
4709				struct pf_state_key *nk =
4710				    (*state)->key[pd->didx];
4711
4712				if (PF_ANEQ(pd2.src,
4713				    &nk->addr[pd2.sidx], pd2.af) ||
4714				    nk->port[pd2.sidx] != th.th_sport)
4715					pf_change_icmp(pd2.src, &th.th_sport,
4716					    daddr, &nk->addr[pd2.sidx],
4717					    nk->port[pd2.sidx], NULL,
4718					    pd2.ip_sum, icmpsum,
4719					    pd->ip_sum, 0, pd2.af);
4720
4721				if (PF_ANEQ(pd2.dst,
4722				    &nk->addr[pd2.didx], pd2.af) ||
4723				    nk->port[pd2.didx] != th.th_dport)
4724					pf_change_icmp(pd2.dst, &th.th_dport,
4725					    NULL, /* XXX Inbound NAT? */
4726					    &nk->addr[pd2.didx],
4727					    nk->port[pd2.didx], NULL,
4728					    pd2.ip_sum, icmpsum,
4729					    pd->ip_sum, 0, pd2.af);
4730				copyback = 1;
4731			}
4732
4733			if (copyback) {
4734				switch (pd2.af) {
4735#ifdef INET
4736				case AF_INET:
4737					m_copyback(m, off, ICMP_MINLEN,
4738					    (caddr_t )pd->hdr.icmp);
4739					m_copyback(m, ipoff2, sizeof(h2),
4740					    (caddr_t )&h2);
4741					break;
4742#endif /* INET */
4743#ifdef INET6
4744				case AF_INET6:
4745					m_copyback(m, off,
4746					    sizeof(struct icmp6_hdr),
4747					    (caddr_t )pd->hdr.icmp6);
4748					m_copyback(m, ipoff2, sizeof(h2_6),
4749					    (caddr_t )&h2_6);
4750					break;
4751#endif /* INET6 */
4752				}
4753				m_copyback(m, off2, 8, (caddr_t)&th);
4754			}
4755
4756			return (PF_PASS);
4757			break;
4758		}
4759		case IPPROTO_UDP: {
4760			struct udphdr		uh;
4761
4762			if (!pf_pull_hdr(m, off2, &uh, sizeof(uh),
4763			    NULL, reason, pd2.af)) {
4764				DPFPRINTF(PF_DEBUG_MISC,
4765				    ("pf: ICMP error message too short "
4766				    "(udp)\n"));
4767				return (PF_DROP);
4768			}
4769
4770			key.af = pd2.af;
4771			key.proto = IPPROTO_UDP;
4772			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4773			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4774			key.port[pd2.sidx] = uh.uh_sport;
4775			key.port[pd2.didx] = uh.uh_dport;
4776
4777			STATE_LOOKUP(kif, &key, direction, *state, pd);
4778
4779			/* translate source/destination address, if necessary */
4780			if ((*state)->key[PF_SK_WIRE] !=
4781			    (*state)->key[PF_SK_STACK]) {
4782				struct pf_state_key *nk =
4783				    (*state)->key[pd->didx];
4784
4785				if (PF_ANEQ(pd2.src,
4786				    &nk->addr[pd2.sidx], pd2.af) ||
4787				    nk->port[pd2.sidx] != uh.uh_sport)
4788					pf_change_icmp(pd2.src, &uh.uh_sport,
4789					    daddr, &nk->addr[pd2.sidx],
4790					    nk->port[pd2.sidx], &uh.uh_sum,
4791					    pd2.ip_sum, icmpsum,
4792					    pd->ip_sum, 1, pd2.af);
4793
4794				if (PF_ANEQ(pd2.dst,
4795				    &nk->addr[pd2.didx], pd2.af) ||
4796				    nk->port[pd2.didx] != uh.uh_dport)
4797					pf_change_icmp(pd2.dst, &uh.uh_dport,
4798					    NULL, /* XXX Inbound NAT? */
4799					    &nk->addr[pd2.didx],
4800					    nk->port[pd2.didx], &uh.uh_sum,
4801					    pd2.ip_sum, icmpsum,
4802					    pd->ip_sum, 1, pd2.af);
4803
4804				switch (pd2.af) {
4805#ifdef INET
4806				case AF_INET:
4807					m_copyback(m, off, ICMP_MINLEN,
4808					    (caddr_t )pd->hdr.icmp);
4809					m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
4810					break;
4811#endif /* INET */
4812#ifdef INET6
4813				case AF_INET6:
4814					m_copyback(m, off,
4815					    sizeof(struct icmp6_hdr),
4816					    (caddr_t )pd->hdr.icmp6);
4817					m_copyback(m, ipoff2, sizeof(h2_6),
4818					    (caddr_t )&h2_6);
4819					break;
4820#endif /* INET6 */
4821				}
4822				m_copyback(m, off2, sizeof(uh), (caddr_t)&uh);
4823			}
4824			return (PF_PASS);
4825			break;
4826		}
4827#ifdef INET
4828		case IPPROTO_ICMP: {
4829			struct icmp		iih;
4830
4831			if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN,
4832			    NULL, reason, pd2.af)) {
4833				DPFPRINTF(PF_DEBUG_MISC,
4834				    ("pf: ICMP error message too short "
4835				    "(icmp)\n"));
4836				return (PF_DROP);
4837			}
4838
4839			key.af = pd2.af;
4840			key.proto = IPPROTO_ICMP;
4841			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4842			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4843			key.port[0] = key.port[1] = iih.icmp_id;
4844
4845			STATE_LOOKUP(kif, &key, direction, *state, pd);
4846
4847			/* translate source/destination address, if necessary */
4848			if ((*state)->key[PF_SK_WIRE] !=
4849			    (*state)->key[PF_SK_STACK]) {
4850				struct pf_state_key *nk =
4851				    (*state)->key[pd->didx];
4852
4853				if (PF_ANEQ(pd2.src,
4854				    &nk->addr[pd2.sidx], pd2.af) ||
4855				    nk->port[pd2.sidx] != iih.icmp_id)
4856					pf_change_icmp(pd2.src, &iih.icmp_id,
4857					    daddr, &nk->addr[pd2.sidx],
4858					    nk->port[pd2.sidx], NULL,
4859					    pd2.ip_sum, icmpsum,
4860					    pd->ip_sum, 0, AF_INET);
4861
4862				if (PF_ANEQ(pd2.dst,
4863				    &nk->addr[pd2.didx], pd2.af) ||
4864				    nk->port[pd2.didx] != iih.icmp_id)
4865					pf_change_icmp(pd2.dst, &iih.icmp_id,
4866					    NULL, /* XXX Inbound NAT? */
4867					    &nk->addr[pd2.didx],
4868					    nk->port[pd2.didx], NULL,
4869					    pd2.ip_sum, icmpsum,
4870					    pd->ip_sum, 0, AF_INET);
4871
4872				m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
4873				m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
4874				m_copyback(m, off2, ICMP_MINLEN, (caddr_t)&iih);
4875			}
4876			return (PF_PASS);
4877			break;
4878		}
4879#endif /* INET */
4880#ifdef INET6
4881		case IPPROTO_ICMPV6: {
4882			struct icmp6_hdr	iih;
4883
4884			if (!pf_pull_hdr(m, off2, &iih,
4885			    sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) {
4886				DPFPRINTF(PF_DEBUG_MISC,
4887				    ("pf: ICMP error message too short "
4888				    "(icmp6)\n"));
4889				return (PF_DROP);
4890			}
4891
4892			key.af = pd2.af;
4893			key.proto = IPPROTO_ICMPV6;
4894			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4895			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4896			key.port[0] = key.port[1] = iih.icmp6_id;
4897
4898			STATE_LOOKUP(kif, &key, direction, *state, pd);
4899
4900			/* translate source/destination address, if necessary */
4901			if ((*state)->key[PF_SK_WIRE] !=
4902			    (*state)->key[PF_SK_STACK]) {
4903				struct pf_state_key *nk =
4904				    (*state)->key[pd->didx];
4905
4906				if (PF_ANEQ(pd2.src,
4907				    &nk->addr[pd2.sidx], pd2.af) ||
4908				    nk->port[pd2.sidx] != iih.icmp6_id)
4909					pf_change_icmp(pd2.src, &iih.icmp6_id,
4910					    daddr, &nk->addr[pd2.sidx],
4911					    nk->port[pd2.sidx], NULL,
4912					    pd2.ip_sum, icmpsum,
4913					    pd->ip_sum, 0, AF_INET6);
4914
4915				if (PF_ANEQ(pd2.dst,
4916				    &nk->addr[pd2.didx], pd2.af) ||
4917				    nk->port[pd2.didx] != iih.icmp6_id)
4918					pf_change_icmp(pd2.dst, &iih.icmp6_id,
4919					    NULL, /* XXX Inbound NAT? */
4920					    &nk->addr[pd2.didx],
4921					    nk->port[pd2.didx], NULL,
4922					    pd2.ip_sum, icmpsum,
4923					    pd->ip_sum, 0, AF_INET6);
4924
4925				m_copyback(m, off, sizeof(struct icmp6_hdr),
4926				    (caddr_t)pd->hdr.icmp6);
4927				m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6);
4928				m_copyback(m, off2, sizeof(struct icmp6_hdr),
4929				    (caddr_t)&iih);
4930			}
4931			return (PF_PASS);
4932			break;
4933		}
4934#endif /* INET6 */
4935		default: {
4936			key.af = pd2.af;
4937			key.proto = pd2.proto;
4938			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4939			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4940			key.port[0] = key.port[1] = 0;
4941
4942			STATE_LOOKUP(kif, &key, direction, *state, pd);
4943
4944			/* translate source/destination address, if necessary */
4945			if ((*state)->key[PF_SK_WIRE] !=
4946			    (*state)->key[PF_SK_STACK]) {
4947				struct pf_state_key *nk =
4948				    (*state)->key[pd->didx];
4949
4950				if (PF_ANEQ(pd2.src,
4951				    &nk->addr[pd2.sidx], pd2.af))
4952					pf_change_icmp(pd2.src, NULL, daddr,
4953					    &nk->addr[pd2.sidx], 0, NULL,
4954					    pd2.ip_sum, icmpsum,
4955					    pd->ip_sum, 0, pd2.af);
4956
4957				if (PF_ANEQ(pd2.dst,
4958				    &nk->addr[pd2.didx], pd2.af))
4959					pf_change_icmp(pd2.src, NULL,
4960					    NULL, /* XXX Inbound NAT? */
4961					    &nk->addr[pd2.didx], 0, NULL,
4962					    pd2.ip_sum, icmpsum,
4963					    pd->ip_sum, 0, pd2.af);
4964
4965				switch (pd2.af) {
4966#ifdef INET
4967				case AF_INET:
4968					m_copyback(m, off, ICMP_MINLEN,
4969					    (caddr_t)pd->hdr.icmp);
4970					m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
4971					break;
4972#endif /* INET */
4973#ifdef INET6
4974				case AF_INET6:
4975					m_copyback(m, off,
4976					    sizeof(struct icmp6_hdr),
4977					    (caddr_t )pd->hdr.icmp6);
4978					m_copyback(m, ipoff2, sizeof(h2_6),
4979					    (caddr_t )&h2_6);
4980					break;
4981#endif /* INET6 */
4982				}
4983			}
4984			return (PF_PASS);
4985			break;
4986		}
4987		}
4988	}
4989}
4990
4991static int
4992pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif,
4993    struct mbuf *m, struct pf_pdesc *pd)
4994{
4995	struct pf_state_peer	*src, *dst;
4996	struct pf_state_key_cmp	 key;
4997
4998	bzero(&key, sizeof(key));
4999	key.af = pd->af;
5000	key.proto = pd->proto;
5001	if (direction == PF_IN)	{
5002		PF_ACPY(&key.addr[0], pd->src, key.af);
5003		PF_ACPY(&key.addr[1], pd->dst, key.af);
5004		key.port[0] = key.port[1] = 0;
5005	} else {
5006		PF_ACPY(&key.addr[1], pd->src, key.af);
5007		PF_ACPY(&key.addr[0], pd->dst, key.af);
5008		key.port[1] = key.port[0] = 0;
5009	}
5010
5011	STATE_LOOKUP(kif, &key, direction, *state, pd);
5012
5013	if (direction == (*state)->direction) {
5014		src = &(*state)->src;
5015		dst = &(*state)->dst;
5016	} else {
5017		src = &(*state)->dst;
5018		dst = &(*state)->src;
5019	}
5020
5021	/* update states */
5022	if (src->state < PFOTHERS_SINGLE)
5023		src->state = PFOTHERS_SINGLE;
5024	if (dst->state == PFOTHERS_SINGLE)
5025		dst->state = PFOTHERS_MULTIPLE;
5026
5027	/* update expire time */
5028	(*state)->expire = time_uptime;
5029	if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE)
5030		(*state)->timeout = PFTM_OTHER_MULTIPLE;
5031	else
5032		(*state)->timeout = PFTM_OTHER_SINGLE;
5033
5034	/* translate source/destination address, if necessary */
5035	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
5036		struct pf_state_key *nk = (*state)->key[pd->didx];
5037
5038		KASSERT(nk, ("%s: nk is null", __func__));
5039		KASSERT(pd, ("%s: pd is null", __func__));
5040		KASSERT(pd->src, ("%s: pd->src is null", __func__));
5041		KASSERT(pd->dst, ("%s: pd->dst is null", __func__));
5042		switch (pd->af) {
5043#ifdef INET
5044		case AF_INET:
5045			if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET))
5046				pf_change_a(&pd->src->v4.s_addr,
5047				    pd->ip_sum,
5048				    nk->addr[pd->sidx].v4.s_addr,
5049				    0);
5050
5051
5052			if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET))
5053				pf_change_a(&pd->dst->v4.s_addr,
5054				    pd->ip_sum,
5055				    nk->addr[pd->didx].v4.s_addr,
5056				    0);
5057
5058			break;
5059#endif /* INET */
5060#ifdef INET6
5061		case AF_INET6:
5062			if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET6))
5063				PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af);
5064
5065			if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET6))
5066				PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af);
5067#endif /* INET6 */
5068		}
5069	}
5070	return (PF_PASS);
5071}
5072
5073/*
5074 * ipoff and off are measured from the start of the mbuf chain.
5075 * h must be at "ipoff" on the mbuf chain.
5076 */
5077void *
5078pf_pull_hdr(struct mbuf *m, int off, void *p, int len,
5079    u_short *actionp, u_short *reasonp, sa_family_t af)
5080{
5081	switch (af) {
5082#ifdef INET
5083	case AF_INET: {
5084		struct ip	*h = mtod(m, struct ip *);
5085		u_int16_t	 fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
5086
5087		if (fragoff) {
5088			if (fragoff >= len)
5089				ACTION_SET(actionp, PF_PASS);
5090			else {
5091				ACTION_SET(actionp, PF_DROP);
5092				REASON_SET(reasonp, PFRES_FRAG);
5093			}
5094			return (NULL);
5095		}
5096		if (m->m_pkthdr.len < off + len ||
5097		    ntohs(h->ip_len) < off + len) {
5098			ACTION_SET(actionp, PF_DROP);
5099			REASON_SET(reasonp, PFRES_SHORT);
5100			return (NULL);
5101		}
5102		break;
5103	}
5104#endif /* INET */
5105#ifdef INET6
5106	case AF_INET6: {
5107		struct ip6_hdr	*h = mtod(m, struct ip6_hdr *);
5108
5109		if (m->m_pkthdr.len < off + len ||
5110		    (ntohs(h->ip6_plen) + sizeof(struct ip6_hdr)) <
5111		    (unsigned)(off + len)) {
5112			ACTION_SET(actionp, PF_DROP);
5113			REASON_SET(reasonp, PFRES_SHORT);
5114			return (NULL);
5115		}
5116		break;
5117	}
5118#endif /* INET6 */
5119	}
5120	m_copydata(m, off, len, p);
5121	return (p);
5122}
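/*
 * Usage sketch (added, illustrative, not upstream code): callers supply
 * local storage and copy the header out of the (possibly fragmented) chain,
 * roughly:
 *
 *	struct tcphdr th;
 *
 *	if (!pf_pull_hdr(m, off, &th, sizeof(th), &action, &reason, AF_INET))
 *		goto done;	// action and reason were set on failure
 */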
5123
5124int
5125pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif,
5126    int rtableid)
5127{
5128#ifdef RADIX_MPATH
5129	struct radix_node_head	*rnh;
5130#endif
5131	struct sockaddr_in	*dst;
5132	int			 ret = 1;
5133	int			 check_mpath;
5134#ifdef INET6
5135	struct sockaddr_in6	*dst6;
5136	struct route_in6	 ro;
5137#else
5138	struct route		 ro;
5139#endif
5140	struct radix_node	*rn;
5141	struct rtentry		*rt;
5142	struct ifnet		*ifp;
5143
5144	check_mpath = 0;
5145#ifdef RADIX_MPATH
5146	/* XXX: stick to table 0 for now */
5147	rnh = rt_tables_get_rnh(0, af);
5148	if (rnh != NULL && rn_mpath_capable(rnh))
5149		check_mpath = 1;
5150#endif
5151	bzero(&ro, sizeof(ro));
5152	switch (af) {
5153	case AF_INET:
5154		dst = satosin(&ro.ro_dst);
5155		dst->sin_family = AF_INET;
5156		dst->sin_len = sizeof(*dst);
5157		dst->sin_addr = addr->v4;
5158		break;
5159#ifdef INET6
5160	case AF_INET6:
5161		/*
5162		 * Skip check for addresses with embedded interface scope,
5163		 * as they would always match anyway.
5164		 */
5165		if (IN6_IS_SCOPE_EMBED(&addr->v6))
5166			goto out;
5167		dst6 = (struct sockaddr_in6 *)&ro.ro_dst;
5168		dst6->sin6_family = AF_INET6;
5169		dst6->sin6_len = sizeof(*dst6);
5170		dst6->sin6_addr = addr->v6;
5171		break;
5172#endif /* INET6 */
5173	default:
5174		return (0);
5175	}
5176
5177	/* Skip checks for ipsec interfaces */
5178	if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC)
5179		goto out;
5180
5181	switch (af) {
5182#ifdef INET6
5183	case AF_INET6:
5184		in6_rtalloc_ign(&ro, 0, rtableid);
5185		break;
5186#endif
5187#ifdef INET
5188	case AF_INET:
5189		in_rtalloc_ign((struct route *)&ro, 0, rtableid);
5190		break;
5191#endif
5192	default:
5193		rtalloc_ign((struct route *)&ro, 0);	/* No/default FIB. */
5194		break;
5195	}
5196
5197	if (ro.ro_rt != NULL) {
5198		/* No interface given, this is a no-route check */
5199		if (kif == NULL)
5200			goto out;
5201
5202		if (kif->pfik_ifp == NULL) {
5203			ret = 0;
5204			goto out;
5205		}
5206
5207		/* Perform uRPF check if passed input interface */
5208		ret = 0;
5209		rn = (struct radix_node *)ro.ro_rt;
5210		do {
5211			rt = (struct rtentry *)rn;
5212			ifp = rt->rt_ifp;
5213
5214			if (kif->pfik_ifp == ifp)
5215				ret = 1;
5216#ifdef RADIX_MPATH
5217			rn = rn_mpath_next(rn);
5218#endif
5219		} while (check_mpath == 1 && rn != NULL && ret == 0);
5220	} else
5221		ret = 0;
5222out:
5223	if (ro.ro_rt != NULL)
5224		RTFREE(ro.ro_rt);
5225	return (ret);
5226}
5227
5228#ifdef INET
5229static void
5230pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
5231    struct pf_state *s, struct pf_pdesc *pd)
5232{
5233	struct mbuf		*m0, *m1;
5234	struct sockaddr_in	dst;
5235	struct ip		*ip;
5236	struct ifnet		*ifp = NULL;
5237	struct pf_addr		 naddr;
5238	struct pf_src_node	*sn = NULL;
5239	int			 error = 0;
5240	uint16_t		 ip_len, ip_off;
5241
5242	KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
5243	KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction",
5244	    __func__));
5245
5246	if ((pd->pf_mtag == NULL &&
5247	    ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
5248	    pd->pf_mtag->routed++ > 3) {
5249		m0 = *m;
5250		*m = NULL;
5251		goto bad_locked;
5252	}
5253
5254	if (r->rt == PF_DUPTO) {
5255		if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
5256			if (s)
5257				PF_STATE_UNLOCK(s);
5258			return;
5259		}
5260	} else {
5261		if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
5262			if (s)
5263				PF_STATE_UNLOCK(s);
5264			return;
5265		}
5266		m0 = *m;
5267	}
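	/*
	 * Added note: the test above makes route-to act on packets
	 * travelling in the rule's own direction and reply-to on packets
	 * travelling against it; everything else stays on its normal path.
	 */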
5268
5269	ip = mtod(m0, struct ip *);
5270
5271	bzero(&dst, sizeof(dst));
5272	dst.sin_family = AF_INET;
5273	dst.sin_len = sizeof(dst);
5274	dst.sin_addr = ip->ip_dst;
5275
5276	if (r->rt == PF_FASTROUTE) {
5277		struct rtentry *rt;
5278
5279		if (s)
5280			PF_STATE_UNLOCK(s);
5281		rt = rtalloc1_fib(sintosa(&dst), 0, 0, M_GETFIB(m0));
5282		if (rt == NULL) {
5283			KMOD_IPSTAT_INC(ips_noroute);
5284			error = EHOSTUNREACH;
5285			goto bad;
5286		}
5287
5288		ifp = rt->rt_ifp;
5289		counter_u64_add(rt->rt_pksent, 1);
5290
5291		if (rt->rt_flags & RTF_GATEWAY)
5292			bcopy(satosin(rt->rt_gateway), &dst, sizeof(dst));
5293		RTFREE_LOCKED(rt);
5294	} else {
5295		if (TAILQ_EMPTY(&r->rpool.list)) {
5296			DPFPRINTF(PF_DEBUG_URGENT,
5297			    ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
5298			goto bad_locked;
5299		}
5300		if (s == NULL) {
5301			pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src,
5302			    &naddr, NULL, &sn);
5303			if (!PF_AZERO(&naddr, AF_INET))
5304				dst.sin_addr.s_addr = naddr.v4.s_addr;
5305			ifp = r->rpool.cur->kif ?
5306			    r->rpool.cur->kif->pfik_ifp : NULL;
5307		} else {
5308			if (!PF_AZERO(&s->rt_addr, AF_INET))
5309				dst.sin_addr.s_addr =
5310				    s->rt_addr.v4.s_addr;
5311			ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
5312			PF_STATE_UNLOCK(s);
5313		}
5314	}
5315	if (ifp == NULL)
5316		goto bad;
5317
5318	if (oifp != ifp) {
5319		if (pf_test(PF_OUT, ifp, &m0, NULL) != PF_PASS)
5320			goto bad;
5321		else if (m0 == NULL)
5322			goto done;
5323		if (m0->m_len < sizeof(struct ip)) {
5324			DPFPRINTF(PF_DEBUG_URGENT,
5325			    ("%s: m0->m_len < sizeof(struct ip)\n", __func__));
5326			goto bad;
5327		}
5328		ip = mtod(m0, struct ip *);
5329	}
5330
5331	if (ifp->if_flags & IFF_LOOPBACK)
5332		m0->m_flags |= M_SKIP_FIREWALL;
5333
5334	ip_len = ntohs(ip->ip_len);
5335	ip_off = ntohs(ip->ip_off);
5336
5337	/* Copied from FreeBSD 10.0-CURRENT ip_output. */
5338	m0->m_pkthdr.csum_flags |= CSUM_IP;
5339	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
5340		in_delayed_cksum(m0);
5341		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
5342	}
5343#ifdef SCTP
5344	if (m0->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
5345		sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
5346		m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
5347	}
5348#endif
5349
5350	/*
5351	 * If small enough for interface, or the interface will take
5352	 * care of the fragmentation for us, we can just send directly.
5353	 */
5354	if (ip_len <= ifp->if_mtu ||
5355	    (m0->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 ||
5356	    ((ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) {
5357		ip->ip_sum = 0;
5358		if (m0->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
5359			ip->ip_sum = in_cksum(m0, ip->ip_hl << 2);
5360			m0->m_pkthdr.csum_flags &= ~CSUM_IP;
5361		}
5362		m_clrprotoflags(m0);	/* Avoid confusing lower layers. */
5363		error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL);
5364		goto done;
5365	}
5366
5367	/* Balk when DF bit is set or the interface doesn't support TSO. */
5368	if ((ip_off & IP_DF) || (m0->m_pkthdr.csum_flags & CSUM_TSO)) {
5369		error = EMSGSIZE;
5370		KMOD_IPSTAT_INC(ips_cantfrag);
5371		if (r->rt != PF_DUPTO) {
5372			icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0,
5373			    ifp->if_mtu);
5374			goto done;
5375		} else
5376			goto bad;
5377	}
5378
5379	error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist);
5380	if (error)
5381		goto bad;
5382
5383	for (; m0; m0 = m1) {
5384		m1 = m0->m_nextpkt;
5385		m0->m_nextpkt = NULL;
5386		if (error == 0) {
5387			m_clrprotoflags(m0);
5388			error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL);
5389		} else
5390			m_freem(m0);
5391	}
5392
5393	if (error == 0)
5394		KMOD_IPSTAT_INC(ips_fragmented);
5395
5396done:
5397	if (r->rt != PF_DUPTO)
5398		*m = NULL;
5399	return;
5400
5401bad_locked:
5402	if (s)
5403		PF_STATE_UNLOCK(s);
5404bad:
5405	m_freem(m0);
5406	goto done;
5407}
5408#endif /* INET */
5409
5410#ifdef INET6
5411static void
5412pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
5413    struct pf_state *s, struct pf_pdesc *pd)
5414{
5415	struct mbuf		*m0;
5416	struct sockaddr_in6	dst;
5417	struct ip6_hdr		*ip6;
5418	struct ifnet		*ifp = NULL;
5419	struct pf_addr		 naddr;
5420	struct pf_src_node	*sn = NULL;
5421
5422	KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
5423	KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction",
5424	    __func__));
5425
5426	if ((pd->pf_mtag == NULL &&
5427	    ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
5428	    pd->pf_mtag->routed++ > 3) {
5429		m0 = *m;
5430		*m = NULL;
5431		goto bad_locked;
5432	}
5433
5434	if (r->rt == PF_DUPTO) {
5435		if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
5436			if (s)
5437				PF_STATE_UNLOCK(s);
5438			return;
5439		}
5440	} else {
5441		if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
5442			if (s)
5443				PF_STATE_UNLOCK(s);
5444			return;
5445		}
5446		m0 = *m;
5447	}
5448
5449	ip6 = mtod(m0, struct ip6_hdr *);
5450
5451	bzero(&dst, sizeof(dst));
5452	dst.sin6_family = AF_INET6;
5453	dst.sin6_len = sizeof(dst);
5454	dst.sin6_addr = ip6->ip6_dst;
5455
5456	/* Cheat. XXX why only in the v6 case??? */
5457	if (r->rt == PF_FASTROUTE) {
5458		if (s)
5459			PF_STATE_UNLOCK(s);
5460		m0->m_flags |= M_SKIP_FIREWALL;
5461		ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL);
5462		return;
5463	}
5464
5465	if (TAILQ_EMPTY(&r->rpool.list)) {
5466		DPFPRINTF(PF_DEBUG_URGENT,
5467		    ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
5468		goto bad_locked;
5469	}
5470	if (s == NULL) {
5471		pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src,
5472		    &naddr, NULL, &sn);
5473		if (!PF_AZERO(&naddr, AF_INET6))
5474			PF_ACPY((struct pf_addr *)&dst.sin6_addr,
5475			    &naddr, AF_INET6);
5476		ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL;
5477	} else {
5478		if (!PF_AZERO(&s->rt_addr, AF_INET6))
5479			PF_ACPY((struct pf_addr *)&dst.sin6_addr,
5480			    &s->rt_addr, AF_INET6);
5481		ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
5482	}
5483
5484	if (s)
5485		PF_STATE_UNLOCK(s);
5486
5487	if (ifp == NULL)
5488		goto bad;
5489
5490	if (oifp != ifp) {
5491		if (pf_test6(PF_OUT, ifp, &m0, NULL) != PF_PASS)
5492			goto bad;
5493		else if (m0 == NULL)
5494			goto done;
5495		if (m0->m_len < sizeof(struct ip6_hdr)) {
5496			DPFPRINTF(PF_DEBUG_URGENT,
5497			    ("%s: m0->m_len < sizeof(struct ip6_hdr)\n",
5498			    __func__));
5499			goto bad;
5500		}
5501		ip6 = mtod(m0, struct ip6_hdr *);
5502	}
5503
5504	if (ifp->if_flags & IFF_LOOPBACK)
5505		m0->m_flags |= M_SKIP_FIREWALL;
5506
5507	/*
5508	 * If the packet is too large for the outgoing interface,
5509	 * send back an icmp6 error.
5510	 */
5511	if (IN6_IS_SCOPE_EMBED(&dst.sin6_addr))
5512		dst.sin6_addr.s6_addr16[1] = htons(ifp->if_index);
5513	if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu)
5514		nd6_output(ifp, ifp, m0, &dst, NULL);
5515	else {
5516		in6_ifstat_inc(ifp, ifs6_in_toobig);
5517		if (r->rt != PF_DUPTO)
5518			icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
5519		else
5520			goto bad;
5521	}
5522
5523done:
5524	if (r->rt != PF_DUPTO)
5525		*m = NULL;
5526	return;
5527
5528bad_locked:
5529	if (s)
5530		PF_STATE_UNLOCK(s);
5531bad:
5532	m_freem(m0);
5533	goto done;
5534}
5535#endif /* INET6 */
5536
5537/*
5538 * FreeBSD supports cksum offloads for the following drivers.
5539 *  em(4), fxp(4), ixgb(4), lge(4), ndis(4), nge(4), re(4),
5540 *   ti(4), txp(4), xl(4)
5541 *
5542 * CSUM_DATA_VALID | CSUM_PSEUDO_HDR :
5543 *  network driver performed cksum including pseudo header; need to verify
5544 *  csum_data
5545 * CSUM_DATA_VALID :
5546 *  network driver performed cksum, needs additional pseudo header
5547 *  cksum computation with partial csum_data (i.e. lack of H/W support
5548 *  for the pseudo header, for instance hme(4), sk(4) and possibly gem(4))
5549 *
5550 * After validating the cksum of the packet, set both CSUM_DATA_VALID and
5551 * CSUM_PSEUDO_HDR in order to avoid recomputation of the cksum in the
5552 * upper TCP/UDP layer.
5553 * Also, set csum_data to 0xffff to force cksum validation.
5554 */
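/*
 * Added note (illustrative): in the partial-assist case below, csum_data
 * holds the one's-complement sum of the payload only; in_pseudo() folds in
 * the addresses, length and protocol, and the final "sum ^= 0xffff"
 * complements the result so that a correct packet yields sum == 0.
 */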
5555static int
5556pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, sa_family_t af)
5557{
5558	u_int16_t sum = 0;
5559	int hw_assist = 0;
5560	struct ip *ip;
5561
5562	if (off < sizeof(struct ip) || len < sizeof(struct udphdr))
5563		return (1);
5564	if (m->m_pkthdr.len < off + len)
5565		return (1);
5566
5567	switch (p) {
5568	case IPPROTO_TCP:
5569		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
5570			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
5571				sum = m->m_pkthdr.csum_data;
5572			} else {
5573				ip = mtod(m, struct ip *);
5574				sum = in_pseudo(ip->ip_src.s_addr,
5575				    ip->ip_dst.s_addr, htonl((u_short)len +
5576				    m->m_pkthdr.csum_data + IPPROTO_TCP));
5577			}
5578			sum ^= 0xffff;
5579			++hw_assist;
5580		}
5581		break;
5582	case IPPROTO_UDP:
5583		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
5584			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
5585				sum = m->m_pkthdr.csum_data;
5586			} else {
5587				ip = mtod(m, struct ip *);
5588				sum = in_pseudo(ip->ip_src.s_addr,
5589				    ip->ip_dst.s_addr, htonl((u_short)len +
5590				    m->m_pkthdr.csum_data + IPPROTO_UDP));
5591			}
5592			sum ^= 0xffff;
5593			++hw_assist;
5594		}
5595		break;
5596	case IPPROTO_ICMP:
5597#ifdef INET6
5598	case IPPROTO_ICMPV6:
5599#endif /* INET6 */
5600		break;
5601	default:
5602		return (1);
5603	}
5604
5605	if (!hw_assist) {
5606		switch (af) {
5607		case AF_INET:
5608			if (p == IPPROTO_ICMP) {
5609				if (m->m_len < off)
5610					return (1);
5611				m->m_data += off;
5612				m->m_len -= off;
5613				sum = in_cksum(m, len);
5614				m->m_data -= off;
5615				m->m_len += off;
5616			} else {
5617				if (m->m_len < sizeof(struct ip))
5618					return (1);
5619				sum = in4_cksum(m, p, off, len);
5620			}
5621			break;
5622#ifdef INET6
5623		case AF_INET6:
5624			if (m->m_len < sizeof(struct ip6_hdr))
5625				return (1);
5626			sum = in6_cksum(m, p, off, len);
5627			break;
5628#endif /* INET6 */
5629		default:
5630			return (1);
5631		}
5632	}
5633	if (sum) {
5634		switch (p) {
5635		case IPPROTO_TCP:
5636		    {
5637			KMOD_TCPSTAT_INC(tcps_rcvbadsum);
5638			break;
5639		    }
5640		case IPPROTO_UDP:
5641		    {
5642			KMOD_UDPSTAT_INC(udps_badsum);
5643			break;
5644		    }
5645#ifdef INET
5646		case IPPROTO_ICMP:
5647		    {
5648			KMOD_ICMPSTAT_INC(icps_checksum);
5649			break;
5650		    }
5651#endif
5652#ifdef INET6
5653		case IPPROTO_ICMPV6:
5654		    {
5655			KMOD_ICMP6STAT_INC(icp6s_checksum);
5656			break;
5657		    }
5658#endif /* INET6 */
5659		}
5660		return (1);
5661	} else {
5662		if (p == IPPROTO_TCP || p == IPPROTO_UDP) {
5663			m->m_pkthdr.csum_flags |=
5664			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
5665			m->m_pkthdr.csum_data = 0xffff;
5666		}
5667	}
5668	return (0);
5669}
5670
5671
5672#ifdef INET
5673int
5674pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp)
5675{
5676	struct pfi_kif		*kif;
5677	u_short			 action, reason = 0, log = 0;
5678	struct mbuf		*m = *m0;
5679	struct ip		*h = NULL;
5680	struct m_tag		*ipfwtag;
5681	struct pf_rule		*a = NULL, *r = &V_pf_default_rule, *tr, *nr;
5682	struct pf_state		*s = NULL;
5683	struct pf_ruleset	*ruleset = NULL;
5684	struct pf_pdesc		 pd;
5685	int			 off, dirndx, pqid = 0;
5686
5687	M_ASSERTPKTHDR(m);
5688
5689	if (!V_pf_status.running)
5690		return (PF_PASS);
5691
5692	memset(&pd, 0, sizeof(pd));
5693
5694	kif = (struct pfi_kif *)ifp->if_pf_kif;
5695
5696	if (kif == NULL) {
5697		DPFPRINTF(PF_DEBUG_URGENT,
5698		    ("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname));
5699		return (PF_DROP);
5700	}
5701	if (kif->pfik_flags & PFI_IFLAG_SKIP)
5702		return (PF_PASS);
5703
5704	if (m->m_flags & M_SKIP_FIREWALL)
5705		return (PF_PASS);
5706
5707	pd.pf_mtag = pf_find_mtag(m);
5708
5709	PF_RULES_RLOCK();
5710
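	/*
	 * If ipfw's divert(4) re-injected this packet, it carries an
	 * MTAG_IPFW_RULE tag; note the loop with PF_PACKET_LOOPED and skip
	 * normalization, which already ran on the packet's first pass.
	 */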
5711	if (ip_divert_ptr != NULL &&
5712	    ((ipfwtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL)) != NULL)) {
5713		struct ipfw_rule_ref *rr = (struct ipfw_rule_ref *)(ipfwtag+1);
		if ((rr->info & IPFW_IS_DIVERT) && rr->rulenum == 0) {
5715			if (pd.pf_mtag == NULL &&
5716			    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
5717				action = PF_DROP;
5718				goto done;
5719			}
5720			pd.pf_mtag->flags |= PF_PACKET_LOOPED;
5721			m_tag_delete(m, ipfwtag);
5722		}
5723		if (pd.pf_mtag && pd.pf_mtag->flags & PF_FASTFWD_OURS_PRESENT) {
5724			m->m_flags |= M_FASTFWD_OURS;
5725			pd.pf_mtag->flags &= ~PF_FASTFWD_OURS_PRESENT;
5726		}
5727	} else if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) {
5728		/* We do IP header normalization and packet reassembly here */
5729		action = PF_DROP;
5730		goto done;
5731	}
5732	m = *m0;	/* pf_normalize messes with m0 */
5733	h = mtod(m, struct ip *);
5734
5735	off = h->ip_hl << 2;
5736	if (off < (int)sizeof(struct ip)) {
5737		action = PF_DROP;
5738		REASON_SET(&reason, PFRES_SHORT);
5739		log = 1;
5740		goto done;
5741	}
5742
5743	pd.src = (struct pf_addr *)&h->ip_src;
5744	pd.dst = (struct pf_addr *)&h->ip_dst;
5745	pd.sport = pd.dport = NULL;
5746	pd.ip_sum = &h->ip_sum;
5747	pd.proto_sum = NULL;
5748	pd.proto = h->ip_p;
5749	pd.dir = dir;
5750	pd.sidx = (dir == PF_IN) ? 0 : 1;
5751	pd.didx = (dir == PF_IN) ? 1 : 0;
5752	pd.af = AF_INET;
5753	pd.tos = h->ip_tos;
5754	pd.tot_len = ntohs(h->ip_len);
5755
5756	/* handle fragments that didn't get reassembled by normalization */
5757	if (h->ip_off & htons(IP_MF | IP_OFFMASK)) {
5758		action = pf_test_fragment(&r, dir, kif, m, h,
5759		    &pd, &a, &ruleset);
5760		goto done;
5761	}
5762
5763	switch (h->ip_p) {
5764
5765	case IPPROTO_TCP: {
5766		struct tcphdr	th;
5767
5768		pd.hdr.tcp = &th;
5769		if (!pf_pull_hdr(m, off, &th, sizeof(th),
5770		    &action, &reason, AF_INET)) {
5771			log = action != PF_PASS;
5772			goto done;
5773		}
5774		pd.p_len = pd.tot_len - off - (th.th_off << 2);
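		/*
		 * Pure ACKs (no payload) are flagged so that, under ALTQ,
		 * they can be assigned to the rule's priority queue below.
		 */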
5775		if ((th.th_flags & TH_ACK) && pd.p_len == 0)
5776			pqid = 1;
5777		action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
5778		if (action == PF_DROP)
5779			goto done;
5780		action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
5781		    &reason);
5782		if (action == PF_PASS) {
5783			if (pfsync_update_state_ptr != NULL)
5784				pfsync_update_state_ptr(s);
5785			r = s->rule.ptr;
5786			a = s->anchor.ptr;
5787			log = s->log;
5788		} else if (s == NULL)
5789			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
5790			    &a, &ruleset, inp);
5791		break;
5792	}
5793
5794	case IPPROTO_UDP: {
5795		struct udphdr	uh;
5796
5797		pd.hdr.udp = &uh;
5798		if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
5799		    &action, &reason, AF_INET)) {
5800			log = action != PF_PASS;
5801			goto done;
5802		}
5803		if (uh.uh_dport == 0 ||
5804		    ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
5805		    ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
5806			action = PF_DROP;
5807			REASON_SET(&reason, PFRES_SHORT);
5808			goto done;
5809		}
5810		action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
5811		if (action == PF_PASS) {
5812			if (pfsync_update_state_ptr != NULL)
5813				pfsync_update_state_ptr(s);
5814			r = s->rule.ptr;
5815			a = s->anchor.ptr;
5816			log = s->log;
5817		} else if (s == NULL)
5818			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
5819			    &a, &ruleset, inp);
5820		break;
5821	}
5822
5823	case IPPROTO_ICMP: {
5824		struct icmp	ih;
5825
5826		pd.hdr.icmp = &ih;
5827		if (!pf_pull_hdr(m, off, &ih, ICMP_MINLEN,
5828		    &action, &reason, AF_INET)) {
5829			log = action != PF_PASS;
5830			goto done;
5831		}
5832		action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd,
5833		    &reason);
5834		if (action == PF_PASS) {
5835			if (pfsync_update_state_ptr != NULL)
5836				pfsync_update_state_ptr(s);
5837			r = s->rule.ptr;
5838			a = s->anchor.ptr;
5839			log = s->log;
5840		} else if (s == NULL)
5841			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
5842			    &a, &ruleset, inp);
5843		break;
5844	}
5845
5846#ifdef INET6
5847	case IPPROTO_ICMPV6: {
5848		action = PF_DROP;
5849		DPFPRINTF(PF_DEBUG_MISC,
5850		    ("pf: dropping IPv4 packet with ICMPv6 payload\n"));
5851		goto done;
5852	}
5853#endif
5854
5855	default:
5856		action = pf_test_state_other(&s, dir, kif, m, &pd);
5857		if (action == PF_PASS) {
5858			if (pfsync_update_state_ptr != NULL)
5859				pfsync_update_state_ptr(s);
5860			r = s->rule.ptr;
5861			a = s->anchor.ptr;
5862			log = s->log;
5863		} else if (s == NULL)
5864			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
5865			    &a, &ruleset, inp);
5866		break;
5867	}
5868
5869done:
5870	PF_RULES_RUNLOCK();
5871	if (action == PF_PASS && h->ip_hl > 5 &&
5872	    !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
5873		action = PF_DROP;
5874		REASON_SET(&reason, PFRES_IPOPTIONS);
5875		log = 1;
5876		DPFPRINTF(PF_DEBUG_MISC,
5877		    ("pf: dropping packet with ip options\n"));
5878	}
5879
5880	if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) {
5881		action = PF_DROP;
5882		REASON_SET(&reason, PFRES_MEMORY);
5883	}
5884	if (r->rtableid >= 0)
5885		M_SETFIB(m, r->rtableid);
5886
5887#ifdef ALTQ
5888	if (action == PF_PASS && r->qid) {
		if (pd.pf_mtag == NULL &&
		    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
			action = PF_DROP;
			REASON_SET(&reason, PFRES_MEMORY);
		} else {
			if (pqid || (pd.tos & IPTOS_LOWDELAY))
				pd.pf_mtag->qid = r->pqid;
			else
				pd.pf_mtag->qid = r->qid;
			/* add hints for ecn */
			pd.pf_mtag->hdr = h;
		}
	}
5902#endif /* ALTQ */
5903
5904	/*
5905	 * connections redirected to loopback should not match sockets
5906	 * bound specifically to loopback due to security implications,
5907	 * see tcp_input() and in_pcblookup_listen().
5908	 */
5909	if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
5910	    pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
5911	    (s->nat_rule.ptr->action == PF_RDR ||
5912	    s->nat_rule.ptr->action == PF_BINAT) &&
5913	    (ntohl(pd.dst->v4.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
5914		m->m_flags |= M_SKIP_FIREWALL;
5915
5916	if (action == PF_PASS && r->divert.port && ip_divert_ptr != NULL &&
5917	    !PACKET_LOOPED(&pd)) {
5918
5919		ipfwtag = m_tag_alloc(MTAG_IPFW_RULE, 0,
5920		    sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO);
5921		if (ipfwtag != NULL) {
5922			((struct ipfw_rule_ref *)(ipfwtag+1))->info =
5923			    ntohs(r->divert.port);
5924			((struct ipfw_rule_ref *)(ipfwtag+1))->rulenum = dir;
5925
5926			if (s)
5927				PF_STATE_UNLOCK(s);
5928
5929			m_tag_prepend(m, ipfwtag);
			if (m->m_flags & M_FASTFWD_OURS) {
				if (pd.pf_mtag == NULL &&
				    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
					action = PF_DROP;
					REASON_SET(&reason, PFRES_MEMORY);
					log = 1;
					DPFPRINTF(PF_DEBUG_MISC,
					    ("pf: failed to allocate tag\n"));
				} else {
					pd.pf_mtag->flags |=
					    PF_FASTFWD_OURS_PRESENT;
					m->m_flags &= ~M_FASTFWD_OURS;
				}
			}
			ip_divert_ptr(*m0, dir == PF_IN ? DIR_IN : DIR_OUT);
5943			*m0 = NULL;
5944
5945			return (action);
5946		} else {
5947			/* XXX: ipfw has the same behaviour! */
5948			action = PF_DROP;
5949			REASON_SET(&reason, PFRES_MEMORY);
5950			log = 1;
5951			DPFPRINTF(PF_DEBUG_MISC,
5952			    ("pf: failed to allocate divert tag\n"));
5953		}
5954	}
5955
5956	if (log) {
5957		struct pf_rule *lr;
5958
5959		if (s != NULL && s->nat_rule.ptr != NULL &&
5960		    s->nat_rule.ptr->log & PF_LOG_ALL)
5961			lr = s->nat_rule.ptr;
5962		else
5963			lr = r;
5964		PFLOG_PACKET(kif, m, AF_INET, dir, reason, lr, a, ruleset, &pd,
5965		    (s == NULL));
5966	}
5967
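	/*
	 * Interface counters are indexed [af][direction][dropped]:
	 * 0 here selects IPv4 (pf_test6() uses 1), then dir == PF_OUT,
	 * then action != PF_PASS.
	 */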
5968	kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
5969	kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS]++;
5970
5971	if (action == PF_PASS || r->action == PF_DROP) {
5972		dirndx = (dir == PF_OUT);
5973		r->packets[dirndx]++;
5974		r->bytes[dirndx] += pd.tot_len;
5975		if (a != NULL) {
5976			a->packets[dirndx]++;
5977			a->bytes[dirndx] += pd.tot_len;
5978		}
5979		if (s != NULL) {
5980			if (s->nat_rule.ptr != NULL) {
5981				s->nat_rule.ptr->packets[dirndx]++;
5982				s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
5983			}
5984			if (s->src_node != NULL) {
5985				s->src_node->packets[dirndx]++;
5986				s->src_node->bytes[dirndx] += pd.tot_len;
5987			}
5988			if (s->nat_src_node != NULL) {
5989				s->nat_src_node->packets[dirndx]++;
5990				s->nat_src_node->bytes[dirndx] += pd.tot_len;
5991			}
5992			dirndx = (dir == s->direction) ? 0 : 1;
5993			s->packets[dirndx]++;
5994			s->bytes[dirndx] += pd.tot_len;
5995		}
5996		tr = r;
5997		nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
5998		if (nr != NULL && r == &V_pf_default_rule)
5999			tr = nr;
6000		if (tr->src.addr.type == PF_ADDR_TABLE)
6001			pfr_update_stats(tr->src.addr.p.tbl,
6002			    (s == NULL) ? pd.src :
6003			    &s->key[(s->direction == PF_IN)]->
6004				addr[(s->direction == PF_OUT)],
6005			    pd.af, pd.tot_len, dir == PF_OUT,
6006			    r->action == PF_PASS, tr->src.neg);
6007		if (tr->dst.addr.type == PF_ADDR_TABLE)
6008			pfr_update_stats(tr->dst.addr.p.tbl,
6009			    (s == NULL) ? pd.dst :
6010			    &s->key[(s->direction == PF_IN)]->
6011				addr[(s->direction == PF_IN)],
6012			    pd.af, pd.tot_len, dir == PF_OUT,
6013			    r->action == PF_PASS, tr->dst.neg);
6014	}
6015
6016	switch (action) {
6017	case PF_SYNPROXY_DROP:
6018		m_freem(*m0);
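		/* FALLTHROUGH */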
6019	case PF_DEFER:
6020		*m0 = NULL;
6021		action = PF_PASS;
6022		break;
6023	default:
6024		/* pf_route() returns unlocked. */
6025		if (r->rt) {
6026			pf_route(m0, r, dir, kif->pfik_ifp, s, &pd);
6027			return (action);
6028		}
6029		break;
6030	}
6031	if (s)
6032		PF_STATE_UNLOCK(s);
6033
6034	return (action);
6035}
6036#endif /* INET */
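
/*
 * Usage sketch: pf is attached to the stack through pfil(9) hooks set up
 * elsewhere; a caller invokes these entry points roughly as follows (the
 * surrounding names are illustrative, not taken from this file):
 *
 *	int chk = pf_test(PF_IN, ifp, &m, NULL);
 *
 *	if (chk != PF_PASS || m == NULL)
 *		return;		(packet refused or already consumed)
 *
 * Callers must reload their mbuf pointer afterwards: normalization,
 * divert(4) and route-to processing may replace or swallow *m0.
 */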
6037
6038#ifdef INET6
6039int
6040pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp)
6041{
6042	struct pfi_kif		*kif;
6043	u_short			 action, reason = 0, log = 0;
6044	struct mbuf		*m = *m0, *n = NULL;
6045	struct ip6_hdr		*h = NULL;
6046	struct pf_rule		*a = NULL, *r = &V_pf_default_rule, *tr, *nr;
6047	struct pf_state		*s = NULL;
6048	struct pf_ruleset	*ruleset = NULL;
6049	struct pf_pdesc		 pd;
6050	int			 off, terminal = 0, dirndx, rh_cnt = 0;
6051
6052	M_ASSERTPKTHDR(m);
6053
6054	if (!V_pf_status.running)
6055		return (PF_PASS);
6056
6057	memset(&pd, 0, sizeof(pd));
6058	pd.pf_mtag = pf_find_mtag(m);
6059
6060	if (pd.pf_mtag && pd.pf_mtag->flags & PF_TAG_GENERATED)
6061		return (PF_PASS);
6062
6063	kif = (struct pfi_kif *)ifp->if_pf_kif;
6064	if (kif == NULL) {
6065		DPFPRINTF(PF_DEBUG_URGENT,
6066		    ("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname));
6067		return (PF_DROP);
6068	}
6069	if (kif->pfik_flags & PFI_IFLAG_SKIP)
6070		return (PF_PASS);
6071
6072	PF_RULES_RLOCK();
6073
6074	/* We do IP header normalization and packet reassembly here */
6075	if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) {
6076		action = PF_DROP;
6077		goto done;
6078	}
6079	m = *m0;	/* pf_normalize messes with m0 */
6080	h = mtod(m, struct ip6_hdr *);
6081
#if 1
	/*
	 * We do not support jumbograms yet.  If we kept going, a zero
	 * ip6_plen would do something bad, so drop the packet for now.
	 */
	if (h->ip6_plen == 0) {
		action = PF_DROP;
		REASON_SET(&reason, PFRES_NORM);	/*XXX*/
		goto done;
	}
#endif
6093
6094	pd.src = (struct pf_addr *)&h->ip6_src;
6095	pd.dst = (struct pf_addr *)&h->ip6_dst;
6096	pd.sport = pd.dport = NULL;
6097	pd.ip_sum = NULL;
6098	pd.proto_sum = NULL;
6099	pd.dir = dir;
6100	pd.sidx = (dir == PF_IN) ? 0 : 1;
6101	pd.didx = (dir == PF_IN) ? 1 : 0;
6102	pd.af = AF_INET6;
6103	pd.tos = 0;
6104	pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr);
6105
6106	off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr);
6107	pd.proto = h->ip6_nxt;
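	/*
	 * Walk the chain of IPv6 extension headers until a terminal
	 * (transport-layer) header is reached, accumulating the offset
	 * of that header in off.
	 */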
6108	do {
6109		switch (pd.proto) {
6110		case IPPROTO_FRAGMENT:
6111			action = pf_test_fragment(&r, dir, kif, m, h,
6112			    &pd, &a, &ruleset);
6113			if (action == PF_DROP)
6114				REASON_SET(&reason, PFRES_FRAG);
6115			goto done;
6116		case IPPROTO_ROUTING: {
6117			struct ip6_rthdr rthdr;
6118
6119			if (rh_cnt++) {
6120				DPFPRINTF(PF_DEBUG_MISC,
6121				    ("pf: IPv6 more than one rthdr\n"));
6122				action = PF_DROP;
6123				REASON_SET(&reason, PFRES_IPOPTIONS);
6124				log = 1;
6125				goto done;
6126			}
6127			if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL,
6128			    &reason, pd.af)) {
6129				DPFPRINTF(PF_DEBUG_MISC,
6130				    ("pf: IPv6 short rthdr\n"));
6131				action = PF_DROP;
6132				REASON_SET(&reason, PFRES_SHORT);
6133				log = 1;
6134				goto done;
6135			}
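			/*
			 * Type 0 routing headers were deprecated by RFC
			 * 5095 because of their amplification potential.
			 */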
6136			if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) {
6137				DPFPRINTF(PF_DEBUG_MISC,
6138				    ("pf: IPv6 rthdr0\n"));
6139				action = PF_DROP;
6140				REASON_SET(&reason, PFRES_IPOPTIONS);
6141				log = 1;
6142				goto done;
6143			}
6144			/* FALLTHROUGH */
6145		}
6146		case IPPROTO_AH:
6147		case IPPROTO_HOPOPTS:
6148		case IPPROTO_DSTOPTS: {
6149			/* get next header and header length */
6150			struct ip6_ext	opt6;
6151
6152			if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6),
6153			    NULL, &reason, pd.af)) {
6154				DPFPRINTF(PF_DEBUG_MISC,
6155				    ("pf: IPv6 short opt\n"));
6156				action = PF_DROP;
6157				log = 1;
6158				goto done;
6159			}
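			/*
			 * AH encodes its length in 32-bit words minus two
			 * (RFC 4302); the other extension headers use
			 * 8-octet units not counting the first eight
			 * octets (RFC 2460).
			 */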
6160			if (pd.proto == IPPROTO_AH)
6161				off += (opt6.ip6e_len + 2) * 4;
6162			else
6163				off += (opt6.ip6e_len + 1) * 8;
6164			pd.proto = opt6.ip6e_nxt;
6165			/* goto the next header */
6166			break;
6167		}
6168		default:
6169			terminal++;
6170			break;
6171		}
6172	} while (!terminal);
6173
	/*
	 * Nothing in the extension-header walk above sets up a separate
	 * mbuf, so checksum against the packet itself.  (n appears to be
	 * vestigial, inherited from the OpenBSD routing-header handling;
	 * it is never anything but NULL at this point.)
	 */
	if (!n)
		n = m;
6177
6178	switch (pd.proto) {
6179
6180	case IPPROTO_TCP: {
6181		struct tcphdr	th;
6182
6183		pd.hdr.tcp = &th;
6184		if (!pf_pull_hdr(m, off, &th, sizeof(th),
6185		    &action, &reason, AF_INET6)) {
6186			log = action != PF_PASS;
6187			goto done;
6188		}
6189		pd.p_len = pd.tot_len - off - (th.th_off << 2);
6190		action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
6191		if (action == PF_DROP)
6192			goto done;
6193		action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
6194		    &reason);
6195		if (action == PF_PASS) {
6196			if (pfsync_update_state_ptr != NULL)
6197				pfsync_update_state_ptr(s);
6198			r = s->rule.ptr;
6199			a = s->anchor.ptr;
6200			log = s->log;
6201		} else if (s == NULL)
6202			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6203			    &a, &ruleset, inp);
6204		break;
6205	}
6206
6207	case IPPROTO_UDP: {
6208		struct udphdr	uh;
6209
6210		pd.hdr.udp = &uh;
6211		if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
6212		    &action, &reason, AF_INET6)) {
6213			log = action != PF_PASS;
6214			goto done;
6215		}
6216		if (uh.uh_dport == 0 ||
6217		    ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
6218		    ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
6219			action = PF_DROP;
6220			REASON_SET(&reason, PFRES_SHORT);
6221			goto done;
6222		}
6223		action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
6224		if (action == PF_PASS) {
6225			if (pfsync_update_state_ptr != NULL)
6226				pfsync_update_state_ptr(s);
6227			r = s->rule.ptr;
6228			a = s->anchor.ptr;
6229			log = s->log;
6230		} else if (s == NULL)
6231			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6232			    &a, &ruleset, inp);
6233		break;
6234	}
6235
6236	case IPPROTO_ICMP: {
6237		action = PF_DROP;
6238		DPFPRINTF(PF_DEBUG_MISC,
6239		    ("pf: dropping IPv6 packet with ICMPv4 payload\n"));
6240		goto done;
6241	}
6242
6243	case IPPROTO_ICMPV6: {
6244		struct icmp6_hdr	ih;
6245
6246		pd.hdr.icmp6 = &ih;
6247		if (!pf_pull_hdr(m, off, &ih, sizeof(ih),
6248		    &action, &reason, AF_INET6)) {
6249			log = action != PF_PASS;
6250			goto done;
6251		}
6252		action = pf_test_state_icmp(&s, dir, kif,
6253		    m, off, h, &pd, &reason);
6254		if (action == PF_PASS) {
6255			if (pfsync_update_state_ptr != NULL)
6256				pfsync_update_state_ptr(s);
6257			r = s->rule.ptr;
6258			a = s->anchor.ptr;
6259			log = s->log;
6260		} else if (s == NULL)
6261			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6262			    &a, &ruleset, inp);
6263		break;
6264	}
6265
6266	default:
6267		action = pf_test_state_other(&s, dir, kif, m, &pd);
6268		if (action == PF_PASS) {
6269			if (pfsync_update_state_ptr != NULL)
6270				pfsync_update_state_ptr(s);
6271			r = s->rule.ptr;
6272			a = s->anchor.ptr;
6273			log = s->log;
6274		} else if (s == NULL)
6275			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6276			    &a, &ruleset, inp);
6277		break;
6278	}
6279
6280done:
6281	PF_RULES_RUNLOCK();
6282	if (n != m) {
6283		m_freem(n);
6284		n = NULL;
6285	}
6286
6287	/* handle dangerous IPv6 extension headers. */
6288	if (action == PF_PASS && rh_cnt &&
6289	    !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
6290		action = PF_DROP;
6291		REASON_SET(&reason, PFRES_IPOPTIONS);
6292		log = 1;
6293		DPFPRINTF(PF_DEBUG_MISC,
6294		    ("pf: dropping packet with dangerous v6 headers\n"));
6295	}
6296
6297	if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) {
6298		action = PF_DROP;
6299		REASON_SET(&reason, PFRES_MEMORY);
6300	}
6301	if (r->rtableid >= 0)
6302		M_SETFIB(m, r->rtableid);
6303
6304#ifdef ALTQ
6305	if (action == PF_PASS && r->qid) {
		if (pd.pf_mtag == NULL &&
		    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
			action = PF_DROP;
			REASON_SET(&reason, PFRES_MEMORY);
		} else {
			if (pd.tos & IPTOS_LOWDELAY)
				pd.pf_mtag->qid = r->pqid;
			else
				pd.pf_mtag->qid = r->qid;
			/* add hints for ecn */
			pd.pf_mtag->hdr = h;
		}
	}
6318#endif /* ALTQ */
6319
6320	if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
6321	    pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
6322	    (s->nat_rule.ptr->action == PF_RDR ||
6323	    s->nat_rule.ptr->action == PF_BINAT) &&
6324	    IN6_IS_ADDR_LOOPBACK(&pd.dst->v6))
6325		m->m_flags |= M_SKIP_FIREWALL;
6326
6327	/* XXX: Anybody working on it?! */
6328	if (r->divert.port)
6329		printf("pf: divert(9) is not supported for IPv6\n");
6330
6331	if (log) {
6332		struct pf_rule *lr;
6333
6334		if (s != NULL && s->nat_rule.ptr != NULL &&
6335		    s->nat_rule.ptr->log & PF_LOG_ALL)
6336			lr = s->nat_rule.ptr;
6337		else
6338			lr = r;
6339		PFLOG_PACKET(kif, m, AF_INET6, dir, reason, lr, a, ruleset,
6340		    &pd, (s == NULL));
6341	}
6342
6343	kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
6344	kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS]++;
6345
6346	if (action == PF_PASS || r->action == PF_DROP) {
6347		dirndx = (dir == PF_OUT);
6348		r->packets[dirndx]++;
6349		r->bytes[dirndx] += pd.tot_len;
6350		if (a != NULL) {
6351			a->packets[dirndx]++;
6352			a->bytes[dirndx] += pd.tot_len;
6353		}
6354		if (s != NULL) {
6355			if (s->nat_rule.ptr != NULL) {
6356				s->nat_rule.ptr->packets[dirndx]++;
6357				s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
6358			}
6359			if (s->src_node != NULL) {
6360				s->src_node->packets[dirndx]++;
6361				s->src_node->bytes[dirndx] += pd.tot_len;
6362			}
6363			if (s->nat_src_node != NULL) {
6364				s->nat_src_node->packets[dirndx]++;
6365				s->nat_src_node->bytes[dirndx] += pd.tot_len;
6366			}
6367			dirndx = (dir == s->direction) ? 0 : 1;
6368			s->packets[dirndx]++;
6369			s->bytes[dirndx] += pd.tot_len;
6370		}
6371		tr = r;
6372		nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
6373		if (nr != NULL && r == &V_pf_default_rule)
6374			tr = nr;
6375		if (tr->src.addr.type == PF_ADDR_TABLE)
6376			pfr_update_stats(tr->src.addr.p.tbl,
6377			    (s == NULL) ? pd.src :
6378			    &s->key[(s->direction == PF_IN)]->addr[0],
6379			    pd.af, pd.tot_len, dir == PF_OUT,
6380			    r->action == PF_PASS, tr->src.neg);
6381		if (tr->dst.addr.type == PF_ADDR_TABLE)
6382			pfr_update_stats(tr->dst.addr.p.tbl,
6383			    (s == NULL) ? pd.dst :
6384			    &s->key[(s->direction == PF_IN)]->addr[1],
6385			    pd.af, pd.tot_len, dir == PF_OUT,
6386			    r->action == PF_PASS, tr->dst.neg);
6387	}
6388
6389	switch (action) {
6390	case PF_SYNPROXY_DROP:
6391		m_freem(*m0);
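		/* FALLTHROUGH */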
6392	case PF_DEFER:
6393		*m0 = NULL;
6394		action = PF_PASS;
6395		break;
6396	default:
6397		/* pf_route6() returns unlocked. */
6398		if (r->rt) {
6399			pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd);
6400			return (action);
6401		}
6402		break;
6403	}
6404
6405	if (s)
6406		PF_STATE_UNLOCK(s);
6407
6408	return (action);
6409}
6410#endif /* INET6 */
6411