ip_input.c revision 191314
14910Swollman/*-
2139365Srik * Copyright (c) 1982, 1986, 1988, 1993
34910Swollman *	The Regents of the University of California.  All rights reserved.
4139823Simp *
5139823Simp * Redistribution and use in source and binary forms, with or without
6139365Srik * modification, are permitted provided that the following conditions
725944Sjoerg * are met:
84910Swollman * 1. Redistributions of source code must retain the above copyright
925944Sjoerg *    notice, this list of conditions and the following disclaimer.
1088534Sjoerg * 2. Redistributions in binary form must reproduce the above copyright
1125944Sjoerg *    notice, this list of conditions and the following disclaimer in the
124910Swollman *    documentation and/or other materials provided with the distribution.
134910Swollman * 4. Neither the name of the University nor the names of its contributors
144910Swollman *    may be used to endorse or promote products derived from this software
154910Swollman *    without specific prior written permission.
164910Swollman *
174910Swollman * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
184910Swollman * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1930300Sjoerg * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
2016288Sgpalmer * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
2150477Speter * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
224910Swollman * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
234910Swollman * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2440008Sjoerg * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2540008Sjoerg * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
2632350Seivind * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
2754263Sshin * SUCH DAMAGE.
2831742Seivind *
2931742Seivind *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
304952Sbde */
314952Sbde
3270199Sjhay#include <sys/cdefs.h>
3324204Sbde__FBSDID("$FreeBSD: head/sys/netinet/ip_input.c 191314 2009-04-20 14:35:42Z rwatson $");
344910Swollman
3525706Sjoerg#include "opt_bootp.h"
3659604Sobrien#include "opt_ipfw.h"
3729024Sbde#include "opt_ipstealth.h"
384910Swollman#include "opt_ipsec.h"
3940008Sjoerg#include "opt_route.h"
4030300Sjoerg#include "opt_mac.h"
414910Swollman#include "opt_carp.h"
424910Swollman
434910Swollman#include <sys/param.h>
444910Swollman#include <sys/systm.h>
4542104Sphk#include <sys/callout.h>
46196019Srwatson#include <sys/mbuf.h>
4788534Sjoerg#include <sys/malloc.h>
4888534Sjoerg#include <sys/domain.h>
4988534Sjoerg#include <sys/protosw.h>
5088534Sjoerg#include <sys/socket.h>
514910Swollman#include <sys/time.h>
5230300Sjoerg#include <sys/kernel.h>
5330300Sjoerg#include <sys/lock.h>
544910Swollman#include <sys/rwlock.h>
5588705Sjoerg#include <sys/syslog.h>
5688705Sjoerg#include <sys/sysctl.h>
574910Swollman#include <sys/vimage.h>
584910Swollman
594910Swollman#include <net/pfil.h>
604910Swollman#include <net/if.h>
61148385Sume#include <net/if_types.h>
62148385Sume#include <net/if_var.h>
63148385Sume#include <net/if_dl.h>
64148385Sume#include <net/route.h>
65182121Simp#include <net/netisr.h>
6688705Sjoerg#include <net/vnet.h>
6711819Sjulian#include <net/flowtable.h>
6811819Sjulian
6911819Sjulian#include <netinet/in.h>
7011819Sjulian#include <netinet/in_systm.h>
7111819Sjulian#include <netinet/in_var.h>
724910Swollman#include <netinet/ip.h>
734910Swollman#include <netinet/in_pcb.h>
74182121Simp#include <netinet/ip_var.h>
754910Swollman#include <netinet/ip_icmp.h>
764910Swollman#include <netinet/ip_options.h>
7725944Sjoerg#include <machine/in_cksum.h>
7825944Sjoerg#include <netinet/vinet.h>
7925944Sjoerg#ifdef DEV_CARP
8025955Sjoerg#include <netinet/ip_carp.h>
8125944Sjoerg#endif
8225944Sjoerg#ifdef IPSEC
8325944Sjoerg#include <netinet/ip_ipsec.h>
8425955Sjoerg#endif /* IPSEC */
8525955Sjoerg
8625955Sjoerg#include <sys/socketvar.h>
8730300Sjoerg
8830300Sjoerg/* XXX: Temporary until ipfw_ether and ipfw_bridge are converted. */
8930300Sjoerg#include <netinet/ip_fw.h>
9030300Sjoerg#include <netinet/ip_dummynet.h>
9130300Sjoerg
9230300Sjoerg#include <security/mac/mac_framework.h>
9330300Sjoerg
/* Compile-time check that the option-less IPv4 header is exactly 20 bytes. */
#ifdef CTASSERT
CTASSERT(sizeof(struct ip) == 20);
#endif

/*
 * When neither VIMAGE nor VIMAGE_GLOBALS is defined, the single
 * struct vnet_inet instance is defined here.
 */
#ifndef VIMAGE
#ifndef VIMAGE_GLOBALS
struct vnet_inet vnet_inet_0;
#endif
#endif

/*
 * With VIMAGE_GLOBALS the IPv4 state lives in ordinary global variables.
 * NOTE(review): ip_init() assigns the defaults through V_-prefixed names
 * (V_ipforwarding, V_maxnipq, ...); presumably those map onto these
 * variables in this configuration -- confirm against the vinet headers.
 */
#ifdef VIMAGE_GLOBALS
static int	ipsendredirects;
static int	ip_checkinterface;
static int	ip_keepfaith;
static int	ip_sendsourcequench;
int	ip_defttl;
int	ip_do_randomid;
int	ipforwarding;
struct	in_ifaddrhead in_ifaddrhead; 		/* first inet address */
struct	in_ifaddrhashhead *in_ifaddrhashtbl;	/* inet addr hash table  */
u_long 	in_ifaddrhmask;				/* mask for hash table */
struct ipstat ipstat;
static int ip_rsvp_on;
struct socket *ip_rsvpd;
int	rsvp_on;
static struct ipqhead ipq[IPREASS_NHASH];
static int	maxnipq;	/* Administrative limit on # reass queues. */
static int	maxfragsperpacket;
int	ipstealth;
static int	nipq;	/* Total # of reass queues */
#endif
12525944Sjoerg
/*
 * Integer sysctl knobs attached to the _net_inet_ip node.  The variables
 * they export receive their initial values in ip_init().
 */
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_FORWARDING,
    forwarding, CTLFLAG_RW, ipforwarding, 0,
    "Enable IP forwarding between interfaces");

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_SENDREDIRECTS,
    redirect, CTLFLAG_RW, ipsendredirects, 0,
    "Enable sending IP redirects");

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_DEFTTL,
    ttl, CTLFLAG_RW, ip_defttl, 0, "Maximum TTL on IP packets");

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_KEEPFAITH,
    keepfaith, CTLFLAG_RW, ip_keepfaith,	0,
    "Enable packet capture for FAITH IPv4->IPv6 translater daemon");

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO,
    sendsourcequench, CTLFLAG_RW, ip_sendsourcequench, 0,
    "Enable the transmission of source quench packets");

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, random_id,
    CTLFLAG_RW, ip_do_randomid, 0, "Assign random ip_id values");

/*
 * XXX - Setting ip_checkinterface mostly implements the receive side of
 * the Strong ES model described in RFC 1122, but since the routing table
 * and transmit implementation do not implement the Strong ES model,
 * setting this to 1 results in an odd hybrid.
 *
 * XXX - ip_checkinterface currently must be disabled if you use ipnat
 * to translate the destination address to another local interface.
 *
 * XXX - ip_checkinterface must be disabled if you add IP aliases
 * to the loopback interface instead of the interface where the
 * packets for those addresses are received.
 */
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO,
    check_interface, CTLFLAG_RW, ip_checkinterface, 0,
    "Verify packet arrives on correct interface");
16430300Sjoerg
/* Registered with pfil in ip_init(); run on inbound packets by ip_input(). */
struct pfil_head inet_pfil_hook;	/* Packet filter hooks */

/*
 * IP input queue handed to netisr_register() in ip_init(); ipqmaxlen is
 * the value copied into ipintrq.ifq_maxlen at that time.
 */
static struct	ifqueue ipintrq;
static int	ipqmaxlen = IFQ_MAXLEN;

extern	struct domain inetdomain;
extern	struct protosw inetsw[];
/*
 * Maps an IP protocol number to an index into inetsw[].  ip_init() points
 * every slot at the raw-IP entry and then overrides the slots of the
 * protocols found in the inet domain's protocol switch.
 */
u_char	ip_protox[IPPROTO_MAX];

SYSCTL_INT(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLFLAG_RW,
    &ipintrq.ifq_maxlen, 0, "Maximum size of the IP input queue");
SYSCTL_INT(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLFLAG_RD,
    &ipintrq.ifq_drops, 0,
    "Number of packets dropped from the IP input queue");

SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RW,
    ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)");
18211189Sjkh
/* UMA zone for struct ipq entries; created in ip_init(). */
#ifdef VIMAGE_GLOBALS
static uma_zone_t ipq_zone;
#endif
/*
 * Mutex initialised by IPQ_LOCK_INIT() in ip_init().
 * NOTE(review): presumably serialises access to the reassembly queues;
 * the users of IPQ_LOCK() are outside this part of the file -- confirm.
 */
static struct mtx ipqlock;

#define	IPQ_LOCK()	mtx_lock(&ipqlock)
#define	IPQ_UNLOCK()	mtx_unlock(&ipqlock)
#define	IPQ_LOCK_INIT()	mtx_init(&ipqlock, "ipqlock", NULL, MTX_DEF)
#define	IPQ_LOCK_ASSERT()	mtx_assert(&ipqlock, MA_OWNED)

/* Push the current maxnipq value into the UMA zone limit. */
static void	maxnipq_update(void);
/* nmbclusters_change event handler (registered in ip_init()). */
static void	ipq_zone_change(void *);

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, fragpackets,
    CTLFLAG_RD, nipq, 0,
    "Current number of IPv4 fragment reassembly queue entries");

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, maxfragsperpacket,
    CTLFLAG_RW, maxfragsperpacket, 0,
    "Maximum number of IPv4 fragments allowed per packet");
20325944Sjoerg
/* Callout initialised in ip_init() and stopped in ip_fini(). */
struct callout	ipport_tick_callout;

#ifdef IPCTL_DEFMTU
SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW,
    &ip_mtu, 0, "Default MTU");
#endif

#ifdef IPSTEALTH
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW,
    ipstealth, 0, "IP stealth mode, no TTL decrementation on forwarding");
#endif
/*
 * Size passed to flowtable_alloc() in ip_init(); settable as a loader
 * tunable and exported read-only through sysctl.
 */
static int ip_output_flowtable_size = 2048;
TUNABLE_INT("net.inet.ip.output_flowtable_size", &ip_output_flowtable_size);
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, output_flowtable_size,
    CTLFLAG_RDTUN, ip_output_flowtable_size, 2048,
    "number of entries in the per-cpu output flow caches");

/*
 * ipfw_ether and ipfw_bridge hooks.
 * XXX: Temporary until those are converted to pfil_hooks as well.
 */
ip_fw_chk_t *ip_fw_chk_ptr = NULL;
ip_dn_io_t *ip_dn_io_ptr = NULL;
#ifdef VIMAGE_GLOBALS
int fw_one_pass;
#endif
/* Flow table allocated in ip_init() with the FL_PCPU flag. */
struct flowtable *ip_ft;

static void	ip_freef(struct ipqhead *, struct ipq *);
23325944Sjoerg
23425944Sjoerg#ifndef VIMAGE_GLOBALS
23540008Sjoergstatic void vnet_inet_register(void);
23640008Sjoerg
23740008Sjoergstatic const vnet_modinfo_t vnet_inet_modinfo = {
238188668Srwatson	.vmi_id		= VNET_MOD_INET,
239188668Srwatson	.vmi_name	= "inet",
240188668Srwatson};
241190818Sed
242138745Srikstatic void vnet_inet_register()
24388705Sjoerg{
2444910Swollman
2454910Swollman	vnet_mod_register(&vnet_inet_modinfo);
2464910Swollman}
2474910Swollman
2484910SwollmanSYSINIT(inet, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, vnet_inet_register, 0);
24930300Sjoerg#endif
25030300Sjoerg
2514910Swollman/*
252126910Srwatson * IP initialization: fill in IP protocol switch table.
2534910Swollman * All protocols not implemented in kernel go to raw IP protocol handler.
2544910Swollman */
2554910Swollmanvoid
2564910Swollmanip_init(void)
25788705Sjoerg{
2584910Swollman	INIT_VNET_INET(curvnet);
25925944Sjoerg	struct protosw *pr;
26025944Sjoerg	int i;
261147256Sbrooks
26225944Sjoerg	V_ipsendredirects = 1; /* XXX */
26311189Sjkh	V_ip_checkinterface = 0;
26430300Sjoerg	V_ip_keepfaith = 0;
265191148Skmacy	V_ip_sendsourcequench = 0;
2664910Swollman	V_rsvp_on = 0;
26725944Sjoerg	V_ip_defttl = IPDEFTTL;
26825944Sjoerg	V_ip_do_randomid = 0;
26925944Sjoerg	V_ip_id = time_second & 0xffff;
27025944Sjoerg	V_ipforwarding = 0;
27125944Sjoerg	V_ipstealth = 0;
27225944Sjoerg	V_nipq = 0;	/* Total # of reass queues */
27325944Sjoerg
27442104Sphk	V_ipport_lowfirstauto = IPPORT_RESERVED - 1;	/* 1023 */
27525944Sjoerg	V_ipport_lowlastauto = IPPORT_RESERVEDSTART;	/* 600 */
27625944Sjoerg	V_ipport_firstauto = IPPORT_EPHEMERALFIRST;	/* 10000 */
27730300Sjoerg	V_ipport_lastauto = IPPORT_EPHEMERALLAST;	/* 65535 */
27842104Sphk	V_ipport_hifirstauto = IPPORT_HIFIRSTAUTO;	/* 49152 */
27930300Sjoerg	V_ipport_hilastauto = IPPORT_HILASTAUTO;	/* 65535 */
28025944Sjoerg	V_ipport_reservedhigh = IPPORT_RESERVED - 1;	/* 1023 */
28125944Sjoerg	V_ipport_reservedlow = 0;
28225944Sjoerg	V_ipport_randomized = 1;	/* user controlled via sysctl */
28325944Sjoerg	V_ipport_randomcps = 10;	/* user controlled via sysctl */
28425944Sjoerg	V_ipport_randomtime = 45;	/* user controlled via sysctl */
28525944Sjoerg	V_ipport_stoprandom = 0;	/* toggled by ipport_tick */
28625944Sjoerg
28730300Sjoerg	V_fw_one_pass = 1;
28830300Sjoerg
289138745Srik#ifdef NOTYET
290138745Srik	/* XXX global static but not instantiated in this file */
291138745Srik	V_ipfastforward_active = 0;
29225944Sjoerg	V_subnetsarelocal = 0;
29325944Sjoerg	V_sameprefixcarponly = 0;
29425944Sjoerg#endif
29525944Sjoerg
29625944Sjoerg	TAILQ_INIT(&V_in_ifaddrhead);
29725944Sjoerg	V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask);
29825944Sjoerg
29925944Sjoerg	/* Initialize IP reassembly queue. */
30025944Sjoerg	for (i = 0; i < IPREASS_NHASH; i++)
30125944Sjoerg		TAILQ_INIT(&V_ipq[i]);
30225944Sjoerg	V_maxnipq = nmbclusters / 32;
30325944Sjoerg	V_maxfragsperpacket = 16;
30425944Sjoerg	V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL,
30525944Sjoerg	    NULL, UMA_ALIGN_PTR, 0);
30630300Sjoerg	maxnipq_update();
30730300Sjoerg
30825944Sjoerg	/* Skip initialization of globals for non-default instances. */
30925944Sjoerg	if (!IS_DEFAULT_VNET(curvnet))
31025944Sjoerg		return;
31125944Sjoerg
31225944Sjoerg	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
31325944Sjoerg	if (pr == NULL)
31425944Sjoerg		panic("ip_init: PF_INET not found");
31525944Sjoerg
31625944Sjoerg	/* Initialize the entire ip_protox[] array to IPPROTO_RAW. */
31725944Sjoerg	for (i = 0; i < IPPROTO_MAX; i++)
31825944Sjoerg		ip_protox[i] = pr - inetsw;
31925944Sjoerg	/*
32025944Sjoerg	 * Cycle through IP protocols and put them into the appropriate place
32125944Sjoerg	 * in ip_protox[].
32225944Sjoerg	 */
32325944Sjoerg	for (pr = inetdomain.dom_protosw;
32478064Sume	    pr < inetdomain.dom_protoswNPROTOSW; pr++)
32578064Sume		if (pr->pr_domain->dom_family == PF_INET &&
32678064Sume		    pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) {
32778064Sume			/* Be careful to only index valid IP protocols. */
32878064Sume			if (pr->pr_protocol < IPPROTO_MAX)
32978064Sume				ip_protox[pr->pr_protocol] = pr - inetsw;
33078064Sume		}
33178064Sume
33278064Sume	/* Initialize packet filter hooks. */
33378064Sume	inet_pfil_hook.ph_type = PFIL_TYPE_AF;
33478064Sume	inet_pfil_hook.ph_af = AF_INET;
33578064Sume	if ((i = pfil_head_register(&inet_pfil_hook)) != 0)
33678064Sume		printf("%s: WARNING: unable to register pfil hook, "
33778064Sume			"error %d\n", __func__, i);
33878064Sume
33930300Sjoerg	/* Start ipport_tick. */
34030300Sjoerg	callout_init(&ipport_tick_callout, CALLOUT_MPSAFE);
34130300Sjoerg	ipport_tick(NULL);
34230300Sjoerg	EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
34330300Sjoerg		SHUTDOWN_PRI_DEFAULT);
34430300Sjoerg	EVENTHANDLER_REGISTER(nmbclusters_change, ipq_zone_change,
34530300Sjoerg		NULL, EVENTHANDLER_PRI_ANY);
34630300Sjoerg
34730300Sjoerg	/* Initialize various other remaining things. */
34830300Sjoerg	IPQ_LOCK_INIT();
34930300Sjoerg	ipintrq.ifq_maxlen = ipqmaxlen;
35030300Sjoerg	mtx_init(&ipintrq.ifq_mtx, "ip_inq", NULL, MTX_DEF);
35130300Sjoerg	netisr_register(NETISR_IP, ip_input, &ipintrq, 0);
35230300Sjoerg
35330300Sjoerg	ip_ft = flowtable_alloc(ip_output_flowtable_size, FL_PCPU);
35430300Sjoerg}
35530300Sjoerg
35630300Sjoergvoid
35730300Sjoergip_fini(void *xtp)
35830300Sjoerg{
35925944Sjoerg
360184682Sbz	callout_stop(&ipport_tick_callout);
36130300Sjoerg}
36230300Sjoerg
363184682Sbz/*
36478064Sume * Ip input routine.  Checksum and byte swap header.  If fragmented
36578064Sume * try to reassemble.  Process options.  Pass to next level.
36678064Sume */
36725944Sjoergvoid
36825944Sjoergip_input(struct mbuf *m)
36925944Sjoerg{
37030300Sjoerg	INIT_VNET_INET(curvnet);
37138343Sbde	struct ip *ip = NULL;
37230300Sjoerg	struct in_ifaddr *ia = NULL;
37325944Sjoerg	struct ifaddr *ifa;
37430300Sjoerg	struct ifnet *ifp;
37530300Sjoerg	int    checkif, hlen = 0;
37630300Sjoerg	u_short sum;
37725944Sjoerg	int dchg = 0;				/* dest changed after fw */
378184682Sbz	struct in_addr odst;			/* original dst address */
37925944Sjoerg
380184682Sbz	M_ASSERTPKTHDR(m);
38178064Sume
38278064Sume	if (m->m_flags & M_FASTFWD_OURS) {
38378064Sume		/*
38478064Sume		 * Firewall or NAT changed destination to local.
38578064Sume		 * We expect ip_len and ip_off to be in host byte order.
38678064Sume		 */
38778064Sume		m->m_flags &= ~M_FASTFWD_OURS;
38878064Sume		/* Set up some basics that will be used later. */
38978064Sume		ip = mtod(m, struct ip *);
39025944Sjoerg		hlen = ip->ip_hl << 2;
391138745Srik		goto ours;
392138745Srik	}
393138745Srik
39425944Sjoerg	IPSTAT_INC(ips_total);
39533181Seivind
39625944Sjoerg	if (m->m_pkthdr.len < sizeof(struct ip))
39725944Sjoerg		goto tooshort;
39825944Sjoerg
39925944Sjoerg	if (m->m_len < sizeof (struct ip) &&
40025944Sjoerg	    (m = m_pullup(m, sizeof (struct ip))) == NULL) {
40125944Sjoerg		IPSTAT_INC(ips_toosmall);
40225944Sjoerg		return;
40333181Seivind	}
40488709Sjoerg	ip = mtod(m, struct ip *);
40588709Sjoerg
40688709Sjoerg	if (ip->ip_v != IPVERSION) {
40788709Sjoerg		IPSTAT_INC(ips_badvers);
40888709Sjoerg		goto bad;
40988709Sjoerg	}
41088709Sjoerg
41125944Sjoerg	hlen = ip->ip_hl << 2;
41225944Sjoerg	if (hlen < sizeof(struct ip)) {	/* minimum header length */
41325944Sjoerg		IPSTAT_INC(ips_badhlen);
41425944Sjoerg		goto bad;
41525944Sjoerg	}
41625944Sjoerg	if (hlen > m->m_len) {
41778064Sume		if ((m = m_pullup(m, hlen)) == NULL) {
41878064Sume			IPSTAT_INC(ips_badhlen);
41978064Sume			return;
42078064Sume		}
42178064Sume		ip = mtod(m, struct ip *);
42278064Sume	}
42378064Sume
42478064Sume	/* 127/8 must not appear on wire - RFC1122 */
42578064Sume	ifp = m->m_pkthdr.rcvif;
42678064Sume	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
42778064Sume	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
42878064Sume		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
42978064Sume			IPSTAT_INC(ips_badaddr);
43078064Sume			goto bad;
43133181Seivind		}
43230300Sjoerg	}
43330300Sjoerg
43430300Sjoerg	if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
43530300Sjoerg		sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
43630300Sjoerg	} else {
43730300Sjoerg		if (hlen == sizeof(struct ip)) {
43830300Sjoerg			sum = in_cksum_hdr(ip);
43933181Seivind		} else {
44030300Sjoerg			sum = in_cksum(m, hlen);
44130300Sjoerg		}
44230300Sjoerg	}
44330300Sjoerg	if (sum) {
44430300Sjoerg		IPSTAT_INC(ips_badsum);
44530300Sjoerg		goto bad;
44630300Sjoerg	}
44733181Seivind
44825944Sjoerg#ifdef ALTQ
44925944Sjoerg	if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0)
45078064Sume		/* packet is dropped by traffic conditioner */
45130300Sjoerg		return;
45230300Sjoerg#endif
45325944Sjoerg
45425944Sjoerg	/*
455147256Sbrooks	 * Convert fields to host representation.
456147256Sbrooks	 */
457147256Sbrooks	ip->ip_len = ntohs(ip->ip_len);
458147256Sbrooks	if (ip->ip_len < hlen) {
459147256Sbrooks		IPSTAT_INC(ips_badlen);
460147256Sbrooks		goto bad;
461147256Sbrooks	}
462147256Sbrooks	ip->ip_off = ntohs(ip->ip_off);
463147256Sbrooks
464147256Sbrooks	/*
465147256Sbrooks	 * Check that the amount of data in the buffers
466147256Sbrooks	 * is as at least much as the IP header would have us expect.
467147256Sbrooks	 * Trim mbufs if longer than we expect.
468147256Sbrooks	 * Drop packet if shorter than we expect.
469147256Sbrooks	 */
470147256Sbrooks	if (m->m_pkthdr.len < ip->ip_len) {
471147256Sbrookstooshort:
472147256Sbrooks		IPSTAT_INC(ips_tooshort);
47370199Sjhay		goto bad;
47470199Sjhay	}
47570199Sjhay	if (m->m_pkthdr.len > ip->ip_len) {
47670199Sjhay		if (m->m_len == m->m_pkthdr.len) {
47770199Sjhay			m->m_len = ip->ip_len;
478147256Sbrooks			m->m_pkthdr.len = ip->ip_len;
479147256Sbrooks		} else
480147256Sbrooks			m_adj(m, ip->ip_len - m->m_pkthdr.len);
481147256Sbrooks	}
482147256Sbrooks#ifdef IPSEC
483147256Sbrooks	/*
484147256Sbrooks	 * Bypass packet filtering for packets from a tunnel (gif).
48570199Sjhay	 */
48670199Sjhay	if (ip_ipsec_filtertunnel(m))
487147256Sbrooks		goto passin;
48870199Sjhay#endif /* IPSEC */
48970199Sjhay
490132199Sphk	/*
49170199Sjhay	 * Run through list of hooks for input packets.
49270199Sjhay	 *
49370199Sjhay	 * NB: Beware of the destination address changing (e.g.
49470199Sjhay	 *     by NAT rewriting).  When this happens, tell
49570199Sjhay	 *     ip_forward to do the right thing.
49670199Sjhay	 */
49770199Sjhay
49870199Sjhay	/* Jump over all PFIL processing if hooks are not active. */
49970199Sjhay	if (!PFIL_HOOKED(&inet_pfil_hook))
50070199Sjhay		goto passin;
50125944Sjoerg
50270199Sjhay	odst = ip->ip_dst;
50325944Sjoerg	if (pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_IN, NULL) != 0)
5044910Swollman		return;
5054910Swollman	if (m == NULL)			/* consumed by filter */
5064910Swollman		return;
5074910Swollman
5084910Swollman	ip = mtod(m, struct ip *);
50925706Sjoerg	dchg = (odst.s_addr != ip->ip_dst.s_addr);
51025706Sjoerg	ifp = m->m_pkthdr.rcvif;
5114910Swollman
5124910Swollman#ifdef IPFIREWALL_FORWARD
513111888Sjlemon	if (m->m_flags & M_FASTFWD_OURS) {
514147256Sbrooks		m->m_flags &= ~M_FASTFWD_OURS;
515184682Sbz		goto ours;
516184682Sbz	}
517184682Sbz	if ((dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL)) != 0) {
51888700Sjoerg		/*
519184682Sbz		 * Directly ship on the packet.  This allows to forward packets
5204910Swollman		 * that were destined for us to some other directly connected
521138745Srik		 * host.
522138745Srik		 */
523138745Srik		ip_forward(m, dchg);
5244910Swollman		return;
5254910Swollman	}
5264910Swollman#endif /* IPFIREWALL_FORWARD */
5274910Swollman
5284910Swollmanpassin:
5294910Swollman	/*
53025944Sjoerg	 * Process options and, if not destined for us,
53125706Sjoerg	 * ship it on.  ip_dooptions returns 1 when an
53240008Sjoerg	 * error was detected (causing an icmp message
53340008Sjoerg	 * to be sent and the original packet to be freed).
53425944Sjoerg	 */
53588700Sjoerg	if (hlen > sizeof (struct ip) && ip_dooptions(m, 0))
536138745Srik		return;
53788700Sjoerg
53825944Sjoerg        /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no
53925944Sjoerg         * matter if it is destined to another node, or whether it is
5404910Swollman         * a multicast one, RSVP wants it! and prevents it from being forwarded
5414910Swollman         * anywhere else. Also checks if the rsvp daemon is running before
5424910Swollman	 * grabbing the packet.
543139365Srik         */
544139365Srik	if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP)
545139365Srik		goto ours;
546139365Srik
547139365Srik	/*
548139365Srik	 * Check our list of addresses, to see if the packet is for us.
5494910Swollman	 * If we don't have any addresses, assume any unicast packet
5504910Swollman	 * we receive might be for us (and let the upper layers deal
5514910Swollman	 * with it).
5524910Swollman	 */
5534910Swollman	if (TAILQ_EMPTY(&V_in_ifaddrhead) &&
5544910Swollman	    (m->m_flags & (M_MCAST|M_BCAST)) == 0)
5554910Swollman		goto ours;
5564910Swollman
55745152Sphk	/*
55825944Sjoerg	 * Enable a consistency check between the destination address
55925706Sjoerg	 * and the arrival interface for a unicast packet (the RFC 1122
56040008Sjoerg	 * strong ES model) if IP forwarding is disabled and the packet
56125706Sjoerg	 * is not locally generated and the packet is not subject to
56240008Sjoerg	 * 'ipfw fwd'.
56325706Sjoerg	 *
56411189Sjkh	 * XXX - Checking also should be disabled if the destination
56511189Sjkh	 * address is ipnat'ed to a different interface.
5664910Swollman	 *
5674910Swollman	 * XXX - Checking is incompatible with IP aliases added
56825944Sjoerg	 * to the loopback interface instead of the interface where
56925706Sjoerg	 * the packets are received.
57044145Sphk	 *
57125706Sjoerg	 * XXX - This is the case for carp vhost IPs as well so we
57240008Sjoerg	 * insert a workaround. If the packet got here, we already
57325706Sjoerg	 * checked with carp_iamatch() and carp_forus().
57444145Sphk	 */
57544145Sphk	checkif = V_ip_checkinterface && (V_ipforwarding == 0) &&
57678064Sume	    ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) &&
57744145Sphk#ifdef DEV_CARP
5784910Swollman	    !ifp->if_carp &&
5794910Swollman#endif
5804910Swollman	    (dchg == 0);
58130300Sjoerg
5824910Swollman	/*
583138745Srik	 * Check for exact addresses in the hash bucket.
5844910Swollman	 */
58530300Sjoerg	LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) {
58630300Sjoerg		/*
58730300Sjoerg		 * If the address matches, verify that the packet
58830300Sjoerg		 * arrived via the correct interface if checking is
589138745Srik		 * enabled.
59030300Sjoerg		 */
59130300Sjoerg		if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr &&
59230300Sjoerg		    (!checkif || ia->ia_ifp == ifp))
59330300Sjoerg			goto ours;
59430300Sjoerg	}
595138745Srik	/*
59630300Sjoerg	 * Check for broadcast addresses.
5974910Swollman	 *
5984910Swollman	 * Only accept broadcast packets that arrive via the matching
59925944Sjoerg	 * interface.  Reception of forwarded directed broadcasts would
60030300Sjoerg	 * be handled via ip_forward() and ether_output() with the loopback
6014910Swollman	 * into the stack for SIMPLEX interfaces handled by ether_output().
602138745Srik	 */
6034910Swollman	if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) {
6044910Swollman		IF_ADDR_LOCK(ifp);
60525944Sjoerg	        TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
606111888Sjlemon			if (ifa->ifa_addr->sa_family != AF_INET)
6074910Swollman				continue;
60888577Sjoerg			ia = ifatoia(ifa);
6094910Swollman			if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
61088534Sjoerg			    ip->ip_dst.s_addr) {
61188534Sjoerg				IF_ADDR_UNLOCK(ifp);
61288700Sjoerg				goto ours;
61388700Sjoerg			}
61488700Sjoerg			if (ia->ia_netbroadcast.s_addr == ip->ip_dst.s_addr) {
61588700Sjoerg				IF_ADDR_UNLOCK(ifp);
61688700Sjoerg				goto ours;
61788700Sjoerg			}
61888700Sjoerg#ifdef BOOTP_COMPAT
61988700Sjoerg			if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) {
62088700Sjoerg				IF_ADDR_UNLOCK(ifp);
62188700Sjoerg				goto ours;
62288534Sjoerg			}
62388700Sjoerg#endif
62488700Sjoerg		}
62588700Sjoerg		IF_ADDR_UNLOCK(ifp);
62688700Sjoerg	}
62788700Sjoerg	/* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */
62888700Sjoerg	if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) {
62988700Sjoerg		IPSTAT_INC(ips_cantforward);
63088700Sjoerg		m_freem(m);
63188700Sjoerg		return;
632111119Simp	}
633138745Srik	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
634138745Srik		if (V_ip_mrouter) {
63588700Sjoerg			/*
636138745Srik			 * If we are acting as a multicast router, all
63788700Sjoerg			 * incoming multicast packets are passed to the
638111888Sjlemon			 * kernel-level multicast forwarding function.
63988534Sjoerg			 * The packet is returned (relatively) intact; if
64088599Sjoerg			 * ip_mforward() returns a non-zero value, the packet
64188534Sjoerg			 * must be discarded, else it may be accepted below.
64288534Sjoerg			 */
64388534Sjoerg			if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) {
64488700Sjoerg				IPSTAT_INC(ips_cantforward);
64588700Sjoerg				m_freem(m);
64688700Sjoerg				return;
64788700Sjoerg			}
64888700Sjoerg
64988700Sjoerg			/*
65088700Sjoerg			 * The process-level routing daemon needs to receive
65188700Sjoerg			 * all multicast IGMP packets, whether or not this
65288700Sjoerg			 * host belongs to their destination groups.
65388534Sjoerg			 */
65488700Sjoerg			if (ip->ip_p == IPPROTO_IGMP)
655111888Sjlemon				goto ours;
65688534Sjoerg			IPSTAT_INC(ips_forward);
65788599Sjoerg		}
65888534Sjoerg		/*
65978064Sume		 * Assume the packet is for us, to avoid prematurely taking
66088599Sjoerg		 * a lock on the in_multi hash. Protocols must perform
66188599Sjoerg		 * their own filtering and update statistics accordingly.
66288599Sjoerg		 */
66388599Sjoerg		goto ours;
66488599Sjoerg	}
665138745Srik	if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST)
66688599Sjoerg		goto ours;
66788599Sjoerg	if (ip->ip_dst.s_addr == INADDR_ANY)
66888599Sjoerg		goto ours;
669111888Sjlemon
670111888Sjlemon	/*
67188599Sjoerg	 * FAITH(Firewall Aided Internet Translator)
67288599Sjoerg	 */
67388599Sjoerg	if (ifp && ifp->if_type == IFT_FAITH) {
67412495Speter		if (V_ip_keepfaith) {
67512495Speter			if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP)
67612495Speter				goto ours;
677111888Sjlemon		}
678111888Sjlemon		m_freem(m);
67988577Sjoerg		return;
68012495Speter	}
68112495Speter
6824910Swollman	/*
6834910Swollman	 * Not for us; forward if possible and desirable.
6844910Swollman	 */
6854910Swollman	if (V_ipforwarding == 0) {
6864910Swollman		IPSTAT_INC(ips_cantforward);
68745152Sphk		m_freem(m);
68825944Sjoerg	} else {
68925706Sjoerg#ifdef IPSEC
69040008Sjoerg		if (ip_ipsec_fwd(m))
69125706Sjoerg			goto bad;
69240008Sjoerg#endif /* IPSEC */
69325706Sjoerg		ip_forward(m, dchg);
69411189Sjkh	}
69511189Sjkh	return;
6964910Swollman
6974910Swollmanours:
6984910Swollman#ifdef IPSTEALTH
6994910Swollman	/*
7004910Swollman	 * IPSTEALTH: Process non-routing options only
701147256Sbrooks	 * if the packet is destined for us.
7024910Swollman	 */
703138745Srik	if (V_ipstealth && hlen > sizeof (struct ip) &&
7044910Swollman	    ip_dooptions(m, 1))
7054910Swollman		return;
7064910Swollman#endif /* IPSTEALTH */
707111888Sjlemon
70888577Sjoerg	/* Count the packet in the ip address stats */
7094910Swollman	if (ia != NULL) {
7104910Swollman		ia->ia_ifa.if_ipackets++;
71154263Sshin		ia->ia_ifa.if_ibytes += m->m_pkthdr.len;
71254263Sshin	}
713111888Sjlemon
71488577Sjoerg	/*
71554263Sshin	 * Attempt reassembly; if it succeeds, proceed.
71654263Sshin	 * ip_reass() will return a different mbuf.
71712495Speter	 */
71812495Speter	if (ip->ip_off & (IP_MF | IP_OFFMASK)) {
719111888Sjlemon		m = ip_reass(m);
72088577Sjoerg		if (m == NULL)
72112495Speter			return;
72212495Speter		ip = mtod(m, struct ip *);
7234910Swollman		/* Get the header length of the reassembled packet */
7244910Swollman		hlen = ip->ip_hl << 2;
72525944Sjoerg	}
72625944Sjoerg
72725944Sjoerg	/*
72825944Sjoerg	 * Further protocols expect the packet length to be w/o the
72940008Sjoerg	 * IP header.
73025944Sjoerg	 */
73140008Sjoerg	ip->ip_len -= hlen;
73225944Sjoerg
73325944Sjoerg#ifdef IPSEC
7344910Swollman	/*
7354910Swollman	 * enforce IPsec policy checking if we are seeing last header.
736111888Sjlemon	 * note that we do not visit this with protocols with pcb layer
7374910Swollman	 * code - like udp/tcp/raw ip.
7384910Swollman	 */
739138745Srik	if (ip_ipsec_input(m))
7404910Swollman		goto bad;
741134391Sandre#endif /* IPSEC */
74225944Sjoerg
74340008Sjoerg	/*
74440008Sjoerg	 * Switch out to protocol's input routine.
745131241Srik	 */
7464910Swollman	IPSTAT_INC(ips_delivered);
747138745Srik
74888577Sjoerg	(*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen);
74988577Sjoerg	return;
75088577Sjoergbad:
75188577Sjoerg	m_freem(m);
75288577Sjoerg}
75388577Sjoerg
754150349Sandre/*
7554910Swollman * After maxnipq has been updated, propagate the change to UMA.  The UMA zone
7564910Swollman * max has slightly different semantics than the sysctl, for historical
757138745Srik * reasons.
758138745Srik */
759138745Srikstatic void
760138745Srikmaxnipq_update(void)
761138745Srik{
762147256Sbrooks	INIT_VNET_INET(curvnet);
763138745Srik
764138745Srik	/*
765138745Srik	 * -1 for unlimited allocation.
766138745Srik	 */
767138745Srik	if (V_maxnipq < 0)
768138745Srik		uma_zone_set_max(V_ipq_zone, 0);
769138745Srik	/*
770138745Srik	 * Positive number for specific bound.
771147256Sbrooks	 */
772138745Srik	if (V_maxnipq > 0)
773138745Srik		uma_zone_set_max(V_ipq_zone, V_maxnipq);
774138745Srik	/*
775138745Srik	 * Zero specifies no further fragment queue allocation -- set the
776138745Srik	 * bound very low, but rely on implementation elsewhere to actually
777138745Srik	 * prevent allocation and reclaim current queues.
778138745Srik	 */
779138745Srik	if (V_maxnipq == 0)
780138745Srik		uma_zone_set_max(V_ipq_zone, 1);
781138745Srik}
782138745Srik
7834910Swollmanstatic void
7844910Swollmanipq_zone_change(void *tag)
7854910Swollman{
78612820Sphk	INIT_VNET_INET(curvnet);
78725706Sjoerg
788191148Skmacy	if (V_maxnipq > 0 && V_maxnipq < (nmbclusters / 32)) {
7894910Swollman		V_maxnipq = nmbclusters / 32;
790147256Sbrooks		maxnipq_update();
7914910Swollman	}
79278064Sume}
793130549Smlaier
794184682Sbzstatic int
79588534Sjoergsysctl_maxnipq(SYSCTL_HANDLER_ARGS)
796184682Sbz{
79742066Sphk	INIT_VNET_INET(curvnet);
7984910Swollman	int error, i;
79925944Sjoerg
800138745Srik	i = V_maxnipq;
80125944Sjoerg	error = sysctl_handle_int(oidp, &i, 0, req);
802148887Srwatson	if (error || !req->newptr)
803148887Srwatson		return (error);
804148887Srwatson
80588723Sjoerg	/*
80688723Sjoerg	 * XXXRW: Might be a good idea to sanity check the argument and place
80788723Sjoerg	 * an extreme upper bound.
8084910Swollman	 */
809138745Srik	if (i < -1)
8104910Swollman		return (EINVAL);
8114910Swollman	V_maxnipq = i;
8124910Swollman	maxnipq_update();
8134910Swollman	return (0);
814148887Srwatson}
815148887Srwatson
81688723SjoergSYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLTYPE_INT|CTLFLAG_RW,
81725944Sjoerg    NULL, 0, sysctl_maxnipq, "I",
81888723Sjoerg    "Maximum number of IPv4 fragment reassembly queue entries");
81988723Sjoerg
82088723Sjoerg/*
82188723Sjoerg * Take incoming datagram fragment and try to reassemble it into
82288723Sjoerg * whole datagram.  If the argument is the first fragment or one
82388723Sjoerg * in between the function will return NULL and store the mbuf
82488723Sjoerg * in the fragment chain.  If the argument is the last fragment
82588723Sjoerg * the packet will be reassembled and the pointer to the new
82688723Sjoerg * mbuf returned for further processing.  Only m_tags attached
82788723Sjoerg * to the first packet/fragment are preserved.
82888723Sjoerg * The IP header is *NOT* adjusted out of iplen.
82988723Sjoerg */
83025944Sjoergstruct mbuf *
83125944Sjoergip_reass(struct mbuf *m)
83225944Sjoerg{
833148887Srwatson	INIT_VNET_INET(curvnet);
83425944Sjoerg	struct ip *ip;
83525944Sjoerg	struct mbuf *p, *q, *nq, *t;
83625944Sjoerg	struct ipq *fp = NULL;
83725944Sjoerg	struct ipqhead *head;
83825944Sjoerg	int i, hlen, next;
8394910Swollman	u_int8_t ecn, ecn0;
84012436Speter	u_short hash;
84140008Sjoerg
84212436Speter	/* If maxnipq or maxfragsperpacket are 0, never accept fragments. */
84312436Speter	if (V_maxnipq == 0 || V_maxfragsperpacket == 0) {
8444910Swollman		IPSTAT_INC(ips_fragments);
84542104Sphk		IPSTAT_INC(ips_fragdropped);
84642104Sphk		m_freem(m);
84742104Sphk		return (NULL);
84842104Sphk	}
84942104Sphk
85042104Sphk	ip = mtod(m, struct ip *);
85142104Sphk	hlen = ip->ip_hl << 2;
85242104Sphk
85342104Sphk	hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
85470199Sjhay	head = &V_ipq[hash];
85542104Sphk	IPQ_LOCK();
85642104Sphk
85742104Sphk	/*
85842104Sphk	 * Look for queue of fragments
859138745Srik	 * of this datagram.
86042104Sphk	 */
86142104Sphk	TAILQ_FOREACH(fp, head, ipq_list)
86242104Sphk		if (ip->ip_id == fp->ipq_id &&
86342104Sphk		    ip->ip_src.s_addr == fp->ipq_src.s_addr &&
86442104Sphk		    ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
86542104Sphk#ifdef MAC
86670199Sjhay		    mac_ipq_match(m, fp) &&
86742104Sphk#endif
86842104Sphk		    ip->ip_p == fp->ipq_p)
869130549Smlaier			goto found;
87042104Sphk
871130549Smlaier	fp = NULL;
87241686Sphk
873130549Smlaier	/*
874130549Smlaier	 * Attempt to trim the number of allocated fragment queues if it
87541686Sphk	 * exceeds the administrative limit.
87612436Speter	 */
87741686Sphk	if ((V_nipq > V_maxnipq) && (V_maxnipq > 0)) {
87841686Sphk		/*
87941686Sphk		 * drop something from the tail of the current queue
88041686Sphk		 * before proceeding further
88141686Sphk		 */
88241686Sphk		struct ipq *q = TAILQ_LAST(head, ipqhead);
88341686Sphk		if (q == NULL) {   /* gak */
88441686Sphk			for (i = 0; i < IPREASS_NHASH; i++) {
88588534Sjoerg				struct ipq *r = TAILQ_LAST(&V_ipq[i], ipqhead);
88688534Sjoerg				if (r) {
88788534Sjoerg					IPSTAT_ADD(ips_fragtimeout,
88888534Sjoerg					    r->ipq_nfrags);
889139365Srik					ip_freef(&V_ipq[i], r);
890138745Srik					break;
89188599Sjoerg				}
89288534Sjoerg			}
89388534Sjoerg		} else {
89488534Sjoerg			IPSTAT_ADD(ips_fragtimeout, q->ipq_nfrags);
89588534Sjoerg			ip_freef(head, q);
89688534Sjoerg		}
89788534Sjoerg	}
89888534Sjoerg
89988534Sjoergfound:
90088534Sjoerg	/*
90188534Sjoerg	 * Adjust ip_len to not reflect header,
90288534Sjoerg	 * convert offset of this to bytes.
90388534Sjoerg	 */
904138745Srik	ip->ip_len -= hlen;
90588534Sjoerg	if (ip->ip_off & IP_MF) {
90688534Sjoerg		/*
90788534Sjoerg		 * Make sure that fragments have a data length
9084910Swollman		 * that's a non-zero multiple of 8 bytes.
9094910Swollman		 */
9104910Swollman		if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) {
91178064Sume			IPSTAT_INC(ips_toosmall); /* XXX */
91278064Sume			goto dropfrag;
91378064Sume		}
91478064Sume		m->m_flags |= M_FRAG;
91578064Sume	} else
91678064Sume		m->m_flags &= ~M_FRAG;
917139365Srik	ip->ip_off <<= 3;
918139365Srik
919139365Srik
920139365Srik	/*
921139365Srik	 * Attempt reassembly; if it succeeds, proceed.
922139365Srik	 * ip_reass() will return a different mbuf.
923139365Srik	 */
924139365Srik	IPSTAT_INC(ips_fragments);
9254910Swollman	m->m_pkthdr.header = ip;
9264910Swollman
9274910Swollman	/* Previous ip_reass() started here. */
928111119Simp	/*
9294910Swollman	 * Presence of header sizes in mbufs
930139365Srik	 * would confuse code below.
93140008Sjoerg	 */
93240008Sjoerg	m->m_data += hlen;
93325944Sjoerg	m->m_len -= hlen;
934138745Srik
9354910Swollman	/*
9364910Swollman	 * If first fragment to arrive, create a reassembly queue.
9374910Swollman	 */
93840008Sjoerg	if (fp == NULL) {
93940008Sjoerg		fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
94040008Sjoerg		if (fp == NULL)
94140008Sjoerg			goto dropfrag;
9424910Swollman#ifdef MAC
94345152Sphk		if (mac_ipq_init(fp, M_NOWAIT) != 0) {
94428088Skjc			uma_zfree(V_ipq_zone, fp);
9454910Swollman			fp = NULL;
9464910Swollman			goto dropfrag;
9474910Swollman		}
9484910Swollman		mac_ipq_create(m, fp);
9494910Swollman#endif
9504910Swollman		TAILQ_INSERT_HEAD(head, fp, ipq_list);
9514910Swollman		V_nipq++;
9524910Swollman		fp->ipq_nfrags = 1;
9534910Swollman		fp->ipq_ttl = IPFRAGTTL;
95445152Sphk		fp->ipq_p = ip->ip_p;
95511189Sjkh		fp->ipq_id = ip->ip_id;
95611189Sjkh		fp->ipq_src = ip->ip_src;
95725955Sjoerg		fp->ipq_dst = ip->ip_dst;
95825955Sjoerg		fp->ipq_frags = m;
95925955Sjoerg		m->m_nextpkt = NULL;
96025955Sjoerg		goto done;
96125955Sjoerg	} else {
96225955Sjoerg		fp->ipq_nfrags++;
96325955Sjoerg#ifdef MAC
96425955Sjoerg		mac_ipq_update(m, fp);
96525955Sjoerg#endif
96688534Sjoerg	}
96725955Sjoerg
96825955Sjoerg#define GETIP(m)	((struct ip*)((m)->m_pkthdr.header))
96911189Sjkh
9704910Swollman	/*
9714910Swollman	 * Handle ECN by comparing this segment with the first one;
97254263Sshin	 * if CE is set, do not lose CE.
97354263Sshin	 * drop if CE and not-ECT are mixed for the same packet.
97454263Sshin	 */
97554263Sshin	ecn = ip->ip_tos & IPTOS_ECN_MASK;
97654263Sshin	ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
97778064Sume	if (ecn == IPTOS_ECN_CE) {
97878064Sume		if (ecn0 == IPTOS_ECN_NOTECT)
97978064Sume			goto dropfrag;
98078064Sume		if (ecn0 != IPTOS_ECN_CE)
98178064Sume			GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
98278064Sume	}
98378064Sume	if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
98478064Sume		goto dropfrag;
98578064Sume
98678064Sume	/*
98778064Sume	 * Find a segment which begins after this one does.
98878064Sume	 */
98954263Sshin	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)
99054263Sshin		if (GETIP(q)->ip_off > ip->ip_off)
99154263Sshin			break;
99211819Sjulian
99312495Speter	/*
99445152Sphk	 * If there is a preceding segment, it may provide some of
99512495Speter	 * our data already.  If so, drop the data from the incoming
99611819Sjulian	 * segment.  If it provides all of our data, drop us, otherwise
99711819Sjulian	 * stick new segment in the proper place.
9984910Swollman	 *
9994910Swollman	 * If some of the data is dropped from the the preceding
100025944Sjoerg	 * segment, then it's checksum is invalidated.
1001138745Srik	 */
10024910Swollman	if (p) {
10034910Swollman		i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off;
10044910Swollman		if (i > 0) {
10054910Swollman			if (i >= ip->ip_len)
10064910Swollman				goto dropfrag;
10074910Swollman			m_adj(m, i);
100888577Sjoerg			m->m_pkthdr.csum_flags = 0;
10094910Swollman			ip->ip_off += i;
1010139365Srik			ip->ip_len -= i;
1011130549Smlaier		}
1012130549Smlaier		m->m_nextpkt = p->m_nextpkt;
1013130549Smlaier		p->m_nextpkt = m;
1014130549Smlaier	} else {
1015130549Smlaier		m->m_nextpkt = fp->ipq_frags;
101625944Sjoerg		fp->ipq_frags = m;
1017138745Srik	}
1018111038Smaxim
101925955Sjoerg	/*
10204910Swollman	 * While we overlap succeeding segments trim them or,
1021138745Srik	 * if they are completely covered, dequeue them.
1022111038Smaxim	 */
102388577Sjoerg	for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off;
102488577Sjoerg	     q = nq) {
102588577Sjoerg		i = (ip->ip_off + ip->ip_len) - GETIP(q)->ip_off;
102688577Sjoerg		if (i < GETIP(q)->ip_len) {
102788577Sjoerg			GETIP(q)->ip_len -= i;
102888577Sjoerg			GETIP(q)->ip_off += i;
1029150349Sandre			m_adj(q, i);
10304910Swollman			q->m_pkthdr.csum_flags = 0;
10314910Swollman			break;
10324910Swollman		}
103325706Sjoerg		nq = q->m_nextpkt;
103425706Sjoerg		m->m_nextpkt = nq;
10354910Swollman		IPSTAT_INC(ips_fragdropped);
1036147256Sbrooks		fp->ipq_nfrags--;
10374910Swollman		m_freem(q);
1038138745Srik	}
1039138745Srik
1040138745Srik	/*
10414910Swollman	 * Check for complete reassembly and perform frag per packet
1042188668Srwatson	 * limiting.
1043138745Srik	 *
1044138745Srik	 * Frag limiting is performed here so that the nth frag has
10454910Swollman	 * a chance to complete the packet before we drop the packet.
1046147256Sbrooks	 * As a result, n+1 frags are actually allowed per packet, but
1047147256Sbrooks	 * only n will ever be stored. (n = maxfragsperpacket.)
1048147256Sbrooks	 *
104942104Sphk	 */
105042064Sphk	next = 0;
105142104Sphk	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
1052147256Sbrooks		if (GETIP(q)->ip_off != next) {
105370199Sjhay			if (fp->ipq_nfrags > V_maxfragsperpacket) {
105470199Sjhay				IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
10554910Swollman				ip_freef(head, fp);
10564910Swollman			}
105778064Sume			goto done;
105878064Sume		}
105925944Sjoerg		next += GETIP(q)->ip_len;
1060138745Srik	}
1061138745Srik	/* Make sure the last packet didn't have the IP_MF flag */
106288716Sjoerg	if (p->m_flags & M_FRAG) {
106393818Sjhb		if (fp->ipq_nfrags > V_maxfragsperpacket) {
106488716Sjoerg			IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
106593818Sjhb			ip_freef(head, fp);
1066150349Sandre		}
106788723Sjoerg		goto done;
106888723Sjoerg	}
106988723Sjoerg
107088723Sjoerg	/*
107188723Sjoerg	 * Reassembly is complete.  Make sure the packet is a sane size.
107288723Sjoerg	 */
107388723Sjoerg	q = fp->ipq_frags;
1074188668Srwatson	ip = GETIP(q);
1075138745Srik	if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
1076138745Srik		IPSTAT_INC(ips_toolong);
1077118072Sgj		IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
107888599Sjoerg		ip_freef(head, fp);
107925944Sjoerg		goto done;
108025944Sjoerg	}
108178064Sume
108230300Sjoerg	/*
108330300Sjoerg	 * Concatenate fragments.
10844910Swollman	 */
10854910Swollman	m = q;
108630300Sjoerg	t = m->m_next;
108725706Sjoerg	m->m_next = NULL;
10884910Swollman	m_cat(m, t);
1089147256Sbrooks	nq = q->m_nextpkt;
109025944Sjoerg	q->m_nextpkt = NULL;
10914910Swollman	for (q = nq; q != NULL; q = nq) {
1092138745Srik		nq = q->m_nextpkt;
10934910Swollman		q->m_nextpkt = NULL;
10944910Swollman		m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags;
1095138745Srik		m->m_pkthdr.csum_data += q->m_pkthdr.csum_data;
1096138745Srik		m_cat(m, q);
109725944Sjoerg	}
1098138745Srik	/*
1099138745Srik	 * In order to do checksumming faster we do 'end-around carry' here
1100138745Srik	 * (and not in for{} loop), though it implies we are not going to
1101138745Srik	 * reassemble more than 64k fragments.
1102138745Srik	 */
1103138745Srik	m->m_pkthdr.csum_data =
110469152Sjlemon	    (m->m_pkthdr.csum_data & 0xffff) + (m->m_pkthdr.csum_data >> 16);
110569152Sjlemon#ifdef MAC
1106138745Srik	mac_ipq_reassemble(fp, m);
11074910Swollman	mac_ipq_destroy(fp);
11084910Swollman#endif
11094910Swollman
11104910Swollman	/*
11114910Swollman	 * Create header for new ip packet by modifying header of first
1112138745Srik	 * packet;  dequeue and discard fragment reassembly header.
1113138745Srik	 * Make header visible.
11144910Swollman	 */
1115147256Sbrooks	ip->ip_len = (ip->ip_hl << 2) + next;
11164910Swollman	ip->ip_src = fp->ipq_src;
1117147256Sbrooks	ip->ip_dst = fp->ipq_dst;
111825944Sjoerg	TAILQ_REMOVE(head, fp, ipq_list);
111926018Sjoerg	V_nipq--;
11204910Swollman	uma_zfree(V_ipq_zone, fp);
11214910Swollman	m->m_len += (ip->ip_hl << 2);
1122138745Srik	m->m_data -= (ip->ip_hl << 2);
1123138745Srik	/* some debugging cruft by sklower, below, will go away soon */
1124138745Srik	if (m->m_flags & M_PKTHDR)	/* XXX this should be done elsewhere */
1125147256Sbrooks		m_fixhdr(m);
1126138745Srik	IPSTAT_INC(ips_reassembled);
1127138745Srik	IPQ_UNLOCK();
1128138745Srik	return (m);
1129138745Srik
1130138745Srikdropfrag:
1131138745Srik	IPSTAT_INC(ips_fragdropped);
11324910Swollman	if (fp != NULL)
113311189Sjkh		fp->ipq_nfrags--;
113411189Sjkh	m_freem(m);
113512820Sphkdone:
113625706Sjoerg	IPQ_UNLOCK();
113711189Sjkh	return (NULL);
1138147256Sbrooks
113925944Sjoerg#undef GETIP
114011189Sjkh}
114125944Sjoerg
1142138745Srik/*
114326018Sjoerg * Free a fragment reassembly header and all
1144147256Sbrooks * associated datagrams.
1145138745Srik */
114625944Sjoergstatic void
114711189Sjkhip_freef(struct ipqhead *fhp, struct ipq *fp)
114811189Sjkh{
114911189Sjkh	INIT_VNET_INET(curvnet);
115011189Sjkh	struct mbuf *q;
11514910Swollman
11524910Swollman	IPQ_LOCK_ASSERT();
115325706Sjoerg
115425706Sjoerg	while (fp->ipq_frags) {
11554910Swollman		q = fp->ipq_frags;
1156147256Sbrooks		fp->ipq_frags = q->m_nextpkt;
11574910Swollman		m_freem(q);
115825944Sjoerg	}
11594910Swollman	TAILQ_REMOVE(fhp, fp, ipq_list);
116025944Sjoerg	uma_zfree(V_ipq_zone, fp);
1161138745Srik	V_nipq--;
116226018Sjoerg}
116330300Sjoerg
116430300Sjoerg/*
116526018Sjoerg * IP timer processing;
116626018Sjoerg * if a timer expires on a reassembly
116726018Sjoerg * queue, discard it.
116826018Sjoerg */
116926018Sjoergvoid
1170139365Srikip_slowtimo(void)
1171139365Srik{
117226018Sjoerg	VNET_ITERATOR_DECL(vnet_iter);
117326018Sjoerg	struct ipq *fp;
1174147256Sbrooks	int i;
117526018Sjoerg
1176138745Srik	IPQ_LOCK();
117726018Sjoerg	VNET_LIST_RLOCK();
117826018Sjoerg	VNET_FOREACH(vnet_iter) {
11794910Swollman		CURVNET_SET(vnet_iter);
11804910Swollman		INIT_VNET_INET(vnet_iter);
11814910Swollman		for (i = 0; i < IPREASS_NHASH; i++) {
118230300Sjoerg			for(fp = TAILQ_FIRST(&V_ipq[i]); fp;) {
118330300Sjoerg				struct ipq *fpp;
118430300Sjoerg
118530300Sjoerg				fpp = fp;
118630300Sjoerg				fp = TAILQ_NEXT(fp, ipq_list);
1187147256Sbrooks				if(--fpp->ipq_ttl == 0) {
118830300Sjoerg					IPSTAT_ADD(ips_fragtimeout,
118930300Sjoerg					    fpp->ipq_nfrags);
119030300Sjoerg					ip_freef(&V_ipq[i], fpp);
1191138745Srik				}
1192138745Srik			}
119330300Sjoerg		}
119430300Sjoerg		/*
119530300Sjoerg		 * If we are over the maximum number of fragments
1196138745Srik		 * (due to the limit being lowered), drain off
1197139365Srik		 * enough to get down to the new limit.
1198139365Srik		 */
119930300Sjoerg		if (V_maxnipq >= 0 && V_nipq > V_maxnipq) {
1200147256Sbrooks			for (i = 0; i < IPREASS_NHASH; i++) {
1201138745Srik				while (V_nipq > V_maxnipq &&
120230300Sjoerg				    !TAILQ_EMPTY(&V_ipq[i])) {
120330300Sjoerg					IPSTAT_ADD(ips_fragdropped,
120430300Sjoerg					    TAILQ_FIRST(&V_ipq[i])->ipq_nfrags);
120530300Sjoerg					ip_freef(&V_ipq[i],
120630300Sjoerg					    TAILQ_FIRST(&V_ipq[i]));
120725944Sjoerg				}
12084910Swollman			}
120925944Sjoerg		}
121042104Sphk		CURVNET_RESTORE();
12114910Swollman	}
121225944Sjoerg	VNET_LIST_RUNLOCK();
1213147256Sbrooks	IPQ_UNLOCK();
121430300Sjoerg}
12154910Swollman
121625944Sjoerg/*
1217138745Srik * Drain off all datagram fragments.
121830300Sjoerg */
121925944Sjoergvoid
122025944Sjoergip_drain(void)
122125944Sjoerg{
122225944Sjoerg	VNET_ITERATOR_DECL(vnet_iter);
12234910Swollman	int     i;
122425944Sjoerg
122588503Sjoerg	IPQ_LOCK();
122688503Sjoerg	VNET_LIST_RLOCK();
1227102412Scharnier	VNET_FOREACH(vnet_iter) {
122811189Sjkh		CURVNET_SET(vnet_iter);
122925944Sjoerg		INIT_VNET_INET(vnet_iter);
123025944Sjoerg		for (i = 0; i < IPREASS_NHASH; i++) {
1231148887Srwatson			while(!TAILQ_EMPTY(&V_ipq[i])) {
123225944Sjoerg				IPSTAT_ADD(ips_fragdropped,
1233148887Srwatson				    TAILQ_FIRST(&V_ipq[i])->ipq_nfrags);
123445152Sphk				ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i]));
123545152Sphk			}
123645152Sphk		}
123745152Sphk		CURVNET_RESTORE();
123845152Sphk	}
123945152Sphk	VNET_LIST_RUNLOCK();
124045152Sphk	IPQ_UNLOCK();
124145152Sphk	in_rtqdrain();
124245152Sphk}
1243139365Srik
1244139365Srik/*
1245139365Srik * The protocol to be inserted into ip_protox[] must be already registered
124645152Sphk * in inetsw[], either statically or through pf_proto_register().
124745152Sphk */
124845152Sphkint
1249148887Srwatsonipproto_register(u_char ipproto)
12504910Swollman{
12514910Swollman	struct protosw *pr;
125245152Sphk
1253139365Srik	/* Sanity checks. */
1254139365Srik	if (ipproto == 0)
125545152Sphk		return (EPROTONOSUPPORT);
125645152Sphk
125745152Sphk	/*
1258138745Srik	 * The protocol slot must not be occupied by another protocol
1259148887Srwatson	 * already.  An index pointing to IPPROTO_RAW is unused.
126045152Sphk	 */
126126018Sjoerg	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
12624910Swollman	if (pr == NULL)
126345152Sphk		return (EPFNOSUPPORT);
1264139365Srik	if (ip_protox[ipproto] != pr - inetsw)	/* IPPROTO_RAW */
1265139365Srik		return (EEXIST);
126645152Sphk
126745152Sphk	/* Find the protocol position in inetsw[] and set the index. */
126845152Sphk	for (pr = inetdomain.dom_protosw;
1269148887Srwatson	     pr < inetdomain.dom_protoswNPROTOSW; pr++) {
127045152Sphk		if (pr->pr_domain->dom_family == PF_INET &&
127145152Sphk		    pr->pr_protocol && pr->pr_protocol == ipproto) {
1272139365Srik			/* Be careful to only index valid IP protocols. */
1273139365Srik			if (pr->pr_protocol < IPPROTO_MAX) {
127445152Sphk				ip_protox[pr->pr_protocol] = pr - inetsw;
127545152Sphk				return (0);
1276148887Srwatson			} else
127745152Sphk				return (EINVAL);
127845152Sphk		}
127945152Sphk	}
12804910Swollman	return (EPROTONOSUPPORT);
128111189Sjkh}
128225944Sjoerg
128325944Sjoergint
128425944Sjoergipproto_unregister(u_char ipproto)
128525944Sjoerg{
128625944Sjoerg	struct protosw *pr;
128725944Sjoerg
128825944Sjoerg	/* Sanity checks. */
128925944Sjoerg	if (ipproto == 0)
12904910Swollman		return (EPROTONOSUPPORT);
129125944Sjoerg
129225944Sjoerg	/* Check if the protocol was indeed registered. */
129325944Sjoerg	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
129425944Sjoerg	if (pr == NULL)
129525944Sjoerg		return (EPFNOSUPPORT);
129625944Sjoerg	if (ip_protox[ipproto] == pr - inetsw)  /* IPPROTO_RAW */
12974910Swollman		return (ENOENT);
129825944Sjoerg
129925944Sjoerg	/* Reset the protocol slot to IPPROTO_RAW. */
130025944Sjoerg	ip_protox[ipproto] = pr - inetsw;
130125944Sjoerg	return (0);
130211189Sjkh}
130325944Sjoerg
130425944Sjoerg/*
130525944Sjoerg * Given address of next destination (final or next hop),
130625944Sjoerg * return internet address info of interface to be used to get there.
13074910Swollman */
130825944Sjoergstruct in_ifaddr *
130925944Sjoergip_rtaddr(struct in_addr dst, u_int fibnum)
131025944Sjoerg{
13114910Swollman	struct route sro;
131211189Sjkh	struct sockaddr_in *sin;
131330300Sjoerg	struct in_ifaddr *ifa;
131430300Sjoerg
131530300Sjoerg	bzero(&sro, sizeof(sro));
131630300Sjoerg	sin = (struct sockaddr_in *)&sro.ro_dst;
131730300Sjoerg	sin->sin_family = AF_INET;
131825944Sjoerg	sin->sin_len = sizeof(*sin);
131930300Sjoerg	sin->sin_addr = dst;
13204910Swollman	in_rtalloc_ign(&sro, 0, fibnum);
1321138745Srik
132225944Sjoerg	if (sro.ro_rt == NULL)
132330300Sjoerg		return (NULL);
13244910Swollman
13254910Swollman	ifa = ifatoia(sro.ro_rt->rt_ifa);
132670199Sjhay	RTFREE(sro.ro_rt);
132725944Sjoerg	return (ifa);
132825944Sjoerg}
132925944Sjoerg
13304910Swollmanu_char inetctlerrmap[PRC_NCMDS] = {
13314910Swollman	0,		0,		0,		0,
13324910Swollman	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
133330300Sjoerg	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
133425706Sjoerg	EMSGSIZE,	EHOSTUNREACH,	0,		0,
13354910Swollman	0,		0,		EHOSTUNREACH,	0,
133625944Sjoerg	ENOPROTOOPT,	ECONNREFUSED
13374910Swollman};
133830300Sjoerg
13394910Swollman/*
134027929Sitojun * Forward a packet.  If some error occurs return the sender
134125706Sjoerg * an icmp packet.  Note we can't always generate a meaningful
134225706Sjoerg * icmp message because icmp doesn't have a large enough repertoire
134340008Sjoerg * of codes and types.
134440008Sjoerg *
13454910Swollman * If not forwarding, just drop the packet.  This could be confusing
13464910Swollman * if ipforwarding was zero but some routing protocol was advancing
13474910Swollman * us as a gateway to somewhere.  However, we must let the routing
134825706Sjoerg * protocol deal with that.
134925706Sjoerg *
135040008Sjoerg * The srcrt parameter indicates whether the packet is being forwarded
135125706Sjoerg * via a source route.
135240008Sjoerg */
135340008Sjoergvoid
135440008Sjoergip_forward(struct mbuf *m, int srcrt)
13554910Swollman{
13564910Swollman	INIT_VNET_INET(curvnet);
135725706Sjoerg	struct ip *ip = mtod(m, struct ip *);
135869211Sphk	struct in_ifaddr *ia = NULL;
135940008Sjoerg	struct mbuf *mcopy;
13604910Swollman	struct in_addr dest;
13614910Swollman	struct route ro;
13624910Swollman	int error, type = 0, code = 0, mtu = 0;
13634910Swollman
13644910Swollman	if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
13654910Swollman		IPSTAT_INC(ips_cantforward);
136678064Sume		m_freem(m);
136778064Sume		return;
13684910Swollman	}
13694910Swollman#ifdef IPSTEALTH
137011189Sjkh	if (!V_ipstealth) {
137140008Sjoerg#endif
137240008Sjoerg		if (ip->ip_ttl <= IPTTLDEC) {
137311189Sjkh			icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
137411189Sjkh			    0, 0);
137511189Sjkh			return;
137626018Sjoerg		}
137711189Sjkh#ifdef IPSTEALTH
137811189Sjkh	}
13794910Swollman#endif
13804910Swollman
13814910Swollman	ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m));
138278064Sume	if (!srcrt && ia == NULL) {
138311189Sjkh		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
138411189Sjkh		return;
138530300Sjoerg	}
138611189Sjkh
1387148887Srwatson	/*
138830300Sjoerg	 * Save the IP header and at most 8 bytes of the payload,
138940008Sjoerg	 * in case we need to generate an ICMP message to the src.
139011189Sjkh	 *
13914910Swollman	 * XXX this can be optimized a lot by saving the data in a local
13924910Swollman	 * buffer on the stack (72 bytes at most), and only allocating the
139330300Sjoerg	 * mbuf if really necessary. The vast majority of the packets
139430300Sjoerg	 * are forwarded without having to send an ICMP back (either
139530300Sjoerg	 * because unnecessary, or because rate limited), so we are
13964910Swollman	 * really we are wasting a lot of work here.
13974910Swollman	 *
13984910Swollman	 * We don't use m_copy() because it might return a reference
13994910Swollman	 * to a shared cluster. Both this function and ip_output()
14004910Swollman	 * assume exclusive access to the IP header in `m', so any
140125944Sjoerg	 * data in a cluster may change before we reach icmp_error().
14024910Swollman	 */
140312820Sphk	MGETHDR(mcopy, M_DONTWAIT, m->m_type);
140425944Sjoerg	if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_DONTWAIT)) {
140525944Sjoerg		/*
140625944Sjoerg		 * It's probably ok if the pkthdr dup fails (because
140725944Sjoerg		 * the deep copy of the tag chain failed), but for now
140825944Sjoerg		 * be conservative and just discard the copy since
140925944Sjoerg		 * code below may some day want the tags.
141035029Sphk		 */
141125944Sjoerg		m_free(mcopy);
141236119Sphk		mcopy = NULL;
141370199Sjhay	}
1414111119Simp	if (mcopy != NULL) {
141525944Sjoerg		mcopy->m_len = min(ip->ip_len, M_TRAILINGSPACE(mcopy));
141625944Sjoerg		mcopy->m_pkthdr.len = mcopy->m_len;
141725944Sjoerg		m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t));
141825944Sjoerg	}
141925944Sjoerg
142025944Sjoerg#ifdef IPSTEALTH
142125944Sjoerg	if (!V_ipstealth) {
142225944Sjoerg#endif
142325944Sjoerg		ip->ip_ttl -= IPTTLDEC;
142425944Sjoerg#ifdef IPSTEALTH
142525944Sjoerg	}
142625944Sjoerg#endif
142725944Sjoerg
142825944Sjoerg	/*
142925944Sjoerg	 * If forwarding packet using same interface that it came in on,
143040008Sjoerg	 * perhaps should send a redirect to sender to shortcut a hop.
143135029Sphk	 * Only send redirect if source is sending directly to us,
143235029Sphk	 * and if packet was not source routed (or has any options).
143325944Sjoerg	 * Also, don't send redirect if forwarding using a default route
143425944Sjoerg	 * or a route modified by a redirect.
143525944Sjoerg	 */
143640008Sjoerg	dest.s_addr = 0;
143740008Sjoerg	if (!srcrt && V_ipsendredirects && ia->ia_ifp == m->m_pkthdr.rcvif) {
143840008Sjoerg		struct sockaddr_in *sin;
143925944Sjoerg		struct rtentry *rt;
144069152Sjlemon
144169152Sjlemon		bzero(&ro, sizeof(ro));
144225944Sjoerg		sin = (struct sockaddr_in *)&ro.ro_dst;
144325944Sjoerg		sin->sin_family = AF_INET;
144470199Sjhay		sin->sin_len = sizeof(*sin);
144525944Sjoerg		sin->sin_addr = ip->ip_dst;
144625944Sjoerg		in_rtalloc_ign(&ro, 0, M_GETFIB(m));
144725944Sjoerg
144825944Sjoerg		rt = ro.ro_rt;
144925944Sjoerg
145025944Sjoerg		if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
145125944Sjoerg		    satosin(rt_key(rt))->sin_addr.s_addr != 0) {
145225706Sjoerg#define	RTA(rt)	((struct in_ifaddr *)(rt->rt_ifa))
145325706Sjoerg			u_long src = ntohl(ip->ip_src.s_addr);
14544910Swollman
145525944Sjoerg			if (RTA(rt) &&
14564910Swollman			    (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) {
14574910Swollman				if (rt->rt_flags & RTF_GATEWAY)
14584910Swollman					dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr;
14594910Swollman				else
14604910Swollman					dest.s_addr = ip->ip_dst.s_addr;
14614910Swollman				/* Router requirements says to only send host redirects */
1462111119Simp				type = ICMP_REDIRECT;
14634910Swollman				code = ICMP_REDIRECT_HOST;
14644910Swollman			}
14654910Swollman		}
14664910Swollman		if (rt)
14674910Swollman			RTFREE(rt);
14684910Swollman	}
14694910Swollman
14704910Swollman	/*
14714910Swollman	 * Try to cache the route MTU from ip_output so we can consider it for
14724910Swollman	 * the ICMP_UNREACH_NEEDFRAG "Next-Hop MTU" field described in RFC1191.
14734910Swollman	 */
14744910Swollman	bzero(&ro, sizeof(ro));
14754910Swollman
14764910Swollman	error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL);
14774910Swollman
14784910Swollman	if (error == EMSGSIZE && ro.ro_rt)
14794910Swollman		mtu = ro.ro_rt->rt_rmx.rmx_mtu;
148025706Sjoerg	if (ro.ro_rt)
148140008Sjoerg		RTFREE(ro.ro_rt);
148240008Sjoerg
148325944Sjoerg	if (error)
148425944Sjoerg		IPSTAT_INC(ips_cantforward);
148525944Sjoerg	else {
148644145Sphk		IPSTAT_INC(ips_forward);
148769211Sphk		if (type)
14884910Swollman			IPSTAT_INC(ips_redirectsent);
148969152Sjlemon		else {
149069152Sjlemon			if (mcopy)
14914910Swollman				m_freem(mcopy);
14924910Swollman			return;
14934910Swollman		}
149425944Sjoerg	}
14954910Swollman	if (mcopy == NULL)
149612820Sphk		return;
149725944Sjoerg
14984910Swollman	switch (error) {
149925944Sjoerg
150025944Sjoerg	case 0:				/* forwarded, but need redirect */
150125944Sjoerg		/* type, code set above */
150225944Sjoerg		break;
150325944Sjoerg
15044910Swollman	case ENETUNREACH:		/* shouldn't happen, checked above */
150525944Sjoerg	case EHOSTUNREACH:
150625944Sjoerg	case ENETDOWN:
150725944Sjoerg	case EHOSTDOWN:
150840008Sjoerg	default:
150940008Sjoerg		type = ICMP_UNREACH;
15104910Swollman		code = ICMP_UNREACH_HOST;
151125944Sjoerg		break;
151225944Sjoerg
151325944Sjoerg	case EMSGSIZE:
151425944Sjoerg		type = ICMP_UNREACH;
151540008Sjoerg		code = ICMP_UNREACH_NEEDFRAG;
151640008Sjoerg
151725944Sjoerg#ifdef IPSEC
151825944Sjoerg		/*
151944145Sphk		 * If IPsec is configured for this path,
152069211Sphk		 * override any possibly mtu value set by ip_output.
152125944Sjoerg		 */
152225944Sjoerg		mtu = ip_ipsec_mtu(m, mtu);
152325944Sjoerg#endif /* IPSEC */
152430300Sjoerg		/*
152525944Sjoerg		 * If the MTU was set before make sure we are below the
152625944Sjoerg		 * interface MTU.
152725944Sjoerg		 * If the MTU wasn't set before use the interface mtu or
152825944Sjoerg		 * fall back to the next smaller mtu step compared to the
152969211Sphk		 * current packet size.
153040008Sjoerg		 */
153125944Sjoerg		if (mtu != 0) {
153225944Sjoerg			if (ia != NULL)
153325944Sjoerg				mtu = min(mtu, ia->ia_ifp->if_mtu);
153425944Sjoerg		} else {
153530300Sjoerg			if (ia != NULL)
153630300Sjoerg				mtu = ia->ia_ifp->if_mtu;
153730300Sjoerg			else
153830300Sjoerg				mtu = ip_next_mtu(ip->ip_len, 0);
153930300Sjoerg		}
154030300Sjoerg		IPSTAT_INC(ips_cantfrag);
154130300Sjoerg		break;
154230300Sjoerg
154330300Sjoerg	case ENOBUFS:
154430300Sjoerg		/*
154525944Sjoerg		 * A router should not generate ICMP_SOURCEQUENCH as
154625944Sjoerg		 * required in RFC1812 Requirements for IP Version 4 Routers.
154725944Sjoerg		 * Source quench could be a big problem under DoS attacks,
154825944Sjoerg		 * or if the underlying interface is rate-limited.
154925944Sjoerg		 * Those who need source quench packets may re-enable them
1550102412Scharnier		 * via the net.inet.ip.sendsourcequench sysctl.
155125944Sjoerg		 */
155225944Sjoerg		if (V_ip_sendsourcequench == 0) {
155370199Sjhay			m_freem(mcopy);
155470199Sjhay			return;
155570199Sjhay		} else {
155670199Sjhay			type = ICMP_SOURCEQUENCH;
155770199Sjhay			code = 0;
155870199Sjhay		}
155970199Sjhay		break;
156070199Sjhay
156170199Sjhay	case EACCES:			/* ipfw denied packet */
156270199Sjhay		m_freem(mcopy);
156325944Sjoerg		return;
156425944Sjoerg	}
156525944Sjoerg	icmp_error(mcopy, type, code, dest.s_addr, mtu);
156625944Sjoerg}
156725944Sjoerg
156825944Sjoergvoid
156925944Sjoergip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
157025944Sjoerg    struct mbuf *m)
157125944Sjoerg{
157225944Sjoerg	INIT_VNET_NET(inp->inp_vnet);
157325944Sjoerg
157425944Sjoerg	if (inp->inp_socket->so_options & (SO_BINTIME | SO_TIMESTAMP)) {
157525944Sjoerg		struct bintime bt;
157640008Sjoerg
157740008Sjoerg		bintime(&bt);
157826077Sjoerg		if (inp->inp_socket->so_options & SO_BINTIME) {
157925944Sjoerg			*mp = sbcreatecontrol((caddr_t) &bt, sizeof(bt),
158025944Sjoerg			SCM_BINTIME, SOL_SOCKET);
158125944Sjoerg			if (*mp)
158225944Sjoerg				mp = &(*mp)->m_next;
158325944Sjoerg		}
158440008Sjoerg		if (inp->inp_socket->so_options & SO_TIMESTAMP) {
158540008Sjoerg			struct timeval tv;
158625944Sjoerg
158725944Sjoerg			bintime2timeval(&bt, &tv);
158825944Sjoerg			*mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
158925944Sjoerg				SCM_TIMESTAMP, SOL_SOCKET);
159025944Sjoerg			if (*mp)
159125944Sjoerg				mp = &(*mp)->m_next;
159225944Sjoerg		}
159325944Sjoerg	}
159469211Sphk	if (inp->inp_flags & INP_RECVDSTADDR) {
159540008Sjoerg		*mp = sbcreatecontrol((caddr_t) &ip->ip_dst,
159625944Sjoerg		    sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP);
159725944Sjoerg		if (*mp)
159825944Sjoerg			mp = &(*mp)->m_next;
159925944Sjoerg	}
160025944Sjoerg	if (inp->inp_flags & INP_RECVTTL) {
160125944Sjoerg		*mp = sbcreatecontrol((caddr_t) &ip->ip_ttl,
160225944Sjoerg		    sizeof(u_char), IP_RECVTTL, IPPROTO_IP);
160325944Sjoerg		if (*mp)
160425944Sjoerg			mp = &(*mp)->m_next;
160525944Sjoerg	}
160625944Sjoerg#ifdef notyet
160725944Sjoerg	/* XXX
160825944Sjoerg	 * Moving these out of udp_input() made them even more broken
160925944Sjoerg	 * than they already were.
161025944Sjoerg	 */
161125944Sjoerg	/* options were tossed already */
161225944Sjoerg	if (inp->inp_flags & INP_RECVOPTS) {
161325944Sjoerg		*mp = sbcreatecontrol((caddr_t) opts_deleted_above,
1614102412Scharnier		    sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP);
161525944Sjoerg		if (*mp)
161625944Sjoerg			mp = &(*mp)->m_next;
161725944Sjoerg	}
161825944Sjoerg	/* ip_srcroute doesn't do what we want here, need to fix */
161925944Sjoerg	if (inp->inp_flags & INP_RECVRETOPTS) {
162025944Sjoerg		*mp = sbcreatecontrol((caddr_t) ip_srcroute(m),
162125944Sjoerg		    sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP);
162225944Sjoerg		if (*mp)
162340008Sjoerg			mp = &(*mp)->m_next;
162440008Sjoerg	}
162525944Sjoerg#endif
162625944Sjoerg	if (inp->inp_flags & INP_RECVIF) {
162725944Sjoerg		struct ifnet *ifp;
162840008Sjoerg		struct sdlbuf {
162940008Sjoerg			struct sockaddr_dl sdl;
163025944Sjoerg			u_char	pad[32];
163125944Sjoerg		} sdlbuf;
163225944Sjoerg		struct sockaddr_dl *sdp;
163325944Sjoerg		struct sockaddr_dl *sdl2 = &sdlbuf.sdl;
163425944Sjoerg
163525944Sjoerg		if (((ifp = m->m_pkthdr.rcvif))
163625944Sjoerg		&& ( ifp->if_index && (ifp->if_index <= V_if_index))) {
163725944Sjoerg			sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr;
163825944Sjoerg			/*
163969211Sphk			 * Change our mind and don't try copy.
164040008Sjoerg			 */
164125944Sjoerg			if ((sdp->sdl_family != AF_LINK)
164225944Sjoerg			|| (sdp->sdl_len > sizeof(sdlbuf))) {
164325944Sjoerg				goto makedummy;
164425944Sjoerg			}
164525944Sjoerg			bcopy(sdp, sdl2, sdp->sdl_len);
164625944Sjoerg		} else {
164725944Sjoergmakedummy:
164825944Sjoerg			sdl2->sdl_len
16494910Swollman				= offsetof(struct sockaddr_dl, sdl_data[0]);
165025944Sjoerg			sdl2->sdl_family = AF_LINK;
165125944Sjoerg			sdl2->sdl_index = 0;
165225944Sjoerg			sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0;
165325944Sjoerg		}
165425944Sjoerg		*mp = sbcreatecontrol((caddr_t) sdl2, sdl2->sdl_len,
165525944Sjoerg			IP_RECVIF, IPPROTO_IP);
165625944Sjoerg		if (*mp)
165725944Sjoerg			mp = &(*mp)->m_next;
165870199Sjhay	}
165970199Sjhay}
166070199Sjhay
166170199Sjhay/*
166270199Sjhay * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the
166370199Sjhay * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on
166470199Sjhay * locking.  This code remains in ip_input.c as ip_mroute.c is optionally
166525944Sjoerg * compiled.
166625944Sjoerg */
166725944Sjoergint
166825944Sjoergip_rsvp_init(struct socket *so)
1669102412Scharnier{
167025944Sjoerg	INIT_VNET_INET(so->so_vnet);
167152633Sjoerg
167225944Sjoerg	if (so->so_type != SOCK_RAW ||
167325944Sjoerg	    so->so_proto->pr_protocol != IPPROTO_RSVP)
167425944Sjoerg		return EOPNOTSUPP;
167525944Sjoerg
167625944Sjoerg	if (V_ip_rsvpd != NULL)
167725944Sjoerg		return EADDRINUSE;
167840008Sjoerg
167940008Sjoerg	V_ip_rsvpd = so;
168025944Sjoerg	/*
168125944Sjoerg	 * This may seem silly, but we need to be sure we don't over-increment
168225944Sjoerg	 * the RSVP counter, in case something slips up.
168325944Sjoerg	 */
168425944Sjoerg	if (!V_ip_rsvp_on) {
16854910Swollman		V_ip_rsvp_on = 1;
168625944Sjoerg		V_rsvp_on++;
168725944Sjoerg	}
168825944Sjoerg
168925944Sjoerg	return 0;
169025944Sjoerg}
1691102412Scharnier
169225944Sjoergint
169325944Sjoergip_rsvp_done(void)
169425944Sjoerg{
169525944Sjoerg	INIT_VNET_INET(curvnet);
169625944Sjoerg
169725944Sjoerg	V_ip_rsvpd = NULL;
169825944Sjoerg	/*
169925944Sjoerg	 * This may seem silly, but we need to be sure we don't over-decrement
170040008Sjoerg	 * the RSVP counter, in case something slips up.
170140008Sjoerg	 */
170225944Sjoerg	if (V_ip_rsvp_on) {
170325944Sjoerg		V_ip_rsvp_on = 0;
170425944Sjoerg		V_rsvp_on--;
170525944Sjoerg	}
170625944Sjoerg	return 0;
170725944Sjoerg}
170825944Sjoerg
170925944Sjoergvoid
171025944Sjoergrsvp_input(struct mbuf *m, int off)	/* XXX must fixup manually */
171140008Sjoerg{
171240008Sjoerg	INIT_VNET_INET(curvnet);
171325944Sjoerg
171425944Sjoerg	if (rsvp_input_p) { /* call the real one if loaded */
171525944Sjoerg		rsvp_input_p(m, off);
171625944Sjoerg		return;
171725944Sjoerg	}
171825944Sjoerg
171925944Sjoerg	/* Can still get packets with rsvp_on = 0 if there is a local member
172025944Sjoerg	 * of the group to which the RSVP packet is addressed.  But in this
172125944Sjoerg	 * case we want to throw the packet away.
172225944Sjoerg	 */
172325944Sjoerg
172425944Sjoerg	if (!V_rsvp_on) {
172525944Sjoerg		m_freem(m);
172641881Sphk		return;
172725944Sjoerg	}
172825944Sjoerg
172925944Sjoerg	if (V_ip_rsvpd != NULL) {
173041881Sphk		rip_input(m, off);
173125944Sjoerg		return;
173225944Sjoerg	}
173325944Sjoerg	/* Drop the packet */
173425944Sjoerg	m_freem(m);
173525944Sjoerg}
173625944Sjoerg