ip_input.c revision 191314
/*-
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
2831742Seivind * 2931742Seivind * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 304952Sbde */ 314952Sbde 3270199Sjhay#include <sys/cdefs.h> 3324204Sbde__FBSDID("$FreeBSD: head/sys/netinet/ip_input.c 191314 2009-04-20 14:35:42Z rwatson $"); 344910Swollman 3525706Sjoerg#include "opt_bootp.h" 3659604Sobrien#include "opt_ipfw.h" 3729024Sbde#include "opt_ipstealth.h" 384910Swollman#include "opt_ipsec.h" 3940008Sjoerg#include "opt_route.h" 4030300Sjoerg#include "opt_mac.h" 414910Swollman#include "opt_carp.h" 424910Swollman 434910Swollman#include <sys/param.h> 444910Swollman#include <sys/systm.h> 4542104Sphk#include <sys/callout.h> 46196019Srwatson#include <sys/mbuf.h> 4788534Sjoerg#include <sys/malloc.h> 4888534Sjoerg#include <sys/domain.h> 4988534Sjoerg#include <sys/protosw.h> 5088534Sjoerg#include <sys/socket.h> 514910Swollman#include <sys/time.h> 5230300Sjoerg#include <sys/kernel.h> 5330300Sjoerg#include <sys/lock.h> 544910Swollman#include <sys/rwlock.h> 5588705Sjoerg#include <sys/syslog.h> 5688705Sjoerg#include <sys/sysctl.h> 574910Swollman#include <sys/vimage.h> 584910Swollman 594910Swollman#include <net/pfil.h> 604910Swollman#include <net/if.h> 61148385Sume#include <net/if_types.h> 62148385Sume#include <net/if_var.h> 63148385Sume#include <net/if_dl.h> 64148385Sume#include <net/route.h> 65182121Simp#include <net/netisr.h> 6688705Sjoerg#include <net/vnet.h> 6711819Sjulian#include <net/flowtable.h> 6811819Sjulian 6911819Sjulian#include <netinet/in.h> 7011819Sjulian#include <netinet/in_systm.h> 7111819Sjulian#include <netinet/in_var.h> 724910Swollman#include <netinet/ip.h> 734910Swollman#include <netinet/in_pcb.h> 74182121Simp#include <netinet/ip_var.h> 754910Swollman#include <netinet/ip_icmp.h> 764910Swollman#include <netinet/ip_options.h> 7725944Sjoerg#include <machine/in_cksum.h> 7825944Sjoerg#include <netinet/vinet.h> 7925944Sjoerg#ifdef DEV_CARP 8025955Sjoerg#include <netinet/ip_carp.h> 8125944Sjoerg#endif 8225944Sjoerg#ifdef IPSEC 8325944Sjoerg#include 
<netinet/ip_ipsec.h> 8425955Sjoerg#endif /* IPSEC */ 8525955Sjoerg 8625955Sjoerg#include <sys/socketvar.h> 8730300Sjoerg 8830300Sjoerg/* XXX: Temporary until ipfw_ether and ipfw_bridge are converted. */ 8930300Sjoerg#include <netinet/ip_fw.h> 9030300Sjoerg#include <netinet/ip_dummynet.h> 9130300Sjoerg 9230300Sjoerg#include <security/mac/mac_framework.h> 9330300Sjoerg 9430300Sjoerg#ifdef CTASSERT 9530300SjoergCTASSERT(sizeof(struct ip) == 20); 9625944Sjoerg#endif 9725944Sjoerg 9825955Sjoerg#ifndef VIMAGE 9925955Sjoerg#ifndef VIMAGE_GLOBALS 10045152Sphkstruct vnet_inet vnet_inet_0; 10125944Sjoerg#endif 10230300Sjoerg#endif 10330300Sjoerg 10430300Sjoerg#ifdef VIMAGE_GLOBALS 10530300Sjoergstatic int ipsendredirects; 10630300Sjoergstatic int ip_checkinterface; 10730300Sjoergstatic int ip_keepfaith; 10888534Sjoergstatic int ip_sendsourcequench; 10988534Sjoergint ip_defttl; 11078064Sumeint ip_do_randomid; 11130300Sjoergint ipforwarding; 11230300Sjoergstruct in_ifaddrhead in_ifaddrhead; /* first inet address */ 11330300Sjoergstruct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ 11430300Sjoergu_long in_ifaddrhmask; /* mask for hash table */ 11578064Sumestruct ipstat ipstat; 1164910Swollmanstatic int ip_rsvp_on; 11725944Sjoergstruct socket *ip_rsvpd; 11825944Sjoergint rsvp_on; 11925944Sjoergstatic struct ipqhead ipq[IPREASS_NHASH]; 12025944Sjoergstatic int maxnipq; /* Administrative limit on # reass queues. 
*/ 12125944Sjoergstatic int maxfragsperpacket; 12225944Sjoergint ipstealth; 12325944Sjoergstatic int nipq; /* Total # of reass queues */ 12425944Sjoerg#endif 12525944Sjoerg 12625944SjoergSYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_FORWARDING, 12725944Sjoerg forwarding, CTLFLAG_RW, ipforwarding, 0, 1284910Swollman "Enable IP forwarding between interfaces"); 12930300Sjoerg 13030300SjoergSYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_SENDREDIRECTS, 13130300Sjoerg redirect, CTLFLAG_RW, ipsendredirects, 0, 13230300Sjoerg "Enable sending IP redirects"); 13330300Sjoerg 13430300SjoergSYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_DEFTTL, 13530300Sjoerg ttl, CTLFLAG_RW, ip_defttl, 0, "Maximum TTL on IP packets"); 13630300Sjoerg 1374910SwollmanSYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_KEEPFAITH, 13825944Sjoerg keepfaith, CTLFLAG_RW, ip_keepfaith, 0, 13925944Sjoerg "Enable packet capture for FAITH IPv4->IPv6 translater daemon"); 14025944Sjoerg 1414910SwollmanSYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, 14278064Sume sendsourcequench, CTLFLAG_RW, ip_sendsourcequench, 0, 14378064Sume "Enable the transmission of source quench packets"); 14478064Sume 14588534SjoergSYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, random_id, 14688534Sjoerg CTLFLAG_RW, ip_do_randomid, 0, "Assign random ip_id values"); 14730300Sjoerg 14830300Sjoerg/* 14930300Sjoerg * XXX - Setting ip_checkinterface mostly implements the receive side of 1504910Swollman * the Strong ES model described in RFC 1122, but since the routing table 15130300Sjoerg * and transmit implementation do not implement the Strong ES model, 15230300Sjoerg * setting this to 1 results in an odd hybrid. 15330300Sjoerg * 15430300Sjoerg * XXX - ip_checkinterface currently must be disabled if you use ipnat 15530300Sjoerg * to translate the destination address to another local interface. 
15630300Sjoerg * 15730300Sjoerg * XXX - ip_checkinterface must be disabled if you add IP aliases 15830300Sjoerg * to the loopback interface instead of the interface where the 15930300Sjoerg * packets for those addresses are received. 16030300Sjoerg */ 16130300SjoergSYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, 16230300Sjoerg check_interface, CTLFLAG_RW, ip_checkinterface, 0, 16330300Sjoerg "Verify packet arrives on correct interface"); 16430300Sjoerg 16525944Sjoergstruct pfil_head inet_pfil_hook; /* Packet filter hooks */ 16625944Sjoerg 16725944Sjoergstatic struct ifqueue ipintrq; 16825944Sjoergstatic int ipqmaxlen = IFQ_MAXLEN; 16925944Sjoerg 17025944Sjoergextern struct domain inetdomain; 17125944Sjoergextern struct protosw inetsw[]; 17225944Sjoergu_char ip_protox[IPPROTO_MAX]; 17325944Sjoerg 17425944SjoergSYSCTL_INT(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLFLAG_RW, 17525944Sjoerg &ipintrq.ifq_maxlen, 0, "Maximum size of the IP input queue"); 17625944SjoergSYSCTL_INT(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLFLAG_RD, 177147256Sbrooks &ipintrq.ifq_drops, 0, 178147256Sbrooks "Number of packets dropped from the IP input queue"); 1794910Swollman 18011189SjkhSYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RW, 18111189Sjkh ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); 18211189Sjkh 183103842Salfred#ifdef VIMAGE_GLOBALS 1844910Swollmanstatic uma_zone_t ipq_zone; 1854910Swollman#endif 1864910Swollmanstatic struct mtx ipqlock; 18711189Sjkh 18811189Sjkh#define IPQ_LOCK() mtx_lock(&ipqlock) 18911189Sjkh#define IPQ_UNLOCK() mtx_unlock(&ipqlock) 190103842Salfred#define IPQ_LOCK_INIT() mtx_init(&ipqlock, "ipqlock", NULL, MTX_DEF) 1914910Swollman#define IPQ_LOCK_ASSERT() mtx_assert(&ipqlock, MA_OWNED) 1924910Swollman 1934910Swollmanstatic void maxnipq_update(void); 19411189Sjkhstatic void ipq_zone_change(void *); 19511189Sjkh 19611189SjkhSYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, 
fragpackets, 19711189Sjkh CTLFLAG_RD, nipq, 0, 19811189Sjkh "Current number of IPv4 fragment reassembly queue entries"); 19911189Sjkh 200103842SalfredSYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, maxfragsperpacket, 20188704Sjoerg CTLFLAG_RW, maxfragsperpacket, 0, 2024910Swollman "Maximum number of IPv4 fragments allowed per packet"); 20325944Sjoerg 20425944Sjoergstruct callout ipport_tick_callout; 20525944Sjoerg 20625944Sjoerg#ifdef IPCTL_DEFMTU 20725944SjoergSYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, 20825944Sjoerg &ip_mtu, 0, "Default MTU"); 20925944Sjoerg#endif 21025944Sjoerg 21125944Sjoerg#ifdef IPSTEALTH 21225944SjoergSYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, 21325944Sjoerg ipstealth, 0, "IP stealth mode, no TTL decrementation on forwarding"); 21425944Sjoerg#endif 21525944Sjoergstatic int ip_output_flowtable_size = 2048; 21625944SjoergTUNABLE_INT("net.inet.ip.output_flowtable_size", &ip_output_flowtable_size); 21725944SjoergSYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, output_flowtable_size, 21825944Sjoerg CTLFLAG_RDTUN, ip_output_flowtable_size, 2048, 21925944Sjoerg "number of entries in the per-cpu output flow caches"); 22025944Sjoerg 22125944Sjoerg/* 22225944Sjoerg * ipfw_ether and ipfw_bridge hooks. 22325944Sjoerg * XXX: Temporary until those are converted to pfil_hooks as well. 
22425944Sjoerg */ 22525944Sjoergip_fw_chk_t *ip_fw_chk_ptr = NULL; 22625944Sjoergip_dn_io_t *ip_dn_io_ptr = NULL; 22725944Sjoerg#ifdef VIMAGE_GLOBALS 22825944Sjoergint fw_one_pass; 22925944Sjoerg#endif 23025944Sjoergstruct flowtable *ip_ft; 23125944Sjoerg 23225944Sjoergstatic void ip_freef(struct ipqhead *, struct ipq *); 23325944Sjoerg 23425944Sjoerg#ifndef VIMAGE_GLOBALS 23540008Sjoergstatic void vnet_inet_register(void); 23640008Sjoerg 23740008Sjoergstatic const vnet_modinfo_t vnet_inet_modinfo = { 238188668Srwatson .vmi_id = VNET_MOD_INET, 239188668Srwatson .vmi_name = "inet", 240188668Srwatson}; 241190818Sed 242138745Srikstatic void vnet_inet_register() 24388705Sjoerg{ 2444910Swollman 2454910Swollman vnet_mod_register(&vnet_inet_modinfo); 2464910Swollman} 2474910Swollman 2484910SwollmanSYSINIT(inet, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, vnet_inet_register, 0); 24930300Sjoerg#endif 25030300Sjoerg 2514910Swollman/* 252126910Srwatson * IP initialization: fill in IP protocol switch table. 2534910Swollman * All protocols not implemented in kernel go to raw IP protocol handler. 
2544910Swollman */ 2554910Swollmanvoid 2564910Swollmanip_init(void) 25788705Sjoerg{ 2584910Swollman INIT_VNET_INET(curvnet); 25925944Sjoerg struct protosw *pr; 26025944Sjoerg int i; 261147256Sbrooks 26225944Sjoerg V_ipsendredirects = 1; /* XXX */ 26311189Sjkh V_ip_checkinterface = 0; 26430300Sjoerg V_ip_keepfaith = 0; 265191148Skmacy V_ip_sendsourcequench = 0; 2664910Swollman V_rsvp_on = 0; 26725944Sjoerg V_ip_defttl = IPDEFTTL; 26825944Sjoerg V_ip_do_randomid = 0; 26925944Sjoerg V_ip_id = time_second & 0xffff; 27025944Sjoerg V_ipforwarding = 0; 27125944Sjoerg V_ipstealth = 0; 27225944Sjoerg V_nipq = 0; /* Total # of reass queues */ 27325944Sjoerg 27442104Sphk V_ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */ 27525944Sjoerg V_ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */ 27625944Sjoerg V_ipport_firstauto = IPPORT_EPHEMERALFIRST; /* 10000 */ 27730300Sjoerg V_ipport_lastauto = IPPORT_EPHEMERALLAST; /* 65535 */ 27842104Sphk V_ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ 27930300Sjoerg V_ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */ 28025944Sjoerg V_ipport_reservedhigh = IPPORT_RESERVED - 1; /* 1023 */ 28125944Sjoerg V_ipport_reservedlow = 0; 28225944Sjoerg V_ipport_randomized = 1; /* user controlled via sysctl */ 28325944Sjoerg V_ipport_randomcps = 10; /* user controlled via sysctl */ 28425944Sjoerg V_ipport_randomtime = 45; /* user controlled via sysctl */ 28525944Sjoerg V_ipport_stoprandom = 0; /* toggled by ipport_tick */ 28625944Sjoerg 28730300Sjoerg V_fw_one_pass = 1; 28830300Sjoerg 289138745Srik#ifdef NOTYET 290138745Srik /* XXX global static but not instantiated in this file */ 291138745Srik V_ipfastforward_active = 0; 29225944Sjoerg V_subnetsarelocal = 0; 29325944Sjoerg V_sameprefixcarponly = 0; 29425944Sjoerg#endif 29525944Sjoerg 29625944Sjoerg TAILQ_INIT(&V_in_ifaddrhead); 29725944Sjoerg V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask); 29825944Sjoerg 29925944Sjoerg /* Initialize IP reassembly queue. 
*/ 30025944Sjoerg for (i = 0; i < IPREASS_NHASH; i++) 30125944Sjoerg TAILQ_INIT(&V_ipq[i]); 30225944Sjoerg V_maxnipq = nmbclusters / 32; 30325944Sjoerg V_maxfragsperpacket = 16; 30425944Sjoerg V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, 30525944Sjoerg NULL, UMA_ALIGN_PTR, 0); 30630300Sjoerg maxnipq_update(); 30730300Sjoerg 30825944Sjoerg /* Skip initialization of globals for non-default instances. */ 30925944Sjoerg if (!IS_DEFAULT_VNET(curvnet)) 31025944Sjoerg return; 31125944Sjoerg 31225944Sjoerg pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); 31325944Sjoerg if (pr == NULL) 31425944Sjoerg panic("ip_init: PF_INET not found"); 31525944Sjoerg 31625944Sjoerg /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */ 31725944Sjoerg for (i = 0; i < IPPROTO_MAX; i++) 31825944Sjoerg ip_protox[i] = pr - inetsw; 31925944Sjoerg /* 32025944Sjoerg * Cycle through IP protocols and put them into the appropriate place 32125944Sjoerg * in ip_protox[]. 32225944Sjoerg */ 32325944Sjoerg for (pr = inetdomain.dom_protosw; 32478064Sume pr < inetdomain.dom_protoswNPROTOSW; pr++) 32578064Sume if (pr->pr_domain->dom_family == PF_INET && 32678064Sume pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { 32778064Sume /* Be careful to only index valid IP protocols. */ 32878064Sume if (pr->pr_protocol < IPPROTO_MAX) 32978064Sume ip_protox[pr->pr_protocol] = pr - inetsw; 33078064Sume } 33178064Sume 33278064Sume /* Initialize packet filter hooks. */ 33378064Sume inet_pfil_hook.ph_type = PFIL_TYPE_AF; 33478064Sume inet_pfil_hook.ph_af = AF_INET; 33578064Sume if ((i = pfil_head_register(&inet_pfil_hook)) != 0) 33678064Sume printf("%s: WARNING: unable to register pfil hook, " 33778064Sume "error %d\n", __func__, i); 33878064Sume 33930300Sjoerg /* Start ipport_tick. 
*/ 34030300Sjoerg callout_init(&ipport_tick_callout, CALLOUT_MPSAFE); 34130300Sjoerg ipport_tick(NULL); 34230300Sjoerg EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, 34330300Sjoerg SHUTDOWN_PRI_DEFAULT); 34430300Sjoerg EVENTHANDLER_REGISTER(nmbclusters_change, ipq_zone_change, 34530300Sjoerg NULL, EVENTHANDLER_PRI_ANY); 34630300Sjoerg 34730300Sjoerg /* Initialize various other remaining things. */ 34830300Sjoerg IPQ_LOCK_INIT(); 34930300Sjoerg ipintrq.ifq_maxlen = ipqmaxlen; 35030300Sjoerg mtx_init(&ipintrq.ifq_mtx, "ip_inq", NULL, MTX_DEF); 35130300Sjoerg netisr_register(NETISR_IP, ip_input, &ipintrq, 0); 35230300Sjoerg 35330300Sjoerg ip_ft = flowtable_alloc(ip_output_flowtable_size, FL_PCPU); 35430300Sjoerg} 35530300Sjoerg 35630300Sjoergvoid 35730300Sjoergip_fini(void *xtp) 35830300Sjoerg{ 35925944Sjoerg 360184682Sbz callout_stop(&ipport_tick_callout); 36130300Sjoerg} 36230300Sjoerg 363184682Sbz/* 36478064Sume * Ip input routine. Checksum and byte swap header. If fragmented 36578064Sume * try to reassemble. Process options. Pass to next level. 36678064Sume */ 36725944Sjoergvoid 36825944Sjoergip_input(struct mbuf *m) 36925944Sjoerg{ 37030300Sjoerg INIT_VNET_INET(curvnet); 37138343Sbde struct ip *ip = NULL; 37230300Sjoerg struct in_ifaddr *ia = NULL; 37325944Sjoerg struct ifaddr *ifa; 37430300Sjoerg struct ifnet *ifp; 37530300Sjoerg int checkif, hlen = 0; 37630300Sjoerg u_short sum; 37725944Sjoerg int dchg = 0; /* dest changed after fw */ 378184682Sbz struct in_addr odst; /* original dst address */ 37925944Sjoerg 380184682Sbz M_ASSERTPKTHDR(m); 38178064Sume 38278064Sume if (m->m_flags & M_FASTFWD_OURS) { 38378064Sume /* 38478064Sume * Firewall or NAT changed destination to local. 38578064Sume * We expect ip_len and ip_off to be in host byte order. 38678064Sume */ 38778064Sume m->m_flags &= ~M_FASTFWD_OURS; 38878064Sume /* Set up some basics that will be used later. 
*/ 38978064Sume ip = mtod(m, struct ip *); 39025944Sjoerg hlen = ip->ip_hl << 2; 391138745Srik goto ours; 392138745Srik } 393138745Srik 39425944Sjoerg IPSTAT_INC(ips_total); 39533181Seivind 39625944Sjoerg if (m->m_pkthdr.len < sizeof(struct ip)) 39725944Sjoerg goto tooshort; 39825944Sjoerg 39925944Sjoerg if (m->m_len < sizeof (struct ip) && 40025944Sjoerg (m = m_pullup(m, sizeof (struct ip))) == NULL) { 40125944Sjoerg IPSTAT_INC(ips_toosmall); 40225944Sjoerg return; 40333181Seivind } 40488709Sjoerg ip = mtod(m, struct ip *); 40588709Sjoerg 40688709Sjoerg if (ip->ip_v != IPVERSION) { 40788709Sjoerg IPSTAT_INC(ips_badvers); 40888709Sjoerg goto bad; 40988709Sjoerg } 41088709Sjoerg 41125944Sjoerg hlen = ip->ip_hl << 2; 41225944Sjoerg if (hlen < sizeof(struct ip)) { /* minimum header length */ 41325944Sjoerg IPSTAT_INC(ips_badhlen); 41425944Sjoerg goto bad; 41525944Sjoerg } 41625944Sjoerg if (hlen > m->m_len) { 41778064Sume if ((m = m_pullup(m, hlen)) == NULL) { 41878064Sume IPSTAT_INC(ips_badhlen); 41978064Sume return; 42078064Sume } 42178064Sume ip = mtod(m, struct ip *); 42278064Sume } 42378064Sume 42478064Sume /* 127/8 must not appear on wire - RFC1122 */ 42578064Sume ifp = m->m_pkthdr.rcvif; 42678064Sume if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || 42778064Sume (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { 42878064Sume if ((ifp->if_flags & IFF_LOOPBACK) == 0) { 42978064Sume IPSTAT_INC(ips_badaddr); 43078064Sume goto bad; 43133181Seivind } 43230300Sjoerg } 43330300Sjoerg 43430300Sjoerg if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { 43530300Sjoerg sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); 43630300Sjoerg } else { 43730300Sjoerg if (hlen == sizeof(struct ip)) { 43830300Sjoerg sum = in_cksum_hdr(ip); 43933181Seivind } else { 44030300Sjoerg sum = in_cksum(m, hlen); 44130300Sjoerg } 44230300Sjoerg } 44330300Sjoerg if (sum) { 44430300Sjoerg IPSTAT_INC(ips_badsum); 44530300Sjoerg goto bad; 44630300Sjoerg } 
44733181Seivind 44825944Sjoerg#ifdef ALTQ 44925944Sjoerg if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) 45078064Sume /* packet is dropped by traffic conditioner */ 45130300Sjoerg return; 45230300Sjoerg#endif 45325944Sjoerg 45425944Sjoerg /* 455147256Sbrooks * Convert fields to host representation. 456147256Sbrooks */ 457147256Sbrooks ip->ip_len = ntohs(ip->ip_len); 458147256Sbrooks if (ip->ip_len < hlen) { 459147256Sbrooks IPSTAT_INC(ips_badlen); 460147256Sbrooks goto bad; 461147256Sbrooks } 462147256Sbrooks ip->ip_off = ntohs(ip->ip_off); 463147256Sbrooks 464147256Sbrooks /* 465147256Sbrooks * Check that the amount of data in the buffers 466147256Sbrooks * is as at least much as the IP header would have us expect. 467147256Sbrooks * Trim mbufs if longer than we expect. 468147256Sbrooks * Drop packet if shorter than we expect. 469147256Sbrooks */ 470147256Sbrooks if (m->m_pkthdr.len < ip->ip_len) { 471147256Sbrookstooshort: 472147256Sbrooks IPSTAT_INC(ips_tooshort); 47370199Sjhay goto bad; 47470199Sjhay } 47570199Sjhay if (m->m_pkthdr.len > ip->ip_len) { 47670199Sjhay if (m->m_len == m->m_pkthdr.len) { 47770199Sjhay m->m_len = ip->ip_len; 478147256Sbrooks m->m_pkthdr.len = ip->ip_len; 479147256Sbrooks } else 480147256Sbrooks m_adj(m, ip->ip_len - m->m_pkthdr.len); 481147256Sbrooks } 482147256Sbrooks#ifdef IPSEC 483147256Sbrooks /* 484147256Sbrooks * Bypass packet filtering for packets from a tunnel (gif). 48570199Sjhay */ 48670199Sjhay if (ip_ipsec_filtertunnel(m)) 487147256Sbrooks goto passin; 48870199Sjhay#endif /* IPSEC */ 48970199Sjhay 490132199Sphk /* 49170199Sjhay * Run through list of hooks for input packets. 49270199Sjhay * 49370199Sjhay * NB: Beware of the destination address changing (e.g. 49470199Sjhay * by NAT rewriting). When this happens, tell 49570199Sjhay * ip_forward to do the right thing. 49670199Sjhay */ 49770199Sjhay 49870199Sjhay /* Jump over all PFIL processing if hooks are not active. 
*/ 49970199Sjhay if (!PFIL_HOOKED(&inet_pfil_hook)) 50070199Sjhay goto passin; 50125944Sjoerg 50270199Sjhay odst = ip->ip_dst; 50325944Sjoerg if (pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_IN, NULL) != 0) 5044910Swollman return; 5054910Swollman if (m == NULL) /* consumed by filter */ 5064910Swollman return; 5074910Swollman 5084910Swollman ip = mtod(m, struct ip *); 50925706Sjoerg dchg = (odst.s_addr != ip->ip_dst.s_addr); 51025706Sjoerg ifp = m->m_pkthdr.rcvif; 5114910Swollman 5124910Swollman#ifdef IPFIREWALL_FORWARD 513111888Sjlemon if (m->m_flags & M_FASTFWD_OURS) { 514147256Sbrooks m->m_flags &= ~M_FASTFWD_OURS; 515184682Sbz goto ours; 516184682Sbz } 517184682Sbz if ((dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL)) != 0) { 51888700Sjoerg /* 519184682Sbz * Directly ship on the packet. This allows to forward packets 5204910Swollman * that were destined for us to some other directly connected 521138745Srik * host. 522138745Srik */ 523138745Srik ip_forward(m, dchg); 5244910Swollman return; 5254910Swollman } 5264910Swollman#endif /* IPFIREWALL_FORWARD */ 5274910Swollman 5284910Swollmanpassin: 5294910Swollman /* 53025944Sjoerg * Process options and, if not destined for us, 53125706Sjoerg * ship it on. ip_dooptions returns 1 when an 53240008Sjoerg * error was detected (causing an icmp message 53340008Sjoerg * to be sent and the original packet to be freed). 53425944Sjoerg */ 53588700Sjoerg if (hlen > sizeof (struct ip) && ip_dooptions(m, 0)) 536138745Srik return; 53788700Sjoerg 53825944Sjoerg /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no 53925944Sjoerg * matter if it is destined to another node, or whether it is 5404910Swollman * a multicast one, RSVP wants it! and prevents it from being forwarded 5414910Swollman * anywhere else. Also checks if the rsvp daemon is running before 5424910Swollman * grabbing the packet. 
543139365Srik */ 544139365Srik if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP) 545139365Srik goto ours; 546139365Srik 547139365Srik /* 548139365Srik * Check our list of addresses, to see if the packet is for us. 5494910Swollman * If we don't have any addresses, assume any unicast packet 5504910Swollman * we receive might be for us (and let the upper layers deal 5514910Swollman * with it). 5524910Swollman */ 5534910Swollman if (TAILQ_EMPTY(&V_in_ifaddrhead) && 5544910Swollman (m->m_flags & (M_MCAST|M_BCAST)) == 0) 5554910Swollman goto ours; 5564910Swollman 55745152Sphk /* 55825944Sjoerg * Enable a consistency check between the destination address 55925706Sjoerg * and the arrival interface for a unicast packet (the RFC 1122 56040008Sjoerg * strong ES model) if IP forwarding is disabled and the packet 56125706Sjoerg * is not locally generated and the packet is not subject to 56240008Sjoerg * 'ipfw fwd'. 56325706Sjoerg * 56411189Sjkh * XXX - Checking also should be disabled if the destination 56511189Sjkh * address is ipnat'ed to a different interface. 5664910Swollman * 5674910Swollman * XXX - Checking is incompatible with IP aliases added 56825944Sjoerg * to the loopback interface instead of the interface where 56925706Sjoerg * the packets are received. 57044145Sphk * 57125706Sjoerg * XXX - This is the case for carp vhost IPs as well so we 57240008Sjoerg * insert a workaround. If the packet got here, we already 57325706Sjoerg * checked with carp_iamatch() and carp_forus(). 57444145Sphk */ 57544145Sphk checkif = V_ip_checkinterface && (V_ipforwarding == 0) && 57678064Sume ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) && 57744145Sphk#ifdef DEV_CARP 5784910Swollman !ifp->if_carp && 5794910Swollman#endif 5804910Swollman (dchg == 0); 58130300Sjoerg 5824910Swollman /* 583138745Srik * Check for exact addresses in the hash bucket. 
5844910Swollman */ 58530300Sjoerg LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) { 58630300Sjoerg /* 58730300Sjoerg * If the address matches, verify that the packet 58830300Sjoerg * arrived via the correct interface if checking is 589138745Srik * enabled. 59030300Sjoerg */ 59130300Sjoerg if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr && 59230300Sjoerg (!checkif || ia->ia_ifp == ifp)) 59330300Sjoerg goto ours; 59430300Sjoerg } 595138745Srik /* 59630300Sjoerg * Check for broadcast addresses. 5974910Swollman * 5984910Swollman * Only accept broadcast packets that arrive via the matching 59925944Sjoerg * interface. Reception of forwarded directed broadcasts would 60030300Sjoerg * be handled via ip_forward() and ether_output() with the loopback 6014910Swollman * into the stack for SIMPLEX interfaces handled by ether_output(). 602138745Srik */ 6034910Swollman if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { 6044910Swollman IF_ADDR_LOCK(ifp); 60525944Sjoerg TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 606111888Sjlemon if (ifa->ifa_addr->sa_family != AF_INET) 6074910Swollman continue; 60888577Sjoerg ia = ifatoia(ifa); 6094910Swollman if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == 61088534Sjoerg ip->ip_dst.s_addr) { 61188534Sjoerg IF_ADDR_UNLOCK(ifp); 61288700Sjoerg goto ours; 61388700Sjoerg } 61488700Sjoerg if (ia->ia_netbroadcast.s_addr == ip->ip_dst.s_addr) { 61588700Sjoerg IF_ADDR_UNLOCK(ifp); 61688700Sjoerg goto ours; 61788700Sjoerg } 61888700Sjoerg#ifdef BOOTP_COMPAT 61988700Sjoerg if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) { 62088700Sjoerg IF_ADDR_UNLOCK(ifp); 62188700Sjoerg goto ours; 62288534Sjoerg } 62388700Sjoerg#endif 62488700Sjoerg } 62588700Sjoerg IF_ADDR_UNLOCK(ifp); 62688700Sjoerg } 62788700Sjoerg /* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. 
*/ 62888700Sjoerg if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) { 62988700Sjoerg IPSTAT_INC(ips_cantforward); 63088700Sjoerg m_freem(m); 63188700Sjoerg return; 632111119Simp } 633138745Srik if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { 634138745Srik if (V_ip_mrouter) { 63588700Sjoerg /* 636138745Srik * If we are acting as a multicast router, all 63788700Sjoerg * incoming multicast packets are passed to the 638111888Sjlemon * kernel-level multicast forwarding function. 63988534Sjoerg * The packet is returned (relatively) intact; if 64088599Sjoerg * ip_mforward() returns a non-zero value, the packet 64188534Sjoerg * must be discarded, else it may be accepted below. 64288534Sjoerg */ 64388534Sjoerg if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) { 64488700Sjoerg IPSTAT_INC(ips_cantforward); 64588700Sjoerg m_freem(m); 64688700Sjoerg return; 64788700Sjoerg } 64888700Sjoerg 64988700Sjoerg /* 65088700Sjoerg * The process-level routing daemon needs to receive 65188700Sjoerg * all multicast IGMP packets, whether or not this 65288700Sjoerg * host belongs to their destination groups. 65388534Sjoerg */ 65488700Sjoerg if (ip->ip_p == IPPROTO_IGMP) 655111888Sjlemon goto ours; 65688534Sjoerg IPSTAT_INC(ips_forward); 65788599Sjoerg } 65888534Sjoerg /* 65978064Sume * Assume the packet is for us, to avoid prematurely taking 66088599Sjoerg * a lock on the in_multi hash. Protocols must perform 66188599Sjoerg * their own filtering and update statistics accordingly. 
66288599Sjoerg */ 66388599Sjoerg goto ours; 66488599Sjoerg } 665138745Srik if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) 66688599Sjoerg goto ours; 66788599Sjoerg if (ip->ip_dst.s_addr == INADDR_ANY) 66888599Sjoerg goto ours; 669111888Sjlemon 670111888Sjlemon /* 67188599Sjoerg * FAITH(Firewall Aided Internet Translator) 67288599Sjoerg */ 67388599Sjoerg if (ifp && ifp->if_type == IFT_FAITH) { 67412495Speter if (V_ip_keepfaith) { 67512495Speter if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP) 67612495Speter goto ours; 677111888Sjlemon } 678111888Sjlemon m_freem(m); 67988577Sjoerg return; 68012495Speter } 68112495Speter 6824910Swollman /* 6834910Swollman * Not for us; forward if possible and desirable. 6844910Swollman */ 6854910Swollman if (V_ipforwarding == 0) { 6864910Swollman IPSTAT_INC(ips_cantforward); 68745152Sphk m_freem(m); 68825944Sjoerg } else { 68925706Sjoerg#ifdef IPSEC 69040008Sjoerg if (ip_ipsec_fwd(m)) 69125706Sjoerg goto bad; 69240008Sjoerg#endif /* IPSEC */ 69325706Sjoerg ip_forward(m, dchg); 69411189Sjkh } 69511189Sjkh return; 6964910Swollman 6974910Swollmanours: 6984910Swollman#ifdef IPSTEALTH 6994910Swollman /* 7004910Swollman * IPSTEALTH: Process non-routing options only 701147256Sbrooks * if the packet is destined for us. 7024910Swollman */ 703138745Srik if (V_ipstealth && hlen > sizeof (struct ip) && 7044910Swollman ip_dooptions(m, 1)) 7054910Swollman return; 7064910Swollman#endif /* IPSTEALTH */ 707111888Sjlemon 70888577Sjoerg /* Count the packet in the ip address stats */ 7094910Swollman if (ia != NULL) { 7104910Swollman ia->ia_ifa.if_ipackets++; 71154263Sshin ia->ia_ifa.if_ibytes += m->m_pkthdr.len; 71254263Sshin } 713111888Sjlemon 71488577Sjoerg /* 71554263Sshin * Attempt reassembly; if it succeeds, proceed. 71654263Sshin * ip_reass() will return a different mbuf. 
71712495Speter */ 71812495Speter if (ip->ip_off & (IP_MF | IP_OFFMASK)) { 719111888Sjlemon m = ip_reass(m); 72088577Sjoerg if (m == NULL) 72112495Speter return; 72212495Speter ip = mtod(m, struct ip *); 7234910Swollman /* Get the header length of the reassembled packet */ 7244910Swollman hlen = ip->ip_hl << 2; 72525944Sjoerg } 72625944Sjoerg 72725944Sjoerg /* 72825944Sjoerg * Further protocols expect the packet length to be w/o the 72940008Sjoerg * IP header. 73025944Sjoerg */ 73140008Sjoerg ip->ip_len -= hlen; 73225944Sjoerg 73325944Sjoerg#ifdef IPSEC 7344910Swollman /* 7354910Swollman * enforce IPsec policy checking if we are seeing last header. 736111888Sjlemon * note that we do not visit this with protocols with pcb layer 7374910Swollman * code - like udp/tcp/raw ip. 7384910Swollman */ 739138745Srik if (ip_ipsec_input(m)) 7404910Swollman goto bad; 741134391Sandre#endif /* IPSEC */ 74225944Sjoerg 74340008Sjoerg /* 74440008Sjoerg * Switch out to protocol's input routine. 745131241Srik */ 7464910Swollman IPSTAT_INC(ips_delivered); 747138745Srik 74888577Sjoerg (*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen); 74988577Sjoerg return; 75088577Sjoergbad: 75188577Sjoerg m_freem(m); 75288577Sjoerg} 75388577Sjoerg 754150349Sandre/* 7554910Swollman * After maxnipq has been updated, propagate the change to UMA. The UMA zone 7564910Swollman * max has slightly different semantics than the sysctl, for historical 757138745Srik * reasons. 758138745Srik */ 759138745Srikstatic void 760138745Srikmaxnipq_update(void) 761138745Srik{ 762147256Sbrooks INIT_VNET_INET(curvnet); 763138745Srik 764138745Srik /* 765138745Srik * -1 for unlimited allocation. 766138745Srik */ 767138745Srik if (V_maxnipq < 0) 768138745Srik uma_zone_set_max(V_ipq_zone, 0); 769138745Srik /* 770138745Srik * Positive number for specific bound. 
771147256Sbrooks */ 772138745Srik if (V_maxnipq > 0) 773138745Srik uma_zone_set_max(V_ipq_zone, V_maxnipq); 774138745Srik /* 775138745Srik * Zero specifies no further fragment queue allocation -- set the 776138745Srik * bound very low, but rely on implementation elsewhere to actually 777138745Srik * prevent allocation and reclaim current queues. 778138745Srik */ 779138745Srik if (V_maxnipq == 0) 780138745Srik uma_zone_set_max(V_ipq_zone, 1); 781138745Srik} 782138745Srik 7834910Swollmanstatic void 7844910Swollmanipq_zone_change(void *tag) 7854910Swollman{ 78612820Sphk INIT_VNET_INET(curvnet); 78725706Sjoerg 788191148Skmacy if (V_maxnipq > 0 && V_maxnipq < (nmbclusters / 32)) { 7894910Swollman V_maxnipq = nmbclusters / 32; 790147256Sbrooks maxnipq_update(); 7914910Swollman } 79278064Sume} 793130549Smlaier 794184682Sbzstatic int 79588534Sjoergsysctl_maxnipq(SYSCTL_HANDLER_ARGS) 796184682Sbz{ 79742066Sphk INIT_VNET_INET(curvnet); 7984910Swollman int error, i; 79925944Sjoerg 800138745Srik i = V_maxnipq; 80125944Sjoerg error = sysctl_handle_int(oidp, &i, 0, req); 802148887Srwatson if (error || !req->newptr) 803148887Srwatson return (error); 804148887Srwatson 80588723Sjoerg /* 80688723Sjoerg * XXXRW: Might be a good idea to sanity check the argument and place 80788723Sjoerg * an extreme upper bound. 8084910Swollman */ 809138745Srik if (i < -1) 8104910Swollman return (EINVAL); 8114910Swollman V_maxnipq = i; 8124910Swollman maxnipq_update(); 8134910Swollman return (0); 814148887Srwatson} 815148887Srwatson 81688723SjoergSYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLTYPE_INT|CTLFLAG_RW, 81725944Sjoerg NULL, 0, sysctl_maxnipq, "I", 81888723Sjoerg "Maximum number of IPv4 fragment reassembly queue entries"); 81988723Sjoerg 82088723Sjoerg/* 82188723Sjoerg * Take incoming datagram fragment and try to reassemble it into 82288723Sjoerg * whole datagram. 
If the argument is the first fragment or one 82388723Sjoerg * in between the function will return NULL and store the mbuf 82488723Sjoerg * in the fragment chain. If the argument is the last fragment 82588723Sjoerg * the packet will be reassembled and the pointer to the new 82688723Sjoerg * mbuf returned for further processing. Only m_tags attached 82788723Sjoerg * to the first packet/fragment are preserved. 82888723Sjoerg * The IP header is *NOT* adjusted out of iplen. 82988723Sjoerg */ 83025944Sjoergstruct mbuf * 83125944Sjoergip_reass(struct mbuf *m) 83225944Sjoerg{ 833148887Srwatson INIT_VNET_INET(curvnet); 83425944Sjoerg struct ip *ip; 83525944Sjoerg struct mbuf *p, *q, *nq, *t; 83625944Sjoerg struct ipq *fp = NULL; 83725944Sjoerg struct ipqhead *head; 83825944Sjoerg int i, hlen, next; 8394910Swollman u_int8_t ecn, ecn0; 84012436Speter u_short hash; 84140008Sjoerg 84212436Speter /* If maxnipq or maxfragsperpacket are 0, never accept fragments. */ 84312436Speter if (V_maxnipq == 0 || V_maxfragsperpacket == 0) { 8444910Swollman IPSTAT_INC(ips_fragments); 84542104Sphk IPSTAT_INC(ips_fragdropped); 84642104Sphk m_freem(m); 84742104Sphk return (NULL); 84842104Sphk } 84942104Sphk 85042104Sphk ip = mtod(m, struct ip *); 85142104Sphk hlen = ip->ip_hl << 2; 85242104Sphk 85342104Sphk hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); 85470199Sjhay head = &V_ipq[hash]; 85542104Sphk IPQ_LOCK(); 85642104Sphk 85742104Sphk /* 85842104Sphk * Look for queue of fragments 859138745Srik * of this datagram. 
86042104Sphk */ 86142104Sphk TAILQ_FOREACH(fp, head, ipq_list) 86242104Sphk if (ip->ip_id == fp->ipq_id && 86342104Sphk ip->ip_src.s_addr == fp->ipq_src.s_addr && 86442104Sphk ip->ip_dst.s_addr == fp->ipq_dst.s_addr && 86542104Sphk#ifdef MAC 86670199Sjhay mac_ipq_match(m, fp) && 86742104Sphk#endif 86842104Sphk ip->ip_p == fp->ipq_p) 869130549Smlaier goto found; 87042104Sphk 871130549Smlaier fp = NULL; 87241686Sphk 873130549Smlaier /* 874130549Smlaier * Attempt to trim the number of allocated fragment queues if it 87541686Sphk * exceeds the administrative limit. 87612436Speter */ 87741686Sphk if ((V_nipq > V_maxnipq) && (V_maxnipq > 0)) { 87841686Sphk /* 87941686Sphk * drop something from the tail of the current queue 88041686Sphk * before proceeding further 88141686Sphk */ 88241686Sphk struct ipq *q = TAILQ_LAST(head, ipqhead); 88341686Sphk if (q == NULL) { /* gak */ 88441686Sphk for (i = 0; i < IPREASS_NHASH; i++) { 88588534Sjoerg struct ipq *r = TAILQ_LAST(&V_ipq[i], ipqhead); 88688534Sjoerg if (r) { 88788534Sjoerg IPSTAT_ADD(ips_fragtimeout, 88888534Sjoerg r->ipq_nfrags); 889139365Srik ip_freef(&V_ipq[i], r); 890138745Srik break; 89188599Sjoerg } 89288534Sjoerg } 89388534Sjoerg } else { 89488534Sjoerg IPSTAT_ADD(ips_fragtimeout, q->ipq_nfrags); 89588534Sjoerg ip_freef(head, q); 89688534Sjoerg } 89788534Sjoerg } 89888534Sjoerg 89988534Sjoergfound: 90088534Sjoerg /* 90188534Sjoerg * Adjust ip_len to not reflect header, 90288534Sjoerg * convert offset of this to bytes. 90388534Sjoerg */ 904138745Srik ip->ip_len -= hlen; 90588534Sjoerg if (ip->ip_off & IP_MF) { 90688534Sjoerg /* 90788534Sjoerg * Make sure that fragments have a data length 9084910Swollman * that's a non-zero multiple of 8 bytes. 
9094910Swollman */ 9104910Swollman if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) { 91178064Sume IPSTAT_INC(ips_toosmall); /* XXX */ 91278064Sume goto dropfrag; 91378064Sume } 91478064Sume m->m_flags |= M_FRAG; 91578064Sume } else 91678064Sume m->m_flags &= ~M_FRAG; 917139365Srik ip->ip_off <<= 3; 918139365Srik 919139365Srik 920139365Srik /* 921139365Srik * Attempt reassembly; if it succeeds, proceed. 922139365Srik * ip_reass() will return a different mbuf. 923139365Srik */ 924139365Srik IPSTAT_INC(ips_fragments); 9254910Swollman m->m_pkthdr.header = ip; 9264910Swollman 9274910Swollman /* Previous ip_reass() started here. */ 928111119Simp /* 9294910Swollman * Presence of header sizes in mbufs 930139365Srik * would confuse code below. 93140008Sjoerg */ 93240008Sjoerg m->m_data += hlen; 93325944Sjoerg m->m_len -= hlen; 934138745Srik 9354910Swollman /* 9364910Swollman * If first fragment to arrive, create a reassembly queue. 9374910Swollman */ 93840008Sjoerg if (fp == NULL) { 93940008Sjoerg fp = uma_zalloc(V_ipq_zone, M_NOWAIT); 94040008Sjoerg if (fp == NULL) 94140008Sjoerg goto dropfrag; 9424910Swollman#ifdef MAC 94345152Sphk if (mac_ipq_init(fp, M_NOWAIT) != 0) { 94428088Skjc uma_zfree(V_ipq_zone, fp); 9454910Swollman fp = NULL; 9464910Swollman goto dropfrag; 9474910Swollman } 9484910Swollman mac_ipq_create(m, fp); 9494910Swollman#endif 9504910Swollman TAILQ_INSERT_HEAD(head, fp, ipq_list); 9514910Swollman V_nipq++; 9524910Swollman fp->ipq_nfrags = 1; 9534910Swollman fp->ipq_ttl = IPFRAGTTL; 95445152Sphk fp->ipq_p = ip->ip_p; 95511189Sjkh fp->ipq_id = ip->ip_id; 95611189Sjkh fp->ipq_src = ip->ip_src; 95725955Sjoerg fp->ipq_dst = ip->ip_dst; 95825955Sjoerg fp->ipq_frags = m; 95925955Sjoerg m->m_nextpkt = NULL; 96025955Sjoerg goto done; 96125955Sjoerg } else { 96225955Sjoerg fp->ipq_nfrags++; 96325955Sjoerg#ifdef MAC 96425955Sjoerg mac_ipq_update(m, fp); 96525955Sjoerg#endif 96688534Sjoerg } 96725955Sjoerg 96825955Sjoerg#define GETIP(m) ((struct 
ip*)((m)->m_pkthdr.header)) 96911189Sjkh 9704910Swollman /* 9714910Swollman * Handle ECN by comparing this segment with the first one; 97254263Sshin * if CE is set, do not lose CE. 97354263Sshin * drop if CE and not-ECT are mixed for the same packet. 97454263Sshin */ 97554263Sshin ecn = ip->ip_tos & IPTOS_ECN_MASK; 97654263Sshin ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK; 97778064Sume if (ecn == IPTOS_ECN_CE) { 97878064Sume if (ecn0 == IPTOS_ECN_NOTECT) 97978064Sume goto dropfrag; 98078064Sume if (ecn0 != IPTOS_ECN_CE) 98178064Sume GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE; 98278064Sume } 98378064Sume if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) 98478064Sume goto dropfrag; 98578064Sume 98678064Sume /* 98778064Sume * Find a segment which begins after this one does. 98878064Sume */ 98954263Sshin for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) 99054263Sshin if (GETIP(q)->ip_off > ip->ip_off) 99154263Sshin break; 99211819Sjulian 99312495Speter /* 99445152Sphk * If there is a preceding segment, it may provide some of 99512495Speter * our data already. If so, drop the data from the incoming 99611819Sjulian * segment. If it provides all of our data, drop us, otherwise 99711819Sjulian * stick new segment in the proper place. 9984910Swollman * 9994910Swollman * If some of the data is dropped from the the preceding 100025944Sjoerg * segment, then it's checksum is invalidated. 
1001138745Srik */ 10024910Swollman if (p) { 10034910Swollman i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off; 10044910Swollman if (i > 0) { 10054910Swollman if (i >= ip->ip_len) 10064910Swollman goto dropfrag; 10074910Swollman m_adj(m, i); 100888577Sjoerg m->m_pkthdr.csum_flags = 0; 10094910Swollman ip->ip_off += i; 1010139365Srik ip->ip_len -= i; 1011130549Smlaier } 1012130549Smlaier m->m_nextpkt = p->m_nextpkt; 1013130549Smlaier p->m_nextpkt = m; 1014130549Smlaier } else { 1015130549Smlaier m->m_nextpkt = fp->ipq_frags; 101625944Sjoerg fp->ipq_frags = m; 1017138745Srik } 1018111038Smaxim 101925955Sjoerg /* 10204910Swollman * While we overlap succeeding segments trim them or, 1021138745Srik * if they are completely covered, dequeue them. 1022111038Smaxim */ 102388577Sjoerg for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off; 102488577Sjoerg q = nq) { 102588577Sjoerg i = (ip->ip_off + ip->ip_len) - GETIP(q)->ip_off; 102688577Sjoerg if (i < GETIP(q)->ip_len) { 102788577Sjoerg GETIP(q)->ip_len -= i; 102888577Sjoerg GETIP(q)->ip_off += i; 1029150349Sandre m_adj(q, i); 10304910Swollman q->m_pkthdr.csum_flags = 0; 10314910Swollman break; 10324910Swollman } 103325706Sjoerg nq = q->m_nextpkt; 103425706Sjoerg m->m_nextpkt = nq; 10354910Swollman IPSTAT_INC(ips_fragdropped); 1036147256Sbrooks fp->ipq_nfrags--; 10374910Swollman m_freem(q); 1038138745Srik } 1039138745Srik 1040138745Srik /* 10414910Swollman * Check for complete reassembly and perform frag per packet 1042188668Srwatson * limiting. 1043138745Srik * 1044138745Srik * Frag limiting is performed here so that the nth frag has 10454910Swollman * a chance to complete the packet before we drop the packet. 1046147256Sbrooks * As a result, n+1 frags are actually allowed per packet, but 1047147256Sbrooks * only n will ever be stored. (n = maxfragsperpacket.) 
1048147256Sbrooks * 104942104Sphk */ 105042064Sphk next = 0; 105142104Sphk for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { 1052147256Sbrooks if (GETIP(q)->ip_off != next) { 105370199Sjhay if (fp->ipq_nfrags > V_maxfragsperpacket) { 105470199Sjhay IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); 10554910Swollman ip_freef(head, fp); 10564910Swollman } 105778064Sume goto done; 105878064Sume } 105925944Sjoerg next += GETIP(q)->ip_len; 1060138745Srik } 1061138745Srik /* Make sure the last packet didn't have the IP_MF flag */ 106288716Sjoerg if (p->m_flags & M_FRAG) { 106393818Sjhb if (fp->ipq_nfrags > V_maxfragsperpacket) { 106488716Sjoerg IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); 106593818Sjhb ip_freef(head, fp); 1066150349Sandre } 106788723Sjoerg goto done; 106888723Sjoerg } 106988723Sjoerg 107088723Sjoerg /* 107188723Sjoerg * Reassembly is complete. Make sure the packet is a sane size. 107288723Sjoerg */ 107388723Sjoerg q = fp->ipq_frags; 1074188668Srwatson ip = GETIP(q); 1075138745Srik if (next + (ip->ip_hl << 2) > IP_MAXPACKET) { 1076138745Srik IPSTAT_INC(ips_toolong); 1077118072Sgj IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); 107888599Sjoerg ip_freef(head, fp); 107925944Sjoerg goto done; 108025944Sjoerg } 108178064Sume 108230300Sjoerg /* 108330300Sjoerg * Concatenate fragments. 
10844910Swollman */ 10854910Swollman m = q; 108630300Sjoerg t = m->m_next; 108725706Sjoerg m->m_next = NULL; 10884910Swollman m_cat(m, t); 1089147256Sbrooks nq = q->m_nextpkt; 109025944Sjoerg q->m_nextpkt = NULL; 10914910Swollman for (q = nq; q != NULL; q = nq) { 1092138745Srik nq = q->m_nextpkt; 10934910Swollman q->m_nextpkt = NULL; 10944910Swollman m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags; 1095138745Srik m->m_pkthdr.csum_data += q->m_pkthdr.csum_data; 1096138745Srik m_cat(m, q); 109725944Sjoerg } 1098138745Srik /* 1099138745Srik * In order to do checksumming faster we do 'end-around carry' here 1100138745Srik * (and not in for{} loop), though it implies we are not going to 1101138745Srik * reassemble more than 64k fragments. 1102138745Srik */ 1103138745Srik m->m_pkthdr.csum_data = 110469152Sjlemon (m->m_pkthdr.csum_data & 0xffff) + (m->m_pkthdr.csum_data >> 16); 110569152Sjlemon#ifdef MAC 1106138745Srik mac_ipq_reassemble(fp, m); 11074910Swollman mac_ipq_destroy(fp); 11084910Swollman#endif 11094910Swollman 11104910Swollman /* 11114910Swollman * Create header for new ip packet by modifying header of first 1112138745Srik * packet; dequeue and discard fragment reassembly header. 1113138745Srik * Make header visible. 
11144910Swollman */ 1115147256Sbrooks ip->ip_len = (ip->ip_hl << 2) + next; 11164910Swollman ip->ip_src = fp->ipq_src; 1117147256Sbrooks ip->ip_dst = fp->ipq_dst; 111825944Sjoerg TAILQ_REMOVE(head, fp, ipq_list); 111926018Sjoerg V_nipq--; 11204910Swollman uma_zfree(V_ipq_zone, fp); 11214910Swollman m->m_len += (ip->ip_hl << 2); 1122138745Srik m->m_data -= (ip->ip_hl << 2); 1123138745Srik /* some debugging cruft by sklower, below, will go away soon */ 1124138745Srik if (m->m_flags & M_PKTHDR) /* XXX this should be done elsewhere */ 1125147256Sbrooks m_fixhdr(m); 1126138745Srik IPSTAT_INC(ips_reassembled); 1127138745Srik IPQ_UNLOCK(); 1128138745Srik return (m); 1129138745Srik 1130138745Srikdropfrag: 1131138745Srik IPSTAT_INC(ips_fragdropped); 11324910Swollman if (fp != NULL) 113311189Sjkh fp->ipq_nfrags--; 113411189Sjkh m_freem(m); 113512820Sphkdone: 113625706Sjoerg IPQ_UNLOCK(); 113711189Sjkh return (NULL); 1138147256Sbrooks 113925944Sjoerg#undef GETIP 114011189Sjkh} 114125944Sjoerg 1142138745Srik/* 114326018Sjoerg * Free a fragment reassembly header and all 1144147256Sbrooks * associated datagrams. 1145138745Srik */ 114625944Sjoergstatic void 114711189Sjkhip_freef(struct ipqhead *fhp, struct ipq *fp) 114811189Sjkh{ 114911189Sjkh INIT_VNET_INET(curvnet); 115011189Sjkh struct mbuf *q; 11514910Swollman 11524910Swollman IPQ_LOCK_ASSERT(); 115325706Sjoerg 115425706Sjoerg while (fp->ipq_frags) { 11554910Swollman q = fp->ipq_frags; 1156147256Sbrooks fp->ipq_frags = q->m_nextpkt; 11574910Swollman m_freem(q); 115825944Sjoerg } 11594910Swollman TAILQ_REMOVE(fhp, fp, ipq_list); 116025944Sjoerg uma_zfree(V_ipq_zone, fp); 1161138745Srik V_nipq--; 116226018Sjoerg} 116330300Sjoerg 116430300Sjoerg/* 116526018Sjoerg * IP timer processing; 116626018Sjoerg * if a timer expires on a reassembly 116726018Sjoerg * queue, discard it. 
116826018Sjoerg */ 116926018Sjoergvoid 1170139365Srikip_slowtimo(void) 1171139365Srik{ 117226018Sjoerg VNET_ITERATOR_DECL(vnet_iter); 117326018Sjoerg struct ipq *fp; 1174147256Sbrooks int i; 117526018Sjoerg 1176138745Srik IPQ_LOCK(); 117726018Sjoerg VNET_LIST_RLOCK(); 117826018Sjoerg VNET_FOREACH(vnet_iter) { 11794910Swollman CURVNET_SET(vnet_iter); 11804910Swollman INIT_VNET_INET(vnet_iter); 11814910Swollman for (i = 0; i < IPREASS_NHASH; i++) { 118230300Sjoerg for(fp = TAILQ_FIRST(&V_ipq[i]); fp;) { 118330300Sjoerg struct ipq *fpp; 118430300Sjoerg 118530300Sjoerg fpp = fp; 118630300Sjoerg fp = TAILQ_NEXT(fp, ipq_list); 1187147256Sbrooks if(--fpp->ipq_ttl == 0) { 118830300Sjoerg IPSTAT_ADD(ips_fragtimeout, 118930300Sjoerg fpp->ipq_nfrags); 119030300Sjoerg ip_freef(&V_ipq[i], fpp); 1191138745Srik } 1192138745Srik } 119330300Sjoerg } 119430300Sjoerg /* 119530300Sjoerg * If we are over the maximum number of fragments 1196138745Srik * (due to the limit being lowered), drain off 1197139365Srik * enough to get down to the new limit. 1198139365Srik */ 119930300Sjoerg if (V_maxnipq >= 0 && V_nipq > V_maxnipq) { 1200147256Sbrooks for (i = 0; i < IPREASS_NHASH; i++) { 1201138745Srik while (V_nipq > V_maxnipq && 120230300Sjoerg !TAILQ_EMPTY(&V_ipq[i])) { 120330300Sjoerg IPSTAT_ADD(ips_fragdropped, 120430300Sjoerg TAILQ_FIRST(&V_ipq[i])->ipq_nfrags); 120530300Sjoerg ip_freef(&V_ipq[i], 120630300Sjoerg TAILQ_FIRST(&V_ipq[i])); 120725944Sjoerg } 12084910Swollman } 120925944Sjoerg } 121042104Sphk CURVNET_RESTORE(); 12114910Swollman } 121225944Sjoerg VNET_LIST_RUNLOCK(); 1213147256Sbrooks IPQ_UNLOCK(); 121430300Sjoerg} 12154910Swollman 121625944Sjoerg/* 1217138745Srik * Drain off all datagram fragments. 
121830300Sjoerg */ 121925944Sjoergvoid 122025944Sjoergip_drain(void) 122125944Sjoerg{ 122225944Sjoerg VNET_ITERATOR_DECL(vnet_iter); 12234910Swollman int i; 122425944Sjoerg 122588503Sjoerg IPQ_LOCK(); 122688503Sjoerg VNET_LIST_RLOCK(); 1227102412Scharnier VNET_FOREACH(vnet_iter) { 122811189Sjkh CURVNET_SET(vnet_iter); 122925944Sjoerg INIT_VNET_INET(vnet_iter); 123025944Sjoerg for (i = 0; i < IPREASS_NHASH; i++) { 1231148887Srwatson while(!TAILQ_EMPTY(&V_ipq[i])) { 123225944Sjoerg IPSTAT_ADD(ips_fragdropped, 1233148887Srwatson TAILQ_FIRST(&V_ipq[i])->ipq_nfrags); 123445152Sphk ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i])); 123545152Sphk } 123645152Sphk } 123745152Sphk CURVNET_RESTORE(); 123845152Sphk } 123945152Sphk VNET_LIST_RUNLOCK(); 124045152Sphk IPQ_UNLOCK(); 124145152Sphk in_rtqdrain(); 124245152Sphk} 1243139365Srik 1244139365Srik/* 1245139365Srik * The protocol to be inserted into ip_protox[] must be already registered 124645152Sphk * in inetsw[], either statically or through pf_proto_register(). 124745152Sphk */ 124845152Sphkint 1249148887Srwatsonipproto_register(u_char ipproto) 12504910Swollman{ 12514910Swollman struct protosw *pr; 125245152Sphk 1253139365Srik /* Sanity checks. */ 1254139365Srik if (ipproto == 0) 125545152Sphk return (EPROTONOSUPPORT); 125645152Sphk 125745152Sphk /* 1258138745Srik * The protocol slot must not be occupied by another protocol 1259148887Srwatson * already. An index pointing to IPPROTO_RAW is unused. 126045152Sphk */ 126126018Sjoerg pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); 12624910Swollman if (pr == NULL) 126345152Sphk return (EPFNOSUPPORT); 1264139365Srik if (ip_protox[ipproto] != pr - inetsw) /* IPPROTO_RAW */ 1265139365Srik return (EEXIST); 126645152Sphk 126745152Sphk /* Find the protocol position in inetsw[] and set the index. 
*/ 126845152Sphk for (pr = inetdomain.dom_protosw; 1269148887Srwatson pr < inetdomain.dom_protoswNPROTOSW; pr++) { 127045152Sphk if (pr->pr_domain->dom_family == PF_INET && 127145152Sphk pr->pr_protocol && pr->pr_protocol == ipproto) { 1272139365Srik /* Be careful to only index valid IP protocols. */ 1273139365Srik if (pr->pr_protocol < IPPROTO_MAX) { 127445152Sphk ip_protox[pr->pr_protocol] = pr - inetsw; 127545152Sphk return (0); 1276148887Srwatson } else 127745152Sphk return (EINVAL); 127845152Sphk } 127945152Sphk } 12804910Swollman return (EPROTONOSUPPORT); 128111189Sjkh} 128225944Sjoerg 128325944Sjoergint 128425944Sjoergipproto_unregister(u_char ipproto) 128525944Sjoerg{ 128625944Sjoerg struct protosw *pr; 128725944Sjoerg 128825944Sjoerg /* Sanity checks. */ 128925944Sjoerg if (ipproto == 0) 12904910Swollman return (EPROTONOSUPPORT); 129125944Sjoerg 129225944Sjoerg /* Check if the protocol was indeed registered. */ 129325944Sjoerg pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); 129425944Sjoerg if (pr == NULL) 129525944Sjoerg return (EPFNOSUPPORT); 129625944Sjoerg if (ip_protox[ipproto] == pr - inetsw) /* IPPROTO_RAW */ 12974910Swollman return (ENOENT); 129825944Sjoerg 129925944Sjoerg /* Reset the protocol slot to IPPROTO_RAW. */ 130025944Sjoerg ip_protox[ipproto] = pr - inetsw; 130125944Sjoerg return (0); 130211189Sjkh} 130325944Sjoerg 130425944Sjoerg/* 130525944Sjoerg * Given address of next destination (final or next hop), 130625944Sjoerg * return internet address info of interface to be used to get there. 
13074910Swollman */ 130825944Sjoergstruct in_ifaddr * 130925944Sjoergip_rtaddr(struct in_addr dst, u_int fibnum) 131025944Sjoerg{ 13114910Swollman struct route sro; 131211189Sjkh struct sockaddr_in *sin; 131330300Sjoerg struct in_ifaddr *ifa; 131430300Sjoerg 131530300Sjoerg bzero(&sro, sizeof(sro)); 131630300Sjoerg sin = (struct sockaddr_in *)&sro.ro_dst; 131730300Sjoerg sin->sin_family = AF_INET; 131825944Sjoerg sin->sin_len = sizeof(*sin); 131930300Sjoerg sin->sin_addr = dst; 13204910Swollman in_rtalloc_ign(&sro, 0, fibnum); 1321138745Srik 132225944Sjoerg if (sro.ro_rt == NULL) 132330300Sjoerg return (NULL); 13244910Swollman 13254910Swollman ifa = ifatoia(sro.ro_rt->rt_ifa); 132670199Sjhay RTFREE(sro.ro_rt); 132725944Sjoerg return (ifa); 132825944Sjoerg} 132925944Sjoerg 13304910Swollmanu_char inetctlerrmap[PRC_NCMDS] = { 13314910Swollman 0, 0, 0, 0, 13324910Swollman 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, 133330300Sjoerg EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, 133425706Sjoerg EMSGSIZE, EHOSTUNREACH, 0, 0, 13354910Swollman 0, 0, EHOSTUNREACH, 0, 133625944Sjoerg ENOPROTOOPT, ECONNREFUSED 13374910Swollman}; 133830300Sjoerg 13394910Swollman/* 134027929Sitojun * Forward a packet. If some error occurs return the sender 134125706Sjoerg * an icmp packet. Note we can't always generate a meaningful 134225706Sjoerg * icmp message because icmp doesn't have a large enough repertoire 134340008Sjoerg * of codes and types. 134440008Sjoerg * 13454910Swollman * If not forwarding, just drop the packet. This could be confusing 13464910Swollman * if ipforwarding was zero but some routing protocol was advancing 13474910Swollman * us as a gateway to somewhere. However, we must let the routing 134825706Sjoerg * protocol deal with that. 134925706Sjoerg * 135040008Sjoerg * The srcrt parameter indicates whether the packet is being forwarded 135125706Sjoerg * via a source route. 
135240008Sjoerg */ 135340008Sjoergvoid 135440008Sjoergip_forward(struct mbuf *m, int srcrt) 13554910Swollman{ 13564910Swollman INIT_VNET_INET(curvnet); 135725706Sjoerg struct ip *ip = mtod(m, struct ip *); 135869211Sphk struct in_ifaddr *ia = NULL; 135940008Sjoerg struct mbuf *mcopy; 13604910Swollman struct in_addr dest; 13614910Swollman struct route ro; 13624910Swollman int error, type = 0, code = 0, mtu = 0; 13634910Swollman 13644910Swollman if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { 13654910Swollman IPSTAT_INC(ips_cantforward); 136678064Sume m_freem(m); 136778064Sume return; 13684910Swollman } 13694910Swollman#ifdef IPSTEALTH 137011189Sjkh if (!V_ipstealth) { 137140008Sjoerg#endif 137240008Sjoerg if (ip->ip_ttl <= IPTTLDEC) { 137311189Sjkh icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 137411189Sjkh 0, 0); 137511189Sjkh return; 137626018Sjoerg } 137711189Sjkh#ifdef IPSTEALTH 137811189Sjkh } 13794910Swollman#endif 13804910Swollman 13814910Swollman ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m)); 138278064Sume if (!srcrt && ia == NULL) { 138311189Sjkh icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); 138411189Sjkh return; 138530300Sjoerg } 138611189Sjkh 1387148887Srwatson /* 138830300Sjoerg * Save the IP header and at most 8 bytes of the payload, 138940008Sjoerg * in case we need to generate an ICMP message to the src. 139011189Sjkh * 13914910Swollman * XXX this can be optimized a lot by saving the data in a local 13924910Swollman * buffer on the stack (72 bytes at most), and only allocating the 139330300Sjoerg * mbuf if really necessary. The vast majority of the packets 139430300Sjoerg * are forwarded without having to send an ICMP back (either 139530300Sjoerg * because unnecessary, or because rate limited), so we are 13964910Swollman * really we are wasting a lot of work here. 13974910Swollman * 13984910Swollman * We don't use m_copy() because it might return a reference 13994910Swollman * to a shared cluster. 
Both this function and ip_output() 14004910Swollman * assume exclusive access to the IP header in `m', so any 140125944Sjoerg * data in a cluster may change before we reach icmp_error(). 14024910Swollman */ 140312820Sphk MGETHDR(mcopy, M_DONTWAIT, m->m_type); 140425944Sjoerg if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_DONTWAIT)) { 140525944Sjoerg /* 140625944Sjoerg * It's probably ok if the pkthdr dup fails (because 140725944Sjoerg * the deep copy of the tag chain failed), but for now 140825944Sjoerg * be conservative and just discard the copy since 140925944Sjoerg * code below may some day want the tags. 141035029Sphk */ 141125944Sjoerg m_free(mcopy); 141236119Sphk mcopy = NULL; 141370199Sjhay } 1414111119Simp if (mcopy != NULL) { 141525944Sjoerg mcopy->m_len = min(ip->ip_len, M_TRAILINGSPACE(mcopy)); 141625944Sjoerg mcopy->m_pkthdr.len = mcopy->m_len; 141725944Sjoerg m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); 141825944Sjoerg } 141925944Sjoerg 142025944Sjoerg#ifdef IPSTEALTH 142125944Sjoerg if (!V_ipstealth) { 142225944Sjoerg#endif 142325944Sjoerg ip->ip_ttl -= IPTTLDEC; 142425944Sjoerg#ifdef IPSTEALTH 142525944Sjoerg } 142625944Sjoerg#endif 142725944Sjoerg 142825944Sjoerg /* 142925944Sjoerg * If forwarding packet using same interface that it came in on, 143040008Sjoerg * perhaps should send a redirect to sender to shortcut a hop. 143135029Sphk * Only send redirect if source is sending directly to us, 143235029Sphk * and if packet was not source routed (or has any options). 143325944Sjoerg * Also, don't send redirect if forwarding using a default route 143425944Sjoerg * or a route modified by a redirect. 
143525944Sjoerg */ 143640008Sjoerg dest.s_addr = 0; 143740008Sjoerg if (!srcrt && V_ipsendredirects && ia->ia_ifp == m->m_pkthdr.rcvif) { 143840008Sjoerg struct sockaddr_in *sin; 143925944Sjoerg struct rtentry *rt; 144069152Sjlemon 144169152Sjlemon bzero(&ro, sizeof(ro)); 144225944Sjoerg sin = (struct sockaddr_in *)&ro.ro_dst; 144325944Sjoerg sin->sin_family = AF_INET; 144470199Sjhay sin->sin_len = sizeof(*sin); 144525944Sjoerg sin->sin_addr = ip->ip_dst; 144625944Sjoerg in_rtalloc_ign(&ro, 0, M_GETFIB(m)); 144725944Sjoerg 144825944Sjoerg rt = ro.ro_rt; 144925944Sjoerg 145025944Sjoerg if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && 145125944Sjoerg satosin(rt_key(rt))->sin_addr.s_addr != 0) { 145225706Sjoerg#define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) 145325706Sjoerg u_long src = ntohl(ip->ip_src.s_addr); 14544910Swollman 145525944Sjoerg if (RTA(rt) && 14564910Swollman (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { 14574910Swollman if (rt->rt_flags & RTF_GATEWAY) 14584910Swollman dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr; 14594910Swollman else 14604910Swollman dest.s_addr = ip->ip_dst.s_addr; 14614910Swollman /* Router requirements says to only send host redirects */ 1462111119Simp type = ICMP_REDIRECT; 14634910Swollman code = ICMP_REDIRECT_HOST; 14644910Swollman } 14654910Swollman } 14664910Swollman if (rt) 14674910Swollman RTFREE(rt); 14684910Swollman } 14694910Swollman 14704910Swollman /* 14714910Swollman * Try to cache the route MTU from ip_output so we can consider it for 14724910Swollman * the ICMP_UNREACH_NEEDFRAG "Next-Hop MTU" field described in RFC1191. 
14734910Swollman */ 14744910Swollman bzero(&ro, sizeof(ro)); 14754910Swollman 14764910Swollman error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL); 14774910Swollman 14784910Swollman if (error == EMSGSIZE && ro.ro_rt) 14794910Swollman mtu = ro.ro_rt->rt_rmx.rmx_mtu; 148025706Sjoerg if (ro.ro_rt) 148140008Sjoerg RTFREE(ro.ro_rt); 148240008Sjoerg 148325944Sjoerg if (error) 148425944Sjoerg IPSTAT_INC(ips_cantforward); 148525944Sjoerg else { 148644145Sphk IPSTAT_INC(ips_forward); 148769211Sphk if (type) 14884910Swollman IPSTAT_INC(ips_redirectsent); 148969152Sjlemon else { 149069152Sjlemon if (mcopy) 14914910Swollman m_freem(mcopy); 14924910Swollman return; 14934910Swollman } 149425944Sjoerg } 14954910Swollman if (mcopy == NULL) 149612820Sphk return; 149725944Sjoerg 14984910Swollman switch (error) { 149925944Sjoerg 150025944Sjoerg case 0: /* forwarded, but need redirect */ 150125944Sjoerg /* type, code set above */ 150225944Sjoerg break; 150325944Sjoerg 15044910Swollman case ENETUNREACH: /* shouldn't happen, checked above */ 150525944Sjoerg case EHOSTUNREACH: 150625944Sjoerg case ENETDOWN: 150725944Sjoerg case EHOSTDOWN: 150840008Sjoerg default: 150940008Sjoerg type = ICMP_UNREACH; 15104910Swollman code = ICMP_UNREACH_HOST; 151125944Sjoerg break; 151225944Sjoerg 151325944Sjoerg case EMSGSIZE: 151425944Sjoerg type = ICMP_UNREACH; 151540008Sjoerg code = ICMP_UNREACH_NEEDFRAG; 151640008Sjoerg 151725944Sjoerg#ifdef IPSEC 151825944Sjoerg /* 151944145Sphk * If IPsec is configured for this path, 152069211Sphk * override any possibly mtu value set by ip_output. 152125944Sjoerg */ 152225944Sjoerg mtu = ip_ipsec_mtu(m, mtu); 152325944Sjoerg#endif /* IPSEC */ 152430300Sjoerg /* 152525944Sjoerg * If the MTU was set before make sure we are below the 152625944Sjoerg * interface MTU. 152725944Sjoerg * If the MTU wasn't set before use the interface mtu or 152825944Sjoerg * fall back to the next smaller mtu step compared to the 152969211Sphk * current packet size. 
153040008Sjoerg */ 153125944Sjoerg if (mtu != 0) { 153225944Sjoerg if (ia != NULL) 153325944Sjoerg mtu = min(mtu, ia->ia_ifp->if_mtu); 153425944Sjoerg } else { 153530300Sjoerg if (ia != NULL) 153630300Sjoerg mtu = ia->ia_ifp->if_mtu; 153730300Sjoerg else 153830300Sjoerg mtu = ip_next_mtu(ip->ip_len, 0); 153930300Sjoerg } 154030300Sjoerg IPSTAT_INC(ips_cantfrag); 154130300Sjoerg break; 154230300Sjoerg 154330300Sjoerg case ENOBUFS: 154430300Sjoerg /* 154525944Sjoerg * A router should not generate ICMP_SOURCEQUENCH as 154625944Sjoerg * required in RFC1812 Requirements for IP Version 4 Routers. 154725944Sjoerg * Source quench could be a big problem under DoS attacks, 154825944Sjoerg * or if the underlying interface is rate-limited. 154925944Sjoerg * Those who need source quench packets may re-enable them 1550102412Scharnier * via the net.inet.ip.sendsourcequench sysctl. 155125944Sjoerg */ 155225944Sjoerg if (V_ip_sendsourcequench == 0) { 155370199Sjhay m_freem(mcopy); 155470199Sjhay return; 155570199Sjhay } else { 155670199Sjhay type = ICMP_SOURCEQUENCH; 155770199Sjhay code = 0; 155870199Sjhay } 155970199Sjhay break; 156070199Sjhay 156170199Sjhay case EACCES: /* ipfw denied packet */ 156270199Sjhay m_freem(mcopy); 156325944Sjoerg return; 156425944Sjoerg } 156525944Sjoerg icmp_error(mcopy, type, code, dest.s_addr, mtu); 156625944Sjoerg} 156725944Sjoerg 156825944Sjoergvoid 156925944Sjoergip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, 157025944Sjoerg struct mbuf *m) 157125944Sjoerg{ 157225944Sjoerg INIT_VNET_NET(inp->inp_vnet); 157325944Sjoerg 157425944Sjoerg if (inp->inp_socket->so_options & (SO_BINTIME | SO_TIMESTAMP)) { 157525944Sjoerg struct bintime bt; 157640008Sjoerg 157740008Sjoerg bintime(&bt); 157826077Sjoerg if (inp->inp_socket->so_options & SO_BINTIME) { 157925944Sjoerg *mp = sbcreatecontrol((caddr_t) &bt, sizeof(bt), 158025944Sjoerg SCM_BINTIME, SOL_SOCKET); 158125944Sjoerg if (*mp) 158225944Sjoerg mp = &(*mp)->m_next; 158325944Sjoerg } 
158440008Sjoerg if (inp->inp_socket->so_options & SO_TIMESTAMP) { 158540008Sjoerg struct timeval tv; 158625944Sjoerg 158725944Sjoerg bintime2timeval(&bt, &tv); 158825944Sjoerg *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), 158925944Sjoerg SCM_TIMESTAMP, SOL_SOCKET); 159025944Sjoerg if (*mp) 159125944Sjoerg mp = &(*mp)->m_next; 159225944Sjoerg } 159325944Sjoerg } 159469211Sphk if (inp->inp_flags & INP_RECVDSTADDR) { 159540008Sjoerg *mp = sbcreatecontrol((caddr_t) &ip->ip_dst, 159625944Sjoerg sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); 159725944Sjoerg if (*mp) 159825944Sjoerg mp = &(*mp)->m_next; 159925944Sjoerg } 160025944Sjoerg if (inp->inp_flags & INP_RECVTTL) { 160125944Sjoerg *mp = sbcreatecontrol((caddr_t) &ip->ip_ttl, 160225944Sjoerg sizeof(u_char), IP_RECVTTL, IPPROTO_IP); 160325944Sjoerg if (*mp) 160425944Sjoerg mp = &(*mp)->m_next; 160525944Sjoerg } 160625944Sjoerg#ifdef notyet 160725944Sjoerg /* XXX 160825944Sjoerg * Moving these out of udp_input() made them even more broken 160925944Sjoerg * than they already were. 
161025944Sjoerg */ 161125944Sjoerg /* options were tossed already */ 161225944Sjoerg if (inp->inp_flags & INP_RECVOPTS) { 161325944Sjoerg *mp = sbcreatecontrol((caddr_t) opts_deleted_above, 1614102412Scharnier sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); 161525944Sjoerg if (*mp) 161625944Sjoerg mp = &(*mp)->m_next; 161725944Sjoerg } 161825944Sjoerg /* ip_srcroute doesn't do what we want here, need to fix */ 161925944Sjoerg if (inp->inp_flags & INP_RECVRETOPTS) { 162025944Sjoerg *mp = sbcreatecontrol((caddr_t) ip_srcroute(m), 162125944Sjoerg sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); 162225944Sjoerg if (*mp) 162340008Sjoerg mp = &(*mp)->m_next; 162440008Sjoerg } 162525944Sjoerg#endif 162625944Sjoerg if (inp->inp_flags & INP_RECVIF) { 162725944Sjoerg struct ifnet *ifp; 162840008Sjoerg struct sdlbuf { 162940008Sjoerg struct sockaddr_dl sdl; 163025944Sjoerg u_char pad[32]; 163125944Sjoerg } sdlbuf; 163225944Sjoerg struct sockaddr_dl *sdp; 163325944Sjoerg struct sockaddr_dl *sdl2 = &sdlbuf.sdl; 163425944Sjoerg 163525944Sjoerg if (((ifp = m->m_pkthdr.rcvif)) 163625944Sjoerg && ( ifp->if_index && (ifp->if_index <= V_if_index))) { 163725944Sjoerg sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr; 163825944Sjoerg /* 163969211Sphk * Change our mind and don't try copy. 
164040008Sjoerg */ 164125944Sjoerg if ((sdp->sdl_family != AF_LINK) 164225944Sjoerg || (sdp->sdl_len > sizeof(sdlbuf))) { 164325944Sjoerg goto makedummy; 164425944Sjoerg } 164525944Sjoerg bcopy(sdp, sdl2, sdp->sdl_len); 164625944Sjoerg } else { 164725944Sjoergmakedummy: 164825944Sjoerg sdl2->sdl_len 16494910Swollman = offsetof(struct sockaddr_dl, sdl_data[0]); 165025944Sjoerg sdl2->sdl_family = AF_LINK; 165125944Sjoerg sdl2->sdl_index = 0; 165225944Sjoerg sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; 165325944Sjoerg } 165425944Sjoerg *mp = sbcreatecontrol((caddr_t) sdl2, sdl2->sdl_len, 165525944Sjoerg IP_RECVIF, IPPROTO_IP); 165625944Sjoerg if (*mp) 165725944Sjoerg mp = &(*mp)->m_next; 165870199Sjhay } 165970199Sjhay} 166070199Sjhay 166170199Sjhay/* 166270199Sjhay * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the 166370199Sjhay * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on 166470199Sjhay * locking. This code remains in ip_input.c as ip_mroute.c is optionally 166525944Sjoerg * compiled. 166625944Sjoerg */ 166725944Sjoergint 166825944Sjoergip_rsvp_init(struct socket *so) 1669102412Scharnier{ 167025944Sjoerg INIT_VNET_INET(so->so_vnet); 167152633Sjoerg 167225944Sjoerg if (so->so_type != SOCK_RAW || 167325944Sjoerg so->so_proto->pr_protocol != IPPROTO_RSVP) 167425944Sjoerg return EOPNOTSUPP; 167525944Sjoerg 167625944Sjoerg if (V_ip_rsvpd != NULL) 167725944Sjoerg return EADDRINUSE; 167840008Sjoerg 167940008Sjoerg V_ip_rsvpd = so; 168025944Sjoerg /* 168125944Sjoerg * This may seem silly, but we need to be sure we don't over-increment 168225944Sjoerg * the RSVP counter, in case something slips up. 
168325944Sjoerg */ 168425944Sjoerg if (!V_ip_rsvp_on) { 16854910Swollman V_ip_rsvp_on = 1; 168625944Sjoerg V_rsvp_on++; 168725944Sjoerg } 168825944Sjoerg 168925944Sjoerg return 0; 169025944Sjoerg} 1691102412Scharnier 169225944Sjoergint 169325944Sjoergip_rsvp_done(void) 169425944Sjoerg{ 169525944Sjoerg INIT_VNET_INET(curvnet); 169625944Sjoerg 169725944Sjoerg V_ip_rsvpd = NULL; 169825944Sjoerg /* 169925944Sjoerg * This may seem silly, but we need to be sure we don't over-decrement 170040008Sjoerg * the RSVP counter, in case something slips up. 170140008Sjoerg */ 170225944Sjoerg if (V_ip_rsvp_on) { 170325944Sjoerg V_ip_rsvp_on = 0; 170425944Sjoerg V_rsvp_on--; 170525944Sjoerg } 170625944Sjoerg return 0; 170725944Sjoerg} 170825944Sjoerg 170925944Sjoergvoid 171025944Sjoergrsvp_input(struct mbuf *m, int off) /* XXX must fixup manually */ 171140008Sjoerg{ 171240008Sjoerg INIT_VNET_INET(curvnet); 171325944Sjoerg 171425944Sjoerg if (rsvp_input_p) { /* call the real one if loaded */ 171525944Sjoerg rsvp_input_p(m, off); 171625944Sjoerg return; 171725944Sjoerg } 171825944Sjoerg 171925944Sjoerg /* Can still get packets with rsvp_on = 0 if there is a local member 172025944Sjoerg * of the group to which the RSVP packet is addressed. But in this 172125944Sjoerg * case we want to throw the packet away. 172225944Sjoerg */ 172325944Sjoerg 172425944Sjoerg if (!V_rsvp_on) { 172525944Sjoerg m_freem(m); 172641881Sphk return; 172725944Sjoerg } 172825944Sjoerg 172925944Sjoerg if (V_ip_rsvpd != NULL) { 173041881Sphk rip_input(m, off); 173125944Sjoerg return; 173225944Sjoerg } 173325944Sjoerg /* Drop the packet */ 173425944Sjoerg m_freem(m); 173525944Sjoerg} 173625944Sjoerg