1139823Simp/*- 217072Sjulian * Copyright (c) 1982, 1986, 1988, 1993 317072Sjulian * The Regents of the University of California. All rights reserved. 417072Sjulian * 517072Sjulian * Redistribution and use in source and binary forms, with or without 617072Sjulian * modification, are permitted provided that the following conditions 717072Sjulian * are met: 817072Sjulian * 1. Redistributions of source code must retain the above copyright 917072Sjulian * notice, this list of conditions and the following disclaimer. 1017072Sjulian * 2. Redistributions in binary form must reproduce the above copyright 1117072Sjulian * notice, this list of conditions and the following disclaimer in the 1217072Sjulian * documentation and/or other materials provided with the distribution. 1317072Sjulian * 4. Neither the name of the University nor the names of its contributors 1417072Sjulian * may be used to endorse or promote products derived from this software 1517072Sjulian * without specific prior written permission. 1617072Sjulian * 1717072Sjulian * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 1817072Sjulian * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1917072Sjulian * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 2017072Sjulian * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 2117072Sjulian * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 2217072Sjulian * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2317072Sjulian * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2417072Sjulian * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2517072Sjulian * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 2617072Sjulian * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 2717072Sjulian * SUCH DAMAGE. 2817072Sjulian */ 2917072Sjulian 30172467Ssilby#include <sys/cdefs.h> 31172467Ssilby__FBSDID("$FreeBSD$"); 32172467Ssilby 33136714Sandre#if !defined(KLD_MODULE) 3432350Seivind#include "opt_inet.h" 35188066Srrs#include "opt_sctp.h" 3632350Seivind#ifndef INET 3736079Swollman#error "IPDIVERT requires INET." 3832350Seivind#endif 39136714Sandre#endif 40223593Sglebius#include "opt_inet6.h" 4132350Seivind 4217072Sjulian#include <sys/param.h> 4364192Sru#include <sys/kernel.h> 4495759Stanimura#include <sys/lock.h> 4517072Sjulian#include <sys/malloc.h> 4617072Sjulian#include <sys/mbuf.h> 47136714Sandre#include <sys/module.h> 48136714Sandre#include <sys/kernel.h> 49164033Srwatson#include <sys/priv.h> 5086183Srwatson#include <sys/proc.h> 5195759Stanimura#include <sys/protosw.h> 5217072Sjulian#include <sys/socket.h> 5317072Sjulian#include <sys/socketvar.h> 5464192Sru#include <sys/sysctl.h> 55201735Sluigi#include <net/vnet.h> 5634923Sbde 5717072Sjulian#include <net/if.h> 58171746Scsjp#include <net/netisr.h> 5917072Sjulian 6017072Sjulian#include <netinet/in.h> 6195759Stanimura#include <netinet/in_pcb.h> 6217072Sjulian#include <netinet/in_systm.h> 6395759Stanimura#include <netinet/in_var.h> 6417072Sjulian#include <netinet/ip.h> 6517072Sjulian#include <netinet/ip_var.h> 66223593Sglebius#ifdef INET6 67223593Sglebius#include <netinet/ip6.h> 68223593Sglebius#include <netinet6/ip6_var.h> 69223593Sglebius#endif 70188066Srrs#ifdef SCTP 71188066Srrs#include <netinet/sctp_crc32.h> 72188066Srrs#endif 7317072Sjulian 74163606Srwatson#include <security/mac/mac_framework.h> 75163606Srwatson 7617072Sjulian/* 7717072Sjulian * Divert sockets 7817072Sjulian */ 7917072Sjulian 8017072Sjulian/* 8117072Sjulian * Allocate enough space to hold a full IP packet 8217072Sjulian */ 8317072Sjulian#define DIVSNDQ (65536 + 100) 8417072Sjulian#define DIVRCVQ (65536 + 100) 8517072Sjulian 8617072Sjulian/* 87201735Sluigi * Divert sockets work in conjunction with ipfw or other packet filters, 88201735Sluigi * see the divert(4) manpage for features. 89201735Sluigi * Packets are selected by the packet filter and tagged with an 90201735Sluigi * MTAG_IPFW_RULE tag carrying the 'divert port' number (as set by 91201735Sluigi * the packet filter) and information on the matching filter rule for 92201735Sluigi * subsequent reinjection. The divert_port is used to put the packet 93201735Sluigi * on the corresponding divert socket, while the rule number is passed 94201735Sluigi * up (at least partially) as the sin_port in the struct sockaddr. 9536369Sjulian * 96201735Sluigi * Packets written to the divert socket carry in sin_addr a 97201735Sluigi * destination address, and in sin_port the number of the filter rule 98201735Sluigi * after which to continue processing. 99201735Sluigi * If the destination address is INADDR_ANY, the packet is treated as 100201735Sluigi * as outgoing and sent to ip_output(); otherwise it is treated as 101201735Sluigi * incoming and sent to ip_input(). 102201735Sluigi * Further, sin_zero carries some information on the interface, 103201735Sluigi * which can be used in the reinject -- see comments in the code. 10454175Sarchie * 10598613Sluigi * On reinjection, processing in ip_input() and ip_output() 10698613Sluigi * will be exactly the same as for the original packet, except that 107201735Sluigi * packet filter processing will start at the rule number after the one 108201735Sluigi * written in the sin_port (ipfw does not allow a rule #0, so sin_port=0 109201735Sluigi * will apply the entire ruleset to the packet). 11017072Sjulian */ 11117072Sjulian 112136714Sandre/* Internal variables. */ 113215701Sdimstatic VNET_DEFINE(struct inpcbhead, divcb); 114215701Sdimstatic VNET_DEFINE(struct inpcbinfo, divcbinfo); 11517072Sjulian 116195727Srwatson#define V_divcb VNET(divcb) 117195727Srwatson#define V_divcbinfo VNET(divcbinfo) 118195699Srwatson 11917072Sjulianstatic u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ 12017072Sjulianstatic u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */ 12117072Sjulian 122196502Szecstatic eventhandler_tag ip_divert_event_tag; 123196502Szec 12417072Sjulian/* 12517072Sjulian * Initialize divert connection block queue. 12617072Sjulian */ 127157927Spsstatic void 128157927Spsdiv_zone_change(void *tag) 129157927Sps{ 130157927Sps 131181803Sbz uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets); 132157927Sps} 133157927Sps 134160491Supsstatic int 135160491Supsdiv_inpcb_init(void *mem, int size, int flags) 136160491Sups{ 137165634Sjhb struct inpcb *inp = mem; 138165634Sjhb 139160491Sups INP_LOCK_INIT(inp, "inp", "divinp"); 140160491Sups return (0); 141160491Sups} 142160491Sups 143160491Supsstatic void 144160491Supsdiv_inpcb_fini(void *mem, int size) 145160491Sups{ 146165634Sjhb struct inpcb *inp = mem; 147165634Sjhb 148160491Sups INP_LOCK_DESTROY(inp); 149160491Sups} 150160491Sups 151196502Szecstatic void 15217072Sjuliandiv_init(void) 15317072Sjulian{ 154169454Srwatson 15517072Sjulian /* 156205157Srwatson * XXX We don't use the hash list for divert IP, but it's easier to 157205157Srwatson * allocate one-entry hash lists than it is to check all over the 158205157Srwatson * place for hashbase == NULL. 15917072Sjulian */ 160205157Srwatson in_pcbinfo_init(&V_divcbinfo, "div", &V_divcb, 1, 1, "divcb", 161222748Srwatson div_inpcb_init, div_inpcb_fini, UMA_ZONE_NOFREE, 162222748Srwatson IPI_HASHFIELDS_NONE); 16317072Sjulian} 16417072Sjulian 165196502Szecstatic void 166196502Szecdiv_destroy(void) 167196502Szec{ 168196502Szec 169205157Srwatson in_pcbinfo_destroy(&V_divcbinfo); 170196502Szec} 171196502Szec 17217072Sjulian/* 173106152Sfenner * IPPROTO_DIVERT is not in the real IP protocol number space; this 174106152Sfenner * function should never be called. Just in case, drop any packets. 17517072Sjulian */ 176201527Sluigistatic void 17782884Sjuliandiv_input(struct mbuf *m, int off) 17817072Sjulian{ 179183550Szec 180196039Srwatson KMOD_IPSTAT_INC(ips_noproto); 18154175Sarchie m_freem(m); 18254175Sarchie} 18354175Sarchie 18454175Sarchie/* 18554175Sarchie * Divert a packet by passing it up to the divert socket at port 'port'. 18654175Sarchie * 18754175Sarchie * Setup generic address and protocol structures for div_input routine, 18854175Sarchie * then pass them along with mbuf chain. 18954175Sarchie */ 190136714Sandrestatic void 191126239Smlaierdivert_packet(struct mbuf *m, int incoming) 19254175Sarchie{ 19326359Sjulian struct ip *ip; 19426359Sjulian struct inpcb *inp; 19526359Sjulian struct socket *sa; 19654175Sarchie u_int16_t nport; 197119752Ssam struct sockaddr_in divsrc; 198126239Smlaier struct m_tag *mtag; 19917072Sjulian 200201527Sluigi mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL); 201126239Smlaier if (mtag == NULL) { 202126239Smlaier m_freem(m); 203126239Smlaier return; 204126239Smlaier } 20526359Sjulian /* Assure header */ 20626359Sjulian if (m->m_len < sizeof(struct ip) && 20798613Sluigi (m = m_pullup(m, sizeof(struct ip))) == 0) 20826359Sjulian return; 20926359Sjulian ip = mtod(m, struct ip *); 21026359Sjulian 211133069Sandre /* Delayed checksums are currently not compatible with divert. */ 212133069Sandre if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 213133069Sandre ip->ip_len = ntohs(ip->ip_len); 214133069Sandre in_delayed_cksum(m); 215133069Sandre m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 216133069Sandre ip->ip_len = htons(ip->ip_len); 217133069Sandre } 218188066Srrs#ifdef SCTP 219188066Srrs if (m->m_pkthdr.csum_flags & CSUM_SCTP) { 220188066Srrs ip->ip_len = ntohs(ip->ip_len); 221205104Srrs sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); 222188066Srrs m->m_pkthdr.csum_flags &= ~CSUM_SCTP; 223188066Srrs ip->ip_len = htons(ip->ip_len); 224188066Srrs } 225188066Srrs#endif 226201527Sluigi bzero(&divsrc, sizeof(divsrc)); 227201527Sluigi divsrc.sin_len = sizeof(divsrc); 228201527Sluigi divsrc.sin_family = AF_INET; 229201527Sluigi /* record matching rule, in host format */ 230201527Sluigi divsrc.sin_port = ((struct ipfw_rule_ref *)(mtag+1))->rulenum; 23137433Sjulian /* 23254175Sarchie * Record receive interface address, if any. 23337433Sjulian * But only for incoming packets. 23437433Sjulian */ 23554175Sarchie if (incoming) { 23617072Sjulian struct ifaddr *ifa; 237191287Srwatson struct ifnet *ifp; 23817072Sjulian 23926359Sjulian /* Sanity check */ 240113255Sdes M_ASSERTPKTHDR(m); 24126359Sjulian 24236364Sjulian /* Find IP address for receive interface */ 243191287Srwatson ifp = m->m_pkthdr.rcvif; 244195023Srwatson if_addr_rlock(ifp); 245191287Srwatson TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 24617072Sjulian if (ifa->ifa_addr->sa_family != AF_INET) 24717072Sjulian continue; 24817072Sjulian divsrc.sin_addr = 24917072Sjulian ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; 25017072Sjulian break; 25117072Sjulian } 252195023Srwatson if_addr_runlock(ifp); 25336903Sjulian } 25437433Sjulian /* 25537433Sjulian * Record the incoming interface name whenever we have one. 25637433Sjulian */ 25736903Sjulian if (m->m_pkthdr.rcvif) { 25836364Sjulian /* 25936364Sjulian * Hide the actual interface name in there in the 26036364Sjulian * sin_zero array. XXX This needs to be moved to a 26136364Sjulian * different sockaddr type for divert, e.g. 26236364Sjulian * sockaddr_div with multiple fields like 26336364Sjulian * sockaddr_dl. Presently we have only 7 bytes 26436364Sjulian * but that will do for now as most interfaces 26536364Sjulian * are 4 or less + 2 or less bytes for unit. 26636364Sjulian * There is probably a faster way of doing this, 26736364Sjulian * possibly taking it from the sockaddr_dl on the iface. 26836364Sjulian * This solves the problem of a P2P link and a LAN interface 26936364Sjulian * having the same address, which can result in the wrong 27036364Sjulian * interface being assigned to the packet when fed back 27136364Sjulian * into the divert socket. Theoretically if the daemon saves 27236364Sjulian * and re-uses the sockaddr_in as suggested in the man pages, 27336364Sjulian * this iface name will come along for the ride. 27436364Sjulian * (see div_output for the other half of this.) 27536364Sjulian */ 276121816Sbrooks strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname, 277121816Sbrooks sizeof(divsrc.sin_zero)); 27817072Sjulian } 27917072Sjulian 28017072Sjulian /* Put packet on socket queue, if any */ 28117072Sjulian sa = NULL; 282201527Sluigi nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info)); 283181803Sbz INP_INFO_RLOCK(&V_divcbinfo); 284181803Sbz LIST_FOREACH(inp, &V_divcb, inp_list) { 285119752Ssam /* XXX why does only one socket match? */ 286119752Ssam if (inp->inp_lport == nport) { 287180851Smav INP_RLOCK(inp); 28817072Sjulian sa = inp->inp_socket; 289131151Srwatson SOCKBUF_LOCK(&sa->so_rcv); 290131151Srwatson if (sbappendaddr_locked(&sa->so_rcv, 291119752Ssam (struct sockaddr *)&divsrc, m, 292131151Srwatson (struct mbuf *)0) == 0) { 293131208Sphk SOCKBUF_UNLOCK(&sa->so_rcv); 294119752Ssam sa = NULL; /* force mbuf reclaim below */ 295131151Srwatson } else 296131151Srwatson sorwakeup_locked(sa); 297178376Srwatson INP_RUNLOCK(inp); 298119752Ssam break; 299119752Ssam } 30017072Sjulian } 301181803Sbz INP_INFO_RUNLOCK(&V_divcbinfo); 302119752Ssam if (sa == NULL) { 30317072Sjulian m_freem(m); 304196039Srwatson KMOD_IPSTAT_INC(ips_noproto); 305196039Srwatson KMOD_IPSTAT_DEC(ips_delivered); 30617072Sjulian } 30717072Sjulian} 30817072Sjulian 30917072Sjulian/* 31017072Sjulian * Deliver packet back into the IP processing machinery. 31117072Sjulian * 31217072Sjulian * If no address specified, or address is 0.0.0.0, send to ip_output(); 31317072Sjulian * otherwise, send to ip_input() and mark as having been received on 31417072Sjulian * the interface with that address. 31517072Sjulian */ 31617072Sjulianstatic int 317169454Srwatsondiv_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, 318169454Srwatson struct mbuf *control) 31917072Sjulian{ 320223593Sglebius struct ip *const ip = mtod(m, struct ip *); 321136073Sgreen struct m_tag *mtag; 322201527Sluigi struct ipfw_rule_ref *dt; 32317072Sjulian int error = 0; 32417072Sjulian 325146182Sglebius /* 326146182Sglebius * An mbuf may hasn't come from userland, but we pretend 327146182Sglebius * that it has. 328146182Sglebius */ 329137630Sglebius m->m_pkthdr.rcvif = NULL; 330146182Sglebius m->m_nextpkt = NULL; 331185101Sjulian M_SETFIB(m, so->so_fibnum); 33298613Sluigi 33317072Sjulian if (control) 33417072Sjulian m_freem(control); /* XXX */ 33517072Sjulian 336201527Sluigi mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL); 337201527Sluigi if (mtag == NULL) { 338201527Sluigi /* this should be normal */ 339201527Sluigi mtag = m_tag_alloc(MTAG_IPFW_RULE, 0, 340201527Sluigi sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO); 341137630Sglebius if (mtag == NULL) { 342137630Sglebius error = ENOBUFS; 343137630Sglebius goto cantsend; 344137630Sglebius } 345137630Sglebius m_tag_prepend(m, mtag); 346201527Sluigi } 347201527Sluigi dt = (struct ipfw_rule_ref *)(mtag+1); 348136073Sgreen 34936903Sjulian /* Loopback avoidance and state recovery */ 35036707Sjulian if (sin) { 35198613Sluigi int i; 35237433Sjulian 353201527Sluigi /* set the starting point. We provide a non-zero slot, 354201527Sluigi * but a non_matching chain_id to skip that info and use 355201527Sluigi * the rulenum/rule_id. 356201527Sluigi */ 357201527Sluigi dt->slot = 1; /* dummy, chain_id is invalid */ 358201527Sluigi dt->chain_id = 0; 359201527Sluigi dt->rulenum = sin->sin_port+1; /* host format ? */ 360201527Sluigi dt->rule_id = 0; 36136903Sjulian /* 36298613Sluigi * Find receive interface with the given name, stuffed 36398613Sluigi * (if it exists) in the sin_zero[] field. 36498613Sluigi * The name is user supplied data so don't trust its size 36598613Sluigi * or that it is zero terminated. 36636903Sjulian */ 367110008Sphk for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++) 36898613Sluigi ; 36998613Sluigi if ( i > 0 && i < sizeof(sin->sin_zero)) 37036903Sjulian m->m_pkthdr.rcvif = ifunit(sin->sin_zero); 37136369Sjulian } 37217072Sjulian 37317072Sjulian /* Reinject packet into the system as incoming or outgoing */ 37417072Sjulian if (!sin || sin->sin_addr.s_addr == 0) { 375223593Sglebius struct mbuf *options = NULL; 376122331Ssam struct inpcb *inp; 37798613Sluigi 378201527Sluigi dt->info |= IPFW_IS_DIVERT | IPFW_INFO_OUT; 379122331Ssam inp = sotoinpcb(so); 380178376Srwatson INP_RLOCK(inp); 381223593Sglebius switch (ip->ip_v) { 382223593Sglebius case IPVERSION: 383223593Sglebius /* 384223593Sglebius * Don't allow both user specified and setsockopt 385223593Sglebius * options, and don't allow packet length sizes that 386223593Sglebius * will crash. 387223593Sglebius */ 388223593Sglebius if ((((ip->ip_hl << 2) != sizeof(struct ip)) && 389223593Sglebius inp->inp_options != NULL) || 390223593Sglebius ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) { 391223593Sglebius error = EINVAL; 392223593Sglebius INP_RUNLOCK(inp); 393223593Sglebius goto cantsend; 394223593Sglebius } 395223593Sglebius 396122331Ssam /* Convert fields to host order for ip_output() */ 397122331Ssam ip->ip_len = ntohs(ip->ip_len); 398122331Ssam ip->ip_off = ntohs(ip->ip_off); 399223593Sglebius break; 400223593Sglebius#ifdef INET6 401223593Sglebius case IPV6_VERSION >> 4: 402223593Sglebius { 403223593Sglebius struct ip6_hdr *const ip6 = mtod(m, struct ip6_hdr *); 40417072Sjulian 405223593Sglebius /* Don't allow packet length sizes that will crash */ 406223593Sglebius if (((u_short)ntohs(ip6->ip6_plen) > m->m_pkthdr.len)) { 407223593Sglebius error = EINVAL; 408223593Sglebius INP_RUNLOCK(inp); 409223593Sglebius goto cantsend; 410223593Sglebius } 41117072Sjulian 412223593Sglebius ip6->ip6_plen = ntohs(ip6->ip6_plen); 413224575Sglebius break; 414223593Sglebius } 415223593Sglebius#endif 416223593Sglebius default: 417223593Sglebius error = EINVAL; 418223593Sglebius INP_RUNLOCK(inp); 419223593Sglebius goto cantsend; 420223593Sglebius } 421223593Sglebius 422223593Sglebius /* Send packet to output processing */ 423223593Sglebius KMOD_IPSTAT_INC(ips_rawout); /* XXX */ 424223593Sglebius 425130900Srwatson#ifdef MAC 426223593Sglebius mac_inpcb_create_mbuf(inp, m); 427130900Srwatson#endif 428223593Sglebius /* 429223593Sglebius * Get ready to inject the packet into ip_output(). 430223593Sglebius * Just in case socket options were specified on the 431223593Sglebius * divert socket, we duplicate them. This is done 432223593Sglebius * to avoid having to hold the PCB locks over the call 433223593Sglebius * to ip_output(), as doing this results in a number of 434223593Sglebius * lock ordering complexities. 435223593Sglebius * 436223593Sglebius * Note that we set the multicast options argument for 437223593Sglebius * ip_output() to NULL since it should be invariant that 438223593Sglebius * they are not present. 439223593Sglebius */ 440223593Sglebius KASSERT(inp->inp_moptions == NULL, 441223593Sglebius ("multicast options set on a divert socket")); 442223593Sglebius /* 443223593Sglebius * XXXCSJP: It is unclear to me whether or not it makes 444223593Sglebius * sense for divert sockets to have options. However, 445223593Sglebius * for now we will duplicate them with the INP locks 446223593Sglebius * held so we can use them in ip_output() without 447223593Sglebius * requring a reference to the pcb. 448223593Sglebius */ 449223593Sglebius if (inp->inp_options != NULL) { 450223593Sglebius options = m_dup(inp->inp_options, M_NOWAIT); 451223593Sglebius if (options == NULL) { 452223593Sglebius INP_RUNLOCK(inp); 453223593Sglebius error = ENOBUFS; 454223593Sglebius goto cantsend; 455171746Scsjp } 456223593Sglebius } 457223593Sglebius INP_RUNLOCK(inp); 458223593Sglebius 459223593Sglebius switch (ip->ip_v) { 460223593Sglebius case IPVERSION: 461171746Scsjp error = ip_output(m, options, NULL, 462223593Sglebius ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) 463223593Sglebius | IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL); 464223593Sglebius break; 465223593Sglebius#ifdef INET6 466223593Sglebius case IPV6_VERSION >> 4: 467223593Sglebius error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); 468223593Sglebius break; 469223593Sglebius#endif 470122331Ssam } 471223593Sglebius if (options != NULL) 472223593Sglebius m_freem(options); 47317072Sjulian } else { 474201527Sluigi dt->info |= IPFW_IS_DIVERT | IPFW_INFO_IN; 47536903Sjulian if (m->m_pkthdr.rcvif == NULL) { 47643763Sjulian /* 47798613Sluigi * No luck with the name, check by IP address. 47898613Sluigi * Clear the port and the ifname to make sure 47998613Sluigi * there are no distractions for ifa_ifwithaddr. 48043763Sjulian */ 48198613Sluigi struct ifaddr *ifa; 48298613Sluigi 48343763Sjulian bzero(sin->sin_zero, sizeof(sin->sin_zero)); 48443763Sjulian sin->sin_port = 0; 48598613Sluigi ifa = ifa_ifwithaddr((struct sockaddr *) sin); 48698613Sluigi if (ifa == NULL) { 48736364Sjulian error = EADDRNOTAVAIL; 48836364Sjulian goto cantsend; 48936364Sjulian } 49036364Sjulian m->m_pkthdr.rcvif = ifa->ifa_ifp; 491194760Srwatson ifa_free(ifa); 49217072Sjulian } 493130900Srwatson#ifdef MAC 494172930Srwatson mac_socket_create_mbuf(so, m); 495130900Srwatson#endif 496171746Scsjp /* Send packet to input processing via netisr */ 497223593Sglebius switch (ip->ip_v) { 498223593Sglebius case IPVERSION: 499223593Sglebius netisr_queue_src(NETISR_IP, (uintptr_t)so, m); 500223593Sglebius break; 501223593Sglebius#ifdef INET6 502223593Sglebius case IPV6_VERSION >> 4: 503223593Sglebius netisr_queue_src(NETISR_IPV6, (uintptr_t)so, m); 504223593Sglebius break; 505223593Sglebius#endif 506223593Sglebius default: 507223593Sglebius error = EINVAL; 508223593Sglebius goto cantsend; 509223593Sglebius } 51017072Sjulian } 51117072Sjulian 512223593Sglebius return (error); 51317072Sjulian 51417072Sjuliancantsend: 51554175Sarchie m_freem(m); 516223593Sglebius return (error); 51717072Sjulian} 51817072Sjulian 51926096Speterstatic int 52083366Sjuliandiv_attach(struct socket *so, int proto, struct thread *td) 52117072Sjulian{ 52226096Speter struct inpcb *inp; 523119752Ssam int error; 52417072Sjulian 52526096Speter inp = sotoinpcb(so); 526157374Srwatson KASSERT(inp == NULL, ("div_attach: inp != NULL")); 527164033Srwatson if (td != NULL) { 528164033Srwatson error = priv_check(td, PRIV_NETINET_DIVERT); 529164033Srwatson if (error) 530164033Srwatson return (error); 531164033Srwatson } 53255009Sshin error = soreserve(so, div_sendspace, div_recvspace); 533157374Srwatson if (error) 53455009Sshin return error; 535181803Sbz INP_INFO_WLOCK(&V_divcbinfo); 536181803Sbz error = in_pcballoc(so, &V_divcbinfo); 537119752Ssam if (error) { 538181803Sbz INP_INFO_WUNLOCK(&V_divcbinfo); 53926096Speter return error; 540119752Ssam } 54126096Speter inp = (struct inpcb *)so->so_pcb; 542181803Sbz INP_INFO_WUNLOCK(&V_divcbinfo); 54326096Speter inp->inp_ip_p = proto; 54464192Sru inp->inp_vflag |= INP_IPV4; 54526096Speter inp->inp_flags |= INP_HDRINCL; 546178285Srwatson INP_WUNLOCK(inp); 54726096Speter return 0; 54826096Speter} 54917072Sjulian 550157370Srwatsonstatic void 55126096Speterdiv_detach(struct socket *so) 55226096Speter{ 55326096Speter struct inpcb *inp; 55417072Sjulian 555157374Srwatson inp = sotoinpcb(so); 556157374Srwatson KASSERT(inp != NULL, ("div_detach: inp == NULL")); 557181803Sbz INP_INFO_WLOCK(&V_divcbinfo); 558178285Srwatson INP_WLOCK(inp); 55926096Speter in_pcbdetach(inp); 560157374Srwatson in_pcbfree(inp); 561181803Sbz INP_INFO_WUNLOCK(&V_divcbinfo); 56226096Speter} 56317072Sjulian 56426096Speterstatic int 56583366Sjuliandiv_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 56626096Speter{ 56726096Speter struct inpcb *inp; 56826096Speter int error; 56917072Sjulian 57026345Speter inp = sotoinpcb(so); 571157423Srwatson KASSERT(inp != NULL, ("div_bind: inp == NULL")); 57298664Sluigi /* in_pcbbind assumes that nam is a sockaddr_in 57359909Spaul * and in_pcbbind requires a valid address. Since divert 57459909Spaul * sockets don't we need to make sure the address is 57559909Spaul * filled in properly. 57659909Spaul * XXX -- divert should not be abusing in_pcbind 57759909Spaul * and should probably have its own family. 57859909Spaul */ 57998613Sluigi if (nam->sa_family != AF_INET) 580157374Srwatson return EAFNOSUPPORT; 581157374Srwatson ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; 582181803Sbz INP_INFO_WLOCK(&V_divcbinfo); 583178285Srwatson INP_WLOCK(inp); 584222690Srwatson INP_HASH_WLOCK(&V_divcbinfo); 585157374Srwatson error = in_pcbbind(inp, nam, td->td_ucred); 586222690Srwatson INP_HASH_WUNLOCK(&V_divcbinfo); 587178285Srwatson INP_WUNLOCK(inp); 588181803Sbz INP_INFO_WUNLOCK(&V_divcbinfo); 58965260Sru return error; 59026096Speter} 59117072Sjulian 59226096Speterstatic int 59326096Speterdiv_shutdown(struct socket *so) 59426096Speter{ 595122331Ssam struct inpcb *inp; 596122331Ssam 597122331Ssam inp = sotoinpcb(so); 598157374Srwatson KASSERT(inp != NULL, ("div_shutdown: inp == NULL")); 599178285Srwatson INP_WLOCK(inp); 60026096Speter socantsendmore(so); 601178285Srwatson INP_WUNLOCK(inp); 60226096Speter return 0; 60326096Speter} 60417072Sjulian 60526096Speterstatic int 60629327Speterdiv_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, 607169454Srwatson struct mbuf *control, struct thread *td) 60826096Speter{ 609183550Szec 61026096Speter /* Packet must have a header (but that's about it) */ 61155009Sshin if (m->m_len < sizeof (struct ip) && 61226096Speter (m = m_pullup(m, sizeof (struct ip))) == 0) { 613196039Srwatson KMOD_IPSTAT_INC(ips_toosmall); 61426096Speter m_freem(m); 61526096Speter return EINVAL; 61626096Speter } 61717072Sjulian 61826096Speter /* Send packet */ 61998613Sluigi return div_output(so, m, (struct sockaddr_in *)nam, control); 62017072Sjulian} 62126096Speter 622201527Sluigistatic void 623122331Ssamdiv_ctlinput(int cmd, struct sockaddr *sa, void *vip) 624122331Ssam{ 625122331Ssam struct in_addr faddr; 626122331Ssam 627122331Ssam faddr = ((struct sockaddr_in *)sa)->sin_addr; 628122331Ssam if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) 629122331Ssam return; 630122922Sandre if (PRC_IS_REDIRECT(cmd)) 631122922Sandre return; 632122331Ssam} 633122331Ssam 63464192Srustatic int 63564192Srudiv_pcblist(SYSCTL_HANDLER_ARGS) 63664192Sru{ 637119752Ssam int error, i, n; 63864192Sru struct inpcb *inp, **inp_list; 63964192Sru inp_gen_t gencnt; 64064192Sru struct xinpgen xig; 64164192Sru 64264192Sru /* 64364192Sru * The process of preparing the TCB list is too time-consuming and 64464192Sru * resource-intensive to repeat twice on every request. 64564192Sru */ 64664192Sru if (req->oldptr == 0) { 647181803Sbz n = V_divcbinfo.ipi_count; 648211433Sjhb n += imax(n / 8, 10); 649211433Sjhb req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); 65064192Sru return 0; 65164192Sru } 65264192Sru 65364192Sru if (req->newptr != 0) 65464192Sru return EPERM; 65564192Sru 65664192Sru /* 65764192Sru * OK, now we're committed to doing something. 65864192Sru */ 659181803Sbz INP_INFO_RLOCK(&V_divcbinfo); 660181803Sbz gencnt = V_divcbinfo.ipi_gencnt; 661181803Sbz n = V_divcbinfo.ipi_count; 662181803Sbz INP_INFO_RUNLOCK(&V_divcbinfo); 66364192Sru 664126253Struckman error = sysctl_wire_old_buffer(req, 665126253Struckman 2 * sizeof(xig) + n*sizeof(struct xinpcb)); 666126253Struckman if (error != 0) 667126253Struckman return (error); 668119752Ssam 66964192Sru xig.xig_len = sizeof xig; 67064192Sru xig.xig_count = n; 67164192Sru xig.xig_gen = gencnt; 67264192Sru xig.xig_sogen = so_gencnt; 67364192Sru error = SYSCTL_OUT(req, &xig, sizeof xig); 67464192Sru if (error) 67564192Sru return error; 67664192Sru 677111119Simp inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); 67864192Sru if (inp_list == 0) 67964192Sru return ENOMEM; 68064192Sru 681181803Sbz INP_INFO_RLOCK(&V_divcbinfo); 682181803Sbz for (inp = LIST_FIRST(V_divcbinfo.ipi_listhead), i = 0; inp && i < n; 68371999Sphk inp = LIST_NEXT(inp, inp_list)) { 684205251Sbz INP_WLOCK(inp); 685119752Ssam if (inp->inp_gencnt <= gencnt && 686205251Sbz cr_canseeinpcb(req->td->td_ucred, inp) == 0) { 687205251Sbz in_pcbref(inp); 68864192Sru inp_list[i++] = inp; 689205251Sbz } 690205251Sbz INP_WUNLOCK(inp); 69164192Sru } 692181803Sbz INP_INFO_RUNLOCK(&V_divcbinfo); 69364192Sru n = i; 69464192Sru 69564192Sru error = 0; 69664192Sru for (i = 0; i < n; i++) { 69764192Sru inp = inp_list[i]; 698178376Srwatson INP_RLOCK(inp); 69964192Sru if (inp->inp_gencnt <= gencnt) { 70064192Sru struct xinpcb xi; 701145953Scperciva bzero(&xi, sizeof(xi)); 70264192Sru xi.xi_len = sizeof xi; 70364192Sru /* XXX should avoid extra copy */ 70464192Sru bcopy(inp, &xi.xi_inp, sizeof *inp); 70564192Sru if (inp->inp_socket) 70664192Sru sotoxsocket(inp->inp_socket, &xi.xi_socket); 707178376Srwatson INP_RUNLOCK(inp); 70864192Sru error = SYSCTL_OUT(req, &xi, sizeof xi); 709160491Sups } else 710178376Srwatson INP_RUNLOCK(inp); 71164192Sru } 712205251Sbz INP_INFO_WLOCK(&V_divcbinfo); 713205251Sbz for (i = 0; i < n; i++) { 714205251Sbz inp = inp_list[i]; 715222488Srwatson INP_RLOCK(inp); 716222488Srwatson if (!in_pcbrele_rlocked(inp)) 717222488Srwatson INP_RUNLOCK(inp); 718205251Sbz } 719205251Sbz INP_INFO_WUNLOCK(&V_divcbinfo); 720205251Sbz 72164192Sru if (!error) { 72264192Sru /* 72364192Sru * Give the user an updated idea of our state. 72464192Sru * If the generation differs from what we told 72564192Sru * her before, she knows that something happened 72664192Sru * while we were processing this request, and it 72764192Sru * might be necessary to retry. 72864192Sru */ 729181803Sbz INP_INFO_RLOCK(&V_divcbinfo); 730181803Sbz xig.xig_gen = V_divcbinfo.ipi_gencnt; 73164192Sru xig.xig_sogen = so_gencnt; 732181803Sbz xig.xig_count = V_divcbinfo.ipi_count; 733181803Sbz INP_INFO_RUNLOCK(&V_divcbinfo); 73464192Sru error = SYSCTL_OUT(req, &xig, sizeof xig); 73564192Sru } 73664192Sru free(inp_list, M_TEMP); 73764192Sru return error; 73864192Sru} 73964192Sru 740136714Sandre#ifdef SYSCTL_NODE 741248085Smariusstatic SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0, 742248085Smarius "IPDIVERT"); 743217554SmdfSYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, 744217554Smdf NULL, 0, div_pcblist, "S,xinpcb", "List of active divert sockets"); 745136714Sandre#endif 74664192Sru 74726096Speterstruct pr_usrreqs div_usrreqs = { 748137386Sphk .pru_attach = div_attach, 749137386Sphk .pru_bind = div_bind, 750137386Sphk .pru_control = in_control, 751137386Sphk .pru_detach = div_detach, 752169462Srwatson .pru_peeraddr = in_getpeeraddr, 753137386Sphk .pru_send = div_send, 754137386Sphk .pru_shutdown = div_shutdown, 755169462Srwatson .pru_sockaddr = in_getsockaddr, 756137584Sphk .pru_sosetlabel = in_pcbsosetlabel 75726096Speter}; 758136714Sandre 759136714Sandrestruct protosw div_protosw = { 760152242Sru .pr_type = SOCK_RAW, 761152242Sru .pr_protocol = IPPROTO_DIVERT, 762152242Sru .pr_flags = PR_ATOMIC|PR_ADDR, 763152242Sru .pr_input = div_input, 764152242Sru .pr_ctlinput = div_ctlinput, 765152242Sru .pr_ctloutput = ip_ctloutput, 766152242Sru .pr_init = div_init, 767196502Szec#ifdef VIMAGE 768196502Szec .pr_destroy = div_destroy, 769196502Szec#endif 770152242Sru .pr_usrreqs = &div_usrreqs 771136714Sandre}; 772136714Sandre 773136714Sandrestatic int 774136714Sandrediv_modevent(module_t mod, int type, void *unused) 775136714Sandre{ 776136714Sandre int err = 0; 777196502Szec#ifndef VIMAGE 778136714Sandre int n; 779196502Szec#endif 780136714Sandre 781136714Sandre switch (type) { 782136714Sandre case MOD_LOAD: 783136714Sandre /* 784136714Sandre * Protocol will be initialized by pf_proto_register(). 785136714Sandre * We don't have to register ip_protox because we are not 786136714Sandre * a true IP protocol that goes over the wire. 787136714Sandre */ 788136714Sandre err = pf_proto_register(PF_INET, &div_protosw); 789196502Szec if (err != 0) 790196502Szec return (err); 791136714Sandre ip_divert_ptr = divert_packet; 792196502Szec ip_divert_event_tag = EVENTHANDLER_REGISTER(maxsockets_change, 793196502Szec div_zone_change, NULL, EVENTHANDLER_PRI_ANY); 794136714Sandre break; 795136788Sandre case MOD_QUIESCE: 796136788Sandre /* 797136788Sandre * IPDIVERT may normally not be unloaded because of the 798136788Sandre * potential race conditions. Tell kldunload we can't be 799136788Sandre * unloaded unless the unload is forced. 800136788Sandre */ 801136788Sandre err = EPERM; 802136788Sandre break; 803136714Sandre case MOD_UNLOAD: 804196502Szec#ifdef VIMAGE 805196502Szec err = EPERM; 806196502Szec break; 807196502Szec#else 808136714Sandre /* 809136788Sandre * Forced unload. 810136788Sandre * 811136714Sandre * Module ipdivert can only be unloaded if no sockets are 812136714Sandre * connected. Maybe this can be changed later to forcefully 813136714Sandre * disconnect any open sockets. 814136715Srwatson * 815136716Sandre * XXXRW: Note that there is a slight race here, as a new 816136716Sandre * socket open request could be spinning on the lock and then 817136716Sandre * we destroy the lock. 818136714Sandre */ 819181803Sbz INP_INFO_WLOCK(&V_divcbinfo); 820181803Sbz n = V_divcbinfo.ipi_count; 821136714Sandre if (n != 0) { 822136714Sandre err = EBUSY; 823181803Sbz INP_INFO_WUNLOCK(&V_divcbinfo); 824136714Sandre break; 825136714Sandre } 826136714Sandre ip_divert_ptr = NULL; 827136714Sandre err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW); 828181803Sbz INP_INFO_WUNLOCK(&V_divcbinfo); 829196502Szec div_destroy(); 830196502Szec EVENTHANDLER_DEREGISTER(maxsockets_change, ip_divert_event_tag); 831136714Sandre break; 832196502Szec#endif /* !VIMAGE */ 833136714Sandre default: 834136788Sandre err = EOPNOTSUPP; 835136714Sandre break; 836136714Sandre } 837136714Sandre return err; 838136714Sandre} 839136714Sandre 840136714Sandrestatic moduledata_t ipdivertmod = { 841136714Sandre "ipdivert", 842136714Sandre div_modevent, 843136714Sandre 0 844136714Sandre}; 845136714Sandre 846136714SandreDECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); 847201527SluigiMODULE_DEPEND(ipdivert, ipfw, 2, 2, 2); 848136714SandreMODULE_VERSION(ipdivert, 1); 849