1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1988, 1990, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the University nor the names of its contributors
16 *    may be used to endorse or promote products derived from this software
17 *    without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32#include <sys/cdefs.h>
33#include "opt_inet.h"
34#include "opt_ipsec.h"
35#include "opt_kern_tls.h"
36#include "opt_mbuf_stress_test.h"
37#include "opt_ratelimit.h"
38#include "opt_route.h"
39#include "opt_rss.h"
40#include "opt_sctp.h"
41
42#include <sys/param.h>
43#include <sys/systm.h>
44#include <sys/kernel.h>
45#include <sys/ktls.h>
46#include <sys/lock.h>
47#include <sys/malloc.h>
48#include <sys/mbuf.h>
49#include <sys/priv.h>
50#include <sys/proc.h>
51#include <sys/protosw.h>
52#include <sys/sdt.h>
53#include <sys/socket.h>
54#include <sys/socketvar.h>
55#include <sys/sysctl.h>
56#include <sys/ucred.h>
57
58#include <net/if.h>
59#include <net/if_var.h>
60#include <net/if_private.h>
61#include <net/if_vlan_var.h>
62#include <net/if_llatbl.h>
63#include <net/ethernet.h>
64#include <net/netisr.h>
65#include <net/pfil.h>
66#include <net/route.h>
67#include <net/route/nhop.h>
68#include <net/rss_config.h>
69#include <net/vnet.h>
70
71#include <netinet/in.h>
72#include <netinet/in_fib.h>
73#include <netinet/in_kdtrace.h>
74#include <netinet/in_systm.h>
75#include <netinet/ip.h>
76#include <netinet/in_fib.h>
77#include <netinet/in_pcb.h>
78#include <netinet/in_rss.h>
79#include <netinet/in_var.h>
80#include <netinet/ip_var.h>
81#include <netinet/ip_options.h>
82
83#include <netinet/udp.h>
84#include <netinet/udp_var.h>
85
86#if defined(SCTP) || defined(SCTP_SUPPORT)
87#include <netinet/sctp.h>
88#include <netinet/sctp_crc32.h>
89#endif
90
91#include <netipsec/ipsec_support.h>
92
93#include <machine/in_cksum.h>
94
95#include <security/mac/mac_framework.h>
96
#ifdef MBUF_STRESS_TEST
/*
 * Stress-test knob, exported as a read/write sysctl under _net_inet_ip.
 * When non-zero, ip_output() runs every unfragmented outgoing packet whose
 * length exceeds this value through m_fragment() before handing it to the
 * interface.  Zero (the default) disables the behaviour.
 */
static int mbuf_frag_size = 0;
SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
#endif

/*
 * Forward declaration; ip_output() calls this to loop an outgoing multicast
 * datagram back to the local stack.  The definition lies further down the
 * file.
 */
static void	ip_mloopback(struct ifnet *, const struct mbuf *, int);

/*
 * Defined elsewhere.  ip_output() consults it to decide whether to loop a
 * multicast datagram back when the caller supplied no ip_moptions.
 */
extern int in_mcast_loop;
106
107static inline int
108ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, int flags,
109    struct inpcb *inp, struct sockaddr_in *dst, int *fibnum, int *error)
110{
111	struct m_tag *fwd_tag = NULL;
112	struct mbuf *m;
113	struct in_addr odst;
114	struct ip *ip;
115
116	m = *mp;
117	ip = mtod(m, struct ip *);
118
119	/* Run through list of hooks for output packets. */
120	odst.s_addr = ip->ip_dst.s_addr;
121	switch (pfil_mbuf_out(V_inet_pfil_head, mp, ifp, inp)) {
122	case PFIL_DROPPED:
123		*error = EACCES;
124		/* FALLTHROUGH */
125	case PFIL_CONSUMED:
126		return 1; /* Finished */
127	case PFIL_PASS:
128		*error = 0;
129	}
130	m = *mp;
131	ip = mtod(m, struct ip *);
132
133	/* See if destination IP address was changed by packet filter. */
134	if (odst.s_addr != ip->ip_dst.s_addr) {
135		m->m_flags |= M_SKIP_FIREWALL;
136		/* If destination is now ourself drop to ip_input(). */
137		if (in_localip(ip->ip_dst)) {
138			m->m_flags |= M_FASTFWD_OURS;
139			if (m->m_pkthdr.rcvif == NULL)
140				m->m_pkthdr.rcvif = V_loif;
141			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
142				m->m_pkthdr.csum_flags |=
143					CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
144				m->m_pkthdr.csum_data = 0xffff;
145			}
146			m->m_pkthdr.csum_flags |=
147				CSUM_IP_CHECKED | CSUM_IP_VALID;
148#if defined(SCTP) || defined(SCTP_SUPPORT)
149			if (m->m_pkthdr.csum_flags & CSUM_SCTP)
150				m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
151#endif
152			*error = netisr_queue(NETISR_IP, m);
153			return 1; /* Finished */
154		}
155
156		bzero(dst, sizeof(*dst));
157		dst->sin_family = AF_INET;
158		dst->sin_len = sizeof(*dst);
159		dst->sin_addr = ip->ip_dst;
160
161		return -1; /* Reloop */
162	}
163	/* See if fib was changed by packet filter. */
164	if ((*fibnum) != M_GETFIB(m)) {
165		m->m_flags |= M_SKIP_FIREWALL;
166		*fibnum = M_GETFIB(m);
167		return -1; /* Reloop for FIB change */
168	}
169
170	/* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
171	if (m->m_flags & M_FASTFWD_OURS) {
172		if (m->m_pkthdr.rcvif == NULL)
173			m->m_pkthdr.rcvif = V_loif;
174		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
175			m->m_pkthdr.csum_flags |=
176				CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
177			m->m_pkthdr.csum_data = 0xffff;
178		}
179#if defined(SCTP) || defined(SCTP_SUPPORT)
180		if (m->m_pkthdr.csum_flags & CSUM_SCTP)
181			m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
182#endif
183		m->m_pkthdr.csum_flags |=
184			CSUM_IP_CHECKED | CSUM_IP_VALID;
185
186		*error = netisr_queue(NETISR_IP, m);
187		return 1; /* Finished */
188	}
189	/* Or forward to some other address? */
190	if ((m->m_flags & M_IP_NEXTHOP) &&
191	    ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) {
192		bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
193		m->m_flags |= M_SKIP_FIREWALL;
194		m->m_flags &= ~M_IP_NEXTHOP;
195		m_tag_delete(m, fwd_tag);
196
197		return -1; /* Reloop for CHANGE of dst */
198	}
199
200	return 0;
201}
202
203static int
204ip_output_send(struct inpcb *inp, struct ifnet *ifp, struct mbuf *m,
205    const struct sockaddr *gw, struct route *ro, bool stamp_tag)
206{
207#ifdef KERN_TLS
208	struct ktls_session *tls = NULL;
209#endif
210	struct m_snd_tag *mst;
211	int error;
212
213	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
214	mst = NULL;
215
216#ifdef KERN_TLS
217	/*
218	 * If this is an unencrypted TLS record, save a reference to
219	 * the record.  This local reference is used to call
220	 * ktls_output_eagain after the mbuf has been freed (thus
221	 * dropping the mbuf's reference) in if_output.
222	 */
223	if (m->m_next != NULL && mbuf_has_tls_session(m->m_next)) {
224		tls = ktls_hold(m->m_next->m_epg_tls);
225		mst = tls->snd_tag;
226
227		/*
228		 * If a TLS session doesn't have a valid tag, it must
229		 * have had an earlier ifp mismatch, so drop this
230		 * packet.
231		 */
232		if (mst == NULL) {
233			m_freem(m);
234			error = EAGAIN;
235			goto done;
236		}
237		/*
238		 * Always stamp tags that include NIC ktls.
239		 */
240		stamp_tag = true;
241	}
242#endif
243#ifdef RATELIMIT
244	if (inp != NULL && mst == NULL) {
245		if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 ||
246		    (inp->inp_snd_tag != NULL &&
247		    inp->inp_snd_tag->ifp != ifp))
248			in_pcboutput_txrtlmt(inp, ifp, m);
249
250		if (inp->inp_snd_tag != NULL)
251			mst = inp->inp_snd_tag;
252	}
253#endif
254	if (stamp_tag && mst != NULL) {
255		KASSERT(m->m_pkthdr.rcvif == NULL,
256		    ("trying to add a send tag to a forwarded packet"));
257		if (mst->ifp != ifp) {
258			m_freem(m);
259			error = EAGAIN;
260			goto done;
261		}
262
263		/* stamp send tag on mbuf */
264		m->m_pkthdr.snd_tag = m_snd_tag_ref(mst);
265		m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
266	}
267
268	error = (*ifp->if_output)(ifp, m, gw, ro);
269
270done:
271	/* Check for route change invalidating send tags. */
272#ifdef KERN_TLS
273	if (tls != NULL) {
274		if (error == EAGAIN)
275			error = ktls_output_eagain(inp, tls);
276		ktls_free(tls);
277	}
278#endif
279#ifdef RATELIMIT
280	if (error == EAGAIN)
281		in_pcboutput_eagain(inp);
282#endif
283	return (error);
284}
285
286/* rte<>ro_flags translation */
287static inline void
288rt_update_ro_flags(struct route *ro, const struct nhop_object *nh)
289{
290	int nh_flags = nh->nh_flags;
291
292	ro->ro_flags &= ~ (RT_REJECT|RT_BLACKHOLE|RT_HAS_GW);
293
294	ro->ro_flags |= (nh_flags & NHF_REJECT) ? RT_REJECT : 0;
295	ro->ro_flags |= (nh_flags & NHF_BLACKHOLE) ? RT_BLACKHOLE : 0;
296	ro->ro_flags |= (nh_flags & NHF_GATEWAY) ? RT_HAS_GW : 0;
297}
298
/*
 * IP output.  The packet in mbuf chain m contains a skeletal IP
 * header (with len, off, ttl, proto, tos, src, dst).
 * The mbuf chain containing the packet will be freed.
 * The mbuf opt, if present, will not be freed.
 * If route ro is present and has ro_nh initialized (and it still passes the
 * validity checks below), route lookup would be skipped and ro->ro_nh would
 * be used. If ro is present but ro->ro_nh is NULL, then result of route
 * lookup is stored in ro->ro_nh.
 *
 * In the IP forwarding case, the packet will arrive with options already
 * inserted, so must have a NULL opt pointer.
 *
 * flags is a mask of IP_* output flags; the ones examined here are
 * IP_FORWARDING, IP_RAWOUTPUT, IP_SENDONES, IP_ROUTETOIF, IP_ALLOWBROADCAST,
 * IP_NODEFAULTFLOWID and IP_NO_SND_TAG_RL.  imo optionally carries multicast
 * options and inp optionally names the owning PCB (asserted locked when
 * non-NULL).
 *
 * Returns 0 on success or an errno value.
 */
int
ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
    struct ip_moptions *imo, struct inpcb *inp)
{
	struct ip *ip;
	struct ifnet *ifp = NULL;	/* keep compiler happy */
	struct mbuf *m0;
	int hlen = sizeof (struct ip);
	int mtu = 0;
	int error = 0;
	int vlan_pcp = -1;		/* -1: no priority requested */
	struct sockaddr_in *dst;
	const struct sockaddr *gw;
	struct in_ifaddr *ia = NULL;
	struct in_addr src;
	int isbroadcast;
	uint16_t ip_len, ip_off;	/* host byte order copies */
	struct route iproute;		/* scratch route when ro == NULL */
	uint32_t fibnum;
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	int no_route_but_check_spd = 0;
#endif

	M_ASSERTPKTHDR(m);
	NET_EPOCH_ASSERT();

	/*
	 * Inherit per-connection settings from the PCB: FIB, flow id and
	 * hash type (unless IP_NODEFAULTFLOWID), VLAN priority and, with
	 * NUMA, the NUMA domain.
	 */
	if (inp != NULL) {
		INP_LOCK_ASSERT(inp);
		M_SETFIB(m, inp->inp_inc.inc_fibnum);
		if ((flags & IP_NODEFAULTFLOWID) == 0) {
			m->m_pkthdr.flowid = inp->inp_flowid;
			M_HASHTYPE_SET(m, inp->inp_flowtype);
		}
		if ((inp->inp_flags2 & INP_2PCP_SET) != 0)
			vlan_pcp = (inp->inp_flags2 & INP_2PCP_MASK) >>
			    INP_2PCP_SHIFT;
#ifdef NUMA
		m->m_pkthdr.numa_domain = inp->inp_numa_domain;
#endif
	}

	if (opt) {
		int len = 0;
		m = ip_insertoptions(m, opt, &len);
		if (len != 0)
			hlen = len; /* ip->ip_hl is updated above */
	}
	ip = mtod(m, struct ip *);
	ip_len = ntohs(ip->ip_len);
	ip_off = ntohs(ip->ip_off);

	/* Locally generated, non-raw packets get version, hl and id here. */
	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
		ip->ip_v = IPVERSION;
		ip->ip_hl = hlen >> 2;
		ip_fillid(ip);
	} else {
		/* Header already set, fetch hlen from there */
		hlen = ip->ip_hl << 2;
	}
	if ((flags & IP_FORWARDING) == 0)
		IPSTAT_INC(ips_localout);

	/*
	 * dst/gw handling:
	 *
	 * gw is readonly but can point either to dst OR rt_gateway,
	 * therefore we need restore gw if we're redoing lookup.
	 */
	fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m);
	if (ro == NULL) {
		/*
		 * No route supplied: use a zeroed on-stack one.  The test
		 * "ro != &iproute" further down tells a caller-owned (cached)
		 * route from this scratch one.
		 */
		ro = &iproute;
		bzero(ro, sizeof (*ro));
	}
	dst = (struct sockaddr_in *)&ro->ro_dst;
	if (ro->ro_nh == NULL) {
		dst->sin_family = AF_INET;
		dst->sin_len = sizeof(*dst);
		dst->sin_addr = ip->ip_dst;
	}
	gw = (const struct sockaddr *)dst;
again:
	/*
	 * Validate route against routing table additions;
	 * a better/more specific route might have been added.
	 */
	if (inp != NULL && ro->ro_nh != NULL)
		NH_VALIDATE(ro, &inp->inp_rt_cookie, fibnum);
	/*
	 * If there is a cached route,
	 * check that it is to the same destination
	 * and is still up.  If not, free it and try again.
	 * The address family should also be checked in case of sharing the
	 * cache with IPv6.
	 * Also check whether routing cache needs invalidation.
	 */
	if (ro->ro_nh != NULL &&
	    ((!NH_IS_VALID(ro->ro_nh)) || dst->sin_family != AF_INET ||
	    dst->sin_addr.s_addr != ip->ip_dst.s_addr))
		RO_INVALIDATE_CACHE(ro);
	ia = NULL;
	/*
	 * If routing to interface only, short circuit routing lookup.
	 * The use of an all-ones broadcast address implies this; an
	 * interface is specified by the broadcast address of an interface,
	 * or the destination address of a ptp interface.
	 */
	if (flags & IP_SENDONES) {
		if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst),
						      M_GETFIB(m)))) == NULL &&
		    (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
						    M_GETFIB(m)))) == NULL) {
			IPSTAT_INC(ips_noroute);
			error = ENETUNREACH;
			goto bad;
		}
		/* Rewrite the destination to the all-ones broadcast. */
		ip->ip_dst.s_addr = INADDR_BROADCAST;
		dst->sin_addr = ip->ip_dst;
		ifp = ia->ia_ifp;
		mtu = ifp->if_mtu;
		ip->ip_ttl = 1;
		isbroadcast = 1;
		src = IA_SIN(ia)->sin_addr;
	} else if (flags & IP_ROUTETOIF) {
		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
						    M_GETFIB(m)))) == NULL &&
		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0,
						M_GETFIB(m)))) == NULL) {
			IPSTAT_INC(ips_noroute);
			error = ENETUNREACH;
			goto bad;
		}
		ifp = ia->ia_ifp;
		mtu = ifp->if_mtu;
		ip->ip_ttl = 1;
		isbroadcast = ifp->if_flags & IFF_BROADCAST ?
		    in_ifaddr_broadcast(dst->sin_addr, ia) : 0;
		src = IA_SIN(ia)->sin_addr;
	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
	    imo != NULL && imo->imo_multicast_ifp != NULL) {
		/*
		 * Bypass the normal routing lookup for multicast
		 * packets if the interface is specified.
		 */
		ifp = imo->imo_multicast_ifp;
		mtu = ifp->if_mtu;
		IFP_TO_IA(ifp, ia);
		isbroadcast = 0;	/* fool gcc */
		/* Interface may have no addresses. */
		if (ia != NULL)
			src = IA_SIN(ia)->sin_addr;
		else
			src.s_addr = INADDR_ANY;
	} else if (ro != &iproute) {
		/* Caller-owned route: look up (with a reference) and cache. */
		if (ro->ro_nh == NULL) {
			/*
			 * We want to do any cloning requested by the link
			 * layer, as this is probably required in all cases
			 * for correct operation (as it is for ARP).
			 */
			uint32_t flowid;
			flowid = m->m_pkthdr.flowid;
			ro->ro_nh = fib4_lookup(fibnum, dst->sin_addr, 0,
			    NHR_REF, flowid);

			if (ro->ro_nh == NULL || (!NH_IS_VALID(ro->ro_nh))) {
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
				/*
				 * There is no route for this packet, but it is
				 * possible that a matching SPD entry exists.
				 * With IPsec compiled in, this goto makes the
				 * statements after the #endif unreachable; the
				 * no-route error is raised after "sendit".
				 */
				no_route_but_check_spd = 1;
				goto sendit;
#endif
				IPSTAT_INC(ips_noroute);
				error = EHOSTUNREACH;
				goto bad;
			}
		}
		struct nhop_object *nh = ro->ro_nh;

		ia = ifatoia(nh->nh_ifa);
		ifp = nh->nh_ifp;
		counter_u64_add(nh->nh_pksent, 1);
		rt_update_ro_flags(ro, nh);
		if (nh->nh_flags & NHF_GATEWAY)
			gw = &nh->gw_sa;
		if (nh->nh_flags & NHF_HOST)
			isbroadcast = (nh->nh_flags & NHF_BROADCAST);
		else if ((ifp->if_flags & IFF_BROADCAST) && (gw->sa_family == AF_INET))
			isbroadcast = in_ifaddr_broadcast(((const struct sockaddr_in *)gw)->sin_addr, ia);
		else
			isbroadcast = 0;
		mtu = nh->nh_mtu;
		src = IA_SIN(ia)->sin_addr;
	} else {
		/* Scratch route: one-shot lookup, nothing is stored in ro_nh. */
		struct nhop_object *nh;

		nh = fib4_lookup(M_GETFIB(m), dst->sin_addr, 0, NHR_NONE,
		    m->m_pkthdr.flowid);
		if (nh == NULL) {
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
			/*
			 * There is no route for this packet, but it is
			 * possible that a matching SPD entry exists.
			 * With IPsec compiled in, this goto makes the
			 * statements after the #endif unreachable; the
			 * no-route error is raised after "sendit".
			 */
			no_route_but_check_spd = 1;
			goto sendit;
#endif
			IPSTAT_INC(ips_noroute);
			error = EHOSTUNREACH;
			goto bad;
		}
		ifp = nh->nh_ifp;
		mtu = nh->nh_mtu;
		rt_update_ro_flags(ro, nh);
		if (nh->nh_flags & NHF_GATEWAY)
			gw = &nh->gw_sa;
		ia = ifatoia(nh->nh_ifa);
		src = IA_SIN(ia)->sin_addr;
		isbroadcast = (((nh->nh_flags & (NHF_HOST | NHF_BROADCAST)) ==
		    (NHF_HOST | NHF_BROADCAST)) ||
		    ((ifp->if_flags & IFF_BROADCAST) &&
		    (gw->sa_family == AF_INET) &&
		    in_ifaddr_broadcast(((const struct sockaddr_in *)gw)->sin_addr, ia)));
	}

	/* Catch a possible divide by zero later. */
	KASSERT(mtu > 0, ("%s: mtu %d <= 0, ro=%p (nh_flags=0x%08x) ifp=%p",
	    __func__, mtu, ro,
	    (ro != NULL && ro->ro_nh != NULL) ? ro->ro_nh->nh_flags : 0, ifp));

	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
		m->m_flags |= M_MCAST;
		/*
		 * IP destination address is multicast.  Make sure "gw"
		 * still points to the address in "ro".  (It may have been
		 * changed to point to a gateway address, above.)
		 */
		gw = (const struct sockaddr *)dst;
		/*
		 * See if the caller provided any multicast options
		 */
		if (imo != NULL) {
			ip->ip_ttl = imo->imo_multicast_ttl;
			if (imo->imo_multicast_vif != -1)
				ip->ip_src.s_addr =
				    ip_mcast_src ?
				    ip_mcast_src(imo->imo_multicast_vif) :
				    INADDR_ANY;
		} else
			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
		/*
		 * Confirm that the outgoing interface supports multicast.
		 */
		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
				IPSTAT_INC(ips_noroute);
				error = ENETUNREACH;
				goto bad;
			}
		}
		/*
		 * If source address not specified yet, use address
		 * of outgoing interface.
		 */
		if (ip->ip_src.s_addr == INADDR_ANY)
			ip->ip_src = src;

		if ((imo == NULL && in_mcast_loop) ||
		    (imo && imo->imo_multicast_loop)) {
			/*
			 * Loop back multicast datagram if not expressly
			 * forbidden to do so, even if we are not a member
			 * of the group; ip_input() will filter it later,
			 * thus deferring a hash lookup and mutex acquisition
			 * at the expense of a cheap copy using m_copym().
			 */
			ip_mloopback(ifp, m, hlen);
		} else {
			/*
			 * If we are acting as a multicast router, perform
			 * multicast forwarding as if the packet had just
			 * arrived on the interface to which we are about
			 * to send.  The multicast forwarding function
			 * recursively calls this function, using the
			 * IP_FORWARDING flag to prevent infinite recursion.
			 *
			 * Multicasts that are looped back by ip_mloopback(),
			 * above, will be forwarded by the ip_input() routine,
			 * if necessary.
			 */
			if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) {
				/*
				 * If rsvp daemon is not running, do not
				 * set ip_moptions. This ensures that the packet
				 * is multicast and not just sent down one link
				 * as prescribed by rsvpd.
				 */
				if (!V_rsvp_on)
					imo = NULL;
				if (ip_mforward &&
				    ip_mforward(ip, ifp, m, imo) != 0) {
					m_freem(m);
					goto done;
				}
			}
		}

		/*
		 * Multicasts with a time-to-live of zero may be looped-
		 * back, above, but must not be transmitted on a network.
		 * Also, multicasts addressed to the loopback interface
		 * are not sent -- the above call to ip_mloopback() will
		 * loop back a copy. ip_input() will drop the copy if
		 * this host does not belong to the destination group on
		 * the loopback interface.
		 */
		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
			m_freem(m);
			goto done;
		}

		goto sendit;
	}

	/*
	 * If the source address is not specified yet, use the address
	 * of the outgoing interface.
	 */
	if (ip->ip_src.s_addr == INADDR_ANY)
		ip->ip_src = src;

	/*
	 * Look for broadcast address and
	 * verify user is allowed to send
	 * such a packet.
	 */
	if (isbroadcast) {
		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
			error = EADDRNOTAVAIL;
			goto bad;
		}
		if ((flags & IP_ALLOWBROADCAST) == 0) {
			error = EACCES;
			goto bad;
		}
		/* don't allow broadcast messages to be fragmented */
		if (ip_len > mtu) {
			error = EMSGSIZE;
			goto bad;
		}
		m->m_flags |= M_BCAST;
	} else {
		m->m_flags &= ~M_BCAST;
	}

sendit:
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (IPSEC_ENABLED(ipv4)) {
		m = mb_unmapped_to_ext(m);
		if (m == NULL) {
			IPSTAT_INC(ips_odropped);
			error = ENOBUFS;
			goto bad;
		}
		/*
		 * A non-zero result ends processing here without freeing m
		 * in this function; EINPROGRESS is reported to the caller
		 * as success.
		 */
		if ((error = IPSEC_OUTPUT(ipv4, m, inp)) != 0) {
			if (error == EINPROGRESS)
				error = 0;
			goto done;
		}
	}
	/*
	 * Check if there was a route for this packet; return error if not.
	 */
	if (no_route_but_check_spd) {
		IPSTAT_INC(ips_noroute);
		error = EHOSTUNREACH;
		goto bad;
	}
	/* Update variables that are affected by ipsec4_output(). */
	ip = mtod(m, struct ip *);
	hlen = ip->ip_hl << 2;
#endif /* IPSEC */

	/* Jump over all PFIL processing if hooks are not active. */
	if (PFIL_HOOKED_OUT(V_inet_pfil_head)) {
		switch (ip_output_pfil(&m, ifp, flags, inp, dst, &fibnum,
		    &error)) {
		case 1: /* Finished */
			goto done;

		case 0: /* Continue normally */
			ip = mtod(m, struct ip *);
			ip_len = ntohs(ip->ip_len);
			break;

		case -1: /* Need to try again */
			/*
			 * Reset everything for a new round: release the
			 * route's next hop, forget any prepend data, point
			 * gw back at dst and redo route selection.
			 */
			if (ro != NULL) {
				RO_NHFREE(ro);
				ro->ro_prepend = NULL;
			}
			gw = (const struct sockaddr *)dst;
			ip = mtod(m, struct ip *);
			goto again;
		}
	}

	if (vlan_pcp > -1)
		EVL_APPLY_PRI(m, vlan_pcp);

	/* IN_LOOPBACK must not appear on the wire - RFC1122. */
	if (IN_LOOPBACK(ntohl(ip->ip_dst.s_addr)) ||
	    IN_LOOPBACK(ntohl(ip->ip_src.s_addr))) {
		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
			IPSTAT_INC(ips_badaddr);
			error = EADDRNOTAVAIL;
			goto bad;
		}
	}

	/* Ensure the packet data is mapped if the interface requires it. */
	if ((ifp->if_capenable & IFCAP_MEXTPG) == 0) {
		m = mb_unmapped_to_ext(m);
		if (m == NULL) {
			IPSTAT_INC(ips_odropped);
			error = ENOBUFS;
			goto bad;
		}
	}

	/*
	 * Request the IP header checksum, then compute in software any
	 * delayed transport checksum the interface does not offload.
	 */
	m->m_pkthdr.csum_flags |= CSUM_IP;
	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
		in_delayed_cksum(m);
		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
	}
#if defined(SCTP) || defined(SCTP_SUPPORT)
	if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
		sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
		m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
	}
#endif

	/*
	 * If small enough for interface, or the interface will take
	 * care of the fragmentation for us, we can just send directly.
	 * Note that if_vxlan could have requested TSO even though the outer
	 * frame is UDP.  It is correct to not fragment such datagrams and
	 * instead just pass them on to the driver.
	 */
	if (ip_len <= mtu ||
	    (m->m_pkthdr.csum_flags & ifp->if_hwassist &
	    (CSUM_TSO | CSUM_INNER_TSO)) != 0) {
		ip->ip_sum = 0;
		/* Software header checksum unless the interface offloads it. */
		if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
			ip->ip_sum = in_cksum(m, hlen);
			m->m_pkthdr.csum_flags &= ~CSUM_IP;
		}

		/*
		 * Record statistics for this interface address.
		 * With CSUM_TSO the byte/packet count will be slightly
		 * incorrect because we count the IP+TCP headers only
		 * once instead of for every generated packet.
		 */
		if (!(flags & IP_FORWARDING) && ia) {
			if (m->m_pkthdr.csum_flags &
			    (CSUM_TSO | CSUM_INNER_TSO))
				counter_u64_add(ia->ia_ifa.ifa_opackets,
				    m->m_pkthdr.len / m->m_pkthdr.tso_segsz);
			else
				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);

			counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len);
		}
#ifdef MBUF_STRESS_TEST
		if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
			m = m_fragment(m, M_NOWAIT, mbuf_frag_size);
#endif
		/*
		 * Reset layer specific mbuf flags
		 * to avoid confusing lower layers.
		 */
		m_clrprotoflags(m);
		IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
		error = ip_output_send(inp, ifp, m, gw, ro,
		    (flags & IP_NO_SND_TAG_RL) ? false : true);
		goto done;
	}

	/* Balk when DF bit is set or the interface didn't support TSO. */
	if ((ip_off & IP_DF) ||
	    (m->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_INNER_TSO))) {
		error = EMSGSIZE;
		IPSTAT_INC(ips_cantfrag);
		goto bad;
	}

	/*
	 * Too large for interface; fragment if possible. If successful,
	 * on return, m will point to a list of packets to be sent.
	 *
	 * NOTE(review): on failure the "bad" path below releases m with
	 * m_freem() only; confirm that ip_fragment() cannot leave extra
	 * packets linked through m_nextpkt when it returns an error.
	 */
	error = ip_fragment(ip, &m, mtu, ifp->if_hwassist);
	if (error)
		goto bad;
	/*
	 * Send the fragments in order, unlinking each from the m_nextpkt
	 * list first.  After the first send failure the remaining fragments
	 * are freed instead of sent and that error is returned.
	 */
	for (; m; m = m0) {
		m0 = m->m_nextpkt;
		m->m_nextpkt = 0;
		if (error == 0) {
			/* Record statistics for this interface address. */
			if (ia != NULL) {
				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
				counter_u64_add(ia->ia_ifa.ifa_obytes,
				    m->m_pkthdr.len);
			}
			/*
			 * Reset layer specific mbuf flags
			 * to avoid confusing lower layers.
			 */
			m_clrprotoflags(m);

			IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp,
			    mtod(m, struct ip *), NULL);
			error = ip_output_send(inp, ifp, m, gw, ro, true);
		} else
			m_freem(m);
	}

	if (error == 0)
		IPSTAT_INC(ips_fragmented);

done:
	return (error);
 bad:
	/* Error exit for paths that still own m: free it, return error. */
	m_freem(m);
	goto done;
}
848
849/*
850 * Create a chain of fragments which fit the given mtu. m_frag points to the
851 * mbuf to be fragmented; on return it points to the chain with the fragments.
852 * Return 0 if no error. If error, m_frag may contain a partially built
853 * chain of fragments that should be freed by the caller.
854 *
855 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
856 */
857int
858ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
859    u_long if_hwassist_flags)
860{
861	int error = 0;
862	int hlen = ip->ip_hl << 2;
863	int len = (mtu - hlen) & ~7;	/* size of payload in each fragment */
864	int off;
865	struct mbuf *m0 = *m_frag;	/* the original packet		*/
866	int firstlen;
867	struct mbuf **mnext;
868	int nfrags;
869	uint16_t ip_len, ip_off;
870
871	ip_len = ntohs(ip->ip_len);
872	ip_off = ntohs(ip->ip_off);
873
874	/*
875	 * Packet shall not have "Don't Fragment" flag and have at least 8
876	 * bytes of payload.
877	 */
878	if (__predict_false((ip_off & IP_DF) || len < 8)) {
879		IPSTAT_INC(ips_cantfrag);
880		return (EMSGSIZE);
881	}
882
883	/*
884	 * If the interface will not calculate checksums on
885	 * fragmented packets, then do it here.
886	 */
887	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
888		in_delayed_cksum(m0);
889		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
890	}
891#if defined(SCTP) || defined(SCTP_SUPPORT)
892	if (m0->m_pkthdr.csum_flags & CSUM_SCTP) {
893		sctp_delayed_cksum(m0, hlen);
894		m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
895	}
896#endif
897	if (len > PAGE_SIZE) {
898		/*
899		 * Fragment large datagrams such that each segment
900		 * contains a multiple of PAGE_SIZE amount of data,
901		 * plus headers. This enables a receiver to perform
902		 * page-flipping zero-copy optimizations.
903		 *
904		 * XXX When does this help given that sender and receiver
905		 * could have different page sizes, and also mtu could
906		 * be less than the receiver's page size ?
907		 */
908		int newlen;
909
910		off = MIN(mtu, m0->m_pkthdr.len);
911
912		/*
913		 * firstlen (off - hlen) must be aligned on an
914		 * 8-byte boundary
915		 */
916		if (off < hlen)
917			goto smart_frag_failure;
918		off = ((off - hlen) & ~7) + hlen;
919		newlen = (~PAGE_MASK) & mtu;
920		if ((newlen + sizeof (struct ip)) > mtu) {
921			/* we failed, go back the default */
922smart_frag_failure:
923			newlen = len;
924			off = hlen + len;
925		}
926		len = newlen;
927
928	} else {
929		off = hlen + len;
930	}
931
932	firstlen = off - hlen;
933	mnext = &m0->m_nextpkt;		/* pointer to next packet */
934
935	/*
936	 * Loop through length of segment after first fragment,
937	 * make new header and copy data of each part and link onto chain.
938	 * Here, m0 is the original packet, m is the fragment being created.
939	 * The fragments are linked off the m_nextpkt of the original
940	 * packet, which after processing serves as the first fragment.
941	 */
942	for (nfrags = 1; off < ip_len; off += len, nfrags++) {
943		struct ip *mhip;	/* ip header on the fragment */
944		struct mbuf *m;
945		int mhlen = sizeof (struct ip);
946
947		m = m_gethdr(M_NOWAIT, MT_DATA);
948		if (m == NULL) {
949			error = ENOBUFS;
950			IPSTAT_INC(ips_odropped);
951			goto done;
952		}
953		/*
954		 * Make sure the complete packet header gets copied
955		 * from the originating mbuf to the newly created
956		 * mbuf. This also ensures that existing firewall
957		 * classification(s), VLAN tags and so on get copied
958		 * to the resulting fragmented packet(s):
959		 */
960		if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) {
961			m_free(m);
962			error = ENOBUFS;
963			IPSTAT_INC(ips_odropped);
964			goto done;
965		}
966		/*
967		 * In the first mbuf, leave room for the link header, then
968		 * copy the original IP header including options. The payload
969		 * goes into an additional mbuf chain returned by m_copym().
970		 */
971		m->m_data += max_linkhdr;
972		mhip = mtod(m, struct ip *);
973		*mhip = *ip;
974		if (hlen > sizeof (struct ip)) {
975			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
976			mhip->ip_v = IPVERSION;
977			mhip->ip_hl = mhlen >> 2;
978		}
979		m->m_len = mhlen;
980		/* XXX do we need to add ip_off below ? */
981		mhip->ip_off = ((off - hlen) >> 3) + ip_off;
982		if (off + len >= ip_len)
983			len = ip_len - off;
984		else
985			mhip->ip_off |= IP_MF;
986		mhip->ip_len = htons((u_short)(len + mhlen));
987		m->m_next = m_copym(m0, off, len, M_NOWAIT);
988		if (m->m_next == NULL) {	/* copy failed */
989			m_free(m);
990			error = ENOBUFS;	/* ??? */
991			IPSTAT_INC(ips_odropped);
992			goto done;
993		}
994		m->m_pkthdr.len = mhlen + len;
995#ifdef MAC
996		mac_netinet_fragment(m0, m);
997#endif
998		mhip->ip_off = htons(mhip->ip_off);
999		mhip->ip_sum = 0;
1000		if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
1001			mhip->ip_sum = in_cksum(m, mhlen);
1002			m->m_pkthdr.csum_flags &= ~CSUM_IP;
1003		}
1004		*mnext = m;
1005		mnext = &m->m_nextpkt;
1006	}
1007	IPSTAT_ADD(ips_ofragments, nfrags);
1008
1009	/*
1010	 * Update first fragment by trimming what's been copied out
1011	 * and updating header.
1012	 */
1013	m_adj(m0, hlen + firstlen - ip_len);
1014	m0->m_pkthdr.len = hlen + firstlen;
1015	ip->ip_len = htons((u_short)m0->m_pkthdr.len);
1016	ip->ip_off = htons(ip_off | IP_MF);
1017	ip->ip_sum = 0;
1018	if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
1019		ip->ip_sum = in_cksum(m0, hlen);
1020		m0->m_pkthdr.csum_flags &= ~CSUM_IP;
1021	}
1022
1023done:
1024	*m_frag = m0;
1025	return error;
1026}
1027
1028void
1029in_delayed_cksum(struct mbuf *m)
1030{
1031	struct ip *ip;
1032	struct udphdr *uh;
1033	uint16_t cklen, csum, offset;
1034
1035	ip = mtod(m, struct ip *);
1036	offset = ip->ip_hl << 2 ;
1037
1038	if (m->m_pkthdr.csum_flags & CSUM_UDP) {
1039		/* if udp header is not in the first mbuf copy udplen */
1040		if (offset + sizeof(struct udphdr) > m->m_len) {
1041			m_copydata(m, offset + offsetof(struct udphdr,
1042			    uh_ulen), sizeof(cklen), (caddr_t)&cklen);
1043			cklen = ntohs(cklen);
1044		} else {
1045			uh = (struct udphdr *)mtodo(m, offset);
1046			cklen = ntohs(uh->uh_ulen);
1047		}
1048		csum = in_cksum_skip(m, cklen + offset, offset);
1049		if (csum == 0)
1050			csum = 0xffff;
1051	} else {
1052		cklen = ntohs(ip->ip_len);
1053		csum = in_cksum_skip(m, cklen, offset);
1054	}
1055	offset += m->m_pkthdr.csum_data;	/* checksum offset */
1056
1057	if (offset + sizeof(csum) > m->m_len)
1058		m_copyback(m, offset, sizeof(csum), (caddr_t)&csum);
1059	else
1060		*(u_short *)mtodo(m, offset) = csum;
1061}
1062
1063/*
1064 * IP socket option processing.
1065 */
1066int
1067ip_ctloutput(struct socket *so, struct sockopt *sopt)
1068{
1069	struct inpcb *inp = sotoinpcb(so);
1070	int	error, optval;
1071#ifdef	RSS
1072	uint32_t rss_bucket;
1073	int retval;
1074#endif
1075
1076	error = optval = 0;
1077	if (sopt->sopt_level != IPPROTO_IP) {
1078		error = EINVAL;
1079
1080		if (sopt->sopt_level == SOL_SOCKET &&
1081		    sopt->sopt_dir == SOPT_SET) {
1082			switch (sopt->sopt_name) {
1083			case SO_SETFIB:
1084				INP_WLOCK(inp);
1085				inp->inp_inc.inc_fibnum = so->so_fibnum;
1086				INP_WUNLOCK(inp);
1087				error = 0;
1088				break;
1089			case SO_MAX_PACING_RATE:
1090#ifdef RATELIMIT
1091				INP_WLOCK(inp);
1092				inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
1093				INP_WUNLOCK(inp);
1094				error = 0;
1095#else
1096				error = EOPNOTSUPP;
1097#endif
1098				break;
1099			default:
1100				break;
1101			}
1102		}
1103		return (error);
1104	}
1105
1106	switch (sopt->sopt_dir) {
1107	case SOPT_SET:
1108		switch (sopt->sopt_name) {
1109		case IP_OPTIONS:
1110#ifdef notyet
1111		case IP_RETOPTS:
1112#endif
1113		{
1114			struct mbuf *m;
1115			if (sopt->sopt_valsize > MLEN) {
1116				error = EMSGSIZE;
1117				break;
1118			}
1119			m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
1120			if (m == NULL) {
1121				error = ENOBUFS;
1122				break;
1123			}
1124			m->m_len = sopt->sopt_valsize;
1125			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
1126					    m->m_len);
1127			if (error) {
1128				m_free(m);
1129				break;
1130			}
1131			INP_WLOCK(inp);
1132			error = ip_pcbopts(inp, sopt->sopt_name, m);
1133			INP_WUNLOCK(inp);
1134			return (error);
1135		}
1136
1137		case IP_BINDANY:
1138			if (sopt->sopt_td != NULL) {
1139				error = priv_check(sopt->sopt_td,
1140				    PRIV_NETINET_BINDANY);
1141				if (error)
1142					break;
1143			}
1144			/* FALLTHROUGH */
1145		case IP_TOS:
1146		case IP_TTL:
1147		case IP_MINTTL:
1148		case IP_RECVOPTS:
1149		case IP_RECVRETOPTS:
1150		case IP_ORIGDSTADDR:
1151		case IP_RECVDSTADDR:
1152		case IP_RECVTTL:
1153		case IP_RECVIF:
1154		case IP_ONESBCAST:
1155		case IP_DONTFRAG:
1156		case IP_RECVTOS:
1157		case IP_RECVFLOWID:
1158#ifdef	RSS
1159		case IP_RECVRSSBUCKETID:
1160#endif
1161		case IP_VLAN_PCP:
1162			error = sooptcopyin(sopt, &optval, sizeof optval,
1163					    sizeof optval);
1164			if (error)
1165				break;
1166
1167			switch (sopt->sopt_name) {
1168			case IP_TOS:
1169				inp->inp_ip_tos = optval;
1170				break;
1171
1172			case IP_TTL:
1173				inp->inp_ip_ttl = optval;
1174				break;
1175
1176			case IP_MINTTL:
1177				if (optval >= 0 && optval <= MAXTTL)
1178					inp->inp_ip_minttl = optval;
1179				else
1180					error = EINVAL;
1181				break;
1182
1183#define	OPTSET(bit) do {						\
1184	INP_WLOCK(inp);							\
1185	if (optval)							\
1186		inp->inp_flags |= bit;					\
1187	else								\
1188		inp->inp_flags &= ~bit;					\
1189	INP_WUNLOCK(inp);						\
1190} while (0)
1191
1192#define	OPTSET2(bit, val) do {						\
1193	INP_WLOCK(inp);							\
1194	if (val)							\
1195		inp->inp_flags2 |= bit;					\
1196	else								\
1197		inp->inp_flags2 &= ~bit;				\
1198	INP_WUNLOCK(inp);						\
1199} while (0)
1200
1201			case IP_RECVOPTS:
1202				OPTSET(INP_RECVOPTS);
1203				break;
1204
1205			case IP_RECVRETOPTS:
1206				OPTSET(INP_RECVRETOPTS);
1207				break;
1208
1209			case IP_RECVDSTADDR:
1210				OPTSET(INP_RECVDSTADDR);
1211				break;
1212
1213			case IP_ORIGDSTADDR:
1214				OPTSET2(INP_ORIGDSTADDR, optval);
1215				break;
1216
1217			case IP_RECVTTL:
1218				OPTSET(INP_RECVTTL);
1219				break;
1220
1221			case IP_RECVIF:
1222				OPTSET(INP_RECVIF);
1223				break;
1224
1225			case IP_ONESBCAST:
1226				OPTSET(INP_ONESBCAST);
1227				break;
1228			case IP_DONTFRAG:
1229				OPTSET(INP_DONTFRAG);
1230				break;
1231			case IP_BINDANY:
1232				OPTSET(INP_BINDANY);
1233				break;
1234			case IP_RECVTOS:
1235				OPTSET(INP_RECVTOS);
1236				break;
1237			case IP_RECVFLOWID:
1238				OPTSET2(INP_RECVFLOWID, optval);
1239				break;
1240#ifdef RSS
1241			case IP_RECVRSSBUCKETID:
1242				OPTSET2(INP_RECVRSSBUCKETID, optval);
1243				break;
1244#endif
1245			case IP_VLAN_PCP:
1246				if ((optval >= -1) && (optval <=
1247				    (INP_2PCP_MASK >> INP_2PCP_SHIFT))) {
1248					if (optval == -1) {
1249						INP_WLOCK(inp);
1250						inp->inp_flags2 &=
1251						    ~(INP_2PCP_SET |
1252						      INP_2PCP_MASK);
1253						INP_WUNLOCK(inp);
1254					} else {
1255						INP_WLOCK(inp);
1256						inp->inp_flags2 |=
1257						    INP_2PCP_SET;
1258						inp->inp_flags2 &=
1259						    ~INP_2PCP_MASK;
1260						inp->inp_flags2 |=
1261						    optval << INP_2PCP_SHIFT;
1262						INP_WUNLOCK(inp);
1263					}
1264				} else
1265					error = EINVAL;
1266				break;
1267			}
1268			break;
1269#undef OPTSET
1270#undef OPTSET2
1271
1272		/*
1273		 * Multicast socket options are processed by the in_mcast
1274		 * module.
1275		 */
1276		case IP_MULTICAST_IF:
1277		case IP_MULTICAST_VIF:
1278		case IP_MULTICAST_TTL:
1279		case IP_MULTICAST_LOOP:
1280		case IP_ADD_MEMBERSHIP:
1281		case IP_DROP_MEMBERSHIP:
1282		case IP_ADD_SOURCE_MEMBERSHIP:
1283		case IP_DROP_SOURCE_MEMBERSHIP:
1284		case IP_BLOCK_SOURCE:
1285		case IP_UNBLOCK_SOURCE:
1286		case IP_MSFILTER:
1287		case MCAST_JOIN_GROUP:
1288		case MCAST_LEAVE_GROUP:
1289		case MCAST_JOIN_SOURCE_GROUP:
1290		case MCAST_LEAVE_SOURCE_GROUP:
1291		case MCAST_BLOCK_SOURCE:
1292		case MCAST_UNBLOCK_SOURCE:
1293			error = inp_setmoptions(inp, sopt);
1294			break;
1295
1296		case IP_PORTRANGE:
1297			error = sooptcopyin(sopt, &optval, sizeof optval,
1298					    sizeof optval);
1299			if (error)
1300				break;
1301
1302			INP_WLOCK(inp);
1303			switch (optval) {
1304			case IP_PORTRANGE_DEFAULT:
1305				inp->inp_flags &= ~(INP_LOWPORT);
1306				inp->inp_flags &= ~(INP_HIGHPORT);
1307				break;
1308
1309			case IP_PORTRANGE_HIGH:
1310				inp->inp_flags &= ~(INP_LOWPORT);
1311				inp->inp_flags |= INP_HIGHPORT;
1312				break;
1313
1314			case IP_PORTRANGE_LOW:
1315				inp->inp_flags &= ~(INP_HIGHPORT);
1316				inp->inp_flags |= INP_LOWPORT;
1317				break;
1318
1319			default:
1320				error = EINVAL;
1321				break;
1322			}
1323			INP_WUNLOCK(inp);
1324			break;
1325
1326#if defined(IPSEC) || defined(IPSEC_SUPPORT)
1327		case IP_IPSEC_POLICY:
1328			if (IPSEC_ENABLED(ipv4)) {
1329				error = IPSEC_PCBCTL(ipv4, inp, sopt);
1330				break;
1331			}
1332			/* FALLTHROUGH */
1333#endif /* IPSEC */
1334
1335		default:
1336			error = ENOPROTOOPT;
1337			break;
1338		}
1339		break;
1340
1341	case SOPT_GET:
1342		switch (sopt->sopt_name) {
1343		case IP_OPTIONS:
1344		case IP_RETOPTS:
1345			INP_RLOCK(inp);
1346			if (inp->inp_options) {
1347				struct mbuf *options;
1348
1349				options = m_copym(inp->inp_options, 0,
1350				    M_COPYALL, M_NOWAIT);
1351				INP_RUNLOCK(inp);
1352				if (options != NULL) {
1353					error = sooptcopyout(sopt,
1354							     mtod(options, char *),
1355							     options->m_len);
1356					m_freem(options);
1357				} else
1358					error = ENOMEM;
1359			} else {
1360				INP_RUNLOCK(inp);
1361				sopt->sopt_valsize = 0;
1362			}
1363			break;
1364
1365		case IP_TOS:
1366		case IP_TTL:
1367		case IP_MINTTL:
1368		case IP_RECVOPTS:
1369		case IP_RECVRETOPTS:
1370		case IP_ORIGDSTADDR:
1371		case IP_RECVDSTADDR:
1372		case IP_RECVTTL:
1373		case IP_RECVIF:
1374		case IP_PORTRANGE:
1375		case IP_ONESBCAST:
1376		case IP_DONTFRAG:
1377		case IP_BINDANY:
1378		case IP_RECVTOS:
1379		case IP_FLOWID:
1380		case IP_FLOWTYPE:
1381		case IP_RECVFLOWID:
1382#ifdef	RSS
1383		case IP_RSSBUCKETID:
1384		case IP_RECVRSSBUCKETID:
1385#endif
1386		case IP_VLAN_PCP:
1387			switch (sopt->sopt_name) {
1388			case IP_TOS:
1389				optval = inp->inp_ip_tos;
1390				break;
1391
1392			case IP_TTL:
1393				optval = inp->inp_ip_ttl;
1394				break;
1395
1396			case IP_MINTTL:
1397				optval = inp->inp_ip_minttl;
1398				break;
1399
1400#define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
1401#define	OPTBIT2(bit)	(inp->inp_flags2 & bit ? 1 : 0)
1402
1403			case IP_RECVOPTS:
1404				optval = OPTBIT(INP_RECVOPTS);
1405				break;
1406
1407			case IP_RECVRETOPTS:
1408				optval = OPTBIT(INP_RECVRETOPTS);
1409				break;
1410
1411			case IP_RECVDSTADDR:
1412				optval = OPTBIT(INP_RECVDSTADDR);
1413				break;
1414
1415			case IP_ORIGDSTADDR:
1416				optval = OPTBIT2(INP_ORIGDSTADDR);
1417				break;
1418
1419			case IP_RECVTTL:
1420				optval = OPTBIT(INP_RECVTTL);
1421				break;
1422
1423			case IP_RECVIF:
1424				optval = OPTBIT(INP_RECVIF);
1425				break;
1426
1427			case IP_PORTRANGE:
1428				if (inp->inp_flags & INP_HIGHPORT)
1429					optval = IP_PORTRANGE_HIGH;
1430				else if (inp->inp_flags & INP_LOWPORT)
1431					optval = IP_PORTRANGE_LOW;
1432				else
1433					optval = 0;
1434				break;
1435
1436			case IP_ONESBCAST:
1437				optval = OPTBIT(INP_ONESBCAST);
1438				break;
1439			case IP_DONTFRAG:
1440				optval = OPTBIT(INP_DONTFRAG);
1441				break;
1442			case IP_BINDANY:
1443				optval = OPTBIT(INP_BINDANY);
1444				break;
1445			case IP_RECVTOS:
1446				optval = OPTBIT(INP_RECVTOS);
1447				break;
1448			case IP_FLOWID:
1449				optval = inp->inp_flowid;
1450				break;
1451			case IP_FLOWTYPE:
1452				optval = inp->inp_flowtype;
1453				break;
1454			case IP_RECVFLOWID:
1455				optval = OPTBIT2(INP_RECVFLOWID);
1456				break;
1457#ifdef	RSS
1458			case IP_RSSBUCKETID:
1459				retval = rss_hash2bucket(inp->inp_flowid,
1460				    inp->inp_flowtype,
1461				    &rss_bucket);
1462				if (retval == 0)
1463					optval = rss_bucket;
1464				else
1465					error = EINVAL;
1466				break;
1467			case IP_RECVRSSBUCKETID:
1468				optval = OPTBIT2(INP_RECVRSSBUCKETID);
1469				break;
1470#endif
1471			case IP_VLAN_PCP:
1472				if (OPTBIT2(INP_2PCP_SET)) {
1473					optval = (inp->inp_flags2 &
1474					    INP_2PCP_MASK) >> INP_2PCP_SHIFT;
1475				} else {
1476					optval = -1;
1477				}
1478				break;
1479			}
1480			error = sooptcopyout(sopt, &optval, sizeof optval);
1481			break;
1482
1483		/*
1484		 * Multicast socket options are processed by the in_mcast
1485		 * module.
1486		 */
1487		case IP_MULTICAST_IF:
1488		case IP_MULTICAST_VIF:
1489		case IP_MULTICAST_TTL:
1490		case IP_MULTICAST_LOOP:
1491		case IP_MSFILTER:
1492			error = inp_getmoptions(inp, sopt);
1493			break;
1494
1495#if defined(IPSEC) || defined(IPSEC_SUPPORT)
1496		case IP_IPSEC_POLICY:
1497			if (IPSEC_ENABLED(ipv4)) {
1498				error = IPSEC_PCBCTL(ipv4, inp, sopt);
1499				break;
1500			}
1501			/* FALLTHROUGH */
1502#endif /* IPSEC */
1503
1504		default:
1505			error = ENOPROTOOPT;
1506			break;
1507		}
1508		break;
1509	}
1510	return (error);
1511}
1512
1513/*
1514 * Routine called from ip_output() to loop back a copy of an IP multicast
1515 * packet to the input queue of a specified interface.  Note that this
1516 * calls the output routine of the loopback "driver", but with an interface
1517 * pointer that might NOT be a loopback interface -- evil, but easier than
1518 * replicating that code here.
1519 */
1520static void
1521ip_mloopback(struct ifnet *ifp, const struct mbuf *m, int hlen)
1522{
1523	struct ip *ip;
1524	struct mbuf *copym;
1525
1526	/*
1527	 * Make a deep copy of the packet because we're going to
1528	 * modify the pack in order to generate checksums.
1529	 */
1530	copym = m_dup(m, M_NOWAIT);
1531	if (copym != NULL && (!M_WRITABLE(copym) || copym->m_len < hlen))
1532		copym = m_pullup(copym, hlen);
1533	if (copym != NULL) {
1534		/* If needed, compute the checksum and mark it as valid. */
1535		if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1536			in_delayed_cksum(copym);
1537			copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1538			copym->m_pkthdr.csum_flags |=
1539			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1540			copym->m_pkthdr.csum_data = 0xffff;
1541		}
1542		/*
1543		 * We don't bother to fragment if the IP length is greater
1544		 * than the interface's MTU.  Can this possibly matter?
1545		 */
1546		ip = mtod(copym, struct ip *);
1547		ip->ip_sum = 0;
1548		ip->ip_sum = in_cksum(copym, hlen);
1549		if_simloop(ifp, copym, AF_INET, 0);
1550	}
1551}
1552