1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
5 *	The Regents of the University of California.
6 * Copyright (c) 2008 Robert N. M. Watson
7 * Copyright (c) 2010-2011 Juniper Networks, Inc.
8 * Copyright (c) 2014 Kevin Lo
9 * All rights reserved.
10 *
11 * Portions of this software were developed by Robert N. M. Watson under
12 * contract to Juniper Networks, Inc.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 *    notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 *    notice, this list of conditions and the following disclaimer in the
21 *    documentation and/or other materials provided with the distribution.
22 * 3. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 */
38
39#include <sys/cdefs.h>
40#include "opt_inet.h"
41#include "opt_inet6.h"
42#include "opt_ipsec.h"
43#include "opt_route.h"
44#include "opt_rss.h"
45
46#include <sys/param.h>
47#include <sys/domain.h>
48#include <sys/eventhandler.h>
49#include <sys/jail.h>
50#include <sys/kernel.h>
51#include <sys/lock.h>
52#include <sys/malloc.h>
53#include <sys/mbuf.h>
54#include <sys/priv.h>
55#include <sys/proc.h>
56#include <sys/protosw.h>
57#include <sys/sdt.h>
58#include <sys/signalvar.h>
59#include <sys/socket.h>
60#include <sys/socketvar.h>
61#include <sys/sx.h>
62#include <sys/sysctl.h>
63#include <sys/syslog.h>
64#include <sys/systm.h>
65
66#include <vm/uma.h>
67
68#include <net/if.h>
69#include <net/if_var.h>
70#include <net/route.h>
71#include <net/route/nhop.h>
72#include <net/rss_config.h>
73
74#include <netinet/in.h>
75#include <netinet/in_kdtrace.h>
76#include <netinet/in_fib.h>
77#include <netinet/in_pcb.h>
78#include <netinet/in_systm.h>
79#include <netinet/in_var.h>
80#include <netinet/ip.h>
81#ifdef INET6
82#include <netinet/ip6.h>
83#endif
84#include <netinet/ip_icmp.h>
85#include <netinet/icmp_var.h>
86#include <netinet/ip_var.h>
87#include <netinet/ip_options.h>
88#ifdef INET6
89#include <netinet6/ip6_var.h>
90#endif
91#include <netinet/udp.h>
92#include <netinet/udp_var.h>
93#include <netinet/udplite.h>
94#include <netinet/in_rss.h>
95
96#include <netipsec/ipsec_support.h>
97
98#include <machine/in_cksum.h>
99
100#include <security/mac/mac_framework.h>
101
102/*
103 * UDP and UDP-Lite protocols implementation.
104 * Per RFC 768, August, 1980.
105 * Per RFC 3828, July, 2004.
106 */
107
108/*
109 * BSD 4.2 defaulted the udp checksum to be off.  Turning off udp checksums
110 * removes the only data integrity mechanism for packets and malformed
111 * packets that would otherwise be discarded due to bad checksums, and may
112 * cause problems (especially for NFS data blocks).
113 */
/* Per-VNET knob, default on; exported read-write as net.inet.udp.checksum. */
VNET_DEFINE(int, udp_cksum) = 1;
SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(udp_cksum), 0, "compute udp checksum");

/*
 * Per-VNET knob, default off.  When set, udp_input() logs datagrams for
 * which no matching pcb was found.
 */
VNET_DEFINE(int, udp_log_in_vain) = 0;
SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(udp_log_in_vain), 0, "Log all incoming UDP packets");

/*
 * Per-VNET knobs, default off.  udp_input() consults both before sending
 * an ICMP port unreachable for a datagram with no matching pcb:
 * udp_blackhole suppresses the reply, and udp_blackhole_local extends the
 * suppression to sources that in_localip() reports as local.
 */
VNET_DEFINE(int, udp_blackhole) = 0;
SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(udp_blackhole), 0,
    "Do not send port unreachables for refused connects");
VNET_DEFINE(bool, udp_blackhole_local) = false;
SYSCTL_BOOL(_net_inet_udp, OID_AUTO, blackhole_local, CTLFLAG_VNET |
    CTLFLAG_RW, &VNET_NAME(udp_blackhole_local), false,
    "Enforce net.inet.udp.blackhole for locally originated packets");

/* Global (not per-VNET) tunable exported as net.inet.udp.maxdgram. */
u_long	udp_sendspace = 9216;		/* really max datagram size */
SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
    &udp_sendspace, 0, "Maximum outgoing UDP datagram size");

/*
 * Global (not per-VNET) tunable exported as net.inet.udp.recvspace.  The
 * default leaves room for 40 datagrams of 1K, each accompanied by a source
 * address of the largest sockaddr type compiled in.
 */
u_long	udp_recvspace = 40 * (1024 +
#ifdef INET6
				      sizeof(struct sockaddr_in6)
#else
				      sizeof(struct sockaddr_in)
#endif
				      );	/* 40 1K datagrams */

SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
    &udp_recvspace, 0, "Maximum space for incoming UDP datagrams");

/* Per-VNET pcb databases: one for UDP, one for UDP-Lite. */
VNET_DEFINE(struct inpcbinfo, udbinfo);
VNET_DEFINE(struct inpcbinfo, ulitecbinfo);

/* Hash size handed to in_pcbinfo_init() for both pcb databases. */
#ifndef UDBHASHSIZE
#define	UDBHASHSIZE	128
#endif

/* Per-VNET, per-CPU UDP statistics, exported as net.inet.udp.stats. */
VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat);		/* from udp_var.h */
VNET_PCPUSTAT_SYSINIT(udpstat);
SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat,
    udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)");

#ifdef VIMAGE
VNET_PCPUSTAT_SYSUNINIT(udpstat);
#endif /* VIMAGE */
#ifdef INET
static void	udp_detach(struct socket *so);
#endif

/* Backing storage descriptors passed to in_pcbinfo_init() below. */
INPCBSTORAGE_DEFINE(udpcbstor, udpcb, "udpinp", "udp_inpcb", "udp", "udphash");
INPCBSTORAGE_DEFINE(udplitecbstor, udpcb, "udpliteinp", "udplite_inpcb",
    "udplite", "udplitehash");
168
/*
 * Per-VNET initialization of the UDP and UDP-Lite pcb databases.  Registered
 * below via VNET_SYSINIT; the argument is unused.
 */
static void
udp_vnet_init(void *arg __unused)
{

	/*
	 * For now default to 2-tuple UDP hashing - until the fragment
	 * reassembly code can also update the flowid.
	 *
	 * Once we can calculate the flowid that way and re-establish
	 * a 4-tuple, flip this to 4-tuple.
	 */
	in_pcbinfo_init(&V_udbinfo, &udpcbstor, UDBHASHSIZE, UDBHASHSIZE);
	/* Additional pcbinfo for UDP-Lite */
	in_pcbinfo_init(&V_ulitecbinfo, &udplitecbstor, UDBHASHSIZE,
	    UDBHASHSIZE);
}
VNET_SYSINIT(udp_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
    udp_vnet_init, NULL);
187
/*
 * Kernel module interface for updating udpstat.  The argument is an index
 * into the current VNET's udpstat, treated as an array of counters; the
 * selected counter is incremented by one with counter_u64_add().  While
 * this encodes the general layout of udpstat into the caller, it doesn't
 * encode its location, so that changes to how the statistics are stored
 * won't cause binary compatibility problems for kernel modules.
 *
 * NOTE(review): statnum is not range-checked here; callers must pass a
 * valid index.
 */
void
kmod_udpstat_inc(int statnum)
{

	counter_u64_add(VNET(udpstat)[statnum], 1);
}
201
202#ifdef VIMAGE
/*
 * Per-VNET teardown: destroys the UDP and UDP-Lite pcb databases set up by
 * udp_vnet_init().  Registered below via VNET_SYSUNINIT; the argument is
 * unused.
 */
static void
udp_destroy(void *unused __unused)
{

	in_pcbinfo_destroy(&V_udbinfo);
	in_pcbinfo_destroy(&V_ulitecbinfo);
}
VNET_SYSUNINIT(udp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udp_destroy, NULL);
211#endif
212
213#ifdef INET
214/*
215 * Subroutine of udp_input(), which appends the provided mbuf chain to the
216 * passed pcb/socket.  The caller must provide a sockaddr_in via udp_in that
217 * contains the source address.  If the socket ends up being an IPv6 socket,
218 * udp_append() will convert to a sockaddr_in6 before passing the address
219 * into the socket code.
220 *
221 * In the normal case udp_append() will return 0, indicating that you
222 * must unlock the inp. However if a tunneling protocol is in place we increment
223 * the inpcb refcnt and unlock the inp, on return from the tunneling protocol we
224 * then decrement the reference count. If the inp_rele returns 1, indicating the
225 * inp is gone, we return that to the caller to tell them *not* to unlock
226 * the inp. In the case of multi-cast this will cause the distribution
227 * to stop (though most tunneling protocols known currently do *not* use
228 * multicast).
229 */
230static int
231udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
232    struct sockaddr_in *udp_in)
233{
234	struct sockaddr *append_sa;
235	struct socket *so;
236	struct mbuf *tmpopts, *opts = NULL;
237#ifdef INET6
238	struct sockaddr_in6 udp_in6;
239#endif
240	struct udpcb *up;
241	bool filtered;
242
243	INP_LOCK_ASSERT(inp);
244
245	/*
246	 * Engage the tunneling protocol.
247	 */
248	up = intoudpcb(inp);
249	if (up->u_tun_func != NULL) {
250		in_pcbref(inp);
251		INP_RUNLOCK(inp);
252		filtered = (*up->u_tun_func)(n, off, inp, (struct sockaddr *)&udp_in[0],
253		    up->u_tun_ctx);
254		INP_RLOCK(inp);
255		if (filtered)
256			return (in_pcbrele_rlocked(inp));
257	}
258
259	off += sizeof(struct udphdr);
260
261#if defined(IPSEC) || defined(IPSEC_SUPPORT)
262	/* Check AH/ESP integrity. */
263	if (IPSEC_ENABLED(ipv4) &&
264	    IPSEC_CHECK_POLICY(ipv4, n, inp) != 0) {
265		m_freem(n);
266		return (0);
267	}
268	if (up->u_flags & UF_ESPINUDP) {/* IPSec UDP encaps. */
269		if (IPSEC_ENABLED(ipv4) &&
270		    UDPENCAP_INPUT(ipv4, n, off, AF_INET) != 0)
271			return (0);	/* Consumed. */
272	}
273#endif /* IPSEC */
274#ifdef MAC
275	if (mac_inpcb_check_deliver(inp, n) != 0) {
276		m_freem(n);
277		return (0);
278	}
279#endif /* MAC */
280	if (inp->inp_flags & INP_CONTROLOPTS ||
281	    inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
282#ifdef INET6
283		if (inp->inp_vflag & INP_IPV6)
284			(void)ip6_savecontrol_v4(inp, n, &opts, NULL);
285		else
286#endif /* INET6 */
287			ip_savecontrol(inp, &opts, ip, n);
288	}
289	if ((inp->inp_vflag & INP_IPV4) && (inp->inp_flags2 & INP_ORIGDSTADDR)) {
290		tmpopts = sbcreatecontrol(&udp_in[1],
291		    sizeof(struct sockaddr_in), IP_ORIGDSTADDR, IPPROTO_IP,
292		    M_NOWAIT);
293		if (tmpopts) {
294			if (opts) {
295				tmpopts->m_next = opts;
296				opts = tmpopts;
297			} else
298				opts = tmpopts;
299		}
300	}
301#ifdef INET6
302	if (inp->inp_vflag & INP_IPV6) {
303		bzero(&udp_in6, sizeof(udp_in6));
304		udp_in6.sin6_len = sizeof(udp_in6);
305		udp_in6.sin6_family = AF_INET6;
306		in6_sin_2_v4mapsin6(&udp_in[0], &udp_in6);
307		append_sa = (struct sockaddr *)&udp_in6;
308	} else
309#endif /* INET6 */
310		append_sa = (struct sockaddr *)&udp_in[0];
311	m_adj(n, off);
312
313	so = inp->inp_socket;
314	SOCKBUF_LOCK(&so->so_rcv);
315	if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) {
316		soroverflow_locked(so);
317		m_freem(n);
318		if (opts)
319			m_freem(opts);
320		UDPSTAT_INC(udps_fullsock);
321	} else
322		sorwakeup_locked(so);
323	return (0);
324}
325
326static bool
327udp_multi_match(const struct inpcb *inp, void *v)
328{
329	struct ip *ip = v;
330	struct udphdr *uh = (struct udphdr *)(ip + 1);
331
332	if (inp->inp_lport != uh->uh_dport)
333		return (false);
334#ifdef INET6
335	if ((inp->inp_vflag & INP_IPV4) == 0)
336		return (false);
337#endif
338	if (inp->inp_laddr.s_addr != INADDR_ANY &&
339	    inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
340		return (false);
341	if (inp->inp_faddr.s_addr != INADDR_ANY &&
342	    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
343		return (false);
344	if (inp->inp_fport != 0 &&
345	    inp->inp_fport != uh->uh_sport)
346		return (false);
347
348	return (true);
349}
350
/*
 * Deliver a multicast or broadcast datagram to every matching pcb.
 *
 * Iterates over the pcbs of the protocol's pcbinfo that satisfy
 * udp_multi_match(), applies the multicast source filter for multicast
 * destinations, and hands a copy of the datagram to udp_append() for each
 * pcb that passes.  The original mbuf chain 'm' is always freed here and
 * IPPROTO_DONE is always returned.
 *
 * 'udp_in' is the two-element source/destination sockaddr array built by
 * udp_input().  The IP header is asserted to carry no options.
 */
static int
udp_multi_input(struct mbuf *m, int proto, struct sockaddr_in *udp_in)
{
	struct ip *ip = mtod(m, struct ip *);
	struct inpcb_iterator inpi = INP_ITERATOR(udp_get_inpcbinfo(proto),
	    INPLOOKUP_RLOCKPCB, udp_multi_match, ip);
#ifdef KDTRACE_HOOKS
	/* Only referenced by the probe macros below. */
	struct udphdr *uh = (struct udphdr *)(ip + 1);
#endif
	struct inpcb *inp;
	struct mbuf *n;
	int appends = 0;

	MPASS(ip->ip_hl == sizeof(struct ip) >> 2);

	while ((inp = inp_next(&inpi)) != NULL) {
		/*
		 * XXXRW: Because we weren't holding either the inpcb
		 * or the hash lock when we checked for a match
		 * before, we should probably recheck now that the
		 * inpcb lock is held.
		 */
		/*
		 * Handle socket delivery policy for any-source
		 * and source-specific multicast. [RFC3678]
		 */
		if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
			struct ip_moptions	*imo;
			struct sockaddr_in	 group;
			int			 blocked;

			/* A pcb without multicast options is skipped. */
			imo = inp->inp_moptions;
			if (imo == NULL)
				continue;
			bzero(&group, sizeof(struct sockaddr_in));
			group.sin_len = sizeof(struct sockaddr_in);
			group.sin_family = AF_INET;
			group.sin_addr = ip->ip_dst;

			blocked = imo_multi_filter(imo, m->m_pkthdr.rcvif,
				(struct sockaddr *)&group,
				(struct sockaddr *)&udp_in[0]);
			if (blocked != MCAST_PASS) {
				if (blocked == MCAST_NOTGMEMBER)
					IPSTAT_INC(ips_notmember);
				if (blocked == MCAST_NOTSMEMBER ||
				    blocked == MCAST_MUTED)
					UDPSTAT_INC(udps_filtermcast);
				continue;
			}
		}
		/* Each receiver gets its own copy; a failed copy is skipped. */
		if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) {
			if (proto == IPPROTO_UDPLITE)
				UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh);
			else
				UDP_PROBE(receive, NULL, inp, ip, inp, uh);
			/*
			 * A non-zero return means the inp is gone and must
			 * not be unlocked, so stop without touching it.
			 */
			if (udp_append(inp, ip, n, sizeof(struct ip), udp_in)) {
				break;
			} else
				appends++;
		}
		/*
		 * Don't look for additional matches if this one does
		 * not have either the SO_REUSEPORT or SO_REUSEADDR
		 * socket options set.  This heuristic avoids
		 * searching through all pcbs in the common case of a
		 * non-shared port.  It assumes that an application
		 * will never clear these options after setting them.
		 */
		if ((inp->inp_socket->so_options &
		    (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) {
			/* Leaving the loop early: drop the pcb lock here. */
			INP_RUNLOCK(inp);
			break;
		}
	}

	if (appends == 0) {
		/*
		 * No matching pcb found; discard datagram.  (No need
		 * to send an ICMP Port Unreachable for a broadcast
		 * or multicast datagram.)
		 */
		UDPSTAT_INC(udps_noport);
		if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)))
			UDPSTAT_INC(udps_noportmcast);
		else
			UDPSTAT_INC(udps_noportbcast);
	}
	m_freem(m);

	return (IPPROTO_DONE);
}
443
/*
 * Input routine for IPv4 UDP and UDP-Lite datagrams.
 *
 * Parameters:
 *   mp     in: the received mbuf chain; set to NULL on entry to this
 *          function, which takes over the chain.
 *   offp   in: length of the IP header, including options.
 *   proto  IPPROTO_UDP or IPPROTO_UDPLITE.
 *
 * Validates length and checksum, then either fans the datagram out through
 * udp_multi_input() (multicast/broadcast destinations) or looks up a single
 * pcb and hands the datagram to udp_append().  When no pcb matches, an ICMP
 * port unreachable may be generated, subject to the blackhole knobs and rate
 * limiting.  Every path returns IPPROTO_DONE.
 */
static int
udp_input(struct mbuf **mp, int *offp, int proto)
{
	struct ip *ip;
	struct udphdr *uh;
	struct ifnet *ifp;
	struct inpcb *inp;
	uint16_t len, ip_len;
	struct inpcbinfo *pcbinfo;
	struct sockaddr_in udp_in[2];
	struct mbuf *m;
	struct m_tag *fwd_tag;
	int cscov_partial, iphlen;

	m = *mp;
	iphlen = *offp;
	ifp = m->m_pkthdr.rcvif;
	*mp = NULL;
	UDPSTAT_INC(udps_ipackets);

	/*
	 * Strip IP options, if any; should skip this, make available to
	 * user, and use on returned packets, but we don't yet have a way to
	 * check the checksum with options still present.
	 */
	if (iphlen > sizeof (struct ip)) {
		ip_stripoptions(m);
		iphlen = sizeof(struct ip);
	}

	/*
	 * Get IP and UDP header together in first mbuf.
	 */
	if (m->m_len < iphlen + sizeof(struct udphdr)) {
		if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) {
			UDPSTAT_INC(udps_hdrops);
			return (IPPROTO_DONE);
		}
	}
	ip = mtod(m, struct ip *);
	uh = (struct udphdr *)((caddr_t)ip + iphlen);
	/* UDP-Lite starts out assuming partial checksum coverage. */
	cscov_partial = (proto == IPPROTO_UDPLITE) ? 1 : 0;

	/*
	 * Destination port of 0 is illegal, based on RFC768.
	 */
	if (uh->uh_dport == 0)
		goto badunlocked;

	/*
	 * Construct sockaddr format source address.  Stuff source address
	 * and datagram in user buffer.
	 *
	 * udp_in[0] holds the source, udp_in[1] the destination.
	 */
	bzero(&udp_in[0], sizeof(struct sockaddr_in) * 2);
	udp_in[0].sin_len = sizeof(struct sockaddr_in);
	udp_in[0].sin_family = AF_INET;
	udp_in[0].sin_port = uh->uh_sport;
	udp_in[0].sin_addr = ip->ip_src;
	udp_in[1].sin_len = sizeof(struct sockaddr_in);
	udp_in[1].sin_family = AF_INET;
	udp_in[1].sin_port = uh->uh_dport;
	udp_in[1].sin_addr = ip->ip_dst;

	/*
	 * Make mbuf data length reflect UDP length.  If not enough data to
	 * reflect UDP length, drop.
	 */
	len = ntohs((u_short)uh->uh_ulen);
	ip_len = ntohs(ip->ip_len) - iphlen;
	if (proto == IPPROTO_UDPLITE && (len == 0 || len == ip_len)) {
		/* Zero means checksum over the complete packet. */
		if (len == 0)
			len = ip_len;
		cscov_partial = 0;
	}
	if (ip_len != len) {
		if (len > ip_len || len < sizeof(struct udphdr)) {
			UDPSTAT_INC(udps_badlen);
			goto badunlocked;
		}
		/*
		 * Only plain UDP trims the chain; the adjustment is
		 * negative here, since len < ip_len.
		 */
		if (proto == IPPROTO_UDP)
			m_adj(m, len - ip_len);
	}

	/*
	 * Checksum extended UDP header and data.
	 */
	if (uh->uh_sum) {
		u_short uh_sum;

		/* Use the checksum supplied in the mbuf header if usable. */
		if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) &&
		    !cscov_partial) {
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
				uh_sum = m->m_pkthdr.csum_data;
			else
				uh_sum = in_pseudo(ip->ip_src.s_addr,
				    ip->ip_dst.s_addr, htonl((u_short)len +
				    m->m_pkthdr.csum_data + proto));
			uh_sum ^= 0xffff;
		} else {
			/*
			 * Compute it in software: temporarily overlay the
			 * start of the IP header with a pseudo header,
			 * checksum, then restore the saved bytes.
			 */
			char b[offsetof(struct ipovly, ih_src)];
			struct ipovly *ipov = (struct ipovly *)ip;

			bcopy(ipov, b, sizeof(b));
			bzero(ipov, sizeof(ipov->ih_x1));
			ipov->ih_len = (proto == IPPROTO_UDP) ?
			    uh->uh_ulen : htons(ip_len);
			uh_sum = in_cksum(m, len + sizeof (struct ip));
			bcopy(b, ipov, sizeof(b));
		}
		if (uh_sum) {
			UDPSTAT_INC(udps_badsum);
			m_freem(m);
			return (IPPROTO_DONE);
		}
	} else {
		if (proto == IPPROTO_UDP) {
			UDPSTAT_INC(udps_nosum);
		} else {
			/* UDPLite requires a checksum */
			/* XXX: What is the right UDPLite MIB counter here? */
			m_freem(m);
			return (IPPROTO_DONE);
		}
	}

	/* Multicast and broadcast destinations fan out to all matches. */
	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
	    in_broadcast(ip->ip_dst, ifp))
		return (udp_multi_input(m, proto, udp_in));

	pcbinfo = udp_get_inpcbinfo(proto);

	/*
	 * Locate pcb for datagram.
	 *
	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
	 */
	if ((m->m_flags & M_IP_NEXTHOP) &&
	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
		struct sockaddr_in *next_hop;

		next_hop = (struct sockaddr_in *)(fwd_tag + 1);

		/*
		 * Transparently forwarded. Pretend to be the destination.
		 * Already got one like this?
		 */
		inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
		    ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m);
		if (!inp) {
			/*
			 * It's new.  Try to find the ambushing socket.
			 * Because we've rewritten the destination address,
			 * any hardware-generated hash is ignored.
			 */
			inp = in_pcblookup(pcbinfo, ip->ip_src,
			    uh->uh_sport, next_hop->sin_addr,
			    next_hop->sin_port ? htons(next_hop->sin_port) :
			    uh->uh_dport, INPLOOKUP_WILDCARD |
			    INPLOOKUP_RLOCKPCB, ifp);
		}
		/* Remove the tag from the packet. We don't need it anymore. */
		m_tag_delete(m, fwd_tag);
		m->m_flags &= ~M_IP_NEXTHOP;
	} else
		inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
		    ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD |
		    INPLOOKUP_RLOCKPCB, ifp, m);
	if (inp == NULL) {
		/* Nobody is listening: optionally log, count, maybe reply. */
		if (V_udp_log_in_vain) {
			char src[INET_ADDRSTRLEN];
			char dst[INET_ADDRSTRLEN];

			log(LOG_INFO,
			    "Connection attempt to UDP %s:%d from %s:%d\n",
			    inet_ntoa_r(ip->ip_dst, dst), ntohs(uh->uh_dport),
			    inet_ntoa_r(ip->ip_src, src), ntohs(uh->uh_sport));
		}
		if (proto == IPPROTO_UDPLITE)
			UDPLITE_PROBE(receive, NULL, NULL, ip, NULL, uh);
		else
			UDP_PROBE(receive, NULL, NULL, ip, NULL, uh);
		UDPSTAT_INC(udps_noport);
		if (m->m_flags & (M_BCAST | M_MCAST)) {
			UDPSTAT_INC(udps_noportbcast);
			goto badunlocked;
		}
		/*
		 * With blackhole set, stay silent unless the source is a
		 * local address and blackhole_local is off.
		 */
		if (V_udp_blackhole && (V_udp_blackhole_local ||
		    !in_localip(ip->ip_src)))
			goto badunlocked;
		if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
			goto badunlocked;
		/* The mbuf is handed to icmp_error() and not freed here. */
		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
		return (IPPROTO_DONE);
	}

	/*
	 * Check the minimum TTL for socket.
	 */
	INP_RLOCK_ASSERT(inp);
	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) {
		if (proto == IPPROTO_UDPLITE)
			UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh);
		else
			UDP_PROBE(receive, NULL, inp, ip, inp, uh);
		INP_RUNLOCK(inp);
		m_freem(m);
		return (IPPROTO_DONE);
	}
	/*
	 * Partially covered UDP-Lite datagram: drop it unless the socket
	 * has a non-zero receive coverage setting that does not exceed the
	 * coverage of this datagram.
	 */
	if (cscov_partial) {
		struct udpcb *up;

		up = intoudpcb(inp);
		if (up->u_rxcslen == 0 || up->u_rxcslen > len) {
			INP_RUNLOCK(inp);
			m_freem(m);
			return (IPPROTO_DONE);
		}
	}

	if (proto == IPPROTO_UDPLITE)
		UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh);
	else
		UDP_PROBE(receive, NULL, inp, ip, inp, uh);
	/* A non-zero return means the inp is gone and must not be unlocked. */
	if (udp_append(inp, ip, m, iphlen, udp_in) == 0)
		INP_RUNLOCK(inp);
	return (IPPROTO_DONE);

badunlocked:
	/* Common drop path for datagrams rejected with no pcb lock held. */
	m_freem(m);
	return (IPPROTO_DONE);
}
676#endif /* INET */
677
678/*
679 * Notify a udp user of an asynchronous error; just wake up so that they can
680 * collect error status.
681 */
682struct inpcb *
683udp_notify(struct inpcb *inp, int errno)
684{
685
686	INP_WLOCK_ASSERT(inp);
687	if ((errno == EHOSTUNREACH || errno == ENETUNREACH ||
688	     errno == EHOSTDOWN) && inp->inp_route.ro_nh) {
689		NH_FREE(inp->inp_route.ro_nh);
690		inp->inp_route.ro_nh = (struct nhop_object *)NULL;
691	}
692
693	inp->inp_socket->so_error = errno;
694	sorwakeup(inp->inp_socket);
695	sowwakeup(inp->inp_socket);
696	return (inp);
697}
698
699#ifdef INET
700static void
701udp_common_ctlinput(struct icmp *icmp, struct inpcbinfo *pcbinfo)
702{
703	struct ip *ip = &icmp->icmp_ip;
704	struct udphdr *uh;
705	struct inpcb *inp;
706
707	if (icmp_errmap(icmp) == 0)
708		return;
709
710	uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
711	inp = in_pcblookup(pcbinfo, ip->ip_dst, uh->uh_dport, ip->ip_src,
712	    uh->uh_sport, INPLOOKUP_WLOCKPCB, NULL);
713	if (inp != NULL) {
714		INP_WLOCK_ASSERT(inp);
715		if (inp->inp_socket != NULL)
716			udp_notify(inp, icmp_errmap(icmp));
717		INP_WUNLOCK(inp);
718	} else {
719		inp = in_pcblookup(pcbinfo, ip->ip_dst, uh->uh_dport,
720		    ip->ip_src, uh->uh_sport,
721		    INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
722		if (inp != NULL) {
723			struct udpcb *up;
724			udp_tun_icmp_t *func;
725
726			up = intoudpcb(inp);
727			func = up->u_icmp_func;
728			INP_RUNLOCK(inp);
729			if (func != NULL)
730				func(icmp);
731		}
732	}
733}
734
735static void
736udp_ctlinput(struct icmp *icmp)
737{
738
739	return (udp_common_ctlinput(icmp, &V_udbinfo));
740}
741
742static void
743udplite_ctlinput(struct icmp *icmp)
744{
745
746	return (udp_common_ctlinput(icmp, &V_ulitecbinfo));
747}
748#endif /* INET */
749
750static int
751udp_pcblist(SYSCTL_HANDLER_ARGS)
752{
753	struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_udbinfo,
754	    INPLOOKUP_RLOCKPCB);
755	struct xinpgen xig;
756	struct inpcb *inp;
757	int error;
758
759	if (req->newptr != 0)
760		return (EPERM);
761
762	if (req->oldptr == 0) {
763		int n;
764
765		n = V_udbinfo.ipi_count;
766		n += imax(n / 8, 10);
767		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
768		return (0);
769	}
770
771	if ((error = sysctl_wire_old_buffer(req, 0)) != 0)
772		return (error);
773
774	bzero(&xig, sizeof(xig));
775	xig.xig_len = sizeof xig;
776	xig.xig_count = V_udbinfo.ipi_count;
777	xig.xig_gen = V_udbinfo.ipi_gencnt;
778	xig.xig_sogen = so_gencnt;
779	error = SYSCTL_OUT(req, &xig, sizeof xig);
780	if (error)
781		return (error);
782
783	while ((inp = inp_next(&inpi)) != NULL) {
784		if (inp->inp_gencnt <= xig.xig_gen &&
785		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
786			struct xinpcb xi;
787
788			in_pcbtoxinpcb(inp, &xi);
789			error = SYSCTL_OUT(req, &xi, sizeof xi);
790			if (error) {
791				INP_RUNLOCK(inp);
792				break;
793			}
794		}
795	}
796
797	if (!error) {
798		/*
799		 * Give the user an updated idea of our state.  If the
800		 * generation differs from what we told her before, she knows
801		 * that something happened while we were processing this
802		 * request, and it might be necessary to retry.
803		 */
804		xig.xig_gen = V_udbinfo.ipi_gencnt;
805		xig.xig_sogen = so_gencnt;
806		xig.xig_count = V_udbinfo.ipi_count;
807		error = SYSCTL_OUT(req, &xig, sizeof xig);
808	}
809
810	return (error);
811}
812
813SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist,
814    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
815    udp_pcblist, "S,xinpcb",
816    "List of active UDP sockets");
817
818#ifdef INET
819static int
820udp_getcred(SYSCTL_HANDLER_ARGS)
821{
822	struct xucred xuc;
823	struct sockaddr_in addrs[2];
824	struct epoch_tracker et;
825	struct inpcb *inp;
826	int error;
827
828	error = priv_check(req->td, PRIV_NETINET_GETCRED);
829	if (error)
830		return (error);
831	error = SYSCTL_IN(req, addrs, sizeof(addrs));
832	if (error)
833		return (error);
834	NET_EPOCH_ENTER(et);
835	inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
836	    addrs[0].sin_addr, addrs[0].sin_port,
837	    INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
838	NET_EPOCH_EXIT(et);
839	if (inp != NULL) {
840		INP_RLOCK_ASSERT(inp);
841		if (inp->inp_socket == NULL)
842			error = ENOENT;
843		if (error == 0)
844			error = cr_canseeinpcb(req->td->td_ucred, inp);
845		if (error == 0)
846			cru2x(inp->inp_cred, &xuc);
847		INP_RUNLOCK(inp);
848	} else
849		error = ENOENT;
850	if (error == 0)
851		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
852	return (error);
853}
854
855SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
856    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
857    0, 0, udp_getcred, "S,xucred",
858    "Get the xucred of a UDP connection");
859#endif /* INET */
860
/*
 * Socket option handler for UDP and UDP-Lite sockets.
 *
 * Options whose level is not the socket's own protocol are passed on to
 * ip6_ctloutput() or ip_ctloutput(), depending on the socket's address
 * family and which of INET/INET6 are compiled in.  At the protocol level
 * this handles UDP_ENCAP (only with IPsec support compiled in) and the
 * UDP-Lite checksum coverage options UDPLITE_SEND_CSCOV and
 * UDPLITE_RECV_CSCOV; anything else yields ENOPROTOOPT.
 *
 * The pcb write lock is taken on entry and dropped explicitly on each path
 * before returning or before copying option data in or out.
 * NOTE(review): the UDP_ENCAP paths 'break' after UDPENCAP_PCBCTL() without
 * unlocking here - presumably the callee drops the lock; confirm.
 * NOTE(review): the switch on sopt_dir has no default case; a direction
 * other than SOPT_SET/SOPT_GET would return with the lock still held.
 *
 * Returns 0 on success or an errno value.
 */
int
udp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;
	struct udpcb *up;
	int isudplite, error, optval;

	error = 0;
	isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 1 : 0;
	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
	INP_WLOCK(inp);
	/* Not a UDP/UDP-Lite level option: defer to the IP layer. */
	if (sopt->sopt_level != so->so_proto->pr_protocol) {
#ifdef INET6
		if (INP_CHECK_SOCKAF(so, AF_INET6)) {
			INP_WUNLOCK(inp);
			error = ip6_ctloutput(so, sopt);
		}
#endif
#if defined(INET) && defined(INET6)
		else
#endif
#ifdef INET
		{
			INP_WUNLOCK(inp);
			error = ip_ctloutput(so, sopt);
		}
#endif
		return (error);
	}

	switch (sopt->sopt_dir) {
	case SOPT_SET:
		switch (sopt->sopt_name) {
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#if defined(INET) || defined(INET6)
		case UDP_ENCAP:
#ifdef INET
			if (INP_SOCKAF(so) == AF_INET) {
				if (!IPSEC_ENABLED(ipv4)) {
					INP_WUNLOCK(inp);
					return (ENOPROTOOPT);
				}
				error = UDPENCAP_PCBCTL(ipv4, inp, sopt);
				break;
			}
#endif /* INET */
#ifdef INET6
			if (INP_SOCKAF(so) == AF_INET6) {
				if (!IPSEC_ENABLED(ipv6)) {
					INP_WUNLOCK(inp);
					return (ENOPROTOOPT);
				}
				error = UDPENCAP_PCBCTL(ipv6, inp, sopt);
				break;
			}
#endif /* INET6 */
			INP_WUNLOCK(inp);
			return (EINVAL);
#endif /* INET || INET6 */

#endif /* IPSEC */
		case UDPLITE_SEND_CSCOV:
		case UDPLITE_RECV_CSCOV:
			if (!isudplite) {
				INP_WUNLOCK(inp);
				error = ENOPROTOOPT;
				break;
			}
			/* Drop the lock around the copyin, then retake it. */
			INP_WUNLOCK(inp);
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0)
				break;
			inp = sotoinpcb(so);
			KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
			INP_WLOCK(inp);
			up = intoudpcb(inp);
			KASSERT(up != NULL, ("%s: up == NULL", __func__));
			/* Accept 0, or any value from 8 through 65535. */
			if ((optval != 0 && optval < 8) || (optval > 65535)) {
				INP_WUNLOCK(inp);
				error = EINVAL;
				break;
			}
			if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
				up->u_txcslen = optval;
			else
				up->u_rxcslen = optval;
			INP_WUNLOCK(inp);
			break;
		default:
			INP_WUNLOCK(inp);
			error = ENOPROTOOPT;
			break;
		}
		break;
	case SOPT_GET:
		switch (sopt->sopt_name) {
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#if defined(INET) || defined(INET6)
		case UDP_ENCAP:
#ifdef INET
			if (INP_SOCKAF(so) == AF_INET) {
				if (!IPSEC_ENABLED(ipv4)) {
					INP_WUNLOCK(inp);
					return (ENOPROTOOPT);
				}
				error = UDPENCAP_PCBCTL(ipv4, inp, sopt);
				break;
			}
#endif /* INET */
#ifdef INET6
			if (INP_SOCKAF(so) == AF_INET6) {
				if (!IPSEC_ENABLED(ipv6)) {
					INP_WUNLOCK(inp);
					return (ENOPROTOOPT);
				}
				error = UDPENCAP_PCBCTL(ipv6, inp, sopt);
				break;
			}
#endif /* INET6 */
			INP_WUNLOCK(inp);
			return (EINVAL);
#endif /* INET || INET6 */

#endif /* IPSEC */
		case UDPLITE_SEND_CSCOV:
		case UDPLITE_RECV_CSCOV:
			if (!isudplite) {
				INP_WUNLOCK(inp);
				error = ENOPROTOOPT;
				break;
			}
			up = intoudpcb(inp);
			KASSERT(up != NULL, ("%s: up == NULL", __func__));
			if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
				optval = up->u_txcslen;
			else
				optval = up->u_rxcslen;
			/* Value captured; unlock before copying it out. */
			INP_WUNLOCK(inp);
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;
		default:
			INP_WUNLOCK(inp);
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}
1012
1013#ifdef INET
1014#ifdef INET6
1015/* The logic here is derived from ip6_setpktopt(). See comments there. */
1016static int
1017udp_v4mapped_pktinfo(struct cmsghdr *cm, struct sockaddr_in * src,
1018    struct inpcb *inp, int flags)
1019{
1020	struct ifnet *ifp;
1021	struct in6_pktinfo *pktinfo;
1022	struct in_addr ia;
1023
1024	if ((flags & PRUS_IPV6) == 0)
1025		return (0);
1026
1027	if (cm->cmsg_level != IPPROTO_IPV6)
1028		return (0);
1029
1030	if  (cm->cmsg_type != IPV6_2292PKTINFO &&
1031	    cm->cmsg_type != IPV6_PKTINFO)
1032		return (0);
1033
1034	if (cm->cmsg_len !=
1035	    CMSG_LEN(sizeof(struct in6_pktinfo)))
1036		return (EINVAL);
1037
1038	pktinfo = (struct in6_pktinfo *)CMSG_DATA(cm);
1039	if (!IN6_IS_ADDR_V4MAPPED(&pktinfo->ipi6_addr) &&
1040	    !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr))
1041		return (EINVAL);
1042
1043	/* Validate the interface index if specified. */
1044	if (pktinfo->ipi6_ifindex) {
1045		struct epoch_tracker et;
1046
1047		NET_EPOCH_ENTER(et);
1048		ifp = ifnet_byindex(pktinfo->ipi6_ifindex);
1049		NET_EPOCH_EXIT(et);	/* XXXGL: unsafe ifp */
1050		if (ifp == NULL)
1051			return (ENXIO);
1052	} else
1053		ifp = NULL;
1054	if (ifp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
1055		ia.s_addr = pktinfo->ipi6_addr.s6_addr32[3];
1056		if (in_ifhasaddr(ifp, ia) == 0)
1057			return (EADDRNOTAVAIL);
1058	}
1059
1060	bzero(src, sizeof(*src));
1061	src->sin_family = AF_INET;
1062	src->sin_len = sizeof(*src);
1063	src->sin_port = inp->inp_lport;
1064	src->sin_addr.s_addr = pktinfo->ipi6_addr.s6_addr32[3];
1065
1066	return (0);
1067}
1068#endif	/* INET6 */
1069
/*
 * Send one UDP or UDP-Lite datagram over IPv4.  This is the pr_send method
 * installed by UDP_PROTOSW for both protocols.
 *
 *	so	socket to send on; its inpcb must exist.
 *	flags	PRUS_* send flags.  PRUS_IPV6 marks a send on an IPv6
 *		socket and causes inp_vflag to be switched to IPv4
 *		temporarily around the bind/connect setup helpers.
 *	m	packet header mbuf chain holding the payload.
 *	addr	optional destination; when non-NULL it must be an AF_INET
 *		sockaddr_in, and the socket must not already be connected.
 *	control	optional control mbuf.  IPPROTO_IP messages understood are
 *		IP_SENDSRCADDR, IP_TOS, IP_FLOWID, IP_FLOWTYPE and, with RSS,
 *		IP_RSSBUCKETID.  With INET6 each message is also offered to
 *		udp_v4mapped_pktinfo().
 *	td	calling thread; its credentials are used for jail and bind
 *		checks.
 *
 * Both mbufs are disposed of here: 'control' is freed on every path, and
 * 'm' is either freed on error or handed to ip_output().
 *
 * Returns 0 or an errno value.  Errors generated directly in this function
 * are EAFNOSUPPORT, EINVAL, EMSGSIZE, ENOPROTOOPT, EISCONN, ENOTCONN,
 * ENOBUFS and EAGAIN; errors from the helpers it calls are passed through.
 */
int
udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
    struct mbuf *control, struct thread *td)
{
	struct inpcb *inp;
	struct udpiphdr *ui;
	int len, error = 0;
	struct in_addr faddr, laddr;
	struct cmsghdr *cm;
	struct inpcbinfo *pcbinfo;
	struct sockaddr_in *sin, src;
	struct epoch_tracker et;
	int cscov_partial = 0;
	int ipflags = 0;
	u_short fport, lport;
	u_char tos, vflagsav;
	uint8_t pr;
	uint16_t cscov = 0;
	uint32_t flowid = 0;
	uint8_t flowtype = M_HASHTYPE_NONE;
	bool use_cached_route;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("udp_send: inp == NULL"));

	/* Reject a destination that is not a full-sized AF_INET sockaddr. */
	if (addr != NULL) {
		if (addr->sa_family != AF_INET)
			error = EAFNOSUPPORT;
		else if (addr->sa_len != sizeof(struct sockaddr_in))
			error = EINVAL;
		if (__predict_false(error != 0)) {
			m_freem(control);
			m_freem(m);
			return (error);
		}
	}

	/* The payload plus UDP/IP header must fit in one IP packet. */
	len = m->m_pkthdr.len;
	if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
		if (control)
			m_freem(control);
		m_freem(m);
		return (EMSGSIZE);
	}

	/*
	 * sin_family == 0 marks "no source override"; a control message
	 * below may replace src with an AF_INET address.
	 */
	src.sin_family = 0;
	sin = (struct sockaddr_in *)addr;

	/*
	 * udp_send() may need to temporarily bind or connect the current
	 * inpcb.  As such, we don't know up front whether we will need the
	 * pcbinfo lock or not.  Do any work to decide what is needed up
	 * front before acquiring any locks.
	 *
	 * We will need network epoch in either case, to safely lookup into
	 * pcb hash.
	 */
	/*
	 * The write lock is taken when the cached route is passed to
	 * ip_output(), when the local port may be committed to the inpcb,
	 * or when inp_vflag is modified for PRUS_IPV6; a read lock is
	 * taken otherwise.
	 */
	use_cached_route = sin == NULL || (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0);
	if (use_cached_route || (flags & PRUS_IPV6) != 0)
		INP_WLOCK(inp);
	else
		INP_RLOCK(inp);
	NET_EPOCH_ENTER(et);
	tos = inp->inp_ip_tos;
	if (control != NULL) {
		/*
		 * XXX: Currently, we assume all the optional information is
		 * stored in a single mbuf.
		 */
		if (control->m_next) {
			m_freem(control);
			error = EINVAL;
			goto release;
		}
		/*
		 * Walk the control messages in place, advancing the mbuf's
		 * data pointer and length past each aligned message.
		 */
		for (; control->m_len > 0;
		    control->m_data += CMSG_ALIGN(cm->cmsg_len),
		    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
			cm = mtod(control, struct cmsghdr *);
			if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0
			    || cm->cmsg_len > control->m_len) {
				error = EINVAL;
				break;
			}
#ifdef INET6
			error = udp_v4mapped_pktinfo(cm, &src, inp, flags);
			if (error != 0)
				break;
#endif
			/* Messages for other protocol levels are skipped. */
			if (cm->cmsg_level != IPPROTO_IP)
				continue;

			switch (cm->cmsg_type) {
			case IP_SENDSRCADDR:
				if (cm->cmsg_len !=
				    CMSG_LEN(sizeof(struct in_addr))) {
					error = EINVAL;
					break;
				}
				bzero(&src, sizeof(src));
				src.sin_family = AF_INET;
				src.sin_len = sizeof(src);
				src.sin_port = inp->inp_lport;
				src.sin_addr =
				    *(struct in_addr *)CMSG_DATA(cm);
				break;

			case IP_TOS:
				if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) {
					error = EINVAL;
					break;
				}
				tos = *(u_char *)CMSG_DATA(cm);
				break;

			case IP_FLOWID:
				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
					error = EINVAL;
					break;
				}
				flowid = *(uint32_t *) CMSG_DATA(cm);
				break;

			case IP_FLOWTYPE:
				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
					error = EINVAL;
					break;
				}
				/*
				 * NOTE(review): the 32-bit value supplied by
				 * the caller is narrowed to the uint8_t
				 * flowtype here.
				 */
				flowtype = *(uint32_t *) CMSG_DATA(cm);
				break;

#ifdef	RSS
			case IP_RSSBUCKETID:
				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
					error = EINVAL;
					break;
				}
				/* This is just a placeholder for now */
				break;
#endif	/* RSS */
			default:
				error = ENOPROTOOPT;
				break;
			}
			/* The breaks above only leave the switch. */
			if (error)
				break;
		}
		m_freem(control);
		control = NULL;
	}
	if (error)
		goto release;

	pr = inp->inp_socket->so_proto->pr_protocol;
	pcbinfo = udp_get_inpcbinfo(pr);

	/*
	 * If the IP_SENDSRCADDR control message was specified, override the
	 * source address for this datagram.  Its use is invalidated if the
	 * address thus specified is incomplete or clobbers other inpcbs.
	 */
	laddr = inp->inp_laddr;
	lport = inp->inp_lport;
	if (src.sin_family == AF_INET) {
		if ((lport == 0) ||
		    (laddr.s_addr == INADDR_ANY &&
		     src.sin_addr.s_addr == INADDR_ANY)) {
			error = EINVAL;
			goto release;
		}
		/*
		 * Switch inp_vflag to IPv4 around the bind setup call and
		 * restore the saved value afterwards.
		 */
		if ((flags & PRUS_IPV6) != 0) {
			vflagsav = inp->inp_vflag;
			inp->inp_vflag |= INP_IPV4;
			inp->inp_vflag &= ~INP_IPV6;
		}
		INP_HASH_WLOCK(pcbinfo);
		error = in_pcbbind_setup(inp, &src, &laddr.s_addr, &lport,
		    td->td_ucred);
		INP_HASH_WUNLOCK(pcbinfo);
		if ((flags & PRUS_IPV6) != 0)
			inp->inp_vflag = vflagsav;
		if (error)
			goto release;
	}

	/*
	 * If a UDP socket has been connected, then a local address/port will
	 * have been selected and bound.
	 *
	 * If a UDP socket has not been connected to, then an explicit
	 * destination address must be used, in which case a local
	 * address/port may not have been selected and bound.
	 */
	if (sin != NULL) {
		INP_LOCK_ASSERT(inp);
		if (inp->inp_faddr.s_addr != INADDR_ANY) {
			error = EISCONN;
			goto release;
		}

		/*
		 * Jail may rewrite the destination address, so let it do
		 * that before we use it.
		 */
		error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
		if (error)
			goto release;

		/*
		 * If a local address or port hasn't yet been selected, or if
		 * the destination address needs to be rewritten due to using
		 * a special INADDR_ constant, invoke in_pcbconnect_setup()
		 * to do the heavy lifting.  Once a port is selected, we
		 * commit the binding back to the socket; we also commit the
		 * binding of the address if in jail.
		 *
		 * If we already have a valid binding and we're not
		 * requesting a destination address rewrite, use a fast path.
		 */
		if (inp->inp_laddr.s_addr == INADDR_ANY ||
		    inp->inp_lport == 0 ||
		    sin->sin_addr.s_addr == INADDR_ANY ||
		    sin->sin_addr.s_addr == INADDR_BROADCAST) {
			if ((flags & PRUS_IPV6) != 0) {
				vflagsav = inp->inp_vflag;
				inp->inp_vflag |= INP_IPV4;
				inp->inp_vflag &= ~INP_IPV6;
			}
			INP_HASH_WLOCK(pcbinfo);
			error = in_pcbconnect_setup(inp, sin, &laddr.s_addr,
			    &lport, &faddr.s_addr, &fport, td->td_ucred);
			if ((flags & PRUS_IPV6) != 0)
				inp->inp_vflag = vflagsav;
			if (error) {
				INP_HASH_WUNLOCK(pcbinfo);
				goto release;
			}

			/*
			 * XXXRW: Why not commit the port if the address is
			 * !INADDR_ANY?
			 */
			/* Commit the local port if newly assigned. */
			if (inp->inp_laddr.s_addr == INADDR_ANY &&
			    inp->inp_lport == 0) {
				INP_WLOCK_ASSERT(inp);
				/*
				 * Remember addr if jailed, to prevent
				 * rebinding.
				 */
				if (prison_flag(td->td_ucred, PR_IP4))
					inp->inp_laddr = laddr;
				inp->inp_lport = lport;
				error = in_pcbinshash(inp);
				INP_HASH_WUNLOCK(pcbinfo);
				if (error != 0) {
					/* Undo the port commit on failure. */
					inp->inp_lport = 0;
					error = EAGAIN;
					goto release;
				}
				inp->inp_flags |= INP_ANONPORT;
			} else
				INP_HASH_WUNLOCK(pcbinfo);
		} else {
			faddr = sin->sin_addr;
			fport = sin->sin_port;
		}
	} else {
		/* No explicit destination: the socket must be connected. */
		INP_LOCK_ASSERT(inp);
		faddr = inp->inp_faddr;
		fport = inp->inp_fport;
		if (faddr.s_addr == INADDR_ANY) {
			error = ENOTCONN;
			goto release;
		}
	}

	/*
	 * Calculate data length and get a mbuf for UDP, IP, and possible
	 * link-layer headers.  Immediately slide the data pointer back
	 * forward since we won't use that space at this layer.
	 */
	M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT);
	if (m == NULL) {
		error = ENOBUFS;
		goto release;
	}
	m->m_data += max_linkhdr;
	m->m_len -= max_linkhdr;
	m->m_pkthdr.len -= max_linkhdr;

	/*
	 * Fill in mbuf with extended UDP header and addresses and length put
	 * into network format.
	 */
	ui = mtod(m, struct udpiphdr *);
	/*
	 * Filling only those fields of udpiphdr that participate in the
	 * checksum calculation. The rest must be zeroed and will be filled
	 * later.
	 */
	bzero(ui->ui_x1, sizeof(ui->ui_x1));
	ui->ui_pr = pr;
	ui->ui_src = laddr;
	ui->ui_dst = faddr;
	ui->ui_sport = lport;
	ui->ui_dport = fport;
	ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
	if (pr == IPPROTO_UDPLITE) {
		struct udpcb *up;
		uint16_t plen;

		up = intoudpcb(inp);
		cscov = up->u_txcslen;
		plen = (u_short)len + sizeof(struct udphdr);
		/* Coverage at or beyond the packet length means "all". */
		if (cscov >= plen)
			cscov = 0;
		ui->ui_len = htons(plen);
		ui->ui_ulen = htons(cscov);
		/*
		 * For UDP-Lite, checksum coverage length of zero means
		 * the entire UDPLite packet is covered by the checksum.
		 */
		cscov_partial = (cscov == 0) ? 0 : 1;
	}

	/* Derive ip_output() flags from socket options and inpcb flags. */
	if (inp->inp_socket->so_options & SO_DONTROUTE)
		ipflags |= IP_ROUTETOIF;
	if (inp->inp_socket->so_options & SO_BROADCAST)
		ipflags |= IP_ALLOWBROADCAST;
	if (inp->inp_flags & INP_ONESBCAST)
		ipflags |= IP_SENDONES;

#ifdef MAC
	mac_inpcb_create_mbuf(inp, m);
#endif

	/*
	 * Set up checksum and output datagram.
	 */
	/*
	 * UDP-Lite is always checksummed here with in_cksum(), and a
	 * computed value of zero is stored as 0xffff.  Plain UDP stores
	 * only the pseudo-header sum and sets CSUM_UDP in the packet
	 * header, and only when V_udp_cksum is non-zero.
	 */
	ui->ui_sum = 0;
	if (pr == IPPROTO_UDPLITE) {
		if (inp->inp_flags & INP_ONESBCAST)
			faddr.s_addr = INADDR_BROADCAST;
		if (cscov_partial) {
			if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0)
				ui->ui_sum = 0xffff;
		} else {
			if ((ui->ui_sum = in_cksum(m, sizeof(struct udpiphdr) + len)) == 0)
				ui->ui_sum = 0xffff;
		}
	} else if (V_udp_cksum) {
		if (inp->inp_flags & INP_ONESBCAST)
			faddr.s_addr = INADDR_BROADCAST;
		ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
		    htons((u_short)len + sizeof(struct udphdr) + pr));
		m->m_pkthdr.csum_flags = CSUM_UDP;
		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
	}
	/*
	 * After finishing the checksum computation, fill the remaining fields
	 * of udpiphdr.
	 */
	((struct ip *)ui)->ip_v = IPVERSION;
	((struct ip *)ui)->ip_tos = tos;
	((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len);
	if (inp->inp_flags & INP_DONTFRAG)
		((struct ip *)ui)->ip_off |= htons(IP_DF);
	((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl;
	UDPSTAT_INC(udps_opackets);

	/*
	 * Setup flowid / RSS information for outbound socket.
	 *
	 * Once the UDP code decides to set a flowid some other way,
	 * this allows the flowid to be overridden by userland.
	 */
	if (flowtype != M_HASHTYPE_NONE) {
		m->m_pkthdr.flowid = flowid;
		M_HASHTYPE_SET(m, flowtype);
	}
#if defined(ROUTE_MPATH) || defined(RSS)
	else if (CALC_FLOWID_OUTBOUND_SENDTO) {
		uint32_t hash_val, hash_type;

		hash_val = fib4_calc_packet_hash(laddr, faddr,
		    lport, fport, pr, &hash_type);
		m->m_pkthdr.flowid = hash_val;
		M_HASHTYPE_SET(m, hash_type);
	}

	/*
	 * Don't override with the inp cached flowid value.
	 *
	 * Depending upon the kind of send being done, the inp
	 * flowid/flowtype values may actually not be appropriate
	 * for this particular socket send.
	 *
	 * We should either leave the flowid at zero (which is what is
	 * currently done) or set it to some software generated
	 * hash value based on the packet contents.
	 */
	ipflags |= IP_NODEFAULTFLOWID;
#endif	/* ROUTE_MPATH || RSS */

	if (pr == IPPROTO_UDPLITE)
		UDPLITE_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
	else
		UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
	/* The inpcb's cached route is only offered when use_cached_route. */
	error = ip_output(m, inp->inp_options,
	    use_cached_route ? &inp->inp_route : NULL, ipflags,
	    inp->inp_moptions, inp);
	INP_UNLOCK(inp);
	NET_EPOCH_EXIT(et);
	return (error);

release:
	/* Common error exit: drop the lock and epoch, then free the packet. */
	INP_UNLOCK(inp);
	NET_EPOCH_EXIT(et);
	m_freem(m);
	return (error);
}
1491
1492void
1493udp_abort(struct socket *so)
1494{
1495	struct inpcb *inp;
1496	struct inpcbinfo *pcbinfo;
1497
1498	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1499	inp = sotoinpcb(so);
1500	KASSERT(inp != NULL, ("udp_abort: inp == NULL"));
1501	INP_WLOCK(inp);
1502	if (inp->inp_faddr.s_addr != INADDR_ANY) {
1503		INP_HASH_WLOCK(pcbinfo);
1504		in_pcbdisconnect(inp);
1505		INP_HASH_WUNLOCK(pcbinfo);
1506		soisdisconnected(so);
1507	}
1508	INP_WUNLOCK(inp);
1509}
1510
1511static int
1512udp_attach(struct socket *so, int proto, struct thread *td)
1513{
1514	static uint32_t udp_flowid;
1515	struct inpcbinfo *pcbinfo;
1516	struct inpcb *inp;
1517	struct udpcb *up;
1518	int error;
1519
1520	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1521	inp = sotoinpcb(so);
1522	KASSERT(inp == NULL, ("udp_attach: inp != NULL"));
1523	error = soreserve(so, udp_sendspace, udp_recvspace);
1524	if (error)
1525		return (error);
1526	error = in_pcballoc(so, pcbinfo);
1527	if (error)
1528		return (error);
1529
1530	inp = sotoinpcb(so);
1531	inp->inp_ip_ttl = V_ip_defttl;
1532	inp->inp_flowid = atomic_fetchadd_int(&udp_flowid, 1);
1533	inp->inp_flowtype = M_HASHTYPE_OPAQUE;
1534	up = intoudpcb(inp);
1535	bzero(&up->u_start_zero, u_zero_size);
1536	INP_WUNLOCK(inp);
1537
1538	return (0);
1539}
1540#endif /* INET */
1541
1542int
1543udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, udp_tun_icmp_t i, void *ctx)
1544{
1545	struct inpcb *inp;
1546	struct udpcb *up;
1547
1548	KASSERT(so->so_type == SOCK_DGRAM,
1549	    ("udp_set_kernel_tunneling: !dgram"));
1550	inp = sotoinpcb(so);
1551	KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL"));
1552	INP_WLOCK(inp);
1553	up = intoudpcb(inp);
1554	if ((f != NULL || i != NULL) && ((up->u_tun_func != NULL) ||
1555	    (up->u_icmp_func != NULL))) {
1556		INP_WUNLOCK(inp);
1557		return (EBUSY);
1558	}
1559	up->u_tun_func = f;
1560	up->u_icmp_func = i;
1561	up->u_tun_ctx = ctx;
1562	INP_WUNLOCK(inp);
1563	return (0);
1564}
1565
1566#ifdef INET
1567static int
1568udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
1569{
1570	struct inpcb *inp;
1571	struct inpcbinfo *pcbinfo;
1572	struct sockaddr_in *sinp;
1573	int error;
1574
1575	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1576	inp = sotoinpcb(so);
1577	KASSERT(inp != NULL, ("udp_bind: inp == NULL"));
1578
1579	sinp = (struct sockaddr_in *)nam;
1580	if (nam->sa_family != AF_INET) {
1581		/*
1582		 * Preserve compatibility with old programs.
1583		 */
1584		if (nam->sa_family != AF_UNSPEC ||
1585		    nam->sa_len < offsetof(struct sockaddr_in, sin_zero) ||
1586		    sinp->sin_addr.s_addr != INADDR_ANY)
1587			return (EAFNOSUPPORT);
1588		nam->sa_family = AF_INET;
1589	}
1590	if (nam->sa_len != sizeof(struct sockaddr_in))
1591		return (EINVAL);
1592
1593	INP_WLOCK(inp);
1594	INP_HASH_WLOCK(pcbinfo);
1595	error = in_pcbbind(inp, sinp, td->td_ucred);
1596	INP_HASH_WUNLOCK(pcbinfo);
1597	INP_WUNLOCK(inp);
1598	return (error);
1599}
1600
1601static void
1602udp_close(struct socket *so)
1603{
1604	struct inpcb *inp;
1605	struct inpcbinfo *pcbinfo;
1606
1607	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1608	inp = sotoinpcb(so);
1609	KASSERT(inp != NULL, ("udp_close: inp == NULL"));
1610	INP_WLOCK(inp);
1611	if (inp->inp_faddr.s_addr != INADDR_ANY) {
1612		INP_HASH_WLOCK(pcbinfo);
1613		in_pcbdisconnect(inp);
1614		INP_HASH_WUNLOCK(pcbinfo);
1615		soisdisconnected(so);
1616	}
1617	INP_WUNLOCK(inp);
1618}
1619
1620static int
1621udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
1622{
1623	struct epoch_tracker et;
1624	struct inpcb *inp;
1625	struct inpcbinfo *pcbinfo;
1626	struct sockaddr_in *sin;
1627	int error;
1628
1629	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1630	inp = sotoinpcb(so);
1631	KASSERT(inp != NULL, ("udp_connect: inp == NULL"));
1632
1633	sin = (struct sockaddr_in *)nam;
1634	if (sin->sin_family != AF_INET)
1635		return (EAFNOSUPPORT);
1636	if (sin->sin_len != sizeof(*sin))
1637		return (EINVAL);
1638
1639	INP_WLOCK(inp);
1640	if (inp->inp_faddr.s_addr != INADDR_ANY) {
1641		INP_WUNLOCK(inp);
1642		return (EISCONN);
1643	}
1644	error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
1645	if (error != 0) {
1646		INP_WUNLOCK(inp);
1647		return (error);
1648	}
1649	NET_EPOCH_ENTER(et);
1650	INP_HASH_WLOCK(pcbinfo);
1651	error = in_pcbconnect(inp, sin, td->td_ucred, true);
1652	INP_HASH_WUNLOCK(pcbinfo);
1653	NET_EPOCH_EXIT(et);
1654	if (error == 0)
1655		soisconnected(so);
1656	INP_WUNLOCK(inp);
1657	return (error);
1658}
1659
1660static void
1661udp_detach(struct socket *so)
1662{
1663	struct inpcb *inp;
1664
1665	inp = sotoinpcb(so);
1666	KASSERT(inp != NULL, ("udp_detach: inp == NULL"));
1667	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
1668	    ("udp_detach: not disconnected"));
1669	INP_WLOCK(inp);
1670	in_pcbfree(inp);
1671}
1672
1673int
1674udp_disconnect(struct socket *so)
1675{
1676	struct inpcb *inp;
1677	struct inpcbinfo *pcbinfo;
1678
1679	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
1680	inp = sotoinpcb(so);
1681	KASSERT(inp != NULL, ("udp_disconnect: inp == NULL"));
1682	INP_WLOCK(inp);
1683	if (inp->inp_faddr.s_addr == INADDR_ANY) {
1684		INP_WUNLOCK(inp);
1685		return (ENOTCONN);
1686	}
1687	INP_HASH_WLOCK(pcbinfo);
1688	in_pcbdisconnect(inp);
1689	INP_HASH_WUNLOCK(pcbinfo);
1690	SOCK_LOCK(so);
1691	so->so_state &= ~SS_ISCONNECTED;		/* XXX */
1692	SOCK_UNLOCK(so);
1693	INP_WUNLOCK(inp);
1694	return (0);
1695}
1696#endif /* INET */
1697
1698int
1699udp_shutdown(struct socket *so, enum shutdown_how how)
1700{
1701	int error;
1702
1703	SOCK_LOCK(so);
1704	if (!(so->so_state & SS_ISCONNECTED))
1705		/*
1706		 * POSIX mandates us to just return ENOTCONN when shutdown(2) is
1707		 * invoked on a datagram sockets, however historically we would
1708		 * actually tear socket down.  This is known to be leveraged by
1709		 * some applications to unblock process waiting in recv(2) by
1710		 * other process that it shares that socket with.  Try to meet
1711		 * both backward-compatibility and POSIX requirements by forcing
1712		 * ENOTCONN but still flushing buffers and performing wakeup(9).
1713		 *
1714		 * XXXGL: it remains unknown what applications expect this
1715		 * behavior and is this isolated to unix/dgram or inet/dgram or
1716		 * both.  See: D10351, D3039.
1717		 */
1718		error = ENOTCONN;
1719	else
1720		error = 0;
1721	SOCK_UNLOCK(so);
1722
1723	switch (how) {
1724	case SHUT_RD:
1725		sorflush(so);
1726		break;
1727	case SHUT_RDWR:
1728		sorflush(so);
1729		/* FALLTHROUGH */
1730	case SHUT_WR:
1731		socantsendmore(so);
1732	}
1733
1734	return (error);
1735}
1736
1737#ifdef INET
1738#define	UDP_PROTOSW							\
1739	.pr_type =		SOCK_DGRAM,				\
1740	.pr_flags =		PR_ATOMIC | PR_ADDR | PR_CAPATTACH,	\
1741	.pr_ctloutput =		udp_ctloutput,				\
1742	.pr_abort =		udp_abort,				\
1743	.pr_attach =		udp_attach,				\
1744	.pr_bind =		udp_bind,				\
1745	.pr_connect =		udp_connect,				\
1746	.pr_control =		in_control,				\
1747	.pr_detach =		udp_detach,				\
1748	.pr_disconnect =	udp_disconnect,				\
1749	.pr_peeraddr =		in_getpeeraddr,				\
1750	.pr_send =		udp_send,				\
1751	.pr_soreceive =		soreceive_dgram,			\
1752	.pr_sosend =		sosend_dgram,				\
1753	.pr_shutdown =		udp_shutdown,				\
1754	.pr_sockaddr =		in_getsockaddr,				\
1755	.pr_sosetlabel =	in_pcbsosetlabel,			\
1756	.pr_close =		udp_close
1757
1758struct protosw udp_protosw = {
1759	.pr_protocol =		IPPROTO_UDP,
1760	UDP_PROTOSW
1761};
1762
1763struct protosw udplite_protosw = {
1764	.pr_protocol =		IPPROTO_UDPLITE,
1765	UDP_PROTOSW
1766};
1767
/*
 * Boot-time hook that registers the IPv4 handlers for UDP and UDP-Lite.
 * Both protocols share udp_input; they differ in the second handler
 * (udp_ctlinput vs. udplite_ctlinput).  IPPROTO_REGISTER is defined
 * elsewhere; presumably its arguments are the protocol number, the input
 * routine and the control-input routine.
 */
static void
udp_init(void *arg __unused)
{

	IPPROTO_REGISTER(IPPROTO_UDP, udp_input, udp_ctlinput);
	IPPROTO_REGISTER(IPPROTO_UDPLITE, udp_input, udplite_ctlinput);
}
/* Run udp_init at SI_SUB_PROTO_DOMAIN, order SI_ORDER_THIRD. */
SYSINIT(udp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, udp_init, NULL);
1776#endif /* INET */
1777