udp_usrreq.c revision 265946
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3 *	The Regents of the University of California.
4 * Copyright (c) 2008 Robert N. M. Watson
5 * Copyright (c) 2010-2011 Juniper Networks, Inc.
6 * Copyright (c) 2014 Kevin Lo
7 * All rights reserved.
8 *
9 * Portions of this software were developed by Robert N. M. Watson under
10 * contract to Juniper Networks, Inc.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 *    notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 *    notice, this list of conditions and the following disclaimer in the
19 *    documentation and/or other materials provided with the distribution.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	@(#)udp_usrreq.c	8.6 (Berkeley) 5/23/95
37 */
38
39#include <sys/cdefs.h>
40__FBSDID("$FreeBSD: stable/10/sys/netinet/udp_usrreq.c 265946 2014-05-13 06:05:53Z kevlo $");
41
42#include "opt_ipfw.h"
43#include "opt_inet.h"
44#include "opt_inet6.h"
45#include "opt_ipsec.h"
46#include "opt_kdtrace.h"
47
48#include <sys/param.h>
49#include <sys/domain.h>
50#include <sys/eventhandler.h>
51#include <sys/jail.h>
52#include <sys/kernel.h>
53#include <sys/lock.h>
54#include <sys/malloc.h>
55#include <sys/mbuf.h>
56#include <sys/priv.h>
57#include <sys/proc.h>
58#include <sys/protosw.h>
59#include <sys/sdt.h>
60#include <sys/signalvar.h>
61#include <sys/socket.h>
62#include <sys/socketvar.h>
63#include <sys/sx.h>
64#include <sys/sysctl.h>
65#include <sys/syslog.h>
66#include <sys/systm.h>
67
68#include <vm/uma.h>
69
70#include <net/if.h>
71#include <net/route.h>
72
73#include <netinet/in.h>
74#include <netinet/in_kdtrace.h>
75#include <netinet/in_pcb.h>
76#include <netinet/in_systm.h>
77#include <netinet/in_var.h>
78#include <netinet/ip.h>
79#ifdef INET6
80#include <netinet/ip6.h>
81#endif
82#include <netinet/ip_icmp.h>
83#include <netinet/icmp_var.h>
84#include <netinet/ip_var.h>
85#include <netinet/ip_options.h>
86#ifdef INET6
87#include <netinet6/ip6_var.h>
88#endif
89#include <netinet/udp.h>
90#include <netinet/udp_var.h>
91#include <netinet/udplite.h>
92
93#ifdef IPSEC
94#include <netipsec/ipsec.h>
95#include <netipsec/esp.h>
96#endif
97
98#include <machine/in_cksum.h>
99
100#include <security/mac/mac_framework.h>
101
102/*
103 * UDP and UDP-Lite protocols implementation.
104 * Per RFC 768, August, 1980.
105 * Per RFC 3828, July, 2004.
106 */
107
/*
 * BSD 4.2 defaulted the udp checksum to be off.  Turning off udp checksums
 * removes the only data integrity mechanism for packets: malformed packets
 * that would otherwise be discarded due to bad checksums are accepted, which
 * may cause problems (especially for NFS data blocks).
 */
VNET_DEFINE(int, udp_cksum) = 1;
SYSCTL_VNET_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW,
    &VNET_NAME(udp_cksum), 0, "compute udp checksum");

/* When non-zero, udp_input() logs datagrams that match no socket. */
int	udp_log_in_vain = 0;
SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW,
    &udp_log_in_vain, 0, "Log all incoming UDP packets");

/*
 * When non-zero, udp_input() silently drops datagrams that match no socket
 * instead of answering with an ICMP port unreachable.
 */
VNET_DEFINE(int, udp_blackhole) = 0;
SYSCTL_VNET_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW,
    &VNET_NAME(udp_blackhole), 0,
    "Do not send port unreachables for refused connects");

u_long	udp_sendspace = 9216;		/* really max datagram size */
SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
    &udp_sendspace, 0, "Maximum outgoing UDP datagram size");

/* Room for 40 1K datagrams, each accompanied by its source sockaddr. */
u_long	udp_recvspace = 40 * (1024 +
#ifdef INET6
				      sizeof(struct sockaddr_in6)
#else
				      sizeof(struct sockaddr_in)
#endif
				      );

SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
    &udp_recvspace, 0, "Maximum space for incoming UDP datagrams");

/* Per-vnet pcb lists and pcbinfo for UDP (udb*) and UDP-Lite (ulitecb*). */
VNET_DEFINE(struct inpcbhead, udb);		/* from udp_var.h */
VNET_DEFINE(struct inpcbinfo, udbinfo);
VNET_DEFINE(struct inpcbhead, ulitecb);
VNET_DEFINE(struct inpcbinfo, ulitecbinfo);
/* UMA zone backing struct udpcb allocations (see udp_newudpcb()). */
static VNET_DEFINE(uma_zone_t, udpcb_zone);
#define	V_udpcb_zone			VNET(udpcb_zone)

/* Hash table size passed to in_pcbinfo_init(); may be overridden at build. */
#ifndef UDBHASHSIZE
#define	UDBHASHSIZE	128
#endif

VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat);		/* from udp_var.h */
VNET_PCPUSTAT_SYSINIT(udpstat);
SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat,
    udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)");

#ifdef VIMAGE
VNET_PCPUSTAT_SYSUNINIT(udpstat);
#endif /* VIMAGE */
#ifdef INET
/* Forward declarations for functions defined later in this file. */
static void	udp_detach(struct socket *so);
static int	udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
		    struct mbuf *, struct thread *);
#endif

#ifdef IPSEC
#ifdef IPSEC_NAT_T
/* Mask of every ESP-in-UDP encapsulation flag kept in udpcb u_flags. */
#define	UF_ESPINUDP_ALL	(UF_ESPINUDP_NON_IKE|UF_ESPINUDP)
#ifdef INET
static struct mbuf *udp4_espdecap(struct inpcb *, struct mbuf *, int);
#endif
#endif /* IPSEC_NAT_T */
#endif /* IPSEC */
176
177static void
178udp_zone_change(void *tag)
179{
180
181	uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets);
182	uma_zone_set_max(V_udpcb_zone, maxsockets);
183}
184
/*
 * Init callback handed to in_pcbinfo_init() for the UDP inpcb zone.
 * Treats mem as a struct inpcb and initializes its lock; size and flags
 * are not used.  Always returns 0.
 */
static int
udp_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *pcb = mem;

	INP_LOCK_INIT(pcb, "inp", "udpinp");
	return (0);
}
194
/*
 * Init callback handed to in_pcbinfo_init() for the UDP-Lite inpcb zone.
 * Treats mem as a struct inpcb and initializes its lock; size and flags
 * are not used.  Always returns 0.
 */
static int
udplite_inpcb_init(void *mem, int size, int flags)
{
	struct inpcb *pcb = mem;

	INP_LOCK_INIT(pcb, "inp", "udpliteinp");
	return (0);
}
204
205void
206udp_init(void)
207{
208
209	in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE,
210	    "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE,
211	    IPI_HASHFIELDS_2TUPLE);
212	V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
213	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
214	uma_zone_set_max(V_udpcb_zone, maxsockets);
215	uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached");
216	EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL,
217	    EVENTHANDLER_PRI_ANY);
218}
219
220void
221udplite_init(void)
222{
223
224	in_pcbinfo_init(&V_ulitecbinfo, "udplite", &V_ulitecb, UDBHASHSIZE,
225	    UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init, NULL,
226	    UMA_ZONE_NOFREE, IPI_HASHFIELDS_2TUPLE);
227}
228
229/*
230 * Kernel module interface for updating udpstat.  The argument is an index
231 * into udpstat treated as an array of u_long.  While this encodes the
232 * general layout of udpstat into the caller, it doesn't encode its location,
233 * so that future changes to add, for example, per-CPU stats support won't
234 * cause binary compatibility problems for kernel modules.
235 */
236void
237kmod_udpstat_inc(int statnum)
238{
239
240	counter_u64_add(VNET(udpstat)[statnum], 1);
241}
242
243int
244udp_newudpcb(struct inpcb *inp)
245{
246	struct udpcb *up;
247
248	up = uma_zalloc(V_udpcb_zone, M_NOWAIT | M_ZERO);
249	if (up == NULL)
250		return (ENOBUFS);
251	inp->inp_ppcb = up;
252	return (0);
253}
254
255void
256udp_discardcb(struct udpcb *up)
257{
258
259	uma_zfree(V_udpcb_zone, up);
260}
261
262#ifdef VIMAGE
263void
264udp_destroy(void)
265{
266
267	in_pcbinfo_destroy(&V_udbinfo);
268	uma_zdestroy(V_udpcb_zone);
269}
270
271void
272udplite_destroy(void)
273{
274
275	in_pcbinfo_destroy(&V_ulitecbinfo);
276}
277#endif
278
279#ifdef INET
280/*
281 * Subroutine of udp_input(), which appends the provided mbuf chain to the
282 * passed pcb/socket.  The caller must provide a sockaddr_in via udp_in that
283 * contains the source address.  If the socket ends up being an IPv6 socket,
284 * udp_append() will convert to a sockaddr_in6 before passing the address
285 * into the socket code.
286 */
287static void
288udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
289    struct sockaddr_in *udp_in)
290{
291	struct sockaddr *append_sa;
292	struct socket *so;
293	struct mbuf *opts = 0;
294#ifdef INET6
295	struct sockaddr_in6 udp_in6;
296#endif
297	struct udpcb *up;
298
299	INP_LOCK_ASSERT(inp);
300
301	/*
302	 * Engage the tunneling protocol.
303	 */
304	up = intoudpcb(inp);
305	if (up->u_tun_func != NULL) {
306		(*up->u_tun_func)(n, off, inp);
307		return;
308	}
309
310	if (n == NULL)
311		return;
312
313	off += sizeof(struct udphdr);
314
315#ifdef IPSEC
316	/* Check AH/ESP integrity. */
317	if (ipsec4_in_reject(n, inp)) {
318		m_freem(n);
319		IPSECSTAT_INC(ips_in_polvio);
320		return;
321	}
322#ifdef IPSEC_NAT_T
323	up = intoudpcb(inp);
324	KASSERT(up != NULL, ("%s: udpcb NULL", __func__));
325	if (up->u_flags & UF_ESPINUDP_ALL) {	/* IPSec UDP encaps. */
326		n = udp4_espdecap(inp, n, off);
327		if (n == NULL)				/* Consumed. */
328			return;
329	}
330#endif /* IPSEC_NAT_T */
331#endif /* IPSEC */
332#ifdef MAC
333	if (mac_inpcb_check_deliver(inp, n) != 0) {
334		m_freem(n);
335		return;
336	}
337#endif /* MAC */
338	if (inp->inp_flags & INP_CONTROLOPTS ||
339	    inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
340#ifdef INET6
341		if (inp->inp_vflag & INP_IPV6)
342			(void)ip6_savecontrol_v4(inp, n, &opts, NULL);
343		else
344#endif /* INET6 */
345			ip_savecontrol(inp, &opts, ip, n);
346	}
347#ifdef INET6
348	if (inp->inp_vflag & INP_IPV6) {
349		bzero(&udp_in6, sizeof(udp_in6));
350		udp_in6.sin6_len = sizeof(udp_in6);
351		udp_in6.sin6_family = AF_INET6;
352		in6_sin_2_v4mapsin6(udp_in, &udp_in6);
353		append_sa = (struct sockaddr *)&udp_in6;
354	} else
355#endif /* INET6 */
356		append_sa = (struct sockaddr *)udp_in;
357	m_adj(n, off);
358
359	so = inp->inp_socket;
360	SOCKBUF_LOCK(&so->so_rcv);
361	if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) {
362		SOCKBUF_UNLOCK(&so->so_rcv);
363		m_freem(n);
364		if (opts)
365			m_freem(opts);
366		UDPSTAT_INC(udps_fullsock);
367	} else
368		sorwakeup_locked(so);
369}
370
371void
372udp_input(struct mbuf *m, int off)
373{
374	int iphlen = off;
375	struct ip *ip;
376	struct udphdr *uh;
377	struct ifnet *ifp;
378	struct inpcb *inp;
379	uint16_t len, ip_len;
380	struct inpcbinfo *pcbinfo;
381	struct ip save_ip;
382	struct sockaddr_in udp_in;
383	struct m_tag *fwd_tag;
384	int cscov_partial;
385	uint8_t pr;
386
387	ifp = m->m_pkthdr.rcvif;
388	UDPSTAT_INC(udps_ipackets);
389
390	/*
391	 * Strip IP options, if any; should skip this, make available to
392	 * user, and use on returned packets, but we don't yet have a way to
393	 * check the checksum with options still present.
394	 */
395	if (iphlen > sizeof (struct ip)) {
396		ip_stripoptions(m);
397		iphlen = sizeof(struct ip);
398	}
399
400	/*
401	 * Get IP and UDP header together in first mbuf.
402	 */
403	ip = mtod(m, struct ip *);
404	if (m->m_len < iphlen + sizeof(struct udphdr)) {
405		if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) {
406			UDPSTAT_INC(udps_hdrops);
407			return;
408		}
409		ip = mtod(m, struct ip *);
410	}
411	uh = (struct udphdr *)((caddr_t)ip + iphlen);
412	pr = ip->ip_p;
413	cscov_partial = (pr == IPPROTO_UDPLITE) ? 1 : 0;
414
415	/*
416	 * Destination port of 0 is illegal, based on RFC768.
417	 */
418	if (uh->uh_dport == 0)
419		goto badunlocked;
420
421	/*
422	 * Construct sockaddr format source address.  Stuff source address
423	 * and datagram in user buffer.
424	 */
425	bzero(&udp_in, sizeof(udp_in));
426	udp_in.sin_len = sizeof(udp_in);
427	udp_in.sin_family = AF_INET;
428	udp_in.sin_port = uh->uh_sport;
429	udp_in.sin_addr = ip->ip_src;
430
431	/*
432	 * Make mbuf data length reflect UDP length.  If not enough data to
433	 * reflect UDP length, drop.
434	 */
435	len = ntohs((u_short)uh->uh_ulen);
436	ip_len = ntohs(ip->ip_len) - iphlen;
437	if (pr == IPPROTO_UDPLITE && len == 0) {
438		/* Zero means checksum over the complete packet. */
439		len = ip_len;
440		cscov_partial = 0;
441	}
442	if (ip_len != len) {
443		if (len > ip_len || len < sizeof(struct udphdr)) {
444			UDPSTAT_INC(udps_badlen);
445			goto badunlocked;
446		}
447		if (pr == IPPROTO_UDP)
448			m_adj(m, len - ip_len);
449	}
450
451	/*
452	 * Save a copy of the IP header in case we want restore it for
453	 * sending an ICMP error message in response.
454	 */
455	if (!V_udp_blackhole)
456		save_ip = *ip;
457	else
458		memset(&save_ip, 0, sizeof(save_ip));
459
460	/*
461	 * Checksum extended UDP header and data.
462	 */
463	if (uh->uh_sum) {
464		u_short uh_sum;
465
466		if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) &&
467		    !cscov_partial) {
468			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
469				uh_sum = m->m_pkthdr.csum_data;
470			else
471				uh_sum = in_pseudo(ip->ip_src.s_addr,
472				    ip->ip_dst.s_addr, htonl((u_short)len +
473				    m->m_pkthdr.csum_data + pr));
474			uh_sum ^= 0xffff;
475		} else {
476			char b[9];
477
478			bcopy(((struct ipovly *)ip)->ih_x1, b, 9);
479			bzero(((struct ipovly *)ip)->ih_x1, 9);
480			((struct ipovly *)ip)->ih_len = (pr == IPPROTO_UDP) ?
481			    uh->uh_ulen : htons(ip_len);
482			uh_sum = in_cksum(m, len + sizeof (struct ip));
483			bcopy(b, ((struct ipovly *)ip)->ih_x1, 9);
484		}
485		if (uh_sum) {
486			UDPSTAT_INC(udps_badsum);
487			m_freem(m);
488			return;
489		}
490	} else
491		UDPSTAT_INC(udps_nosum);
492
493	pcbinfo = get_inpcbinfo(pr);
494	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
495	    in_broadcast(ip->ip_dst, ifp)) {
496		struct inpcb *last;
497		struct inpcbhead *pcblist;
498		struct ip_moptions *imo;
499
500		INP_INFO_RLOCK(pcbinfo);
501		pcblist = get_pcblist(pr);
502		last = NULL;
503		LIST_FOREACH(inp, pcblist, inp_list) {
504			if (inp->inp_lport != uh->uh_dport)
505				continue;
506#ifdef INET6
507			if ((inp->inp_vflag & INP_IPV4) == 0)
508				continue;
509#endif
510			if (inp->inp_laddr.s_addr != INADDR_ANY &&
511			    inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
512				continue;
513			if (inp->inp_faddr.s_addr != INADDR_ANY &&
514			    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
515				continue;
516			if (inp->inp_fport != 0 &&
517			    inp->inp_fport != uh->uh_sport)
518				continue;
519
520			INP_RLOCK(inp);
521
522			/*
523			 * XXXRW: Because we weren't holding either the inpcb
524			 * or the hash lock when we checked for a match
525			 * before, we should probably recheck now that the
526			 * inpcb lock is held.
527			 */
528
529			/*
530			 * Handle socket delivery policy for any-source
531			 * and source-specific multicast. [RFC3678]
532			 */
533			imo = inp->inp_moptions;
534			if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
535				struct sockaddr_in	 group;
536				int			 blocked;
537				if (imo == NULL) {
538					INP_RUNLOCK(inp);
539					continue;
540				}
541				bzero(&group, sizeof(struct sockaddr_in));
542				group.sin_len = sizeof(struct sockaddr_in);
543				group.sin_family = AF_INET;
544				group.sin_addr = ip->ip_dst;
545
546				blocked = imo_multi_filter(imo, ifp,
547					(struct sockaddr *)&group,
548					(struct sockaddr *)&udp_in);
549				if (blocked != MCAST_PASS) {
550					if (blocked == MCAST_NOTGMEMBER)
551						IPSTAT_INC(ips_notmember);
552					if (blocked == MCAST_NOTSMEMBER ||
553					    blocked == MCAST_MUTED)
554						UDPSTAT_INC(udps_filtermcast);
555					INP_RUNLOCK(inp);
556					continue;
557				}
558			}
559			if (last != NULL) {
560				struct mbuf *n;
561
562				n = m_copy(m, 0, M_COPYALL);
563				udp_append(last, ip, n, iphlen, &udp_in);
564				INP_RUNLOCK(last);
565			}
566			last = inp;
567			/*
568			 * Don't look for additional matches if this one does
569			 * not have either the SO_REUSEPORT or SO_REUSEADDR
570			 * socket options set.  This heuristic avoids
571			 * searching through all pcbs in the common case of a
572			 * non-shared port.  It assumes that an application
573			 * will never clear these options after setting them.
574			 */
575			if ((last->inp_socket->so_options &
576			    (SO_REUSEPORT|SO_REUSEADDR)) == 0)
577				break;
578		}
579
580		if (last == NULL) {
581			/*
582			 * No matching pcb found; discard datagram.  (No need
583			 * to send an ICMP Port Unreachable for a broadcast
584			 * or multicast datgram.)
585			 */
586			UDPSTAT_INC(udps_noportbcast);
587			if (inp)
588				INP_RUNLOCK(inp);
589			INP_INFO_RUNLOCK(pcbinfo);
590			goto badunlocked;
591		}
592		udp_append(last, ip, m, iphlen, &udp_in);
593		INP_RUNLOCK(last);
594		INP_INFO_RUNLOCK(pcbinfo);
595		return;
596	}
597
598	/*
599	 * Locate pcb for datagram.
600	 */
601
602	/*
603	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
604	 */
605	if ((m->m_flags & M_IP_NEXTHOP) &&
606	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
607		struct sockaddr_in *next_hop;
608
609		next_hop = (struct sockaddr_in *)(fwd_tag + 1);
610
611		/*
612		 * Transparently forwarded. Pretend to be the destination.
613		 * Already got one like this?
614		 */
615		inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
616		    ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m);
617		if (!inp) {
618			/*
619			 * It's new.  Try to find the ambushing socket.
620			 * Because we've rewritten the destination address,
621			 * any hardware-generated hash is ignored.
622			 */
623			inp = in_pcblookup(pcbinfo, ip->ip_src,
624			    uh->uh_sport, next_hop->sin_addr,
625			    next_hop->sin_port ? htons(next_hop->sin_port) :
626			    uh->uh_dport, INPLOOKUP_WILDCARD |
627			    INPLOOKUP_RLOCKPCB, ifp);
628		}
629		/* Remove the tag from the packet. We don't need it anymore. */
630		m_tag_delete(m, fwd_tag);
631		m->m_flags &= ~M_IP_NEXTHOP;
632	} else
633		inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
634		    ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD |
635		    INPLOOKUP_RLOCKPCB, ifp, m);
636	if (inp == NULL) {
637		if (udp_log_in_vain) {
638			char buf[4*sizeof "123"];
639
640			strcpy(buf, inet_ntoa(ip->ip_dst));
641			log(LOG_INFO,
642			    "Connection attempt to UDP %s:%d from %s:%d\n",
643			    buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src),
644			    ntohs(uh->uh_sport));
645		}
646		UDPSTAT_INC(udps_noport);
647		if (m->m_flags & (M_BCAST | M_MCAST)) {
648			UDPSTAT_INC(udps_noportbcast);
649			goto badunlocked;
650		}
651		if (V_udp_blackhole)
652			goto badunlocked;
653		if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
654			goto badunlocked;
655		*ip = save_ip;
656		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
657		return;
658	}
659
660	/*
661	 * Check the minimum TTL for socket.
662	 */
663	INP_RLOCK_ASSERT(inp);
664	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) {
665		INP_RUNLOCK(inp);
666		m_freem(m);
667		return;
668	}
669	if (cscov_partial) {
670		struct udpcb *up;
671
672		up = intoudpcb(inp);
673		if (up->u_rxcslen > len) {
674			INP_RUNLOCK(inp);
675			m_freem(m);
676			return;
677		}
678	}
679
680	UDP_PROBE(receive, NULL, inp, ip, inp, uh);
681	udp_append(inp, ip, m, iphlen, &udp_in);
682	INP_RUNLOCK(inp);
683	return;
684
685badunlocked:
686	m_freem(m);
687}
688#endif /* INET */
689
690/*
691 * Notify a udp user of an asynchronous error; just wake up so that they can
692 * collect error status.
693 */
694struct inpcb *
695udp_notify(struct inpcb *inp, int errno)
696{
697
698	/*
699	 * While udp_ctlinput() always calls udp_notify() with a read lock
700	 * when invoking it directly, in_pcbnotifyall() currently uses write
701	 * locks due to sharing code with TCP.  For now, accept either a read
702	 * or a write lock, but a read lock is sufficient.
703	 */
704	INP_LOCK_ASSERT(inp);
705
706	inp->inp_socket->so_error = errno;
707	sorwakeup(inp->inp_socket);
708	sowwakeup(inp->inp_socket);
709	return (inp);
710}
711
712#ifdef INET
713static void
714udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip,
715    struct inpcbinfo *pcbinfo)
716{
717	struct ip *ip = vip;
718	struct udphdr *uh;
719	struct in_addr faddr;
720	struct inpcb *inp;
721
722	faddr = ((struct sockaddr_in *)sa)->sin_addr;
723	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
724		return;
725
726	/*
727	 * Redirects don't need to be handled up here.
728	 */
729	if (PRC_IS_REDIRECT(cmd))
730		return;
731
732	/*
733	 * Hostdead is ugly because it goes linearly through all PCBs.
734	 *
735	 * XXX: We never get this from ICMP, otherwise it makes an excellent
736	 * DoS attack on machines with many connections.
737	 */
738	if (cmd == PRC_HOSTDEAD)
739		ip = NULL;
740	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
741		return;
742	if (ip != NULL) {
743		uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
744		inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport,
745		    ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL);
746		if (inp != NULL) {
747			INP_RLOCK_ASSERT(inp);
748			if (inp->inp_socket != NULL) {
749				udp_notify(inp, inetctlerrmap[cmd]);
750			}
751			INP_RUNLOCK(inp);
752		}
753	} else
754		in_pcbnotifyall(pcbinfo, faddr, inetctlerrmap[cmd],
755		    udp_notify);
756}
757void
758udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
759{
760
761	return (udp_common_ctlinput(cmd, sa, vip, &V_udbinfo));
762}
763
764void
765udplite_ctlinput(int cmd, struct sockaddr *sa, void *vip)
766{
767
768	return (udp_common_ctlinput(cmd, sa, vip, &V_ulitecbinfo));
769}
770#endif /* INET */
771
772static int
773udp_pcblist(SYSCTL_HANDLER_ARGS)
774{
775	int error, i, n;
776	struct inpcb *inp, **inp_list;
777	inp_gen_t gencnt;
778	struct xinpgen xig;
779
780	/*
781	 * The process of preparing the PCB list is too time-consuming and
782	 * resource-intensive to repeat twice on every request.
783	 */
784	if (req->oldptr == 0) {
785		n = V_udbinfo.ipi_count;
786		n += imax(n / 8, 10);
787		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
788		return (0);
789	}
790
791	if (req->newptr != 0)
792		return (EPERM);
793
794	/*
795	 * OK, now we're committed to doing something.
796	 */
797	INP_INFO_RLOCK(&V_udbinfo);
798	gencnt = V_udbinfo.ipi_gencnt;
799	n = V_udbinfo.ipi_count;
800	INP_INFO_RUNLOCK(&V_udbinfo);
801
802	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
803		+ n * sizeof(struct xinpcb));
804	if (error != 0)
805		return (error);
806
807	xig.xig_len = sizeof xig;
808	xig.xig_count = n;
809	xig.xig_gen = gencnt;
810	xig.xig_sogen = so_gencnt;
811	error = SYSCTL_OUT(req, &xig, sizeof xig);
812	if (error)
813		return (error);
814
815	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
816	if (inp_list == 0)
817		return (ENOMEM);
818
819	INP_INFO_RLOCK(&V_udbinfo);
820	for (inp = LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n;
821	     inp = LIST_NEXT(inp, inp_list)) {
822		INP_WLOCK(inp);
823		if (inp->inp_gencnt <= gencnt &&
824		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
825			in_pcbref(inp);
826			inp_list[i++] = inp;
827		}
828		INP_WUNLOCK(inp);
829	}
830	INP_INFO_RUNLOCK(&V_udbinfo);
831	n = i;
832
833	error = 0;
834	for (i = 0; i < n; i++) {
835		inp = inp_list[i];
836		INP_RLOCK(inp);
837		if (inp->inp_gencnt <= gencnt) {
838			struct xinpcb xi;
839
840			bzero(&xi, sizeof(xi));
841			xi.xi_len = sizeof xi;
842			/* XXX should avoid extra copy */
843			bcopy(inp, &xi.xi_inp, sizeof *inp);
844			if (inp->inp_socket)
845				sotoxsocket(inp->inp_socket, &xi.xi_socket);
846			xi.xi_inp.inp_gencnt = inp->inp_gencnt;
847			INP_RUNLOCK(inp);
848			error = SYSCTL_OUT(req, &xi, sizeof xi);
849		} else
850			INP_RUNLOCK(inp);
851	}
852	INP_INFO_WLOCK(&V_udbinfo);
853	for (i = 0; i < n; i++) {
854		inp = inp_list[i];
855		INP_RLOCK(inp);
856		if (!in_pcbrele_rlocked(inp))
857			INP_RUNLOCK(inp);
858	}
859	INP_INFO_WUNLOCK(&V_udbinfo);
860
861	if (!error) {
862		/*
863		 * Give the user an updated idea of our state.  If the
864		 * generation differs from what we told her before, she knows
865		 * that something happened while we were processing this
866		 * request, and it might be necessary to retry.
867		 */
868		INP_INFO_RLOCK(&V_udbinfo);
869		xig.xig_gen = V_udbinfo.ipi_gencnt;
870		xig.xig_sogen = so_gencnt;
871		xig.xig_count = V_udbinfo.ipi_count;
872		INP_INFO_RUNLOCK(&V_udbinfo);
873		error = SYSCTL_OUT(req, &xig, sizeof xig);
874	}
875	free(inp_list, M_TEMP);
876	return (error);
877}
878
879SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist,
880    CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0,
881    udp_pcblist, "S,xinpcb", "List of active UDP sockets");
882
883#ifdef INET
884static int
885udp_getcred(SYSCTL_HANDLER_ARGS)
886{
887	struct xucred xuc;
888	struct sockaddr_in addrs[2];
889	struct inpcb *inp;
890	int error;
891
892	error = priv_check(req->td, PRIV_NETINET_GETCRED);
893	if (error)
894		return (error);
895	error = SYSCTL_IN(req, addrs, sizeof(addrs));
896	if (error)
897		return (error);
898	inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
899	    addrs[0].sin_addr, addrs[0].sin_port,
900	    INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
901	if (inp != NULL) {
902		INP_RLOCK_ASSERT(inp);
903		if (inp->inp_socket == NULL)
904			error = ENOENT;
905		if (error == 0)
906			error = cr_canseeinpcb(req->td->td_ucred, inp);
907		if (error == 0)
908			cru2x(inp->inp_cred, &xuc);
909		INP_RUNLOCK(inp);
910	} else
911		error = ENOENT;
912	if (error == 0)
913		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
914	return (error);
915}
916
917SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
918    CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
919    udp_getcred, "S,xucred", "Get the xucred of a UDP connection");
920#endif /* INET */
921
922int
923udp_ctloutput(struct socket *so, struct sockopt *sopt)
924{
925	struct inpcb *inp;
926	struct udpcb *up;
927	int isudplite, error, optval;
928
929	error = 0;
930	isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 1 : 0;
931	inp = sotoinpcb(so);
932	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
933	INP_WLOCK(inp);
934	if (sopt->sopt_level != so->so_proto->pr_protocol) {
935#ifdef INET6
936		if (INP_CHECK_SOCKAF(so, AF_INET6)) {
937			INP_WUNLOCK(inp);
938			error = ip6_ctloutput(so, sopt);
939		}
940#endif
941#if defined(INET) && defined(INET6)
942		else
943#endif
944#ifdef INET
945		{
946			INP_WUNLOCK(inp);
947			error = ip_ctloutput(so, sopt);
948		}
949#endif
950		return (error);
951	}
952
953	switch (sopt->sopt_dir) {
954	case SOPT_SET:
955		switch (sopt->sopt_name) {
956		case UDP_ENCAP:
957			INP_WUNLOCK(inp);
958			error = sooptcopyin(sopt, &optval, sizeof optval,
959					    sizeof optval);
960			if (error)
961				break;
962			inp = sotoinpcb(so);
963			KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
964			INP_WLOCK(inp);
965#ifdef IPSEC_NAT_T
966			up = intoudpcb(inp);
967			KASSERT(up != NULL, ("%s: up == NULL", __func__));
968#endif
969			switch (optval) {
970			case 0:
971				/* Clear all UDP encap. */
972#ifdef IPSEC_NAT_T
973				up->u_flags &= ~UF_ESPINUDP_ALL;
974#endif
975				break;
976#ifdef IPSEC_NAT_T
977			case UDP_ENCAP_ESPINUDP:
978			case UDP_ENCAP_ESPINUDP_NON_IKE:
979				up->u_flags &= ~UF_ESPINUDP_ALL;
980				if (optval == UDP_ENCAP_ESPINUDP)
981					up->u_flags |= UF_ESPINUDP;
982				else if (optval == UDP_ENCAP_ESPINUDP_NON_IKE)
983					up->u_flags |= UF_ESPINUDP_NON_IKE;
984				break;
985#endif
986			default:
987				error = EINVAL;
988				break;
989			}
990			INP_WUNLOCK(inp);
991			break;
992		case UDPLITE_SEND_CSCOV:
993		case UDPLITE_RECV_CSCOV:
994			if (!isudplite) {
995				INP_WUNLOCK(inp);
996				error = ENOPROTOOPT;
997				break;
998			}
999			INP_WUNLOCK(inp);
1000			error = sooptcopyin(sopt, &optval, sizeof(optval),
1001			    sizeof(optval));
1002			if (error != 0)
1003				break;
1004			inp = sotoinpcb(so);
1005			KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
1006			INP_WLOCK(inp);
1007			up = intoudpcb(inp);
1008			KASSERT(up != NULL, ("%s: up == NULL", __func__));
1009			if (optval != 0 && optval < 8) {
1010				INP_WUNLOCK(inp);
1011				error = EINVAL;
1012				break;
1013			}
1014			if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
1015				up->u_txcslen = optval;
1016			else
1017				up->u_rxcslen = optval;
1018			INP_WUNLOCK(inp);
1019			break;
1020		default:
1021			INP_WUNLOCK(inp);
1022			error = ENOPROTOOPT;
1023			break;
1024		}
1025		break;
1026	case SOPT_GET:
1027		switch (sopt->sopt_name) {
1028#ifdef IPSEC_NAT_T
1029		case UDP_ENCAP:
1030			up = intoudpcb(inp);
1031			KASSERT(up != NULL, ("%s: up == NULL", __func__));
1032			optval = up->u_flags & UF_ESPINUDP_ALL;
1033			INP_WUNLOCK(inp);
1034			error = sooptcopyout(sopt, &optval, sizeof optval);
1035			break;
1036#endif
1037		case UDPLITE_SEND_CSCOV:
1038		case UDPLITE_RECV_CSCOV:
1039			if (!isudplite) {
1040				INP_WUNLOCK(inp);
1041				error = ENOPROTOOPT;
1042				break;
1043			}
1044			up = intoudpcb(inp);
1045			KASSERT(up != NULL, ("%s: up == NULL", __func__));
1046			if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
1047				optval = up->u_txcslen;
1048			else
1049				optval = up->u_rxcslen;
1050			INP_WUNLOCK(inp);
1051			error = sooptcopyout(sopt, &optval, sizeof(optval));
1052			break;
1053		default:
1054			INP_WUNLOCK(inp);
1055			error = ENOPROTOOPT;
1056			break;
1057		}
1058		break;
1059	}
1060	return (error);
1061}
1062
1063#ifdef INET
1064#define	UH_WLOCKED	2
1065#define	UH_RLOCKED	1
1066#define	UH_UNLOCKED	0
1067static int
1068udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
1069    struct mbuf *control, struct thread *td)
1070{
1071	struct udpiphdr *ui;
1072	int len = m->m_pkthdr.len;
1073	struct in_addr faddr, laddr;
1074	struct cmsghdr *cm;
1075	struct inpcbinfo *pcbinfo;
1076	struct sockaddr_in *sin, src;
1077	int cscov_partial = 0;
1078	int error = 0;
1079	int ipflags;
1080	u_short fport, lport;
1081	int unlock_udbinfo;
1082	u_char tos;
1083	uint8_t pr;
1084	uint16_t cscov = 0;
1085
1086	/*
1087	 * udp_output() may need to temporarily bind or connect the current
1088	 * inpcb.  As such, we don't know up front whether we will need the
1089	 * pcbinfo lock or not.  Do any work to decide what is needed up
1090	 * front before acquiring any locks.
1091	 */
1092	if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
1093		if (control)
1094			m_freem(control);
1095		m_freem(m);
1096		return (EMSGSIZE);
1097	}
1098
1099	src.sin_family = 0;
1100	INP_RLOCK(inp);
1101	tos = inp->inp_ip_tos;
1102	if (control != NULL) {
1103		/*
1104		 * XXX: Currently, we assume all the optional information is
1105		 * stored in a single mbuf.
1106		 */
1107		if (control->m_next) {
1108			INP_RUNLOCK(inp);
1109			m_freem(control);
1110			m_freem(m);
1111			return (EINVAL);
1112		}
1113		for (; control->m_len > 0;
1114		    control->m_data += CMSG_ALIGN(cm->cmsg_len),
1115		    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
1116			cm = mtod(control, struct cmsghdr *);
1117			if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0
1118			    || cm->cmsg_len > control->m_len) {
1119				error = EINVAL;
1120				break;
1121			}
1122			if (cm->cmsg_level != IPPROTO_IP)
1123				continue;
1124
1125			switch (cm->cmsg_type) {
1126			case IP_SENDSRCADDR:
1127				if (cm->cmsg_len !=
1128				    CMSG_LEN(sizeof(struct in_addr))) {
1129					error = EINVAL;
1130					break;
1131				}
1132				bzero(&src, sizeof(src));
1133				src.sin_family = AF_INET;
1134				src.sin_len = sizeof(src);
1135				src.sin_port = inp->inp_lport;
1136				src.sin_addr =
1137				    *(struct in_addr *)CMSG_DATA(cm);
1138				break;
1139
1140			case IP_TOS:
1141				if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) {
1142					error = EINVAL;
1143					break;
1144				}
1145				tos = *(u_char *)CMSG_DATA(cm);
1146				break;
1147
1148			default:
1149				error = ENOPROTOOPT;
1150				break;
1151			}
1152			if (error)
1153				break;
1154		}
1155		m_freem(control);
1156	}
1157	if (error) {
1158		INP_RUNLOCK(inp);
1159		m_freem(m);
1160		return (error);
1161	}
1162
1163	/*
1164	 * Depending on whether or not the application has bound or connected
1165	 * the socket, we may have to do varying levels of work.  The optimal
1166	 * case is for a connected UDP socket, as a global lock isn't
1167	 * required at all.
1168	 *
1169	 * In order to decide which we need, we require stability of the
1170	 * inpcb binding, which we ensure by acquiring a read lock on the
1171	 * inpcb.  This doesn't strictly follow the lock order, so we play
1172	 * the trylock and retry game; note that we may end up with more
1173	 * conservative locks than required the second time around, so later
1174	 * assertions have to accept that.  Further analysis of the number of
1175	 * misses under contention is required.
1176	 *
1177	 * XXXRW: Check that hash locking update here is correct.
1178	 */
1179	pr = inp->inp_socket->so_proto->pr_protocol;
1180	pcbinfo = get_inpcbinfo(pr);
1181	sin = (struct sockaddr_in *)addr;
1182	if (sin != NULL &&
1183	    (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) {
1184		INP_RUNLOCK(inp);
1185		INP_WLOCK(inp);
1186		INP_HASH_WLOCK(pcbinfo);
1187		unlock_udbinfo = UH_WLOCKED;
1188	} else if ((sin != NULL && (
1189	    (sin->sin_addr.s_addr == INADDR_ANY) ||
1190	    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
1191	    (inp->inp_laddr.s_addr == INADDR_ANY) ||
1192	    (inp->inp_lport == 0))) ||
1193	    (src.sin_family == AF_INET)) {
1194		INP_HASH_RLOCK(pcbinfo);
1195		unlock_udbinfo = UH_RLOCKED;
1196	} else
1197		unlock_udbinfo = UH_UNLOCKED;
1198
1199	/*
1200	 * If the IP_SENDSRCADDR control message was specified, override the
1201	 * source address for this datagram.  Its use is invalidated if the
1202	 * address thus specified is incomplete or clobbers other inpcbs.
1203	 */
1204	laddr = inp->inp_laddr;
1205	lport = inp->inp_lport;
1206	if (src.sin_family == AF_INET) {
1207		INP_HASH_LOCK_ASSERT(pcbinfo);
1208		if ((lport == 0) ||
1209		    (laddr.s_addr == INADDR_ANY &&
1210		     src.sin_addr.s_addr == INADDR_ANY)) {
1211			error = EINVAL;
1212			goto release;
1213		}
1214		error = in_pcbbind_setup(inp, (struct sockaddr *)&src,
1215		    &laddr.s_addr, &lport, td->td_ucred);
1216		if (error)
1217			goto release;
1218	}
1219
1220	/*
1221	 * If a UDP socket has been connected, then a local address/port will
1222	 * have been selected and bound.
1223	 *
1224	 * If a UDP socket has not been connected to, then an explicit
1225	 * destination address must be used, in which case a local
1226	 * address/port may not have been selected and bound.
1227	 */
1228	if (sin != NULL) {
1229		INP_LOCK_ASSERT(inp);
1230		if (inp->inp_faddr.s_addr != INADDR_ANY) {
1231			error = EISCONN;
1232			goto release;
1233		}
1234
1235		/*
1236		 * Jail may rewrite the destination address, so let it do
1237		 * that before we use it.
1238		 */
1239		error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
1240		if (error)
1241			goto release;
1242
1243		/*
1244		 * If a local address or port hasn't yet been selected, or if
1245		 * the destination address needs to be rewritten due to using
1246		 * a special INADDR_ constant, invoke in_pcbconnect_setup()
1247		 * to do the heavy lifting.  Once a port is selected, we
1248		 * commit the binding back to the socket; we also commit the
1249		 * binding of the address if in jail.
1250		 *
1251		 * If we already have a valid binding and we're not
1252		 * requesting a destination address rewrite, use a fast path.
1253		 */
1254		if (inp->inp_laddr.s_addr == INADDR_ANY ||
1255		    inp->inp_lport == 0 ||
1256		    sin->sin_addr.s_addr == INADDR_ANY ||
1257		    sin->sin_addr.s_addr == INADDR_BROADCAST) {
1258			INP_HASH_LOCK_ASSERT(pcbinfo);
1259			error = in_pcbconnect_setup(inp, addr, &laddr.s_addr,
1260			    &lport, &faddr.s_addr, &fport, NULL,
1261			    td->td_ucred);
1262			if (error)
1263				goto release;
1264
1265			/*
1266			 * XXXRW: Why not commit the port if the address is
1267			 * !INADDR_ANY?
1268			 */
1269			/* Commit the local port if newly assigned. */
1270			if (inp->inp_laddr.s_addr == INADDR_ANY &&
1271			    inp->inp_lport == 0) {
1272				INP_WLOCK_ASSERT(inp);
1273				INP_HASH_WLOCK_ASSERT(pcbinfo);
1274				/*
1275				 * Remember addr if jailed, to prevent
1276				 * rebinding.
1277				 */
1278				if (prison_flag(td->td_ucred, PR_IP4))
1279					inp->inp_laddr = laddr;
1280				inp->inp_lport = lport;
1281				if (in_pcbinshash(inp) != 0) {
1282					inp->inp_lport = 0;
1283					error = EAGAIN;
1284					goto release;
1285				}
1286				inp->inp_flags |= INP_ANONPORT;
1287			}
1288		} else {
1289			faddr = sin->sin_addr;
1290			fport = sin->sin_port;
1291		}
1292	} else {
1293		INP_LOCK_ASSERT(inp);
1294		faddr = inp->inp_faddr;
1295		fport = inp->inp_fport;
1296		if (faddr.s_addr == INADDR_ANY) {
1297			error = ENOTCONN;
1298			goto release;
1299		}
1300	}
1301
1302	/*
1303	 * Calculate data length and get a mbuf for UDP, IP, and possible
1304	 * link-layer headers.  Immediate slide the data pointer back forward
1305	 * since we won't use that space at this layer.
1306	 */
1307	M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT);
1308	if (m == NULL) {
1309		error = ENOBUFS;
1310		goto release;
1311	}
1312	m->m_data += max_linkhdr;
1313	m->m_len -= max_linkhdr;
1314	m->m_pkthdr.len -= max_linkhdr;
1315
1316	/*
1317	 * Fill in mbuf with extended UDP header and addresses and length put
1318	 * into network format.
1319	 */
1320	ui = mtod(m, struct udpiphdr *);
1321	bzero(ui->ui_x1, sizeof(ui->ui_x1));	/* XXX still needed? */
1322	ui->ui_pr = pr;
1323	ui->ui_src = laddr;
1324	ui->ui_dst = faddr;
1325	ui->ui_sport = lport;
1326	ui->ui_dport = fport;
1327	ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
1328	if (pr == IPPROTO_UDPLITE) {
1329		struct udpcb *up;
1330		uint16_t plen;
1331
1332		up = intoudpcb(inp);
1333		cscov = up->u_txcslen;
1334		plen = (u_short)len + sizeof(struct udphdr);
1335		if (cscov >= plen)
1336			cscov = 0;
1337		ui->ui_len = htons(plen);
1338		ui->ui_ulen = htons(cscov);
1339		/*
1340		 * For UDP-Lite, checksum coverage length of zero means
1341		 * the entire UDPLite packet is covered by the checksum.
1342		 */
1343		cscov_partial = (cscov == 0) ? 0 : 1;
1344	} else
1345		ui->ui_v = IPVERSION << 4;
1346
1347	/*
1348	 * Set the Don't Fragment bit in the IP header.
1349	 */
1350	if (inp->inp_flags & INP_DONTFRAG) {
1351		struct ip *ip;
1352
1353		ip = (struct ip *)&ui->ui_i;
1354		ip->ip_off |= htons(IP_DF);
1355	}
1356
1357	ipflags = 0;
1358	if (inp->inp_socket->so_options & SO_DONTROUTE)
1359		ipflags |= IP_ROUTETOIF;
1360	if (inp->inp_socket->so_options & SO_BROADCAST)
1361		ipflags |= IP_ALLOWBROADCAST;
1362	if (inp->inp_flags & INP_ONESBCAST)
1363		ipflags |= IP_SENDONES;
1364
1365#ifdef MAC
1366	mac_inpcb_create_mbuf(inp, m);
1367#endif
1368
1369	/*
1370	 * Set up checksum and output datagram.
1371	 */
1372	ui->ui_sum = 0;
1373	if (pr == IPPROTO_UDPLITE) {
1374		if (inp->inp_flags & INP_ONESBCAST)
1375			faddr.s_addr = INADDR_BROADCAST;
1376		if (cscov_partial) {
1377			if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0)
1378				ui->ui_sum = 0xffff;
1379		} else {
1380			if ((ui->ui_sum = in_cksum(m, sizeof(struct udpiphdr) + len)) == 0)
1381				ui->ui_sum = 0xffff;
1382		}
1383	} else if (V_udp_cksum) {
1384		if (inp->inp_flags & INP_ONESBCAST)
1385			faddr.s_addr = INADDR_BROADCAST;
1386		ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
1387		    htons((u_short)len + sizeof(struct udphdr) + pr));
1388		m->m_pkthdr.csum_flags = CSUM_UDP;
1389		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
1390	}
1391	((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len);
1392	((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl;	/* XXX */
1393	((struct ip *)ui)->ip_tos = tos;		/* XXX */
1394	UDPSTAT_INC(udps_opackets);
1395
1396	if (unlock_udbinfo == UH_WLOCKED)
1397		INP_HASH_WUNLOCK(pcbinfo);
1398	else if (unlock_udbinfo == UH_RLOCKED)
1399		INP_HASH_RUNLOCK(pcbinfo);
1400	UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
1401	error = ip_output(m, inp->inp_options, NULL, ipflags,
1402	    inp->inp_moptions, inp);
1403	if (unlock_udbinfo == UH_WLOCKED)
1404		INP_WUNLOCK(inp);
1405	else
1406		INP_RUNLOCK(inp);
1407	return (error);
1408
1409release:
1410	if (unlock_udbinfo == UH_WLOCKED) {
1411		INP_HASH_WUNLOCK(pcbinfo);
1412		INP_WUNLOCK(inp);
1413	} else if (unlock_udbinfo == UH_RLOCKED) {
1414		INP_HASH_RUNLOCK(pcbinfo);
1415		INP_RUNLOCK(inp);
1416	} else
1417		INP_RUNLOCK(inp);
1418	m_freem(m);
1419	return (error);
1420}
1421
1422
#if defined(IPSEC) && defined(IPSEC_NAT_T)
/*
 * Decapsulate an ESP-in-UDP datagram, if that is what this packet is.
 *
 * 'off' is the offset of the UDP payload from the start of the packet,
 * i.e. the combined length of the IP and UDP headers.  When the payload
 * carries an ESP packet (optionally preceded by a non-IKE marker), the UDP
 * header and marker are stripped, the IP header is rewritten for ESP and
 * the packet is handed to ipsec4_common_input().
 *
 * Returns the (possibly reallocated) mbuf when the caller should go on
 * treating it as ordinary UDP, or NULL when the packet was consumed or
 * discarded here.
 */
static struct mbuf *
udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off)
{
	struct udpcb *up;
	struct udphdr *uh;
	struct m_tag *mtag;
	struct ip *iph;
	uint16_t *ports;
	caddr_t base;
	size_t hlen, paylen, pullen, striplen;

	INP_RLOCK_ASSERT(inp);

	/*
	 * Make the worst case contiguous (IP/UDP headers, non-ESP marker,
	 * ESP header), but never ask for more than the packet holds.
	 */
	pullen = off + sizeof(uint64_t) + sizeof(struct esp);
	if (m->m_pkthdr.len < pullen)
		pullen = m->m_pkthdr.len;
	m = m_pullup(m, pullen);
	if (m == NULL) {
		IPSECSTAT_INC(ips_in_inval);
		return (NULL);		/* Caller must not touch it. */
	}
	base = mtod(m, caddr_t);	/* Start of the IP header. */
	paylen = m->m_len - off;	/* Payload bytes in this mbuf. */

	/* A lone 0xff byte is a NAT keepalive; leave it to the caller. */
	if (paylen == 1 && base[off] == '\xff')
		return (m);

	up = intoudpcb(inp);
	KASSERT(up != NULL, ("%s: udpcb NULL", __func__));
	KASSERT((up->u_flags & UF_ESPINUDP_ALL) != 0,
	    ("u_flags 0x%x", up->u_flags));

	/*
	 * Decide whether the payload really is ESP and how many bytes sit
	 * between the IP header and the ESP header.
	 *
	 * XXX can we assume alignment and eliminate bcopys?
	 */
	if ((up->u_flags & UF_ESPINUDP_NON_IKE) == 0) {
		uint32_t spi;

		if (paylen <= sizeof(struct esp)) {
			IPSECSTAT_INC(ips_in_inval);
			m_freem(m);
			return (NULL);	/* Too short: drop. */
		}
		bcopy(base + off, &spi, sizeof(spi));
		if (spi == 0)		/* Non-ESP marker. */
			return (m);	/* Not ESP; no decap. */
		striplen = sizeof(struct udphdr);
	} else {
		/*
		 * draft-ietf-ipsec-nat-t-ike-0[01].txt and
		 * draft-ietf-ipsec-udp-encaps-(00/)01.txt, ignoring
		 * possible AH mode non-IKE marker+non-ESP marker
		 * from draft-ietf-ipsec-udp-encaps-00.txt.
		 */
		uint64_t marker;

		if (paylen <= sizeof(uint64_t) + sizeof(struct esp))
			return (m);	/* Too short; no decap. */
		bcopy(base + off, &marker, sizeof(marker));
		if (marker != 0)	/* Non-IKE marker. */
			return (m);	/* Not ESP; no decap. */
		striplen = sizeof(uint64_t) + sizeof(struct udphdr);
	}

	/*
	 * Record the UDP ports in a PACKET_TAG_IPSEC_NAT_T_PORTS tag so
	 * the right SPD can be chosen for several hosts behind one NAT.
	 *
	 * NB: ports stay in network byte order throughout the NAT-T code.
	 */
	mtag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS,
	    2 * sizeof(uint16_t), M_NOWAIT);
	if (mtag == NULL) {
		IPSECSTAT_INC(ips_in_nomem);
		m_freem(m);
		return (NULL);		/* Out of memory: drop. */
	}
	hlen = off - sizeof(struct udphdr);	/* IP header length. */
	uh = (struct udphdr *)(base + hlen);
	ports = (uint16_t *)(mtag + 1);
	ports[0] = uh->uh_sport;
	ports[1] = uh->uh_dport;
	m_tag_prepend(m, mtag);

	/*
	 * Drop the UDP header (and marker, if any) by sliding the IP header
	 * forward over them and trimming the front of the mbuf:
	 *
	 * Before:
	 *   <--- off --->
	 *   +----+------+-----+
	 *   | IP |  UDP | ESP |
	 *   +----+------+-----+
	 *        <-skip->
	 * After:
	 *          +----+-----+
	 *          | IP | ESP |
	 *          +----+-----+
	 *   <-skip->
	 */
	ovbcopy(base, base + striplen, hlen);
	m_adj(m, striplen);

	iph = mtod(m, struct ip *);
	iph->ip_len = htons(ntohs(iph->ip_len) - striplen);
	iph->ip_p = IPPROTO_ESP;

	/*
	 * The checksums cannot be fixed up yet, so any hardware checksum
	 * results attached to the packet are no longer valid: clear them.
	 */
	if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)
		m->m_pkthdr.csum_flags &= ~(CSUM_DATA_VALID|CSUM_PSEUDO_HDR);

	(void) ipsec4_common_input(m, hlen, iph->ip_p);
	return (NULL);			/* Consumed; caller is done. */
}
#endif /* defined(IPSEC) && defined(IPSEC_NAT_T) */
1556
1557static void
1558udp_abort(struct socket *so)
1559{
1560	struct inpcb *inp;
1561	struct inpcbinfo *pcbinfo;
1562
1563	pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol);
1564	inp = sotoinpcb(so);
1565	KASSERT(inp != NULL, ("udp_abort: inp == NULL"));
1566	INP_WLOCK(inp);
1567	if (inp->inp_faddr.s_addr != INADDR_ANY) {
1568		INP_HASH_WLOCK(pcbinfo);
1569		in_pcbdisconnect(inp);
1570		inp->inp_laddr.s_addr = INADDR_ANY;
1571		INP_HASH_WUNLOCK(pcbinfo);
1572		soisdisconnected(so);
1573	}
1574	INP_WUNLOCK(inp);
1575}
1576
1577static int
1578udp_attach(struct socket *so, int proto, struct thread *td)
1579{
1580	struct inpcb *inp;
1581	struct inpcbinfo *pcbinfo;
1582	int error;
1583
1584	pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol);
1585	inp = sotoinpcb(so);
1586	KASSERT(inp == NULL, ("udp_attach: inp != NULL"));
1587	error = soreserve(so, udp_sendspace, udp_recvspace);
1588	if (error)
1589		return (error);
1590	INP_INFO_WLOCK(pcbinfo);
1591	error = in_pcballoc(so, pcbinfo);
1592	if (error) {
1593		INP_INFO_WUNLOCK(pcbinfo);
1594		return (error);
1595	}
1596
1597	inp = sotoinpcb(so);
1598	inp->inp_vflag |= INP_IPV4;
1599	inp->inp_ip_ttl = V_ip_defttl;
1600
1601	error = udp_newudpcb(inp);
1602	if (error) {
1603		in_pcbdetach(inp);
1604		in_pcbfree(inp);
1605		INP_INFO_WUNLOCK(pcbinfo);
1606		return (error);
1607	}
1608
1609	INP_WUNLOCK(inp);
1610	INP_INFO_WUNLOCK(pcbinfo);
1611	return (0);
1612}
1613#endif /* INET */
1614
1615int
1616udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f)
1617{
1618	struct inpcb *inp;
1619	struct udpcb *up;
1620
1621	KASSERT(so->so_type == SOCK_DGRAM,
1622	    ("udp_set_kernel_tunneling: !dgram"));
1623	inp = sotoinpcb(so);
1624	KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL"));
1625	INP_WLOCK(inp);
1626	up = intoudpcb(inp);
1627	if (up->u_tun_func != NULL) {
1628		INP_WUNLOCK(inp);
1629		return (EBUSY);
1630	}
1631	up->u_tun_func = f;
1632	INP_WUNLOCK(inp);
1633	return (0);
1634}
1635
1636#ifdef INET
1637static int
1638udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
1639{
1640	struct inpcb *inp;
1641	struct inpcbinfo *pcbinfo;
1642	int error;
1643
1644	pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol);
1645	inp = sotoinpcb(so);
1646	KASSERT(inp != NULL, ("udp_bind: inp == NULL"));
1647	INP_WLOCK(inp);
1648	INP_HASH_WLOCK(pcbinfo);
1649	error = in_pcbbind(inp, nam, td->td_ucred);
1650	INP_HASH_WUNLOCK(pcbinfo);
1651	INP_WUNLOCK(inp);
1652	return (error);
1653}
1654
1655static void
1656udp_close(struct socket *so)
1657{
1658	struct inpcb *inp;
1659	struct inpcbinfo *pcbinfo;
1660
1661	pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol);
1662	inp = sotoinpcb(so);
1663	KASSERT(inp != NULL, ("udp_close: inp == NULL"));
1664	INP_WLOCK(inp);
1665	if (inp->inp_faddr.s_addr != INADDR_ANY) {
1666		INP_HASH_WLOCK(pcbinfo);
1667		in_pcbdisconnect(inp);
1668		inp->inp_laddr.s_addr = INADDR_ANY;
1669		INP_HASH_WUNLOCK(pcbinfo);
1670		soisdisconnected(so);
1671	}
1672	INP_WUNLOCK(inp);
1673}
1674
1675static int
1676udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
1677{
1678	struct inpcb *inp;
1679	struct inpcbinfo *pcbinfo;
1680	struct sockaddr_in *sin;
1681	int error;
1682
1683	pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol);
1684	inp = sotoinpcb(so);
1685	KASSERT(inp != NULL, ("udp_connect: inp == NULL"));
1686	INP_WLOCK(inp);
1687	if (inp->inp_faddr.s_addr != INADDR_ANY) {
1688		INP_WUNLOCK(inp);
1689		return (EISCONN);
1690	}
1691	sin = (struct sockaddr_in *)nam;
1692	error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
1693	if (error != 0) {
1694		INP_WUNLOCK(inp);
1695		return (error);
1696	}
1697	INP_HASH_WLOCK(pcbinfo);
1698	error = in_pcbconnect(inp, nam, td->td_ucred);
1699	INP_HASH_WUNLOCK(pcbinfo);
1700	if (error == 0)
1701		soisconnected(so);
1702	INP_WUNLOCK(inp);
1703	return (error);
1704}
1705
1706static void
1707udp_detach(struct socket *so)
1708{
1709	struct inpcb *inp;
1710	struct inpcbinfo *pcbinfo;
1711	struct udpcb *up;
1712
1713	pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol);
1714	inp = sotoinpcb(so);
1715	KASSERT(inp != NULL, ("udp_detach: inp == NULL"));
1716	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
1717	    ("udp_detach: not disconnected"));
1718	INP_INFO_WLOCK(pcbinfo);
1719	INP_WLOCK(inp);
1720	up = intoudpcb(inp);
1721	KASSERT(up != NULL, ("%s: up == NULL", __func__));
1722	inp->inp_ppcb = NULL;
1723	in_pcbdetach(inp);
1724	in_pcbfree(inp);
1725	INP_INFO_WUNLOCK(pcbinfo);
1726	udp_discardcb(up);
1727}
1728
1729static int
1730udp_disconnect(struct socket *so)
1731{
1732	struct inpcb *inp;
1733	struct inpcbinfo *pcbinfo;
1734
1735	pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol);
1736	inp = sotoinpcb(so);
1737	KASSERT(inp != NULL, ("udp_disconnect: inp == NULL"));
1738	INP_WLOCK(inp);
1739	if (inp->inp_faddr.s_addr == INADDR_ANY) {
1740		INP_WUNLOCK(inp);
1741		return (ENOTCONN);
1742	}
1743	INP_HASH_WLOCK(pcbinfo);
1744	in_pcbdisconnect(inp);
1745	inp->inp_laddr.s_addr = INADDR_ANY;
1746	INP_HASH_WUNLOCK(pcbinfo);
1747	SOCK_LOCK(so);
1748	so->so_state &= ~SS_ISCONNECTED;		/* XXX */
1749	SOCK_UNLOCK(so);
1750	INP_WUNLOCK(inp);
1751	return (0);
1752}
1753
1754static int
1755udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
1756    struct mbuf *control, struct thread *td)
1757{
1758	struct inpcb *inp;
1759
1760	inp = sotoinpcb(so);
1761	KASSERT(inp != NULL, ("udp_send: inp == NULL"));
1762	return (udp_output(inp, m, addr, control, td));
1763}
1764#endif /* INET */
1765
1766int
1767udp_shutdown(struct socket *so)
1768{
1769	struct inpcb *inp;
1770
1771	inp = sotoinpcb(so);
1772	KASSERT(inp != NULL, ("udp_shutdown: inp == NULL"));
1773	INP_WLOCK(inp);
1774	socantsendmore(so);
1775	INP_WUNLOCK(inp);
1776	return (0);
1777}
1778
1779#ifdef INET
1780struct pr_usrreqs udp_usrreqs = {
1781	.pru_abort =		udp_abort,
1782	.pru_attach =		udp_attach,
1783	.pru_bind =		udp_bind,
1784	.pru_connect =		udp_connect,
1785	.pru_control =		in_control,
1786	.pru_detach =		udp_detach,
1787	.pru_disconnect =	udp_disconnect,
1788	.pru_peeraddr =		in_getpeeraddr,
1789	.pru_send =		udp_send,
1790	.pru_soreceive =	soreceive_dgram,
1791	.pru_sosend =		sosend_dgram,
1792	.pru_shutdown =		udp_shutdown,
1793	.pru_sockaddr =		in_getsockaddr,
1794	.pru_sosetlabel =	in_pcbsosetlabel,
1795	.pru_close =		udp_close,
1796};
1797#endif /* INET */
1798