1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1988, 1990, 1993
5 *	The Regents of the University of California.
6 * Copyright (c) 2004 The FreeBSD Foundation
7 * Copyright (c) 2004-2008 Robert N. M. Watson
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
35 */
36
/*
 * Comments on the socket life cycle:
 *
 * soalloc() sets up socket layer state for a socket, called only by
 * socreate() and sonewconn().  Socket layer private.
 *
 * sodealloc() tears down socket layer state for a socket, called only by
 * sofree() and sonewconn().  Socket layer private.
 *
 * pru_attach() associates protocol layer state with an allocated socket;
 * called only once, may fail, aborting socket allocation.  This is called
 * from socreate() and sonewconn().  Socket layer private.
 *
 * pru_detach() disassociates protocol layer state from an attached socket,
 * and will be called exactly once for sockets in which pru_attach() has
 * been successfully called.  If pru_attach() returned an error,
 * pru_detach() will not be called.  Socket layer private.
 *
 * pru_abort() and pru_close() notify the protocol layer that the last
 * consumer of a socket is starting to tear down the socket, and that the
 * protocol should terminate the connection.  Historically, pru_abort() also
 * detached protocol state from the socket state, but this is no longer the
 * case.
 *
 * socreate() creates a socket and attaches protocol state.  This is a public
 * interface that may be used by socket layer consumers to create new
 * sockets.
 *
 * sonewconn() creates a socket and attaches protocol state.  This is a
 * public interface that may be used by protocols to create new sockets when
 * a new connection is received and will be available for accept() on a
 * listen socket.
 *
 * soclose() destroys a socket after possibly waiting for it to disconnect.
 * This is a public interface that socket consumers should use to close and
 * release a socket when done with it.
 *
 * soabort() destroys a socket without waiting for it to disconnect (used
 * only for incoming connections that are already partially or fully
 * connected).  This is used internally by the socket layer when clearing
 * listen socket queues (due to overflow or close on the listen socket), but
 * is also a public interface protocols may use to abort connections in
 * their incomplete listen queues should they no longer be required.  Sockets
 * placed in completed connection listen queues should not be aborted for
 * reasons described in the comment above the soclose() implementation.  This
 * is not a general purpose close routine, and except in the specific
 * circumstances described here, should not be used.
 *
 * sofree() will free a socket and its protocol state if all references on
 * the socket have been released, and is called whenever a reference is
 * removed in order to attempt that cleanup.  This is a socket layer private
 * interface.
 *
 * NOTE: In addition to socreate() and soclose(), which provide a single
 * socket reference to the consumer to be managed as required, there are two
 * calls to explicitly manage socket references, soref() and sorele().
 * Currently, these are generally required only when transitioning a socket
 * from a listen queue to a file descriptor, in order to prevent garbage
 * collection of the socket at an untimely moment.  For a number of reasons,
 * these interfaces are not preferred, and should be avoided.
 *
 * NOTE: With regard to VNETs the general rule is that callers do not set
 * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
 * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
 * and sorflush(), which are usually called from a pre-set VNET context.
 * sopoll() currently does not need a VNET context to be set.
 */
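
/*
 * As an illustration of the public interfaces described above, a kernel
 * consumer (e.g., a module that needs its own TCP socket) might use them
 * roughly as follows.  This is a hedged sketch only: `sin' is assumed to be
 * a sockaddr_in filled in by the caller, `td' is the calling thread, and
 * error handling is abbreviated.
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
 *	    td->td_ucred, td);
 *	if (error != 0)
 *		return (error);
 *	error = soconnect(so, (struct sockaddr *)&sin, td);
 *	...				// wait for SS_ISCONNECTED, do I/O
 *	soclose(so);			// releases the reference from socreate()
 */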
104
105#include <sys/cdefs.h>
106__FBSDID("$FreeBSD$");
107
108#include "opt_inet.h"
109#include "opt_inet6.h"
110#include "opt_kern_tls.h"
111#include "opt_sctp.h"
112
113#include <sys/param.h>
114#include <sys/systm.h>
115#include <sys/fcntl.h>
116#include <sys/limits.h>
117#include <sys/lock.h>
118#include <sys/mac.h>
119#include <sys/malloc.h>
120#include <sys/mbuf.h>
121#include <sys/mutex.h>
122#include <sys/domain.h>
123#include <sys/file.h>			/* for struct knote */
124#include <sys/hhook.h>
125#include <sys/kernel.h>
126#include <sys/khelp.h>
127#include <sys/ktls.h>
128#include <sys/event.h>
129#include <sys/eventhandler.h>
130#include <sys/poll.h>
131#include <sys/proc.h>
132#include <sys/protosw.h>
133#include <sys/sbuf.h>
134#include <sys/socket.h>
135#include <sys/socketvar.h>
136#include <sys/resourcevar.h>
137#include <net/route.h>
138#include <sys/signalvar.h>
139#include <sys/stat.h>
140#include <sys/sx.h>
141#include <sys/sysctl.h>
142#include <sys/taskqueue.h>
143#include <sys/uio.h>
144#include <sys/un.h>
145#include <sys/unpcb.h>
146#include <sys/jail.h>
147#include <sys/syslog.h>
148#include <netinet/in.h>
149#include <netinet/in_pcb.h>
150#include <netinet/tcp.h>
151
152#include <net/vnet.h>
153
154#include <security/mac/mac_framework.h>
155
156#include <vm/uma.h>
157
158#ifdef COMPAT_FREEBSD32
159#include <sys/mount.h>
160#include <sys/sysent.h>
161#include <compat/freebsd32/freebsd32.h>
162#endif
163
164static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
165		    int flags);
166static void	so_rdknl_lock(void *);
167static void	so_rdknl_unlock(void *);
168static void	so_rdknl_assert_lock(void *, int);
169static void	so_wrknl_lock(void *);
170static void	so_wrknl_unlock(void *);
171static void	so_wrknl_assert_lock(void *, int);
172
173static void	filt_sordetach(struct knote *kn);
174static int	filt_soread(struct knote *kn, long hint);
175static void	filt_sowdetach(struct knote *kn);
176static int	filt_sowrite(struct knote *kn, long hint);
177static int	filt_soempty(struct knote *kn, long hint);
static inline int hhook_run_socket(struct socket *so, void *hctx, int32_t h_id);
179fo_kqfilter_t	soo_kqfilter;
180
181static struct filterops soread_filtops = {
182	.f_isfd = 1,
183	.f_detach = filt_sordetach,
184	.f_event = filt_soread,
185};
186static struct filterops sowrite_filtops = {
187	.f_isfd = 1,
188	.f_detach = filt_sowdetach,
189	.f_event = filt_sowrite,
190};
191static struct filterops soempty_filtops = {
192	.f_isfd = 1,
193	.f_detach = filt_sowdetach,
194	.f_event = filt_soempty,
195};
196
197so_gen_t	so_gencnt;	/* generation count for sockets */
198
199MALLOC_DEFINE(M_SONAME, "soname", "socket name");
200MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
201
202#define	VNET_SO_ASSERT(so)						\
203	VNET_ASSERT(curvnet != NULL,					\
204	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
205
206VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]);
207#define	V_socket_hhh		VNET(socket_hhh)
208
209/*
210 * Limit on the number of connections in the listen queue waiting
211 * for accept(2).
212 * NB: The original sysctl somaxconn is still available but hidden
213 * to prevent confusion about the actual purpose of this number.
214 */
215static u_int somaxconn = SOMAXCONN;
216
217static int
218sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
219{
220	int error;
221	int val;
222
223	val = somaxconn;
224	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr)
		return (error);

	/*
	 * The purpose of the UINT_MAX / 3 limit is so that the formula
	 *   3 * sol_qlimit / 2
	 * below will not overflow.
	 */

	if (val < 1 || val > UINT_MAX / 3)
		return (EINVAL);
236
237	somaxconn = val;
238	return (0);
239}
240SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue,
241    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, sizeof(int),
242    sysctl_somaxconn, "I",
243    "Maximum listen socket pending connection accept queue size");
244SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
245    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, 0,
246    sizeof(int), sysctl_somaxconn, "I",
247    "Maximum listen socket pending connection accept queue size (compat)");
248
249static int numopensockets;
250SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
251    &numopensockets, 0, "Number of open sockets");
252
253/*
254 * accept_mtx locks down per-socket fields relating to accept queues.  See
255 * socketvar.h for an annotation of the protected fields of struct socket.
256 */
257struct mtx accept_mtx;
258MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
259
/*
 * so_global_mtx protects the global so_gencnt and numopensockets counters,
 * as well as the per-socket so_gencnt field.
 */
264static struct mtx so_global_mtx;
265MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
266
267/*
268 * General IPC sysctl name space, used by sockets and a variety of other IPC
269 * types.
270 */
271SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
272    "IPC");
273
274/*
275 * Initialize the socket subsystem and set up the socket
276 * memory allocator.
277 */
278static uma_zone_t socket_zone;
279int	maxsockets;
280
281static void
282socket_zone_change(void *tag)
283{
284
285	maxsockets = uma_zone_set_max(socket_zone, maxsockets);
286}
287
288static void
289socket_hhook_register(int subtype)
290{
291
292	if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype,
293	    &V_socket_hhh[subtype],
294	    HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
295		printf("%s: WARNING: unable to register hook\n", __func__);
296}
297
298static void
299socket_hhook_deregister(int subtype)
300{
301
302	if (hhook_head_deregister(V_socket_hhh[subtype]) != 0)
303		printf("%s: WARNING: unable to deregister hook\n", __func__);
304}
305
306static void
307socket_init(void *tag)
308{
309
310	socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
311	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
312	maxsockets = uma_zone_set_max(socket_zone, maxsockets);
313	uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached");
314	EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
315	    EVENTHANDLER_PRI_FIRST);
316}
317SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);
318
319static void
320socket_vnet_init(const void *unused __unused)
321{
322	int i;
323
324	/* We expect a contiguous range */
325	for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
326		socket_hhook_register(i);
327}
328VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
329    socket_vnet_init, NULL);
330
331static void
332socket_vnet_uninit(const void *unused __unused)
333{
334	int i;
335
336	for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
337		socket_hhook_deregister(i);
338}
339VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
340    socket_vnet_uninit, NULL);
341
342/*
343 * Initialise maxsockets.  This SYSINIT must be run after
344 * tunable_mbinit().
345 */
346static void
347init_maxsockets(void *ignored)
348{
349
350	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
351	maxsockets = imax(maxsockets, maxfiles);
352}
353SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
354
355/*
356 * Sysctl to get and set the maximum global sockets limit.  Notify protocols
357 * of the change so that they can update their dependent limits as required.
358 */
359static int
360sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
361{
362	int error, newmaxsockets;
363
364	newmaxsockets = maxsockets;
365	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
366	if (error == 0 && req->newptr) {
367		if (newmaxsockets > maxsockets &&
368		    newmaxsockets <= maxfiles) {
369			maxsockets = newmaxsockets;
370			EVENTHANDLER_INVOKE(maxsockets_change);
371		} else
372			error = EINVAL;
373	}
374	return (error);
375}
376SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets,
377    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &maxsockets, 0,
378    sysctl_maxsockets, "IU",
379    "Maximum number of sockets available");
380
381/*
382 * Socket operation routines.  These routines are called by the routines in
383 * sys_socket.c or from a system process, and implement the semantics of
384 * socket operations by switching out to the protocol specific routines.
385 */
386
387/*
388 * Get a socket structure from our zone, and initialize it.  Note that it
389 * would probably be better to allocate socket and PCB at the same time, but
390 * I'm not convinced that all the protocols can be easily modified to do
391 * this.
392 *
393 * soalloc() returns a socket with a ref count of 0.
394 */
395static struct socket *
396soalloc(struct vnet *vnet)
397{
398	struct socket *so;
399
400	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
401	if (so == NULL)
402		return (NULL);
403#ifdef MAC
404	if (mac_socket_init(so, M_NOWAIT) != 0) {
405		uma_zfree(socket_zone, so);
406		return (NULL);
407	}
408#endif
409	if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) {
410		uma_zfree(socket_zone, so);
411		return (NULL);
412	}
413
	/*
	 * The socket locking protocol allows locking two sockets at a time,
	 * however, the first one must be a listening socket.  WITNESS lacks
	 * a feature to change the class of an existing lock, so we use DUPOK.
	 */
419	mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK);
420	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
421	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
422	so->so_rcv.sb_sel = &so->so_rdsel;
423	so->so_snd.sb_sel = &so->so_wrsel;
424	sx_init(&so->so_snd.sb_sx, "so_snd_sx");
425	sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
426	TAILQ_INIT(&so->so_snd.sb_aiojobq);
427	TAILQ_INIT(&so->so_rcv.sb_aiojobq);
428	TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so);
429	TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so);
430#ifdef VIMAGE
431	VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
432	    __func__, __LINE__, so));
433	so->so_vnet = vnet;
434#endif
435	/* We shouldn't need the so_global_mtx */
436	if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) {
437		/* Do we need more comprehensive error returns? */
438		uma_zfree(socket_zone, so);
439		return (NULL);
440	}
441	mtx_lock(&so_global_mtx);
442	so->so_gencnt = ++so_gencnt;
443	++numopensockets;
444#ifdef VIMAGE
445	vnet->vnet_sockcnt++;
446#endif
447	mtx_unlock(&so_global_mtx);
448
449	return (so);
450}
451
452/*
453 * Free the storage associated with a socket at the socket layer, tear down
454 * locks, labels, etc.  All protocol state is assumed already to have been
455 * torn down (and possibly never set up) by the caller.
456 */
457static void
458sodealloc(struct socket *so)
459{
460
461	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
462	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
463
464	mtx_lock(&so_global_mtx);
465	so->so_gencnt = ++so_gencnt;
466	--numopensockets;	/* Could be below, but faster here. */
467#ifdef VIMAGE
468	VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
469	    __func__, __LINE__, so));
470	so->so_vnet->vnet_sockcnt--;
471#endif
472	mtx_unlock(&so_global_mtx);
473#ifdef MAC
474	mac_socket_destroy(so);
475#endif
476	hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE);
477
478	khelp_destroy_osd(&so->osd);
479	if (SOLISTENING(so)) {
480		if (so->sol_accept_filter != NULL)
481			accept_filt_setopt(so, NULL);
482	} else {
483		if (so->so_rcv.sb_hiwat)
484			(void)chgsbsize(so->so_cred->cr_uidinfo,
485			    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
486		if (so->so_snd.sb_hiwat)
487			(void)chgsbsize(so->so_cred->cr_uidinfo,
488			    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
489		sx_destroy(&so->so_snd.sb_sx);
490		sx_destroy(&so->so_rcv.sb_sx);
491		SOCKBUF_LOCK_DESTROY(&so->so_snd);
492		SOCKBUF_LOCK_DESTROY(&so->so_rcv);
493	}
494	crfree(so->so_cred);
495	mtx_destroy(&so->so_lock);
496	uma_zfree(socket_zone, so);
497}
498
499/*
500 * socreate returns a socket with a ref count of 1.  The socket should be
501 * closed with soclose().
502 */
503int
504socreate(int dom, struct socket **aso, int type, int proto,
505    struct ucred *cred, struct thread *td)
506{
507	struct protosw *prp;
508	struct socket *so;
509	int error;
510
511	if (proto)
512		prp = pffindproto(dom, proto, type);
513	else
514		prp = pffindtype(dom, type);
515
516	if (prp == NULL) {
517		/* No support for domain. */
518		if (pffinddomain(dom) == NULL)
519			return (EAFNOSUPPORT);
520		/* No support for socket type. */
521		if (proto == 0 && type != 0)
522			return (EPROTOTYPE);
523		return (EPROTONOSUPPORT);
524	}
525	if (prp->pr_usrreqs->pru_attach == NULL ||
526	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
527		return (EPROTONOSUPPORT);
528
529	if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
530		return (EPROTONOSUPPORT);
531
532	if (prp->pr_type != type)
533		return (EPROTOTYPE);
534	so = soalloc(CRED_TO_VNET(cred));
535	if (so == NULL)
536		return (ENOBUFS);
537
538	so->so_type = type;
539	so->so_cred = crhold(cred);
540	if ((prp->pr_domain->dom_family == PF_INET) ||
541	    (prp->pr_domain->dom_family == PF_INET6) ||
542	    (prp->pr_domain->dom_family == PF_ROUTE))
543		so->so_fibnum = td->td_proc->p_fibnum;
544	else
545		so->so_fibnum = 0;
546	so->so_proto = prp;
547#ifdef MAC
548	mac_socket_create(cred, so);
549#endif
550	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
551	    so_rdknl_assert_lock);
552	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
553	    so_wrknl_assert_lock);
554	/*
555	 * Auto-sizing of socket buffers is managed by the protocols and
556	 * the appropriate flags must be set in the pru_attach function.
557	 */
558	CURVNET_SET(so->so_vnet);
559	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
560	CURVNET_RESTORE();
561	if (error) {
562		sodealloc(so);
563		return (error);
564	}
565	soref(so);
566	*aso = so;
567	return (0);
568}
569
570#ifdef REGRESSION
571static int regression_sonewconn_earlytest = 1;
572SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
573    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
574#endif
575
576static struct timeval overinterval = { 60, 0 };
577SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW,
578    &overinterval,
579    "Delay in seconds between warnings for listen socket overflows");
580
581/*
 * When an attempt at a new connection is noted on a socket that accepts
 * connections, sonewconn() is called.  If the connection is possible
 * (subject to space constraints, etc.), then we allocate a new structure,
 * properly linked into the data structure of the original socket, and
 * return this.  Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
587 *
588 * Note: the ref count on the socket is 0 on return.
589 */
590struct socket *
591sonewconn(struct socket *head, int connstatus)
592{
593	struct sbuf descrsb;
594	struct socket *so;
595	int len, overcount;
596	u_int qlen;
597	const char localprefix[] = "local:";
598	char descrbuf[SUNPATHLEN + sizeof(localprefix)];
599#if defined(INET6)
600	char addrbuf[INET6_ADDRSTRLEN];
601#elif defined(INET)
602	char addrbuf[INET_ADDRSTRLEN];
603#endif
604	bool dolog, over;
605
606	SOLISTEN_LOCK(head);
607	over = (head->sol_qlen > 3 * head->sol_qlimit / 2);
608#ifdef REGRESSION
609	if (regression_sonewconn_earlytest && over) {
610#else
611	if (over) {
612#endif
613		head->sol_overcount++;
614		dolog = !!ratecheck(&head->sol_lastover, &overinterval);
615
616		/*
617		 * If we're going to log, copy the overflow count and queue
618		 * length from the listen socket before dropping the lock.
619		 * Also, reset the overflow count.
620		 */
621		if (dolog) {
622			overcount = head->sol_overcount;
623			head->sol_overcount = 0;
624			qlen = head->sol_qlen;
625		}
626		SOLISTEN_UNLOCK(head);
627
628		if (dolog) {
629			/*
630			 * Try to print something descriptive about the
631			 * socket for the error message.
632			 */
633			sbuf_new(&descrsb, descrbuf, sizeof(descrbuf),
634			    SBUF_FIXEDLEN);
635			switch (head->so_proto->pr_domain->dom_family) {
636#if defined(INET) || defined(INET6)
637#ifdef INET
638			case AF_INET:
639#endif
640#ifdef INET6
641			case AF_INET6:
642				if (head->so_proto->pr_domain->dom_family ==
643				    AF_INET6 ||
644				    (sotoinpcb(head)->inp_inc.inc_flags &
645				    INC_ISIPV6)) {
646					ip6_sprintf(addrbuf,
647					    &sotoinpcb(head)->inp_inc.inc6_laddr);
648					sbuf_printf(&descrsb, "[%s]", addrbuf);
649				} else
650#endif
651				{
652#ifdef INET
653					inet_ntoa_r(
654					    sotoinpcb(head)->inp_inc.inc_laddr,
655					    addrbuf);
656					sbuf_cat(&descrsb, addrbuf);
657#endif
658				}
659				sbuf_printf(&descrsb, ":%hu (proto %u)",
660				    ntohs(sotoinpcb(head)->inp_inc.inc_lport),
661				    head->so_proto->pr_protocol);
662				break;
663#endif /* INET || INET6 */
664			case AF_UNIX:
665				sbuf_cat(&descrsb, localprefix);
666				if (sotounpcb(head)->unp_addr != NULL)
667					len =
668					    sotounpcb(head)->unp_addr->sun_len -
669					    offsetof(struct sockaddr_un,
670					    sun_path);
671				else
672					len = 0;
673				if (len > 0)
674					sbuf_bcat(&descrsb,
675					    sotounpcb(head)->unp_addr->sun_path,
676					    len);
677				else
678					sbuf_cat(&descrsb, "(unknown)");
679				break;
680			}
681
682			/*
683			 * If we can't print something more specific, at least
684			 * print the domain name.
685			 */
686			if (sbuf_finish(&descrsb) != 0 ||
687			    sbuf_len(&descrsb) <= 0) {
688				sbuf_clear(&descrsb);
689				sbuf_cat(&descrsb,
690				    head->so_proto->pr_domain->dom_name ?:
691				    "unknown");
692				sbuf_finish(&descrsb);
693			}
694			KASSERT(sbuf_len(&descrsb) > 0,
695			    ("%s: sbuf creation failed", __func__));
696			log(LOG_DEBUG,
697			    "%s: pcb %p (%s): Listen queue overflow: "
698			    "%i already in queue awaiting acceptance "
699			    "(%d occurrences)\n",
700			    __func__, head->so_pcb, sbuf_data(&descrsb),
701			    qlen, overcount);
702			sbuf_delete(&descrsb);
703
704			overcount = 0;
705		}
706
707		return (NULL);
708	}
709	SOLISTEN_UNLOCK(head);
710	VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL",
711	    __func__, head));
712	so = soalloc(head->so_vnet);
713	if (so == NULL) {
714		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
715		    "limit reached or out of memory\n",
716		    __func__, head->so_pcb);
717		return (NULL);
718	}
719	so->so_listen = head;
720	so->so_type = head->so_type;
721	so->so_options = head->so_options & ~SO_ACCEPTCONN;
722	so->so_linger = head->so_linger;
723	so->so_state = head->so_state | SS_NOFDREF;
724	so->so_fibnum = head->so_fibnum;
725	so->so_proto = head->so_proto;
726	so->so_cred = crhold(head->so_cred);
727#ifdef MAC
728	mac_socket_newconn(head, so);
729#endif
730	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
731	    so_rdknl_assert_lock);
732	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
733	    so_wrknl_assert_lock);
734	VNET_SO_ASSERT(head);
735	if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) {
736		sodealloc(so);
737		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
738		    __func__, head->so_pcb);
739		return (NULL);
740	}
741	if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
742		sodealloc(so);
743		log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
744		    __func__, head->so_pcb);
745		return (NULL);
746	}
747	so->so_rcv.sb_lowat = head->sol_sbrcv_lowat;
748	so->so_snd.sb_lowat = head->sol_sbsnd_lowat;
749	so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
750	so->so_snd.sb_timeo = head->sol_sbsnd_timeo;
751	so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE;
752	so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE;
753
754	SOLISTEN_LOCK(head);
755	if (head->sol_accept_filter != NULL)
756		connstatus = 0;
757	so->so_state |= connstatus;
758	soref(head); /* A socket on (in)complete queue refs head. */
759	if (connstatus) {
760		TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
761		so->so_qstate = SQ_COMP;
762		head->sol_qlen++;
763		solisten_wakeup(head);	/* unlocks */
764	} else {
765		/*
766		 * Keep removing sockets from the head until there's room for
767		 * us to insert on the tail.  In pre-locking revisions, this
768		 * was a simple if(), but as we could be racing with other
769		 * threads and soabort() requires dropping locks, we must
770		 * loop waiting for the condition to be true.
771		 */
772		while (head->sol_incqlen > head->sol_qlimit) {
773			struct socket *sp;
774
775			sp = TAILQ_FIRST(&head->sol_incomp);
776			TAILQ_REMOVE(&head->sol_incomp, sp, so_list);
777			head->sol_incqlen--;
778			SOCK_LOCK(sp);
779			sp->so_qstate = SQ_NONE;
780			sp->so_listen = NULL;
781			SOCK_UNLOCK(sp);
782			sorele(head);	/* does SOLISTEN_UNLOCK, head stays */
783			soabort(sp);
784			SOLISTEN_LOCK(head);
785		}
786		TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list);
787		so->so_qstate = SQ_INCOMP;
788		head->sol_incqlen++;
789		SOLISTEN_UNLOCK(head);
790	}
791	return (so);
792}
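
/*
 * A hedged sketch of how a protocol might use sonewconn() when a connection
 * request arrives on a listening socket `head' (real protocols such as TCP
 * layer additional PCB setup around this, and may pass SS_ISCONNECTED
 * directly instead of calling soisconnected() later):
 *
 *	struct socket *so;
 *
 *	so = sonewconn(head, 0);
 *	if (so == NULL)
 *		goto drop;		// queue overflow or allocation failure
 *	// ... initialize protocol state hanging off so->so_pcb ...
 *	soisconnected(so);		// moves so to the complete queue
 */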
793
794#if defined(SCTP) || defined(SCTP_SUPPORT)
795/*
796 * Socket part of sctp_peeloff().  Detach a new socket from an
797 * association.  The new socket is returned with a reference.
798 */
799struct socket *
800sopeeloff(struct socket *head)
801{
802	struct socket *so;
803
804	VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
805	    __func__, __LINE__, head));
806	so = soalloc(head->so_vnet);
807	if (so == NULL) {
808		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
809		    "limit reached or out of memory\n",
810		    __func__, head->so_pcb);
811		return (NULL);
812	}
813	so->so_type = head->so_type;
814	so->so_options = head->so_options;
815	so->so_linger = head->so_linger;
816	so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED;
817	so->so_fibnum = head->so_fibnum;
818	so->so_proto = head->so_proto;
819	so->so_cred = crhold(head->so_cred);
820#ifdef MAC
821	mac_socket_newconn(head, so);
822#endif
823	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
824	    so_rdknl_assert_lock);
825	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
826	    so_wrknl_assert_lock);
827	VNET_SO_ASSERT(head);
828	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
829		sodealloc(so);
830		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
831		    __func__, head->so_pcb);
832		return (NULL);
833	}
834	if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
835		sodealloc(so);
836		log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
837		    __func__, head->so_pcb);
838		return (NULL);
839	}
840	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
841	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
842	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
843	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
844	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
845	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
846
847	soref(so);
848
849	return (so);
850}
851#endif	/* SCTP */
852
853int
854sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
855{
856	int error;
857
858	CURVNET_SET(so->so_vnet);
859	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
860	CURVNET_RESTORE();
861	return (error);
862}
863
864int
865sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
866{
867	int error;
868
869	CURVNET_SET(so->so_vnet);
870	error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td);
871	CURVNET_RESTORE();
872	return (error);
873}
874
875/*
876 * solisten() transitions a socket from a non-listening state to a listening
877 * state, but can also be used to update the listen queue depth on an
878 * existing listen socket.  The protocol will call back into the sockets
879 * layer using solisten_proto_check() and solisten_proto() to check and set
 * socket-layer listen state.  Callbacks are used so that the protocol can
881 * acquire both protocol and socket layer locks in whatever order is required
882 * by the protocol.
883 *
884 * Protocol implementors are advised to hold the socket lock across the
885 * socket-layer test and set to avoid races at the socket layer.
886 */
887int
888solisten(struct socket *so, int backlog, struct thread *td)
889{
890	int error;
891
892	CURVNET_SET(so->so_vnet);
893	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
894	CURVNET_RESTORE();
895	return (error);
896}
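
/*
 * A hedged sketch of the callback pattern described above, as a protocol's
 * pru_listen implementation might apply it (protocols typically also hold
 * their own PCB locks around this sequence):
 *
 *	SOCK_LOCK(so);
 *	error = solisten_proto_check(so);
 *	if (error == 0)
 *		solisten_proto(so, backlog);
 *	SOCK_UNLOCK(so);
 *	return (error);
 */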
897
898int
899solisten_proto_check(struct socket *so)
900{
901
902	SOCK_LOCK_ASSERT(so);
903
904	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
905	    SS_ISDISCONNECTING))
906		return (EINVAL);
907	return (0);
908}
909
910void
911solisten_proto(struct socket *so, int backlog)
912{
913	int sbrcv_lowat, sbsnd_lowat;
914	u_int sbrcv_hiwat, sbsnd_hiwat;
915	short sbrcv_flags, sbsnd_flags;
916	sbintime_t sbrcv_timeo, sbsnd_timeo;
917
918	SOCK_LOCK_ASSERT(so);
919
920	if (SOLISTENING(so))
921		goto listening;
922
923	/*
924	 * Change this socket to listening state.
925	 */
926	sbrcv_lowat = so->so_rcv.sb_lowat;
927	sbsnd_lowat = so->so_snd.sb_lowat;
928	sbrcv_hiwat = so->so_rcv.sb_hiwat;
929	sbsnd_hiwat = so->so_snd.sb_hiwat;
930	sbrcv_flags = so->so_rcv.sb_flags;
931	sbsnd_flags = so->so_snd.sb_flags;
932	sbrcv_timeo = so->so_rcv.sb_timeo;
933	sbsnd_timeo = so->so_snd.sb_timeo;
934
935	sbdestroy(&so->so_snd, so);
936	sbdestroy(&so->so_rcv, so);
937	sx_destroy(&so->so_snd.sb_sx);
938	sx_destroy(&so->so_rcv.sb_sx);
939	SOCKBUF_LOCK_DESTROY(&so->so_snd);
940	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
941
942#ifdef INVARIANTS
943	bzero(&so->so_rcv,
944	    sizeof(struct socket) - offsetof(struct socket, so_rcv));
945#endif
946
947	so->sol_sbrcv_lowat = sbrcv_lowat;
948	so->sol_sbsnd_lowat = sbsnd_lowat;
949	so->sol_sbrcv_hiwat = sbrcv_hiwat;
950	so->sol_sbsnd_hiwat = sbsnd_hiwat;
951	so->sol_sbrcv_flags = sbrcv_flags;
952	so->sol_sbsnd_flags = sbsnd_flags;
953	so->sol_sbrcv_timeo = sbrcv_timeo;
954	so->sol_sbsnd_timeo = sbsnd_timeo;
955
956	so->sol_qlen = so->sol_incqlen = 0;
957	TAILQ_INIT(&so->sol_incomp);
958	TAILQ_INIT(&so->sol_comp);
959
960	so->sol_accept_filter = NULL;
961	so->sol_accept_filter_arg = NULL;
962	so->sol_accept_filter_str = NULL;
963
964	so->sol_upcall = NULL;
965	so->sol_upcallarg = NULL;
966
967	so->so_options |= SO_ACCEPTCONN;
968
969listening:
970	if (backlog < 0 || backlog > somaxconn)
971		backlog = somaxconn;
972	so->sol_qlimit = backlog;
973}
974
975/*
976 * Wakeup listeners/subsystems once we have a complete connection.
977 * Enters with lock, returns unlocked.
978 */
979void
980solisten_wakeup(struct socket *sol)
981{
982
983	if (sol->sol_upcall != NULL)
		(void)sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT);
985	else {
986		selwakeuppri(&sol->so_rdsel, PSOCK);
987		KNOTE_LOCKED(&sol->so_rdsel.si_note, 0);
988	}
989	SOLISTEN_UNLOCK(sol);
990	wakeup_one(&sol->sol_comp);
991	if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL)
992		pgsigio(&sol->so_sigio, SIGIO, 0);
993}
994
/*
 * Return a single connection off a listening socket queue.  The main
 * consumer of the function is kern_accept4().  Some modules that do their
 * own accept management also use the function.
 *
 * The listening socket must be locked on entry and is returned unlocked on
 * return.  The flags argument is a set of accept4(2) flags and
 * ACCEPT4_INHERIT.
 */
1004int
1005solisten_dequeue(struct socket *head, struct socket **ret, int flags)
1006{
1007	struct socket *so;
1008	int error;
1009
1010	SOLISTEN_LOCK_ASSERT(head);
1011
1012	while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) &&
1013	    head->so_error == 0) {
1014		error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH,
1015		    "accept", 0);
1016		if (error != 0) {
1017			SOLISTEN_UNLOCK(head);
1018			return (error);
1019		}
1020	}
1021	if (head->so_error) {
1022		error = head->so_error;
1023		head->so_error = 0;
1024	} else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp))
1025		error = EWOULDBLOCK;
1026	else
1027		error = 0;
1028	if (error) {
1029		SOLISTEN_UNLOCK(head);
1030		return (error);
1031	}
1032	so = TAILQ_FIRST(&head->sol_comp);
1033	SOCK_LOCK(so);
1034	KASSERT(so->so_qstate == SQ_COMP,
1035	    ("%s: so %p not SQ_COMP", __func__, so));
1036	soref(so);
1037	head->sol_qlen--;
1038	so->so_qstate = SQ_NONE;
1039	so->so_listen = NULL;
1040	TAILQ_REMOVE(&head->sol_comp, so, so_list);
1041	if (flags & ACCEPT4_INHERIT)
1042		so->so_state |= (head->so_state & SS_NBIO);
1043	else
1044		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
1045	SOCK_UNLOCK(so);
1046	sorele(head);
1047
1048	*ret = so;
1049	return (0);
1050}
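
/*
 * A minimal usage sketch for solisten_dequeue() (hedged; kern_accept4()
 * additionally wires the returned socket into a new file descriptor):
 *
 *	struct socket *so;
 *	int error;
 *
 *	SOLISTEN_LOCK(head);
 *	error = solisten_dequeue(head, &so, 0);
 *	if (error != 0)
 *		return (error);		// head is already unlocked here
 *	// `so' is returned with a reference that the caller must release.
 */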
1051
1052/*
1053 * Evaluate the reference count and named references on a socket; if no
1054 * references remain, free it.  This should be called whenever a reference is
1055 * released, such as in sorele(), but also when named reference flags are
1056 * cleared in socket or protocol code.
1057 *
1058 * sofree() will free the socket if:
1059 *
1060 * - There are no outstanding file descriptor references or related consumers
1061 *   (so_count == 0).
1062 *
1063 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
1064 *
1065 * - The protocol does not have an outstanding strong reference on the socket
1066 *   (SS_PROTOREF).
1067 *
 * - The socket is not in a completed connection queue, where a process has
 *   already been notified that it is present.  If it were removed, the user
 *   process could block in accept() despite select() saying the socket was
 *   ready.
 */
1072void
1073sofree(struct socket *so)
1074{
1075	struct protosw *pr = so->so_proto;
1076
1077	SOCK_LOCK_ASSERT(so);
1078
1079	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
1080	    (so->so_state & SS_PROTOREF) || (so->so_qstate == SQ_COMP)) {
1081		SOCK_UNLOCK(so);
1082		return;
1083	}
1084
1085	if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) {
1086		struct socket *sol;
1087
1088		sol = so->so_listen;
1089		KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so));
1090
		/*
		 * To solve the race between close of a listening socket and
		 * a socket on its incomplete queue, we need to lock both.
		 * The order is first listening socket, then regular.
		 * Since we have neither SS_NOFDREF nor SS_PROTOREF, this
		 * function and the listening socket are the only pointers
		 * to so.  To preserve so and sol, we reference both and then
		 * relock.
		 * After relock the socket may not move to so_comp since it
		 * doesn't have a PCB already, but it may be removed from
		 * so_incomp.  If that happens, we share responsibility for
		 * freeing the socket, but soclose() has already removed
		 * it from the queue.
		 */
1105		soref(sol);
1106		soref(so);
1107		SOCK_UNLOCK(so);
1108		SOLISTEN_LOCK(sol);
1109		SOCK_LOCK(so);
1110		if (so->so_qstate == SQ_INCOMP) {
1111			KASSERT(so->so_listen == sol,
1112			    ("%s: so %p migrated out of sol %p",
1113			    __func__, so, sol));
1114			TAILQ_REMOVE(&sol->sol_incomp, so, so_list);
1115			sol->sol_incqlen--;
			/* This is guaranteed not to be the last. */
1117			refcount_release(&sol->so_count);
1118			so->so_qstate = SQ_NONE;
1119			so->so_listen = NULL;
1120		} else
1121			KASSERT(so->so_listen == NULL,
1122			    ("%s: so %p not on (in)comp with so_listen",
1123			    __func__, so));
1124		sorele(sol);
1125		KASSERT(so->so_count == 1,
1126		    ("%s: so %p count %u", __func__, so, so->so_count));
1127		so->so_count = 0;
1128	}
1129	if (SOLISTENING(so))
1130		so->so_error = ECONNABORTED;
1131	SOCK_UNLOCK(so);
1132
1133	if (so->so_dtor != NULL)
1134		so->so_dtor(so);
1135
1136	VNET_SO_ASSERT(so);
1137	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1138		(*pr->pr_domain->dom_dispose)(so);
1139	if (pr->pr_usrreqs->pru_detach != NULL)
1140		(*pr->pr_usrreqs->pru_detach)(so);
1141
1142	/*
1143	 * From this point on, we assume that no other references to this
1144	 * socket exist anywhere else in the stack.  Therefore, no locks need
1145	 * to be acquired or held.
1146	 *
1147	 * We used to do a lot of socket buffer and socket locking here, as
	 * well as invoke sorflush() and perform wakeups.  The direct calls to
	 * dom_dispose() and sbdestroy() are an inlining of what was
	 * necessary from sorflush().
	 *
	 * Notice that the socket buffer and kqueue state are torn down
	 * before calling pru_detach.  This means that protocols should not
	 * assume they can perform socket wakeups, etc., in their detach code.
1155	 */
1156	if (!SOLISTENING(so)) {
1157		sbdestroy(&so->so_snd, so);
1158		sbdestroy(&so->so_rcv, so);
1159	}
1160	seldrain(&so->so_rdsel);
1161	seldrain(&so->so_wrsel);
1162	knlist_destroy(&so->so_rdsel.si_note);
1163	knlist_destroy(&so->so_wrsel.si_note);
1164	sodealloc(so);
1165}
1166
1167/*
1168 * Close a socket on last file table reference removal.  Initiate disconnect
1169 * if connected.  Free socket when disconnect complete.
1170 *
1171 * This function will sorele() the socket.  Note that soclose() may be called
1172 * prior to the ref count reaching zero.  The actual socket structure will
1173 * not be freed until the ref count reaches zero.
1174 */
1175int
1176soclose(struct socket *so)
1177{
1178	struct accept_queue lqueue;
1179	int error = 0;
1180
1181	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
1182
1183	CURVNET_SET(so->so_vnet);
1184	funsetown(&so->so_sigio);
1185	if (so->so_state & SS_ISCONNECTED) {
1186		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1187			error = sodisconnect(so);
1188			if (error) {
1189				if (error == ENOTCONN)
1190					error = 0;
1191				goto drop;
1192			}
1193		}
1194
1195		if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) {
1196			if ((so->so_state & SS_ISDISCONNECTING) &&
1197			    (so->so_state & SS_NBIO))
1198				goto drop;
1199			while (so->so_state & SS_ISCONNECTED) {
1200				error = tsleep(&so->so_timeo,
1201				    PSOCK | PCATCH, "soclos",
1202				    so->so_linger * hz);
1203				if (error)
1204					break;
1205			}
1206		}
1207	}
1208
1209drop:
1210	if (so->so_proto->pr_usrreqs->pru_close != NULL)
1211		(*so->so_proto->pr_usrreqs->pru_close)(so);
1212
1213	SOCK_LOCK(so);
1214	if (SOLISTENING(so)) {
1215		struct socket *sp;
1216
1217		TAILQ_INIT(&lqueue);
1218		TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list);
1219		TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list);
1220
1221		so->sol_qlen = so->sol_incqlen = 0;
1222
1223		TAILQ_FOREACH(sp, &lqueue, so_list) {
1224			SOCK_LOCK(sp);
1225			sp->so_qstate = SQ_NONE;
1226			sp->so_listen = NULL;
1227			SOCK_UNLOCK(sp);
1228			/* Guaranteed not to be the last. */
1229			refcount_release(&so->so_count);
1230		}
1231	}
1232	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
1233	so->so_state |= SS_NOFDREF;
1234	sorele(so);
1235	if (SOLISTENING(so)) {
1236		struct socket *sp, *tsp;
1237
1238		TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) {
1239			SOCK_LOCK(sp);
1240			if (sp->so_count == 0) {
1241				SOCK_UNLOCK(sp);
1242				soabort(sp);
1243			} else
1244				/* sp is now in sofree() */
1245				SOCK_UNLOCK(sp);
1246		}
1247	}
1248	CURVNET_RESTORE();
1249	return (error);
1250}
1251
1252/*
1253 * soabort() is used to abruptly tear down a connection, such as when a
1254 * resource limit is reached (listen queue depth exceeded), or if a listen
1255 * socket is closed while there are sockets waiting to be accepted.
1256 *
1257 * This interface is tricky, because it is called on an unreferenced socket,
1258 * and must be called only by a thread that has actually removed the socket
1259 * from the listen queue it was on, or races with other threads are risked.
1260 *
1261 * This interface will call into the protocol code, so must not be called
1262 * with any socket locks held.  Protocols do call it while holding their own
1263 * recursible protocol mutexes, but this is something that should be subject
1264 * to review in the future.
1265 */
1266void
1267soabort(struct socket *so)
1268{
1269
1270	/*
1271	 * In as much as is possible, assert that no references to this
1272	 * socket are held.  This is not quite the same as asserting that the
1273	 * current thread is responsible for arranging for no references, but
1274	 * is as close as we can get for now.
1275	 */
1276	KASSERT(so->so_count == 0, ("soabort: so_count"));
1277	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
1278	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
1279	VNET_SO_ASSERT(so);
1280
1281	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
1282		(*so->so_proto->pr_usrreqs->pru_abort)(so);
1283	SOCK_LOCK(so);
1284	sofree(so);
1285}
1286
1287int
1288soaccept(struct socket *so, struct sockaddr **nam)
1289{
1290	int error;
1291
1292	SOCK_LOCK(so);
1293	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
1294	so->so_state &= ~SS_NOFDREF;
1295	SOCK_UNLOCK(so);
1296
1297	CURVNET_SET(so->so_vnet);
1298	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1299	CURVNET_RESTORE();
1300	return (error);
1301}
1302
1303int
1304soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
1305{
1306
1307	return (soconnectat(AT_FDCWD, so, nam, td));
1308}
1309
1310int
1311soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
1312{
1313	int error;
1314
1315	/* XXXMJ racy */
1316	if (SOLISTENING(so))
1317		return (EOPNOTSUPP);
1318
1319	CURVNET_SET(so->so_vnet);
	/*
	 * If the protocol is connection-based, we can only connect once.
	 * Otherwise, if connected, try to disconnect first.  This allows
	 * the user to disconnect by connecting to, e.g., a null address.
	 */
1325	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1326	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1327	    (error = sodisconnect(so)))) {
1328		error = EISCONN;
1329	} else {
1330		/*
1331		 * Prevent accumulated error from previous connection from
1332		 * biting us.
1333		 */
1334		so->so_error = 0;
1335		if (fd == AT_FDCWD) {
1336			error = (*so->so_proto->pr_usrreqs->pru_connect)(so,
1337			    nam, td);
1338		} else {
1339			error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd,
1340			    so, nam, td);
1341		}
1342	}
1343	CURVNET_RESTORE();
1344
1345	return (error);
1346}
1347
1348int
1349soconnect2(struct socket *so1, struct socket *so2)
1350{
1351	int error;
1352
1353	CURVNET_SET(so1->so_vnet);
1354	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1355	CURVNET_RESTORE();
1356	return (error);
1357}
1358
1359int
1360sodisconnect(struct socket *so)
1361{
1362	int error;
1363
1364	if ((so->so_state & SS_ISCONNECTED) == 0)
1365		return (ENOTCONN);
1366	if (so->so_state & SS_ISDISCONNECTING)
1367		return (EALREADY);
1368	VNET_SO_ASSERT(so);
1369	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1370	return (error);
1371}
1372
1373#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1374
1375int
1376sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
1377    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1378{
1379	long space;
1380	ssize_t resid;
1381	int clen = 0, error, dontroute;
1382
1383	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
1384	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
1385	    ("sosend_dgram: !PR_ATOMIC"));
1386
1387	if (uio != NULL)
1388		resid = uio->uio_resid;
1389	else
1390		resid = top->m_pkthdr.len;
1391	/*
1392	 * In theory resid should be unsigned.  However, space must be
1393	 * signed, as it might be less than 0 if we over-committed, and we
1394	 * must use a signed comparison of space and resid.  On the other
1395	 * hand, a negative resid causes us to loop sending 0-length
1396	 * segments to the protocol.
1397	 */
1398	if (resid < 0) {
1399		error = EINVAL;
1400		goto out;
1401	}
1402
1403	dontroute =
1404	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
1405	if (td != NULL)
1406		td->td_ru.ru_msgsnd++;
1407	if (control != NULL)
1408		clen = control->m_len;
1409
1410	SOCKBUF_LOCK(&so->so_snd);
1411	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1412		SOCKBUF_UNLOCK(&so->so_snd);
1413		error = EPIPE;
1414		goto out;
1415	}
1416	if (so->so_error) {
1417		error = so->so_error;
1418		so->so_error = 0;
1419		SOCKBUF_UNLOCK(&so->so_snd);
1420		goto out;
1421	}
1422	if ((so->so_state & SS_ISCONNECTED) == 0) {
1423		/*
		 * `sendto' and `sendmsg' are allowed on a connection-based
1425		 * socket if it supports implied connect.  Return ENOTCONN if
1426		 * not connected and no address is supplied.
1427		 */
1428		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1429		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1430			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1431			    !(resid == 0 && clen != 0)) {
1432				SOCKBUF_UNLOCK(&so->so_snd);
1433				error = ENOTCONN;
1434				goto out;
1435			}
1436		} else if (addr == NULL) {
1437			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1438				error = ENOTCONN;
1439			else
1440				error = EDESTADDRREQ;
1441			SOCKBUF_UNLOCK(&so->so_snd);
1442			goto out;
1443		}
1444	}
1445
1446	/*
	 * Do we need MSG_OOB support in SOCK_DGRAM?  The signedness of the
	 * space arithmetic here may be a problem and need fixing.
1449	 */
1450	space = sbspace(&so->so_snd);
1451	if (flags & MSG_OOB)
1452		space += 1024;
1453	space -= clen;
1454	SOCKBUF_UNLOCK(&so->so_snd);
1455	if (resid > space) {
1456		error = EMSGSIZE;
1457		goto out;
1458	}
1459	if (uio == NULL) {
1460		resid = 0;
1461		if (flags & MSG_EOR)
1462			top->m_flags |= M_EOR;
1463	} else {
1464		/*
		 * Copy the data from userland into an mbuf chain.
1466		 * If no data is to be copied in, a single empty mbuf
1467		 * is returned.
1468		 */
1469		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1470		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1471		if (top == NULL) {
1472			error = EFAULT;	/* only possible error */
1473			goto out;
1474		}
1475		space -= resid - uio->uio_resid;
1476		resid = uio->uio_resid;
1477	}
1478	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1479	/*
1480	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1481	 * than with.
1482	 */
1483	if (dontroute) {
1484		SOCK_LOCK(so);
1485		so->so_options |= SO_DONTROUTE;
1486		SOCK_UNLOCK(so);
1487	}
1488	/*
1489	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
1490	 * of date.  We could have received a reset packet in an interrupt or
1491	 * maybe we slept while doing page faults in uiomove() etc.  We could
1492	 * probably recheck again inside the locking protection here, but
1493	 * there are probably other places that this also happens.  We must
1494	 * rethink this.
1495	 */
1496	VNET_SO_ASSERT(so);
1497	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1498	    (flags & MSG_OOB) ? PRUS_OOB :
1499	/*
	 * If the user set MSG_EOF, the protocol understands this flag, and
	 * there is nothing left to send, then use PRU_SEND_EOF instead of
	 * PRU_SEND.
1502	 */
1503	    ((flags & MSG_EOF) &&
1504	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1505	     (resid <= 0)) ?
1506		PRUS_EOF :
1507		/* If there is more to send set PRUS_MORETOCOME */
1508		(flags & MSG_MORETOCOME) ||
1509		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1510		top, addr, control, td);
1511	if (dontroute) {
1512		SOCK_LOCK(so);
1513		so->so_options &= ~SO_DONTROUTE;
1514		SOCK_UNLOCK(so);
1515	}
1516	clen = 0;
1517	control = NULL;
1518	top = NULL;
1519out:
1520	if (top != NULL)
1521		m_freem(top);
1522	if (control != NULL)
1523		m_freem(control);
1524	return (error);
1525}
1526
1527/*
1528 * Send on a socket.  If send must go all at once and message is larger than
1529 * send buffering, then hard error.  Lock against other senders.  If must go
1530 * all at once and not enough room now, then inform user that this would
1531 * block and do nothing.  Otherwise, if nonblocking, send as much as
1532 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1533 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1534 * in mbuf chain must be small enough to send all at once.
1535 *
1536 * Returns nonzero on error, timeout or signal; callers must check for short
1537 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1538 * on return.
1539 */
1540int
1541sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
1542    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1543{
1544	long space;
1545	ssize_t resid;
1546	int clen = 0, error, dontroute;
1547	int atomic = sosendallatonce(so) || top;
1548	int pru_flag;
1549#ifdef KERN_TLS
1550	struct ktls_session *tls;
1551	int tls_enq_cnt, tls_pruflag;
1552	uint8_t tls_rtype;
1553
1554	tls = NULL;
1555	tls_rtype = TLS_RLTYPE_APP;
1556#endif
1557	if (uio != NULL)
1558		resid = uio->uio_resid;
1559	else if ((top->m_flags & M_PKTHDR) != 0)
1560		resid = top->m_pkthdr.len;
1561	else
1562		resid = m_length(top, NULL);
1563	/*
1564	 * In theory resid should be unsigned.  However, space must be
1565	 * signed, as it might be less than 0 if we over-committed, and we
1566	 * must use a signed comparison of space and resid.  On the other
1567	 * hand, a negative resid causes us to loop sending 0-length
1568	 * segments to the protocol.
1569	 *
1570	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1571	 * type sockets since that's an error.
1572	 */
1573	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1574		error = EINVAL;
1575		goto out;
1576	}
1577
1578	dontroute =
1579	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1580	    (so->so_proto->pr_flags & PR_ATOMIC);
1581	if (td != NULL)
1582		td->td_ru.ru_msgsnd++;
1583	if (control != NULL)
1584		clen = control->m_len;
1585
1586	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1587	if (error)
1588		goto out;
1589
1590#ifdef KERN_TLS
1591	tls_pruflag = 0;
1592	tls = ktls_hold(so->so_snd.sb_tls_info);
1593	if (tls != NULL) {
1594		if (tls->mode == TCP_TLS_MODE_SW)
1595			tls_pruflag = PRUS_NOTREADY;
1596
1597		if (control != NULL) {
1598			struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1599
1600			if (clen >= sizeof(*cm) &&
1601			    cm->cmsg_type == TLS_SET_RECORD_TYPE) {
1602				tls_rtype = *((uint8_t *)CMSG_DATA(cm));
1603				clen = 0;
1604				m_freem(control);
1605				control = NULL;
1606				atomic = 1;
1607			}
1608		}
1609	}
1610#endif
1611
1612restart:
1613	do {
1614		SOCKBUF_LOCK(&so->so_snd);
1615		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1616			SOCKBUF_UNLOCK(&so->so_snd);
1617			error = EPIPE;
1618			goto release;
1619		}
1620		if (so->so_error) {
1621			error = so->so_error;
1622			so->so_error = 0;
1623			SOCKBUF_UNLOCK(&so->so_snd);
1624			goto release;
1625		}
1626		if ((so->so_state & SS_ISCONNECTED) == 0) {
1627			/*
			 * `sendto' and `sendmsg' are allowed on a connection-
1629			 * based socket if it supports implied connect.
1630			 * Return ENOTCONN if not connected and no address is
1631			 * supplied.
1632			 */
1633			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1634			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1635				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1636				    !(resid == 0 && clen != 0)) {
1637					SOCKBUF_UNLOCK(&so->so_snd);
1638					error = ENOTCONN;
1639					goto release;
1640				}
1641			} else if (addr == NULL) {
1642				SOCKBUF_UNLOCK(&so->so_snd);
1643				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1644					error = ENOTCONN;
1645				else
1646					error = EDESTADDRREQ;
1647				goto release;
1648			}
1649		}
1650		space = sbspace(&so->so_snd);
1651		if (flags & MSG_OOB)
1652			space += 1024;
1653		if ((atomic && resid > so->so_snd.sb_hiwat) ||
1654		    clen > so->so_snd.sb_hiwat) {
1655			SOCKBUF_UNLOCK(&so->so_snd);
1656			error = EMSGSIZE;
1657			goto release;
1658		}
1659		if (space < resid + clen &&
1660		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1661			if ((so->so_state & SS_NBIO) ||
1662			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
1663				SOCKBUF_UNLOCK(&so->so_snd);
1664				error = EWOULDBLOCK;
1665				goto release;
1666			}
1667			error = sbwait(&so->so_snd);
1668			SOCKBUF_UNLOCK(&so->so_snd);
1669			if (error)
1670				goto release;
1671			goto restart;
1672		}
1673		SOCKBUF_UNLOCK(&so->so_snd);
1674		space -= clen;
1675		do {
1676			if (uio == NULL) {
1677				resid = 0;
1678				if (flags & MSG_EOR)
1679					top->m_flags |= M_EOR;
1680#ifdef KERN_TLS
1681				if (tls != NULL) {
1682					ktls_frame(top, tls, &tls_enq_cnt,
1683					    tls_rtype);
1684					tls_rtype = TLS_RLTYPE_APP;
1685				}
1686#endif
1687			} else {
1688				/*
				 * Copy the data from userland into an mbuf
				 * chain.  If resid is 0, which can happen
				 * only if we have control to send, then
				 * a single empty mbuf is returned.  This
				 * is a workaround to prevent protocol send
				 * methods from panicking.
1695				 */
1696#ifdef KERN_TLS
1697				if (tls != NULL) {
1698					top = m_uiotombuf(uio, M_WAITOK, space,
1699					    tls->params.max_frame_len,
1700					    M_EXTPG |
1701					    ((flags & MSG_EOR) ? M_EOR : 0));
1702					if (top != NULL) {
1703						ktls_frame(top, tls,
1704						    &tls_enq_cnt, tls_rtype);
1705					}
1706					tls_rtype = TLS_RLTYPE_APP;
1707				} else
1708#endif
1709					top = m_uiotombuf(uio, M_WAITOK, space,
1710					    (atomic ? max_hdr : 0),
1711					    (atomic ? M_PKTHDR : 0) |
1712					    ((flags & MSG_EOR) ? M_EOR : 0));
1713				if (top == NULL) {
1714					error = EFAULT; /* only possible error */
1715					goto release;
1716				}
1717				space -= resid - uio->uio_resid;
1718				resid = uio->uio_resid;
1719			}
1720			if (dontroute) {
1721				SOCK_LOCK(so);
1722				so->so_options |= SO_DONTROUTE;
1723				SOCK_UNLOCK(so);
1724			}
1725			/*
1726			 * XXX all the SBS_CANTSENDMORE checks previously
1727			 * done could be out of date.  We could have received
1728			 * a reset packet in an interrupt or maybe we slept
1729			 * while doing page faults in uiomove() etc.  We
1730			 * could probably recheck again inside the locking
1731			 * protection here, but there are probably other
1732			 * places that this also happens.  We must rethink
1733			 * this.
1734			 */
1735			VNET_SO_ASSERT(so);
1736
1737			pru_flag = (flags & MSG_OOB) ? PRUS_OOB :
1738			/*
			 * If the user set MSG_EOF, the protocol understands
			 * this flag, and there is nothing left to send, then
			 * use PRU_SEND_EOF instead of PRU_SEND.
1742			 */
1743			    ((flags & MSG_EOF) &&
1744			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1745			     (resid <= 0)) ?
1746				PRUS_EOF :
1747			/* If there is more to send set PRUS_MORETOCOME. */
1748			    (flags & MSG_MORETOCOME) ||
1749			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
1750
1751#ifdef KERN_TLS
1752			pru_flag |= tls_pruflag;
1753#endif
1754
1755			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1756			    pru_flag, top, addr, control, td);
1757
1758			if (dontroute) {
1759				SOCK_LOCK(so);
1760				so->so_options &= ~SO_DONTROUTE;
1761				SOCK_UNLOCK(so);
1762			}
1763
1764#ifdef KERN_TLS
1765			if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) {
1766				if (error != 0) {
1767					m_freem(top);
1768					top = NULL;
1769				} else {
1770					soref(so);
1771					ktls_enqueue(top, so, tls_enq_cnt);
1772				}
1773			}
1774#endif
1775			clen = 0;
1776			control = NULL;
1777			top = NULL;
1778			if (error)
1779				goto release;
1780		} while (resid && space > 0);
1781	} while (resid);
1782
1783release:
1784	sbunlock(&so->so_snd);
1785out:
1786#ifdef KERN_TLS
1787	if (tls != NULL)
1788		ktls_free(tls);
1789#endif
1790	if (top != NULL)
1791		m_freem(top);
1792	if (control != NULL)
1793		m_freem(control);
1794	return (error);
1795}
1796
1797int
1798sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1799    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1800{
1801	int error;
1802
1803	CURVNET_SET(so->so_vnet);
1804	if (!SOLISTENING(so))
1805		error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio,
1806		    top, control, flags, td);
1807	else {
1808		m_freem(top);
1809		m_freem(control);
1810		error = ENOTCONN;
1811	}
1812	CURVNET_RESTORE();
1813	return (error);
1814}
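
/*
 * Example (sketch of a hypothetical in-kernel caller): a consumer holding a
 * connected socket can hand a pre-built mbuf chain (with a packet header)
 * straight to the protocol, with no uio, address or control data:
 *
 *	error = sosend(so, NULL, NULL, m, NULL, 0, curthread);
 *
 * Supplying a uio instead makes the protocol's pru_sosend copy the data from
 * the caller's buffers itself.
 */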
1815
1816/*
1817 * The part of soreceive() that implements reading non-inline out-of-band
1818 * data from a socket.  For more complete comments, see soreceive(), from
1819 * which this code originated.
1820 *
1821 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1822 * unable to return an mbuf chain to the caller.
1823 */
1824static int
1825soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1826{
1827	struct protosw *pr = so->so_proto;
1828	struct mbuf *m;
1829	int error;
1830
1831	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1832	VNET_SO_ASSERT(so);
1833
1834	m = m_get(M_WAITOK, MT_DATA);
1835	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1836	if (error)
1837		goto bad;
1838	do {
1839		error = uiomove(mtod(m, void *),
1840		    (int) min(uio->uio_resid, m->m_len), uio);
1841		m = m_free(m);
1842	} while (uio->uio_resid && error == 0 && m);
1843bad:
1844	if (m != NULL)
1845		m_freem(m);
1846	return (error);
1847}
1848
1849/*
1850 * Following replacement or removal of the first mbuf on the first mbuf chain
1851 * of a socket buffer, push necessary state changes back into the socket
1852 * buffer so that other consumers see the values consistently.  'nextrecord'
 * is the caller's locally stored copy of the original value of
 * sb->sb_mb->m_nextpkt, which must be restored when the lead mbuf changes.
1855 * NOTE: 'nextrecord' may be NULL.
1856 */
1857static __inline void
1858sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1859{
1860
1861	SOCKBUF_LOCK_ASSERT(sb);
1862	/*
1863	 * First, update for the new value of nextrecord.  If necessary, make
1864	 * it the first record.
1865	 */
1866	if (sb->sb_mb != NULL)
1867		sb->sb_mb->m_nextpkt = nextrecord;
1868	else
1869		sb->sb_mb = nextrecord;
1870
1871	/*
1872	 * Now update any dependent socket buffer fields to reflect the new
1873	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
1874	 * addition of a second clause that takes care of the case where
1875	 * sb_mb has been updated, but remains the last record.
1876	 */
1877	if (sb->sb_mb == NULL) {
1878		sb->sb_mbtail = NULL;
1879		sb->sb_lastrecord = NULL;
1880	} else if (sb->sb_mb->m_nextpkt == NULL)
1881		sb->sb_lastrecord = sb->sb_mb;
1882}
1883
1884/*
1885 * Implement receive operations on a socket.  We depend on the way that
1886 * records are added to the sockbuf by sbappend.  In particular, each record
1887 * (mbufs linked through m_next) must begin with an address if the protocol
1888 * so specifies, followed by an optional mbuf or mbufs containing ancillary
1889 * data, and then zero or more mbufs of data.  In order to allow parallelism
1890 * between network receive and copying to user space, as well as avoid
1891 * sleeping with a mutex held, we release the socket buffer mutex during the
 * user space copy.  Although the sockbuf is locked via sblock(), new data
 * may still be appended while the mutex is dropped, and thus we must
 * maintain consistency of the sockbuf across that window.
1895 *
1896 * The caller may receive the data as a single mbuf chain by supplying an
1897 * mbuf **mp0 for use in returning the chain.  The uio is then used only for
1898 * the count in uio_resid.
1899 */
1900int
1901soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
1902    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1903{
1904	struct mbuf *m, **mp;
1905	int flags, error, offset;
1906	ssize_t len;
1907	struct protosw *pr = so->so_proto;
1908	struct mbuf *nextrecord;
1909	int moff, type = 0;
1910	ssize_t orig_resid = uio->uio_resid;
1911
1912	mp = mp0;
1913	if (psa != NULL)
1914		*psa = NULL;
1915	if (controlp != NULL)
1916		*controlp = NULL;
1917	if (flagsp != NULL)
1918		flags = *flagsp &~ MSG_EOR;
1919	else
1920		flags = 0;
1921	if (flags & MSG_OOB)
1922		return (soreceive_rcvoob(so, uio, flags));
1923	if (mp != NULL)
1924		*mp = NULL;
1925	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1926	    && uio->uio_resid) {
1927		VNET_SO_ASSERT(so);
1928		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
1929	}
1930
1931	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1932	if (error)
1933		return (error);
1934
1935restart:
1936	SOCKBUF_LOCK(&so->so_rcv);
1937	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more (subject
	 * to any timeout) if:
	 *   1. the socket buffer is empty, or
	 *   2. MSG_DONTWAIT is not set, the available data is less than both
	 *      the request and the low water mark, there is no further
	 *      record queued, and the protocol is not atomic.
	 */
1944	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1945	    sbavail(&so->so_rcv) < uio->uio_resid) &&
1946	    sbavail(&so->so_rcv) < so->so_rcv.sb_lowat &&
1947	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1948		KASSERT(m != NULL || !sbavail(&so->so_rcv),
1949		    ("receive: m == %p sbavail == %u",
1950		    m, sbavail(&so->so_rcv)));
1951		if (so->so_error) {
1952			if (m != NULL)
1953				goto dontblock;
1954			error = so->so_error;
1955			if ((flags & MSG_PEEK) == 0)
1956				so->so_error = 0;
1957			SOCKBUF_UNLOCK(&so->so_rcv);
1958			goto release;
1959		}
1960		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1961		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1962			if (m != NULL)
1963				goto dontblock;
1964#ifdef KERN_TLS
1965			else if (so->so_rcv.sb_tlsdcc == 0 &&
1966			    so->so_rcv.sb_tlscc == 0) {
1967#else
1968			else {
1969#endif
1970				SOCKBUF_UNLOCK(&so->so_rcv);
1971				goto release;
1972			}
1973		}
1974		for (; m != NULL; m = m->m_next)
1975			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
1976				m = so->so_rcv.sb_mb;
1977				goto dontblock;
1978			}
1979		if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED |
1980		    SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 &&
1981		    (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1982			SOCKBUF_UNLOCK(&so->so_rcv);
1983			error = ENOTCONN;
1984			goto release;
1985		}
1986		if (uio->uio_resid == 0) {
1987			SOCKBUF_UNLOCK(&so->so_rcv);
1988			goto release;
1989		}
1990		if ((so->so_state & SS_NBIO) ||
1991		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1992			SOCKBUF_UNLOCK(&so->so_rcv);
1993			error = EWOULDBLOCK;
1994			goto release;
1995		}
1996		SBLASTRECORDCHK(&so->so_rcv);
1997		SBLASTMBUFCHK(&so->so_rcv);
1998		error = sbwait(&so->so_rcv);
1999		SOCKBUF_UNLOCK(&so->so_rcv);
2000		if (error)
2001			goto release;
2002		goto restart;
2003	}
2004dontblock:
2005	/*
2006	 * From this point onward, we maintain 'nextrecord' as a cache of the
2007	 * pointer to the next record in the socket buffer.  We must keep the
2008	 * various socket buffer pointers and local stack versions of the
2009	 * pointers in sync, pushing out modifications before dropping the
2010	 * socket buffer mutex, and re-reading them when picking it up.
2011	 *
2012	 * Otherwise, we will race with the network stack appending new data
2013	 * or records onto the socket buffer by using inconsistent/stale
2014	 * versions of the field, possibly resulting in socket buffer
2015	 * corruption.
2016	 *
2017	 * By holding the high-level sblock(), we prevent simultaneous
2018	 * readers from pulling off the front of the socket buffer.
2019	 */
2020	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2021	if (uio->uio_td)
2022		uio->uio_td->td_ru.ru_msgrcv++;
2023	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
2024	SBLASTRECORDCHK(&so->so_rcv);
2025	SBLASTMBUFCHK(&so->so_rcv);
2026	nextrecord = m->m_nextpkt;
2027	if (pr->pr_flags & PR_ADDR) {
2028		KASSERT(m->m_type == MT_SONAME,
2029		    ("m->m_type == %d", m->m_type));
2030		orig_resid = 0;
2031		if (psa != NULL)
2032			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
2033			    M_NOWAIT);
2034		if (flags & MSG_PEEK) {
2035			m = m->m_next;
2036		} else {
2037			sbfree(&so->so_rcv, m);
2038			so->so_rcv.sb_mb = m_free(m);
2039			m = so->so_rcv.sb_mb;
2040			sockbuf_pushsync(&so->so_rcv, nextrecord);
2041		}
2042	}
2043
2044	/*
2045	 * Process one or more MT_CONTROL mbufs present before any data mbufs
2046	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
2047	 * just copy the data; if !MSG_PEEK, we call into the protocol to
2048	 * perform externalization (or freeing if controlp == NULL).
2049	 */
2050	if (m != NULL && m->m_type == MT_CONTROL) {
2051		struct mbuf *cm = NULL, *cmn;
2052		struct mbuf **cme = &cm;
2053#ifdef KERN_TLS
2054		struct cmsghdr *cmsg;
2055		struct tls_get_record tgr;
2056
2057		/*
2058		 * For MSG_TLSAPPDATA, check for a non-application data
2059		 * record.  If found, return ENXIO without removing
2060		 * it from the receive queue.  This allows a subsequent
2061		 * call without MSG_TLSAPPDATA to receive it.
2062		 * Note that, for TLS, there should only be a single
2063		 * control mbuf with the TLS_GET_RECORD message in it.
2064		 */
2065		if (flags & MSG_TLSAPPDATA) {
2066			cmsg = mtod(m, struct cmsghdr *);
2067			if (cmsg->cmsg_type == TLS_GET_RECORD &&
2068			    cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) {
2069				memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr));
2070				/* This will need to change for TLS 1.3. */
2071				if (tgr.tls_type != TLS_RLTYPE_APP) {
2072					SOCKBUF_UNLOCK(&so->so_rcv);
2073					error = ENXIO;
2074					goto release;
2075				}
2076			}
2077		}
2078#endif
2079
2080		do {
2081			if (flags & MSG_PEEK) {
2082				if (controlp != NULL) {
2083					*controlp = m_copym(m, 0, m->m_len,
2084					    M_NOWAIT);
2085					controlp = &(*controlp)->m_next;
2086				}
2087				m = m->m_next;
2088			} else {
2089				sbfree(&so->so_rcv, m);
2090				so->so_rcv.sb_mb = m->m_next;
2091				m->m_next = NULL;
2092				*cme = m;
2093				cme = &(*cme)->m_next;
2094				m = so->so_rcv.sb_mb;
2095			}
2096		} while (m != NULL && m->m_type == MT_CONTROL);
2097		if ((flags & MSG_PEEK) == 0)
2098			sockbuf_pushsync(&so->so_rcv, nextrecord);
2099		while (cm != NULL) {
2100			cmn = cm->m_next;
2101			cm->m_next = NULL;
2102			if (pr->pr_domain->dom_externalize != NULL) {
2103				SOCKBUF_UNLOCK(&so->so_rcv);
2104				VNET_SO_ASSERT(so);
2105				error = (*pr->pr_domain->dom_externalize)
2106				    (cm, controlp, flags);
2107				SOCKBUF_LOCK(&so->so_rcv);
2108			} else if (controlp != NULL)
2109				*controlp = cm;
2110			else
2111				m_freem(cm);
2112			if (controlp != NULL) {
2113				orig_resid = 0;
2114				while (*controlp != NULL)
2115					controlp = &(*controlp)->m_next;
2116			}
2117			cm = cmn;
2118		}
2119		if (m != NULL)
2120			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
2121		else
2122			nextrecord = so->so_rcv.sb_mb;
2123		orig_resid = 0;
2124	}
2125	if (m != NULL) {
2126		if ((flags & MSG_PEEK) == 0) {
2127			KASSERT(m->m_nextpkt == nextrecord,
2128			    ("soreceive: post-control, nextrecord !sync"));
2129			if (nextrecord == NULL) {
2130				KASSERT(so->so_rcv.sb_mb == m,
2131				    ("soreceive: post-control, sb_mb!=m"));
2132				KASSERT(so->so_rcv.sb_lastrecord == m,
2133				    ("soreceive: post-control, lastrecord!=m"));
2134			}
2135		}
2136		type = m->m_type;
2137		if (type == MT_OOBDATA)
2138			flags |= MSG_OOB;
2139	} else {
2140		if ((flags & MSG_PEEK) == 0) {
2141			KASSERT(so->so_rcv.sb_mb == nextrecord,
2142			    ("soreceive: sb_mb != nextrecord"));
2143			if (so->so_rcv.sb_mb == NULL) {
2144				KASSERT(so->so_rcv.sb_lastrecord == NULL,
				    ("soreceive: sb_lastrecord != NULL"));
2146			}
2147		}
2148	}
2149	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2150	SBLASTRECORDCHK(&so->so_rcv);
2151	SBLASTMBUFCHK(&so->so_rcv);
2152
2153	/*
2154	 * Now continue to read any data mbufs off of the head of the socket
	 * buffer until the read request is satisfied.  Note that 'type'
	 * records the type of the mbufs read so far, so that soreceive() can
	 * stop reading if the type changes; a single socket receive
	 * operation therefore returns either regular data or inline
	 * out-of-band data, but never both.
2160	 */
2161	moff = 0;
2162	offset = 0;
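	/*
	 * 'moff' is the offset into the current mbuf already consumed when
	 * MSG_PEEK leaves data in place; 'offset' counts bytes peeked so far
	 * relative to the out-of-band mark (so_oobmark).
	 */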
2163	while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0
2164	    && error == 0) {
2165		/*
2166		 * If the type of mbuf has changed since the last mbuf
2167		 * examined ('type'), end the receive operation.
2168		 */
2169		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2170		if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
2171			if (type != m->m_type)
2172				break;
2173		} else if (type == MT_OOBDATA)
2174			break;
2175		else
2176		    KASSERT(m->m_type == MT_DATA,
2177			("m->m_type == %d", m->m_type));
2178		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
2179		len = uio->uio_resid;
2180		if (so->so_oobmark && len > so->so_oobmark - offset)
2181			len = so->so_oobmark - offset;
2182		if (len > m->m_len - moff)
2183			len = m->m_len - moff;
2184		/*
2185		 * If mp is set, just pass back the mbufs.  Otherwise copy
		 * them out via the uio, then free.  The sockbuf must be
		 * consistent here (sb_mb points to the current mbuf and its
		 * m_nextpkt to the next record) before we drop the sockbuf
		 * lock; we must note any additions to the sockbuf when we
		 * reacquire it.
2190		 */
2191		if (mp == NULL) {
2192			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2193			SBLASTRECORDCHK(&so->so_rcv);
2194			SBLASTMBUFCHK(&so->so_rcv);
2195			SOCKBUF_UNLOCK(&so->so_rcv);
2196			if ((m->m_flags & M_EXTPG) != 0)
2197				error = m_unmappedtouio(m, moff, uio, (int)len);
2198			else
2199				error = uiomove(mtod(m, char *) + moff,
2200				    (int)len, uio);
2201			SOCKBUF_LOCK(&so->so_rcv);
2202			if (error) {
2203				/*
2204				 * The MT_SONAME mbuf has already been removed
2205				 * from the record, so it is necessary to
2206				 * remove the data mbufs, if any, to preserve
2207				 * the invariant in the case of PR_ADDR that
2208				 * requires MT_SONAME mbufs at the head of
2209				 * each record.
2210				 */
2211				if (pr->pr_flags & PR_ATOMIC &&
2212				    ((flags & MSG_PEEK) == 0))
2213					(void)sbdroprecord_locked(&so->so_rcv);
2214				SOCKBUF_UNLOCK(&so->so_rcv);
2215				goto release;
2216			}
2217		} else
2218			uio->uio_resid -= len;
2219		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2220		if (len == m->m_len - moff) {
2221			if (m->m_flags & M_EOR)
2222				flags |= MSG_EOR;
2223			if (flags & MSG_PEEK) {
2224				m = m->m_next;
2225				moff = 0;
2226			} else {
2227				nextrecord = m->m_nextpkt;
2228				sbfree(&so->so_rcv, m);
2229				if (mp != NULL) {
2230					m->m_nextpkt = NULL;
2231					*mp = m;
2232					mp = &m->m_next;
2233					so->so_rcv.sb_mb = m = m->m_next;
2234					*mp = NULL;
2235				} else {
2236					so->so_rcv.sb_mb = m_free(m);
2237					m = so->so_rcv.sb_mb;
2238				}
2239				sockbuf_pushsync(&so->so_rcv, nextrecord);
2240				SBLASTRECORDCHK(&so->so_rcv);
2241				SBLASTMBUFCHK(&so->so_rcv);
2242			}
2243		} else {
2244			if (flags & MSG_PEEK)
2245				moff += len;
2246			else {
2247				if (mp != NULL) {
2248					if (flags & MSG_DONTWAIT) {
2249						*mp = m_copym(m, 0, len,
2250						    M_NOWAIT);
2251						if (*mp == NULL) {
2252							/*
2253							 * m_copym() couldn't
2254							 * allocate an mbuf.
2255							 * Adjust uio_resid back
2256							 * (it was adjusted
2257							 * down by len bytes,
2258							 * which we didn't end
2259							 * up "copying" over).
2260							 */
2261							uio->uio_resid += len;
2262							break;
2263						}
2264					} else {
2265						SOCKBUF_UNLOCK(&so->so_rcv);
2266						*mp = m_copym(m, 0, len,
2267						    M_WAITOK);
2268						SOCKBUF_LOCK(&so->so_rcv);
2269					}
2270				}
2271				sbcut_locked(&so->so_rcv, len);
2272			}
2273		}
2274		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2275		if (so->so_oobmark) {
2276			if ((flags & MSG_PEEK) == 0) {
2277				so->so_oobmark -= len;
2278				if (so->so_oobmark == 0) {
2279					so->so_rcv.sb_state |= SBS_RCVATMARK;
2280					break;
2281				}
2282			} else {
2283				offset += len;
2284				if (offset == so->so_oobmark)
2285					break;
2286			}
2287		}
2288		if (flags & MSG_EOR)
2289			break;
2290		/*
		 * If the MSG_WAITALL flag is set (for a non-atomic socket), we
2292		 * must not quit until "uio->uio_resid == 0" or an error
2293		 * termination.  If a signal/timeout occurs, return with a
2294		 * short count but without error.  Keep sockbuf locked
2295		 * against other readers.
2296		 */
2297		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
2298		    !sosendallatonce(so) && nextrecord == NULL) {
2299			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2300			if (so->so_error ||
2301			    so->so_rcv.sb_state & SBS_CANTRCVMORE)
2302				break;
2303			/*
2304			 * Notify the protocol that some data has been
2305			 * drained before blocking.
2306			 */
2307			if (pr->pr_flags & PR_WANTRCVD) {
2308				SOCKBUF_UNLOCK(&so->so_rcv);
2309				VNET_SO_ASSERT(so);
2310				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
2311				SOCKBUF_LOCK(&so->so_rcv);
2312			}
2313			SBLASTRECORDCHK(&so->so_rcv);
2314			SBLASTMBUFCHK(&so->so_rcv);
2315			/*
			 * We could have received some data while we were
			 * notifying the protocol.  Skip blocking in this case.
2318			 */
2319			if (so->so_rcv.sb_mb == NULL) {
2320				error = sbwait(&so->so_rcv);
2321				if (error) {
2322					SOCKBUF_UNLOCK(&so->so_rcv);
2323					goto release;
2324				}
2325			}
2326			m = so->so_rcv.sb_mb;
2327			if (m != NULL)
2328				nextrecord = m->m_nextpkt;
2329		}
2330	}
2331
2332	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2333	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
2334		flags |= MSG_TRUNC;
2335		if ((flags & MSG_PEEK) == 0)
2336			(void) sbdroprecord_locked(&so->so_rcv);
2337	}
2338	if ((flags & MSG_PEEK) == 0) {
2339		if (m == NULL) {
2340			/*
2341			 * First part is an inline SB_EMPTY_FIXUP().  Second
2342			 * part makes sure sb_lastrecord is up-to-date if
2343			 * there is still data in the socket buffer.
2344			 */
2345			so->so_rcv.sb_mb = nextrecord;
2346			if (so->so_rcv.sb_mb == NULL) {
2347				so->so_rcv.sb_mbtail = NULL;
2348				so->so_rcv.sb_lastrecord = NULL;
2349			} else if (nextrecord->m_nextpkt == NULL)
2350				so->so_rcv.sb_lastrecord = nextrecord;
2351		}
2352		SBLASTRECORDCHK(&so->so_rcv);
2353		SBLASTMBUFCHK(&so->so_rcv);
2354		/*
2355		 * If soreceive() is being done from the socket callback,
		 * then we don't need to generate an ACK to the peer to
		 * update the window, since the ACK will be generated on
		 * return to TCP.
2358		 */
2359		if (!(flags & MSG_SOCALLBCK) &&
2360		    (pr->pr_flags & PR_WANTRCVD)) {
2361			SOCKBUF_UNLOCK(&so->so_rcv);
2362			VNET_SO_ASSERT(so);
2363			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
2364			SOCKBUF_LOCK(&so->so_rcv);
2365		}
2366	}
2367	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2368	if (orig_resid == uio->uio_resid && orig_resid &&
2369	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
2370		SOCKBUF_UNLOCK(&so->so_rcv);
2371		goto restart;
2372	}
2373	SOCKBUF_UNLOCK(&so->so_rcv);
2374
2375	if (flagsp != NULL)
2376		*flagsp |= flags;
2377release:
2378	sbunlock(&so->so_rcv);
2379	return (error);
2380}
2381
2382/*
2383 * Optimized version of soreceive() for stream (TCP) sockets.
2384 */
2385int
2386soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
2387    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2388{
2389	int len = 0, error = 0, flags, oresid;
2390	struct sockbuf *sb;
2391	struct mbuf *m, *n = NULL;
2392
2393	/* We only do stream sockets. */
2394	if (so->so_type != SOCK_STREAM)
2395		return (EINVAL);
2396	if (psa != NULL)
2397		*psa = NULL;
2398	if (flagsp != NULL)
2399		flags = *flagsp &~ MSG_EOR;
2400	else
2401		flags = 0;
2402	if (controlp != NULL)
2403		*controlp = NULL;
2404	if (flags & MSG_OOB)
2405		return (soreceive_rcvoob(so, uio, flags));
2406	if (mp0 != NULL)
2407		*mp0 = NULL;
2408
2409	sb = &so->so_rcv;
2410
2411#ifdef KERN_TLS
	 * KTLS stores TLS records as records with a control message to
2413	 * KTLS store TLS records as records with a control message to
2414	 * describe the framing.
2415	 *
2416	 * We check once here before acquiring locks to optimize the
2417	 * common case.
2418	 */
2419	if (sb->sb_tls_info != NULL)
2420		return (soreceive_generic(so, psa, uio, mp0, controlp,
2421		    flagsp));
2422#endif
2423
2424	/* Prevent other readers from entering the socket. */
2425	error = sblock(sb, SBLOCKWAIT(flags));
2426	if (error)
2427		return (error);
2428	SOCKBUF_LOCK(sb);
2429
2430#ifdef KERN_TLS
2431	if (sb->sb_tls_info != NULL) {
2432		SOCKBUF_UNLOCK(sb);
2433		sbunlock(sb);
2434		return (soreceive_generic(so, psa, uio, mp0, controlp,
2435		    flagsp));
2436	}
2437#endif
2438
2439	/* Easy one, no space to copyout anything. */
2440	if (uio->uio_resid == 0) {
2441		error = EINVAL;
2442		goto out;
2443	}
2444	oresid = uio->uio_resid;
2445
2446	/* We will never ever get anything unless we are or were connected. */
2447	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
2448		error = ENOTCONN;
2449		goto out;
2450	}
2451
2452restart:
2453	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2454
2455	/* Abort if socket has reported problems. */
2456	if (so->so_error) {
2457		if (sbavail(sb) > 0)
2458			goto deliver;
2459		if (oresid > uio->uio_resid)
2460			goto out;
2461		error = so->so_error;
2462		if (!(flags & MSG_PEEK))
2463			so->so_error = 0;
2464		goto out;
2465	}
2466
2467	/* Door is closed.  Deliver what is left, if any. */
2468	if (sb->sb_state & SBS_CANTRCVMORE) {
2469		if (sbavail(sb) > 0)
2470			goto deliver;
2471		else
2472			goto out;
2473	}
2474
2475	/* Socket buffer is empty and we shall not block. */
2476	if (sbavail(sb) == 0 &&
2477	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
2478		error = EAGAIN;
2479		goto out;
2480	}
2481
2482	/* Socket buffer got some data that we shall deliver now. */
2483	if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) &&
2484	    ((so->so_state & SS_NBIO) ||
2485	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
2486	     sbavail(sb) >= sb->sb_lowat ||
2487	     sbavail(sb) >= uio->uio_resid ||
2488	     sbavail(sb) >= sb->sb_hiwat) ) {
2489		goto deliver;
2490	}
2491
2492	/* On MSG_WAITALL we must wait until all data or error arrives. */
2493	if ((flags & MSG_WAITALL) &&
2494	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat))
2495		goto deliver;
2496
2497	/*
2498	 * Wait and block until (more) data comes in.
2499	 * NB: Drops the sockbuf lock during wait.
2500	 */
2501	error = sbwait(sb);
2502	if (error)
2503		goto out;
2504	goto restart;
2505
2506deliver:
2507	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2508	KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__));
2509	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
2510
2511	/* Statistics. */
2512	if (uio->uio_td)
2513		uio->uio_td->td_ru.ru_msgrcv++;
2514
2515	/* Fill uio until full or current end of socket buffer is reached. */
2516	len = min(uio->uio_resid, sbavail(sb));
2517	if (mp0 != NULL) {
2518		/* Dequeue as many mbufs as possible. */
2519		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
2520			if (*mp0 == NULL)
2521				*mp0 = sb->sb_mb;
2522			else
2523				m_cat(*mp0, sb->sb_mb);
2524			for (m = sb->sb_mb;
2525			     m != NULL && m->m_len <= len;
2526			     m = m->m_next) {
2527				KASSERT(!(m->m_flags & M_NOTAVAIL),
2528				    ("%s: m %p not available", __func__, m));
2529				len -= m->m_len;
2530				uio->uio_resid -= m->m_len;
2531				sbfree(sb, m);
2532				n = m;
2533			}
2534			n->m_next = NULL;
2535			sb->sb_mb = m;
2536			sb->sb_lastrecord = sb->sb_mb;
2537			if (sb->sb_mb == NULL)
2538				SB_EMPTY_FIXUP(sb);
2539		}
2540		/* Copy the remainder. */
2541		if (len > 0) {
2542			KASSERT(sb->sb_mb != NULL,
2543			    ("%s: len > 0 && sb->sb_mb empty", __func__));
2544
2545			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
2546			if (m == NULL)
2547				len = 0;	/* Don't flush data from sockbuf. */
2548			else
2549				uio->uio_resid -= len;
2550			if (*mp0 != NULL)
2551				m_cat(*mp0, m);
2552			else
2553				*mp0 = m;
2554			if (*mp0 == NULL) {
2555				error = ENOBUFS;
2556				goto out;
2557			}
2558		}
2559	} else {
2560		/* NB: Must unlock socket buffer as uiomove may sleep. */
2561		SOCKBUF_UNLOCK(sb);
2562		error = m_mbuftouio(uio, sb->sb_mb, len);
2563		SOCKBUF_LOCK(sb);
2564		if (error)
2565			goto out;
2566	}
2567	SBLASTRECORDCHK(sb);
2568	SBLASTMBUFCHK(sb);
2569
2570	/*
2571	 * Remove the delivered data from the socket buffer unless we
2572	 * were only peeking.
2573	 */
2574	if (!(flags & MSG_PEEK)) {
2575		if (len > 0)
2576			sbdrop_locked(sb, len);
2577
2578		/* Notify protocol that we drained some data. */
2579		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
2580		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
2581		     !(flags & MSG_SOCALLBCK))) {
2582			SOCKBUF_UNLOCK(sb);
2583			VNET_SO_ASSERT(so);
2584			(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
2585			SOCKBUF_LOCK(sb);
2586		}
2587	}
2588
2589	/*
2590	 * For MSG_WAITALL we may have to loop again and wait for
2591	 * more data to come in.
2592	 */
2593	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
2594		goto restart;
2595out:
2596	SOCKBUF_LOCK_ASSERT(sb);
2597	SBLASTRECORDCHK(sb);
2598	SBLASTMBUFCHK(sb);
2599	SOCKBUF_UNLOCK(sb);
2600	sbunlock(sb);
2601	return (error);
2602}
2603
2604/*
2605 * Optimized version of soreceive() for simple datagram cases from userspace.
2606 * Unlike in the stream case, we're able to drop a datagram if copyout()
2607 * fails, and because we handle datagrams atomically, we don't need to use a
2608 * sleep lock to prevent I/O interlacing.
2609 */
2610int
2611soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
2612    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2613{
2614	struct mbuf *m, *m2;
2615	int flags, error;
2616	ssize_t len;
2617	struct protosw *pr = so->so_proto;
2618	struct mbuf *nextrecord;
2619
2620	if (psa != NULL)
2621		*psa = NULL;
2622	if (controlp != NULL)
2623		*controlp = NULL;
2624	if (flagsp != NULL)
2625		flags = *flagsp &~ MSG_EOR;
2626	else
2627		flags = 0;
2628
2629	/*
2630	 * For any complicated cases, fall back to the full
2631	 * soreceive_generic().
2632	 */
2633	if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
2634		return (soreceive_generic(so, psa, uio, mp0, controlp,
2635		    flagsp));
2636
2637	/*
2638	 * Enforce restrictions on use.
2639	 */
2640	KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
2641	    ("soreceive_dgram: wantrcvd"));
2642	KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
2643	KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
2644	    ("soreceive_dgram: SBS_RCVATMARK"));
2645	KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
	    ("soreceive_dgram: PR_CONNREQUIRED"));
2647
2648	/*
2649	 * Loop blocking while waiting for a datagram.
2650	 */
2651	SOCKBUF_LOCK(&so->so_rcv);
2652	while ((m = so->so_rcv.sb_mb) == NULL) {
2653		KASSERT(sbavail(&so->so_rcv) == 0,
2654		    ("soreceive_dgram: sb_mb NULL but sbavail %u",
2655		    sbavail(&so->so_rcv)));
2656		if (so->so_error) {
2657			error = so->so_error;
2658			so->so_error = 0;
2659			SOCKBUF_UNLOCK(&so->so_rcv);
2660			return (error);
2661		}
2662		if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
2663		    uio->uio_resid == 0) {
2664			SOCKBUF_UNLOCK(&so->so_rcv);
2665			return (0);
2666		}
2667		if ((so->so_state & SS_NBIO) ||
2668		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2669			SOCKBUF_UNLOCK(&so->so_rcv);
2670			return (EWOULDBLOCK);
2671		}
2672		SBLASTRECORDCHK(&so->so_rcv);
2673		SBLASTMBUFCHK(&so->so_rcv);
2674		error = sbwait(&so->so_rcv);
2675		if (error) {
2676			SOCKBUF_UNLOCK(&so->so_rcv);
2677			return (error);
2678		}
2679	}
2680	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2681
2682	if (uio->uio_td)
2683		uio->uio_td->td_ru.ru_msgrcv++;
2684	SBLASTRECORDCHK(&so->so_rcv);
2685	SBLASTMBUFCHK(&so->so_rcv);
2686	nextrecord = m->m_nextpkt;
2687	if (nextrecord == NULL) {
2688		KASSERT(so->so_rcv.sb_lastrecord == m,
2689		    ("soreceive_dgram: lastrecord != m"));
2690	}
2691
2692	KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
2693	    ("soreceive_dgram: m_nextpkt != nextrecord"));
2694
2695	/*
2696	 * Pull 'm' and its chain off the front of the packet queue.
2697	 */
2698	so->so_rcv.sb_mb = NULL;
2699	sockbuf_pushsync(&so->so_rcv, nextrecord);
2700
2701	/*
2702	 * Walk 'm's chain and free that many bytes from the socket buffer.
2703	 */
2704	for (m2 = m; m2 != NULL; m2 = m2->m_next)
2705		sbfree(&so->so_rcv, m2);
2706
2707	/*
2708	 * Do a few last checks before we let go of the lock.
2709	 */
2710	SBLASTRECORDCHK(&so->so_rcv);
2711	SBLASTMBUFCHK(&so->so_rcv);
2712	SOCKBUF_UNLOCK(&so->so_rcv);
2713
2714	if (pr->pr_flags & PR_ADDR) {
2715		KASSERT(m->m_type == MT_SONAME,
2716		    ("m->m_type == %d", m->m_type));
2717		if (psa != NULL)
2718			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
2719			    M_NOWAIT);
2720		m = m_free(m);
2721	}
2722	if (m == NULL) {
2723		/* XXXRW: Can this happen? */
2724		return (0);
2725	}
2726
2727	/*
2728	 * Packet to copyout() is now in 'm' and it is disconnected from the
2729	 * queue.
2730	 *
2731	 * Process one or more MT_CONTROL mbufs present before any data mbufs
2732	 * in the first mbuf chain on the socket buffer.  We call into the
2733	 * protocol to perform externalization (or freeing if controlp ==
2734	 * NULL). In some cases there can be only MT_CONTROL mbufs without
2735	 * MT_DATA mbufs.
2736	 */
2737	if (m->m_type == MT_CONTROL) {
2738		struct mbuf *cm = NULL, *cmn;
2739		struct mbuf **cme = &cm;
2740
2741		do {
2742			m2 = m->m_next;
2743			m->m_next = NULL;
2744			*cme = m;
2745			cme = &(*cme)->m_next;
2746			m = m2;
2747		} while (m != NULL && m->m_type == MT_CONTROL);
2748		while (cm != NULL) {
2749			cmn = cm->m_next;
2750			cm->m_next = NULL;
2751			if (pr->pr_domain->dom_externalize != NULL) {
2752				error = (*pr->pr_domain->dom_externalize)
2753				    (cm, controlp, flags);
2754			} else if (controlp != NULL)
2755				*controlp = cm;
2756			else
2757				m_freem(cm);
2758			if (controlp != NULL) {
2759				while (*controlp != NULL)
2760					controlp = &(*controlp)->m_next;
2761			}
2762			cm = cmn;
2763		}
2764	}
2765	KASSERT(m == NULL || m->m_type == MT_DATA,
2766	    ("soreceive_dgram: !data"));
2767	while (m != NULL && uio->uio_resid > 0) {
2768		len = uio->uio_resid;
2769		if (len > m->m_len)
2770			len = m->m_len;
2771		error = uiomove(mtod(m, char *), (int)len, uio);
2772		if (error) {
2773			m_freem(m);
2774			return (error);
2775		}
2776		if (len == m->m_len)
2777			m = m_free(m);
2778		else {
2779			m->m_data += len;
2780			m->m_len -= len;
2781		}
2782	}
2783	if (m != NULL) {
2784		flags |= MSG_TRUNC;
2785		m_freem(m);
2786	}
2787	if (flagsp != NULL)
2788		*flagsp |= flags;
2789	return (0);
2790}
2791
2792int
2793soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2794    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2795{
2796	int error;
2797
2798	CURVNET_SET(so->so_vnet);
2799	if (!SOLISTENING(so))
2800		error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio,
2801		    mp0, controlp, flagsp));
2802	else
2803		error = ENOTCONN;
2804	CURVNET_RESTORE();
2805	return (error);
2806}
2807
2808int
2809soshutdown(struct socket *so, int how)
2810{
2811	struct protosw *pr = so->so_proto;
2812	int error, soerror_enotconn;
2813
2814	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2815		return (EINVAL);
2816
2817	soerror_enotconn = 0;
2818	if ((so->so_state &
2819	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
2820		/*
		 * POSIX mandates that we return ENOTCONN when shutdown(2) is
		 * invoked on a datagram socket; however, historically we
		 * would actually tear the socket down.  This is known to be
		 * leveraged by some applications to unblock a process waiting
		 * in recvXXX(2) by another process that shares the socket
		 * with it.  Try to meet both the backward-compatibility and
		 * POSIX requirements by forcing ENOTCONN but still asking
		 * the protocol to perform pru_shutdown().
2828		 */
2829		if (so->so_type != SOCK_DGRAM && !SOLISTENING(so))
2830			return (ENOTCONN);
2831		soerror_enotconn = 1;
2832	}
2833
2834	if (SOLISTENING(so)) {
2835		if (how != SHUT_WR) {
2836			SOLISTEN_LOCK(so);
2837			so->so_error = ECONNABORTED;
2838			solisten_wakeup(so);	/* unlocks so */
2839		}
2840		goto done;
2841	}
2842
2843	CURVNET_SET(so->so_vnet);
2844	if (pr->pr_usrreqs->pru_flush != NULL)
2845		(*pr->pr_usrreqs->pru_flush)(so, how);
2846	if (how != SHUT_WR)
2847		sorflush(so);
2848	if (how != SHUT_RD) {
2849		error = (*pr->pr_usrreqs->pru_shutdown)(so);
2850		wakeup(&so->so_timeo);
2851		CURVNET_RESTORE();
2852		return ((error == 0 && soerror_enotconn) ? ENOTCONN : error);
2853	}
2854	wakeup(&so->so_timeo);
2855	CURVNET_RESTORE();
2856
2857done:
2858	return (soerror_enotconn ? ENOTCONN : 0);
2859}
2860
2861void
2862sorflush(struct socket *so)
2863{
2864	struct sockbuf *sb = &so->so_rcv;
2865	struct protosw *pr = so->so_proto;
2866	struct socket aso;
2867
2868	VNET_SO_ASSERT(so);
2869
2870	/*
2871	 * In order to avoid calling dom_dispose with the socket buffer mutex
2872	 * held, and in order to generally avoid holding the lock for a long
2873	 * time, we make a copy of the socket buffer and clear the original
2874	 * (except locks, state).  The new socket buffer copy won't have
2875	 * initialized locks so we can only call routines that won't use or
2876	 * assert those locks.
2877	 *
2878	 * Dislodge threads currently blocked in receive and wait to acquire
2879	 * a lock against other simultaneous readers before clearing the
2880	 * socket buffer.  Don't let our acquire be interrupted by a signal
	 * despite any existing socket disposition on interruptible waiting.
2882	 */
2883	socantrcvmore(so);
2884	(void) sblock(sb, SBL_WAIT | SBL_NOINTR);
2885
2886	/*
2887	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
2888	 * and mutex data unchanged.
2889	 */
2890	SOCKBUF_LOCK(sb);
2891	bzero(&aso, sizeof(aso));
2892	aso.so_pcb = so->so_pcb;
2893	bcopy(&sb->sb_startzero, &aso.so_rcv.sb_startzero,
2894	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2895	bzero(&sb->sb_startzero,
2896	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2897	SOCKBUF_UNLOCK(sb);
2898	sbunlock(sb);
2899
2900	/*
2901	 * Dispose of special rights and flush the copied socket.  Don't call
2902	 * any unsafe routines (that rely on locks being initialized) on aso.
2903	 */
2904	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
2905		(*pr->pr_domain->dom_dispose)(&aso);
2906	sbrelease_internal(&aso.so_rcv, so);
2907}
2908
2909/*
 * Wrapper for running a socket helper hook (hhook).
2911 * Parameters: socket, context of the hook point, hook id.
2912 */
static inline int
2914hhook_run_socket(struct socket *so, void *hctx, int32_t h_id)
2915{
2916	struct socket_hhook_data hhook_data = {
2917		.so = so,
2918		.hctx = hctx,
2919		.m = NULL,
2920		.status = 0
2921	};
2922
2923	CURVNET_SET(so->so_vnet);
2924	HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd);
2925	CURVNET_RESTORE();
2926
2927	/* Ugly but needed, since hhooks return void for now */
2928	return (hhook_data.status);
2929}
2930
2931/*
2932 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
2933 * additional variant to handle the case where the option value needs to be
2934 * some kind of integer, but not a specific size.  In addition to their use
2935 * here, these functions are also called by the protocol-level pr_ctloutput()
2936 * routines.
2937 */
2938int
2939sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2940{
2941	size_t	valsize;
2942
2943	/*
2944	 * If the user gives us more than we wanted, we ignore it, but if we
2945	 * don't get the minimum length the caller wants, we return EINVAL.
2946	 * On success, sopt->sopt_valsize is set to however much we actually
2947	 * retrieved.
2948	 */
2949	if ((valsize = sopt->sopt_valsize) < minlen)
2950		return EINVAL;
2951	if (valsize > len)
2952		sopt->sopt_valsize = valsize = len;
2953
2954	if (sopt->sopt_td != NULL)
2955		return (copyin(sopt->sopt_val, buf, valsize));
2956
2957	bcopy(sopt->sopt_val, buf, valsize);
2958	return (0);
2959}
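
/*
 * Example (sketch): a protocol's pr_ctloutput() handler typically copies a
 * fixed-size option value in the same way the SOL_SOCKET cases in sosetopt()
 * below do:
 *
 *	int optval;
 *
 *	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
 *	if (error)
 *		return (error);
 */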
2960
2961/*
2962 * Kernel version of setsockopt(2).
2963 *
2964 * XXX: optlen is size_t, not socklen_t
2965 */
2966int
2967so_setsockopt(struct socket *so, int level, int optname, void *optval,
2968    size_t optlen)
2969{
2970	struct sockopt sopt;
2971
2972	sopt.sopt_level = level;
2973	sopt.sopt_name = optname;
2974	sopt.sopt_dir = SOPT_SET;
2975	sopt.sopt_val = optval;
2976	sopt.sopt_valsize = optlen;
2977	sopt.sopt_td = NULL;
2978	return (sosetopt(so, &sopt));
2979}
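
/*
 * Example (sketch, value hypothetical): an in-kernel consumer can adjust a
 * socket option without building a struct sockopt by hand, e.g. to grow the
 * receive buffer:
 *
 *	int val = 256 * 1024;
 *
 *	error = so_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 */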
2980
2981int
2982sosetopt(struct socket *so, struct sockopt *sopt)
2983{
2984	int	error, optval;
2985	struct	linger l;
2986	struct	timeval tv;
2987	sbintime_t val;
2988	uint32_t val32;
2989#ifdef MAC
2990	struct mac extmac;
2991#endif
2992
2993	CURVNET_SET(so->so_vnet);
2994	error = 0;
2995	if (sopt->sopt_level != SOL_SOCKET) {
2996		if (so->so_proto->pr_ctloutput != NULL)
2997			error = (*so->so_proto->pr_ctloutput)(so, sopt);
2998		else
2999			error = ENOPROTOOPT;
3000	} else {
3001		switch (sopt->sopt_name) {
3002		case SO_ACCEPTFILTER:
3003			error = accept_filt_setopt(so, sopt);
3004			if (error)
3005				goto bad;
3006			break;
3007
3008		case SO_LINGER:
3009			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
3010			if (error)
3011				goto bad;
3012			if (l.l_linger < 0 ||
3013			    l.l_linger > USHRT_MAX ||
3014			    l.l_linger > (INT_MAX / hz)) {
3015				error = EDOM;
3016				goto bad;
3017			}
3018			SOCK_LOCK(so);
3019			so->so_linger = l.l_linger;
3020			if (l.l_onoff)
3021				so->so_options |= SO_LINGER;
3022			else
3023				so->so_options &= ~SO_LINGER;
3024			SOCK_UNLOCK(so);
3025			break;
3026
3027		case SO_DEBUG:
3028		case SO_KEEPALIVE:
3029		case SO_DONTROUTE:
3030		case SO_USELOOPBACK:
3031		case SO_BROADCAST:
3032		case SO_REUSEADDR:
3033		case SO_REUSEPORT:
3034		case SO_REUSEPORT_LB:
3035		case SO_OOBINLINE:
3036		case SO_TIMESTAMP:
3037		case SO_BINTIME:
3038		case SO_NOSIGPIPE:
3039		case SO_NO_DDP:
3040		case SO_NO_OFFLOAD:
3041			error = sooptcopyin(sopt, &optval, sizeof optval,
3042			    sizeof optval);
3043			if (error)
3044				goto bad;
3045			SOCK_LOCK(so);
3046			if (optval)
3047				so->so_options |= sopt->sopt_name;
3048			else
3049				so->so_options &= ~sopt->sopt_name;
3050			SOCK_UNLOCK(so);
3051			break;
3052
3053		case SO_SETFIB:
3054			error = sooptcopyin(sopt, &optval, sizeof optval,
3055			    sizeof optval);
3056			if (error)
3057				goto bad;
3058
3059			if (optval < 0 || optval >= rt_numfibs) {
3060				error = EINVAL;
3061				goto bad;
3062			}
3063			if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
3064			   (so->so_proto->pr_domain->dom_family == PF_INET6) ||
3065			   (so->so_proto->pr_domain->dom_family == PF_ROUTE)))
3066				so->so_fibnum = optval;
3067			else
3068				so->so_fibnum = 0;
3069			break;
3070
3071		case SO_USER_COOKIE:
3072			error = sooptcopyin(sopt, &val32, sizeof val32,
3073			    sizeof val32);
3074			if (error)
3075				goto bad;
3076			so->so_user_cookie = val32;
3077			break;
3078
3079		case SO_SNDBUF:
3080		case SO_RCVBUF:
3081		case SO_SNDLOWAT:
3082		case SO_RCVLOWAT:
3083			error = sooptcopyin(sopt, &optval, sizeof optval,
3084			    sizeof optval);
3085			if (error)
3086				goto bad;
3087
3088			/*
3089			 * Values < 1 make no sense for any of these options,
3090			 * so disallow them.
3091			 */
3092			if (optval < 1) {
3093				error = EINVAL;
3094				goto bad;
3095			}
3096
3097			error = sbsetopt(so, sopt->sopt_name, optval);
3098			break;
3099
3100		case SO_SNDTIMEO:
3101		case SO_RCVTIMEO:
3102#ifdef COMPAT_FREEBSD32
3103			if (SV_CURPROC_FLAG(SV_ILP32)) {
3104				struct timeval32 tv32;
3105
3106				error = sooptcopyin(sopt, &tv32, sizeof tv32,
3107				    sizeof tv32);
3108				CP(tv32, tv, tv_sec);
3109				CP(tv32, tv, tv_usec);
3110			} else
3111#endif
3112				error = sooptcopyin(sopt, &tv, sizeof tv,
3113				    sizeof tv);
3114			if (error)
3115				goto bad;
3116			if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
3117			    tv.tv_usec >= 1000000) {
3118				error = EDOM;
3119				goto bad;
3120			}
3121			if (tv.tv_sec > INT32_MAX)
3122				val = SBT_MAX;
3123			else
3124				val = tvtosbt(tv);
3125			switch (sopt->sopt_name) {
3126			case SO_SNDTIMEO:
3127				so->so_snd.sb_timeo = val;
3128				break;
3129			case SO_RCVTIMEO:
3130				so->so_rcv.sb_timeo = val;
3131				break;
3132			}
3133			break;
3134
3135		case SO_LABEL:
3136#ifdef MAC
3137			error = sooptcopyin(sopt, &extmac, sizeof extmac,
3138			    sizeof extmac);
3139			if (error)
3140				goto bad;
3141			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
3142			    so, &extmac);
3143#else
3144			error = EOPNOTSUPP;
3145#endif
3146			break;
3147
3148		case SO_TS_CLOCK:
3149			error = sooptcopyin(sopt, &optval, sizeof optval,
3150			    sizeof optval);
3151			if (error)
3152				goto bad;
3153			if (optval < 0 || optval > SO_TS_CLOCK_MAX) {
3154				error = EINVAL;
3155				goto bad;
3156			}
3157			so->so_ts_clock = optval;
3158			break;
3159
3160		case SO_MAX_PACING_RATE:
3161			error = sooptcopyin(sopt, &val32, sizeof(val32),
3162			    sizeof(val32));
3163			if (error)
3164				goto bad;
3165			so->so_max_pacing_rate = val32;
3166			break;
3167
3168		default:
3169			if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
3170				error = hhook_run_socket(so, sopt,
3171				    HHOOK_SOCKET_OPT);
3172			else
3173				error = ENOPROTOOPT;
3174			break;
3175		}
3176		if (error == 0 && so->so_proto->pr_ctloutput != NULL)
3177			(void)(*so->so_proto->pr_ctloutput)(so, sopt);
3178	}
3179bad:
3180	CURVNET_RESTORE();
3181	return (error);
3182}
3183
3184/*
3185 * Helper routine for getsockopt.
3186 */
3187int
3188sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
3189{
3190	int	error;
3191	size_t	valsize;
3192
3193	error = 0;
3194
3195	/*
3196	 * Documented get behavior is that we always return a value, possibly
3197	 * truncated to fit in the user's buffer.  Traditional behavior is
3198	 * that we always tell the user precisely how much we copied, rather
3199	 * than something useful like the total amount we had available for
3200	 * her.  Note that this interface is not idempotent; the entire
3201	 * answer must be generated ahead of time.
3202	 */
3203	valsize = min(len, sopt->sopt_valsize);
3204	sopt->sopt_valsize = valsize;
3205	if (sopt->sopt_val != NULL) {
3206		if (sopt->sopt_td != NULL)
3207			error = copyout(buf, sopt->sopt_val, valsize);
3208		else
3209			bcopy(buf, sopt->sopt_val, valsize);
3210	}
3211	return (error);
3212}
3213
3214int
3215sogetopt(struct socket *so, struct sockopt *sopt)
3216{
3217	int	error, optval;
3218	struct	linger l;
3219	struct	timeval tv;
3220#ifdef MAC
3221	struct mac extmac;
3222#endif
3223
3224	CURVNET_SET(so->so_vnet);
3225	error = 0;
3226	if (sopt->sopt_level != SOL_SOCKET) {
3227		if (so->so_proto->pr_ctloutput != NULL)
3228			error = (*so->so_proto->pr_ctloutput)(so, sopt);
3229		else
3230			error = ENOPROTOOPT;
3231		CURVNET_RESTORE();
3232		return (error);
3233	} else {
3234		switch (sopt->sopt_name) {
3235		case SO_ACCEPTFILTER:
3236			error = accept_filt_getopt(so, sopt);
3237			break;
3238
3239		case SO_LINGER:
3240			SOCK_LOCK(so);
3241			l.l_onoff = so->so_options & SO_LINGER;
3242			l.l_linger = so->so_linger;
3243			SOCK_UNLOCK(so);
3244			error = sooptcopyout(sopt, &l, sizeof l);
3245			break;
3246
3247		case SO_USELOOPBACK:
3248		case SO_DONTROUTE:
3249		case SO_DEBUG:
3250		case SO_KEEPALIVE:
3251		case SO_REUSEADDR:
3252		case SO_REUSEPORT:
3253		case SO_REUSEPORT_LB:
3254		case SO_BROADCAST:
3255		case SO_OOBINLINE:
3256		case SO_ACCEPTCONN:
3257		case SO_TIMESTAMP:
3258		case SO_BINTIME:
3259		case SO_NOSIGPIPE:
3260		case SO_NO_DDP:
3261		case SO_NO_OFFLOAD:
3262			optval = so->so_options & sopt->sopt_name;
3263integer:
3264			error = sooptcopyout(sopt, &optval, sizeof optval);
3265			break;
3266
3267		case SO_DOMAIN:
3268			optval = so->so_proto->pr_domain->dom_family;
3269			goto integer;
3270
3271		case SO_TYPE:
3272			optval = so->so_type;
3273			goto integer;
3274
3275		case SO_PROTOCOL:
3276			optval = so->so_proto->pr_protocol;
3277			goto integer;
3278
3279		case SO_ERROR:
3280			SOCK_LOCK(so);
3281			optval = so->so_error;
3282			so->so_error = 0;
3283			SOCK_UNLOCK(so);
3284			goto integer;
3285
3286		case SO_SNDBUF:
3287			optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat :
3288			    so->so_snd.sb_hiwat;
3289			goto integer;
3290
3291		case SO_RCVBUF:
3292			optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat :
3293			    so->so_rcv.sb_hiwat;
3294			goto integer;
3295
3296		case SO_SNDLOWAT:
3297			optval = SOLISTENING(so) ? so->sol_sbsnd_lowat :
3298			    so->so_snd.sb_lowat;
3299			goto integer;
3300
3301		case SO_RCVLOWAT:
3302			optval = SOLISTENING(so) ? so->sol_sbrcv_lowat :
3303			    so->so_rcv.sb_lowat;
3304			goto integer;
3305
3306		case SO_SNDTIMEO:
3307		case SO_RCVTIMEO:
3308			tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ?
3309			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
3310#ifdef COMPAT_FREEBSD32
3311			if (SV_CURPROC_FLAG(SV_ILP32)) {
3312				struct timeval32 tv32;
3313
3314				CP(tv, tv32, tv_sec);
3315				CP(tv, tv32, tv_usec);
3316				error = sooptcopyout(sopt, &tv32, sizeof tv32);
3317			} else
3318#endif
3319				error = sooptcopyout(sopt, &tv, sizeof tv);
3320			break;
3321
3322		case SO_LABEL:
3323#ifdef MAC
3324			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
3325			    sizeof(extmac));
3326			if (error)
3327				goto bad;
3328			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
3329			    so, &extmac);
3330			if (error)
3331				goto bad;
3332			error = sooptcopyout(sopt, &extmac, sizeof extmac);
3333#else
3334			error = EOPNOTSUPP;
3335#endif
3336			break;
3337
3338		case SO_PEERLABEL:
3339#ifdef MAC
3340			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
3341			    sizeof(extmac));
3342			if (error)
3343				goto bad;
3344			error = mac_getsockopt_peerlabel(
3345			    sopt->sopt_td->td_ucred, so, &extmac);
3346			if (error)
3347				goto bad;
3348			error = sooptcopyout(sopt, &extmac, sizeof extmac);
3349#else
3350			error = EOPNOTSUPP;
3351#endif
3352			break;
3353
3354		case SO_LISTENQLIMIT:
3355			optval = SOLISTENING(so) ? so->sol_qlimit : 0;
3356			goto integer;
3357
3358		case SO_LISTENQLEN:
3359			optval = SOLISTENING(so) ? so->sol_qlen : 0;
3360			goto integer;
3361
3362		case SO_LISTENINCQLEN:
3363			optval = SOLISTENING(so) ? so->sol_incqlen : 0;
3364			goto integer;
3365
3366		case SO_TS_CLOCK:
3367			optval = so->so_ts_clock;
3368			goto integer;
3369
3370		case SO_MAX_PACING_RATE:
3371			optval = so->so_max_pacing_rate;
3372			goto integer;
3373
3374		default:
3375			if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
3376				error = hhook_run_socket(so, sopt,
3377				    HHOOK_SOCKET_OPT);
3378			else
3379				error = ENOPROTOOPT;
3380			break;
3381		}
3382	}
3383#ifdef MAC
3384bad:
3385#endif
3386	CURVNET_RESTORE();
3387	return (error);
3388}
3389
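/*
 * The following helpers let option handlers stage variable-length option
 * data in an mbuf chain: soopt_getm() allocates a chain large enough to hold
 * sopt_valsize bytes, soopt_mcopyin() fills it from the sockopt value, and
 * soopt_mcopyout() copies the result back and updates sopt_valsize.
 */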
3390int
3391soopt_getm(struct sockopt *sopt, struct mbuf **mp)
3392{
3393	struct mbuf *m, *m_prev;
3394	int sopt_size = sopt->sopt_valsize;
3395
3396	MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
3397	if (m == NULL)
3398		return ENOBUFS;
3399	if (sopt_size > MLEN) {
3400		MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
3401		if ((m->m_flags & M_EXT) == 0) {
3402			m_free(m);
3403			return ENOBUFS;
3404		}
3405		m->m_len = min(MCLBYTES, sopt_size);
3406	} else {
3407		m->m_len = min(MLEN, sopt_size);
3408	}
3409	sopt_size -= m->m_len;
3410	*mp = m;
3411	m_prev = m;
3412
3413	while (sopt_size) {
3414		MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
3415		if (m == NULL) {
3416			m_freem(*mp);
3417			return ENOBUFS;
3418		}
3419		if (sopt_size > MLEN) {
3420			MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
3421			    M_NOWAIT);
3422			if ((m->m_flags & M_EXT) == 0) {
3423				m_freem(m);
3424				m_freem(*mp);
3425				return ENOBUFS;
3426			}
3427			m->m_len = min(MCLBYTES, sopt_size);
3428		} else {
3429			m->m_len = min(MLEN, sopt_size);
3430		}
3431		sopt_size -= m->m_len;
3432		m_prev->m_next = m;
3433		m_prev = m;
3434	}
3435	return (0);
3436}
3437
3438int
3439soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
3440{
3441	struct mbuf *m0 = m;
3442
3443	if (sopt->sopt_val == NULL)
3444		return (0);
3445	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3446		if (sopt->sopt_td != NULL) {
3447			int error;
3448
3449			error = copyin(sopt->sopt_val, mtod(m, char *),
3450			    m->m_len);
3451			if (error != 0) {
3452				m_freem(m0);
3453				return(error);
3454			}
3455		} else
3456			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
3457		sopt->sopt_valsize -= m->m_len;
3458		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
3459		m = m->m_next;
3460	}
	if (m != NULL) /* should have been allocated large enough at ip6_sooptmcopyin() */
3462		panic("ip6_sooptmcopyin");
3463	return (0);
3464}
3465
3466int
3467soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
3468{
3469	struct mbuf *m0 = m;
3470	size_t valsize = 0;
3471
3472	if (sopt->sopt_val == NULL)
3473		return (0);
3474	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3475		if (sopt->sopt_td != NULL) {
3476			int error;
3477
3478			error = copyout(mtod(m, char *), sopt->sopt_val,
3479			    m->m_len);
3480			if (error != 0) {
3481				m_freem(m0);
3482				return(error);
3483			}
3484		} else
3485			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
3486		sopt->sopt_valsize -= m->m_len;
3487		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
3488		valsize += m->m_len;
3489		m = m->m_next;
3490	}
3491	if (m != NULL) {
		/* a large enough soopt buffer should have been supplied by userland */
3493		m_freem(m0);
3494		return(EINVAL);
3495	}
3496	sopt->sopt_valsize = valsize;
3497	return (0);
3498}
3499
3500/*
3501 * sohasoutofband(): protocol notifies socket layer of the arrival of new
3502 * out-of-band data, which will then notify socket consumers.
3503 */
3504void
3505sohasoutofband(struct socket *so)
3506{
3507
3508	if (so->so_sigio != NULL)
3509		pgsigio(&so->so_sigio, SIGURG, 0);
3510	selwakeuppri(&so->so_rdsel, PSOCK);
3511}
3512
3513int
3514sopoll(struct socket *so, int events, struct ucred *active_cred,
3515    struct thread *td)
3516{
3517
3518	/*
3519	 * We do not need to set or assert curvnet as long as everyone uses
3520	 * sopoll_generic().
3521	 */
3522	return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
3523	    td));
3524}
3525
3526int
3527sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
3528    struct thread *td)
3529{
3530	int revents;
3531
3532	SOCK_LOCK(so);
3533	if (SOLISTENING(so)) {
3534		if (!(events & (POLLIN | POLLRDNORM)))
3535			revents = 0;
3536		else if (!TAILQ_EMPTY(&so->sol_comp))
3537			revents = events & (POLLIN | POLLRDNORM);
3538		else if ((events & POLLINIGNEOF) == 0 && so->so_error)
3539			revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP;
3540		else {
3541			selrecord(td, &so->so_rdsel);
3542			revents = 0;
3543		}
3544	} else {
3545		revents = 0;
3546		SOCKBUF_LOCK(&so->so_snd);
3547		SOCKBUF_LOCK(&so->so_rcv);
3548		if (events & (POLLIN | POLLRDNORM))
3549			if (soreadabledata(so))
3550				revents |= events & (POLLIN | POLLRDNORM);
3551		if (events & (POLLOUT | POLLWRNORM))
3552			if (sowriteable(so))
3553				revents |= events & (POLLOUT | POLLWRNORM);
3554		if (events & (POLLPRI | POLLRDBAND))
3555			if (so->so_oobmark ||
3556			    (so->so_rcv.sb_state & SBS_RCVATMARK))
3557				revents |= events & (POLLPRI | POLLRDBAND);
3558		if ((events & POLLINIGNEOF) == 0) {
3559			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3560				revents |= events & (POLLIN | POLLRDNORM);
3561				if (so->so_snd.sb_state & SBS_CANTSENDMORE)
3562					revents |= POLLHUP;
3563			}
3564		}
3565		if (revents == 0) {
3566			if (events &
3567			    (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
3568				selrecord(td, &so->so_rdsel);
3569				so->so_rcv.sb_flags |= SB_SEL;
3570			}
3571			if (events & (POLLOUT | POLLWRNORM)) {
3572				selrecord(td, &so->so_wrsel);
3573				so->so_snd.sb_flags |= SB_SEL;
3574			}
3575		}
3576		SOCKBUF_UNLOCK(&so->so_rcv);
3577		SOCKBUF_UNLOCK(&so->so_snd);
3578	}
3579	SOCK_UNLOCK(so);
3580	return (revents);
3581}
3582
3583int
3584soo_kqfilter(struct file *fp, struct knote *kn)
3585{
3586	struct socket *so = kn->kn_fp->f_data;
3587	struct sockbuf *sb;
3588	struct knlist *knl;
3589
3590	switch (kn->kn_filter) {
3591	case EVFILT_READ:
3592		kn->kn_fop = &soread_filtops;
3593		knl = &so->so_rdsel.si_note;
3594		sb = &so->so_rcv;
3595		break;
3596	case EVFILT_WRITE:
3597		kn->kn_fop = &sowrite_filtops;
3598		knl = &so->so_wrsel.si_note;
3599		sb = &so->so_snd;
3600		break;
3601	case EVFILT_EMPTY:
3602		kn->kn_fop = &soempty_filtops;
3603		knl = &so->so_wrsel.si_note;
3604		sb = &so->so_snd;
3605		break;
3606	default:
3607		return (EINVAL);
3608	}
3609
3610	SOCK_LOCK(so);
3611	if (SOLISTENING(so)) {
3612		knlist_add(knl, kn, 1);
3613	} else {
3614		SOCKBUF_LOCK(sb);
3615		knlist_add(knl, kn, 1);
3616		sb->sb_flags |= SB_KNOTE;
3617		SOCKBUF_UNLOCK(sb);
3618	}
3619	SOCK_UNLOCK(so);
3620	return (0);
3621}
3622
3623/*
3624 * Some routines that return EOPNOTSUPP for entry points that are not
3625 * supported by a protocol.  Fill in as needed.
3626 */
3627int
3628pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
3629{
3630
3631	return EOPNOTSUPP;
3632}
3633
3634int
3635pru_aio_queue_notsupp(struct socket *so, struct kaiocb *job)
3636{
3637
3638	return EOPNOTSUPP;
3639}
3640
3641int
3642pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
3643{
3644
3645	return EOPNOTSUPP;
3646}
3647
3648int
3649pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
3650{
3651
3652	return EOPNOTSUPP;
3653}
3654
3655int
3656pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
3657    struct thread *td)
3658{
3659
3660	return EOPNOTSUPP;
3661}
3662
3663int
3664pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
3665{
3666
3667	return EOPNOTSUPP;
3668}
3669
3670int
3671pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
3672    struct thread *td)
3673{
3674
3675	return EOPNOTSUPP;
3676}
3677
3678int
3679pru_connect2_notsupp(struct socket *so1, struct socket *so2)
3680{
3681
3682	return EOPNOTSUPP;
3683}
3684
3685int
3686pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
3687    struct ifnet *ifp, struct thread *td)
3688{
3689
3690	return EOPNOTSUPP;
3691}
3692
3693int
3694pru_disconnect_notsupp(struct socket *so)
3695{
3696
3697	return EOPNOTSUPP;
3698}
3699
3700int
3701pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
3702{
3703
3704	return EOPNOTSUPP;
3705}
3706
3707int
3708pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
3709{
3710
3711	return EOPNOTSUPP;
3712}
3713
3714int
3715pru_rcvd_notsupp(struct socket *so, int flags)
3716{
3717
3718	return EOPNOTSUPP;
3719}
3720
3721int
3722pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
3723{
3724
3725	return EOPNOTSUPP;
3726}
3727
3728int
3729pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
3730    struct sockaddr *addr, struct mbuf *control, struct thread *td)
3731{
3732
3733	if (control != NULL)
3734		m_freem(control);
3735	if ((flags & PRUS_NOTREADY) == 0)
3736		m_freem(m);
3737	return (EOPNOTSUPP);
3738}
3739
3740int
3741pru_ready_notsupp(struct socket *so, struct mbuf *m, int count)
3742{
3743
3744	return (EOPNOTSUPP);
3745}
3746
3747/*
3748 * This isn't really a ``null'' operation, but it's the default one and
3749 * doesn't do anything destructive.
3750 */
3751int
3752pru_sense_null(struct socket *so, struct stat *sb)
3753{
3754
3755	sb->st_blksize = so->so_snd.sb_hiwat;
3756	return 0;
3757}
3758
3759int
3760pru_shutdown_notsupp(struct socket *so)
3761{
3762
3763	return EOPNOTSUPP;
3764}
3765
3766int
3767pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
3768{
3769
3770	return EOPNOTSUPP;
3771}
3772
3773int
3774pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
3775    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
3776{
3777
	return (EOPNOTSUPP);
3779}
3780
3781int
3782pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
3783    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3784{
3785
	return (EOPNOTSUPP);
3787}
3788
3789int
3790pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
3791    struct thread *td)
3792{
3793
	return (EOPNOTSUPP);
3795}
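
/*
 * Sketch of how a protocol might use the *_notsupp() stubs above: entry
 * points the protocol does not implement are pointed at the stubs (and at
 * pru_sense_null), while the rest go to its own handlers.  The foo_*()
 * handlers named here are hypothetical; a real protocol would reference a
 * structure like this from its protosw entry.
 */
#if 0
static struct pr_usrreqs foo_usrreqs = {
	.pru_attach =		foo_attach,		/* hypothetical */
	.pru_detach =		foo_detach,		/* hypothetical */
	.pru_send =		foo_send,		/* hypothetical */
	.pru_sockaddr =		foo_sockaddr,		/* hypothetical */
	.pru_close =		foo_close,		/* hypothetical */
	.pru_accept =		pru_accept_notsupp,
	.pru_connect =		pru_connect_notsupp,
	.pru_connect2 =		pru_connect2_notsupp,
	.pru_listen =		pru_listen_notsupp,
	.pru_rcvd =		pru_rcvd_notsupp,
	.pru_rcvoob =		pru_rcvoob_notsupp,
	.pru_sense =		pru_sense_null,
	.pru_shutdown =		pru_shutdown_notsupp,
};
#endif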
3796
3797static void
3798filt_sordetach(struct knote *kn)
3799{
3800	struct socket *so = kn->kn_fp->f_data;
3801
3802	so_rdknl_lock(so);
3803	knlist_remove(&so->so_rdsel.si_note, kn, 1);
3804	if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note))
3805		so->so_rcv.sb_flags &= ~SB_KNOTE;
3806	so_rdknl_unlock(so);
3807}
3808
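/*
 * kqueue(2) read filter for sockets.  For a listening socket, kn_data is the
 * length of the completed connection queue and the filter fires when there
 * are connections ready to be accept(2)ed.  Otherwise, kn_data is the number
 * of readable bytes in the receive buffer (excluding control data), and the
 * filter fires on EOF, on a pending socket error, or once the available data
 * reaches the low watermark (or the NOTE_LOWAT threshold, if requested).
 */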
3809/*ARGSUSED*/
3810static int
3811filt_soread(struct knote *kn, long hint)
3812{
3813	struct socket *so;
3814
3815	so = kn->kn_fp->f_data;
3816
3817	if (SOLISTENING(so)) {
3818		SOCK_LOCK_ASSERT(so);
3819		kn->kn_data = so->sol_qlen;
3820		if (so->so_error) {
3821			kn->kn_flags |= EV_EOF;
3822			kn->kn_fflags = so->so_error;
3823			return (1);
3824		}
3825		return (!TAILQ_EMPTY(&so->sol_comp));
3826	}
3827
3828	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3829
3830	kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl;
3831	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3832		kn->kn_flags |= EV_EOF;
3833		kn->kn_fflags = so->so_error;
3834		return (1);
3835	} else if (so->so_error)	/* temporary udp error */
3836		return (1);
3837
3838	if (kn->kn_sfflags & NOTE_LOWAT) {
3839		if (kn->kn_data >= kn->kn_sdata)
3840			return (1);
3841	} else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat)
3842		return (1);
3843
	/* This hook returning non-zero indicates an event, not an error. */
3845	return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD));
3846}
3847
3848static void
3849filt_sowdetach(struct knote *kn)
3850{
3851	struct socket *so = kn->kn_fp->f_data;
3852
3853	so_wrknl_lock(so);
3854	knlist_remove(&so->so_wrsel.si_note, kn, 1);
3855	if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note))
3856		so->so_snd.sb_flags &= ~SB_KNOTE;
3857	so_wrknl_unlock(so);
3858}
3859
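/*
 * kqueue(2) write filter for sockets.  kn_data is the free space in the send
 * buffer.  The filter fires on EOF or a pending socket error, stays quiet
 * while a connection-oriented socket is not yet connected, and otherwise
 * fires once the free space reaches the send low watermark (or the
 * NOTE_LOWAT threshold, if requested).  Listening sockets never fire.
 */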
3860/*ARGSUSED*/
3861static int
3862filt_sowrite(struct knote *kn, long hint)
3863{
3864	struct socket *so;
3865
3866	so = kn->kn_fp->f_data;
3867
3868	if (SOLISTENING(so))
3869		return (0);
3870
3871	SOCKBUF_LOCK_ASSERT(&so->so_snd);
3872	kn->kn_data = sbspace(&so->so_snd);
3873
3874	hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE);
3875
3876	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
3877		kn->kn_flags |= EV_EOF;
3878		kn->kn_fflags = so->so_error;
3879		return (1);
3880	} else if (so->so_error)	/* temporary udp error */
3881		return (1);
3882	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
3883	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
3884		return (0);
3885	else if (kn->kn_sfflags & NOTE_LOWAT)
3886		return (kn->kn_data >= kn->kn_sdata);
3887	else
3888		return (kn->kn_data >= so->so_snd.sb_lowat);
3889}
3890
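/*
 * kqueue(2) EVFILT_EMPTY filter: fires once the socket send buffer has fully
 * drained.  Listening sockets, which have no send buffer, always report
 * empty.
 */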
3891static int
3892filt_soempty(struct knote *kn, long hint)
3893{
3894	struct socket *so;
3895
3896	so = kn->kn_fp->f_data;
3897
3898	if (SOLISTENING(so))
3899		return (1);
3900
3901	SOCKBUF_LOCK_ASSERT(&so->so_snd);
3902	kn->kn_data = sbused(&so->so_snd);
3903
3904	if (kn->kn_data == 0)
3905		return (1);
3906	else
3907		return (0);
3908}
3909
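/*
 * Check that a socket is owned by the given uid: returns 0 on a match and
 * EPERM otherwise (including for a NULL socket).
 */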
3910int
3911socheckuid(struct socket *so, uid_t uid)
3912{
3913
3914	if (so == NULL)
3915		return (EPERM);
3916	if (so->so_cred->cr_uid != uid)
3917		return (EPERM);
3918	return (0);
3919}
3920
3921/*
 * These functions are used by protocols to notify the socket layer (and its
 * consumers) of socket state changes driven by protocol-side events.
3924 */
3925
3926/*
3927 * Procedures to manipulate state flags of socket and do appropriate wakeups.
3928 *
 * The normal sequence from the active (originating) side is that
 * soisconnecting() is called during processing of a connect() call, resulting
 * in an eventual call to soisconnected() if/when the connection is
 * established.  When the connection is torn down, soisdisconnecting() is
 * called during processing of a disconnect() call, and soisdisconnected() is
 * called when the connection to the peer is totally severed.  The semantics
 * of these routines are such that connectionless protocols can call
 * soisconnected() and soisdisconnected() only, bypassing the in-progress
 * calls when setting up a ``connection'' takes no time.
 *
 * On the passive side, a listening socket keeps two queues of child sockets:
 * sol_incomp for connections in progress and sol_comp for connections
 * already made and awaiting user acceptance.  As a protocol is preparing
 * incoming connections, it creates a socket structure queued on sol_incomp
 * by calling sonewconn().  When the connection is established,
 * soisconnected() is called, and transfers the socket structure to sol_comp,
 * making it available to accept().
 *
 * If a listening socket is closed with sockets on either sol_incomp or
 * sol_comp, these sockets are dropped.
3949 *
3950 * If higher-level protocols are implemented in the kernel, the wakeups done
3951 * here will sometimes cause software-interrupt process scheduling.
3952 */
3953void
3954soisconnecting(struct socket *so)
3955{
3956
3957	SOCK_LOCK(so);
3958	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
3959	so->so_state |= SS_ISCONNECTING;
3960	SOCK_UNLOCK(so);
3961}
3962
3963void
3964soisconnected(struct socket *so)
3965{
3966
3967	SOCK_LOCK(so);
3968	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
3969	so->so_state |= SS_ISCONNECTED;
3970
3971	if (so->so_qstate == SQ_INCOMP) {
3972		struct socket *head = so->so_listen;
3973		int ret;
3974
3975		KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so));
3976		/*
		 * Promoting a socket from the incomplete queue to the
		 * complete queue requires taking the locks in the reverse
		 * of the usual order.  We first try a trylock on the
		 * listening socket; if that fails, we go the hard way:
		 * place a reference on the listening socket, drop the
		 * socket lock, lock both in the proper order, and recheck
		 * consistency once the locks are held.
3982		 */
3983		if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
3984			soref(head);
3985			SOCK_UNLOCK(so);
3986			SOLISTEN_LOCK(head);
3987			SOCK_LOCK(so);
3988			if (__predict_false(head != so->so_listen)) {
3989				/*
			 * The socket went off the listen queue;
			 * most likely we lost a race with close(2)
			 * on the listening socket, and the socket
			 * is about to be aborted via soabort().
3993				 */
3994				SOCK_UNLOCK(so);
3995				sorele(head);
3996				return;
3997			}
3998			/* Not the last one, as so holds a ref. */
3999			refcount_release(&head->so_count);
4000		}
4001again:
4002		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
4003			TAILQ_REMOVE(&head->sol_incomp, so, so_list);
4004			head->sol_incqlen--;
4005			TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
4006			head->sol_qlen++;
4007			so->so_qstate = SQ_COMP;
4008			SOCK_UNLOCK(so);
4009			solisten_wakeup(head);	/* unlocks */
4010		} else {
4011			SOCKBUF_LOCK(&so->so_rcv);
4012			soupcall_set(so, SO_RCV,
4013			    head->sol_accept_filter->accf_callback,
4014			    head->sol_accept_filter_arg);
4015			so->so_options &= ~SO_ACCEPTFILTER;
4016			ret = head->sol_accept_filter->accf_callback(so,
4017			    head->sol_accept_filter_arg, M_NOWAIT);
4018			if (ret == SU_ISCONNECTED) {
4019				soupcall_clear(so, SO_RCV);
4020				SOCKBUF_UNLOCK(&so->so_rcv);
4021				goto again;
4022			}
4023			SOCKBUF_UNLOCK(&so->so_rcv);
4024			SOCK_UNLOCK(so);
4025			SOLISTEN_UNLOCK(head);
4026		}
4027		return;
4028	}
4029	SOCK_UNLOCK(so);
4030	wakeup(&so->so_timeo);
4031	sorwakeup(so);
4032	sowwakeup(so);
4033}
4034
4035void
4036soisdisconnecting(struct socket *so)
4037{
4038
4039	SOCK_LOCK(so);
4040	so->so_state &= ~SS_ISCONNECTING;
4041	so->so_state |= SS_ISDISCONNECTING;
4042
4043	if (!SOLISTENING(so)) {
4044		SOCKBUF_LOCK(&so->so_rcv);
4045		socantrcvmore_locked(so);
4046		SOCKBUF_LOCK(&so->so_snd);
4047		socantsendmore_locked(so);
4048	}
4049	SOCK_UNLOCK(so);
4050	wakeup(&so->so_timeo);
4051}
4052
4053void
4054soisdisconnected(struct socket *so)
4055{
4056
4057	SOCK_LOCK(so);
4058
4059	/*
	 * There is at least one reader of so_state that does not
	 * acquire the socket lock, namely soreceive_generic().  Ensure
	 * that it never sees all of the flags that track connection
	 * status cleared at once, by ordering the update with the
	 * release semantics of a thread fence.
4065	 */
4066	so->so_state |= SS_ISDISCONNECTED;
4067	atomic_thread_fence_rel();
4068	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
4069
4070	if (!SOLISTENING(so)) {
4071		SOCK_UNLOCK(so);
4072		SOCKBUF_LOCK(&so->so_rcv);
4073		socantrcvmore_locked(so);
4074		SOCKBUF_LOCK(&so->so_snd);
4075		sbdrop_locked(&so->so_snd, sbused(&so->so_snd));
4076		socantsendmore_locked(so);
4077	} else
4078		SOCK_UNLOCK(so);
4079	wakeup(&so->so_timeo);
4080}
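
/*
 * Sketch of how a protocol typically drives the notifications above.  A
 * connection-oriented protocol calls soisconnecting() from its connect
 * handler and soisconnected() once the handshake completes; teardown ends in
 * soisdisconnected().  A connectionless protocol may call soisconnected()
 * directly.  The foo_*() routines below are hypothetical.
 */
#if 0
/* Active open: pru_connect handler of a hypothetical protocol. */
static int
foo_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	error = foo_start_handshake(so, nam, td);	/* hypothetical */
	if (error == 0)
		soisconnecting(so);
	return (error);
}

/* Protocol input path, once the peer has confirmed the connection. */
static void
foo_handshake_complete(struct socket *so)
{

	soisconnected(so);
}

/* Teardown, once the connection to the peer is fully severed. */
static void
foo_connection_closed(struct socket *so)
{

	soisdisconnected(so);
}
#endif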
4081
4082/*
4083 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
4084 */
4085struct sockaddr *
4086sodupsockaddr(const struct sockaddr *sa, int mflags)
4087{
4088	struct sockaddr *sa2;
4089
4090	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
4091	if (sa2)
4092		bcopy(sa, sa2, sa->sa_len);
	return (sa2);
4094}
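
/*
 * Example usage (sketch): duplicating an address into a caller-owned buffer.
 * The copy is released with free(9) and the M_SONAME type; M_NOWAIT keeps
 * the allocation non-sleeping.  example_copy_peer() is hypothetical.
 */
#if 0
static int
example_copy_peer(const struct sockaddr *sa, struct sockaddr **out)
{
	struct sockaddr *sa2;

	sa2 = sodupsockaddr(sa, M_NOWAIT);
	if (sa2 == NULL)
		return (ENOMEM);
	*out = sa2;		/* Caller frees with free(sa2, M_SONAME). */
	return (0);
}
#endif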
4095
4096/*
4097 * Register per-socket destructor.
4098 */
4099void
4100sodtor_set(struct socket *so, so_dtor_t *func)
4101{
4102
4103	SOCK_LOCK_ASSERT(so);
4104	so->so_dtor = func;
4105}
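
/*
 * Example usage (sketch): an in-kernel consumer registering cleanup for
 * per-socket state it hangs off a socket it manages.  foo_sock_dtor() and
 * foo_state_free() are hypothetical; the socket lock must be held across
 * sodtor_set(), as it asserts.
 */
#if 0
static void
foo_sock_dtor(struct socket *so)
{

	foo_state_free(so);		/* hypothetical cleanup */
}

static void
foo_sock_init(struct socket *so)
{

	SOCK_LOCK(so);
	sodtor_set(so, foo_sock_dtor);
	SOCK_UNLOCK(so);
}
#endif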
4106
4107/*
4108 * Register per-socket buffer upcalls.
4109 */
4110void
4111soupcall_set(struct socket *so, int which, so_upcall_t func, void *arg)
4112{
4113	struct sockbuf *sb;
4114
4115	KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));
4116
4117	switch (which) {
4118	case SO_RCV:
4119		sb = &so->so_rcv;
4120		break;
4121	case SO_SND:
4122		sb = &so->so_snd;
4123		break;
4124	default:
4125		panic("soupcall_set: bad which");
4126	}
4127	SOCKBUF_LOCK_ASSERT(sb);
4128	sb->sb_upcall = func;
4129	sb->sb_upcallarg = arg;
4130	sb->sb_flags |= SB_UPCALL;
4131}
4132
4133void
4134soupcall_clear(struct socket *so, int which)
4135{
4136	struct sockbuf *sb;
4137
4138	KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));
4139
4140	switch (which) {
4141	case SO_RCV:
4142		sb = &so->so_rcv;
4143		break;
4144	case SO_SND:
4145		sb = &so->so_snd;
4146		break;
4147	default:
4148		panic("soupcall_clear: bad which");
4149	}
4150	SOCKBUF_LOCK_ASSERT(sb);
4151	KASSERT(sb->sb_upcall != NULL,
4152	    ("%s: so %p no upcall to clear", __func__, so));
4153	sb->sb_upcall = NULL;
4154	sb->sb_upcallarg = NULL;
4155	sb->sb_flags &= ~SB_UPCALL;
4156}
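
/*
 * Example usage (sketch): an in-kernel consumer registering a receive upcall
 * so that it is notified when data arrives instead of sleeping in
 * soreceive().  foo_rcv_upcall() and foo_schedule_read() are hypothetical;
 * the upcall is invoked from the socket wakeup path and must not sleep, and
 * registration requires the corresponding socket buffer lock.
 */
#if 0
static int
foo_rcv_upcall(struct socket *so, void *arg, int waitflag)
{

	foo_schedule_read(arg);		/* hypothetical deferred work */
	return (SU_OK);
}

static void
foo_watch_socket(struct socket *so, void *arg)
{

	SOCKBUF_LOCK(&so->so_rcv);
	soupcall_set(so, SO_RCV, foo_rcv_upcall, arg);
	SOCKBUF_UNLOCK(&so->so_rcv);
}

static void
foo_unwatch_socket(struct socket *so)
{

	SOCKBUF_LOCK(&so->so_rcv);
	soupcall_clear(so, SO_RCV);
	SOCKBUF_UNLOCK(&so->so_rcv);
}
#endif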
4157
4158void
4159solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg)
4160{
4161
4162	SOLISTEN_LOCK_ASSERT(so);
4163	so->sol_upcall = func;
4164	so->sol_upcallarg = arg;
4165}
4166
4167static void
4168so_rdknl_lock(void *arg)
4169{
4170	struct socket *so = arg;
4171
4172	if (SOLISTENING(so))
4173		SOCK_LOCK(so);
4174	else
4175		SOCKBUF_LOCK(&so->so_rcv);
4176}
4177
4178static void
4179so_rdknl_unlock(void *arg)
4180{
4181	struct socket *so = arg;
4182
4183	if (SOLISTENING(so))
4184		SOCK_UNLOCK(so);
4185	else
4186		SOCKBUF_UNLOCK(&so->so_rcv);
4187}
4188
4189static void
4190so_rdknl_assert_lock(void *arg, int what)
4191{
4192	struct socket *so = arg;
4193
4194	if (what == LA_LOCKED) {
4195		if (SOLISTENING(so))
4196			SOCK_LOCK_ASSERT(so);
4197		else
4198			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
4199	} else {
4200		if (SOLISTENING(so))
4201			SOCK_UNLOCK_ASSERT(so);
4202		else
4203			SOCKBUF_UNLOCK_ASSERT(&so->so_rcv);
4204	}
4205}
4206
4207static void
4208so_wrknl_lock(void *arg)
4209{
4210	struct socket *so = arg;
4211
4212	if (SOLISTENING(so))
4213		SOCK_LOCK(so);
4214	else
4215		SOCKBUF_LOCK(&so->so_snd);
4216}
4217
4218static void
4219so_wrknl_unlock(void *arg)
4220{
4221	struct socket *so = arg;
4222
4223	if (SOLISTENING(so))
4224		SOCK_UNLOCK(so);
4225	else
4226		SOCKBUF_UNLOCK(&so->so_snd);
4227}
4228
4229static void
4230so_wrknl_assert_lock(void *arg, int what)
4231{
4232	struct socket *so = arg;
4233
4234	if (what == LA_LOCKED) {
4235		if (SOLISTENING(so))
4236			SOCK_LOCK_ASSERT(so);
4237		else
4238			SOCKBUF_LOCK_ASSERT(&so->so_snd);
4239	} else {
4240		if (SOLISTENING(so))
4241			SOCK_UNLOCK_ASSERT(so);
4242		else
4243			SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
4244	}
4245}
4246
4247/*
4248 * Create an external-format (``xsocket'') structure using the information in
4249 * the kernel-format socket structure pointed to by so.  This is done to
4250 * reduce the spew of irrelevant information over this interface, to isolate
4251 * user code from changes in the kernel structure, and potentially to provide
4252 * information-hiding if we decide that some of this information should be
4253 * hidden from users.
4254 */
4255void
4256sotoxsocket(struct socket *so, struct xsocket *xso)
4257{
4258
4259	bzero(xso, sizeof(*xso));
	xso->xso_len = sizeof(*xso);
4261	xso->xso_so = (uintptr_t)so;
4262	xso->so_type = so->so_type;
4263	xso->so_options = so->so_options;
4264	xso->so_linger = so->so_linger;
4265	xso->so_state = so->so_state;
4266	xso->so_pcb = (uintptr_t)so->so_pcb;
4267	xso->xso_protocol = so->so_proto->pr_protocol;
4268	xso->xso_family = so->so_proto->pr_domain->dom_family;
4269	xso->so_timeo = so->so_timeo;
4270	xso->so_error = so->so_error;
4271	xso->so_uid = so->so_cred->cr_uid;
4272	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
4273	if (SOLISTENING(so)) {
4274		xso->so_qlen = so->sol_qlen;
4275		xso->so_incqlen = so->sol_incqlen;
4276		xso->so_qlimit = so->sol_qlimit;
4277		xso->so_oobmark = 0;
4278	} else {
4279		xso->so_state |= so->so_qstate;
4280		xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0;
4281		xso->so_oobmark = so->so_oobmark;
4282		sbtoxsockbuf(&so->so_snd, &xso->so_snd);
4283		sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
4284	}
4285}
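
/*
 * Example usage (sketch): a sysctl handler exporting a socket to userland in
 * the external format.  The pcb iteration and locking a real handler would
 * need are omitted; example_export_socket() is hypothetical.
 */
#if 0
static int
example_export_socket(struct socket *so, struct sysctl_req *req)
{
	struct xsocket xso;

	sotoxsocket(so, &xso);
	return (SYSCTL_OUT(req, &xso, sizeof(xso)));
}
#endif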
4286
4287struct sockbuf *
4288so_sockbuf_rcv(struct socket *so)
4289{
4290
4291	return (&so->so_rcv);
4292}
4293
4294struct sockbuf *
4295so_sockbuf_snd(struct socket *so)
4296{
4297
4298	return (&so->so_snd);
4299}
4300
4301int
4302so_state_get(const struct socket *so)
4303{
4304
4305	return (so->so_state);
4306}
4307
4308void
4309so_state_set(struct socket *so, int val)
4310{
4311
4312	so->so_state = val;
4313}
4314
4315int
4316so_options_get(const struct socket *so)
4317{
4318
4319	return (so->so_options);
4320}
4321
4322void
4323so_options_set(struct socket *so, int val)
4324{
4325
4326	so->so_options = val;
4327}
4328
4329int
4330so_error_get(const struct socket *so)
4331{
4332
4333	return (so->so_error);
4334}
4335
4336void
4337so_error_set(struct socket *so, int val)
4338{
4339
4340	so->so_error = val;
4341}
4342
4343int
4344so_linger_get(const struct socket *so)
4345{
4346
4347	return (so->so_linger);
4348}
4349
4350void
4351so_linger_set(struct socket *so, int val)
4352{
4353
4354	KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz),
4355	    ("%s: val %d out of range", __func__, val));
4356
4357	so->so_linger = val;
4358}
4359
4360struct protosw *
4361so_protosw_get(const struct socket *so)
4362{
4363
4364	return (so->so_proto);
4365}
4366
4367void
4368so_protosw_set(struct socket *so, struct protosw *val)
4369{
4370
4371	so->so_proto = val;
4372}
4373
4374void
4375so_sorwakeup(struct socket *so)
4376{
4377
4378	sorwakeup(so);
4379}
4380
4381void
4382so_sowwakeup(struct socket *so)
4383{
4384
4385	sowwakeup(so);
4386}
4387
4388void
4389so_sorwakeup_locked(struct socket *so)
4390{
4391
4392	sorwakeup_locked(so);
4393}
4394
4395void
4396so_sowwakeup_locked(struct socket *so)
4397{
4398
4399	sowwakeup_locked(so);
4400}
4401
4402void
4403so_lock(struct socket *so)
4404{
4405
4406	SOCK_LOCK(so);
4407}
4408
4409void
4410so_unlock(struct socket *so)
4411{
4412
4413	SOCK_UNLOCK(so);
4414}
4415