1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 *	The Regents of the University of California.
4 * Copyright (c) 2004 The FreeBSD Foundation
5 * Copyright (c) 2004-2008 Robert N. M. Watson
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
33 */
34
35/*
36 * Comments on the socket life cycle:
37 *
38 * soalloc() sets up socket layer state for a socket, called only by
39 * socreate() and sonewconn().  Socket layer private.
40 *
41 * sodealloc() tears down socket layer state for a socket, called only by
42 * sofree() and sonewconn().  Socket layer private.
43 *
44 * pru_attach() associates protocol layer state with an allocated socket;
45 * called only once, may fail, aborting socket allocation.  This is called
46 * from socreate() and sonewconn().  Socket layer private.
47 *
48 * pru_detach() disassociates protocol layer state from an attached socket,
49 * and will be called exactly once for sockets in which pru_attach() has
50 * been successfully called.  If pru_attach() returned an error,
51 * pru_detach() will not be called.  Socket layer private.
52 *
53 * pru_abort() and pru_close() notify the protocol layer that the last
54 * consumer of a socket is starting to tear down the socket, and that the
55 * protocol should terminate the connection.  Historically, pru_abort() also
56 * detached protocol state from the socket state, but this is no longer the
57 * case.
58 *
59 * socreate() creates a socket and attaches protocol state.  This is a public
60 * interface that may be used by socket layer consumers to create new
61 * sockets.
62 *
63 * sonewconn() creates a socket and attaches protocol state.  This is a
64 * public interface that may be used by protocols to create new sockets when
65 * a new connection is received; the new socket will be made available for
66 * accept() on the listening socket.
67 *
68 * soclose() destroys a socket after possibly waiting for it to disconnect.
69 * This is a public interface that socket consumers should use to close and
70 * release a socket when done with it.
71 *
72 * soabort() destroys a socket without waiting for it to disconnect (used
73 * only for incoming connections that are already partially or fully
74 * connected).  This is used internally by the socket layer when clearing
75 * listen socket queues (due to overflow or close on the listen socket), but
76 * is also a public interface protocols may use to abort connections in
77 * their incomplete listen queues should they no longer be required.  Sockets
78 * placed in completed connection listen queues should not be aborted for
79 * reasons described in the comment above the soclose() implementation.  This
80 * is not a general purpose close routine, and except in the specific
81 * circumstances described here, should not be used.
82 *
83 * sofree() will free a socket and its protocol state if all references on
84 * the socket have been released, and is the interface used to attempt to
85 * free a socket whenever a reference is removed.  It is intended to be a
86 * socket layer private interface.
87 *
88 * NOTE: In addition to socreate() and soclose(), which provide a single
89 * socket reference to the consumer to be managed as required, there are two
90 * calls to explicitly manage socket references: soref() and sorele().
91 * Currently, these are generally required only when transitioning a socket
92 * from a listen queue to a file descriptor, in order to prevent garbage
93 * collection of the socket at an untimely moment.  For a number of reasons,
94 * these interfaces are not preferred, and should be avoided.
95 *
96 * NOTE: With regard to VNETs, the general rule is that callers do not set
97 * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
98 * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
99 * and sorflush(), which are usually called from a pre-set VNET context.
100 * sopoll() currently does not need a VNET context to be set.
101 */
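
/*
 * Illustrative sketch (not part of this file's implementation): an in-kernel
 * consumer typically brackets use of a socket with socreate() and soclose().
 * The address family, protocol, and credential below are placeholders.
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
 *	    td->td_ucred, td);
 *	if (error != 0)
 *		return (error);
 *	... exchange data with sosend()/soreceive() ...
 *	soclose(so);
 */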
102
103#include <sys/cdefs.h>
104__FBSDID("$FreeBSD$");
105
106#include "opt_inet.h"
107#include "opt_inet6.h"
108#include "opt_compat.h"
109
110#include <sys/param.h>
111#include <sys/systm.h>
112#include <sys/fcntl.h>
113#include <sys/limits.h>
114#include <sys/lock.h>
115#include <sys/mac.h>
116#include <sys/malloc.h>
117#include <sys/mbuf.h>
118#include <sys/mutex.h>
119#include <sys/domain.h>
120#include <sys/file.h>			/* for struct knote */
121#include <sys/kernel.h>
122#include <sys/event.h>
123#include <sys/eventhandler.h>
124#include <sys/poll.h>
125#include <sys/proc.h>
126#include <sys/protosw.h>
127#include <sys/socket.h>
128#include <sys/socketvar.h>
129#include <sys/resourcevar.h>
130#include <net/route.h>
131#include <sys/signalvar.h>
132#include <sys/stat.h>
133#include <sys/sx.h>
134#include <sys/sysctl.h>
135#include <sys/uio.h>
136#include <sys/jail.h>
137#include <sys/syslog.h>
138#include <netinet/in.h>
139
140#include <net/vnet.h>
141
142#include <security/mac/mac_framework.h>
143
144#include <vm/uma.h>
145
146#ifdef COMPAT_FREEBSD32
147#include <sys/mount.h>
148#include <sys/sysent.h>
149#include <compat/freebsd32/freebsd32.h>
150#endif
151
152static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
153		    int flags);
154
155static void	filt_sordetach(struct knote *kn);
156static int	filt_soread(struct knote *kn, long hint);
157static void	filt_sowdetach(struct knote *kn);
158static int	filt_sowrite(struct knote *kn, long hint);
159static int	filt_solisten(struct knote *kn, long hint);
160
161static struct filterops solisten_filtops = {
162	.f_isfd = 1,
163	.f_detach = filt_sordetach,
164	.f_event = filt_solisten,
165};
166static struct filterops soread_filtops = {
167	.f_isfd = 1,
168	.f_detach = filt_sordetach,
169	.f_event = filt_soread,
170};
171static struct filterops sowrite_filtops = {
172	.f_isfd = 1,
173	.f_detach = filt_sowdetach,
174	.f_event = filt_sowrite,
175};
176
177so_gen_t	so_gencnt;	/* generation count for sockets */
178
179MALLOC_DEFINE(M_SONAME, "soname", "socket name");
180MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
181
182#define	VNET_SO_ASSERT(so)						\
183	VNET_ASSERT(curvnet != NULL,					\
184	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
185
186/*
187 * Limit on the number of connections in the listen queue waiting
188 * for accept(2).
189 * NB: The original sysctl somaxconn is still available but hidden
190 * to prevent confusion about the actual purpose of this number.
191 */
192static int somaxconn = SOMAXCONN;
193
194static int
195sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
196{
197	int error;
198	int val;
199
200	val = somaxconn;
201	error = sysctl_handle_int(oidp, &val, 0, req);
202	if (error || !req->newptr)
203		return (error);
204
205	if (val < 1 || val > USHRT_MAX)
206		return (EINVAL);
207
208	somaxconn = val;
209	return (0);
210}
211SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW,
212    0, sizeof(int), sysctl_somaxconn, "I",
213    "Maximum listen socket pending connection accept queue size");
214SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
215    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP,
216    0, sizeof(int), sysctl_somaxconn, "I",
217    "Maximum listen socket pending connection accept queue size (compat)");
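
/*
 * For example, the queue limit may be raised from userland (the value shown
 * is only an example):
 *
 *	# sysctl kern.ipc.soacceptqueue=256
 *
 * The legacy name kern.ipc.somaxconn maps to the same value but is skipped
 * in sysctl listings.
 */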
218
219static int numopensockets;
220SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
221    &numopensockets, 0, "Number of open sockets");
222
223/*
224 * accept_mtx locks down per-socket fields relating to accept queues.  See
225 * socketvar.h for an annotation of the protected fields of struct socket.
226 */
227struct mtx accept_mtx;
228MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
229
230/*
231 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
232 * so_gencnt field.
233 */
234static struct mtx so_global_mtx;
235MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
236
237/*
238 * General IPC sysctl name space, used by sockets and a variety of other IPC
239 * types.
240 */
241SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
242
243/*
244 * Initialize the socket subsystem and set up the socket
245 * memory allocator.
246 */
247static uma_zone_t socket_zone;
248int	maxsockets;
249
250static void
251socket_zone_change(void *tag)
252{
253
254	maxsockets = uma_zone_set_max(socket_zone, maxsockets);
255}
256
257static void
258socket_init(void *tag)
259{
260
261	socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
262	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
263	maxsockets = uma_zone_set_max(socket_zone, maxsockets);
264	uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached");
265	EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
266	    EVENTHANDLER_PRI_FIRST);
267}
268SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);
269
270/*
271 * Initialize maxsockets.  This SYSINIT must be run after
272 * tunable_mbinit().
273 */
274static void
275init_maxsockets(void *ignored)
276{
277
278	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
279	maxsockets = imax(maxsockets, maxfiles);
280}
281SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
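
/*
 * The limit may also be set at boot time via the loader tunable of the same
 * name, e.g. in /boot/loader.conf (the value is only an example):
 *
 *	kern.ipc.maxsockets="262144"
 */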
282
283/*
284 * Sysctl to get and set the maximum global sockets limit.  Notify protocols
285 * of the change so that they can update their dependent limits as required.
286 */
287static int
288sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
289{
290	int error, newmaxsockets;
291
292	newmaxsockets = maxsockets;
293	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
294	if (error == 0 && req->newptr) {
295		if (newmaxsockets > maxsockets &&
296		    newmaxsockets <= maxfiles) {
297			maxsockets = newmaxsockets;
298			EVENTHANDLER_INVOKE(maxsockets_change);
299		} else
300			error = EINVAL;
301	}
302	return (error);
303}
304SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
305    &maxsockets, 0, sysctl_maxsockets, "IU",
306    "Maximum number of sockets available");
307
308/*
309 * Socket operation routines.  These routines are called by the routines in
310 * sys_socket.c or from a system process, and implement the semantics of
311 * socket operations by switching out to the protocol specific routines.
312 */
313
314/*
315 * Get a socket structure from our zone, and initialize it.  Note that it
316 * would probably be better to allocate socket and PCB at the same time, but
317 * I'm not convinced that all the protocols can be easily modified to do
318 * this.
319 *
320 * soalloc() returns a socket with a ref count of 0.
321 */
322static struct socket *
323soalloc(struct vnet *vnet)
324{
325	struct socket *so;
326
327	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
328	if (so == NULL)
329		return (NULL);
330#ifdef MAC
331	if (mac_socket_init(so, M_NOWAIT) != 0) {
332		uma_zfree(socket_zone, so);
333		return (NULL);
334	}
335#endif
336	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
337	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
338	sx_init(&so->so_snd.sb_sx, "so_snd_sx");
339	sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
340	TAILQ_INIT(&so->so_aiojobq);
341	mtx_lock(&so_global_mtx);
342	so->so_gencnt = ++so_gencnt;
343	++numopensockets;
344#ifdef VIMAGE
345	VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
346	    __func__, __LINE__, so));
347	vnet->vnet_sockcnt++;
348	so->so_vnet = vnet;
349#endif
350	mtx_unlock(&so_global_mtx);
351	return (so);
352}
353
354/*
355 * Free the storage associated with a socket at the socket layer, tear down
356 * locks, labels, etc.  All protocol state is assumed already to have been
357 * torn down (and possibly never set up) by the caller.
358 */
359static void
360sodealloc(struct socket *so)
361{
362
363	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
364	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
365
366	mtx_lock(&so_global_mtx);
367	so->so_gencnt = ++so_gencnt;
368	--numopensockets;	/* Could be below, but faster here. */
369#ifdef VIMAGE
370	VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
371	    __func__, __LINE__, so));
372	so->so_vnet->vnet_sockcnt--;
373#endif
374	mtx_unlock(&so_global_mtx);
375	if (so->so_rcv.sb_hiwat)
376		(void)chgsbsize(so->so_cred->cr_uidinfo,
377		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
378	if (so->so_snd.sb_hiwat)
379		(void)chgsbsize(so->so_cred->cr_uidinfo,
380		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
381#ifdef INET
382	/* Remove accept filter if one is present. */
383	if (so->so_accf != NULL)
384		do_setopt_accept_filter(so, NULL);
385#endif
386#ifdef MAC
387	mac_socket_destroy(so);
388#endif
389	crfree(so->so_cred);
390	sx_destroy(&so->so_snd.sb_sx);
391	sx_destroy(&so->so_rcv.sb_sx);
392	SOCKBUF_LOCK_DESTROY(&so->so_snd);
393	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
394	uma_zfree(socket_zone, so);
395}
396
397/*
398 * socreate returns a socket with a ref count of 1.  The socket should be
399 * closed with soclose().
400 */
401int
402socreate(int dom, struct socket **aso, int type, int proto,
403    struct ucred *cred, struct thread *td)
404{
405	struct protosw *prp;
406	struct socket *so;
407	int error;
408
409	if (proto)
410		prp = pffindproto(dom, proto, type);
411	else
412		prp = pffindtype(dom, type);
413
414	if (prp == NULL) {
415		/* No support for domain. */
416		if (pffinddomain(dom) == NULL)
417			return (EAFNOSUPPORT);
418		/* No support for socket type. */
419		if (proto == 0 && type != 0)
420			return (EPROTOTYPE);
421		return (EPROTONOSUPPORT);
422	}
423	if (prp->pr_usrreqs->pru_attach == NULL ||
424	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
425		return (EPROTONOSUPPORT);
426
427	if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
428		return (EPROTONOSUPPORT);
429
430	if (prp->pr_type != type)
431		return (EPROTOTYPE);
432	so = soalloc(CRED_TO_VNET(cred));
433	if (so == NULL)
434		return (ENOBUFS);
435
436	TAILQ_INIT(&so->so_incomp);
437	TAILQ_INIT(&so->so_comp);
438	so->so_type = type;
439	so->so_cred = crhold(cred);
440	if ((prp->pr_domain->dom_family == PF_INET) ||
441	    (prp->pr_domain->dom_family == PF_INET6) ||
442	    (prp->pr_domain->dom_family == PF_ROUTE))
443		so->so_fibnum = td->td_proc->p_fibnum;
444	else
445		so->so_fibnum = 0;
446	so->so_proto = prp;
447#ifdef MAC
448	mac_socket_create(cred, so);
449#endif
450	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
451	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
452	so->so_count = 1;
453	/*
454	 * Auto-sizing of socket buffers is managed by the protocols and
455	 * the appropriate flags must be set in the pru_attach function.
456	 */
457	CURVNET_SET(so->so_vnet);
458	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
459	CURVNET_RESTORE();
460	if (error) {
461		KASSERT(so->so_count == 1, ("socreate: so_count %d",
462		    so->so_count));
463		so->so_count = 0;
464		sodealloc(so);
465		return (error);
466	}
467	*aso = so;
468	return (0);
469}
470
471#ifdef REGRESSION
472static int regression_sonewconn_earlytest = 1;
473SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
474    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
475#endif
476
477/*
478 * When an attempt at a new connection is noted on a socket that accepts
479 * connections, sonewconn() is called.  If the connection is possible
480 * (subject to space constraints, etc.), we allocate a new structure,
481 * properly linked into the data structure of the original socket, and
482 * return this.  Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
483 *
484 * Note: the ref count on the socket is 0 on return.
485 */
486struct socket *
487sonewconn(struct socket *head, int connstatus)
488{
489	static struct timeval lastover;
490	static struct timeval overinterval = { 60, 0 };
491	static int overcount;
492
493	struct socket *so;
494	int over;
495
496	ACCEPT_LOCK();
497	over = (head->so_qlen > 3 * head->so_qlimit / 2);
498	ACCEPT_UNLOCK();
499#ifdef REGRESSION
500	if (regression_sonewconn_earlytest && over) {
501#else
502	if (over) {
503#endif
504		overcount++;
505
506		if (ratecheck(&lastover, &overinterval)) {
507			log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: "
508			    "%i already in queue awaiting acceptance "
509			    "(%d occurrences)\n",
510			    __func__, head->so_pcb, head->so_qlen, overcount);
511
512			overcount = 0;
513		}
514
515		return (NULL);
516	}
517	VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
518	    __func__, __LINE__, head));
519	so = soalloc(head->so_vnet);
520	if (so == NULL) {
521		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
522		    "limit reached or out of memory\n",
523		    __func__, head->so_pcb);
524		return (NULL);
525	}
526	if ((head->so_options & SO_ACCEPTFILTER) != 0)
527		connstatus = 0;
528	so->so_head = head;
529	so->so_type = head->so_type;
530	so->so_options = head->so_options &~ SO_ACCEPTCONN;
531	so->so_linger = head->so_linger;
532	so->so_state = head->so_state | SS_NOFDREF;
533	so->so_fibnum = head->so_fibnum;
534	so->so_proto = head->so_proto;
535	so->so_cred = crhold(head->so_cred);
536#ifdef MAC
537	mac_socket_newconn(head, so);
538#endif
539	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
540	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
541	VNET_SO_ASSERT(head);
542	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
543		sodealloc(so);
544		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
545		    __func__, head->so_pcb);
546		return (NULL);
547	}
548	if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
549		sodealloc(so);
550		log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
551		    __func__, head->so_pcb);
552		return (NULL);
553	}
554	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
555	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
556	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
557	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
558	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
559	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
560	so->so_state |= connstatus;
561	ACCEPT_LOCK();
562	/*
563	 * The accept socket may be tearing down but we just
564	 * won a race on the ACCEPT_LOCK.
565	 * However, if sctp_peeloff() is called on a 1-to-many
566	 * style socket, the SO_ACCEPTCONN doesn't need to be set.
567	 */
568	if (!(head->so_options & SO_ACCEPTCONN) &&
569	    ((head->so_proto->pr_protocol != IPPROTO_SCTP) ||
570	     (head->so_type != SOCK_SEQPACKET))) {
571		SOCK_LOCK(so);
572		so->so_head = NULL;
573		sofree(so);		/* NB: returns ACCEPT_UNLOCK'ed. */
574		return (NULL);
575	}
576	if (connstatus) {
577		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
578		so->so_qstate |= SQ_COMP;
579		head->so_qlen++;
580	} else {
581		/*
582		 * Keep removing sockets from the head until there's room for
583		 * us to insert on the tail.  In pre-locking revisions, this
584		 * was a simple if(), but as we could be racing with other
585		 * threads and soabort() requires dropping locks, we must
586		 * loop waiting for the condition to be true.
587		 */
588		while (head->so_incqlen > head->so_qlimit) {
589			struct socket *sp;
590			sp = TAILQ_FIRST(&head->so_incomp);
591			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
592			head->so_incqlen--;
593			sp->so_qstate &= ~SQ_INCOMP;
594			sp->so_head = NULL;
595			ACCEPT_UNLOCK();
596			soabort(sp);
597			ACCEPT_LOCK();
598		}
599		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
600		so->so_qstate |= SQ_INCOMP;
601		head->so_incqlen++;
602	}
603	ACCEPT_UNLOCK();
604	if (connstatus) {
605		sorwakeup(head);
606		wakeup_one(&head->so_timeo);
607	}
608	return (so);
609}
610
611int
612sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
613{
614	int error;
615
616	CURVNET_SET(so->so_vnet);
617	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
618	CURVNET_RESTORE();
619	return (error);
620}
621
622int
623sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
624{
625	int error;
626
627	CURVNET_SET(so->so_vnet);
628	error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td);
629	CURVNET_RESTORE();
630	return (error);
631}
632
633/*
634 * solisten() transitions a socket from a non-listening state to a listening
635 * state, but can also be used to update the listen queue depth on an
636 * existing listen socket.  The protocol will call back into the sockets
637 * layer using solisten_proto_check() and solisten_proto() to check and set
638 * socket-layer listen state.  Call backs are used so that the protocol can
639 * acquire both protocol and socket layer locks in whatever order is required
640 * by the protocol.
641 *
642 * Protocol implementors are advised to hold the socket lock across the
643 * socket-layer test and set to avoid races at the socket layer.
644 */
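
/*
 * A sketch of the expected call pattern from a protocol's pru_listen
 * implementation (protocol-specific setup elided; error handling
 * abbreviated):
 *
 *	SOCK_LOCK(so);
 *	error = solisten_proto_check(so);
 *	if (error == 0) {
 *		... protocol-layer listen setup ...
 *		solisten_proto(so, backlog);
 *	}
 *	SOCK_UNLOCK(so);
 */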
645int
646solisten(struct socket *so, int backlog, struct thread *td)
647{
648	int error;
649
650	CURVNET_SET(so->so_vnet);
651	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
652	CURVNET_RESTORE();
653	return (error);
654}
655
656int
657solisten_proto_check(struct socket *so)
658{
659
660	SOCK_LOCK_ASSERT(so);
661
662	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
663	    SS_ISDISCONNECTING))
664		return (EINVAL);
665	return (0);
666}
667
668void
669solisten_proto(struct socket *so, int backlog)
670{
671
672	SOCK_LOCK_ASSERT(so);
673
674	if (backlog < 0 || backlog > somaxconn)
675		backlog = somaxconn;
676	so->so_qlimit = backlog;
677	so->so_options |= SO_ACCEPTCONN;
678}
679
680/*
681 * Evaluate the reference count and named references on a socket; if no
682 * references remain, free it.  This should be called whenever a reference is
683 * released, such as in sorele(), but also when named reference flags are
684 * cleared in socket or protocol code.
685 *
686 * sofree() will free the socket if:
687 *
688 * - There are no outstanding file descriptor references or related consumers
689 *   (so_count == 0).
690 *
691 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
692 *
693 * - The protocol does not have an outstanding strong reference on the socket
694 *   (SS_PROTOREF).
695 *
696 * - The socket is not in a completed connection queue, where a process may
697 *   have been notified of its presence; removing it from there could leave
698 *   that process blocked in accept() despite select() saying it was ready.
699 */
700void
701sofree(struct socket *so)
702{
703	struct protosw *pr = so->so_proto;
704	struct socket *head;
705
706	ACCEPT_LOCK_ASSERT();
707	SOCK_LOCK_ASSERT(so);
708
709	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
710	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
711		SOCK_UNLOCK(so);
712		ACCEPT_UNLOCK();
713		return;
714	}
715
716	head = so->so_head;
717	if (head != NULL) {
718		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
719		    (so->so_qstate & SQ_INCOMP) != 0,
720		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
721		    "SQ_INCOMP"));
722		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
723		    (so->so_qstate & SQ_INCOMP) == 0,
724		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
725		TAILQ_REMOVE(&head->so_incomp, so, so_list);
726		head->so_incqlen--;
727		so->so_qstate &= ~SQ_INCOMP;
728		so->so_head = NULL;
729	}
730	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
731	    (so->so_qstate & SQ_INCOMP) == 0,
732	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
733	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
734	if (so->so_options & SO_ACCEPTCONN) {
735		KASSERT((TAILQ_EMPTY(&so->so_comp)),
736		    ("sofree: so_comp populated"));
737		KASSERT((TAILQ_EMPTY(&so->so_incomp)),
738		    ("sofree: so_incomp populated"));
739	}
740	SOCK_UNLOCK(so);
741	ACCEPT_UNLOCK();
742
743	VNET_SO_ASSERT(so);
744	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
745		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
746	if (pr->pr_usrreqs->pru_detach != NULL)
747		(*pr->pr_usrreqs->pru_detach)(so);
748
749	/*
750	 * From this point on, we assume that no other references to this
751	 * socket exist anywhere else in the stack.  Therefore, no locks need
752	 * to be acquired or held.
753	 *
754	 * We used to do a lot of socket buffer and socket locking here, as
755	 * well as invoke sorflush() and perform wakeups.  The direct calls to
756	 * dom_dispose() and sbrelease_internal() are an inlining of what was
757	 * necessary from sorflush().
758	 *
759	 * Notice that the socket buffer and kqueue state are torn down
760	 * before calling pru_detach.  This means that protocols should not
761	 * assume they can perform socket wakeups, etc., in their detach code.
762	 */
763	sbdestroy(&so->so_snd, so);
764	sbdestroy(&so->so_rcv, so);
765	seldrain(&so->so_snd.sb_sel);
766	seldrain(&so->so_rcv.sb_sel);
767	knlist_destroy(&so->so_rcv.sb_sel.si_note);
768	knlist_destroy(&so->so_snd.sb_sel.si_note);
769	sodealloc(so);
770}
771
772/*
773 * Close a socket on last file table reference removal.  Initiate disconnect
774 * if connected.  Free socket when disconnect complete.
775 *
776 * This function will sorele() the socket.  Note that soclose() may be called
777 * prior to the ref count reaching zero.  The actual socket structure will
778 * not be freed until the ref count reaches zero.
779 */
780int
781soclose(struct socket *so)
782{
783	int error = 0;
784
785	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
786
787	CURVNET_SET(so->so_vnet);
788	funsetown(&so->so_sigio);
789	if (so->so_state & SS_ISCONNECTED) {
790		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
791			error = sodisconnect(so);
792			if (error) {
793				if (error == ENOTCONN)
794					error = 0;
795				goto drop;
796			}
797		}
798		if (so->so_options & SO_LINGER) {
799			if ((so->so_state & SS_ISDISCONNECTING) &&
800			    (so->so_state & SS_NBIO))
801				goto drop;
802			while (so->so_state & SS_ISCONNECTED) {
803				error = tsleep(&so->so_timeo,
804				    PSOCK | PCATCH, "soclos",
805				    so->so_linger * hz);
806				if (error)
807					break;
808			}
809		}
810	}
811
812drop:
813	if (so->so_proto->pr_usrreqs->pru_close != NULL)
814		(*so->so_proto->pr_usrreqs->pru_close)(so);
815	ACCEPT_LOCK();
816	if (so->so_options & SO_ACCEPTCONN) {
817		struct socket *sp;
818		/*
819		 * Prevent new additions to the accept queues due
820		 * to ACCEPT_LOCK races while we are draining them.
821		 */
822		so->so_options &= ~SO_ACCEPTCONN;
823		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
824			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
825			so->so_incqlen--;
826			sp->so_qstate &= ~SQ_INCOMP;
827			sp->so_head = NULL;
828			ACCEPT_UNLOCK();
829			soabort(sp);
830			ACCEPT_LOCK();
831		}
832		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
833			TAILQ_REMOVE(&so->so_comp, sp, so_list);
834			so->so_qlen--;
835			sp->so_qstate &= ~SQ_COMP;
836			sp->so_head = NULL;
837			ACCEPT_UNLOCK();
838			soabort(sp);
839			ACCEPT_LOCK();
840		}
841		KASSERT((TAILQ_EMPTY(&so->so_comp)),
842		    ("%s: so_comp populated", __func__));
843		KASSERT((TAILQ_EMPTY(&so->so_incomp)),
844		    ("%s: so_incomp populated", __func__));
845	}
846	SOCK_LOCK(so);
847	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
848	so->so_state |= SS_NOFDREF;
849	sorele(so);			/* NB: Returns with ACCEPT_UNLOCK(). */
850	CURVNET_RESTORE();
851	return (error);
852}
853
854/*
855 * soabort() is used to abruptly tear down a connection, such as when a
856 * resource limit is reached (listen queue depth exceeded), or if a listen
857 * socket is closed while there are sockets waiting to be accepted.
858 *
859 * This interface is tricky, because it is called on an unreferenced socket,
860 * and must be called only by a thread that has actually removed the socket
861 * from the listen queue it was on, or races with other threads are risked.
862 *
863 * This interface will call into the protocol code, so must not be called
864 * with any socket locks held.  Protocols do call it while holding their own
865 * recursible protocol mutexes, but this is something that should be subject
866 * to review in the future.
867 */
868void
869soabort(struct socket *so)
870{
871
872	/*
873	 * As far as possible, assert that no references to this
874	 * socket are held.  This is not quite the same as asserting that the
875	 * current thread is responsible for arranging for no references, but
876	 * is as close as we can get for now.
877	 */
878	KASSERT(so->so_count == 0, ("soabort: so_count"));
879	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
880	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
881	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
882	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
883	VNET_SO_ASSERT(so);
884
885	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
886		(*so->so_proto->pr_usrreqs->pru_abort)(so);
887	ACCEPT_LOCK();
888	SOCK_LOCK(so);
889	sofree(so);
890}
891
892int
893soaccept(struct socket *so, struct sockaddr **nam)
894{
895	int error;
896
897	SOCK_LOCK(so);
898	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
899	so->so_state &= ~SS_NOFDREF;
900	SOCK_UNLOCK(so);
901
902	CURVNET_SET(so->so_vnet);
903	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
904	CURVNET_RESTORE();
905	return (error);
906}
907
908int
909soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
910{
911
912	return (soconnectat(AT_FDCWD, so, nam, td));
913}
914
915int
916soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
917{
918	int error;
919
920	if (so->so_options & SO_ACCEPTCONN)
921		return (EOPNOTSUPP);
922
923	CURVNET_SET(so->so_vnet);
924	/*
925	 * If protocol is connection-based, can only connect once.
926	 * Otherwise, if connected, try to disconnect first.  This allows
927	 * user to disconnect by connecting to, e.g., a null address.
928	 */
929	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
930	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
931	    (error = sodisconnect(so)))) {
932		error = EISCONN;
933	} else {
934		/*
935		 * Prevent accumulated error from previous connection from
936		 * biting us.
937		 */
938		so->so_error = 0;
939		if (fd == AT_FDCWD) {
940			error = (*so->so_proto->pr_usrreqs->pru_connect)(so,
941			    nam, td);
942		} else {
943			error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd,
944			    so, nam, td);
945		}
946	}
947	CURVNET_RESTORE();
948
949	return (error);
950}
951
952int
953soconnect2(struct socket *so1, struct socket *so2)
954{
955	int error;
956
957	CURVNET_SET(so1->so_vnet);
958	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
959	CURVNET_RESTORE();
960	return (error);
961}
962
963int
964sodisconnect(struct socket *so)
965{
966	int error;
967
968	if ((so->so_state & SS_ISCONNECTED) == 0)
969		return (ENOTCONN);
970	if (so->so_state & SS_ISDISCONNECTING)
971		return (EALREADY);
972	VNET_SO_ASSERT(so);
973	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
974	return (error);
975}
976
977#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
978
979int
980sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
981    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
982{
983	long space;
984	ssize_t resid;
985	int clen = 0, error, dontroute;
986
987	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
988	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
989	    ("sosend_dgram: !PR_ATOMIC"));
990
991	if (uio != NULL)
992		resid = uio->uio_resid;
993	else
994		resid = top->m_pkthdr.len;
995	/*
996	 * In theory resid should be unsigned.  However, space must be
997	 * signed, as it might be less than 0 if we over-committed, and we
998	 * must use a signed comparison of space and resid.  On the other
999	 * hand, a negative resid causes us to loop sending 0-length
1000	 * segments to the protocol.
1001	 */
1002	if (resid < 0) {
1003		error = EINVAL;
1004		goto out;
1005	}
1006
1007	dontroute =
1008	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
1009	if (td != NULL)
1010		td->td_ru.ru_msgsnd++;
1011	if (control != NULL)
1012		clen = control->m_len;
1013
1014	SOCKBUF_LOCK(&so->so_snd);
1015	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1016		SOCKBUF_UNLOCK(&so->so_snd);
1017		error = EPIPE;
1018		goto out;
1019	}
1020	if (so->so_error) {
1021		error = so->so_error;
1022		so->so_error = 0;
1023		SOCKBUF_UNLOCK(&so->so_snd);
1024		goto out;
1025	}
1026	if ((so->so_state & SS_ISCONNECTED) == 0) {
1027		/*
1028		 * `sendto' and `sendmsg' are allowed on a connection-based
1029		 * socket if it supports implied connect.  Return ENOTCONN if
1030		 * not connected and no address is supplied.
1031		 */
1032		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1033		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1034			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1035			    !(resid == 0 && clen != 0)) {
1036				SOCKBUF_UNLOCK(&so->so_snd);
1037				error = ENOTCONN;
1038				goto out;
1039			}
1040		} else if (addr == NULL) {
1041			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1042				error = ENOTCONN;
1043			else
1044				error = EDESTADDRREQ;
1045			SOCKBUF_UNLOCK(&so->so_snd);
1046			goto out;
1047		}
1048	}
1049
1050	/*
1051	 * Do we need MSG_OOB support in SOCK_DGRAM?  The handling here may be a
1052	 * problem and need fixing.
1053	 */
1054	space = sbspace(&so->so_snd);
1055	if (flags & MSG_OOB)
1056		space += 1024;
1057	space -= clen;
1058	SOCKBUF_UNLOCK(&so->so_snd);
1059	if (resid > space) {
1060		error = EMSGSIZE;
1061		goto out;
1062	}
1063	if (uio == NULL) {
1064		resid = 0;
1065		if (flags & MSG_EOR)
1066			top->m_flags |= M_EOR;
1067	} else {
1068		/*
1069		 * Copy the data from userland into a mbuf chain.
1070		 * If no data is to be copied in, a single empty mbuf
1071		 * is returned.
1072		 */
1073		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1074		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1075		if (top == NULL) {
1076			error = EFAULT;	/* only possible error */
1077			goto out;
1078		}
1079		space -= resid - uio->uio_resid;
1080		resid = uio->uio_resid;
1081	}
1082	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1083	/*
1084	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1085	 * than with.
1086	 */
1087	if (dontroute) {
1088		SOCK_LOCK(so);
1089		so->so_options |= SO_DONTROUTE;
1090		SOCK_UNLOCK(so);
1091	}
1092	/*
1093	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
1094	 * of date.  We could have received a reset packet in an interrupt or
1095	 * maybe we slept while doing page faults in uiomove() etc.  We could
1096	 * probably recheck again inside the locking protection here, but
1097	 * there are probably other places that this also happens.  We must
1098	 * rethink this.
1099	 */
1100	VNET_SO_ASSERT(so);
1101	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1102	    (flags & MSG_OOB) ? PRUS_OOB :
1103	/*
1104	 * If the user set MSG_EOF, the protocol understands this flag, and there
1105	 * is nothing left to send, then use PRU_SEND_EOF instead of PRU_SEND.
1106	 */
1107	    ((flags & MSG_EOF) &&
1108	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1109	     (resid <= 0)) ?
1110		PRUS_EOF :
1111		/* If there is more to send set PRUS_MORETOCOME */
1112		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1113		top, addr, control, td);
1114	if (dontroute) {
1115		SOCK_LOCK(so);
1116		so->so_options &= ~SO_DONTROUTE;
1117		SOCK_UNLOCK(so);
1118	}
1119	clen = 0;
1120	control = NULL;
1121	top = NULL;
1122out:
1123	if (top != NULL)
1124		m_freem(top);
1125	if (control != NULL)
1126		m_freem(control);
1127	return (error);
1128}
1129
1130/*
1131 * Send on a socket.  If send must go all at once and message is larger than
1132 * send buffering, then hard error.  Lock against other senders.  If must go
1133 * all at once and not enough room now, then inform user that this would
1134 * block and do nothing.  Otherwise, if nonblocking, send as much as
1135 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1136 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1137 * in mbuf chain must be small enough to send all at once.
1138 *
1139 * Returns nonzero on error, timeout or signal; callers must check for short
1140 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1141 * on return.
1142 */
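
/*
 * A usage sketch (names are illustrative): a kernel caller with data already
 * described by a uio typically goes through the sosend() wrapper below,
 * which dispatches to the protocol's pru_sosend (commonly this function):
 *
 *	error = sosend(so, NULL, &auio, NULL, NULL, 0, td);
 */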
1143int
1144sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
1145    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1146{
1147	long space;
1148	ssize_t resid;
1149	int clen = 0, error, dontroute;
1150	int atomic = sosendallatonce(so) || top;
1151
1152	if (uio != NULL)
1153		resid = uio->uio_resid;
1154	else
1155		resid = top->m_pkthdr.len;
1156	/*
1157	 * In theory resid should be unsigned.  However, space must be
1158	 * signed, as it might be less than 0 if we over-committed, and we
1159	 * must use a signed comparison of space and resid.  On the other
1160	 * hand, a negative resid causes us to loop sending 0-length
1161	 * segments to the protocol.
1162	 *
1163	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1164	 * type sockets since that's an error.
1165	 */
1166	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1167		error = EINVAL;
1168		goto out;
1169	}
1170
1171	dontroute =
1172	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1173	    (so->so_proto->pr_flags & PR_ATOMIC);
1174	if (td != NULL)
1175		td->td_ru.ru_msgsnd++;
1176	if (control != NULL)
1177		clen = control->m_len;
1178
1179	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1180	if (error)
1181		goto out;
1182
1183restart:
1184	do {
1185		SOCKBUF_LOCK(&so->so_snd);
1186		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1187			SOCKBUF_UNLOCK(&so->so_snd);
1188			error = EPIPE;
1189			goto release;
1190		}
1191		if (so->so_error) {
1192			error = so->so_error;
1193			so->so_error = 0;
1194			SOCKBUF_UNLOCK(&so->so_snd);
1195			goto release;
1196		}
1197		if ((so->so_state & SS_ISCONNECTED) == 0) {
1198			/*
1199			 * `sendto' and `sendmsg' are allowed on a connection-
1200			 * based socket if it supports implied connect.
1201			 * Return ENOTCONN if not connected and no address is
1202			 * supplied.
1203			 */
1204			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1205			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1206				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1207				    !(resid == 0 && clen != 0)) {
1208					SOCKBUF_UNLOCK(&so->so_snd);
1209					error = ENOTCONN;
1210					goto release;
1211				}
1212			} else if (addr == NULL) {
1213				SOCKBUF_UNLOCK(&so->so_snd);
1214				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1215					error = ENOTCONN;
1216				else
1217					error = EDESTADDRREQ;
1218				goto release;
1219			}
1220		}
1221		space = sbspace(&so->so_snd);
1222		if (flags & MSG_OOB)
1223			space += 1024;
1224		if ((atomic && resid > so->so_snd.sb_hiwat) ||
1225		    clen > so->so_snd.sb_hiwat) {
1226			SOCKBUF_UNLOCK(&so->so_snd);
1227			error = EMSGSIZE;
1228			goto release;
1229		}
1230		if (space < resid + clen &&
1231		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1232			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
1233				SOCKBUF_UNLOCK(&so->so_snd);
1234				error = EWOULDBLOCK;
1235				goto release;
1236			}
1237			error = sbwait(&so->so_snd);
1238			SOCKBUF_UNLOCK(&so->so_snd);
1239			if (error)
1240				goto release;
1241			goto restart;
1242		}
1243		SOCKBUF_UNLOCK(&so->so_snd);
1244		space -= clen;
1245		do {
1246			if (uio == NULL) {
1247				resid = 0;
1248				if (flags & MSG_EOR)
1249					top->m_flags |= M_EOR;
1250			} else {
1251				/*
1252				 * Copy the data from userland into a mbuf
1253				 * chain.  If no data is to be copied in,
1254				 * a single empty mbuf is returned.
1255				 */
1256				top = m_uiotombuf(uio, M_WAITOK, space,
1257				    (atomic ? max_hdr : 0),
1258				    (atomic ? M_PKTHDR : 0) |
1259				    ((flags & MSG_EOR) ? M_EOR : 0));
1260				if (top == NULL) {
1261					error = EFAULT; /* only possible error */
1262					goto release;
1263				}
1264				space -= resid - uio->uio_resid;
1265				resid = uio->uio_resid;
1266			}
1267			if (dontroute) {
1268				SOCK_LOCK(so);
1269				so->so_options |= SO_DONTROUTE;
1270				SOCK_UNLOCK(so);
1271			}
1272			/*
1273			 * XXX all the SBS_CANTSENDMORE checks previously
1274			 * done could be out of date.  We could have received
1275			 * a reset packet in an interrupt or maybe we slept
1276			 * while doing page faults in uiomove() etc.  We
1277			 * could probably recheck again inside the locking
1278			 * protection here, but there are probably other
1279			 * places that this also happens.  We must rethink
1280			 * this.
1281			 */
1282			VNET_SO_ASSERT(so);
1283			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1284			    (flags & MSG_OOB) ? PRUS_OOB :
1285			/*
1286			 * If the user set MSG_EOF, the protocol understands
1287			 * this flag, and there is nothing left to send, then
1288			 * use PRU_SEND_EOF instead of PRU_SEND.
1289			 */
1290			    ((flags & MSG_EOF) &&
1291			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1292			     (resid <= 0)) ?
1293				PRUS_EOF :
1294			/* If there is more to send set PRUS_MORETOCOME. */
1295			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1296			    top, addr, control, td);
1297			if (dontroute) {
1298				SOCK_LOCK(so);
1299				so->so_options &= ~SO_DONTROUTE;
1300				SOCK_UNLOCK(so);
1301			}
1302			clen = 0;
1303			control = NULL;
1304			top = NULL;
1305			if (error)
1306				goto release;
1307		} while (resid && space > 0);
1308	} while (resid);
1309
1310release:
1311	sbunlock(&so->so_snd);
1312out:
1313	if (top != NULL)
1314		m_freem(top);
1315	if (control != NULL)
1316		m_freem(control);
1317	return (error);
1318}
1319
1320int
1321sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1322    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1323{
1324	int error;
1325
1326	CURVNET_SET(so->so_vnet);
1327	error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1328	    control, flags, td);
1329	CURVNET_RESTORE();
1330	return (error);
1331}
1332
1333/*
1334 * The part of soreceive() that implements reading non-inline out-of-band
1335 * data from a socket.  For more complete comments, see soreceive(), from
1336 * which this code originated.
1337 *
1338 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1339 * unable to return an mbuf chain to the caller.
1340 */
1341static int
1342soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1343{
1344	struct protosw *pr = so->so_proto;
1345	struct mbuf *m;
1346	int error;
1347
1348	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1349	VNET_SO_ASSERT(so);
1350
1351	m = m_get(M_WAITOK, MT_DATA);
1352	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1353	if (error)
1354		goto bad;
1355	do {
1356		error = uiomove(mtod(m, void *),
1357		    (int) min(uio->uio_resid, m->m_len), uio);
1358		m = m_free(m);
1359	} while (uio->uio_resid && error == 0 && m);
1360bad:
1361	if (m != NULL)
1362		m_freem(m);
1363	return (error);
1364}
1365
1366/*
1367 * Following replacement or removal of the first mbuf on the first mbuf chain
1368 * of a socket buffer, push necessary state changes back into the socket
1369 * buffer so that other consumers see the values consistently.  'nextrecord'
1370 * is the caller's locally stored value of the original value of
1371 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1372 * NOTE: 'nextrecord' may be NULL.
1373 */
1374static __inline void
1375sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1376{
1377
1378	SOCKBUF_LOCK_ASSERT(sb);
1379	/*
1380	 * First, update for the new value of nextrecord.  If necessary, make
1381	 * it the first record.
1382	 */
1383	if (sb->sb_mb != NULL)
1384		sb->sb_mb->m_nextpkt = nextrecord;
1385	else
1386		sb->sb_mb = nextrecord;
1387
1388	/*
1389	 * Now update any dependent socket buffer fields to reflect the new
1390	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
1391	 * addition of a second clause that takes care of the case where
1392	 * sb_mb has been updated, but remains the last record.
1393	 */
1394	if (sb->sb_mb == NULL) {
1395		sb->sb_mbtail = NULL;
1396		sb->sb_lastrecord = NULL;
1397	} else if (sb->sb_mb->m_nextpkt == NULL)
1398		sb->sb_lastrecord = sb->sb_mb;
1399}
1400
1401/*
1402 * Implement receive operations on a socket.  We depend on the way that
1403 * records are added to the sockbuf by sbappend.  In particular, each record
1404 * (mbufs linked through m_next) must begin with an address if the protocol
1405 * so specifies, followed by an optional mbuf or mbufs containing ancillary
1406 * data, and then zero or more mbufs of data.  In order to allow parallelism
1407 * between network receive and copying to user space, as well as avoid
1408 * sleeping with a mutex held, we release the socket buffer mutex during the
1409 * user space copy.  Although the sockbuf is locked, new data may still be
1410 * appended, and thus we must maintain consistency of the sockbuf during that
1411 * time.
1412 *
1413 * The caller may receive the data as a single mbuf chain by supplying an
1414 * mbuf **mp0 for use in returning the chain.  The uio is then used only for
1415 * the count in uio_resid.
1416 */
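
/*
 * A usage sketch (names are illustrative): a caller that wants the data as
 * an mbuf chain passes a non-NULL mp0 and a uio carrying only the requested
 * byte count:
 *
 *	struct uio auio;
 *	struct mbuf *m = NULL;
 *	int flags = 0, error;
 *
 *	bzero(&auio, sizeof(auio));
 *	auio.uio_resid = want;
 *	error = soreceive(so, NULL, &auio, &m, NULL, &flags);
 *
 * On success, up to 'want' bytes are returned as the chain 'm'.
 */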
1417int
1418soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
1419    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1420{
1421	struct mbuf *m, **mp;
1422	int flags, error, offset;
1423	ssize_t len;
1424	struct protosw *pr = so->so_proto;
1425	struct mbuf *nextrecord;
1426	int moff, type = 0;
1427	ssize_t orig_resid = uio->uio_resid;
1428
1429	mp = mp0;
1430	if (psa != NULL)
1431		*psa = NULL;
1432	if (controlp != NULL)
1433		*controlp = NULL;
1434	if (flagsp != NULL)
1435		flags = *flagsp &~ MSG_EOR;
1436	else
1437		flags = 0;
1438	if (flags & MSG_OOB)
1439		return (soreceive_rcvoob(so, uio, flags));
1440	if (mp != NULL)
1441		*mp = NULL;
1442	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1443	    && uio->uio_resid) {
1444		VNET_SO_ASSERT(so);
1445		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
1446	}
1447
1448	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1449	if (error)
1450		return (error);
1451
1452restart:
1453	SOCKBUF_LOCK(&so->so_rcv);
1454	m = so->so_rcv.sb_mb;
1455	/*
1456	 * If we have less data than requested, block awaiting more (subject
1457	 * to any timeout) if:
1458	 *   1. the current count is less than the low water mark, or
1459	 *   2. MSG_DONTWAIT is not set
1460	 */
1461	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1462	    so->so_rcv.sb_cc < uio->uio_resid) &&
1463	    so->so_rcv.sb_cc < so->so_rcv.sb_lowat &&
1464	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1465		KASSERT(m != NULL || !so->so_rcv.sb_cc,
1466		    ("receive: m == %p so->so_rcv.sb_cc == %u",
1467		    m, so->so_rcv.sb_cc));
1468		if (so->so_error) {
1469			if (m != NULL)
1470				goto dontblock;
1471			error = so->so_error;
1472			if ((flags & MSG_PEEK) == 0)
1473				so->so_error = 0;
1474			SOCKBUF_UNLOCK(&so->so_rcv);
1475			goto release;
1476		}
1477		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1478		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1479			if (m == NULL) {
1480				SOCKBUF_UNLOCK(&so->so_rcv);
1481				goto release;
1482			} else
1483				goto dontblock;
1484		}
1485		for (; m != NULL; m = m->m_next)
1486			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
1487				m = so->so_rcv.sb_mb;
1488				goto dontblock;
1489			}
1490		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1491		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1492			SOCKBUF_UNLOCK(&so->so_rcv);
1493			error = ENOTCONN;
1494			goto release;
1495		}
1496		if (uio->uio_resid == 0) {
1497			SOCKBUF_UNLOCK(&so->so_rcv);
1498			goto release;
1499		}
1500		if ((so->so_state & SS_NBIO) ||
1501		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1502			SOCKBUF_UNLOCK(&so->so_rcv);
1503			error = EWOULDBLOCK;
1504			goto release;
1505		}
1506		SBLASTRECORDCHK(&so->so_rcv);
1507		SBLASTMBUFCHK(&so->so_rcv);
1508		error = sbwait(&so->so_rcv);
1509		SOCKBUF_UNLOCK(&so->so_rcv);
1510		if (error)
1511			goto release;
1512		goto restart;
1513	}
1514dontblock:
1515	/*
1516	 * From this point onward, we maintain 'nextrecord' as a cache of the
1517	 * pointer to the next record in the socket buffer.  We must keep the
1518	 * various socket buffer pointers and local stack versions of the
1519	 * pointers in sync, pushing out modifications before dropping the
1520	 * socket buffer mutex, and re-reading them when picking it up.
1521	 *
1522	 * Otherwise, we will race with the network stack appending new data
1523	 * or records onto the socket buffer by using inconsistent/stale
1524	 * versions of the field, possibly resulting in socket buffer
1525	 * corruption.
1526	 *
1527	 * By holding the high-level sblock(), we prevent simultaneous
1528	 * readers from pulling off the front of the socket buffer.
1529	 */
1530	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1531	if (uio->uio_td)
1532		uio->uio_td->td_ru.ru_msgrcv++;
1533	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1534	SBLASTRECORDCHK(&so->so_rcv);
1535	SBLASTMBUFCHK(&so->so_rcv);
1536	nextrecord = m->m_nextpkt;
1537	if (pr->pr_flags & PR_ADDR) {
1538		KASSERT(m->m_type == MT_SONAME,
1539		    ("m->m_type == %d", m->m_type));
1540		orig_resid = 0;
1541		if (psa != NULL)
1542			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
1543			    M_NOWAIT);
1544		if (flags & MSG_PEEK) {
1545			m = m->m_next;
1546		} else {
1547			sbfree(&so->so_rcv, m);
1548			so->so_rcv.sb_mb = m_free(m);
1549			m = so->so_rcv.sb_mb;
1550			sockbuf_pushsync(&so->so_rcv, nextrecord);
1551		}
1552	}
1553
1554	/*
1555	 * Process one or more MT_CONTROL mbufs present before any data mbufs
1556	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1557	 * just copy the data; if !MSG_PEEK, we call into the protocol to
1558	 * perform externalization (or freeing if controlp == NULL).
1559	 */
1560	if (m != NULL && m->m_type == MT_CONTROL) {
1561		struct mbuf *cm = NULL, *cmn;
1562		struct mbuf **cme = &cm;
1563
1564		do {
1565			if (flags & MSG_PEEK) {
1566				if (controlp != NULL) {
1567					*controlp = m_copy(m, 0, m->m_len);
1568					controlp = &(*controlp)->m_next;
1569				}
1570				m = m->m_next;
1571			} else {
1572				sbfree(&so->so_rcv, m);
1573				so->so_rcv.sb_mb = m->m_next;
1574				m->m_next = NULL;
1575				*cme = m;
1576				cme = &(*cme)->m_next;
1577				m = so->so_rcv.sb_mb;
1578			}
1579		} while (m != NULL && m->m_type == MT_CONTROL);
1580		if ((flags & MSG_PEEK) == 0)
1581			sockbuf_pushsync(&so->so_rcv, nextrecord);
1582		while (cm != NULL) {
1583			cmn = cm->m_next;
1584			cm->m_next = NULL;
1585			if (pr->pr_domain->dom_externalize != NULL) {
1586				SOCKBUF_UNLOCK(&so->so_rcv);
1587				VNET_SO_ASSERT(so);
1588				error = (*pr->pr_domain->dom_externalize)
1589				    (cm, controlp, flags);
1590				SOCKBUF_LOCK(&so->so_rcv);
1591			} else if (controlp != NULL)
1592				*controlp = cm;
1593			else
1594				m_freem(cm);
1595			if (controlp != NULL) {
1596				orig_resid = 0;
1597				while (*controlp != NULL)
1598					controlp = &(*controlp)->m_next;
1599			}
1600			cm = cmn;
1601		}
1602		if (m != NULL)
1603			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1604		else
1605			nextrecord = so->so_rcv.sb_mb;
1606		orig_resid = 0;
1607	}
1608	if (m != NULL) {
1609		if ((flags & MSG_PEEK) == 0) {
1610			KASSERT(m->m_nextpkt == nextrecord,
1611			    ("soreceive: post-control, nextrecord !sync"));
1612			if (nextrecord == NULL) {
1613				KASSERT(so->so_rcv.sb_mb == m,
1614				    ("soreceive: post-control, sb_mb!=m"));
1615				KASSERT(so->so_rcv.sb_lastrecord == m,
1616				    ("soreceive: post-control, lastrecord!=m"));
1617			}
1618		}
1619		type = m->m_type;
1620		if (type == MT_OOBDATA)
1621			flags |= MSG_OOB;
1622	} else {
1623		if ((flags & MSG_PEEK) == 0) {
1624			KASSERT(so->so_rcv.sb_mb == nextrecord,
1625			    ("soreceive: sb_mb != nextrecord"));
1626			if (so->so_rcv.sb_mb == NULL) {
1627				KASSERT(so->so_rcv.sb_lastrecord == NULL,
1628				    ("soreceive: sb_lastrecord != NULL"));
1629			}
1630		}
1631	}
1632	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1633	SBLASTRECORDCHK(&so->so_rcv);
1634	SBLASTMBUFCHK(&so->so_rcv);
1635
1636	/*
1637	 * Now continue to read any data mbufs off of the head of the socket
1638	 * buffer until the read request is satisfied.  Note that 'type' is
1639	 * used to store the type of any mbuf reads that have happened so far
1640	 * such that soreceive() can stop reading if the type changes, which
1641	 * causes soreceive() to return only one of regular data and inline
1642	 * out-of-band data in a single socket receive operation.
1643	 */
1644	moff = 0;
1645	offset = 0;
1646	while (m != NULL && uio->uio_resid > 0 && error == 0) {
1647		/*
1648		 * If the type of mbuf has changed since the last mbuf
1649		 * examined ('type'), end the receive operation.
1650		 */
1651		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1652		if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
1653			if (type != m->m_type)
1654				break;
1655		} else if (type == MT_OOBDATA)
1656			break;
1657		else
1658		    KASSERT(m->m_type == MT_DATA,
1659			("m->m_type == %d", m->m_type));
1660		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1661		len = uio->uio_resid;
1662		if (so->so_oobmark && len > so->so_oobmark - offset)
1663			len = so->so_oobmark - offset;
1664		if (len > m->m_len - moff)
1665			len = m->m_len - moff;
1666		/*
1667		 * If mp is set, just pass back the mbufs.  Otherwise copy
1668		 * them out via the uio, then free.  Sockbuf must be
1669		 * consistent here (points to current mbuf, it points to next
1670		 * record) when we drop priority; we must note any additions
1671		 * to the sockbuf when we block interrupts again.
1672		 */
1673		if (mp == NULL) {
1674			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1675			SBLASTRECORDCHK(&so->so_rcv);
1676			SBLASTMBUFCHK(&so->so_rcv);
1677			SOCKBUF_UNLOCK(&so->so_rcv);
1678			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1679			SOCKBUF_LOCK(&so->so_rcv);
1680			if (error) {
1681				/*
1682				 * The MT_SONAME mbuf has already been removed
1683				 * from the record, so it is necessary to
1684				 * remove the data mbufs, if any, to preserve
1685				 * the invariant in the case of PR_ADDR that
1686				 * requires MT_SONAME mbufs at the head of
1687				 * each record.
1688				 */
1689				if (m && pr->pr_flags & PR_ATOMIC &&
1690				    ((flags & MSG_PEEK) == 0))
1691					(void)sbdroprecord_locked(&so->so_rcv);
1692				SOCKBUF_UNLOCK(&so->so_rcv);
1693				goto release;
1694			}
1695		} else
1696			uio->uio_resid -= len;
1697		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1698		if (len == m->m_len - moff) {
1699			if (m->m_flags & M_EOR)
1700				flags |= MSG_EOR;
1701			if (flags & MSG_PEEK) {
1702				m = m->m_next;
1703				moff = 0;
1704			} else {
1705				nextrecord = m->m_nextpkt;
1706				sbfree(&so->so_rcv, m);
1707				if (mp != NULL) {
1708					m->m_nextpkt = NULL;
1709					*mp = m;
1710					mp = &m->m_next;
1711					so->so_rcv.sb_mb = m = m->m_next;
1712					*mp = NULL;
1713				} else {
1714					so->so_rcv.sb_mb = m_free(m);
1715					m = so->so_rcv.sb_mb;
1716				}
1717				sockbuf_pushsync(&so->so_rcv, nextrecord);
1718				SBLASTRECORDCHK(&so->so_rcv);
1719				SBLASTMBUFCHK(&so->so_rcv);
1720			}
1721		} else {
1722			if (flags & MSG_PEEK)
1723				moff += len;
1724			else {
1725				if (mp != NULL) {
1726					int copy_flag;
1727
1728					if (flags & MSG_DONTWAIT)
1729						copy_flag = M_NOWAIT;
1730					else
1731						copy_flag = M_WAITOK;
1732					if (copy_flag == M_WAITOK)
1733						SOCKBUF_UNLOCK(&so->so_rcv);
1734					*mp = m_copym(m, 0, len, copy_flag);
1735					if (copy_flag == M_WAITOK)
1736						SOCKBUF_LOCK(&so->so_rcv);
1737					if (*mp == NULL) {
1738						/*
1739						 * m_copym() couldn't
1740						 * allocate an mbuf.  Adjust
1741						 * uio_resid back (it was
1742						 * adjusted down by len
1743						 * bytes, which we didn't end
1744						 * up "copying" over).
1745						 */
1746						uio->uio_resid += len;
1747						break;
1748					}
1749				}
1750				m->m_data += len;
1751				m->m_len -= len;
1752				so->so_rcv.sb_cc -= len;
1753			}
1754		}
1755		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1756		if (so->so_oobmark) {
1757			if ((flags & MSG_PEEK) == 0) {
1758				so->so_oobmark -= len;
1759				if (so->so_oobmark == 0) {
1760					so->so_rcv.sb_state |= SBS_RCVATMARK;
1761					break;
1762				}
1763			} else {
1764				offset += len;
1765				if (offset == so->so_oobmark)
1766					break;
1767			}
1768		}
1769		if (flags & MSG_EOR)
1770			break;
1771		/*
1772		 * If the MSG_WAITALL flag is set (for non-atomic sockets), we
1773		 * must not quit until "uio->uio_resid == 0" or an error
1774		 * terminates the receive.  If a signal/timeout occurs, return
1775		 * with a short count but without error.  Keep the sockbuf
1776		 * locked against other readers.
1777		 */
1778		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1779		    !sosendallatonce(so) && nextrecord == NULL) {
1780			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1781			if (so->so_error ||
1782			    so->so_rcv.sb_state & SBS_CANTRCVMORE)
1783				break;
1784			/*
1785			 * Notify the protocol that some data has been
1786			 * drained before blocking.
1787			 */
1788			if (pr->pr_flags & PR_WANTRCVD) {
1789				SOCKBUF_UNLOCK(&so->so_rcv);
1790				VNET_SO_ASSERT(so);
1791				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1792				SOCKBUF_LOCK(&so->so_rcv);
1793			}
1794			SBLASTRECORDCHK(&so->so_rcv);
1795			SBLASTMBUFCHK(&so->so_rcv);
1796			/*
1797			 * We could have received some data while we were
1798			 * notifying the protocol.  Skip blocking in this case.
1799			 */
1800			if (so->so_rcv.sb_mb == NULL) {
1801				error = sbwait(&so->so_rcv);
1802				if (error) {
1803					SOCKBUF_UNLOCK(&so->so_rcv);
1804					goto release;
1805				}
1806			}
1807			m = so->so_rcv.sb_mb;
1808			if (m != NULL)
1809				nextrecord = m->m_nextpkt;
1810		}
1811	}
1812
1813	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1814	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1815		flags |= MSG_TRUNC;
1816		if ((flags & MSG_PEEK) == 0)
1817			(void) sbdroprecord_locked(&so->so_rcv);
1818	}
1819	if ((flags & MSG_PEEK) == 0) {
1820		if (m == NULL) {
1821			/*
1822			 * First part is an inline SB_EMPTY_FIXUP().  Second
1823			 * part makes sure sb_lastrecord is up-to-date if
1824			 * there is still data in the socket buffer.
1825			 */
1826			so->so_rcv.sb_mb = nextrecord;
1827			if (so->so_rcv.sb_mb == NULL) {
1828				so->so_rcv.sb_mbtail = NULL;
1829				so->so_rcv.sb_lastrecord = NULL;
1830			} else if (nextrecord->m_nextpkt == NULL)
1831				so->so_rcv.sb_lastrecord = nextrecord;
1832		}
1833		SBLASTRECORDCHK(&so->so_rcv);
1834		SBLASTMBUFCHK(&so->so_rcv);
1835		/*
1836		 * If soreceive() is being done from the socket callback, then
1837		 * we don't need to generate an ACK to the peer to update the
1838		 * window, since the ACK will be generated on return to TCP.
1839		 */
1840		if (!(flags & MSG_SOCALLBCK) &&
1841		    (pr->pr_flags & PR_WANTRCVD)) {
1842			SOCKBUF_UNLOCK(&so->so_rcv);
1843			VNET_SO_ASSERT(so);
1844			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1845			SOCKBUF_LOCK(&so->so_rcv);
1846		}
1847	}
1848	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1849	if (orig_resid == uio->uio_resid && orig_resid &&
1850	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1851		SOCKBUF_UNLOCK(&so->so_rcv);
1852		goto restart;
1853	}
1854	SOCKBUF_UNLOCK(&so->so_rcv);
1855
1856	if (flagsp != NULL)
1857		*flagsp |= flags;
1858release:
1859	sbunlock(&so->so_rcv);
1860	return (error);
1861}
1862
1863/*
1864 * Optimized version of soreceive() for stream (TCP) sockets.
1865 * XXXAO: (MSG_WAITALL | MSG_PEEK) isn't properly handled.
1866 */
1867int
1868soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
1869    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1870{
1871	int len = 0, error = 0, flags, oresid;
1872	struct sockbuf *sb;
1873	struct mbuf *m, *n = NULL;
1874
1875	/* We only do stream sockets. */
1876	if (so->so_type != SOCK_STREAM)
1877		return (EINVAL);
1878	if (psa != NULL)
1879		*psa = NULL;
1880	if (controlp != NULL)
1881		return (EINVAL);
1882	if (flagsp != NULL)
1883		flags = *flagsp &~ MSG_EOR;
1884	else
1885		flags = 0;
1886	if (flags & MSG_OOB)
1887		return (soreceive_rcvoob(so, uio, flags));
1888	if (mp0 != NULL)
1889		*mp0 = NULL;
1890
1891	sb = &so->so_rcv;
1892
1893	/* Prevent other readers from entering the socket. */
1894	error = sblock(sb, SBLOCKWAIT(flags));
1895	if (error)
1896		return (error);
1897	SOCKBUF_LOCK(sb);
1898
1899	/* Easy one, no space to copyout anything. */
1900	if (uio->uio_resid == 0) {
1901		error = EINVAL;
1902		goto out;
1903	}
1904	oresid = uio->uio_resid;
1905
1906	/* We will never ever get anything unless we are or were connected. */
1907	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
1908		error = ENOTCONN;
1909		goto out;
1910	}
1911
1912restart:
1913	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1914
1915	/* Abort if socket has reported problems. */
1916	if (so->so_error) {
1917		if (sb->sb_cc > 0)
1918			goto deliver;
1919		if (oresid > uio->uio_resid)
1920			goto out;
1921		error = so->so_error;
1922		if (!(flags & MSG_PEEK))
1923			so->so_error = 0;
1924		goto out;
1925	}
1926
1927	/* Door is closed.  Deliver what is left, if any. */
1928	if (sb->sb_state & SBS_CANTRCVMORE) {
1929		if (sb->sb_cc > 0)
1930			goto deliver;
1931		else
1932			goto out;
1933	}
1934
1935	/* Socket buffer is empty and we shall not block. */
1936	if (sb->sb_cc == 0 &&
1937	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
1938		error = EAGAIN;
1939		goto out;
1940	}
1941
1942	/* Socket buffer got some data that we shall deliver now. */
1943	if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
1944	    ((so->so_state & SS_NBIO) ||
1945	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
1946	     sb->sb_cc >= sb->sb_lowat ||
1947	     sb->sb_cc >= uio->uio_resid ||
1948	     sb->sb_cc >= sb->sb_hiwat)) {
1949		goto deliver;
1950	}
1951
1952	/* On MSG_WAITALL we must wait until all data or error arrives. */
1953	if ((flags & MSG_WAITALL) &&
1954	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_hiwat))
1955		goto deliver;
1956
1957	/*
1958	 * Wait and block until (more) data comes in.
1959	 * NB: Drops the sockbuf lock during wait.
1960	 */
1961	error = sbwait(sb);
1962	if (error)
1963		goto out;
1964	goto restart;
1965
1966deliver:
1967	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1968	KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
1969	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
1970
1971	/* Statistics. */
1972	if (uio->uio_td)
1973		uio->uio_td->td_ru.ru_msgrcv++;
1974
1975	/* Fill uio until full or current end of socket buffer is reached. */
1976	len = min(uio->uio_resid, sb->sb_cc);
1977	if (mp0 != NULL) {
1978		/* Dequeue as many mbufs as possible. */
1979		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
1980			if (*mp0 == NULL)
1981				*mp0 = sb->sb_mb;
1982			else
1983				m_cat(*mp0, sb->sb_mb);
1984			for (m = sb->sb_mb;
1985			     m != NULL && m->m_len <= len;
1986			     m = m->m_next) {
1987				len -= m->m_len;
1988				uio->uio_resid -= m->m_len;
1989				sbfree(sb, m);
1990				n = m;
1991			}
1992			n->m_next = NULL;
1993			sb->sb_mb = m;
1994			sb->sb_lastrecord = sb->sb_mb;
1995			if (sb->sb_mb == NULL)
1996				SB_EMPTY_FIXUP(sb);
1997		}
1998		/* Copy the remainder. */
1999		if (len > 0) {
2000			KASSERT(sb->sb_mb != NULL,
2001			    ("%s: len > 0 && sb->sb_mb empty", __func__));
2002
2003			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
2004			if (m == NULL)
2005				len = 0;	/* Don't flush data from sockbuf. */
2006			else
2007				uio->uio_resid -= len;
2008			if (*mp0 != NULL)
2009				m_cat(*mp0, m);
2010			else
2011				*mp0 = m;
2012			if (*mp0 == NULL) {
2013				error = ENOBUFS;
2014				goto out;
2015			}
2016		}
2017	} else {
2018		/* NB: Must unlock socket buffer as uiomove may sleep. */
2019		SOCKBUF_UNLOCK(sb);
2020		error = m_mbuftouio(uio, sb->sb_mb, len);
2021		SOCKBUF_LOCK(sb);
2022		if (error)
2023			goto out;
2024	}
2025	SBLASTRECORDCHK(sb);
2026	SBLASTMBUFCHK(sb);
2027
2028	/*
2029	 * Remove the delivered data from the socket buffer unless we
2030	 * were only peeking.
2031	 */
2032	if (!(flags & MSG_PEEK)) {
2033		if (len > 0)
2034			sbdrop_locked(sb, len);
2035
2036		/* Notify protocol that we drained some data. */
2037		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
2038		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
2039		     !(flags & MSG_SOCALLBCK))) {
2040			SOCKBUF_UNLOCK(sb);
2041			VNET_SO_ASSERT(so);
2042			(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
2043			SOCKBUF_LOCK(sb);
2044		}
2045	}
2046
2047	/*
2048	 * For MSG_WAITALL we may have to loop again and wait for
2049	 * more data to come in.
2050	 */
2051	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
2052		goto restart;
2053out:
2054	SOCKBUF_LOCK_ASSERT(sb);
2055	SBLASTRECORDCHK(sb);
2056	SBLASTMBUFCHK(sb);
2057	SOCKBUF_UNLOCK(sb);
2058	sbunlock(sb);
2059	return (error);
2060}
2061
2062/*
2063 * Optimized version of soreceive() for simple datagram cases from userspace.
2064 * Unlike in the stream case, we're able to drop a datagram if copyout()
2065 * fails, and because we handle datagrams atomically, we don't need to use a
2066 * sleep lock to prevent I/O interlacing.
2067 */
2068int
2069soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
2070    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2071{
2072	struct mbuf *m, *m2;
2073	int flags, error;
2074	ssize_t len;
2075	struct protosw *pr = so->so_proto;
2076	struct mbuf *nextrecord;
2077
2078	if (psa != NULL)
2079		*psa = NULL;
2080	if (controlp != NULL)
2081		*controlp = NULL;
2082	if (flagsp != NULL)
2083		flags = *flagsp &~ MSG_EOR;
2084	else
2085		flags = 0;
2086
2087	/*
2088	 * For any complicated cases, fall back to the full
2089	 * soreceive_generic().
2090	 */
2091	if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
2092		return (soreceive_generic(so, psa, uio, mp0, controlp,
2093		    flagsp));
2094
2095	/*
2096	 * Enforce restrictions on use.
2097	 */
2098	KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
2099	    ("soreceive_dgram: wantrcvd"));
2100	KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
2101	KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
2102	    ("soreceive_dgram: SBS_RCVATMARK"));
2103	KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
2104	    ("soreceive_dgram: PR_CONNREQUIRED"));
2105
2106	/*
2107	 * Loop blocking while waiting for a datagram.
2108	 */
2109	SOCKBUF_LOCK(&so->so_rcv);
2110	while ((m = so->so_rcv.sb_mb) == NULL) {
2111		KASSERT(so->so_rcv.sb_cc == 0,
2112		    ("soreceive_dgram: sb_mb NULL but sb_cc %u",
2113		    so->so_rcv.sb_cc));
2114		if (so->so_error) {
2115			error = so->so_error;
2116			so->so_error = 0;
2117			SOCKBUF_UNLOCK(&so->so_rcv);
2118			return (error);
2119		}
2120		if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
2121		    uio->uio_resid == 0) {
2122			SOCKBUF_UNLOCK(&so->so_rcv);
2123			return (0);
2124		}
2125		if ((so->so_state & SS_NBIO) ||
2126		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2127			SOCKBUF_UNLOCK(&so->so_rcv);
2128			return (EWOULDBLOCK);
2129		}
2130		SBLASTRECORDCHK(&so->so_rcv);
2131		SBLASTMBUFCHK(&so->so_rcv);
2132		error = sbwait(&so->so_rcv);
2133		if (error) {
2134			SOCKBUF_UNLOCK(&so->so_rcv);
2135			return (error);
2136		}
2137	}
2138	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2139
2140	if (uio->uio_td)
2141		uio->uio_td->td_ru.ru_msgrcv++;
2142	SBLASTRECORDCHK(&so->so_rcv);
2143	SBLASTMBUFCHK(&so->so_rcv);
2144	nextrecord = m->m_nextpkt;
2145	if (nextrecord == NULL) {
2146		KASSERT(so->so_rcv.sb_lastrecord == m,
2147		    ("soreceive_dgram: lastrecord != m"));
2148	}
2149
2150	KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
2151	    ("soreceive_dgram: m_nextpkt != nextrecord"));
2152
2153	/*
2154	 * Pull 'm' and its chain off the front of the packet queue.
2155	 */
2156	so->so_rcv.sb_mb = NULL;
2157	sockbuf_pushsync(&so->so_rcv, nextrecord);
2158
2159	/*
2160	 * Walk 'm's chain and free that many bytes from the socket buffer.
2161	 */
2162	for (m2 = m; m2 != NULL; m2 = m2->m_next)
2163		sbfree(&so->so_rcv, m2);
2164
2165	/*
2166	 * Do a few last checks before we let go of the lock.
2167	 */
2168	SBLASTRECORDCHK(&so->so_rcv);
2169	SBLASTMBUFCHK(&so->so_rcv);
2170	SOCKBUF_UNLOCK(&so->so_rcv);
2171
2172	if (pr->pr_flags & PR_ADDR) {
2173		KASSERT(m->m_type == MT_SONAME,
2174		    ("m->m_type == %d", m->m_type));
2175		if (psa != NULL)
2176			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
2177			    M_NOWAIT);
2178		m = m_free(m);
2179	}
2180	if (m == NULL) {
2181		/* XXXRW: Can this happen? */
2182		return (0);
2183	}
2184
2185	/*
2186	 * Packet to copyout() is now in 'm' and it is disconnected from the
2187	 * queue.
2188	 *
2189	 * Process one or more MT_CONTROL mbufs present before any data mbufs
2190	 * in the first mbuf chain on the socket buffer.  We call into the
2191	 * protocol to perform externalization (or freeing if controlp ==
2192	 * NULL).
2193	 */
2194	if (m->m_type == MT_CONTROL) {
2195		struct mbuf *cm = NULL, *cmn;
2196		struct mbuf **cme = &cm;
2197
2198		do {
2199			m2 = m->m_next;
2200			m->m_next = NULL;
2201			*cme = m;
2202			cme = &(*cme)->m_next;
2203			m = m2;
2204		} while (m != NULL && m->m_type == MT_CONTROL);
2205		while (cm != NULL) {
2206			cmn = cm->m_next;
2207			cm->m_next = NULL;
2208			if (pr->pr_domain->dom_externalize != NULL) {
2209				error = (*pr->pr_domain->dom_externalize)
2210				    (cm, controlp, flags);
2211			} else if (controlp != NULL)
2212				*controlp = cm;
2213			else
2214				m_freem(cm);
2215			if (controlp != NULL) {
2216				while (*controlp != NULL)
2217					controlp = &(*controlp)->m_next;
2218			}
2219			cm = cmn;
2220		}
2221	}
2222	KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));
2223
2224	while (m != NULL && uio->uio_resid > 0) {
2225		len = uio->uio_resid;
2226		if (len > m->m_len)
2227			len = m->m_len;
2228		error = uiomove(mtod(m, char *), (int)len, uio);
2229		if (error) {
2230			m_freem(m);
2231			return (error);
2232		}
2233		if (len == m->m_len)
2234			m = m_free(m);
2235		else {
2236			m->m_data += len;
2237			m->m_len -= len;
2238		}
2239	}
2240	if (m != NULL)
2241		flags |= MSG_TRUNC;
2242	m_freem(m);
2243	if (flagsp != NULL)
2244		*flagsp |= flags;
2245	return (0);
2246}
2247
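/*
 * soreceive() is a thin dispatcher: it sets the socket's vnet and calls the
 * protocol's pru_soreceive entry point, which is typically one of
 * soreceive_generic(), soreceive_stream() or soreceive_dgram() above.
 */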
2248int
2249soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2250    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2251{
2252	int error;
2253
2254	CURVNET_SET(so->so_vnet);
2255	error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
2256	    controlp, flagsp));
2257	CURVNET_RESTORE();
2258	return (error);
2259}
2260
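/*
 * soshutdown() implements shutdown(2): for SHUT_RD and SHUT_RDWR it flushes
 * and discards any unread data via sorflush(); for SHUT_WR and SHUT_RDWR it
 * asks the protocol to stop sending via pru_shutdown().
 */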
2261int
2262soshutdown(struct socket *so, int how)
2263{
2264	struct protosw *pr = so->so_proto;
2265	int error;
2266
2267	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2268		return (EINVAL);
2269
2270	CURVNET_SET(so->so_vnet);
2271	if (pr->pr_usrreqs->pru_flush != NULL)
2272		(*pr->pr_usrreqs->pru_flush)(so, how);
2273	if (how != SHUT_WR)
2274		sorflush(so);
2275	if (how != SHUT_RD) {
2276		error = (*pr->pr_usrreqs->pru_shutdown)(so);
2277		wakeup(&so->so_timeo);
2278		CURVNET_RESTORE();
2279		return (error);
2280	}
2281	wakeup(&so->so_timeo);
2282	CURVNET_RESTORE();
2283	return (0);
2284}
2285
2286void
2287sorflush(struct socket *so)
2288{
2289	struct sockbuf *sb = &so->so_rcv;
2290	struct protosw *pr = so->so_proto;
2291	struct sockbuf asb;
2292
2293	VNET_SO_ASSERT(so);
2294
2295	/*
2296	 * In order to avoid calling dom_dispose with the socket buffer mutex
2297	 * held, and in order to generally avoid holding the lock for a long
2298	 * time, we make a copy of the socket buffer and clear the original
2299	 * (except locks, state).  The new socket buffer copy won't have
2300	 * initialized locks so we can only call routines that won't use or
2301	 * assert those locks.
2302	 *
2303	 * Dislodge threads currently blocked in receive and wait to acquire
2304	 * a lock against other simultaneous readers before clearing the
2305	 * socket buffer.  Don't let our acquire be interrupted by a signal
2306	 * despite any existing socket disposition on interruptible waiting.
2307	 */
2308	socantrcvmore(so);
2309	(void) sblock(sb, SBL_WAIT | SBL_NOINTR);
2310
2311	/*
2312	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
2313	 * and mutex data unchanged.
2314	 */
2315	SOCKBUF_LOCK(sb);
2316	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
2317	bcopy(&sb->sb_startzero, &asb.sb_startzero,
2318	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2319	bzero(&sb->sb_startzero,
2320	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2321	SOCKBUF_UNLOCK(sb);
2322	sbunlock(sb);
2323
2324	/*
2325	 * Dispose of special rights and flush the socket buffer.  Don't call
2326	 * any unsafe routines (that rely on locks being initialized) on asb.
2327	 */
2328	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
2329		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
2330	sbrelease_internal(&asb, so);
2331}
2332
2333/*
2334 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
2335 * additional variant to handle the case where the option value needs to be
2336 * some kind of integer, but not a specific size.  In addition to their use
2337 * here, these functions are also called by the protocol-level pr_ctloutput()
2338 * routines.
2339 */
2340int
2341sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2342{
2343	size_t	valsize;
2344
2345	/*
2346	 * If the user gives us more than we wanted, we ignore it, but if we
2347	 * don't get the minimum length the caller wants, we return EINVAL.
2348	 * On success, sopt->sopt_valsize is set to however much we actually
2349	 * retrieved.
2350	 */
2351	if ((valsize = sopt->sopt_valsize) < minlen)
2352		return EINVAL;
2353	if (valsize > len)
2354		sopt->sopt_valsize = valsize = len;
2355
2356	if (sopt->sopt_td != NULL)
2357		return (copyin(sopt->sopt_val, buf, valsize));
2358
2359	bcopy(sopt->sopt_val, buf, valsize);
2360	return (0);
2361}
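
/*
 * A sketch of the usual calling pattern (not taken from any particular
 * protocol): a pr_ctloutput() SOPT_SET handler for an integer-valued option
 * might do
 *
 *	int optval;
 *
 *	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
 *	if (error)
 *		return (error);
 *
 * so that a too-short value from the caller is rejected with EINVAL while a
 * longer one is silently truncated to sizeof(optval).
 */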
2362
2363/*
2364 * Kernel version of setsockopt(2).
2365 *
2366 * XXX: optlen is size_t, not socklen_t
2367 */
2368int
2369so_setsockopt(struct socket *so, int level, int optname, void *optval,
2370    size_t optlen)
2371{
2372	struct sockopt sopt;
2373
2374	sopt.sopt_level = level;
2375	sopt.sopt_name = optname;
2376	sopt.sopt_dir = SOPT_SET;
2377	sopt.sopt_val = optval;
2378	sopt.sopt_valsize = optlen;
2379	sopt.sopt_td = NULL;
2380	return (sosetopt(so, &sopt));
2381}
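
/*
 * Example (a sketch only): kernel code that owns a socket can set a boolean
 * option without building a struct sockopt by hand:
 *
 *	int one = 1;
 *
 *	error = so_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &one, sizeof(one));
 *
 * Because sopt_td is left NULL, sooptcopyin() treats the value as a kernel
 * pointer and uses bcopy() rather than copyin().
 */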
2382
2383int
2384sosetopt(struct socket *so, struct sockopt *sopt)
2385{
2386	int	error, optval;
2387	struct	linger l;
2388	struct	timeval tv;
2389	sbintime_t val;
2390	uint32_t val32;
2391#ifdef MAC
2392	struct mac extmac;
2393#endif
2394
2395	CURVNET_SET(so->so_vnet);
2396	error = 0;
2397	if (sopt->sopt_level != SOL_SOCKET) {
2398		if (so->so_proto->pr_ctloutput != NULL) {
2399			error = (*so->so_proto->pr_ctloutput)(so, sopt);
2400			CURVNET_RESTORE();
2401			return (error);
2402		}
2403		error = ENOPROTOOPT;
2404	} else {
2405		switch (sopt->sopt_name) {
2406#ifdef INET
2407		case SO_ACCEPTFILTER:
2408			error = do_setopt_accept_filter(so, sopt);
2409			if (error)
2410				goto bad;
2411			break;
2412#endif
2413		case SO_LINGER:
2414			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2415			if (error)
2416				goto bad;
2417
2418			SOCK_LOCK(so);
2419			so->so_linger = l.l_linger;
2420			if (l.l_onoff)
2421				so->so_options |= SO_LINGER;
2422			else
2423				so->so_options &= ~SO_LINGER;
2424			SOCK_UNLOCK(so);
2425			break;
2426
2427		case SO_DEBUG:
2428		case SO_KEEPALIVE:
2429		case SO_DONTROUTE:
2430		case SO_USELOOPBACK:
2431		case SO_BROADCAST:
2432		case SO_REUSEADDR:
2433		case SO_REUSEPORT:
2434		case SO_OOBINLINE:
2435		case SO_TIMESTAMP:
2436		case SO_BINTIME:
2437		case SO_NOSIGPIPE:
2438		case SO_NO_DDP:
2439		case SO_NO_OFFLOAD:
2440			error = sooptcopyin(sopt, &optval, sizeof optval,
2441			    sizeof optval);
2442			if (error)
2443				goto bad;
2444			SOCK_LOCK(so);
2445			if (optval)
2446				so->so_options |= sopt->sopt_name;
2447			else
2448				so->so_options &= ~sopt->sopt_name;
2449			SOCK_UNLOCK(so);
2450			break;
2451
2452		case SO_SETFIB:
2453			error = sooptcopyin(sopt, &optval, sizeof optval,
2454			    sizeof optval);
2455			if (error)
2456				goto bad;
2457
2458			if (optval < 0 || optval >= rt_numfibs) {
2459				error = EINVAL;
2460				goto bad;
2461			}
2462			if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
2463			   (so->so_proto->pr_domain->dom_family == PF_INET6) ||
2464			   (so->so_proto->pr_domain->dom_family == PF_ROUTE)))
2465				so->so_fibnum = optval;
2466			else
2467				so->so_fibnum = 0;
2468			break;
2469
2470		case SO_USER_COOKIE:
2471			error = sooptcopyin(sopt, &val32, sizeof val32,
2472			    sizeof val32);
2473			if (error)
2474				goto bad;
2475			so->so_user_cookie = val32;
2476			break;
2477
2478		case SO_SNDBUF:
2479		case SO_RCVBUF:
2480		case SO_SNDLOWAT:
2481		case SO_RCVLOWAT:
2482			error = sooptcopyin(sopt, &optval, sizeof optval,
2483			    sizeof optval);
2484			if (error)
2485				goto bad;
2486
2487			/*
2488			 * Values < 1 make no sense for any of these options,
2489			 * so disallow them.
2490			 */
2491			if (optval < 1) {
2492				error = EINVAL;
2493				goto bad;
2494			}
2495
2496			switch (sopt->sopt_name) {
2497			case SO_SNDBUF:
2498			case SO_RCVBUF:
2499				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2500				    &so->so_snd : &so->so_rcv, (u_long)optval,
2501				    so, curthread) == 0) {
2502					error = ENOBUFS;
2503					goto bad;
2504				}
2505				(sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2506				    &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2507				break;
2508
2509			/*
2510			 * Make sure the low-water is never greater than the
2511			 * high-water.
2512			 */
2513			case SO_SNDLOWAT:
2514				SOCKBUF_LOCK(&so->so_snd);
2515				so->so_snd.sb_lowat =
2516				    (optval > so->so_snd.sb_hiwat) ?
2517				    so->so_snd.sb_hiwat : optval;
2518				SOCKBUF_UNLOCK(&so->so_snd);
2519				break;
2520			case SO_RCVLOWAT:
2521				SOCKBUF_LOCK(&so->so_rcv);
2522				so->so_rcv.sb_lowat =
2523				    (optval > so->so_rcv.sb_hiwat) ?
2524				    so->so_rcv.sb_hiwat : optval;
2525				SOCKBUF_UNLOCK(&so->so_rcv);
2526				break;
2527			}
2528			break;
2529
2530		case SO_SNDTIMEO:
2531		case SO_RCVTIMEO:
2532#ifdef COMPAT_FREEBSD32
2533			if (SV_CURPROC_FLAG(SV_ILP32)) {
2534				struct timeval32 tv32;
2535
2536				error = sooptcopyin(sopt, &tv32, sizeof tv32,
2537				    sizeof tv32);
2538				CP(tv32, tv, tv_sec);
2539				CP(tv32, tv, tv_usec);
2540			} else
2541#endif
2542				error = sooptcopyin(sopt, &tv, sizeof tv,
2543				    sizeof tv);
2544			if (error)
2545				goto bad;
2546			if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
2547			    tv.tv_usec >= 1000000) {
2548				error = EDOM;
2549				goto bad;
2550			}
2551			if (tv.tv_sec > INT32_MAX)
2552				val = SBT_MAX;
2553			else
2554				val = tvtosbt(tv);
2555			switch (sopt->sopt_name) {
2556			case SO_SNDTIMEO:
2557				so->so_snd.sb_timeo = val;
2558				break;
2559			case SO_RCVTIMEO:
2560				so->so_rcv.sb_timeo = val;
2561				break;
2562			}
2563			break;
2564
2565		case SO_LABEL:
2566#ifdef MAC
2567			error = sooptcopyin(sopt, &extmac, sizeof extmac,
2568			    sizeof extmac);
2569			if (error)
2570				goto bad;
2571			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2572			    so, &extmac);
2573#else
2574			error = EOPNOTSUPP;
2575#endif
2576			break;
2577
2578		default:
2579			error = ENOPROTOOPT;
2580			break;
2581		}
2582		if (error == 0 && so->so_proto->pr_ctloutput != NULL)
2583			(void)(*so->so_proto->pr_ctloutput)(so, sopt);
2584	}
2585bad:
2586	CURVNET_RESTORE();
2587	return (error);
2588}
2589
2590/*
2591 * Helper routine for getsockopt.
2592 */
2593int
2594sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2595{
2596	int	error;
2597	size_t	valsize;
2598
2599	error = 0;
2600
2601	/*
2602	 * Documented get behavior is that we always return a value, possibly
2603	 * truncated to fit in the user's buffer.  Traditional behavior is
2604	 * that we always tell the user precisely how much we copied, rather
2605	 * than something more useful like the total amount we had available.
2606	 * Note that this interface is not idempotent; the entire answer must
2607	 * be generated ahead of time.
2608	 */
2609	valsize = min(len, sopt->sopt_valsize);
2610	sopt->sopt_valsize = valsize;
2611	if (sopt->sopt_val != NULL) {
2612		if (sopt->sopt_td != NULL)
2613			error = copyout(buf, sopt->sopt_val, valsize);
2614		else
2615			bcopy(buf, sopt->sopt_val, valsize);
2616	}
2617	return (error);
2618}
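
/*
 * A sketch of the matching SOPT_GET pattern in a pr_ctloutput() handler:
 *
 *	optval = <current value>;
 *	error = sooptcopyout(sopt, &optval, sizeof(optval));
 *
 * sooptcopyout() truncates the value to the caller's buffer size and records
 * the number of bytes actually copied in sopt->sopt_valsize.
 */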
2619
2620int
2621sogetopt(struct socket *so, struct sockopt *sopt)
2622{
2623	int	error, optval;
2624	struct	linger l;
2625	struct	timeval tv;
2626#ifdef MAC
2627	struct mac extmac;
2628#endif
2629
2630	CURVNET_SET(so->so_vnet);
2631	error = 0;
2632	if (sopt->sopt_level != SOL_SOCKET) {
2633		if (so->so_proto->pr_ctloutput != NULL)
2634			error = (*so->so_proto->pr_ctloutput)(so, sopt);
2635		else
2636			error = ENOPROTOOPT;
2637		CURVNET_RESTORE();
2638		return (error);
2639	} else {
2640		switch (sopt->sopt_name) {
2641#ifdef INET
2642		case SO_ACCEPTFILTER:
2643			error = do_getopt_accept_filter(so, sopt);
2644			break;
2645#endif
2646		case SO_LINGER:
2647			SOCK_LOCK(so);
2648			l.l_onoff = so->so_options & SO_LINGER;
2649			l.l_linger = so->so_linger;
2650			SOCK_UNLOCK(so);
2651			error = sooptcopyout(sopt, &l, sizeof l);
2652			break;
2653
2654		case SO_USELOOPBACK:
2655		case SO_DONTROUTE:
2656		case SO_DEBUG:
2657		case SO_KEEPALIVE:
2658		case SO_REUSEADDR:
2659		case SO_REUSEPORT:
2660		case SO_BROADCAST:
2661		case SO_OOBINLINE:
2662		case SO_ACCEPTCONN:
2663		case SO_TIMESTAMP:
2664		case SO_BINTIME:
2665		case SO_NOSIGPIPE:
2666			optval = so->so_options & sopt->sopt_name;
2667integer:
2668			error = sooptcopyout(sopt, &optval, sizeof optval);
2669			break;
2670
2671		case SO_TYPE:
2672			optval = so->so_type;
2673			goto integer;
2674
2675		case SO_PROTOCOL:
2676			optval = so->so_proto->pr_protocol;
2677			goto integer;
2678
2679		case SO_ERROR:
2680			SOCK_LOCK(so);
2681			optval = so->so_error;
2682			so->so_error = 0;
2683			SOCK_UNLOCK(so);
2684			goto integer;
2685
2686		case SO_SNDBUF:
2687			optval = so->so_snd.sb_hiwat;
2688			goto integer;
2689
2690		case SO_RCVBUF:
2691			optval = so->so_rcv.sb_hiwat;
2692			goto integer;
2693
2694		case SO_SNDLOWAT:
2695			optval = so->so_snd.sb_lowat;
2696			goto integer;
2697
2698		case SO_RCVLOWAT:
2699			optval = so->so_rcv.sb_lowat;
2700			goto integer;
2701
2702		case SO_SNDTIMEO:
2703		case SO_RCVTIMEO:
2704			tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ?
2705			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2706#ifdef COMPAT_FREEBSD32
2707			if (SV_CURPROC_FLAG(SV_ILP32)) {
2708				struct timeval32 tv32;
2709
2710				CP(tv, tv32, tv_sec);
2711				CP(tv, tv32, tv_usec);
2712				error = sooptcopyout(sopt, &tv32, sizeof tv32);
2713			} else
2714#endif
2715				error = sooptcopyout(sopt, &tv, sizeof tv);
2716			break;
2717
2718		case SO_LABEL:
2719#ifdef MAC
2720			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2721			    sizeof(extmac));
2722			if (error)
2723				goto bad;
2724			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2725			    so, &extmac);
2726			if (error)
2727				goto bad;
2728			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2729#else
2730			error = EOPNOTSUPP;
2731#endif
2732			break;
2733
2734		case SO_PEERLABEL:
2735#ifdef MAC
2736			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2737			    sizeof(extmac));
2738			if (error)
2739				goto bad;
2740			error = mac_getsockopt_peerlabel(
2741			    sopt->sopt_td->td_ucred, so, &extmac);
2742			if (error)
2743				goto bad;
2744			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2745#else
2746			error = EOPNOTSUPP;
2747#endif
2748			break;
2749
2750		case SO_LISTENQLIMIT:
2751			optval = so->so_qlimit;
2752			goto integer;
2753
2754		case SO_LISTENQLEN:
2755			optval = so->so_qlen;
2756			goto integer;
2757
2758		case SO_LISTENINCQLEN:
2759			optval = so->so_incqlen;
2760			goto integer;
2761
2762		default:
2763			error = ENOPROTOOPT;
2764			break;
2765		}
2766	}
2767#ifdef MAC
2768bad:
2769#endif
2770	CURVNET_RESTORE();
2771	return (error);
2772}
2773
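/*
 * The following three routines shuttle a socket option value between a
 * struct sockopt and an mbuf chain: soopt_getm() allocates a chain large
 * enough for sopt->sopt_valsize, soopt_mcopyin() fills that chain from the
 * option value, and soopt_mcopyout() copies a chain back into the option
 * value and updates sopt_valsize.
 */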
2774int
2775soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2776{
2777	struct mbuf *m, *m_prev;
2778	int sopt_size = sopt->sopt_valsize;
2779
2780	MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
2781	if (m == NULL)
2782		return ENOBUFS;
2783	if (sopt_size > MLEN) {
2784		MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
2785		if ((m->m_flags & M_EXT) == 0) {
2786			m_free(m);
2787			return ENOBUFS;
2788		}
2789		m->m_len = min(MCLBYTES, sopt_size);
2790	} else {
2791		m->m_len = min(MLEN, sopt_size);
2792	}
2793	sopt_size -= m->m_len;
2794	*mp = m;
2795	m_prev = m;
2796
2797	while (sopt_size) {
2798		MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
2799		if (m == NULL) {
2800			m_freem(*mp);
2801			return ENOBUFS;
2802		}
2803		if (sopt_size > MLEN) {
2804			MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
2805			    M_NOWAIT);
2806			if ((m->m_flags & M_EXT) == 0) {
2807				m_freem(m);
2808				m_freem(*mp);
2809				return ENOBUFS;
2810			}
2811			m->m_len = min(MCLBYTES, sopt_size);
2812		} else {
2813			m->m_len = min(MLEN, sopt_size);
2814		}
2815		sopt_size -= m->m_len;
2816		m_prev->m_next = m;
2817		m_prev = m;
2818	}
2819	return (0);
2820}
2821
2822int
2823soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2824{
2825	struct mbuf *m0 = m;
2826
2827	if (sopt->sopt_val == NULL)
2828		return (0);
2829	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2830		if (sopt->sopt_td != NULL) {
2831			int error;
2832
2833			error = copyin(sopt->sopt_val, mtod(m, char *),
2834			    m->m_len);
2835			if (error != 0) {
2836				m_freem(m0);
2837				return(error);
2838			}
2839		} else
2840			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2841		sopt->sopt_valsize -= m->m_len;
2842		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2843		m = m->m_next;
2844	}
2845	if (m != NULL) /* the chain should have been sized by soopt_getm() */
2846		panic("ip6_sooptmcopyin");
2847	return (0);
2848}
2849
2850int
2851soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2852{
2853	struct mbuf *m0 = m;
2854	size_t valsize = 0;
2855
2856	if (sopt->sopt_val == NULL)
2857		return (0);
2858	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2859		if (sopt->sopt_td != NULL) {
2860			int error;
2861
2862			error = copyout(mtod(m, char *), sopt->sopt_val,
2863			    m->m_len);
2864			if (error != 0) {
2865				m_freem(m0);
2866				return(error);
2867			}
2868		} else
2869			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2870		sopt->sopt_valsize -= m->m_len;
2871		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2872		valsize += m->m_len;
2873		m = m->m_next;
2874	}
2875	if (m != NULL) {
2876		/* the user-land buffer should have been large enough */
2877		m_freem(m0);
2878		return(EINVAL);
2879	}
2880	sopt->sopt_valsize = valsize;
2881	return (0);
2882}
2883
2884/*
2885 * sohasoutofband(): protocol notifies socket layer of the arrival of new
2886 * out-of-band data, which will then notify socket consumers.
2887 */
2888void
2889sohasoutofband(struct socket *so)
2890{
2891
2892	if (so->so_sigio != NULL)
2893		pgsigio(&so->so_sigio, SIGURG, 0);
2894	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2895}
2896
2897int
2898sopoll(struct socket *so, int events, struct ucred *active_cred,
2899    struct thread *td)
2900{
2901
2902	/*
2903	 * We do not need to set or assert curvnet as long as everyone uses
2904	 * sopoll_generic().
2905	 */
2906	return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2907	    td));
2908}
2909
2910int
2911sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2912    struct thread *td)
2913{
2914	int revents = 0;
2915
2916	SOCKBUF_LOCK(&so->so_snd);
2917	SOCKBUF_LOCK(&so->so_rcv);
2918	if (events & (POLLIN | POLLRDNORM))
2919		if (soreadabledata(so))
2920			revents |= events & (POLLIN | POLLRDNORM);
2921
2922	if (events & (POLLOUT | POLLWRNORM))
2923		if (sowriteable(so))
2924			revents |= events & (POLLOUT | POLLWRNORM);
2925
2926	if (events & (POLLPRI | POLLRDBAND))
2927		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2928			revents |= events & (POLLPRI | POLLRDBAND);
2929
2930	if ((events & POLLINIGNEOF) == 0) {
2931		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2932			revents |= events & (POLLIN | POLLRDNORM);
2933			if (so->so_snd.sb_state & SBS_CANTSENDMORE)
2934				revents |= POLLHUP;
2935		}
2936	}
2937
2938	if (revents == 0) {
2939		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
2940			selrecord(td, &so->so_rcv.sb_sel);
2941			so->so_rcv.sb_flags |= SB_SEL;
2942		}
2943
2944		if (events & (POLLOUT | POLLWRNORM)) {
2945			selrecord(td, &so->so_snd.sb_sel);
2946			so->so_snd.sb_flags |= SB_SEL;
2947		}
2948	}
2949
2950	SOCKBUF_UNLOCK(&so->so_rcv);
2951	SOCKBUF_UNLOCK(&so->so_snd);
2952	return (revents);
2953}
2954
2955int
2956soo_kqfilter(struct file *fp, struct knote *kn)
2957{
2958	struct socket *so = kn->kn_fp->f_data;
2959	struct sockbuf *sb;
2960
2961	switch (kn->kn_filter) {
2962	case EVFILT_READ:
2963		if (so->so_options & SO_ACCEPTCONN)
2964			kn->kn_fop = &solisten_filtops;
2965		else
2966			kn->kn_fop = &soread_filtops;
2967		sb = &so->so_rcv;
2968		break;
2969	case EVFILT_WRITE:
2970		kn->kn_fop = &sowrite_filtops;
2971		sb = &so->so_snd;
2972		break;
2973	default:
2974		return (EINVAL);
2975	}
2976
2977	SOCKBUF_LOCK(sb);
2978	knlist_add(&sb->sb_sel.si_note, kn, 1);
2979	sb->sb_flags |= SB_KNOTE;
2980	SOCKBUF_UNLOCK(sb);
2981	return (0);
2982}
2983
2984/*
2985 * Some routines that return EOPNOTSUPP for entry points that are not
2986 * supported by a protocol.  Fill in as needed.
2987 */
2988int
2989pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
2990{
2991
2992	return EOPNOTSUPP;
2993}
2994
2995int
2996pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
2997{
2998
2999	return EOPNOTSUPP;
3000}
3001
3002int
3003pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
3004{
3005
3006	return EOPNOTSUPP;
3007}
3008
3009int
3010pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
3011    struct thread *td)
3012{
3013
3014	return EOPNOTSUPP;
3015}
3016
3017int
3018pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
3019{
3020
3021	return EOPNOTSUPP;
3022}
3023
3024int
3025pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
3026    struct thread *td)
3027{
3028
3029	return EOPNOTSUPP;
3030}
3031
3032int
3033pru_connect2_notsupp(struct socket *so1, struct socket *so2)
3034{
3035
3036	return EOPNOTSUPP;
3037}
3038
3039int
3040pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
3041    struct ifnet *ifp, struct thread *td)
3042{
3043
3044	return EOPNOTSUPP;
3045}
3046
3047int
3048pru_disconnect_notsupp(struct socket *so)
3049{
3050
3051	return EOPNOTSUPP;
3052}
3053
3054int
3055pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
3056{
3057
3058	return EOPNOTSUPP;
3059}
3060
3061int
3062pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
3063{
3064
3065	return EOPNOTSUPP;
3066}
3067
3068int
3069pru_rcvd_notsupp(struct socket *so, int flags)
3070{
3071
3072	return EOPNOTSUPP;
3073}
3074
3075int
3076pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
3077{
3078
3079	return EOPNOTSUPP;
3080}
3081
3082int
3083pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
3084    struct sockaddr *addr, struct mbuf *control, struct thread *td)
3085{
3086
3087	return EOPNOTSUPP;
3088}
3089
3090/*
3091 * This isn't really a ``null'' operation, but it's the default one and
3092 * doesn't do anything destructive.
3093 */
3094int
3095pru_sense_null(struct socket *so, struct stat *sb)
3096{
3097
3098	sb->st_blksize = so->so_snd.sb_hiwat;
3099	return 0;
3100}
3101
3102int
3103pru_shutdown_notsupp(struct socket *so)
3104{
3105
3106	return EOPNOTSUPP;
3107}
3108
3109int
3110pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
3111{
3112
3113	return EOPNOTSUPP;
3114}
3115
3116int
3117pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
3118    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
3119{
3120
3121	return EOPNOTSUPP;
3122}
3123
3124int
3125pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
3126    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3127{
3128
3129	return EOPNOTSUPP;
3130}
3131
3132int
3133pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
3134    struct thread *td)
3135{
3136
3137	return EOPNOTSUPP;
3138}
3139
3140static void
3141filt_sordetach(struct knote *kn)
3142{
3143	struct socket *so = kn->kn_fp->f_data;
3144
3145	SOCKBUF_LOCK(&so->so_rcv);
3146	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
3147	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
3148		so->so_rcv.sb_flags &= ~SB_KNOTE;
3149	SOCKBUF_UNLOCK(&so->so_rcv);
3150}
3151
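/*
 * kqueue EVFILT_READ filter for sockets: report the number of readable data
 * bytes (excluding control data) and trigger on EOF (SBS_CANTRCVMORE), on a
 * pending socket error, or once the queued data reaches the NOTE_LOWAT
 * threshold or the receive low-water mark.
 */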
3152/*ARGSUSED*/
3153static int
3154filt_soread(struct knote *kn, long hint)
3155{
3156	struct socket *so;
3157
3158	so = kn->kn_fp->f_data;
3159	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3160
3161	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3162	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3163		kn->kn_flags |= EV_EOF;
3164		kn->kn_fflags = so->so_error;
3165		return (1);
3166	} else if (so->so_error)	/* temporary udp error */
3167		return (1);
3168	else if (kn->kn_sfflags & NOTE_LOWAT)
3169		return (kn->kn_data >= kn->kn_sdata);
3170	else
3171		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
3172}
3173
3174static void
3175filt_sowdetach(struct knote *kn)
3176{
3177	struct socket *so = kn->kn_fp->f_data;
3178
3179	SOCKBUF_LOCK(&so->so_snd);
3180	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
3181	if (knlist_empty(&so->so_snd.sb_sel.si_note))
3182		so->so_snd.sb_flags &= ~SB_KNOTE;
3183	SOCKBUF_UNLOCK(&so->so_snd);
3184}
3185
3186/*ARGSUSED*/
3187static int
3188filt_sowrite(struct knote *kn, long hint)
3189{
3190	struct socket *so;
3191
3192	so = kn->kn_fp->f_data;
3193	SOCKBUF_LOCK_ASSERT(&so->so_snd);
3194	kn->kn_data = sbspace(&so->so_snd);
3195	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
3196		kn->kn_flags |= EV_EOF;
3197		kn->kn_fflags = so->so_error;
3198		return (1);
3199	} else if (so->so_error)	/* temporary udp error */
3200		return (1);
3201	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
3202	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
3203		return (0);
3204	else if (kn->kn_sfflags & NOTE_LOWAT)
3205		return (kn->kn_data >= kn->kn_sdata);
3206	else
3207		return (kn->kn_data >= so->so_snd.sb_lowat);
3208}
3209
3210/*ARGSUSED*/
3211static int
3212filt_solisten(struct knote *kn, long hint)
3213{
3214	struct socket *so = kn->kn_fp->f_data;
3215
3216	kn->kn_data = so->so_qlen;
3217	return (!TAILQ_EMPTY(&so->so_comp));
3218}
3219
3220int
3221socheckuid(struct socket *so, uid_t uid)
3222{
3223
3224	if (so == NULL)
3225		return (EPERM);
3226	if (so->so_cred->cr_uid != uid)
3227		return (EPERM);
3228	return (0);
3229}
3230
3231/*
3232 * These functions are used by protocols to notify the socket layer (and its
3233 * consumers) of state changes in the sockets driven by protocol-side events.
3234 */
3235
3236/*
3237 * Procedures to manipulate state flags of socket and do appropriate wakeups.
3238 *
3239 * Normal sequence from the active (originating) side is that
3240 * soisconnecting() is called during processing of connect() call, resulting
3241 * in an eventual call to soisconnected() if/when the connection is
3242 * established.  When the connection is torn down soisdisconnecting() is
3243 * called during processing of disconnect() call, and soisdisconnected() is
3244 * called when the connection to the peer is totally severed.  The semantics
3245 * of these routines are such that connectionless protocols can call
3246 * soisconnected() and soisdisconnected() only, bypassing the in-progress
3247 * calls when setting up a ``connection'' takes no time.
3248 *
3249 * From the passive side, a socket is created with two queues of sockets:
3250 * so_incomp for connections in progress and so_comp for connections already
3251 * made and awaiting user acceptance.  As a protocol is preparing incoming
3252 * connections, it creates a socket structure queued on so_incomp by calling
3253 * sonewconn().  When the connection is established, soisconnected() is
3254 * called, and transfers the socket structure to so_comp, making it available
3255 * to accept().
3256 *
3257 * If a socket is closed with sockets on either so_incomp or so_comp, these
3258 * sockets are dropped.
3259 *
3260 * If higher-level protocols are implemented in the kernel, the wakeups done
3261 * here will sometimes cause software-interrupt process scheduling.
3262 */
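/*
 * As a sketch (not tied to any specific protocol), a connection-oriented
 * protocol normally makes these calls in the following order:
 *
 *	soisconnecting(so);		while processing connect()
 *	soisconnected(so);		when the connection is established
 *	soisdisconnecting(so);		while processing disconnect()
 *	soisdisconnected(so);		when the peer is completely gone
 *
 * whereas a connectionless protocol may call only soisconnected() and
 * soisdisconnected().
 */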
3263void
3264soisconnecting(struct socket *so)
3265{
3266
3267	SOCK_LOCK(so);
3268	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
3269	so->so_state |= SS_ISCONNECTING;
3270	SOCK_UNLOCK(so);
3271}
3272
3273void
3274soisconnected(struct socket *so)
3275{
3276	struct socket *head;
3277	int ret;
3278
3279restart:
3280	ACCEPT_LOCK();
3281	SOCK_LOCK(so);
3282	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
3283	so->so_state |= SS_ISCONNECTED;
3284	head = so->so_head;
3285	if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
3286		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
3287			SOCK_UNLOCK(so);
3288			TAILQ_REMOVE(&head->so_incomp, so, so_list);
3289			head->so_incqlen--;
3290			so->so_qstate &= ~SQ_INCOMP;
3291			TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
3292			head->so_qlen++;
3293			so->so_qstate |= SQ_COMP;
3294			ACCEPT_UNLOCK();
3295			sorwakeup(head);
3296			wakeup_one(&head->so_timeo);
3297		} else {
3298			ACCEPT_UNLOCK();
3299			soupcall_set(so, SO_RCV,
3300			    head->so_accf->so_accept_filter->accf_callback,
3301			    head->so_accf->so_accept_filter_arg);
3302			so->so_options &= ~SO_ACCEPTFILTER;
3303			ret = head->so_accf->so_accept_filter->accf_callback(so,
3304			    head->so_accf->so_accept_filter_arg, M_NOWAIT);
3305			if (ret == SU_ISCONNECTED)
3306				soupcall_clear(so, SO_RCV);
3307			SOCK_UNLOCK(so);
3308			if (ret == SU_ISCONNECTED)
3309				goto restart;
3310		}
3311		return;
3312	}
3313	SOCK_UNLOCK(so);
3314	ACCEPT_UNLOCK();
3315	wakeup(&so->so_timeo);
3316	sorwakeup(so);
3317	sowwakeup(so);
3318}
3319
3320void
3321soisdisconnecting(struct socket *so)
3322{
3323
3324	/*
3325	 * Note: This code assumes that SOCK_LOCK(so) and
3326	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3327	 */
3328	SOCKBUF_LOCK(&so->so_rcv);
3329	so->so_state &= ~SS_ISCONNECTING;
3330	so->so_state |= SS_ISDISCONNECTING;
3331	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3332	sorwakeup_locked(so);
3333	SOCKBUF_LOCK(&so->so_snd);
3334	so->so_snd.sb_state |= SBS_CANTSENDMORE;
3335	sowwakeup_locked(so);
3336	wakeup(&so->so_timeo);
3337}
3338
3339void
3340soisdisconnected(struct socket *so)
3341{
3342
3343	/*
3344	 * Note: This code assumes that SOCK_LOCK(so) and
3345	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3346	 */
3347	SOCKBUF_LOCK(&so->so_rcv);
3348	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
3349	so->so_state |= SS_ISDISCONNECTED;
3350	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3351	sorwakeup_locked(so);
3352	SOCKBUF_LOCK(&so->so_snd);
3353	so->so_snd.sb_state |= SBS_CANTSENDMORE;
3354	sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
3355	sowwakeup_locked(so);
3356	wakeup(&so->so_timeo);
3357}
3358
3359/*
3360 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
3361 */
3362struct sockaddr *
3363sodupsockaddr(const struct sockaddr *sa, int mflags)
3364{
3365	struct sockaddr *sa2;
3366
3367	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
3368	if (sa2)
3369		bcopy(sa, sa2, sa->sa_len);
3370	return sa2;
3371}
3372
3373/*
3374 * Register per-socket buffer upcalls.
3375 */
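/*
 * A sketch of the registration pattern (the callback name is illustrative
 * only):
 *
 *	SOCKBUF_LOCK(&so->so_rcv);
 *	soupcall_set(so, SO_RCV, my_rcv_upcall, arg);
 *	SOCKBUF_UNLOCK(&so->so_rcv);
 *
 * The sockbuf lock must be held across soupcall_set() and soupcall_clear(),
 * as the assertions below require.
 */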
3376void
3377soupcall_set(struct socket *so, int which,
3378    int (*func)(struct socket *, void *, int), void *arg)
3379{
3380	struct sockbuf *sb;
3381
3382	switch (which) {
3383	case SO_RCV:
3384		sb = &so->so_rcv;
3385		break;
3386	case SO_SND:
3387		sb = &so->so_snd;
3388		break;
3389	default:
3390		panic("soupcall_set: bad which");
3391	}
3392	SOCKBUF_LOCK_ASSERT(sb);
3393#if 0
3394	/* XXX: accf_http actually wants to do this on purpose. */
3395	KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall"));
3396#endif
3397	sb->sb_upcall = func;
3398	sb->sb_upcallarg = arg;
3399	sb->sb_flags |= SB_UPCALL;
3400}
3401
3402void
3403soupcall_clear(struct socket *so, int which)
3404{
3405	struct sockbuf *sb;
3406
3407	switch (which) {
3408	case SO_RCV:
3409		sb = &so->so_rcv;
3410		break;
3411	case SO_SND:
3412		sb = &so->so_snd;
3413		break;
3414	default:
3415		panic("soupcall_clear: bad which");
3416	}
3417	SOCKBUF_LOCK_ASSERT(sb);
3418	KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear"));
3419	sb->sb_upcall = NULL;
3420	sb->sb_upcallarg = NULL;
3421	sb->sb_flags &= ~SB_UPCALL;
3422}
3423
3424/*
3425 * Create an external-format (``xsocket'') structure using the information in
3426 * the kernel-format socket structure pointed to by so.  This is done to
3427 * reduce the spew of irrelevant information over this interface, to isolate
3428 * user code from changes in the kernel structure, and potentially to provide
3429 * information-hiding if we decide that some of this information should be
3430 * hidden from users.
3431 */
3432void
3433sotoxsocket(struct socket *so, struct xsocket *xso)
3434{
3435
3436	xso->xso_len = sizeof *xso;
3437	xso->xso_so = so;
3438	xso->so_type = so->so_type;
3439	xso->so_options = so->so_options;
3440	xso->so_linger = so->so_linger;
3441	xso->so_state = so->so_state;
3442	xso->so_pcb = so->so_pcb;
3443	xso->xso_protocol = so->so_proto->pr_protocol;
3444	xso->xso_family = so->so_proto->pr_domain->dom_family;
3445	xso->so_qlen = so->so_qlen;
3446	xso->so_incqlen = so->so_incqlen;
3447	xso->so_qlimit = so->so_qlimit;
3448	xso->so_timeo = so->so_timeo;
3449	xso->so_error = so->so_error;
3450	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
3451	xso->so_oobmark = so->so_oobmark;
3452	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
3453	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
3454	xso->so_uid = so->so_cred->cr_uid;
3455}
3456
3457
3458/*
3459 * Socket accessor functions to provide external consumers with a safe
3460 * interface to socket state.
3461 */
3463
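/*
 * Apply 'func' to every socket on the listen socket's completed-connection
 * queue (so_comp).  Note that TAILQ_FOREACH() below reuses 'so' as the
 * iteration variable; the queue head is taken from the listen socket before
 * 'so' is overwritten.
 */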
3464void
3465so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *),
3466    void *arg)
3467{
3468
3469	TAILQ_FOREACH(so, &so->so_comp, so_list)
3470		func(so, arg);
3471}
3472
3473struct sockbuf *
3474so_sockbuf_rcv(struct socket *so)
3475{
3476
3477	return (&so->so_rcv);
3478}
3479
3480struct sockbuf *
3481so_sockbuf_snd(struct socket *so)
3482{
3483
3484	return (&so->so_snd);
3485}
3486
3487int
3488so_state_get(const struct socket *so)
3489{
3490
3491	return (so->so_state);
3492}
3493
3494void
3495so_state_set(struct socket *so, int val)
3496{
3497
3498	so->so_state = val;
3499}
3500
3501int
3502so_options_get(const struct socket *so)
3503{
3504
3505	return (so->so_options);
3506}
3507
3508void
3509so_options_set(struct socket *so, int val)
3510{
3511
3512	so->so_options = val;
3513}
3514
3515int
3516so_error_get(const struct socket *so)
3517{
3518
3519	return (so->so_error);
3520}
3521
3522void
3523so_error_set(struct socket *so, int val)
3524{
3525
3526	so->so_error = val;
3527}
3528
3529int
3530so_linger_get(const struct socket *so)
3531{
3532
3533	return (so->so_linger);
3534}
3535
3536void
3537so_linger_set(struct socket *so, int val)
3538{
3539
3540	so->so_linger = val;
3541}
3542
3543struct protosw *
3544so_protosw_get(const struct socket *so)
3545{
3546
3547	return (so->so_proto);
3548}
3549
3550void
3551so_protosw_set(struct socket *so, struct protosw *val)
3552{
3553
3554	so->so_proto = val;
3555}
3556
3557void
3558so_sorwakeup(struct socket *so)
3559{
3560
3561	sorwakeup(so);
3562}
3563
3564void
3565so_sowwakeup(struct socket *so)
3566{
3567
3568	sowwakeup(so);
3569}
3570
3571void
3572so_sorwakeup_locked(struct socket *so)
3573{
3574
3575	sorwakeup_locked(so);
3576}
3577
3578void
3579so_sowwakeup_locked(struct socket *so)
3580{
3581
3582	sowwakeup_locked(so);
3583}
3584
3585void
3586so_lock(struct socket *so)
3587{
3588
3589	SOCK_LOCK(so);
3590}
3591
3592void
3593so_unlock(struct socket *so)
3594{
3595
3596	SOCK_UNLOCK(so);
3597}
3598