sdp_main.c revision 330897
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *      The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2004 The FreeBSD Foundation.  All rights reserved.
 * Copyright (c) 2004-2008 Robert N. M. Watson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c
 */

/*
 *
 * Copyright (c) 2010 Isilon Systems, Inc.
 * Copyright (c) 2010 iX Systems, Inc.
 * Copyright (c) 2010 Panasas, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "sdp.h"

#include <net/if.h>
#include <net/route.h>
#include <net/vnet.h>
#include <sys/sysctl.h>

uma_zone_t	sdp_zone;
struct rwlock	sdp_lock;
LIST_HEAD(, sdp_sock) sdp_list;

struct workqueue_struct *rx_comp_wq;

RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock");
#define	SDP_LIST_WLOCK()	rw_wlock(&sdp_lock)
#define	SDP_LIST_RLOCK()	rw_rlock(&sdp_lock)
#define	SDP_LIST_WUNLOCK()	rw_wunlock(&sdp_lock)
#define	SDP_LIST_RUNLOCK()	rw_runlock(&sdp_lock)
#define	SDP_LIST_WLOCK_ASSERT()	rw_assert(&sdp_lock, RW_WLOCKED)
#define	SDP_LIST_RLOCK_ASSERT()	rw_assert(&sdp_lock, RW_RLOCKED)
#define	SDP_LIST_LOCK_ASSERT()	rw_assert(&sdp_lock, RW_LOCKED)
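
/*
 * Illustrative only: the global pcb list is protected by sdp_lock through
 * the wrappers above.  A minimal traversal, assuming read access suffices,
 * follows the usual rwlock(9) pattern (compare sdp_apply_all() below):
 *
 *	SDP_LIST_RLOCK();
 *	LIST_FOREACH(ssk, &sdp_list, list) {
 *		SDP_WLOCK(ssk);		// per-socket state has its own lock
 *		...
 *		SDP_WUNLOCK(ssk);
 *	}
 *	SDP_LIST_RUNLOCK();
 */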

static MALLOC_DEFINE(M_SDP, "sdp", "Socket Direct Protocol");

static void sdp_stop_keepalive_timer(struct socket *so);

/*
 * SDP protocol interface to socket abstraction.
 */
/*
 * sdp_sendspace and sdp_recvspace are the default send and receive window
 * sizes, respectively.
 */
u_long	sdp_sendspace = 1024*32;
u_long	sdp_recvspace = 1024*64;

static int sdp_count;

/*
 * Disable async. CMA events for sockets which are being torn down.
 */
static void
sdp_destroy_cma(struct sdp_sock *ssk)
{

	if (ssk->id == NULL)
		return;
	rdma_destroy_id(ssk->id);
	ssk->id = NULL;
}

static int
sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred)
{
	struct sockaddr_in *sin;
	struct sockaddr_in null;
	int error;

	SDP_WLOCK_ASSERT(ssk);

	if (ssk->lport != 0 || ssk->laddr != INADDR_ANY)
		return (EINVAL);
	/* rdma_bind_addr handles bind races.  */
	SDP_WUNLOCK(ssk);
	if (ssk->id == NULL)
		ssk->id = rdma_create_id(sdp_cma_handler, ssk, RDMA_PS_SDP,
		    IB_QPT_RC);
	if (ssk->id == NULL) {
		SDP_WLOCK(ssk);
		return (ENOMEM);
	}
	if (nam == NULL) {
		null.sin_family = AF_INET;
		null.sin_len = sizeof(null);
		null.sin_addr.s_addr = INADDR_ANY;
		null.sin_port = 0;
		bzero(&null.sin_zero, sizeof(null.sin_zero));
		nam = (struct sockaddr *)&null;
	}
	/* The rdma_cm API returns Linux-style negative errno values. */
	error = -rdma_bind_addr(ssk->id, nam);
	SDP_WLOCK(ssk);
	if (error == 0) {
		sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr;
		ssk->laddr = sin->sin_addr.s_addr;
		ssk->lport = sin->sin_port;
	} else
		sdp_destroy_cma(ssk);
	return (error);
}

static void
sdp_pcbfree(struct sdp_sock *ssk)
{
	KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk));

	sdp_dbg(ssk->socket, "Freeing pcb");
	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_DESTROY;
	SDP_WUNLOCK(ssk);
	SDP_LIST_WLOCK();
	sdp_count--;
	LIST_REMOVE(ssk, list);
	SDP_LIST_WUNLOCK();
	crfree(ssk->cred);
	sdp_destroy_cma(ssk);
	ssk->qp_active = 0;
	if (ssk->qp) {
		ib_destroy_qp(ssk->qp);
		ssk->qp = NULL;
	}
	sdp_tx_ring_destroy(ssk);
	sdp_rx_ring_destroy(ssk);
	rw_destroy(&ssk->rx_ring.destroyed_lock);
	/* The lock must be destroyed before the pcb is returned to the zone. */
	rw_destroy(&ssk->lock);
	uma_zfree(sdp_zone, ssk);
}

/*
 * Common routines to return a socket address.
 */
static struct sockaddr *
sdp_sockaddr(in_port_t port, struct in_addr *addr_p)
{
	struct sockaddr_in *sin;

	sin = malloc(sizeof(*sin), M_SONAME, M_WAITOK | M_ZERO);
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr = *addr_p;
	sin->sin_port = port;

	return ((struct sockaddr *)sin);
}
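
/*
 * Illustrative only: the sockaddr allocated here backs the usual address
 * queries on an SDP socket, e.g. from userland:
 *
 *	struct sockaddr_in sin;
 *	socklen_t len = sizeof(sin);
 *
 *	getsockname(fd, (struct sockaddr *)&sin, &len);
 *	getpeername(fd, (struct sockaddr *)&sin, &len);
 */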

static int
sdp_getsockaddr(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk;
	struct in_addr addr;
	in_port_t port;

	ssk = sdp_sk(so);
	SDP_RLOCK(ssk);
	port = ssk->lport;
	addr.s_addr = ssk->laddr;
	SDP_RUNLOCK(ssk);

	*nam = sdp_sockaddr(port, &addr);
	return (0);
}

static int
sdp_getpeeraddr(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk;
	struct in_addr addr;
	in_port_t port;

	ssk = sdp_sk(so);
	SDP_RLOCK(ssk);
	port = ssk->fport;
	addr.s_addr = ssk->faddr;
	SDP_RUNLOCK(ssk);

	*nam = sdp_sockaddr(port, &addr);
	return (0);
}

static void
sdp_pcbnotifyall(struct in_addr faddr, int errno,
    struct sdp_sock *(*notify)(struct sdp_sock *, int))
{
	struct sdp_sock *ssk, *ssk_temp;

	SDP_LIST_WLOCK();
	LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) {
		SDP_WLOCK(ssk);
		if (ssk->faddr != faddr.s_addr || ssk->socket == NULL) {
			SDP_WUNLOCK(ssk);
			continue;
		}
		if ((ssk->flags & SDP_DESTROY) == 0)
			if ((*notify)(ssk, errno))
				SDP_WUNLOCK(ssk);
	}
	SDP_LIST_WUNLOCK();
}

#if 0
static void
sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg)
{
	struct sdp_sock *ssk;

	SDP_LIST_RLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		SDP_WLOCK(ssk);
		func(ssk, arg);
		SDP_WUNLOCK(ssk);
	}
	SDP_LIST_RUNLOCK();
}
#endif

static void
sdp_output_reset(struct sdp_sock *ssk)
{
	struct rdma_cm_id *id;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->id) {
		id = ssk->id;
		ssk->qp_active = 0;
		SDP_WUNLOCK(ssk);
		rdma_disconnect(id);
		SDP_WLOCK(ssk);
	}
	ssk->state = TCPS_CLOSED;
}

/*
 * Attempt to close an SDP socket, marking it as dropped, and freeing
 * the socket if we hold the only reference.
 */
static struct sdp_sock *
sdp_closed(struct sdp_sock *ssk)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);

	ssk->flags |= SDP_DROPPED;
	so = ssk->socket;
	soisdisconnected(so);
	if (ssk->flags & SDP_SOCKREF) {
		KASSERT(so->so_state & SS_PROTOREF,
		    ("sdp_closed: !SS_PROTOREF"));
		ssk->flags &= ~SDP_SOCKREF;
		SDP_WUNLOCK(ssk);
		ACCEPT_LOCK();
		SOCK_LOCK(so);
		so->so_state &= ~SS_PROTOREF;
		sofree(so);
		return (NULL);
	}
	return (ssk);
}

/*
 * Perform timer based shutdowns which cannot operate in
 * callout context.
 */
static void
sdp_shutdown_task(void *data, int pending)
{
	struct sdp_sock *ssk;

	ssk = data;
	SDP_WLOCK(ssk);
	/*
	 * I don't think this can race with another call to pcbfree()
	 * because SDP_TIMEWAIT protects it.  SDP_DESTROY may be redundant.
	 */
	if (ssk->flags & SDP_DESTROY)
		panic("sdp_shutdown_task: Racing with pcbfree for ssk %p",
		    ssk);
	if (ssk->flags & SDP_DISCON)
		sdp_output_reset(ssk);
	/* We have to clear this so sdp_detach() will call pcbfree(). */
	ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT);
	if ((ssk->flags & SDP_DROPPED) == 0 &&
	    sdp_closed(ssk) == NULL)
		return;
	if (ssk->socket == NULL) {
		sdp_pcbfree(ssk);
		return;
	}
	SDP_WUNLOCK(ssk);
}

/*
 * 2msl has expired, schedule the shutdown task.
 */
static void
sdp_2msl_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	/* Should be impossible, defensive programming. */
	if ((ssk->flags & SDP_TIMEWAIT) == 0)
		goto out;
	taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task);
out:
	SDP_WUNLOCK(ssk);
	return;
}

/*
 * Schedule the 2msl wait timer.
 */
static void
sdp_2msl_wait(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_TIMEWAIT;
	ssk->state = TCPS_TIME_WAIT;
	soisdisconnected(ssk->socket);
	callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk);
}

/*
 * Timed out waiting for the final fin/ack from rdma_disconnect().
 */
static void
sdp_dreq_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	/* Callout rescheduled, probably as a different timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK)
		goto out;
	if ((ssk->flags & SDP_DREQWAIT) == 0)
		goto out;
	ssk->flags &= ~SDP_DREQWAIT;
	ssk->flags |= SDP_DISCON;
	sdp_2msl_wait(ssk);
	ssk->qp_active = 0;
out:
	SDP_WUNLOCK(ssk);
}

/*
 * Received the final fin/ack.  Cancel the 2msl.
 */
void
sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk)
{
	sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n");
	ssk->flags &= ~SDP_DREQWAIT;
	sdp_2msl_wait(ssk);
}

static int
sdp_init_sock(struct socket *sk)
{
	struct sdp_sock *ssk = sdp_sk(sk);

	sdp_dbg(sk, "%s\n", __func__);

	callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED);
	TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk);
#ifdef SDP_ZCOPY
	INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout);
	ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
	ssk->tx_ring.rdma_inflight = NULL;
#endif
	atomic_set(&ssk->mseq_ack, 0);
	sdp_rx_ring_init(ssk);
	ssk->tx_ring.buffer = NULL;

	return (0);
}

/*
 * Allocate an sdp_sock for the socket and reserve socket buffer space.
 */
static int
sdp_attach(struct socket *so, int proto, struct thread *td)
{
	struct sdp_sock *ssk;
	int error;

	ssk = sdp_sk(so);
	KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so));
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		error = soreserve(so, sdp_sendspace, sdp_recvspace);
		if (error)
			return (error);
	}
	so->so_rcv.sb_flags |= SB_AUTOSIZE;
	so->so_snd.sb_flags |= SB_AUTOSIZE;
	ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO);
	if (ssk == NULL)
		return (ENOBUFS);
	rw_init(&ssk->lock, "sdpsock");
	ssk->socket = so;
	ssk->cred = crhold(so->so_cred);
	so->so_pcb = (caddr_t)ssk;
	sdp_init_sock(so);
	ssk->flags = 0;
	ssk->qp_active = 0;
	ssk->state = TCPS_CLOSED;
	SDP_LIST_WLOCK();
	LIST_INSERT_HEAD(&sdp_list, ssk, list);
	sdp_count++;
	SDP_LIST_WUNLOCK();
	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		so->so_linger = TCP_LINGERTIME;

	return (0);
}
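
/*
 * Illustrative only: from userland an SDP socket is created with the
 * AF_INET_SDP domain and then used exactly like a TCP socket, e.g.:
 *
 *	int fd = socket(AF_INET_SDP, SOCK_STREAM, 0);
 *
 *	connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 *
 * sdp_attach() above is what runs in the kernel for that socket(2) call.
 */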

/*
 * Detach SDP from the socket, potentially leaving it around for the
 * timewait to expire.
 */
static void
sdp_detach(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL"));
	ssk->socket->so_pcb = NULL;
	ssk->socket = NULL;
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT))
		SDP_WUNLOCK(ssk);
	else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT)
		sdp_pcbfree(ssk);
	else
		panic("sdp_detach: Unexpected state, ssk %p.\n", ssk);
}

/*
 * Allocate a local address for the socket.
 */
static int
sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (nam->sa_len != sizeof(*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EINVAL);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	error = sdp_pcbbind(ssk, nam, td->td_ucred);
out:
	SDP_WUNLOCK(ssk);

	return (error);
}

/*
 * Prepare to accept connections.
 */
static int
sdp_listen(struct socket *so, int backlog, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	if (error == 0 && ssk->lport == 0)
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
	SOCK_LOCK(so);
	if (error == 0)
		error = solisten_proto_check(so);
	if (error == 0) {
		solisten_proto(so, backlog);
		ssk->state = TCPS_LISTEN;
	}
	SOCK_UNLOCK(so);

out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		error = -rdma_listen(ssk->id, backlog);
	return (error);
}

/*
 * Initiate an SDP connection to nam.
 */
static int
sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_in src;
	struct socket *so;
	int error;

	so = ssk->socket;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->lport == 0) {
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
		if (error)
			return (error);
	}
	src.sin_family = AF_INET;
	src.sin_len = sizeof(src);
	bzero(&src.sin_zero, sizeof(src.sin_zero));
	src.sin_port = ssk->lport;
	src.sin_addr.s_addr = ssk->laddr;
	soisconnecting(so);
	SDP_WUNLOCK(ssk);
	error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam,
	    SDP_RESOLVE_TIMEOUT);
	SDP_WLOCK(ssk);
	if (error == 0)
		ssk->state = TCPS_SYN_SENT;

	return (error);
}

/*
 * Initiate an SDP connection.
 */
static int
sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (nam->sa_len != sizeof(*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EINVAL);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);
	if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0)
		return (error);
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED))
		error = EINVAL;
	else
		error = sdp_start_connect(ssk, nam, td);
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Drop an SDP socket, reporting the specified error.  If the connection
 * is synchronized, then send a RST to the peer.
 */
static struct sdp_sock *
sdp_drop(struct sdp_sock *ssk, int errno)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);
	so = ssk->socket;
	if (TCPS_HAVERCVDSYN(ssk->state))
		sdp_output_reset(ssk);
	if (errno == ETIMEDOUT && ssk->softerror)
		errno = ssk->softerror;
	so->so_error = errno;
	return (sdp_closed(ssk));
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
static void
sdp_usrclosed(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);

	switch (ssk->state) {
	case TCPS_LISTEN:
		ssk->state = TCPS_CLOSED;
		SDP_WUNLOCK(ssk);
		sdp_destroy_cma(ssk);
		SDP_WLOCK(ssk);
		/* FALLTHROUGH */
	case TCPS_CLOSED:
		ssk = sdp_closed(ssk);
		/*
		 * sdp_closed() should never return NULL here as the socket is
		 * still open.
		 */
		KASSERT(ssk != NULL,
		    ("sdp_usrclosed: sdp_closed() returned NULL"));
		break;

	case TCPS_SYN_SENT:
		/* FALLTHROUGH */
	case TCPS_SYN_RECEIVED:
		ssk->flags |= SDP_NEEDFIN;
		break;

	case TCPS_ESTABLISHED:
		ssk->flags |= SDP_NEEDFIN;
		ssk->state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		ssk->state = TCPS_LAST_ACK;
		break;
	}
	if (ssk->state >= TCPS_FIN_WAIT_2) {
		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
		if (ssk->state == TCPS_FIN_WAIT_2)
			sdp_2msl_wait(ssk);
		else
			soisdisconnected(ssk->socket);
	}
}

static void
sdp_output_disconnect(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT,
	    sdp_dreq_timeout, ssk);
	ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT;
	sdp_post_sends(ssk, M_NOWAIT);
}

/*
 * Initiate or continue a disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
static void
sdp_start_disconnect(struct sdp_sock *ssk)
{
	struct socket *so;
	int unread;

	so = ssk->socket;
	SDP_WLOCK_ASSERT(ssk);
	sdp_stop_keepalive_timer(so);
	/*
	 * Neither sdp_closed() nor sdp_drop() should return NULL, as the
	 * socket is still open.
	 */
	if (ssk->state < TCPS_ESTABLISHED) {
		ssk = sdp_closed(ssk);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_closed() returned NULL"));
	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
		ssk = sdp_drop(ssk, 0);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_drop() returned NULL"));
	} else {
		soisdisconnecting(so);
		unread = sbused(&so->so_rcv);
		sbflush(&so->so_rcv);
		sdp_usrclosed(ssk);
		if (!(ssk->flags & SDP_DROPPED)) {
			if (unread)
				sdp_output_reset(ssk);
			else
				sdp_output_disconnect(ssk);
		}
	}
}

/*
 * User initiated disconnect.
 */
static int
sdp_disconnect(struct socket *so)
{
	struct sdp_sock *ssk;
	int error = 0;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	sdp_start_disconnect(ssk);
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Accept a connection.  Essentially all the work is done at higher levels;
 * just return the address of the peer, storing through addr.
 *
 * XXX This is broken XXX
 *
 * The rationale for acquiring the sdp lock here is somewhat complicated,
 * and is described in detail in the commit log entry for r175612.  Acquiring
 * it delays an accept(2) racing with sonewconn(), which inserts the socket
 * before the address/port fields are initialized.  A better fix would
 * prevent the socket from being placed in the listen queue until all fields
 * are fully initialized.
 */
static int
sdp_accept(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk = NULL;
	struct in_addr addr;
	in_port_t port;
	int error;

	if (so->so_state & SS_ISDISCONNECTED)
		return (ECONNABORTED);

	port = 0;
	addr.s_addr = 0;
	error = 0;
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNABORTED;
		goto out;
	}
	port = ssk->fport;
	addr.s_addr = ssk->faddr;
out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		*nam = sdp_sockaddr(port, &addr);
	return (error);
}

/*
 * Mark the connection as being incapable of further output.
 */
static int
sdp_shutdown(struct socket *so)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	socantsendmore(so);
	sdp_usrclosed(ssk);
	if (!(ssk->flags & SDP_DROPPED))
		sdp_output_disconnect(ssk);

out:
	SDP_WUNLOCK(ssk);

	return (error);
}

static void
sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
{
	struct mbuf *n;
	int ncnt;

	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	KASSERT(mb->m_flags & M_PKTHDR,
	    ("sdp_append: %p Missing packet header.\n", mb));
	n = sb->sb_lastrecord;
	/*
	 * If the queue is empty just set all pointers and proceed.
	 */
	if (n == NULL) {
		sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
		for (; mb; mb = mb->m_next) {
			sb->sb_mbtail = mb;
			sballoc(sb, mb);
		}
		return;
	}
	/*
	 * Count the number of mbufs in the current tail.
	 */
	for (ncnt = 0; n->m_next; n = n->m_next)
		ncnt++;
	n = sb->sb_lastrecord;
	/*
	 * If the two chains can fit in a single sdp packet and
	 * the last record has not been sent yet (WRITABLE) coalesce
	 * them.  The lastrecord remains the same but we must strip the
	 * packet header and then let sbcompress do the hard part.
	 */
	if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES &&
	    n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE <
	    ssk->xmit_size_goal) {
		m_adj(mb, SDP_HEAD_SIZE);
		n->m_pkthdr.len += mb->m_pkthdr.len;
		n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
		m_demote(mb, 1, 0);
		sbcompress(sb, mb, sb->sb_mbtail);
		return;
	}
	/*
	 * Not compressible, just append to the end and adjust counters.
	 */
	sb->sb_lastrecord->m_flags |= M_PUSH;
	sb->sb_lastrecord->m_nextpkt = mb;
	sb->sb_lastrecord = mb;
	if (sb->sb_sndptr == NULL)
		sb->sb_sndptr = mb;
	for (; mb; mb = mb->m_next) {
		sb->sb_mbtail = mb;
		sballoc(sb, mb);
	}
}
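
/*
 * A worked example of the coalescing test above (numbers are illustrative;
 * the real values come from ssk->xmit_size_goal and sizeof the BSDH): with
 * an 8192-byte xmit_size_goal and a 16-byte SDP_HEAD_SIZE, a 4096-byte
 * last record and a new 2048-byte chain satisfy
 *
 *	4096 + 2048 - 16 < 8192
 *
 * so the new chain loses its prepended header and is folded into the
 * pending record instead of becoming a separate SDP packet.
 */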

/*
 * Do a send by putting data in output queue and updating urgent
 * marker if URG set.  Possibly send more data.  Unlike the other
 * pru_*() routines, the mbuf chains are our responsibility.  We
 * must either enqueue them or free them.  The other pru_* routines
 * generally are caller-frees.
 *
 * This comes from sendfile, normal sends will come from sdp_sosend().
 */
static int
sdp_send(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *nam, struct mbuf *control, struct thread *td)
{
	struct sdp_sock *ssk;
	struct mbuf *n;
	int error;
	int cnt;

	error = 0;
	ssk = sdp_sk(so);
	KASSERT(m->m_flags & M_PKTHDR,
	    ("sdp_send: %p no packet header", m));
	M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK);
	mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA;
	for (n = m, cnt = 0; n->m_next; n = n->m_next)
		cnt++;
	if (cnt > SDP_MAX_SEND_SGES) {
		n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES);
		if (n == NULL) {
			m_freem(m);
			return (EMSGSIZE);
		}
		m = n;
		for (cnt = 0; n->m_next; n = n->m_next)
			cnt++;
	}
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		if (control)
			m_freem(control);
		if (m)
			m_freem(m);
		error = ECONNRESET;
		goto out;
	}
	if (control) {
		/* SDP doesn't support control messages. */
		if (control->m_len) {
			m_freem(control);
			if (m)
				m_freem(m);
			error = EINVAL;
			goto out;
		}
		m_freem(control);	/* empty control, just free it */
	}
	if (!(flags & PRUS_OOB)) {
		SOCKBUF_LOCK(&so->so_snd);
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		if (flags & PRUS_EOF) {
			/*
			 * Close the send side of the connection after
			 * the data is sent.
			 */
			socantsendmore(so);
			sdp_usrclosed(ssk);
			if (!(ssk->flags & SDP_DROPPED))
				sdp_output_disconnect(ssk);
		} else if (!(ssk->flags & SDP_DROPPED) &&
		    !(flags & PRUS_MORETOCOME))
			sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	} else {
		SOCKBUF_LOCK(&so->so_snd);
		if (sbspace(&so->so_snd) < -512) {
			SOCKBUF_UNLOCK(&so->so_snd);
			m_freem(m);
			error = ENOBUFS;
			goto out;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		m->m_flags |= M_URG | M_PUSH;
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	}
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)

/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
static int
sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	struct sdp_sock *ssk;
	long space, resid;
	int atomic;
	int error;
	int copy;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	atomic = top != NULL;
	if (control != NULL) {
		if (control->m_len) {
			m_freem(control);
			if (top)
				m_freem(top);
			return (EINVAL);
		}
		m_freem(control);
		control = NULL;
	}
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}
	if (td != NULL)
		td->td_ru.ru_msgsnd++;

	ssk = sdp_sk(so);
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = ENOTCONN;
			goto release;
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid &&
		    (atomic || space < so->so_snd.sb_lowat)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(&so->so_snd);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				copy = min(space,
				    ssk->xmit_size_goal - SDP_HEAD_SIZE);
				top = m_uiotombuf(uio, M_WAITOK, copy,
				    0, M_PKTHDR |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					/* only possible error */
					error = EFAULT;
					goto release;
				}
				space -= resid - uio->uio_resid;
				resid = uio->uio_resid;
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date after dropping the
			 * socket lock.
			 */
			error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * Set EOF on the last send if the user specified
			 * MSG_EOF.
			 */
			    ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, NULL, td);
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_WAITOK, MT_DATA);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
		error = uiomove(mtod(m, void *),
		    (int)min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Optimized version of soreceive() for stream (SDP) sockets.
 */
static int
sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int len = 0, error = 0, flags, oresid;
	struct sockbuf *sb;
	struct mbuf *m, *n = NULL;
	struct sdp_sock *ssk;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		return (EINVAL);
	if (flagsp != NULL)
		flags = *flagsp & ~MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

	sb = &so->so_rcv;
	ssk = sdp_sk(so);

	/* Prevent other readers from entering the socket. */
	error = sblock(sb, SBLOCKWAIT(flags));
	if (error)
		goto out;
	SOCKBUF_LOCK(sb);

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0) {
		error = EINVAL;
		goto out;
	}
	oresid = uio->uio_resid;

	/* We will never ever get anything unless we are connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		/* When disconnecting there may still be some data left. */
		if (sbavail(sb))
			goto deliver;
		if (!(so->so_state & SS_ISDISCONNECTED))
			error = ENOTCONN;
		goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sbavail(sb) == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	/* Abort if socket has reported problems. */
	if (so->so_error) {
		if (sbavail(sb))
			goto deliver;
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any. */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sbavail(sb))
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sbavail(sb) && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sbavail(sb) >= sb->sb_lowat ||
	     sbavail(sb) >= uio->uio_resid ||
	     sbavail(sb) >= sb->sb_hiwat)) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(sb);
	if (error)
		goto out;
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sbavail(sb));
	if (mp0 != NULL) {
		/* Dequeue as many mbufs as possible. */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			for (*mp0 = m = sb->sb_mb;
			     m != NULL && m->m_len <= len;
			     m = m->m_next) {
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				n = m;
			}
			sb->sb_mb = m;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
			n->m_next = NULL;
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf. */
			else
				uio->uio_resid -= m->m_len;
			if (*mp0 != NULL)
				n->m_next = m;
			else
				*mp0 = m;
			if (*mp0 == NULL) {
				error = ENOBUFS;
				goto out;
			}
		}
	} else {
		/* NB: Must unlock socket buffer as uiomove may sleep. */
		SOCKBUF_UNLOCK(sb);
		error = m_mbuftouio(uio, sb->sb_mb, len);
		SOCKBUF_LOCK(sb);
		if (error)
			goto out;
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);

	/*
	 * Remove the delivered data from the socket buffer unless we
	 * were only peeking.
	 */
	if (!(flags & MSG_PEEK)) {
		if (len > 0)
			sbdrop_locked(sb, len);

		/* Notify protocol that we drained some data. */
		SOCKBUF_UNLOCK(sb);
		SDP_WLOCK(ssk);
		sdp_do_posts(ssk);
		SDP_WUNLOCK(ssk);
		SOCKBUF_LOCK(sb);
	}

	/*
	 * For MSG_WAITALL we may have to loop again and wait for
	 * more data to come in.
	 */
	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
		goto restart;
out:
	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
	SOCKBUF_UNLOCK(sb);
	sbunlock(sb);
	return (error);
}

/*
 * Abort is used to tear down a connection, typically while it is sitting
 * in the accept queue.
 */
void
sdp_abort(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_drop(ssk, ECONNABORTED);
	KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X",
	    ssk, ssk->flags));
	SDP_WUNLOCK(ssk);
}

/*
 * Close an SDP socket and initiate a friendly disconnect.
 */
static void
sdp_close(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_start_disconnect(ssk);

	/*
	 * If we've still not dropped let the socket layer know we're
	 * holding on to the socket and pcb for a while.
	 */
	if (!(ssk->flags & SDP_DROPPED)) {
		SOCK_LOCK(so);
		so->so_state |= SS_PROTOREF;
		SOCK_UNLOCK(so);
		ssk->flags |= SDP_SOCKREF;
	}
	SDP_WUNLOCK(ssk);
}

/*
 * User requests out-of-band data.
 */
static int
sdp_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (!rx_ring_trylock(&ssk->rx_ring)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	if ((so->so_oobmark == 0 &&
	     (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
	    so->so_options & SO_OOBINLINE ||
	    ssk->oobflags & SDP_HADOOB) {
		error = EINVAL;
		goto out;
	}
	if ((ssk->oobflags & SDP_HAVEOOB) == 0) {
		error = EWOULDBLOCK;
		goto out;
	}
	m->m_len = 1;
	*mtod(m, caddr_t) = ssk->iobc;
	if ((flags & MSG_PEEK) == 0)
		ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB);
out:
	rx_ring_unlock(&ssk->rx_ring);
	SDP_WUNLOCK(ssk);
	return (error);
}
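
/*
 * Illustrative only: the single out-of-band byte stashed in ssk->iobc by
 * sdp_urg() below is what userland retrieves through the usual OOB path,
 * assuming SO_OOBINLINE is not set:
 *
 *	char c;
 *
 *	recv(fd, &c, 1, MSG_OOB);
 */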

void
sdp_urg(struct sdp_sock *ssk, struct mbuf *mb)
{
	struct mbuf *m;
	struct socket *so;

	so = ssk->socket;
	if (so == NULL)
		return;

	so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1;
	sohasoutofband(so);
	ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB);
	if (!(so->so_options & SO_OOBINLINE)) {
		for (m = mb; m->m_next != NULL; m = m->m_next);
		ssk->iobc = *(mtod(m, char *) + m->m_len - 1);
		ssk->oobflags |= SDP_HAVEOOB;
		m->m_len--;
		mb->m_pkthdr.len--;
	}
}

/*
 * Notify an SDP socket of an asynchronous error.
 *
 * Do not wake up user since there currently is no mechanism for
 * reporting soft errors (yet - a kqueue filter may be added).
 */
struct sdp_sock *
sdp_notify(struct sdp_sock *ssk, int error)
{

	SDP_WLOCK_ASSERT(ssk);

	if ((ssk->flags & SDP_TIMEWAIT) ||
	    (ssk->flags & SDP_DROPPED))
		return (ssk);

	/*
	 * Ignore some errors if we are hooked up.
	 */
	if (ssk->state == TCPS_ESTABLISHED &&
	    (error == EHOSTUNREACH || error == ENETUNREACH ||
	     error == EHOSTDOWN))
		return (ssk);
	ssk->softerror = error;
	return (sdp_drop(ssk, error));
}

static void
sdp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
{
	struct in_addr faddr;

	faddr = ((struct sockaddr_in *)sa)->sin_addr;
	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
		return;

	sdp_pcbnotifyall(faddr, inetctlerrmap[cmd], sdp_notify);
}

static int
sdp_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
    struct thread *td)
{
	return (EOPNOTSUPP);
}

static void
sdp_keepalive_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		return;
	/* Callout rescheduled as a different kind of timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->flags & SDP_DROPPED ||
	    (ssk->socket->so_options & SO_KEEPALIVE) == 0)
		goto out;
	sdp_post_keepalive(ssk);
	callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
	    sdp_keepalive_timeout, ssk);
out:
	SDP_WUNLOCK(ssk);
}

void
sdp_start_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	if (!callout_pending(&ssk->keep2msl))
		callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
		    sdp_keepalive_timeout, ssk);
}

static void
sdp_stop_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	callout_stop(&ssk->keep2msl);
}

/*
 * sdp_ctloutput() must drop the sdp_sock lock before performing copyin on
 * socket option arguments.  When it re-acquires the lock after the copy, it
 * has to revalidate that the connection is still valid for the socket
 * option.
 */
#define	SDP_WLOCK_RECHECK(ssk) do {					\
	SDP_WLOCK(ssk);							\
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {		\
		SDP_WUNLOCK(ssk);					\
		return (ECONNRESET);					\
	}								\
} while (0)

static int
sdp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int	error, opt, optval;
	struct sdp_sock *ssk;

	error = 0;
	ssk = sdp_sk(so);
	if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) {
		SDP_WLOCK(ssk);
		if (so->so_options & SO_KEEPALIVE)
			sdp_start_keepalive_timer(so);
		else
			sdp_stop_keepalive_timer(so);
		SDP_WUNLOCK(ssk);
	}
	if (sopt->sopt_level != IPPROTO_TCP)
		return (error);

	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}

	switch (sopt->sopt_dir) {
	case SOPT_SET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			SDP_WUNLOCK(ssk);
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				return (error);

			SDP_WLOCK_RECHECK(ssk);
			opt = SDP_NODELAY;
			if (optval)
				ssk->flags |= opt;
			else
				ssk->flags &= ~opt;
			sdp_do_posts(ssk);
			SDP_WUNLOCK(ssk);
			break;

		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;

	case SOPT_GET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			optval = ssk->flags & SDP_NODELAY;
			SDP_WUNLOCK(ssk);
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;
		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}
#undef SDP_WLOCK_RECHECK

int sdp_mod_count = 0;
int sdp_mod_usec = 0;

void
sdp_set_default_moderation(struct sdp_sock *ssk)
{
	struct ib_cq_attr attr;

	if (sdp_mod_count <= 0 || sdp_mod_usec <= 0)
		return;
	memset(&attr, 0, sizeof(attr));
	attr.moderation.cq_count = sdp_mod_count;
	attr.moderation.cq_period = sdp_mod_usec;

	ib_modify_cq(ssk->rx_ring.cq, &attr, IB_CQ_MODERATION);
}
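
/*
 * Illustrative only: completion-queue moderation trades latency for
 * interrupt rate.  For example, sdp_mod_count = 10 and sdp_mod_usec = 200
 * would ask the HCA to raise a receive completion event only once 10
 * completions have accumulated or 200 microseconds have elapsed since the
 * first one, whichever comes first.
 */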

static void
sdp_dev_add(struct ib_device *device)
{
	struct ib_fmr_pool_param param;
	struct sdp_device *sdp_dev;

	sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO);
	sdp_dev->pd = ib_alloc_pd(device);
	if (IS_ERR(sdp_dev->pd))
		goto out_pd;
	sdp_dev->mr = ib_get_dma_mr(sdp_dev->pd, IB_ACCESS_LOCAL_WRITE);
	if (IS_ERR(sdp_dev->mr))
		goto out_mr;
	memset(&param, 0, sizeof param);
	param.max_pages_per_fmr = SDP_FMR_SIZE;
	param.page_shift = PAGE_SHIFT;
	param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ);
	param.pool_size = SDP_FMR_POOL_SIZE;
	param.dirty_watermark = SDP_FMR_DIRTY_SIZE;
	param.cache = 1;
	sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &param);
	if (IS_ERR(sdp_dev->fmr_pool))
		goto out_fmr;
	ib_set_client_data(device, &sdp_client, sdp_dev);
	return;

out_fmr:
	ib_dereg_mr(sdp_dev->mr);
out_mr:
	ib_dealloc_pd(sdp_dev->pd);
out_pd:
	free(sdp_dev, M_SDP);
}

static void
sdp_dev_rem(struct ib_device *device)
{
	struct sdp_device *sdp_dev;
	struct sdp_sock *ssk;

	SDP_LIST_WLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		if (ssk->ib_device != device)
			continue;
		SDP_WLOCK(ssk);
		if ((ssk->flags & SDP_DESTROY) == 0)
			ssk = sdp_notify(ssk, ECONNRESET);
		if (ssk)
			SDP_WUNLOCK(ssk);
	}
	SDP_LIST_WUNLOCK();
	/*
	 * XXX Do I need to wait between these two?
	 */
	sdp_dev = ib_get_client_data(device, &sdp_client);
	if (!sdp_dev)
		return;
	ib_flush_fmr_pool(sdp_dev->fmr_pool);
	ib_destroy_fmr_pool(sdp_dev->fmr_pool);
	ib_dereg_mr(sdp_dev->mr);
	ib_dealloc_pd(sdp_dev->pd);
	free(sdp_dev, M_SDP);
}

struct ib_client sdp_client =
    { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem };

static int
sdp_pcblist(SYSCTL_HANDLER_ARGS)
{
	int error, n, i;
	struct sdp_sock *ssk;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == NULL) {
		n = sdp_count;
		n += imax(n / 8, 10);
		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
		return (0);
	}

	if (req->newptr != NULL)
		return (EPERM);

	/*
	 * OK, now we're committed to doing something.
	 */
	SDP_LIST_RLOCK();
	n = sdp_count;
	SDP_LIST_RUNLOCK();

	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
	    + n * sizeof(struct xtcpcb));
	if (error != 0)
		return (error);

	xig.xig_len = sizeof xig;
	xig.xig_count = n;
	xig.xig_gen = 0;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error)
		return (error);

	SDP_LIST_RLOCK();
	for (ssk = LIST_FIRST(&sdp_list), i = 0;
	    ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) {
		struct xtcpcb xt;

		SDP_RLOCK(ssk);
		if (ssk->flags & SDP_TIMEWAIT) {
			if (ssk->cred != NULL)
				error = cr_cansee(req->td->td_ucred,
				    ssk->cred);
			else
				error = EINVAL;	/* Skip this inp. */
		} else if (ssk->socket)
			error = cr_canseesocket(req->td->td_ucred,
			    ssk->socket);
		else
			error = EINVAL;
		if (error) {
			error = 0;
			goto next;
		}

		bzero(&xt, sizeof(xt));
		xt.xt_len = sizeof xt;
		xt.xt_inp.inp_gencnt = 0;
		xt.xt_inp.inp_vflag = INP_IPV4;
		memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr));
		xt.xt_inp.inp_lport = ssk->lport;
		memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr));
		xt.xt_inp.inp_fport = ssk->fport;
		xt.xt_tp.t_state = ssk->state;
		if (ssk->socket != NULL)
			sotoxsocket(ssk->socket, &xt.xt_socket);
		else
			bzero(&xt.xt_socket, sizeof xt.xt_socket);
		xt.xt_socket.xso_protocol = IPPROTO_TCP;
		SDP_RUNLOCK(ssk);
		error = SYSCTL_OUT(req, &xt, sizeof xt);
		if (error)
			break;
		i++;
		continue;
next:
		SDP_RUNLOCK(ssk);
	}
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		xig.xig_gen = 0;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = sdp_count;
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}
	SDP_LIST_RUNLOCK();
	return (error);
}

static SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW, 0, "SDP");

SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist,
    CTLFLAG_RD | CTLTYPE_STRUCT, 0, 0, sdp_pcblist, "S,xtcpcb",
    "List of active SDP connections");

static void
sdp_zone_change(void *tag)
{

	uma_zone_set_max(sdp_zone, maxsockets);
}

static void
sdp_init(void)
{

	LIST_INIT(&sdp_list);
	sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(sdp_zone, maxsockets);
	EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL,
	    EVENTHANDLER_PRI_ANY);
	rx_comp_wq = create_singlethread_workqueue("rx_comp_wq");
	ib_register_client(&sdp_client);
}

extern struct domain sdpdomain;

struct pr_usrreqs sdp_usrreqs = {
	.pru_abort =		sdp_abort,
	.pru_accept =		sdp_accept,
	.pru_attach =		sdp_attach,
	.pru_bind =		sdp_bind,
	.pru_connect =		sdp_connect,
	.pru_control =		sdp_control,
	.pru_detach =		sdp_detach,
	.pru_disconnect =	sdp_disconnect,
	.pru_listen =		sdp_listen,
	.pru_peeraddr =		sdp_getpeeraddr,
	.pru_rcvoob =		sdp_rcvoob,
	.pru_send =		sdp_send,
	.pru_sosend =		sdp_sosend,
	.pru_soreceive =	sdp_sorecv,
	.pru_shutdown =		sdp_shutdown,
	.pru_sockaddr =		sdp_getsockaddr,
	.pru_close =		sdp_close,
};

struct protosw sdpsw[] = {
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&sdpdomain,
	.pr_protocol =		IPPROTO_IP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&sdpdomain,
	.pr_protocol =		IPPROTO_TCP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
};

struct domain sdpdomain = {
	.dom_family =		AF_INET_SDP,
	.dom_name =		"SDP",
	.dom_init =		sdp_init,
	.dom_protosw =		sdpsw,
	.dom_protoswNPROTOSW =	&sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])],
};

DOMAIN_SET(sdp);

int sdp_debug_level = 1;
int sdp_data_debug_level = 0;
1969