tcp_usrreq.c revision 309108
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1993
3 *	The Regents of the University of California.
4 * Copyright (c) 2006-2007 Robert N. M. Watson
5 * Copyright (c) 2010-2011 Juniper Networks, Inc.
6 * All rights reserved.
7 *
8 * Portions of this software were developed by Robert N. M. Watson under
9 * contract to Juniper Networks, Inc.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 4. Neither the name of the University nor the names of its contributors
20 *    may be used to endorse or promote products derived from this software
21 *    without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 *	From: @(#)tcp_usrreq.c	8.2 (Berkeley) 1/3/94
36 */
37
38#include <sys/cdefs.h>
39__FBSDID("$FreeBSD: stable/10/sys/netinet/tcp_usrreq.c 309108 2016-11-24 14:48:46Z jch $");
40
41#include "opt_ddb.h"
42#include "opt_inet.h"
43#include "opt_inet6.h"
44#include "opt_tcpdebug.h"
45
46#include <sys/param.h>
47#include <sys/systm.h>
48#include <sys/limits.h>
49#include <sys/malloc.h>
50#include <sys/kernel.h>
51#include <sys/sysctl.h>
52#include <sys/mbuf.h>
53#ifdef INET6
54#include <sys/domain.h>
55#endif /* INET6 */
56#include <sys/socket.h>
57#include <sys/socketvar.h>
58#include <sys/protosw.h>
59#include <sys/proc.h>
60#include <sys/jail.h>
61#include <sys/syslog.h>
62
63#ifdef DDB
64#include <ddb/ddb.h>
65#endif
66
67#include <net/if.h>
68#include <net/route.h>
69#include <net/vnet.h>
70
71#include <netinet/cc.h>
72#include <netinet/in.h>
73#include <netinet/in_pcb.h>
74#include <netinet/in_systm.h>
75#include <netinet/in_var.h>
76#include <netinet/ip_var.h>
77#ifdef INET6
78#include <netinet/ip6.h>
79#include <netinet6/in6_pcb.h>
80#include <netinet6/ip6_var.h>
81#include <netinet6/scope6_var.h>
82#endif
83#ifdef TCP_RFC7413
84#include <netinet/tcp_fastopen.h>
85#endif
86#include <netinet/tcp_fsm.h>
87#include <netinet/tcp_seq.h>
88#include <netinet/tcp_timer.h>
89#include <netinet/tcp_var.h>
90#include <netinet/tcpip.h>
91#ifdef TCPDEBUG
92#include <netinet/tcp_debug.h>
93#endif
94#ifdef TCP_OFFLOAD
95#include <netinet/tcp_offload.h>
96#endif
97
98/*
99 * TCP protocol interface to socket abstraction.
100 */
/*
 * Forward declarations of the static helpers referenced by the pru_*()
 * handlers in this file.  The address-family specific connect routines
 * are only declared when the matching option is compiled in.
 */
static int	tcp_attach(struct socket *so);
#ifdef INET
static int	tcp_connect(struct tcpcb *tp, struct sockaddr *nam,
		    struct thread *td);
#endif /* INET */
#ifdef INET6
static int	tcp6_connect(struct tcpcb *tp, struct sockaddr *nam,
		    struct thread *td);
#endif /* INET6 */
static void	tcp_disconnect(struct tcpcb *tp);
static void	tcp_usrclosed(struct tcpcb *tp);
static void	tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti);
113
/*
 * Tracing hooks used by the pru_*() handlers.  They rely on the caller
 * having locals named `so' and `tp' in scope:
 *
 *   TCPDEBUG0       declares `ostate' (use as a declaration);
 *   TCPDEBUG1()     records tp->t_state in `ostate' (0 when tp is NULL);
 *   TCPDEBUG2(req)  calls tcp_trace() when tp is set and the socket has
 *                   SO_DEBUG enabled.
 *
 * TCPDEBUG2 is wrapped in do { } while (0) so that it behaves as a single
 * statement and cannot capture an `else' that follows it at the call site.
 * Without TCPDEBUG all three expand to nothing.
 */
#ifdef TCPDEBUG
#define	TCPDEBUG0	int ostate = 0
#define	TCPDEBUG1()	ostate = tp ? tp->t_state : 0
#define	TCPDEBUG2(req)	do {						\
	if (tp && (so->so_options & SO_DEBUG))				\
		tcp_trace(TA_USER, ostate, tp, 0, 0, req);		\
} while (0)
#else
#define	TCPDEBUG0
#define	TCPDEBUG1()
#define	TCPDEBUG2(req)
#endif
124
125/*
126 * TCP attaches to socket via pru_attach(), reserving space,
127 * and an internet control block.
128 */
129static int
130tcp_usr_attach(struct socket *so, int proto, struct thread *td)
131{
132	struct inpcb *inp;
133	struct tcpcb *tp = NULL;
134	int error;
135	TCPDEBUG0;
136
137	inp = sotoinpcb(so);
138	KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL"));
139	TCPDEBUG1();
140
141	error = tcp_attach(so);
142	if (error)
143		goto out;
144
145	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
146		so->so_linger = TCP_LINGERTIME;
147
148	inp = sotoinpcb(so);
149	tp = intotcpcb(inp);
150out:
151	TCPDEBUG2(PRU_ATTACH);
152	return error;
153}
154
155/*
156 * tcp_detach is called when the socket layer loses its final reference
157 * to the socket, be it a file descriptor reference, a reference from TCP,
158 * etc.  At this point, there is only one case in which we will keep around
159 * inpcb state: time wait.
160 *
161 * This function can probably be re-absorbed back into tcp_usr_detach() now
162 * that there is a single detach path.
163 */
164static void
165tcp_detach(struct socket *so, struct inpcb *inp)
166{
167	struct tcpcb *tp;
168
169	INP_INFO_LOCK_ASSERT(&V_tcbinfo);
170	INP_WLOCK_ASSERT(inp);
171
172	KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp"));
173	KASSERT(inp->inp_socket == so, ("tcp_detach: inp_socket != so"));
174
175	tp = intotcpcb(inp);
176
177	if (inp->inp_flags & INP_TIMEWAIT) {
178		/*
179		 * There are two cases to handle: one in which the time wait
180		 * state is being discarded (INP_DROPPED), and one in which
181		 * this connection will remain in timewait.  In the former,
182		 * it is time to discard all state (except tcptw, which has
183		 * already been discarded by the timewait close code, which
184		 * should be further up the call stack somewhere).  In the
185		 * latter case, we detach from the socket, but leave the pcb
186		 * present until timewait ends.
187		 *
188		 * XXXRW: Would it be cleaner to free the tcptw here?
189		 *
190		 * Astute question indeed, from twtcp perspective there are
191		 * three cases to consider:
192		 *
193		 * #1 tcp_detach is called at tcptw creation time by
194		 *  tcp_twstart, then do not discard the newly created tcptw
195		 *  and leave inpcb present until timewait ends
196		 * #2 tcp_detach is called at timewait end (or reuse) by
197		 *  tcp_twclose, then the tcptw has already been discarded
198		 *  (or reused) and inpcb is freed here
199		 * #3 tcp_detach is called() after timewait ends (or reuse)
200		 *  (e.g. by soclose), then tcptw has already been discarded
201		 *  (or reused) and inpcb is freed here
202		 *
203		 *  In all three cases the tcptw should not be freed here.
204		 */
205		if (inp->inp_flags & INP_DROPPED) {
206			in_pcbdetach(inp);
207			if (__predict_true(tp == NULL)) {
208				in_pcbfree(inp);
209			} else {
210				/*
211				 * This case should not happen as in TIMEWAIT
212				 * state the inp should not be destroyed before
213				 * its tcptw.  If INVARIANTS is defined, panic.
214				 */
215#ifdef INVARIANTS
216				panic("%s: Panic before an inp double-free: "
217				    "INP_TIMEWAIT && INP_DROPPED && tp != NULL"
218				    , __func__);
219#else
220				log(LOG_ERR, "%s: Avoid an inp double-free: "
221				    "INP_TIMEWAIT && INP_DROPPED && tp != NULL"
222				    , __func__);
223#endif
224				INP_WUNLOCK(inp);
225			}
226		} else {
227			in_pcbdetach(inp);
228			INP_WUNLOCK(inp);
229		}
230	} else {
231		/*
232		 * If the connection is not in timewait, we consider two
233		 * two conditions: one in which no further processing is
234		 * necessary (dropped || embryonic), and one in which TCP is
235		 * not yet done, but no longer requires the socket, so the
236		 * pcb will persist for the time being.
237		 *
238		 * XXXRW: Does the second case still occur?
239		 */
240		if (inp->inp_flags & INP_DROPPED ||
241		    tp->t_state < TCPS_SYN_SENT) {
242			tcp_discardcb(tp);
243			in_pcbdetach(inp);
244			in_pcbfree(inp);
245		} else {
246			in_pcbdetach(inp);
247			INP_WUNLOCK(inp);
248		}
249	}
250}
251
252/*
253 * pru_detach() detaches the TCP protocol from the socket.
254 * If the protocol state is non-embryonic, then can't
255 * do this directly: have to initiate a pru_disconnect(),
256 * which may finish later; embryonic TCB's can just
257 * be discarded here.
258 */
259static void
260tcp_usr_detach(struct socket *so)
261{
262	struct inpcb *inp;
263	int rlock = 0;
264
265	inp = sotoinpcb(so);
266	KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL"));
267	if (!INP_INFO_WLOCKED(&V_tcbinfo)) {
268		INP_INFO_RLOCK(&V_tcbinfo);
269		rlock = 1;
270	}
271	INP_WLOCK(inp);
272	KASSERT(inp->inp_socket != NULL,
273	    ("tcp_usr_detach: inp_socket == NULL"));
274	tcp_detach(so, inp);
275	if (rlock)
276		INP_INFO_RUNLOCK(&V_tcbinfo);
277}
278
279#ifdef INET
280/*
281 * Give the socket an address.
282 */
283static int
284tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
285{
286	int error = 0;
287	struct inpcb *inp;
288	struct tcpcb *tp = NULL;
289	struct sockaddr_in *sinp;
290
291	sinp = (struct sockaddr_in *)nam;
292	if (nam->sa_len != sizeof (*sinp))
293		return (EINVAL);
294	/*
295	 * Must check for multicast addresses and disallow binding
296	 * to them.
297	 */
298	if (sinp->sin_family == AF_INET &&
299	    IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
300		return (EAFNOSUPPORT);
301
302	TCPDEBUG0;
303	inp = sotoinpcb(so);
304	KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL"));
305	INP_WLOCK(inp);
306	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
307		error = EINVAL;
308		goto out;
309	}
310	tp = intotcpcb(inp);
311	TCPDEBUG1();
312	INP_HASH_WLOCK(&V_tcbinfo);
313	error = in_pcbbind(inp, nam, td->td_ucred);
314	INP_HASH_WUNLOCK(&V_tcbinfo);
315out:
316	TCPDEBUG2(PRU_BIND);
317	INP_WUNLOCK(inp);
318
319	return (error);
320}
321#endif /* INET */
322
323#ifdef INET6
324static int
325tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
326{
327	int error = 0;
328	struct inpcb *inp;
329	struct tcpcb *tp = NULL;
330	struct sockaddr_in6 *sin6p;
331
332	sin6p = (struct sockaddr_in6 *)nam;
333	if (nam->sa_len != sizeof (*sin6p))
334		return (EINVAL);
335	/*
336	 * Must check for multicast addresses and disallow binding
337	 * to them.
338	 */
339	if (sin6p->sin6_family == AF_INET6 &&
340	    IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))
341		return (EAFNOSUPPORT);
342
343	TCPDEBUG0;
344	inp = sotoinpcb(so);
345	KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL"));
346	INP_WLOCK(inp);
347	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
348		error = EINVAL;
349		goto out;
350	}
351	tp = intotcpcb(inp);
352	TCPDEBUG1();
353	INP_HASH_WLOCK(&V_tcbinfo);
354	inp->inp_vflag &= ~INP_IPV4;
355	inp->inp_vflag |= INP_IPV6;
356#ifdef INET
357	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
358		if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr))
359			inp->inp_vflag |= INP_IPV4;
360		else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
361			struct sockaddr_in sin;
362
363			in6_sin6_2_sin(&sin, sin6p);
364			inp->inp_vflag |= INP_IPV4;
365			inp->inp_vflag &= ~INP_IPV6;
366			error = in_pcbbind(inp, (struct sockaddr *)&sin,
367			    td->td_ucred);
368			INP_HASH_WUNLOCK(&V_tcbinfo);
369			goto out;
370		}
371	}
372#endif
373	error = in6_pcbbind(inp, nam, td->td_ucred);
374	INP_HASH_WUNLOCK(&V_tcbinfo);
375out:
376	TCPDEBUG2(PRU_BIND);
377	INP_WUNLOCK(inp);
378	return (error);
379}
380#endif /* INET6 */
381
382#ifdef INET
383/*
384 * Prepare to accept connections.
385 */
386static int
387tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
388{
389	int error = 0;
390	struct inpcb *inp;
391	struct tcpcb *tp = NULL;
392
393	TCPDEBUG0;
394	inp = sotoinpcb(so);
395	KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL"));
396	INP_WLOCK(inp);
397	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
398		error = EINVAL;
399		goto out;
400	}
401	tp = intotcpcb(inp);
402	TCPDEBUG1();
403	SOCK_LOCK(so);
404	error = solisten_proto_check(so);
405	INP_HASH_WLOCK(&V_tcbinfo);
406	if (error == 0 && inp->inp_lport == 0)
407		error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
408	INP_HASH_WUNLOCK(&V_tcbinfo);
409	if (error == 0) {
410		tcp_state_change(tp, TCPS_LISTEN);
411		solisten_proto(so, backlog);
412#ifdef TCP_OFFLOAD
413		if ((so->so_options & SO_NO_OFFLOAD) == 0)
414			tcp_offload_listen_start(tp);
415#endif
416	}
417	SOCK_UNLOCK(so);
418
419#ifdef TCP_RFC7413
420	if (tp->t_flags & TF_FASTOPEN)
421		tp->t_tfo_pending = tcp_fastopen_alloc_counter();
422#endif
423out:
424	TCPDEBUG2(PRU_LISTEN);
425	INP_WUNLOCK(inp);
426	return (error);
427}
428#endif /* INET */
429
430#ifdef INET6
431static int
432tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
433{
434	int error = 0;
435	struct inpcb *inp;
436	struct tcpcb *tp = NULL;
437
438	TCPDEBUG0;
439	inp = sotoinpcb(so);
440	KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL"));
441	INP_WLOCK(inp);
442	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
443		error = EINVAL;
444		goto out;
445	}
446	tp = intotcpcb(inp);
447	TCPDEBUG1();
448	SOCK_LOCK(so);
449	error = solisten_proto_check(so);
450	INP_HASH_WLOCK(&V_tcbinfo);
451	if (error == 0 && inp->inp_lport == 0) {
452		inp->inp_vflag &= ~INP_IPV4;
453		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
454			inp->inp_vflag |= INP_IPV4;
455		error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
456	}
457	INP_HASH_WUNLOCK(&V_tcbinfo);
458	if (error == 0) {
459		tcp_state_change(tp, TCPS_LISTEN);
460		solisten_proto(so, backlog);
461#ifdef TCP_OFFLOAD
462		if ((so->so_options & SO_NO_OFFLOAD) == 0)
463			tcp_offload_listen_start(tp);
464#endif
465	}
466	SOCK_UNLOCK(so);
467
468#ifdef TCP_RFC7413
469	if (tp->t_flags & TF_FASTOPEN)
470		tp->t_tfo_pending = tcp_fastopen_alloc_counter();
471#endif
472out:
473	TCPDEBUG2(PRU_LISTEN);
474	INP_WUNLOCK(inp);
475	return (error);
476}
477#endif /* INET6 */
478
479#ifdef INET
/*
 * Initiate connection to peer.
 * Create a template for use in transmissions on this connection.
 * Enter SYN_SENT state, and mark socket as connecting.
 * Start keep-alive timer, and seed output sequence space.
 * Send initial segment on connection.
 *
 * Returns EINVAL for a wrongly sized address, EAFNOSUPPORT for an AF_INET
 * multicast destination, the prison_remote_ip4() error if that check
 * fails, EADDRINUSE for a pcb in timewait, ECONNREFUSED for a dropped pcb,
 * and otherwise the result of tcp_connect() / tcp_output().
 */
static int
tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_in *sinp = (struct sockaddr_in *)nam;
	struct tcpcb *tp = NULL;
	struct inpcb *inp;
	int error;
	TCPDEBUG0;

	if (nam->sa_len != sizeof(*sinp))
		return (EINVAL);

	/* Must disallow TCP ``connections'' to multicast addresses. */
	if (sinp->sin_family == AF_INET &&
	    IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
		return (EAFNOSUPPORT);

	error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr);
	if (error != 0)
		return (error);

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL"));
	INP_WLOCK(inp);
	if ((inp->inp_flags & INP_TIMEWAIT) != 0)
		error = EADDRINUSE;
	else if ((inp->inp_flags & INP_DROPPED) != 0)
		error = ECONNREFUSED;
	if (error != 0)
		goto out;

	tp = intotcpcb(inp);
	TCPDEBUG1();
	error = tcp_connect(tp, nam, td);
	if (error != 0)
		goto out;
#ifdef TCP_OFFLOAD
	/* If an offload device takes the connection, we are done. */
	if (registered_toedevs > 0 &&
	    (so->so_options & SO_NO_OFFLOAD) == 0) {
		error = tcp_offload_connect(so, nam);
		if (error == 0)
			goto out;
	}
#endif
	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
	error = tcp_output(tp);
out:
	TCPDEBUG2(PRU_CONNECT);
	INP_WUNLOCK(inp);
	return (error);
}
536#endif /* INET */
537
538#ifdef INET6
/*
 * pru_connect() handler for IPv6: initiate a connection to the peer.
 *
 * A v4-mapped destination is converted to a sockaddr_in and connected
 * through tcp_connect() unless the socket is IPV6_V6ONLY (EINVAL); any
 * other destination goes through tcp6_connect().
 *
 * Multicast destinations are rejected with EAFNOSUPPORT, including IPv4
 * multicast addresses presented in v4-mapped form.  The pcb's address
 * family flags are only switched after the destination has passed the
 * multicast and jail checks, so a refused connect() leaves them as they
 * were.
 *
 * Returns EINVAL for a wrongly sized address, EADDRINUSE for a pcb in
 * timewait, ECONNREFUSED for a dropped pcb, the prison_remote_ip{4,6}()
 * error if that check fails, and otherwise the result of the connect
 * routine / tcp_output().
 */
static int
tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	struct sockaddr_in6 *sin6p;

	TCPDEBUG0;

	sin6p = (struct sockaddr_in6 *)nam;
	if (nam->sa_len != sizeof(*sin6p))
		return (EINVAL);
	/*
	 * Must disallow TCP ``connections'' to multicast addresses.
	 */
	if (sin6p->sin6_family == AF_INET6
	    && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))
		return (EAFNOSUPPORT);

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL"));
	INP_WLOCK(inp);
	if (inp->inp_flags & INP_TIMEWAIT) {
		error = EADDRINUSE;
		goto out;
	}
	if (inp->inp_flags & INP_DROPPED) {
		error = ECONNREFUSED;
		goto out;
	}
	tp = intotcpcb(inp);
	TCPDEBUG1();
#ifdef INET
	/*
	 * XXXRW: Some confusion: V4/V6 flags relate to binding, and
	 * therefore probably require the hash lock, which isn't held here.
	 * Is this a significant problem?
	 */
	if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
		struct sockaddr_in sin;

		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
			error = EINVAL;
			goto out;
		}

		in6_sin6_2_sin(&sin, sin6p);
		/*
		 * The IPv6 multicast test above does not match a v4-mapped
		 * IPv4 multicast address; apply the same check
		 * tcp_usr_connect() performs.
		 */
		if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) {
			error = EAFNOSUPPORT;
			goto out;
		}
		if ((error = prison_remote_ip4(td->td_ucred,
		    &sin.sin_addr)) != 0)
			goto out;
		/* Destination accepted: this is now an IPv4 pcb. */
		inp->inp_vflag |= INP_IPV4;
		inp->inp_vflag &= ~INP_IPV6;
		if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0)
			goto out;
#ifdef TCP_OFFLOAD
		if (registered_toedevs > 0 &&
		    (so->so_options & SO_NO_OFFLOAD) == 0 &&
		    (error = tcp_offload_connect(so, nam)) == 0)
			goto out;
#endif
		/*
		 * NOTE(review): unlike the native IPv6 path below and
		 * tcp_usr_connect(), this path does not arm TT_KEEP with
		 * TP_KEEPINIT() before tcp_output() -- confirm whether
		 * that is intentional.
		 */
		error = tcp_output(tp);
		goto out;
	}
#endif
	if ((error = prison_remote_ip6(td->td_ucred, &sin6p->sin6_addr)) != 0)
		goto out;
	/* Destination accepted: this is now an IPv6 pcb. */
	inp->inp_vflag &= ~INP_IPV4;
	inp->inp_vflag |= INP_IPV6;
	inp->inp_inc.inc_flags |= INC_ISIPV6;
	if ((error = tcp6_connect(tp, nam, td)) != 0)
		goto out;
#ifdef TCP_OFFLOAD
	if (registered_toedevs > 0 &&
	    (so->so_options & SO_NO_OFFLOAD) == 0 &&
	    (error = tcp_offload_connect(so, nam)) == 0)
		goto out;
#endif
	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
	error = tcp_output(tp);

out:
	TCPDEBUG2(PRU_CONNECT);
	INP_WUNLOCK(inp);
	return (error);
}
625#endif /* INET6 */
626
627/*
628 * Initiate disconnect from peer.
629 * If connection never passed embryonic stage, just drop;
630 * else if don't need to let data drain, then can just drop anyways,
631 * else have to begin TCP shutdown process: mark socket disconnecting,
632 * drain unread data, state switch to reflect user close, and
633 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
634 * when peer sends FIN and acks ours.
635 *
636 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
637 */
638static int
639tcp_usr_disconnect(struct socket *so)
640{
641	struct inpcb *inp;
642	struct tcpcb *tp = NULL;
643	int error = 0;
644
645	TCPDEBUG0;
646	INP_INFO_RLOCK(&V_tcbinfo);
647	inp = sotoinpcb(so);
648	KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL"));
649	INP_WLOCK(inp);
650	if (inp->inp_flags & INP_TIMEWAIT)
651		goto out;
652	if (inp->inp_flags & INP_DROPPED) {
653		error = ECONNRESET;
654		goto out;
655	}
656	tp = intotcpcb(inp);
657	TCPDEBUG1();
658	tcp_disconnect(tp);
659out:
660	TCPDEBUG2(PRU_DISCONNECT);
661	INP_WUNLOCK(inp);
662	INP_INFO_RUNLOCK(&V_tcbinfo);
663	return (error);
664}
665
666#ifdef INET
667/*
668 * Accept a connection.  Essentially all the work is done at higher levels;
669 * just return the address of the peer, storing through addr.
670 */
671static int
672tcp_usr_accept(struct socket *so, struct sockaddr **nam)
673{
674	int error = 0;
675	struct inpcb *inp = NULL;
676	struct tcpcb *tp = NULL;
677	struct in_addr addr;
678	in_port_t port = 0;
679	TCPDEBUG0;
680
681	if (so->so_state & SS_ISDISCONNECTED)
682		return (ECONNABORTED);
683
684	inp = sotoinpcb(so);
685	KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL"));
686	INP_WLOCK(inp);
687	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
688		error = ECONNABORTED;
689		goto out;
690	}
691	tp = intotcpcb(inp);
692	TCPDEBUG1();
693
694	/*
695	 * We inline in_getpeeraddr and COMMON_END here, so that we can
696	 * copy the data of interest and defer the malloc until after we
697	 * release the lock.
698	 */
699	port = inp->inp_fport;
700	addr = inp->inp_faddr;
701
702out:
703	TCPDEBUG2(PRU_ACCEPT);
704	INP_WUNLOCK(inp);
705	if (error == 0)
706		*nam = in_sockaddr(port, &addr);
707	return error;
708}
709#endif /* INET */
710
711#ifdef INET6
712static int
713tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
714{
715	struct inpcb *inp = NULL;
716	int error = 0;
717	struct tcpcb *tp = NULL;
718	struct in_addr addr;
719	struct in6_addr addr6;
720	in_port_t port = 0;
721	int v4 = 0;
722	TCPDEBUG0;
723
724	if (so->so_state & SS_ISDISCONNECTED)
725		return (ECONNABORTED);
726
727	inp = sotoinpcb(so);
728	KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL"));
729	INP_INFO_RLOCK(&V_tcbinfo);
730	INP_WLOCK(inp);
731	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
732		error = ECONNABORTED;
733		goto out;
734	}
735	tp = intotcpcb(inp);
736	TCPDEBUG1();
737
738	/*
739	 * We inline in6_mapped_peeraddr and COMMON_END here, so that we can
740	 * copy the data of interest and defer the malloc until after we
741	 * release the lock.
742	 */
743	if (inp->inp_vflag & INP_IPV4) {
744		v4 = 1;
745		port = inp->inp_fport;
746		addr = inp->inp_faddr;
747	} else {
748		port = inp->inp_fport;
749		addr6 = inp->in6p_faddr;
750	}
751
752out:
753	TCPDEBUG2(PRU_ACCEPT);
754	INP_WUNLOCK(inp);
755	INP_INFO_RUNLOCK(&V_tcbinfo);
756	if (error == 0) {
757		if (v4)
758			*nam = in6_v4mapsin6_sockaddr(port, &addr);
759		else
760			*nam = in6_sockaddr(port, &addr6);
761	}
762	return error;
763}
764#endif /* INET6 */
765
766/*
767 * Mark the connection as being incapable of further output.
768 */
769static int
770tcp_usr_shutdown(struct socket *so)
771{
772	int error = 0;
773	struct inpcb *inp;
774	struct tcpcb *tp = NULL;
775
776	TCPDEBUG0;
777	INP_INFO_RLOCK(&V_tcbinfo);
778	inp = sotoinpcb(so);
779	KASSERT(inp != NULL, ("inp == NULL"));
780	INP_WLOCK(inp);
781	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
782		error = ECONNRESET;
783		goto out;
784	}
785	tp = intotcpcb(inp);
786	TCPDEBUG1();
787	socantsendmore(so);
788	tcp_usrclosed(tp);
789	if (!(inp->inp_flags & INP_DROPPED))
790		error = tcp_output(tp);
791
792out:
793	TCPDEBUG2(PRU_SHUTDOWN);
794	INP_WUNLOCK(inp);
795	INP_INFO_RUNLOCK(&V_tcbinfo);
796
797	return (error);
798}
799
800/*
801 * After a receive, possibly send window update to peer.
802 */
803static int
804tcp_usr_rcvd(struct socket *so, int flags)
805{
806	struct inpcb *inp;
807	struct tcpcb *tp = NULL;
808	int error = 0;
809
810	TCPDEBUG0;
811	inp = sotoinpcb(so);
812	KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL"));
813	INP_WLOCK(inp);
814	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
815		error = ECONNRESET;
816		goto out;
817	}
818	tp = intotcpcb(inp);
819	TCPDEBUG1();
820#ifdef TCP_RFC7413
821	/*
822	 * For passively-created TFO connections, don't attempt a window
823	 * update while still in SYN_RECEIVED as this may trigger an early
824	 * SYN|ACK.  It is preferable to have the SYN|ACK be sent along with
825	 * application response data, or failing that, when the DELACK timer
826	 * expires.
827	 */
828	if ((tp->t_flags & TF_FASTOPEN) &&
829	    (tp->t_state == TCPS_SYN_RECEIVED))
830		goto out;
831#endif
832#ifdef TCP_OFFLOAD
833	if (tp->t_flags & TF_TOE)
834		tcp_offload_rcvd(tp);
835	else
836#endif
837	tcp_output(tp);
838
839out:
840	TCPDEBUG2(PRU_RCVD);
841	INP_WUNLOCK(inp);
842	return (error);
843}
844
845/*
846 * Do a send by putting data in output queue and updating urgent
847 * marker if URG set.  Possibly send more data.  Unlike the other
848 * pru_*() routines, the mbuf chains are our responsibility.  We
849 * must either enqueue them or free them.  The other pru_* routines
850 * generally are caller-frees.
851 */
852static int
853tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
854    struct sockaddr *nam, struct mbuf *control, struct thread *td)
855{
856	int error = 0;
857	struct inpcb *inp;
858	struct tcpcb *tp = NULL;
859#ifdef INET6
860	int isipv6;
861#endif
862	TCPDEBUG0;
863
864	/*
865	 * We require the pcbinfo lock if we will close the socket as part of
866	 * this call.
867	 */
868	if (flags & PRUS_EOF)
869		INP_INFO_RLOCK(&V_tcbinfo);
870	inp = sotoinpcb(so);
871	KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
872	INP_WLOCK(inp);
873	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
874		if (control)
875			m_freem(control);
876		if (m)
877			m_freem(m);
878		error = ECONNRESET;
879		goto out;
880	}
881#ifdef INET6
882	isipv6 = nam && nam->sa_family == AF_INET6;
883#endif /* INET6 */
884	tp = intotcpcb(inp);
885	TCPDEBUG1();
886	if (control) {
887		/* TCP doesn't do control messages (rights, creds, etc) */
888		if (control->m_len) {
889			m_freem(control);
890			if (m)
891				m_freem(m);
892			error = EINVAL;
893			goto out;
894		}
895		m_freem(control);	/* empty control, just free it */
896	}
897	if (!(flags & PRUS_OOB)) {
898		sbappendstream(&so->so_snd, m);
899		if (nam && tp->t_state < TCPS_SYN_SENT) {
900			/*
901			 * Do implied connect if not yet connected,
902			 * initialize window to default value, and
903			 * initialize maxseg/maxopd using peer's cached
904			 * MSS.
905			 */
906#ifdef INET6
907			if (isipv6)
908				error = tcp6_connect(tp, nam, td);
909#endif /* INET6 */
910#if defined(INET6) && defined(INET)
911			else
912#endif
913#ifdef INET
914				error = tcp_connect(tp, nam, td);
915#endif
916			if (error)
917				goto out;
918			tp->snd_wnd = TTCP_CLIENT_SND_WND;
919			tcp_mss(tp, -1);
920		}
921		if (flags & PRUS_EOF) {
922			/*
923			 * Close the send side of the connection after
924			 * the data is sent.
925			 */
926			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
927			socantsendmore(so);
928			tcp_usrclosed(tp);
929		}
930		if (!(inp->inp_flags & INP_DROPPED)) {
931			if (flags & PRUS_MORETOCOME)
932				tp->t_flags |= TF_MORETOCOME;
933			error = tcp_output(tp);
934			if (flags & PRUS_MORETOCOME)
935				tp->t_flags &= ~TF_MORETOCOME;
936		}
937	} else {
938		/*
939		 * XXXRW: PRUS_EOF not implemented with PRUS_OOB?
940		 */
941		SOCKBUF_LOCK(&so->so_snd);
942		if (sbspace(&so->so_snd) < -512) {
943			SOCKBUF_UNLOCK(&so->so_snd);
944			m_freem(m);
945			error = ENOBUFS;
946			goto out;
947		}
948		/*
949		 * According to RFC961 (Assigned Protocols),
950		 * the urgent pointer points to the last octet
951		 * of urgent data.  We continue, however,
952		 * to consider it to indicate the first octet
953		 * of data past the urgent section.
954		 * Otherwise, snd_up should be one lower.
955		 */
956		sbappendstream_locked(&so->so_snd, m);
957		SOCKBUF_UNLOCK(&so->so_snd);
958		if (nam && tp->t_state < TCPS_SYN_SENT) {
959			/*
960			 * Do implied connect if not yet connected,
961			 * initialize window to default value, and
962			 * initialize maxseg/maxopd using peer's cached
963			 * MSS.
964			 */
965#ifdef INET6
966			if (isipv6)
967				error = tcp6_connect(tp, nam, td);
968#endif /* INET6 */
969#if defined(INET6) && defined(INET)
970			else
971#endif
972#ifdef INET
973				error = tcp_connect(tp, nam, td);
974#endif
975			if (error)
976				goto out;
977			tp->snd_wnd = TTCP_CLIENT_SND_WND;
978			tcp_mss(tp, -1);
979		}
980		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
981		tp->t_flags |= TF_FORCEDATA;
982		error = tcp_output(tp);
983		tp->t_flags &= ~TF_FORCEDATA;
984	}
985out:
986	TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB :
987		  ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
988	INP_WUNLOCK(inp);
989	if (flags & PRUS_EOF)
990		INP_INFO_RUNLOCK(&V_tcbinfo);
991	return (error);
992}
993
994/*
995 * Abort the TCP.  Drop the connection abruptly.
996 */
997static void
998tcp_usr_abort(struct socket *so)
999{
1000	struct inpcb *inp;
1001	struct tcpcb *tp = NULL;
1002	TCPDEBUG0;
1003
1004	inp = sotoinpcb(so);
1005	KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL"));
1006
1007	INP_INFO_RLOCK(&V_tcbinfo);
1008	INP_WLOCK(inp);
1009	KASSERT(inp->inp_socket != NULL,
1010	    ("tcp_usr_abort: inp_socket == NULL"));
1011
1012	/*
1013	 * If we still have full TCP state, and we're not dropped, drop.
1014	 */
1015	if (!(inp->inp_flags & INP_TIMEWAIT) &&
1016	    !(inp->inp_flags & INP_DROPPED)) {
1017		tp = intotcpcb(inp);
1018		TCPDEBUG1();
1019		tcp_drop(tp, ECONNABORTED);
1020		TCPDEBUG2(PRU_ABORT);
1021	}
1022	if (!(inp->inp_flags & INP_DROPPED)) {
1023		SOCK_LOCK(so);
1024		so->so_state |= SS_PROTOREF;
1025		SOCK_UNLOCK(so);
1026		inp->inp_flags |= INP_SOCKREF;
1027	}
1028	INP_WUNLOCK(inp);
1029	INP_INFO_RUNLOCK(&V_tcbinfo);
1030}
1031
1032/*
1033 * TCP socket is closed.  Start friendly disconnect.
1034 */
1035static void
1036tcp_usr_close(struct socket *so)
1037{
1038	struct inpcb *inp;
1039	struct tcpcb *tp = NULL;
1040	TCPDEBUG0;
1041
1042	inp = sotoinpcb(so);
1043	KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL"));
1044
1045	INP_INFO_RLOCK(&V_tcbinfo);
1046	INP_WLOCK(inp);
1047	KASSERT(inp->inp_socket != NULL,
1048	    ("tcp_usr_close: inp_socket == NULL"));
1049
1050	/*
1051	 * If we still have full TCP state, and we're not dropped, initiate
1052	 * a disconnect.
1053	 */
1054	if (!(inp->inp_flags & INP_TIMEWAIT) &&
1055	    !(inp->inp_flags & INP_DROPPED)) {
1056		tp = intotcpcb(inp);
1057		TCPDEBUG1();
1058		tcp_disconnect(tp);
1059		TCPDEBUG2(PRU_CLOSE);
1060	}
1061	if (!(inp->inp_flags & INP_DROPPED)) {
1062		SOCK_LOCK(so);
1063		so->so_state |= SS_PROTOREF;
1064		SOCK_UNLOCK(so);
1065		inp->inp_flags |= INP_SOCKREF;
1066	}
1067	INP_WUNLOCK(inp);
1068	INP_INFO_RUNLOCK(&V_tcbinfo);
1069}
1070
1071/*
1072 * Receive out-of-band data.
1073 */
1074static int
1075tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
1076{
1077	int error = 0;
1078	struct inpcb *inp;
1079	struct tcpcb *tp = NULL;
1080
1081	TCPDEBUG0;
1082	inp = sotoinpcb(so);
1083	KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL"));
1084	INP_WLOCK(inp);
1085	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
1086		error = ECONNRESET;
1087		goto out;
1088	}
1089	tp = intotcpcb(inp);
1090	TCPDEBUG1();
1091	if ((so->so_oobmark == 0 &&
1092	     (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
1093	    so->so_options & SO_OOBINLINE ||
1094	    tp->t_oobflags & TCPOOB_HADDATA) {
1095		error = EINVAL;
1096		goto out;
1097	}
1098	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
1099		error = EWOULDBLOCK;
1100		goto out;
1101	}
1102	m->m_len = 1;
1103	*mtod(m, caddr_t) = tp->t_iobc;
1104	if ((flags & MSG_PEEK) == 0)
1105		tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
1106
1107out:
1108	TCPDEBUG2(PRU_RCVOOB);
1109	INP_WUNLOCK(inp);
1110	return (error);
1111}
1112
1113#ifdef INET
1114struct pr_usrreqs tcp_usrreqs = {
1115	.pru_abort =		tcp_usr_abort,
1116	.pru_accept =		tcp_usr_accept,
1117	.pru_attach =		tcp_usr_attach,
1118	.pru_bind =		tcp_usr_bind,
1119	.pru_connect =		tcp_usr_connect,
1120	.pru_control =		in_control,
1121	.pru_detach =		tcp_usr_detach,
1122	.pru_disconnect =	tcp_usr_disconnect,
1123	.pru_listen =		tcp_usr_listen,
1124	.pru_peeraddr =		in_getpeeraddr,
1125	.pru_rcvd =		tcp_usr_rcvd,
1126	.pru_rcvoob =		tcp_usr_rcvoob,
1127	.pru_send =		tcp_usr_send,
1128	.pru_shutdown =		tcp_usr_shutdown,
1129	.pru_sockaddr =		in_getsockaddr,
1130	.pru_sosetlabel =	in_pcbsosetlabel,
1131	.pru_close =		tcp_usr_close,
1132};
1133#endif /* INET */
1134
1135#ifdef INET6
1136struct pr_usrreqs tcp6_usrreqs = {
1137	.pru_abort =		tcp_usr_abort,
1138	.pru_accept =		tcp6_usr_accept,
1139	.pru_attach =		tcp_usr_attach,
1140	.pru_bind =		tcp6_usr_bind,
1141	.pru_connect =		tcp6_usr_connect,
1142	.pru_control =		in6_control,
1143	.pru_detach =		tcp_usr_detach,
1144	.pru_disconnect =	tcp_usr_disconnect,
1145	.pru_listen =		tcp6_usr_listen,
1146	.pru_peeraddr =		in6_mapped_peeraddr,
1147	.pru_rcvd =		tcp_usr_rcvd,
1148	.pru_rcvoob =		tcp_usr_rcvoob,
1149	.pru_send =		tcp_usr_send,
1150	.pru_shutdown =		tcp_usr_shutdown,
1151	.pru_sockaddr =		in6_mapped_sockaddr,
1152	.pru_sosetlabel =	in_pcbsosetlabel,
1153	.pru_close =		tcp_usr_close,
1154};
1155#endif /* INET6 */
1156
1157#ifdef INET
1158/*
1159 * Common subroutine to open a TCP connection to remote host specified
1160 * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
1161 * port number if needed.  Call in_pcbconnect_setup to do the routing and
1162 * to choose a local host address (interface).  If there is an existing
1163 * incarnation of the same connection in TIME-WAIT state and if the remote
1164 * host was sending CC options and if the connection duration was < MSL, then
1165 * truncate the previous TIME-WAIT state and proceed.
1166 * Initialize connection parameters and enter SYN-SENT state.
1167 */
1168static int
1169tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
1170{
1171	struct inpcb *inp = tp->t_inpcb, *oinp;
1172	struct socket *so = inp->inp_socket;
1173	struct in_addr laddr;
1174	u_short lport;
1175	int error;
1176
1177	INP_WLOCK_ASSERT(inp);
1178	INP_HASH_WLOCK(&V_tcbinfo);
1179
1180	if (inp->inp_lport == 0) {
1181		error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
1182		if (error)
1183			goto out;
1184	}
1185
1186	/*
1187	 * Cannot simply call in_pcbconnect, because there might be an
1188	 * earlier incarnation of this same connection still in
1189	 * TIME_WAIT state, creating an ADDRINUSE error.
1190	 */
1191	laddr = inp->inp_laddr;
1192	lport = inp->inp_lport;
1193	error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport,
1194	    &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred);
1195	if (error && oinp == NULL)
1196		goto out;
1197	if (oinp) {
1198		error = EADDRINUSE;
1199		goto out;
1200	}
1201	inp->inp_laddr = laddr;
1202	in_pcbrehash(inp);
1203	INP_HASH_WUNLOCK(&V_tcbinfo);
1204
1205	/*
1206	 * Compute window scaling to request:
1207	 * Scale to fit into sweet spot.  See tcp_syncache.c.
1208	 * XXX: This should move to tcp_output().
1209	 */
1210	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
1211	    (TCP_MAXWIN << tp->request_r_scale) < sb_max)
1212		tp->request_r_scale++;
1213
1214	soisconnecting(so);
1215	TCPSTAT_INC(tcps_connattempt);
1216	tcp_state_change(tp, TCPS_SYN_SENT);
1217	tp->iss = tcp_new_isn(tp);
1218	tcp_sendseqinit(tp);
1219
1220	return 0;
1221
1222out:
1223	INP_HASH_WUNLOCK(&V_tcbinfo);
1224	return (error);
1225}
1226#endif /* INET */
1227
1228#ifdef INET6
1229static int
1230tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
1231{
1232	struct inpcb *inp = tp->t_inpcb, *oinp;
1233	struct socket *so = inp->inp_socket;
1234	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
1235	struct in6_addr addr6;
1236	int error;
1237
1238	INP_WLOCK_ASSERT(inp);
1239	INP_HASH_WLOCK(&V_tcbinfo);
1240
1241	if (inp->inp_lport == 0) {
1242		error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
1243		if (error)
1244			goto out;
1245	}
1246
1247	/*
1248	 * Cannot simply call in_pcbconnect, because there might be an
1249	 * earlier incarnation of this same connection still in
1250	 * TIME_WAIT state, creating an ADDRINUSE error.
1251	 * in6_pcbladdr() also handles scope zone IDs.
1252	 *
1253	 * XXXRW: We wouldn't need to expose in6_pcblookup_hash_locked()
1254	 * outside of in6_pcb.c if there were an in6_pcbconnect_setup().
1255	 */
1256	error = in6_pcbladdr(inp, nam, &addr6);
1257	if (error)
1258		goto out;
1259	oinp = in6_pcblookup_hash_locked(inp->inp_pcbinfo,
1260				  &sin6->sin6_addr, sin6->sin6_port,
1261				  IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
1262				  ? &addr6
1263				  : &inp->in6p_laddr,
1264				  inp->inp_lport,  0, NULL);
1265	if (oinp) {
1266		error = EADDRINUSE;
1267		goto out;
1268	}
1269	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
1270		inp->in6p_laddr = addr6;
1271	inp->in6p_faddr = sin6->sin6_addr;
1272	inp->inp_fport = sin6->sin6_port;
1273	/* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
1274	inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
1275	if (inp->inp_flags & IN6P_AUTOFLOWLABEL)
1276		inp->inp_flow |=
1277		    (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
1278	in_pcbrehash(inp);
1279	INP_HASH_WUNLOCK(&V_tcbinfo);
1280
1281	/* Compute window scaling to request.  */
1282	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
1283	    (TCP_MAXWIN << tp->request_r_scale) < sb_max)
1284		tp->request_r_scale++;
1285
1286	soisconnecting(so);
1287	TCPSTAT_INC(tcps_connattempt);
1288	tcp_state_change(tp, TCPS_SYN_SENT);
1289	tp->iss = tcp_new_isn(tp);
1290	tcp_sendseqinit(tp);
1291
1292	return 0;
1293
1294out:
1295	INP_HASH_WUNLOCK(&V_tcbinfo);
1296	return error;
1297}
1298#endif /* INET6 */
1299
1300/*
1301 * Export TCP internal state information via a struct tcp_info, based on the
1302 * Linux 2.6 API.  Not ABI compatible as our constants are mapped differently
1303 * (TCP state machine, etc).  We export all information using FreeBSD-native
1304 * constants -- for example, the numeric values for tcpi_state will differ
1305 * from Linux.
1306 */
1307static void
1308tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti)
1309{
1310
1311	INP_WLOCK_ASSERT(tp->t_inpcb);
1312	bzero(ti, sizeof(*ti));
1313
1314	ti->tcpi_state = tp->t_state;
1315	if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
1316		ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
1317	if (tp->t_flags & TF_SACK_PERMIT)
1318		ti->tcpi_options |= TCPI_OPT_SACK;
1319	if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
1320		ti->tcpi_options |= TCPI_OPT_WSCALE;
1321		ti->tcpi_snd_wscale = tp->snd_scale;
1322		ti->tcpi_rcv_wscale = tp->rcv_scale;
1323	}
1324
1325	ti->tcpi_rto = tp->t_rxtcur * tick;
1326	ti->tcpi_last_data_recv = (long)(ticks - (int)tp->t_rcvtime) * tick;
1327	ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT;
1328	ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT;
1329
1330	ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
1331	ti->tcpi_snd_cwnd = tp->snd_cwnd;
1332
1333	/*
1334	 * FreeBSD-specific extension fields for tcp_info.
1335	 */
1336	ti->tcpi_rcv_space = tp->rcv_wnd;
1337	ti->tcpi_rcv_nxt = tp->rcv_nxt;
1338	ti->tcpi_snd_wnd = tp->snd_wnd;
1339	ti->tcpi_snd_bwnd = 0;		/* Unused, kept for compat. */
1340	ti->tcpi_snd_nxt = tp->snd_nxt;
1341	ti->tcpi_snd_mss = tp->t_maxseg;
1342	ti->tcpi_rcv_mss = tp->t_maxseg;
1343	if (tp->t_flags & TF_TOE)
1344		ti->tcpi_options |= TCPI_OPT_TOE;
1345	ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
1346	ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
1347	ti->tcpi_snd_zerowin = tp->t_sndzerowin;
1348}
1349
1350/*
1351 * tcp_ctloutput() must drop the inpcb lock before performing copyin on
1352 * socket option arguments.  When it re-acquires the lock after the copy, it
1353 * has to revalidate that the connection is still valid for the socket
1354 * option.
1355 */
1356#define INP_WLOCK_RECHECK(inp) do {					\
1357	INP_WLOCK(inp);							\
1358	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {		\
1359		INP_WUNLOCK(inp);					\
1360		return (ECONNRESET);					\
1361	}								\
1362	tp = intotcpcb(inp);						\
1363} while(0)
1364
1365int
1366tcp_ctloutput(struct socket *so, struct sockopt *sopt)
1367{
1368	int	error, opt, optval;
1369	u_int	ui;
1370	struct	inpcb *inp;
1371	struct	tcpcb *tp;
1372	struct	tcp_info ti;
1373	char buf[TCP_CA_NAME_MAX];
1374	struct cc_algo *algo;
1375
1376	error = 0;
1377	inp = sotoinpcb(so);
1378	KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL"));
1379	INP_WLOCK(inp);
1380	if (sopt->sopt_level != IPPROTO_TCP) {
1381#ifdef INET6
1382		if (inp->inp_vflag & INP_IPV6PROTO) {
1383			INP_WUNLOCK(inp);
1384			error = ip6_ctloutput(so, sopt);
1385		}
1386#endif /* INET6 */
1387#if defined(INET6) && defined(INET)
1388		else
1389#endif
1390#ifdef INET
1391		{
1392			INP_WUNLOCK(inp);
1393			error = ip_ctloutput(so, sopt);
1394		}
1395#endif
1396		return (error);
1397	}
1398	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
1399		INP_WUNLOCK(inp);
1400		return (ECONNRESET);
1401	}
1402
1403	switch (sopt->sopt_dir) {
1404	case SOPT_SET:
1405		switch (sopt->sopt_name) {
1406#ifdef TCP_SIGNATURE
1407		case TCP_MD5SIG:
1408			INP_WUNLOCK(inp);
1409			error = sooptcopyin(sopt, &optval, sizeof optval,
1410			    sizeof optval);
1411			if (error)
1412				return (error);
1413
1414			INP_WLOCK_RECHECK(inp);
1415			if (optval > 0)
1416				tp->t_flags |= TF_SIGNATURE;
1417			else
1418				tp->t_flags &= ~TF_SIGNATURE;
1419			goto unlock_and_done;
1420#endif /* TCP_SIGNATURE */
1421
1422		case TCP_NODELAY:
1423		case TCP_NOOPT:
1424			INP_WUNLOCK(inp);
1425			error = sooptcopyin(sopt, &optval, sizeof optval,
1426			    sizeof optval);
1427			if (error)
1428				return (error);
1429
1430			INP_WLOCK_RECHECK(inp);
1431			switch (sopt->sopt_name) {
1432			case TCP_NODELAY:
1433				opt = TF_NODELAY;
1434				break;
1435			case TCP_NOOPT:
1436				opt = TF_NOOPT;
1437				break;
1438			default:
1439				opt = 0; /* dead code to fool gcc */
1440				break;
1441			}
1442
1443			if (optval)
1444				tp->t_flags |= opt;
1445			else
1446				tp->t_flags &= ~opt;
1447unlock_and_done:
1448#ifdef TCP_OFFLOAD
1449			if (tp->t_flags & TF_TOE) {
1450				tcp_offload_ctloutput(tp, sopt->sopt_dir,
1451				    sopt->sopt_name);
1452			}
1453#endif
1454			INP_WUNLOCK(inp);
1455			break;
1456
1457		case TCP_NOPUSH:
1458			INP_WUNLOCK(inp);
1459			error = sooptcopyin(sopt, &optval, sizeof optval,
1460			    sizeof optval);
1461			if (error)
1462				return (error);
1463
1464			INP_WLOCK_RECHECK(inp);
1465			if (optval)
1466				tp->t_flags |= TF_NOPUSH;
1467			else if (tp->t_flags & TF_NOPUSH) {
1468				tp->t_flags &= ~TF_NOPUSH;
1469				if (TCPS_HAVEESTABLISHED(tp->t_state))
1470					error = tcp_output(tp);
1471			}
1472			goto unlock_and_done;
1473
1474		case TCP_MAXSEG:
1475			INP_WUNLOCK(inp);
1476			error = sooptcopyin(sopt, &optval, sizeof optval,
1477			    sizeof optval);
1478			if (error)
1479				return (error);
1480
1481			INP_WLOCK_RECHECK(inp);
1482			if (optval > 0 && optval <= tp->t_maxseg &&
1483			    optval + 40 >= V_tcp_minmss)
1484				tp->t_maxseg = optval;
1485			else
1486				error = EINVAL;
1487			goto unlock_and_done;
1488
1489		case TCP_INFO:
1490			INP_WUNLOCK(inp);
1491			error = EINVAL;
1492			break;
1493
1494		case TCP_CONGESTION:
1495			INP_WUNLOCK(inp);
1496			bzero(buf, sizeof(buf));
1497			error = sooptcopyin(sopt, &buf, sizeof(buf), 1);
1498			if (error)
1499				break;
1500			INP_WLOCK_RECHECK(inp);
1501			/*
1502			 * Return EINVAL if we can't find the requested cc algo.
1503			 */
1504			error = EINVAL;
1505			CC_LIST_RLOCK();
1506			STAILQ_FOREACH(algo, &cc_list, entries) {
1507				if (strncmp(buf, algo->name, TCP_CA_NAME_MAX)
1508				    == 0) {
1509					/* We've found the requested algo. */
1510					error = 0;
1511					/*
1512					 * We hold a write lock over the tcb
1513					 * so it's safe to do these things
1514					 * without ordering concerns.
1515					 */
1516					if (CC_ALGO(tp)->cb_destroy != NULL)
1517						CC_ALGO(tp)->cb_destroy(tp->ccv);
1518					CC_ALGO(tp) = algo;
1519					/*
1520					 * If something goes pear shaped
1521					 * initialising the new algo,
1522					 * fall back to newreno (which
1523					 * does not require initialisation).
1524					 */
1525					if (algo->cb_init != NULL)
1526						if (algo->cb_init(tp->ccv) > 0) {
1527							CC_ALGO(tp) = &newreno_cc_algo;
1528							/*
1529							 * The only reason init
1530							 * should fail is
1531							 * because of malloc.
1532							 */
1533							error = ENOMEM;
1534						}
1535					break; /* Break the STAILQ_FOREACH. */
1536				}
1537			}
1538			CC_LIST_RUNLOCK();
1539			goto unlock_and_done;
1540
1541		case TCP_KEEPIDLE:
1542		case TCP_KEEPINTVL:
1543		case TCP_KEEPINIT:
1544			INP_WUNLOCK(inp);
1545			error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
1546			if (error)
1547				return (error);
1548
1549			if (ui > (UINT_MAX / hz)) {
1550				error = EINVAL;
1551				break;
1552			}
1553			ui *= hz;
1554
1555			INP_WLOCK_RECHECK(inp);
1556			switch (sopt->sopt_name) {
1557			case TCP_KEEPIDLE:
1558				tp->t_keepidle = ui;
1559				/*
1560				 * XXX: better check current remaining
1561				 * timeout and "merge" it with new value.
1562				 */
1563				if ((tp->t_state > TCPS_LISTEN) &&
1564				    (tp->t_state <= TCPS_CLOSING))
1565					tcp_timer_activate(tp, TT_KEEP,
1566					    TP_KEEPIDLE(tp));
1567				break;
1568			case TCP_KEEPINTVL:
1569				tp->t_keepintvl = ui;
1570				if ((tp->t_state == TCPS_FIN_WAIT_2) &&
1571				    (TP_MAXIDLE(tp) > 0))
1572					tcp_timer_activate(tp, TT_2MSL,
1573					    TP_MAXIDLE(tp));
1574				break;
1575			case TCP_KEEPINIT:
1576				tp->t_keepinit = ui;
1577				if (tp->t_state == TCPS_SYN_RECEIVED ||
1578				    tp->t_state == TCPS_SYN_SENT)
1579					tcp_timer_activate(tp, TT_KEEP,
1580					    TP_KEEPINIT(tp));
1581				break;
1582			}
1583			goto unlock_and_done;
1584
1585		case TCP_KEEPCNT:
1586			INP_WUNLOCK(inp);
1587			error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
1588			if (error)
1589				return (error);
1590
1591			INP_WLOCK_RECHECK(inp);
1592			tp->t_keepcnt = ui;
1593			if ((tp->t_state == TCPS_FIN_WAIT_2) &&
1594			    (TP_MAXIDLE(tp) > 0))
1595				tcp_timer_activate(tp, TT_2MSL,
1596				    TP_MAXIDLE(tp));
1597			goto unlock_and_done;
1598
1599#ifdef TCP_RFC7413
1600		case TCP_FASTOPEN:
1601			INP_WUNLOCK(inp);
1602			if (!V_tcp_fastopen_enabled)
1603				return (EPERM);
1604
1605			error = sooptcopyin(sopt, &optval, sizeof optval,
1606			    sizeof optval);
1607			if (error)
1608				return (error);
1609
1610			INP_WLOCK_RECHECK(inp);
1611			if (optval) {
1612				tp->t_flags |= TF_FASTOPEN;
1613				if ((tp->t_state == TCPS_LISTEN) &&
1614				    (tp->t_tfo_pending == NULL))
1615					tp->t_tfo_pending =
1616					    tcp_fastopen_alloc_counter();
1617			} else
1618				tp->t_flags &= ~TF_FASTOPEN;
1619			goto unlock_and_done;
1620#endif
1621
1622		default:
1623			INP_WUNLOCK(inp);
1624			error = ENOPROTOOPT;
1625			break;
1626		}
1627		break;
1628
1629	case SOPT_GET:
1630		tp = intotcpcb(inp);
1631		switch (sopt->sopt_name) {
1632#ifdef TCP_SIGNATURE
1633		case TCP_MD5SIG:
1634			optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0;
1635			INP_WUNLOCK(inp);
1636			error = sooptcopyout(sopt, &optval, sizeof optval);
1637			break;
1638#endif
1639
1640		case TCP_NODELAY:
1641			optval = tp->t_flags & TF_NODELAY;
1642			INP_WUNLOCK(inp);
1643			error = sooptcopyout(sopt, &optval, sizeof optval);
1644			break;
1645		case TCP_MAXSEG:
1646			optval = tp->t_maxseg;
1647			INP_WUNLOCK(inp);
1648			error = sooptcopyout(sopt, &optval, sizeof optval);
1649			break;
1650		case TCP_NOOPT:
1651			optval = tp->t_flags & TF_NOOPT;
1652			INP_WUNLOCK(inp);
1653			error = sooptcopyout(sopt, &optval, sizeof optval);
1654			break;
1655		case TCP_NOPUSH:
1656			optval = tp->t_flags & TF_NOPUSH;
1657			INP_WUNLOCK(inp);
1658			error = sooptcopyout(sopt, &optval, sizeof optval);
1659			break;
1660		case TCP_INFO:
1661			tcp_fill_info(tp, &ti);
1662			INP_WUNLOCK(inp);
1663			error = sooptcopyout(sopt, &ti, sizeof ti);
1664			break;
1665		case TCP_CONGESTION:
1666			bzero(buf, sizeof(buf));
1667			strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX);
1668			INP_WUNLOCK(inp);
1669			error = sooptcopyout(sopt, buf, TCP_CA_NAME_MAX);
1670			break;
1671		case TCP_KEEPIDLE:
1672		case TCP_KEEPINTVL:
1673		case TCP_KEEPINIT:
1674		case TCP_KEEPCNT:
1675			switch (sopt->sopt_name) {
1676			case TCP_KEEPIDLE:
1677				ui = tp->t_keepidle / hz;
1678				break;
1679			case TCP_KEEPINTVL:
1680				ui = tp->t_keepintvl / hz;
1681				break;
1682			case TCP_KEEPINIT:
1683				ui = tp->t_keepinit / hz;
1684				break;
1685			case TCP_KEEPCNT:
1686				ui = tp->t_keepcnt;
1687				break;
1688			}
1689			INP_WUNLOCK(inp);
1690			error = sooptcopyout(sopt, &ui, sizeof(ui));
1691			break;
1692#ifdef TCP_RFC7413
1693		case TCP_FASTOPEN:
1694			optval = tp->t_flags & TF_FASTOPEN;
1695			INP_WUNLOCK(inp);
1696			error = sooptcopyout(sopt, &optval, sizeof optval);
1697			break;
1698#endif
1699		default:
1700			INP_WUNLOCK(inp);
1701			error = ENOPROTOOPT;
1702			break;
1703		}
1704		break;
1705	}
1706	return (error);
1707}
1708#undef INP_WLOCK_RECHECK
1709
1710/*
1711 * Attach TCP protocol to socket, allocating
1712 * internet protocol control block, tcp control block,
1713 * bufer space, and entering LISTEN state if to accept connections.
1714 */
1715static int
1716tcp_attach(struct socket *so)
1717{
1718	struct tcpcb *tp;
1719	struct inpcb *inp;
1720	int error;
1721
1722	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
1723		error = soreserve(so, V_tcp_sendspace, V_tcp_recvspace);
1724		if (error)
1725			return (error);
1726	}
1727	so->so_rcv.sb_flags |= SB_AUTOSIZE;
1728	so->so_snd.sb_flags |= SB_AUTOSIZE;
1729	INP_INFO_RLOCK(&V_tcbinfo);
1730	error = in_pcballoc(so, &V_tcbinfo);
1731	if (error) {
1732		INP_INFO_RUNLOCK(&V_tcbinfo);
1733		return (error);
1734	}
1735	inp = sotoinpcb(so);
1736#ifdef INET6
1737	if (inp->inp_vflag & INP_IPV6PROTO) {
1738		inp->inp_vflag |= INP_IPV6;
1739		inp->in6p_hops = -1;	/* use kernel default */
1740	}
1741	else
1742#endif
1743	inp->inp_vflag |= INP_IPV4;
1744	tp = tcp_newtcpcb(inp);
1745	if (tp == NULL) {
1746		in_pcbdetach(inp);
1747		in_pcbfree(inp);
1748		INP_INFO_RUNLOCK(&V_tcbinfo);
1749		return (ENOBUFS);
1750	}
1751	tp->t_state = TCPS_CLOSED;
1752	INP_WUNLOCK(inp);
1753	INP_INFO_RUNLOCK(&V_tcbinfo);
1754	return (0);
1755}
1756
1757/*
1758 * Initiate (or continue) disconnect.
1759 * If embryonic state, just send reset (once).
1760 * If in ``let data drain'' option and linger null, just drop.
1761 * Otherwise (hard), mark socket disconnecting and drop
1762 * current input data; switch states based on user close, and
1763 * send segment to peer (with FIN).
1764 */
1765static void
1766tcp_disconnect(struct tcpcb *tp)
1767{
1768	struct inpcb *inp = tp->t_inpcb;
1769	struct socket *so = inp->inp_socket;
1770
1771	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1772	INP_WLOCK_ASSERT(inp);
1773
1774	/*
1775	 * Neither tcp_close() nor tcp_drop() should return NULL, as the
1776	 * socket is still open.
1777	 */
1778	if (tp->t_state < TCPS_ESTABLISHED) {
1779		tp = tcp_close(tp);
1780		KASSERT(tp != NULL,
1781		    ("tcp_disconnect: tcp_close() returned NULL"));
1782	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
1783		tp = tcp_drop(tp, 0);
1784		KASSERT(tp != NULL,
1785		    ("tcp_disconnect: tcp_drop() returned NULL"));
1786	} else {
1787		soisdisconnecting(so);
1788		sbflush(&so->so_rcv);
1789		tcp_usrclosed(tp);
1790		if (!(inp->inp_flags & INP_DROPPED))
1791			tcp_output(tp);
1792	}
1793}
1794
1795/*
1796 * User issued close, and wish to trail through shutdown states:
1797 * if never received SYN, just forget it.  If got a SYN from peer,
1798 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
1799 * If already got a FIN from peer, then almost done; go to LAST_ACK
1800 * state.  In all other cases, have already sent FIN to peer (e.g.
1801 * after PRU_SHUTDOWN), and just have to play tedious game waiting
1802 * for peer to send FIN or not respond to keep-alives, etc.
1803 * We can let the user exit from the close as soon as the FIN is acked.
1804 */
1805static void
1806tcp_usrclosed(struct tcpcb *tp)
1807{
1808
1809	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1810	INP_WLOCK_ASSERT(tp->t_inpcb);
1811
1812	switch (tp->t_state) {
1813	case TCPS_LISTEN:
1814#ifdef TCP_OFFLOAD
1815		tcp_offload_listen_stop(tp);
1816#endif
1817		tcp_state_change(tp, TCPS_CLOSED);
1818		/* FALLTHROUGH */
1819	case TCPS_CLOSED:
1820		tp = tcp_close(tp);
1821		/*
1822		 * tcp_close() should never return NULL here as the socket is
1823		 * still open.
1824		 */
1825		KASSERT(tp != NULL,
1826		    ("tcp_usrclosed: tcp_close() returned NULL"));
1827		break;
1828
1829	case TCPS_SYN_SENT:
1830	case TCPS_SYN_RECEIVED:
1831		tp->t_flags |= TF_NEEDFIN;
1832		break;
1833
1834	case TCPS_ESTABLISHED:
1835		tcp_state_change(tp, TCPS_FIN_WAIT_1);
1836		break;
1837
1838	case TCPS_CLOSE_WAIT:
1839		tcp_state_change(tp, TCPS_LAST_ACK);
1840		break;
1841	}
1842	if (tp->t_state >= TCPS_FIN_WAIT_2) {
1843		soisdisconnected(tp->t_inpcb->inp_socket);
1844		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
1845		if (tp->t_state == TCPS_FIN_WAIT_2) {
1846			int timeout;
1847
1848			timeout = (tcp_fast_finwait2_recycle) ?
1849			    tcp_finwait2_timeout : TP_MAXIDLE(tp);
1850			tcp_timer_activate(tp, TT_2MSL, timeout);
1851		}
1852	}
1853}
1854
1855#ifdef DDB
/*
 * Emit 'indent' spaces on the debugger console; nothing is printed for a
 * zero or negative count.
 */
static void
db_print_indent(int indent)
{

	for (; indent > 0; indent--)
		db_printf(" ");
}
1864
1865static void
1866db_print_tstate(int t_state)
1867{
1868
1869	switch (t_state) {
1870	case TCPS_CLOSED:
1871		db_printf("TCPS_CLOSED");
1872		return;
1873
1874	case TCPS_LISTEN:
1875		db_printf("TCPS_LISTEN");
1876		return;
1877
1878	case TCPS_SYN_SENT:
1879		db_printf("TCPS_SYN_SENT");
1880		return;
1881
1882	case TCPS_SYN_RECEIVED:
1883		db_printf("TCPS_SYN_RECEIVED");
1884		return;
1885
1886	case TCPS_ESTABLISHED:
1887		db_printf("TCPS_ESTABLISHED");
1888		return;
1889
1890	case TCPS_CLOSE_WAIT:
1891		db_printf("TCPS_CLOSE_WAIT");
1892		return;
1893
1894	case TCPS_FIN_WAIT_1:
1895		db_printf("TCPS_FIN_WAIT_1");
1896		return;
1897
1898	case TCPS_CLOSING:
1899		db_printf("TCPS_CLOSING");
1900		return;
1901
1902	case TCPS_LAST_ACK:
1903		db_printf("TCPS_LAST_ACK");
1904		return;
1905
1906	case TCPS_FIN_WAIT_2:
1907		db_printf("TCPS_FIN_WAIT_2");
1908		return;
1909
1910	case TCPS_TIME_WAIT:
1911		db_printf("TCPS_TIME_WAIT");
1912		return;
1913
1914	default:
1915		db_printf("unknown");
1916		return;
1917	}
1918}
1919
1920static void
1921db_print_tflags(u_int t_flags)
1922{
1923	int comma;
1924
1925	comma = 0;
1926	if (t_flags & TF_ACKNOW) {
1927		db_printf("%sTF_ACKNOW", comma ? ", " : "");
1928		comma = 1;
1929	}
1930	if (t_flags & TF_DELACK) {
1931		db_printf("%sTF_DELACK", comma ? ", " : "");
1932		comma = 1;
1933	}
1934	if (t_flags & TF_NODELAY) {
1935		db_printf("%sTF_NODELAY", comma ? ", " : "");
1936		comma = 1;
1937	}
1938	if (t_flags & TF_NOOPT) {
1939		db_printf("%sTF_NOOPT", comma ? ", " : "");
1940		comma = 1;
1941	}
1942	if (t_flags & TF_SENTFIN) {
1943		db_printf("%sTF_SENTFIN", comma ? ", " : "");
1944		comma = 1;
1945	}
1946	if (t_flags & TF_REQ_SCALE) {
1947		db_printf("%sTF_REQ_SCALE", comma ? ", " : "");
1948		comma = 1;
1949	}
1950	if (t_flags & TF_RCVD_SCALE) {
1951		db_printf("%sTF_RECVD_SCALE", comma ? ", " : "");
1952		comma = 1;
1953	}
1954	if (t_flags & TF_REQ_TSTMP) {
1955		db_printf("%sTF_REQ_TSTMP", comma ? ", " : "");
1956		comma = 1;
1957	}
1958	if (t_flags & TF_RCVD_TSTMP) {
1959		db_printf("%sTF_RCVD_TSTMP", comma ? ", " : "");
1960		comma = 1;
1961	}
1962	if (t_flags & TF_SACK_PERMIT) {
1963		db_printf("%sTF_SACK_PERMIT", comma ? ", " : "");
1964		comma = 1;
1965	}
1966	if (t_flags & TF_NEEDSYN) {
1967		db_printf("%sTF_NEEDSYN", comma ? ", " : "");
1968		comma = 1;
1969	}
1970	if (t_flags & TF_NEEDFIN) {
1971		db_printf("%sTF_NEEDFIN", comma ? ", " : "");
1972		comma = 1;
1973	}
1974	if (t_flags & TF_NOPUSH) {
1975		db_printf("%sTF_NOPUSH", comma ? ", " : "");
1976		comma = 1;
1977	}
1978	if (t_flags & TF_MORETOCOME) {
1979		db_printf("%sTF_MORETOCOME", comma ? ", " : "");
1980		comma = 1;
1981	}
1982	if (t_flags & TF_LQ_OVERFLOW) {
1983		db_printf("%sTF_LQ_OVERFLOW", comma ? ", " : "");
1984		comma = 1;
1985	}
1986	if (t_flags & TF_LASTIDLE) {
1987		db_printf("%sTF_LASTIDLE", comma ? ", " : "");
1988		comma = 1;
1989	}
1990	if (t_flags & TF_RXWIN0SENT) {
1991		db_printf("%sTF_RXWIN0SENT", comma ? ", " : "");
1992		comma = 1;
1993	}
1994	if (t_flags & TF_FASTRECOVERY) {
1995		db_printf("%sTF_FASTRECOVERY", comma ? ", " : "");
1996		comma = 1;
1997	}
1998	if (t_flags & TF_CONGRECOVERY) {
1999		db_printf("%sTF_CONGRECOVERY", comma ? ", " : "");
2000		comma = 1;
2001	}
2002	if (t_flags & TF_WASFRECOVERY) {
2003		db_printf("%sTF_WASFRECOVERY", comma ? ", " : "");
2004		comma = 1;
2005	}
2006	if (t_flags & TF_SIGNATURE) {
2007		db_printf("%sTF_SIGNATURE", comma ? ", " : "");
2008		comma = 1;
2009	}
2010	if (t_flags & TF_FORCEDATA) {
2011		db_printf("%sTF_FORCEDATA", comma ? ", " : "");
2012		comma = 1;
2013	}
2014	if (t_flags & TF_TSO) {
2015		db_printf("%sTF_TSO", comma ? ", " : "");
2016		comma = 1;
2017	}
2018	if (t_flags & TF_ECN_PERMIT) {
2019		db_printf("%sTF_ECN_PERMIT", comma ? ", " : "");
2020		comma = 1;
2021	}
2022	if (t_flags & TF_FASTOPEN) {
2023		db_printf("%sTF_FASTOPEN", comma ? ", " : "");
2024		comma = 1;
2025	}
2026}
2027
2028static void
2029db_print_toobflags(char t_oobflags)
2030{
2031	int comma;
2032
2033	comma = 0;
2034	if (t_oobflags & TCPOOB_HAVEDATA) {
2035		db_printf("%sTCPOOB_HAVEDATA", comma ? ", " : "");
2036		comma = 1;
2037	}
2038	if (t_oobflags & TCPOOB_HADDATA) {
2039		db_printf("%sTCPOOB_HADDATA", comma ? ", " : "");
2040		comma = 1;
2041	}
2042}
2043
2044static void
2045db_print_tcpcb(struct tcpcb *tp, const char *name, int indent)
2046{
2047
2048	db_print_indent(indent);
2049	db_printf("%s at %p\n", name, tp);
2050
2051	indent += 2;
2052
2053	db_print_indent(indent);
2054	db_printf("t_segq first: %p   t_segqlen: %d   t_dupacks: %d\n",
2055	   LIST_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks);
2056
2057	db_print_indent(indent);
2058	db_printf("tt_rexmt: %p   tt_persist: %p   tt_keep: %p\n",
2059	    &tp->t_timers->tt_rexmt, &tp->t_timers->tt_persist, &tp->t_timers->tt_keep);
2060
2061	db_print_indent(indent);
2062	db_printf("tt_2msl: %p   tt_delack: %p   t_inpcb: %p\n", &tp->t_timers->tt_2msl,
2063	    &tp->t_timers->tt_delack, tp->t_inpcb);
2064
2065	db_print_indent(indent);
2066	db_printf("t_state: %d (", tp->t_state);
2067	db_print_tstate(tp->t_state);
2068	db_printf(")\n");
2069
2070	db_print_indent(indent);
2071	db_printf("t_flags: 0x%x (", tp->t_flags);
2072	db_print_tflags(tp->t_flags);
2073	db_printf(")\n");
2074
2075	db_print_indent(indent);
2076	db_printf("snd_una: 0x%08x   snd_max: 0x%08x   snd_nxt: x0%08x\n",
2077	    tp->snd_una, tp->snd_max, tp->snd_nxt);
2078
2079	db_print_indent(indent);
2080	db_printf("snd_up: 0x%08x   snd_wl1: 0x%08x   snd_wl2: 0x%08x\n",
2081	   tp->snd_up, tp->snd_wl1, tp->snd_wl2);
2082
2083	db_print_indent(indent);
2084	db_printf("iss: 0x%08x   irs: 0x%08x   rcv_nxt: 0x%08x\n",
2085	    tp->iss, tp->irs, tp->rcv_nxt);
2086
2087	db_print_indent(indent);
2088	db_printf("rcv_adv: 0x%08x   rcv_wnd: %lu   rcv_up: 0x%08x\n",
2089	    tp->rcv_adv, tp->rcv_wnd, tp->rcv_up);
2090
2091	db_print_indent(indent);
2092	db_printf("snd_wnd: %lu   snd_cwnd: %lu\n",
2093	   tp->snd_wnd, tp->snd_cwnd);
2094
2095	db_print_indent(indent);
2096	db_printf("snd_ssthresh: %lu   snd_recover: "
2097	    "0x%08x\n", tp->snd_ssthresh, tp->snd_recover);
2098
2099	db_print_indent(indent);
2100	db_printf("t_maxopd: %u   t_rcvtime: %u   t_startime: %u\n",
2101	    tp->t_maxopd, tp->t_rcvtime, tp->t_starttime);
2102
2103	db_print_indent(indent);
2104	db_printf("t_rttime: %u   t_rtsq: 0x%08x\n",
2105	    tp->t_rtttime, tp->t_rtseq);
2106
2107	db_print_indent(indent);
2108	db_printf("t_rxtcur: %d   t_maxseg: %u   t_srtt: %d\n",
2109	    tp->t_rxtcur, tp->t_maxseg, tp->t_srtt);
2110
2111	db_print_indent(indent);
2112	db_printf("t_rttvar: %d   t_rxtshift: %d   t_rttmin: %u   "
2113	    "t_rttbest: %u\n", tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin,
2114	    tp->t_rttbest);
2115
2116	db_print_indent(indent);
2117	db_printf("t_rttupdated: %lu   max_sndwnd: %lu   t_softerror: %d\n",
2118	    tp->t_rttupdated, tp->max_sndwnd, tp->t_softerror);
2119
2120	db_print_indent(indent);
2121	db_printf("t_oobflags: 0x%x (", tp->t_oobflags);
2122	db_print_toobflags(tp->t_oobflags);
2123	db_printf(")   t_iobc: 0x%02x\n", tp->t_iobc);
2124
2125	db_print_indent(indent);
2126	db_printf("snd_scale: %u   rcv_scale: %u   request_r_scale: %u\n",
2127	    tp->snd_scale, tp->rcv_scale, tp->request_r_scale);
2128
2129	db_print_indent(indent);
2130	db_printf("ts_recent: %u   ts_recent_age: %u\n",
2131	    tp->ts_recent, tp->ts_recent_age);
2132
2133	db_print_indent(indent);
2134	db_printf("ts_offset: %u   last_ack_sent: 0x%08x   snd_cwnd_prev: "
2135	    "%lu\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev);
2136
2137	db_print_indent(indent);
2138	db_printf("snd_ssthresh_prev: %lu   snd_recover_prev: 0x%08x   "
2139	    "t_badrxtwin: %u\n", tp->snd_ssthresh_prev,
2140	    tp->snd_recover_prev, tp->t_badrxtwin);
2141
2142	db_print_indent(indent);
2143	db_printf("snd_numholes: %d  snd_holes first: %p\n",
2144	    tp->snd_numholes, TAILQ_FIRST(&tp->snd_holes));
2145
2146	db_print_indent(indent);
2147	db_printf("snd_fack: 0x%08x   rcv_numsacks: %d   sack_newdata: "
2148	    "0x%08x\n", tp->snd_fack, tp->rcv_numsacks, tp->sack_newdata);
2149
2150	/* Skip sackblks, sackhint. */
2151
2152	db_print_indent(indent);
2153	db_printf("t_rttlow: %d   rfbuf_ts: %u   rfbuf_cnt: %d\n",
2154	    tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt);
2155}
2156
2157DB_SHOW_COMMAND(tcpcb, db_show_tcpcb)
2158{
2159	struct tcpcb *tp;
2160
2161	if (!have_addr) {
2162		db_printf("usage: show tcpcb <addr>\n");
2163		return;
2164	}
2165	tp = (struct tcpcb *)addr;
2166
2167	db_print_tcpcb(tp, "tcpcb", 0);
2168}
2169#endif
2170