fastpath.c revision 315514
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
3 *	The Regents of the University of California.  All rights reserved.
4 * Copyright (c) 2007-2008,2010
5 *	Swinburne University of Technology, Melbourne, Australia.
6 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
7 * Copyright (c) 2010 The FreeBSD Foundation
8 * Copyright (c) 2010-2011 Juniper Networks, Inc.
9 * Copyright (c) 2015 Netflix Inc.
10 * All rights reserved.
11 *
12 * Portions of this software were developed at the Centre for Advanced Internet
13 * Architectures, Swinburne University of Technology, by Lawrence Stewart,
14 * James Healy and David Hayes, made possible in part by a grant from the Cisco
15 * University Research Program Fund at Community Foundation Silicon Valley.
16 *
17 * Portions of this software were developed at the Centre for Advanced
18 * Internet Architectures, Swinburne University of Technology, Melbourne,
19 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
20 *
21 * Portions of this software were developed by Robert N. M. Watson under
22 * contract to Juniper Networks, Inc.
23 *
24 * Portions of this software were developed by Randall R. Stewart while
25 * working for Netflix Inc.
26 *
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
29 * are met:
30 * 1. Redistributions of source code must retain the above copyright
31 *    notice, this list of conditions and the following disclaimer.
32 * 2. Redistributions in binary form must reproduce the above copyright
33 *    notice, this list of conditions and the following disclaimer in the
34 *    documentation and/or other materials provided with the distribution.
35 * 4. Neither the name of the University nor the names of its contributors
36 *    may be used to endorse or promote products derived from this software
37 *    without specific prior written permission.
38 *
39 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
40 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
41 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
42 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
43 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
44 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
45 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
46 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
47 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
48 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
49 * SUCH DAMAGE.
50 *
51 *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
52 */
53
54#include <sys/cdefs.h>
55__FBSDID("$FreeBSD: stable/11/sys/netinet/tcp_stacks/fastpath.c 315514 2017-03-18 22:04:20Z ae $");
56
57#include "opt_inet.h"
58#include "opt_inet6.h"
59#include "opt_tcpdebug.h"
60
61#include <sys/param.h>
62#include <sys/module.h>
63#include <sys/kernel.h>
64#include <sys/hhook.h>
65#include <sys/malloc.h>
66#include <sys/mbuf.h>
67#include <sys/proc.h>		/* for proc0 declaration */
68#include <sys/protosw.h>
69#include <sys/sdt.h>
70#include <sys/signalvar.h>
71#include <sys/socket.h>
72#include <sys/socketvar.h>
73#include <sys/sysctl.h>
74#include <sys/syslog.h>
75#include <sys/systm.h>
76
77#include <machine/cpu.h>	/* before tcp_seq.h, for tcp_random18() */
78
79#include <vm/uma.h>
80
81#include <net/route.h>
82#include <net/vnet.h>
83
84#define TCPSTATES		/* for logging */
85
86#include <netinet/in.h>
87#include <netinet/in_kdtrace.h>
88#include <netinet/in_pcb.h>
89#include <netinet/in_systm.h>
90#include <netinet/ip.h>
91#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
92#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
93#include <netinet/ip_var.h>
94#include <netinet/ip_options.h>
95#include <netinet/ip6.h>
96#include <netinet/icmp6.h>
97#include <netinet6/in6_pcb.h>
98#include <netinet6/ip6_var.h>
99#include <netinet/tcp.h>
100#include <netinet/tcp_fsm.h>
101#include <netinet/tcp_seq.h>
102#include <netinet/tcp_timer.h>
103#include <netinet/tcp_var.h>
104#include <netinet6/tcp6_var.h>
105#include <netinet/tcpip.h>
106#include <netinet/tcp_syncache.h>
107#include <netinet/cc/cc.h>
108#ifdef TCPDEBUG
109#include <netinet/tcp_debug.h>
110#endif /* TCPDEBUG */
111#ifdef TCP_OFFLOAD
112#include <netinet/tcp_offload.h>
113#endif
114
115#include <machine/in_cksum.h>
116
117#include <security/mac/mac_framework.h>
118
119VNET_DECLARE(int, tcp_autorcvbuf_inc);
120#define	V_tcp_autorcvbuf_inc	VNET(tcp_autorcvbuf_inc)
121VNET_DECLARE(int, tcp_autorcvbuf_max);
122#define	V_tcp_autorcvbuf_max	VNET(tcp_autorcvbuf_max)
123VNET_DECLARE(int, tcp_do_rfc3042);
124#define	V_tcp_do_rfc3042	VNET(tcp_do_rfc3042)
125VNET_DECLARE(int, tcp_do_autorcvbuf);
126#define	V_tcp_do_autorcvbuf	VNET(tcp_do_autorcvbuf)
127VNET_DECLARE(int, tcp_insecure_rst);
128#define	V_tcp_insecure_rst	VNET(tcp_insecure_rst)
129VNET_DECLARE(int, tcp_insecure_syn);
130#define	V_tcp_insecure_syn	VNET(tcp_insecure_syn)
131
132static void	 tcp_do_segment_fastslow(struct mbuf *, struct tcphdr *,
133			struct socket *, struct tcpcb *, int, int, uint8_t,
134			int);
135
136static void	 tcp_do_segment_fastack(struct mbuf *, struct tcphdr *,
137			struct socket *, struct tcpcb *, int, int, uint8_t,
138			int);
139
140/*
141 * Indicate whether this ack should be delayed.  We can delay the ack if
142 * following conditions are met:
143 *	- There is no delayed ack timer in progress.
144 *	- Our last ack wasn't a 0-sized window. We never want to delay
145 *	  the ack that opens up a 0-sized window.
146 *	- LRO wasn't used for this segment. We make sure by checking that the
147 *	  segment size is not larger than the MSS.
148 */
149#define DELAY_ACK(tp, tlen)						\
150	((!tcp_timer_active(tp, TT_DELACK) &&				\
151	    (tp->t_flags & TF_RXWIN0SENT) == 0) &&			\
152	    (tlen <= tp->t_maxseg) &&					\
153	    (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
154
155/*
156 * So how is this faster than the normal fast ack?
157 * It basically allows us to also stay in the fastpath
158 * when a window-update ack also arrives. In testing
159 * we saw only 25-30% of connections doing fastpath
160 * due to the fact that along with moving forward
161 * in sequence the window was also updated.
162 */
163static void
164tcp_do_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
165	       struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
166	       int ti_locked, u_long tiwin)
167{
168	int acked;
169	int winup_only=0;
170#ifdef TCPDEBUG
171	/*
172	 * The size of tcp_saveipgen must be the size of the max ip header,
173	 * now IPv6.
174	 */
175	u_char tcp_saveipgen[IP6_HDR_LEN];
176	struct tcphdr tcp_savetcp;
177	short ostate = 0;
178#endif
179        /*
180	 * The following if statement will be true if
181	 * we are doing the win_up_in_fp <and>
182	 * - We have more new data (SEQ_LT(tp->snd_wl1, th->th_seq)) <or>
183	 * - No more new data, but we have an ack for new data
184	 *   (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack))
185	 * - No more new data, the same ack point but the window grew
186	 *   (tp->snd_wl1 == th->th_seq && tp->snd_wl2 == th->th_ack && twin > tp->snd_wnd)
187	 */
188	if ((SEQ_LT(tp->snd_wl1, th->th_seq) ||
189	     (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
190					    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
191		/* keep track of pure window updates */
192		if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
193			winup_only = 1;
194			TCPSTAT_INC(tcps_rcvwinupd);
195		}
196		tp->snd_wnd = tiwin;
197		tp->snd_wl1 = th->th_seq;
198		tp->snd_wl2 = th->th_ack;
199		if (tp->snd_wnd > tp->max_sndwnd)
200			tp->max_sndwnd = tp->snd_wnd;
201	}
202	/*
203	 * If last ACK falls within this segment's sequence numbers,
204	 * record the timestamp.
205	 * NOTE that the test is modified according to the latest
206	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
207	 */
208	if ((to->to_flags & TOF_TS) != 0 &&
209	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
210		tp->ts_recent_age = tcp_ts_getticks();
211		tp->ts_recent = to->to_tsval;
212	}
213	/*
214	 * This is a pure ack for outstanding data.
215	 */
216	if (ti_locked == TI_RLOCKED) {
217		INP_INFO_RUNLOCK(&V_tcbinfo);
218	}
219	ti_locked = TI_UNLOCKED;
220
221	TCPSTAT_INC(tcps_predack);
222
223	/*
224	 * "bad retransmit" recovery.
225	 */
226	if (tp->t_rxtshift == 1 &&
227	    tp->t_flags & TF_PREVVALID &&
228	    (int)(ticks - tp->t_badrxtwin) < 0) {
229		cc_cong_signal(tp, th, CC_RTO_ERR);
230	}
231
232	/*
233	 * Recalculate the transmit timer / rtt.
234	 *
235	 * Some boxes send broken timestamp replies
236	 * during the SYN+ACK phase, ignore
237	 * timestamps of 0 or we could calculate a
238	 * huge RTT and blow up the retransmit timer.
239	 */
240	if ((to->to_flags & TOF_TS) != 0 &&
241	    to->to_tsecr) {
242		u_int t;
243
244		t = tcp_ts_getticks() - to->to_tsecr;
245		if (!tp->t_rttlow || tp->t_rttlow > t)
246			tp->t_rttlow = t;
247		tcp_xmit_timer(tp,
248			       TCP_TS_TO_TICKS(t) + 1);
249	} else if (tp->t_rtttime &&
250		   SEQ_GT(th->th_ack, tp->t_rtseq)) {
251		if (!tp->t_rttlow ||
252		    tp->t_rttlow > ticks - tp->t_rtttime)
253			tp->t_rttlow = ticks - tp->t_rtttime;
254		tcp_xmit_timer(tp,
255			       ticks - tp->t_rtttime);
256	}
257	if (winup_only == 0) {
258		acked = BYTES_THIS_ACK(tp, th);
259
260		/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
261		hhook_run_tcp_est_in(tp, th, to);
262
263		TCPSTAT_ADD(tcps_rcvackbyte, acked);
264		sbdrop(&so->so_snd, acked);
265		if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
266		    SEQ_LEQ(th->th_ack, tp->snd_recover))
267			tp->snd_recover = th->th_ack - 1;
268
269		/*
270		 * Let the congestion control algorithm update
271		 * congestion control related information. This
272		 * typically means increasing the congestion
273		 * window.
274		 */
275		cc_ack_received(tp, th, CC_ACK);
276
277		tp->snd_una = th->th_ack;
278		/*
279		 * Pull snd_wl2 up to prevent seq wrap relative
280		 * to th_ack.
281		 */
282		tp->snd_wl2 = th->th_ack;
283		tp->t_dupacks = 0;
284
285		/*
286		 * If all outstanding data are acked, stop
287		 * retransmit timer, otherwise restart timer
288		 * using current (possibly backed-off) value.
289		 * If process is waiting for space,
290		 * wakeup/selwakeup/signal.  If data
291		 * are ready to send, let tcp_output
292		 * decide between more output or persist.
293		 */
294#ifdef TCPDEBUG
295		if (so->so_options & SO_DEBUG)
296			tcp_trace(TA_INPUT, ostate, tp,
297				  (void *)tcp_saveipgen,
298				  &tcp_savetcp, 0);
299#endif
300		TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
301		m_freem(m);
302		if (tp->snd_una == tp->snd_max)
303			tcp_timer_activate(tp, TT_REXMT, 0);
304		else if (!tcp_timer_active(tp, TT_PERSIST))
305			tcp_timer_activate(tp, TT_REXMT,
306					   tp->t_rxtcur);
307	} else {
308		/*
309		 * Window update only, just free the mbufs and
310		 * send out whatever we can.
311		 */
312		m_freem(m);
313	}
314	sowwakeup(so);
315	if (sbavail(&so->so_snd))
316		(void) tcp_output(tp);
317	KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
318					    __func__, ti_locked));
319	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
320	INP_WLOCK_ASSERT(tp->t_inpcb);
321
322	if (tp->t_flags & TF_DELACK) {
323		tp->t_flags &= ~TF_DELACK;
324		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
325	}
326	INP_WUNLOCK(tp->t_inpcb);
327}
328
329/*
330 * Here nothing is really faster, its just that we
331 * have broken out the fast-data path also just like
332 * the fast-ack.
333 */
334static void
335tcp_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
336		   struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
337		   int ti_locked, u_long tiwin)
338{
339	int newsize = 0;	/* automatic sockbuf scaling */
340#ifdef TCPDEBUG
341	/*
342	 * The size of tcp_saveipgen must be the size of the max ip header,
343	 * now IPv6.
344	 */
345	u_char tcp_saveipgen[IP6_HDR_LEN];
346	struct tcphdr tcp_savetcp;
347	short ostate = 0;
348#endif
349	/*
350	 * If last ACK falls within this segment's sequence numbers,
351	 * record the timestamp.
352	 * NOTE that the test is modified according to the latest
353	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
354	 */
355	if ((to->to_flags & TOF_TS) != 0 &&
356	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
357		tp->ts_recent_age = tcp_ts_getticks();
358		tp->ts_recent = to->to_tsval;
359	}
360
361	/*
362	 * This is a pure, in-sequence data packet with
363	 * nothing on the reassembly queue and we have enough
364	 * buffer space to take it.
365	 */
366	if (ti_locked == TI_RLOCKED) {
367		INP_INFO_RUNLOCK(&V_tcbinfo);
368	}
369	ti_locked = TI_UNLOCKED;
370
371	/* Clean receiver SACK report if present */
372	if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
373		tcp_clean_sackreport(tp);
374	TCPSTAT_INC(tcps_preddat);
375	tp->rcv_nxt += tlen;
376	/*
377	 * Pull snd_wl1 up to prevent seq wrap relative to
378	 * th_seq.
379	 */
380	tp->snd_wl1 = th->th_seq;
381	/*
382	 * Pull rcv_up up to prevent seq wrap relative to
383	 * rcv_nxt.
384	 */
385	tp->rcv_up = tp->rcv_nxt;
386	TCPSTAT_ADD(tcps_rcvbyte, tlen);
387#ifdef TCPDEBUG
388	if (so->so_options & SO_DEBUG)
389		tcp_trace(TA_INPUT, ostate, tp,
390			  (void *)tcp_saveipgen, &tcp_savetcp, 0);
391#endif
392	TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
393	/*
394	 * Automatic sizing of receive socket buffer.  Often the send
395	 * buffer size is not optimally adjusted to the actual network
396	 * conditions at hand (delay bandwidth product).  Setting the
397	 * buffer size too small limits throughput on links with high
398	 * bandwidth and high delay (eg. trans-continental/oceanic links).
399	 *
400	 * On the receive side the socket buffer memory is only rarely
401	 * used to any significant extent.  This allows us to be much
402	 * more aggressive in scaling the receive socket buffer.  For
403	 * the case that the buffer space is actually used to a large
404	 * extent and we run out of kernel memory we can simply drop
405	 * the new segments; TCP on the sender will just retransmit it
406	 * later.  Setting the buffer size too big may only consume too
407	 * much kernel memory if the application doesn't read() from
408	 * the socket or packet loss or reordering makes use of the
409	 * reassembly queue.
410	 *
411	 * The criteria to step up the receive buffer one notch are:
412	 *  1. Application has not set receive buffer size with
413	 *     SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
414	 *  2. the number of bytes received during the time it takes
415	 *     one timestamp to be reflected back to us (the RTT);
416	 *  3. received bytes per RTT is within seven eighth of the
417	 *     current socket buffer size;
418	 *  4. receive buffer size has not hit maximal automatic size;
419	 *
420	 * This algorithm does one step per RTT at most and only if
421	 * we receive a bulk stream w/o packet losses or reorderings.
422	 * Shrinking the buffer during idle times is not necessary as
423	 * it doesn't consume any memory when idle.
424	 *
425	 * TODO: Only step up if the application is actually serving
426	 * the buffer to better manage the socket buffer resources.
427	 */
428	if (V_tcp_do_autorcvbuf &&
429	    (to->to_flags & TOF_TS) &&
430	    to->to_tsecr &&
431	    (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
432		if (TSTMP_GT(to->to_tsecr, tp->rfbuf_ts) &&
433		    to->to_tsecr - tp->rfbuf_ts < hz) {
434			if (tp->rfbuf_cnt >
435			    (so->so_rcv.sb_hiwat / 8 * 7) &&
436			    so->so_rcv.sb_hiwat <
437			    V_tcp_autorcvbuf_max) {
438				newsize =
439					min(so->so_rcv.sb_hiwat +
440					    V_tcp_autorcvbuf_inc,
441					    V_tcp_autorcvbuf_max);
442			}
443			/* Start over with next RTT. */
444			tp->rfbuf_ts = 0;
445			tp->rfbuf_cnt = 0;
446		} else
447			tp->rfbuf_cnt += tlen;	/* add up */
448	}
449
450	/* Add data to socket buffer. */
451	SOCKBUF_LOCK(&so->so_rcv);
452	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
453		m_freem(m);
454	} else {
455		/*
456		 * Set new socket buffer size.
457		 * Give up when limit is reached.
458		 */
459		if (newsize)
460			if (!sbreserve_locked(&so->so_rcv,
461					      newsize, so, NULL))
462				so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
463		m_adj(m, drop_hdrlen);	/* delayed header drop */
464		sbappendstream_locked(&so->so_rcv, m, 0);
465	}
466	/* NB: sorwakeup_locked() does an implicit unlock. */
467	sorwakeup_locked(so);
468	if (DELAY_ACK(tp, tlen)) {
469		tp->t_flags |= TF_DELACK;
470	} else {
471		tp->t_flags |= TF_ACKNOW;
472		tcp_output(tp);
473	}
474	KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
475					    __func__, ti_locked));
476	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
477	INP_WLOCK_ASSERT(tp->t_inpcb);
478
479	if (tp->t_flags & TF_DELACK) {
480		tp->t_flags &= ~TF_DELACK;
481		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
482	}
483	INP_WUNLOCK(tp->t_inpcb);
484}
485
486/*
487 * The slow-path is the clone of the long long part
488 * of tcp_do_segment past all the fast-path stuff. We
489 * use it here by two different callers, the fast/slow and
490 * the fastack only.
491 */
492static void
493tcp_do_slowpath(struct mbuf *m, struct tcphdr *th, struct socket *so,
494		struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
495		int ti_locked, u_long tiwin, int thflags)
496{
497	int  acked, ourfinisacked, needoutput = 0;
498	int rstreason, todrop, win;
499	char *s;
500	struct in_conninfo *inc;
501	struct mbuf *mfree = NULL;
502#ifdef TCPDEBUG
503	/*
504	 * The size of tcp_saveipgen must be the size of the max ip header,
505	 * now IPv6.
506	 */
507	u_char tcp_saveipgen[IP6_HDR_LEN];
508	struct tcphdr tcp_savetcp;
509	short ostate = 0;
510#endif
511	/*
512	 * Calculate amount of space in receive window,
513	 * and then do TCP input processing.
514	 * Receive window is amount of space in rcv queue,
515	 * but not less than advertised window.
516	 */
517	inc = &tp->t_inpcb->inp_inc;
518	win = sbspace(&so->so_rcv);
519	if (win < 0)
520		win = 0;
521	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
522
523	/* Reset receive buffer auto scaling when not in bulk receive mode. */
524	tp->rfbuf_ts = 0;
525	tp->rfbuf_cnt = 0;
526
527	switch (tp->t_state) {
528
529	/*
530	 * If the state is SYN_RECEIVED:
531	 *	if seg contains an ACK, but not for our SYN/ACK, send a RST.
532	 */
533	case TCPS_SYN_RECEIVED:
534		if ((thflags & TH_ACK) &&
535		    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
536		     SEQ_GT(th->th_ack, tp->snd_max))) {
537				rstreason = BANDLIM_RST_OPENPORT;
538				goto dropwithreset;
539		}
540		break;
541
542	/*
543	 * If the state is SYN_SENT:
544	 *	if seg contains an ACK, but not for our SYN, drop the input.
545	 *	if seg contains a RST, then drop the connection.
546	 *	if seg does not contain SYN, then drop it.
547	 * Otherwise this is an acceptable SYN segment
548	 *	initialize tp->rcv_nxt and tp->irs
549	 *	if seg contains ack then advance tp->snd_una
550	 *	if seg contains an ECE and ECN support is enabled, the stream
551	 *	    is ECN capable.
552	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
553	 *	arrange for segment to be acked (eventually)
554	 *	continue processing rest of data/controls, beginning with URG
555	 */
556	case TCPS_SYN_SENT:
557		if ((thflags & TH_ACK) &&
558		    (SEQ_LEQ(th->th_ack, tp->iss) ||
559		     SEQ_GT(th->th_ack, tp->snd_max))) {
560			rstreason = BANDLIM_UNLIMITED;
561			goto dropwithreset;
562		}
563		if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) {
564			TCP_PROBE5(connect__refused, NULL, tp,
565			    mtod(m, const char *), tp, th);
566			tp = tcp_drop(tp, ECONNREFUSED);
567		}
568		if (thflags & TH_RST)
569			goto drop;
570		if (!(thflags & TH_SYN))
571			goto drop;
572
573		tp->irs = th->th_seq;
574		tcp_rcvseqinit(tp);
575		if (thflags & TH_ACK) {
576			TCPSTAT_INC(tcps_connects);
577			soisconnected(so);
578#ifdef MAC
579			mac_socketpeer_set_from_mbuf(m, so);
580#endif
581			/* Do window scaling on this connection? */
582			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
583				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
584				tp->rcv_scale = tp->request_r_scale;
585			}
586			tp->rcv_adv += imin(tp->rcv_wnd,
587			    TCP_MAXWIN << tp->rcv_scale);
588			tp->snd_una++;		/* SYN is acked */
589			/*
590			 * If there's data, delay ACK; if there's also a FIN
591			 * ACKNOW will be turned on later.
592			 */
593			if (DELAY_ACK(tp, tlen) && tlen != 0)
594				tcp_timer_activate(tp, TT_DELACK,
595				    tcp_delacktime);
596			else
597				tp->t_flags |= TF_ACKNOW;
598
599			if ((thflags & TH_ECE) && V_tcp_do_ecn) {
600				tp->t_flags |= TF_ECN_PERMIT;
601				TCPSTAT_INC(tcps_ecn_shs);
602			}
603
604			/*
605			 * Received <SYN,ACK> in SYN_SENT[*] state.
606			 * Transitions:
607			 *	SYN_SENT  --> ESTABLISHED
608			 *	SYN_SENT* --> FIN_WAIT_1
609			 */
610			tp->t_starttime = ticks;
611			if (tp->t_flags & TF_NEEDFIN) {
612				tcp_state_change(tp, TCPS_FIN_WAIT_1);
613				tp->t_flags &= ~TF_NEEDFIN;
614				thflags &= ~TH_SYN;
615			} else {
616				tcp_state_change(tp, TCPS_ESTABLISHED);
617				TCP_PROBE5(connect__established, NULL, tp,
618				    mtod(m, const char *), tp, th);
619				cc_conn_init(tp);
620				tcp_timer_activate(tp, TT_KEEP,
621				    TP_KEEPIDLE(tp));
622			}
623		} else {
624			/*
625			 * Received initial SYN in SYN-SENT[*] state =>
626			 * simultaneous open.
627			 * If it succeeds, connection is * half-synchronized.
628			 * Otherwise, do 3-way handshake:
629			 *        SYN-SENT -> SYN-RECEIVED
630			 *        SYN-SENT* -> SYN-RECEIVED*
631			 */
632			tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
633			tcp_timer_activate(tp, TT_REXMT, 0);
634			tcp_state_change(tp, TCPS_SYN_RECEIVED);
635		}
636
637		KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: "
638		    "ti_locked %d", __func__, ti_locked));
639		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
640		INP_WLOCK_ASSERT(tp->t_inpcb);
641
642		/*
643		 * Advance th->th_seq to correspond to first data byte.
644		 * If data, trim to stay within window,
645		 * dropping FIN if necessary.
646		 */
647		th->th_seq++;
648		if (tlen > tp->rcv_wnd) {
649			todrop = tlen - tp->rcv_wnd;
650			m_adj(m, -todrop);
651			tlen = tp->rcv_wnd;
652			thflags &= ~TH_FIN;
653			TCPSTAT_INC(tcps_rcvpackafterwin);
654			TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
655		}
656		tp->snd_wl1 = th->th_seq - 1;
657		tp->rcv_up = th->th_seq;
658		/*
659		 * Client side of transaction: already sent SYN and data.
660		 * If the remote host used T/TCP to validate the SYN,
661		 * our data will be ACK'd; if so, enter normal data segment
662		 * processing in the middle of step 5, ack processing.
663		 * Otherwise, goto step 6.
664		 */
665		if (thflags & TH_ACK)
666			goto process_ACK;
667
668		goto step6;
669
670	/*
671	 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
672	 *      do normal processing.
673	 *
674	 * NB: Leftover from RFC1644 T/TCP.  Cases to be reused later.
675	 */
676	case TCPS_LAST_ACK:
677	case TCPS_CLOSING:
678		break;  /* continue normal processing */
679	}
680
681	/*
682	 * States other than LISTEN or SYN_SENT.
683	 * First check the RST flag and sequence number since reset segments
684	 * are exempt from the timestamp and connection count tests.  This
685	 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
686	 * below which allowed reset segments in half the sequence space
687	 * to fall though and be processed (which gives forged reset
688	 * segments with a random sequence number a 50 percent chance of
689	 * killing a connection).
690	 * Then check timestamp, if present.
691	 * Then check the connection count, if present.
692	 * Then check that at least some bytes of segment are within
693	 * receive window.  If segment begins before rcv_nxt,
694	 * drop leading data (and SYN); if nothing left, just ack.
695	 */
696	if (thflags & TH_RST) {
697		/*
698		 * RFC5961 Section 3.2
699		 *
700		 * - RST drops connection only if SEG.SEQ == RCV.NXT.
701		 * - If RST is in window, we send challenge ACK.
702		 *
703		 * Note: to take into account delayed ACKs, we should
704		 *   test against last_ack_sent instead of rcv_nxt.
705		 * Note 2: we handle special case of closed window, not
706		 *   covered by the RFC.
707		 */
708		if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
709		    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
710		    (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
711			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
712			KASSERT(ti_locked == TI_RLOCKED,
713			    ("%s: TH_RST ti_locked %d, th %p tp %p",
714			    __func__, ti_locked, th, tp));
715			KASSERT(tp->t_state != TCPS_SYN_SENT,
716			    ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
717			    __func__, th, tp));
718
719			if (V_tcp_insecure_rst ||
720			    tp->last_ack_sent == th->th_seq) {
721				TCPSTAT_INC(tcps_drops);
722				/* Drop the connection. */
723				switch (tp->t_state) {
724				case TCPS_SYN_RECEIVED:
725					so->so_error = ECONNREFUSED;
726					goto close;
727				case TCPS_ESTABLISHED:
728				case TCPS_FIN_WAIT_1:
729				case TCPS_FIN_WAIT_2:
730				case TCPS_CLOSE_WAIT:
731				case TCPS_CLOSING:
732				case TCPS_LAST_ACK:
733					so->so_error = ECONNRESET;
734				close:
735					/* FALLTHROUGH */
736				default:
737					tp = tcp_close(tp);
738				}
739			} else {
740				TCPSTAT_INC(tcps_badrst);
741				/* Send challenge ACK. */
742				tcp_respond(tp, mtod(m, void *), th, m,
743				    tp->rcv_nxt, tp->snd_nxt, TH_ACK);
744				tp->last_ack_sent = tp->rcv_nxt;
745				m = NULL;
746			}
747		}
748		goto drop;
749	}
750
751	/*
752	 * RFC5961 Section 4.2
753	 * Send challenge ACK for any SYN in synchronized state.
754	 */
755	if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT) {
756		KASSERT(ti_locked == TI_RLOCKED,
757		    ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked));
758		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
759
760		TCPSTAT_INC(tcps_badsyn);
761		if (V_tcp_insecure_syn &&
762		    SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
763		    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
764			tp = tcp_drop(tp, ECONNRESET);
765			rstreason = BANDLIM_UNLIMITED;
766		} else {
767			/* Send challenge ACK. */
768			tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
769			    tp->snd_nxt, TH_ACK);
770			tp->last_ack_sent = tp->rcv_nxt;
771			m = NULL;
772		}
773		goto drop;
774	}
775
776	/*
777	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
778	 * and it's less than ts_recent, drop it.
779	 */
780	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
781	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
782
783		/* Check to see if ts_recent is over 24 days old.  */
784		if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
785			/*
786			 * Invalidate ts_recent.  If this segment updates
787			 * ts_recent, the age will be reset later and ts_recent
788			 * will get a valid value.  If it does not, setting
789			 * ts_recent to zero will at least satisfy the
790			 * requirement that zero be placed in the timestamp
791			 * echo reply when ts_recent isn't valid.  The
792			 * age isn't reset until we get a valid ts_recent
793			 * because we don't want out-of-order segments to be
794			 * dropped when ts_recent is old.
795			 */
796			tp->ts_recent = 0;
797		} else {
798			TCPSTAT_INC(tcps_rcvduppack);
799			TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
800			TCPSTAT_INC(tcps_pawsdrop);
801			if (tlen)
802				goto dropafterack;
803			goto drop;
804		}
805	}
806
807	/*
808	 * In the SYN-RECEIVED state, validate that the packet belongs to
809	 * this connection before trimming the data to fit the receive
810	 * window.  Check the sequence number versus IRS since we know
811	 * the sequence numbers haven't wrapped.  This is a partial fix
812	 * for the "LAND" DoS attack.
813	 */
814	if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
815		rstreason = BANDLIM_RST_OPENPORT;
816		goto dropwithreset;
817	}
818
819	todrop = tp->rcv_nxt - th->th_seq;
820	if (todrop > 0) {
821		if (thflags & TH_SYN) {
822			thflags &= ~TH_SYN;
823			th->th_seq++;
824			if (th->th_urp > 1)
825				th->th_urp--;
826			else
827				thflags &= ~TH_URG;
828			todrop--;
829		}
830		/*
831		 * Following if statement from Stevens, vol. 2, p. 960.
832		 */
833		if (todrop > tlen
834		    || (todrop == tlen && (thflags & TH_FIN) == 0)) {
835			/*
836			 * Any valid FIN must be to the left of the window.
837			 * At this point the FIN must be a duplicate or out
838			 * of sequence; drop it.
839			 */
840			thflags &= ~TH_FIN;
841
842			/*
843			 * Send an ACK to resynchronize and drop any data.
844			 * But keep on processing for RST or ACK.
845			 */
846			tp->t_flags |= TF_ACKNOW;
847			todrop = tlen;
848			TCPSTAT_INC(tcps_rcvduppack);
849			TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
850		} else {
851			TCPSTAT_INC(tcps_rcvpartduppack);
852			TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
853		}
854		drop_hdrlen += todrop;	/* drop from the top afterwards */
855		th->th_seq += todrop;
856		tlen -= todrop;
857		if (th->th_urp > todrop)
858			th->th_urp -= todrop;
859		else {
860			thflags &= ~TH_URG;
861			th->th_urp = 0;
862		}
863	}
864
865	/*
866	 * If new data are received on a connection after the
867	 * user processes are gone, then RST the other end.
868	 */
869	if ((so->so_state & SS_NOFDREF) &&
870	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
871		KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && "
872		    "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked));
873		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
874
875		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
876			log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data "
877			    "after socket was closed, "
878			    "sending RST and removing tcpcb\n",
879			    s, __func__, tcpstates[tp->t_state], tlen);
880			free(s, M_TCPLOG);
881		}
882		tp = tcp_close(tp);
883		TCPSTAT_INC(tcps_rcvafterclose);
884		rstreason = BANDLIM_UNLIMITED;
885		goto dropwithreset;
886	}
887
888	/*
889	 * If segment ends after window, drop trailing data
890	 * (and PUSH and FIN); if nothing left, just ACK.
891	 */
892	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
893	if (todrop > 0) {
894		TCPSTAT_INC(tcps_rcvpackafterwin);
895		if (todrop >= tlen) {
896			TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
897			/*
898			 * If window is closed can only take segments at
899			 * window edge, and have to drop data and PUSH from
900			 * incoming segments.  Continue processing, but
901			 * remember to ack.  Otherwise, drop segment
902			 * and ack.
903			 */
904			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
905				tp->t_flags |= TF_ACKNOW;
906				TCPSTAT_INC(tcps_rcvwinprobe);
907			} else
908				goto dropafterack;
909		} else
910			TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
911		m_adj(m, -todrop);
912		tlen -= todrop;
913		thflags &= ~(TH_PUSH|TH_FIN);
914	}
915
916	/*
917	 * If last ACK falls within this segment's sequence numbers,
918	 * record its timestamp.
919	 * NOTE:
920	 * 1) That the test incorporates suggestions from the latest
921	 *    proposal of the tcplw@cray.com list (Braden 1993/04/26).
922	 * 2) That updating only on newer timestamps interferes with
923	 *    our earlier PAWS tests, so this check should be solely
924	 *    predicated on the sequence space of this segment.
925	 * 3) That we modify the segment boundary check to be
926	 *        Last.ACK.Sent <= SEG.SEQ + SEG.Len
927	 *    instead of RFC1323's
928	 *        Last.ACK.Sent < SEG.SEQ + SEG.Len,
929	 *    This modified check allows us to overcome RFC1323's
930	 *    limitations as described in Stevens TCP/IP Illustrated
931	 *    Vol. 2 p.869. In such cases, we can still calculate the
932	 *    RTT correctly when RCV.NXT == Last.ACK.Sent.
933	 */
934	if ((to->to_flags & TOF_TS) != 0 &&
935	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
936	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
937		((thflags & (TH_SYN|TH_FIN)) != 0))) {
938		tp->ts_recent_age = tcp_ts_getticks();
939		tp->ts_recent = to->to_tsval;
940	}
941
942	/*
943	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN
944	 * flag is on (half-synchronized state), then queue data for
945	 * later processing; else drop segment and return.
946	 */
947	if ((thflags & TH_ACK) == 0) {
948		if (tp->t_state == TCPS_SYN_RECEIVED ||
949		    (tp->t_flags & TF_NEEDSYN))
950			goto step6;
951		else if (tp->t_flags & TF_ACKNOW)
952			goto dropafterack;
953		else
954			goto drop;
955	}
956
957	/*
958	 * Ack processing.
959	 */
960	switch (tp->t_state) {
961
962	/*
963	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
964	 * ESTABLISHED state and continue processing.
965	 * The ACK was checked above.
966	 */
967	case TCPS_SYN_RECEIVED:
968
969		TCPSTAT_INC(tcps_connects);
970		soisconnected(so);
971		/* Do window scaling? */
972		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
973			(TF_RCVD_SCALE|TF_REQ_SCALE)) {
974			tp->rcv_scale = tp->request_r_scale;
975			tp->snd_wnd = tiwin;
976		}
977		/*
978		 * Make transitions:
979		 *      SYN-RECEIVED  -> ESTABLISHED
980		 *      SYN-RECEIVED* -> FIN-WAIT-1
981		 */
982		tp->t_starttime = ticks;
983		if (tp->t_flags & TF_NEEDFIN) {
984			tcp_state_change(tp, TCPS_FIN_WAIT_1);
985			tp->t_flags &= ~TF_NEEDFIN;
986		} else {
987			tcp_state_change(tp, TCPS_ESTABLISHED);
988			TCP_PROBE5(accept__established, NULL, tp,
989			    mtod(m, const char *), tp, th);
990			cc_conn_init(tp);
991			tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
992		}
993		/*
994		 * If segment contains data or ACK, will call tcp_reass()
995		 * later; if not, do so now to pass queued data to user.
996		 */
997		if (tlen == 0 && (thflags & TH_FIN) == 0)
998			(void) tcp_reass(tp, (struct tcphdr *)0, 0,
999			    (struct mbuf *)0);
1000		tp->snd_wl1 = th->th_seq - 1;
1001		/* FALLTHROUGH */
1002
1003	/*
1004	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1005	 * ACKs.  If the ack is in the range
1006	 *	tp->snd_una < th->th_ack <= tp->snd_max
1007	 * then advance tp->snd_una to th->th_ack and drop
1008	 * data from the retransmission queue.  If this ACK reflects
1009	 * more up to date window information we update our window information.
1010	 */
1011	case TCPS_ESTABLISHED:
1012	case TCPS_FIN_WAIT_1:
1013	case TCPS_FIN_WAIT_2:
1014	case TCPS_CLOSE_WAIT:
1015	case TCPS_CLOSING:
1016	case TCPS_LAST_ACK:
1017		if (SEQ_GT(th->th_ack, tp->snd_max)) {
1018			TCPSTAT_INC(tcps_rcvacktoomuch);
1019			goto dropafterack;
1020		}
1021		if ((tp->t_flags & TF_SACK_PERMIT) &&
1022		    ((to->to_flags & TOF_SACK) ||
1023		     !TAILQ_EMPTY(&tp->snd_holes)))
1024			tcp_sack_doack(tp, to, th->th_ack);
1025		else
1026			/*
1027			 * Reset the value so that previous (valid) value
1028			 * from the last ack with SACK doesn't get used.
1029			 */
1030			tp->sackhint.sacked_bytes = 0;
1031
1032		/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
1033		hhook_run_tcp_est_in(tp, th, to);
1034
1035		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
1036			if (tlen == 0 && tiwin == tp->snd_wnd) {
1037				/*
1038				 * If this is the first time we've seen a
1039				 * FIN from the remote, this is not a
1040				 * duplicate and it needs to be processed
1041				 * normally.  This happens during a
1042				 * simultaneous close.
1043				 */
1044				if ((thflags & TH_FIN) &&
1045				    (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
1046					tp->t_dupacks = 0;
1047					break;
1048				}
1049				TCPSTAT_INC(tcps_rcvdupack);
1050				/*
1051				 * If we have outstanding data (other than
1052				 * a window probe), this is a completely
1053				 * duplicate ack (ie, window info didn't
1054				 * change and FIN isn't set),
1055				 * the ack is the biggest we've
1056				 * seen and we've seen exactly our rexmt
1057				 * threshold of them, assume a packet
1058				 * has been dropped and retransmit it.
1059				 * Kludge snd_nxt & the congestion
1060				 * window so we send only this one
1061				 * packet.
1062				 *
1063				 * We know we're losing at the current
1064				 * window size so do congestion avoidance
1065				 * (set ssthresh to half the current window
1066				 * and pull our congestion window back to
1067				 * the new ssthresh).
1068				 *
1069				 * Dup acks mean that packets have left the
1070				 * network (they're now cached at the receiver)
1071				 * so bump cwnd by the amount in the receiver
1072				 * to keep a constant cwnd packets in the
1073				 * network.
1074				 *
1075				 * When using TCP ECN, notify the peer that
1076				 * we reduced the cwnd.
1077				 */
1078				if (!tcp_timer_active(tp, TT_REXMT) ||
1079				    th->th_ack != tp->snd_una)
1080					tp->t_dupacks = 0;
1081				else if (++tp->t_dupacks > tcprexmtthresh ||
1082				     IN_FASTRECOVERY(tp->t_flags)) {
1083					cc_ack_received(tp, th, CC_DUPACK);
1084					if ((tp->t_flags & TF_SACK_PERMIT) &&
1085					    IN_FASTRECOVERY(tp->t_flags)) {
1086						int awnd;
1087
1088						/*
1089						 * Compute the amount of data in flight first.
1090						 * We can inject new data into the pipe iff
1091						 * we have less than 1/2 the original window's
1092						 * worth of data in flight.
1093						 */
1094						if (V_tcp_do_rfc6675_pipe)
1095							awnd = tcp_compute_pipe(tp);
1096						else
1097							awnd = (tp->snd_nxt - tp->snd_fack) +
1098								tp->sackhint.sack_bytes_rexmit;
1099
1100						if (awnd < tp->snd_ssthresh) {
1101							tp->snd_cwnd += tp->t_maxseg;
1102							if (tp->snd_cwnd > tp->snd_ssthresh)
1103								tp->snd_cwnd = tp->snd_ssthresh;
1104						}
1105					} else
1106						tp->snd_cwnd += tp->t_maxseg;
1107					(void) tp->t_fb->tfb_tcp_output(tp);
1108					goto drop;
1109				} else if (tp->t_dupacks == tcprexmtthresh) {
1110					tcp_seq onxt = tp->snd_nxt;
1111
1112					/*
1113					 * If we're doing sack, check to
1114					 * see if we're already in sack
1115					 * recovery. If we're not doing sack,
1116					 * check to see if we're in newreno
1117					 * recovery.
1118					 */
1119					if (tp->t_flags & TF_SACK_PERMIT) {
1120						if (IN_FASTRECOVERY(tp->t_flags)) {
1121							tp->t_dupacks = 0;
1122							break;
1123						}
1124					} else {
1125						if (SEQ_LEQ(th->th_ack,
1126						    tp->snd_recover)) {
1127							tp->t_dupacks = 0;
1128							break;
1129						}
1130					}
1131					/* Congestion signal before ack. */
1132					cc_cong_signal(tp, th, CC_NDUPACK);
1133					cc_ack_received(tp, th, CC_DUPACK);
1134					tcp_timer_activate(tp, TT_REXMT, 0);
1135					tp->t_rtttime = 0;
1136					if (tp->t_flags & TF_SACK_PERMIT) {
1137						TCPSTAT_INC(
1138						    tcps_sack_recovery_episode);
1139						tp->sack_newdata = tp->snd_nxt;
1140						tp->snd_cwnd = tp->t_maxseg;
1141						(void) tp->t_fb->tfb_tcp_output(tp);
1142						goto drop;
1143					}
1144					tp->snd_nxt = th->th_ack;
1145					tp->snd_cwnd = tp->t_maxseg;
1146					(void) tp->t_fb->tfb_tcp_output(tp);
1147					KASSERT(tp->snd_limited <= 2,
1148					    ("%s: tp->snd_limited too big",
1149					    __func__));
1150					tp->snd_cwnd = tp->snd_ssthresh +
1151					     tp->t_maxseg *
1152					     (tp->t_dupacks - tp->snd_limited);
1153					if (SEQ_GT(onxt, tp->snd_nxt))
1154						tp->snd_nxt = onxt;
1155					goto drop;
1156				} else if (V_tcp_do_rfc3042) {
1157					/*
1158					 * Process first and second duplicate
1159					 * ACKs. Each indicates a segment
1160					 * leaving the network, creating room
1161					 * for more. Make sure we can send a
1162					 * packet on reception of each duplicate
1163					 * ACK by increasing snd_cwnd by one
1164					 * segment. Restore the original
1165					 * snd_cwnd after packet transmission.
1166					 */
1167					cc_ack_received(tp, th, CC_DUPACK);
1168					u_long oldcwnd = tp->snd_cwnd;
1169					tcp_seq oldsndmax = tp->snd_max;
1170					u_int sent;
1171					int avail;
1172
1173					KASSERT(tp->t_dupacks == 1 ||
1174					    tp->t_dupacks == 2,
1175					    ("%s: dupacks not 1 or 2",
1176					    __func__));
1177					if (tp->t_dupacks == 1)
1178						tp->snd_limited = 0;
1179					tp->snd_cwnd =
1180					    (tp->snd_nxt - tp->snd_una) +
1181					    (tp->t_dupacks - tp->snd_limited) *
1182					    tp->t_maxseg;
1183					/*
1184					 * Only call tcp_output when there
1185					 * is new data available to be sent.
1186					 * Otherwise we would send pure ACKs.
1187					 */
1188					SOCKBUF_LOCK(&so->so_snd);
1189					avail = sbavail(&so->so_snd) -
1190					    (tp->snd_nxt - tp->snd_una);
1191					SOCKBUF_UNLOCK(&so->so_snd);
1192					if (avail > 0)
1193						(void) tp->t_fb->tfb_tcp_output(tp);
1194					sent = tp->snd_max - oldsndmax;
1195					if (sent > tp->t_maxseg) {
1196						KASSERT((tp->t_dupacks == 2 &&
1197						    tp->snd_limited == 0) ||
1198						   (sent == tp->t_maxseg + 1 &&
1199						    tp->t_flags & TF_SENTFIN),
1200						    ("%s: sent too much",
1201						    __func__));
1202						tp->snd_limited = 2;
1203					} else if (sent > 0)
1204						++tp->snd_limited;
1205					tp->snd_cwnd = oldcwnd;
1206					goto drop;
1207				}
1208			} else
1209				tp->t_dupacks = 0;
1210			break;
1211		}
1212
1213		KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
1214		    ("%s: th_ack <= snd_una", __func__));
1215
1216		/*
1217		 * If the congestion window was inflated to account
1218		 * for the other side's cached packets, retract it.
1219		 */
1220		if (IN_FASTRECOVERY(tp->t_flags)) {
1221			if (SEQ_LT(th->th_ack, tp->snd_recover)) {
1222				if (tp->t_flags & TF_SACK_PERMIT)
1223					tcp_sack_partialack(tp, th);
1224				else
1225					tcp_newreno_partial_ack(tp, th);
1226			} else
1227				cc_post_recovery(tp, th);
1228		}
1229		tp->t_dupacks = 0;
1230		/*
1231		 * If we reach this point, ACK is not a duplicate,
1232		 *     i.e., it ACKs something we sent.
1233		 */
1234		if (tp->t_flags & TF_NEEDSYN) {
1235			/*
1236			 * T/TCP: Connection was half-synchronized, and our
1237			 * SYN has been ACK'd (so connection is now fully
1238			 * synchronized).  Go to non-starred state,
1239			 * increment snd_una for ACK of SYN, and check if
1240			 * we can do window scaling.
1241			 */
1242			tp->t_flags &= ~TF_NEEDSYN;
1243			tp->snd_una++;
1244			/* Do window scaling? */
1245			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1246				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
1247				tp->rcv_scale = tp->request_r_scale;
1248				/* Send window already scaled. */
1249			}
1250		}
1251
1252process_ACK:
1253		INP_WLOCK_ASSERT(tp->t_inpcb);
1254
1255		acked = BYTES_THIS_ACK(tp, th);
1256		TCPSTAT_INC(tcps_rcvackpack);
1257		TCPSTAT_ADD(tcps_rcvackbyte, acked);
1258
1259		/*
1260		 * If we just performed our first retransmit, and the ACK
1261		 * arrives within our recovery window, then it was a mistake
1262		 * to do the retransmit in the first place.  Recover our
1263		 * original cwnd and ssthresh, and proceed to transmit where
1264		 * we left off.
1265		 */
1266		if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID &&
1267		    (int)(ticks - tp->t_badrxtwin) < 0)
1268			cc_cong_signal(tp, th, CC_RTO_ERR);
1269
1270		/*
1271		 * If we have a timestamp reply, update smoothed
1272		 * round trip time.  If no timestamp is present but
1273		 * transmit timer is running and timed sequence
1274		 * number was acked, update smoothed round trip time.
1275		 * Since we now have an rtt measurement, cancel the
1276		 * timer backoff (cf., Phil Karn's retransmit alg.).
1277		 * Recompute the initial retransmit timer.
1278		 *
1279		 * Some boxes send broken timestamp replies
1280		 * during the SYN+ACK phase, ignore
1281		 * timestamps of 0 or we could calculate a
1282		 * huge RTT and blow up the retransmit timer.
1283		 */
1284		if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
1285			u_int t;
1286
1287			t = tcp_ts_getticks() - to->to_tsecr;
1288			if (!tp->t_rttlow || tp->t_rttlow > t)
1289				tp->t_rttlow = t;
1290			tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1);
1291		} else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
1292			if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
1293				tp->t_rttlow = ticks - tp->t_rtttime;
1294			tcp_xmit_timer(tp, ticks - tp->t_rtttime);
1295		}
1296
1297		/*
1298		 * If all outstanding data is acked, stop retransmit
1299		 * timer and remember to restart (more output or persist).
1300		 * If there is more data to be acked, restart retransmit
1301		 * timer, using current (possibly backed-off) value.
1302		 */
1303		if (th->th_ack == tp->snd_max) {
1304			tcp_timer_activate(tp, TT_REXMT, 0);
1305			needoutput = 1;
1306		} else if (!tcp_timer_active(tp, TT_PERSIST))
1307			tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
1308
1309		/*
1310		 * If no data (only SYN) was ACK'd,
1311		 *    skip rest of ACK processing.
1312		 */
1313		if (acked == 0)
1314			goto step6;
1315
1316		/*
1317		 * Let the congestion control algorithm update congestion
1318		 * control related information. This typically means increasing
1319		 * the congestion window.
1320		 */
1321		cc_ack_received(tp, th, CC_ACK);
1322
1323		SOCKBUF_LOCK(&so->so_snd);
1324		if (acked > sbavail(&so->so_snd)) {
1325			tp->snd_wnd -= sbavail(&so->so_snd);
1326			mfree = sbcut_locked(&so->so_snd,
1327			    (int)sbavail(&so->so_snd));
1328			ourfinisacked = 1;
1329		} else {
1330			mfree = sbcut_locked(&so->so_snd, acked);
1331			tp->snd_wnd -= acked;
1332			ourfinisacked = 0;
1333		}
1334		/* NB: sowwakeup_locked() does an implicit unlock. */
1335		sowwakeup_locked(so);
1336		m_freem(mfree);
1337		/* Detect una wraparound. */
1338		if (!IN_RECOVERY(tp->t_flags) &&
1339		    SEQ_GT(tp->snd_una, tp->snd_recover) &&
1340		    SEQ_LEQ(th->th_ack, tp->snd_recover))
1341			tp->snd_recover = th->th_ack - 1;
1342		/* XXXLAS: Can this be moved up into cc_post_recovery? */
1343		if (IN_RECOVERY(tp->t_flags) &&
1344		    SEQ_GEQ(th->th_ack, tp->snd_recover)) {
1345			EXIT_RECOVERY(tp->t_flags);
1346		}
1347		tp->snd_una = th->th_ack;
1348		if (tp->t_flags & TF_SACK_PERMIT) {
1349			if (SEQ_GT(tp->snd_una, tp->snd_recover))
1350				tp->snd_recover = tp->snd_una;
1351		}
1352		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1353			tp->snd_nxt = tp->snd_una;
1354
1355		switch (tp->t_state) {
1356
1357		/*
1358		 * In FIN_WAIT_1 STATE in addition to the processing
1359		 * for the ESTABLISHED state if our FIN is now acknowledged
1360		 * then enter FIN_WAIT_2.
1361		 */
1362		case TCPS_FIN_WAIT_1:
1363			if (ourfinisacked) {
1364				/*
1365				 * If we can't receive any more
1366				 * data, then closing user can proceed.
1367				 * Starting the timer is contrary to the
1368				 * specification, but if we don't get a FIN
1369				 * we'll hang forever.
1370				 *
1371				 * XXXjl:
1372				 * we should release the tp also, and use a
1373				 * compressed state.
1374				 */
1375				if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1376					soisdisconnected(so);
1377					tcp_timer_activate(tp, TT_2MSL,
1378					    (tcp_fast_finwait2_recycle ?
1379					    tcp_finwait2_timeout :
1380					    TP_MAXIDLE(tp)));
1381				}
1382				tcp_state_change(tp, TCPS_FIN_WAIT_2);
1383			}
1384			break;
1385
1386		/*
1387		 * In CLOSING STATE in addition to the processing for
1388		 * the ESTABLISHED state if the ACK acknowledges our FIN
1389		 * then enter the TIME-WAIT state, otherwise ignore
1390		 * the segment.
1391		 */
1392		case TCPS_CLOSING:
1393			if (ourfinisacked) {
1394				INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1395				tcp_twstart(tp);
1396				INP_INFO_RUNLOCK(&V_tcbinfo);
1397				m_freem(m);
1398				return;
1399			}
1400			break;
1401
1402		/*
1403		 * In LAST_ACK, we may still be waiting for data to drain
1404		 * and/or to be acked, as well as for the ack of our FIN.
1405		 * If our FIN is now acknowledged, delete the TCB,
1406		 * enter the closed state and return.
1407		 */
1408		case TCPS_LAST_ACK:
1409			if (ourfinisacked) {
1410				INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1411				tp = tcp_close(tp);
1412				goto drop;
1413			}
1414			break;
1415		}
1416	}
1417
1418step6:
1419	INP_WLOCK_ASSERT(tp->t_inpcb);
1420
1421	/*
1422	 * Update window information.
1423	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
1424	 */
1425	if ((thflags & TH_ACK) &&
1426	    (SEQ_LT(tp->snd_wl1, th->th_seq) ||
1427	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
1428	     (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
1429		/* keep track of pure window updates */
1430		if (tlen == 0 &&
1431		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
1432			TCPSTAT_INC(tcps_rcvwinupd);
1433		tp->snd_wnd = tiwin;
1434		tp->snd_wl1 = th->th_seq;
1435		tp->snd_wl2 = th->th_ack;
1436		if (tp->snd_wnd > tp->max_sndwnd)
1437			tp->max_sndwnd = tp->snd_wnd;
1438		needoutput = 1;
1439	}
1440
1441	/*
1442	 * Process segments with URG.
1443	 */
1444	if ((thflags & TH_URG) && th->th_urp &&
1445	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1446		/*
1447		 * This is a kludge, but if we receive and accept
1448		 * random urgent pointers, we'll crash in
1449		 * soreceive.  It's hard to imagine someone
1450		 * actually wanting to send this much urgent data.
1451		 */
1452		SOCKBUF_LOCK(&so->so_rcv);
1453		if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
1454			th->th_urp = 0;			/* XXX */
1455			thflags &= ~TH_URG;		/* XXX */
1456			SOCKBUF_UNLOCK(&so->so_rcv);	/* XXX */
1457			goto dodata;			/* XXX */
1458		}
1459		/*
1460		 * If this segment advances the known urgent pointer,
1461		 * then mark the data stream.  This should not happen
1462		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
1463		 * a FIN has been received from the remote side.
1464		 * In these states we ignore the URG.
1465		 *
1466		 * According to RFC961 (Assigned Protocols),
1467		 * the urgent pointer points to the last octet
1468		 * of urgent data.  We continue, however,
1469		 * to consider it to indicate the first octet
1470		 * of data past the urgent section as the original
1471		 * spec states (in one of two places).
1472		 */
1473		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
1474			tp->rcv_up = th->th_seq + th->th_urp;
1475			so->so_oobmark = sbavail(&so->so_rcv) +
1476			    (tp->rcv_up - tp->rcv_nxt) - 1;
1477			if (so->so_oobmark == 0)
1478				so->so_rcv.sb_state |= SBS_RCVATMARK;
1479			sohasoutofband(so);
1480			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
1481		}
1482		SOCKBUF_UNLOCK(&so->so_rcv);
1483		/*
1484		 * Remove out of band data so doesn't get presented to user.
1485		 * This can happen independent of advancing the URG pointer,
1486		 * but if two URG's are pending at once, some out-of-band
1487		 * data may creep in... ick.
1488		 */
1489		if (th->th_urp <= (u_long)tlen &&
1490		    !(so->so_options & SO_OOBINLINE)) {
1491			/* hdr drop is delayed */
1492			tcp_pulloutofband(so, th, m, drop_hdrlen);
1493		}
1494	} else {
1495		/*
1496		 * If no out of band data is expected,
1497		 * pull receive urgent pointer along
1498		 * with the receive window.
1499		 */
1500		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
1501			tp->rcv_up = tp->rcv_nxt;
1502	}
1503dodata:							/* XXX */
1504	INP_WLOCK_ASSERT(tp->t_inpcb);
1505
1506	/*
1507	 * Process the segment text, merging it into the TCP sequencing queue,
1508	 * and arranging for acknowledgment of receipt if necessary.
1509	 * This process logically involves adjusting tp->rcv_wnd as data
1510	 * is presented to the user (this happens in tcp_usrreq.c,
1511	 * case PRU_RCVD).  If a FIN has already been received on this
1512	 * connection then we just ignore the text.
1513	 */
1514	if ((tlen || (thflags & TH_FIN)) &&
1515	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1516		tcp_seq save_start = th->th_seq;
1517		m_adj(m, drop_hdrlen);	/* delayed header drop */
1518		/*
1519		 * Insert segment which includes th into TCP reassembly queue
1520		 * with control block tp.  Set thflags to whether reassembly now
1521		 * includes a segment with FIN.  This handles the common case
1522		 * inline (segment is the next to be received on an established
1523		 * connection, and the queue is empty), avoiding linkage into
1524		 * and removal from the queue and repetition of various
1525		 * conversions.
1526		 * Set DELACK for segments received in order, but ack
1527		 * immediately when segments are out of order (so
1528		 * fast retransmit can work).
1529		 */
1530		if (th->th_seq == tp->rcv_nxt &&
1531		    LIST_EMPTY(&tp->t_segq) &&
1532		    TCPS_HAVEESTABLISHED(tp->t_state)) {
1533			if (DELAY_ACK(tp, tlen))
1534				tp->t_flags |= TF_DELACK;
1535			else
1536				tp->t_flags |= TF_ACKNOW;
1537			tp->rcv_nxt += tlen;
1538			thflags = th->th_flags & TH_FIN;
1539			TCPSTAT_INC(tcps_rcvpack);
1540			TCPSTAT_ADD(tcps_rcvbyte, tlen);
1541			SOCKBUF_LOCK(&so->so_rcv);
1542			if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
1543				m_freem(m);
1544			else
1545				sbappendstream_locked(&so->so_rcv, m, 0);
1546			/* NB: sorwakeup_locked() does an implicit unlock. */
1547			sorwakeup_locked(so);
1548		} else {
1549			/*
1550			 * XXX: Due to the header drop above "th" is
1551			 * theoretically invalid by now.  Fortunately
1552			 * m_adj() doesn't actually frees any mbufs
1553			 * when trimming from the head.
1554			 */
1555			thflags = tcp_reass(tp, th, &tlen, m);
1556			tp->t_flags |= TF_ACKNOW;
1557		}
1558		if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT))
1559			tcp_update_sack_list(tp, save_start, save_start + tlen);
1560#if 0
1561		/*
1562		 * Note the amount of data that peer has sent into
1563		 * our window, in order to estimate the sender's
1564		 * buffer size.
1565		 * XXX: Unused.
1566		 */
1567		if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
1568			len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
1569		else
1570			len = so->so_rcv.sb_hiwat;
1571#endif
1572	} else {
1573		m_freem(m);
1574		thflags &= ~TH_FIN;
1575	}
1576
1577	/*
1578	 * If FIN is received ACK the FIN and let the user know
1579	 * that the connection is closing.
1580	 */
1581	if (thflags & TH_FIN) {
1582		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1583			socantrcvmore(so);
1584			/*
1585			 * If connection is half-synchronized
1586			 * (ie NEEDSYN flag on) then delay ACK,
1587			 * so it may be piggybacked when SYN is sent.
1588			 * Otherwise, since we received a FIN then no
1589			 * more input can be expected, send ACK now.
1590			 */
1591			if (tp->t_flags & TF_NEEDSYN)
1592				tp->t_flags |= TF_DELACK;
1593			else
1594				tp->t_flags |= TF_ACKNOW;
1595			tp->rcv_nxt++;
1596		}
1597		switch (tp->t_state) {
1598
1599		/*
1600		 * In SYN_RECEIVED and ESTABLISHED STATES
1601		 * enter the CLOSE_WAIT state.
1602		 */
1603		case TCPS_SYN_RECEIVED:
1604			tp->t_starttime = ticks;
1605			/* FALLTHROUGH */
1606		case TCPS_ESTABLISHED:
1607			tcp_state_change(tp, TCPS_CLOSE_WAIT);
1608			break;
1609
1610		/*
1611		 * If still in FIN_WAIT_1 STATE FIN has not been acked so
1612		 * enter the CLOSING state.
1613		 */
1614		case TCPS_FIN_WAIT_1:
1615			tcp_state_change(tp, TCPS_CLOSING);
1616			break;
1617
1618		/*
1619		 * In FIN_WAIT_2 state enter the TIME_WAIT state,
1620		 * starting the time-wait timer, turning off the other
1621		 * standard timers.
1622		 */
1623		case TCPS_FIN_WAIT_2:
1624			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1625			KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata "
1626			    "TCP_FIN_WAIT_2 ti_locked: %d", __func__,
1627			    ti_locked));
1628
1629			tcp_twstart(tp);
1630			INP_INFO_RUNLOCK(&V_tcbinfo);
1631			return;
1632		}
1633	}
1634	if (ti_locked == TI_RLOCKED) {
1635		INP_INFO_RUNLOCK(&V_tcbinfo);
1636	}
1637	ti_locked = TI_UNLOCKED;
1638
1639#ifdef TCPDEBUG
1640	if (so->so_options & SO_DEBUG)
1641		tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
1642			  &tcp_savetcp, 0);
1643#endif
1644	TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
1645
1646	/*
1647	 * Return any desired output.
1648	 */
1649	if (needoutput || (tp->t_flags & TF_ACKNOW))
1650		(void) tp->t_fb->tfb_tcp_output(tp);
1651
1652	KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
1653	    __func__, ti_locked));
1654	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
1655	INP_WLOCK_ASSERT(tp->t_inpcb);
1656
1657	if (tp->t_flags & TF_DELACK) {
1658		tp->t_flags &= ~TF_DELACK;
1659		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
1660	}
1661	INP_WUNLOCK(tp->t_inpcb);
1662	return;
1663
1664dropafterack:
1665	/*
1666	 * Generate an ACK dropping incoming segment if it occupies
1667	 * sequence space, where the ACK reflects our state.
1668	 *
1669	 * We can now skip the test for the RST flag since all
1670	 * paths to this code happen after packets containing
1671	 * RST have been dropped.
1672	 *
1673	 * In the SYN-RECEIVED state, don't send an ACK unless the
1674	 * segment we received passes the SYN-RECEIVED ACK test.
1675	 * If it fails send a RST.  This breaks the loop in the
1676	 * "LAND" DoS attack, and also prevents an ACK storm
1677	 * between two listening ports that have been sent forged
1678	 * SYN segments, each with the source address of the other.
1679	 */
1680	if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
1681	    (SEQ_GT(tp->snd_una, th->th_ack) ||
1682	     SEQ_GT(th->th_ack, tp->snd_max)) ) {
1683		rstreason = BANDLIM_RST_OPENPORT;
1684		goto dropwithreset;
1685	}
1686#ifdef TCPDEBUG
1687	if (so->so_options & SO_DEBUG)
1688		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
1689			  &tcp_savetcp, 0);
1690#endif
1691	TCP_PROBE3(debug__drop, tp, th, mtod(m, const char *));
1692	if (ti_locked == TI_RLOCKED) {
1693		INP_INFO_RUNLOCK(&V_tcbinfo);
1694	}
1695	ti_locked = TI_UNLOCKED;
1696
1697	tp->t_flags |= TF_ACKNOW;
1698	(void) tp->t_fb->tfb_tcp_output(tp);
1699	INP_WUNLOCK(tp->t_inpcb);
1700	m_freem(m);
1701	return;
1702
1703dropwithreset:
1704	if (ti_locked == TI_RLOCKED) {
1705		INP_INFO_RUNLOCK(&V_tcbinfo);
1706	}
1707	ti_locked = TI_UNLOCKED;
1708
1709	if (tp != NULL) {
1710		tcp_dropwithreset(m, th, tp, tlen, rstreason);
1711		INP_WUNLOCK(tp->t_inpcb);
1712	} else
1713		tcp_dropwithreset(m, th, NULL, tlen, rstreason);
1714	return;
1715
1716drop:
1717	if (ti_locked == TI_RLOCKED) {
1718		INP_INFO_RUNLOCK(&V_tcbinfo);
1719		ti_locked = TI_UNLOCKED;
1720	}
1721#ifdef INVARIANTS
1722	else
1723		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
1724#endif
1725
1726	/*
1727	 * Drop space held by incoming segment and return.
1728	 */
1729#ifdef TCPDEBUG
1730	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
1731		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
1732			  &tcp_savetcp, 0);
1733#endif
1734	TCP_PROBE3(debug__drop, tp, th, mtod(m, const char *));
1735	if (tp != NULL)
1736		INP_WUNLOCK(tp->t_inpcb);
1737	m_freem(m);
1738}
1739
1740
1741/*
1742 * Do fast slow is a combination of the original
1743 * tcp_dosegment and a split fastpath, one function
1744 * for the fast-ack which also includes allowing fastpath
1745 * for window advanced in sequence acks. And also a
1746 * sub-function that handles the insequence data.
1747 */
1748void
1749tcp_do_segment_fastslow(struct mbuf *m, struct tcphdr *th, struct socket *so,
1750			struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
1751			int ti_locked)
1752{
1753	int thflags;
1754	u_long tiwin;
1755	char *s;
1756	int can_enter;
1757	struct in_conninfo *inc;
1758	struct tcpopt to;
1759
1760	thflags = th->th_flags;
1761	tp->sackhint.last_sack_ack = 0;
1762	inc = &tp->t_inpcb->inp_inc;
1763	/*
1764	 * If this is either a state-changing packet or current state isn't
1765	 * established, we require a write lock on tcbinfo.  Otherwise, we
1766	 * allow the tcbinfo to be in either alocked or unlocked, as the
1767	 * caller may have unnecessarily acquired a write lock due to a race.
1768	 */
1769	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
1770	    tp->t_state != TCPS_ESTABLISHED) {
1771		KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
1772						  "SYN/FIN/RST/!EST", __func__, ti_locked));
1773		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1774	} else {
1775#ifdef INVARIANTS
1776		if (ti_locked == TI_RLOCKED) {
1777			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1778		} else {
1779			KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
1780							   "ti_locked: %d", __func__, ti_locked));
1781			INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
1782		}
1783#endif
1784	}
1785	INP_WLOCK_ASSERT(tp->t_inpcb);
1786	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
1787					    __func__));
1788	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
1789						__func__));
1790
1791	/*
1792	 * Segment received on connection.
1793	 * Reset idle time and keep-alive timer.
1794	 * XXX: This should be done after segment
1795	 * validation to ignore broken/spoofed segs.
1796	 */
1797	tp->t_rcvtime = ticks;
1798	if (TCPS_HAVEESTABLISHED(tp->t_state))
1799		tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
1800
1801	/*
1802	 * Unscale the window into a 32-bit value.
1803	 * For the SYN_SENT state the scale is zero.
1804	 */
1805	tiwin = th->th_win << tp->snd_scale;
1806
1807	/*
1808	 * TCP ECN processing.
1809	 */
1810	if (tp->t_flags & TF_ECN_PERMIT) {
1811		if (thflags & TH_CWR)
1812			tp->t_flags &= ~TF_ECN_SND_ECE;
1813		switch (iptos & IPTOS_ECN_MASK) {
1814		case IPTOS_ECN_CE:
1815			tp->t_flags |= TF_ECN_SND_ECE;
1816			TCPSTAT_INC(tcps_ecn_ce);
1817			break;
1818		case IPTOS_ECN_ECT0:
1819			TCPSTAT_INC(tcps_ecn_ect0);
1820			break;
1821		case IPTOS_ECN_ECT1:
1822			TCPSTAT_INC(tcps_ecn_ect1);
1823			break;
1824		}
1825		/* Congestion experienced. */
1826		if (thflags & TH_ECE) {
1827			cc_cong_signal(tp, th, CC_ECN);
1828		}
1829	}
1830
1831	/*
1832	 * Parse options on any incoming segment.
1833	 */
1834	tcp_dooptions(&to, (u_char *)(th + 1),
1835		      (th->th_off << 2) - sizeof(struct tcphdr),
1836		      (thflags & TH_SYN) ? TO_SYN : 0);
1837
1838	/*
1839	 * If echoed timestamp is later than the current time,
1840	 * fall back to non RFC1323 RTT calculation.  Normalize
1841	 * timestamp if syncookies were used when this connection
1842	 * was established.
1843	 */
1844	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
1845		to.to_tsecr -= tp->ts_offset;
1846		if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
1847			to.to_tsecr = 0;
1848	}
1849	/*
1850	 * If timestamps were negotiated during SYN/ACK they should
1851	 * appear on every segment during this session and vice versa.
1852	 */
1853	if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
1854		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
1855			log(LOG_DEBUG, "%s; %s: Timestamp missing, "
1856			    "no action\n", s, __func__);
1857			free(s, M_TCPLOG);
1858		}
1859	}
1860	if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
1861		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
1862			log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
1863			    "no action\n", s, __func__);
1864			free(s, M_TCPLOG);
1865		}
1866	}
1867
1868	/*
1869	 * Process options only when we get SYN/ACK back. The SYN case
1870	 * for incoming connections is handled in tcp_syncache.
1871	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
1872	 * or <SYN,ACK>) segment itself is never scaled.
1873	 * XXX this is traditional behavior, may need to be cleaned up.
1874	 */
1875	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
1876		if ((to.to_flags & TOF_SCALE) &&
1877		    (tp->t_flags & TF_REQ_SCALE)) {
1878			tp->t_flags |= TF_RCVD_SCALE;
1879			tp->snd_scale = to.to_wscale;
1880		}
1881		/*
1882		 * Initial send window.  It will be updated with
1883		 * the next incoming segment to the scaled value.
1884		 */
1885		tp->snd_wnd = th->th_win;
1886		if (to.to_flags & TOF_TS) {
1887			tp->t_flags |= TF_RCVD_TSTMP;
1888			tp->ts_recent = to.to_tsval;
1889			tp->ts_recent_age = tcp_ts_getticks();
1890		}
1891		if (to.to_flags & TOF_MSS)
1892			tcp_mss(tp, to.to_mss);
1893		if ((tp->t_flags & TF_SACK_PERMIT) &&
1894		    (to.to_flags & TOF_SACKPERM) == 0)
1895			tp->t_flags &= ~TF_SACK_PERMIT;
1896	}
1897	can_enter = 0;
1898	if (__predict_true((tlen == 0))) {
1899		/*
1900		 * The ack moved forward and we have a window (non-zero)
1901		 * <or>
1902		 * The ack did not move forward, but the window increased.
1903		 */
1904		if (__predict_true((SEQ_GT(th->th_ack, tp->snd_una) && tiwin) ||
1905				   ((th->th_ack == tp->snd_una) && tiwin && (tiwin > tp->snd_wnd)))) {
1906			can_enter = 1;
1907		}
1908	} else {
1909		/*
1910		 * Data incoming, use the old entry criteria
1911		 * for fast-path with data.
1912		 */
1913		if ((tiwin && tiwin == tp->snd_wnd)) {
1914			can_enter = 1;
1915		}
1916	}
1917	/*
1918	 * Header prediction: check for the two common cases
1919	 * of a uni-directional data xfer.  If the packet has
1920	 * no control flags, is in-sequence, the window didn't
1921	 * change and we're not retransmitting, it's a
1922	 * candidate.  If the length is zero and the ack moved
1923	 * forward, we're the sender side of the xfer.  Just
1924	 * free the data acked & wake any higher level process
1925	 * that was blocked waiting for space.  If the length
1926	 * is non-zero and the ack didn't move, we're the
1927	 * receiver side.  If we're getting packets in-order
1928	 * (the reassembly queue is empty), add the data to
1929	 * the socket buffer and note that we need a delayed ack.
1930	 * Make sure that the hidden state-flags are also off.
1931	 * Since we check for TCPS_ESTABLISHED first, it can only
1932	 * be TH_NEEDSYN.
1933	 */
1934	if (__predict_true(tp->t_state == TCPS_ESTABLISHED &&
1935	    th->th_seq == tp->rcv_nxt &&
1936	    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
1937	    tp->snd_nxt == tp->snd_max &&
1938	    can_enter &&
1939	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
1940	    LIST_EMPTY(&tp->t_segq) &&
1941	    ((to.to_flags & TOF_TS) == 0 ||
1942	     TSTMP_GEQ(to.to_tsval, tp->ts_recent)))) {
1943		if (__predict_true((tlen == 0) &&
1944		    (SEQ_LEQ(th->th_ack, tp->snd_max) &&
1945		     !IN_RECOVERY(tp->t_flags) &&
1946		     (to.to_flags & TOF_SACK) == 0 &&
1947		     TAILQ_EMPTY(&tp->snd_holes)))) {
1948			/* We are done */
1949			tcp_do_fastack(m, th, so, tp, &to, drop_hdrlen, tlen,
1950				       ti_locked, tiwin);
1951			return;
1952		} else if ((tlen) &&
1953			   (th->th_ack == tp->snd_una &&
1954			    tlen <= sbspace(&so->so_rcv))) {
1955			tcp_do_fastnewdata(m, th, so, tp, &to, drop_hdrlen, tlen,
1956					   ti_locked, tiwin);
1957			/* We are done */
1958			return;
1959		}
1960	}
1961	tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen,
1962			ti_locked, tiwin, thflags);
1963}
1964
1965
1966/*
1967 * This subfunction is used to try to highly optimize the
1968 * fast path. We again allow window updates that are
1969 * in sequence to remain in the fast-path. We also add
1970 * in the __predict's to attempt to help the compiler.
1971 * Note that if we return a 0, then we can *not* process
1972 * it and the caller should push the packet into the
1973 * slow-path.
1974 */
1975static int
1976tcp_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
1977	       struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
1978	       int ti_locked, u_long tiwin)
1979{
1980	int acked;
1981	int winup_only=0;
1982#ifdef TCPDEBUG
1983	/*
1984	 * The size of tcp_saveipgen must be the size of the max ip header,
1985	 * now IPv6.
1986	 */
1987	u_char tcp_saveipgen[IP6_HDR_LEN];
1988	struct tcphdr tcp_savetcp;
1989	short ostate = 0;
1990#endif
1991
1992
1993	if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
1994		/* Old ack, behind (or duplicate to) the last one rcv'd */
1995		return (0);
1996	}
1997	if (__predict_false(th->th_ack == tp->snd_una) &&
1998	    __predict_false(tiwin <= tp->snd_wnd)) {
1999		/* duplicate ack <or> a shrinking dup ack with shrinking window */
2000		return (0);
2001	}
2002	if (__predict_false(tiwin == 0)) {
2003		/* zero window */
2004		return (0);
2005	}
2006	if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
2007		/* Above what we have sent? */
2008		return (0);
2009	}
2010	if (__predict_false(tp->snd_nxt != tp->snd_max)) {
2011		/* We are retransmitting */
2012		return (0);
2013	}
2014	if (__predict_false(tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN))) {
2015		/* We need a SYN or a FIN, unlikely.. */
2016		return (0);
2017	}
2018	if((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
2019		/* Timestamp is behind .. old ack with seq wrap? */
2020		return (0);
2021	}
2022	if (__predict_false(IN_RECOVERY(tp->t_flags))) {
2023		/* Still recovering */
2024		return (0);
2025	}
2026	if (__predict_false(to->to_flags & TOF_SACK)) {
2027		/* Sack included in the ack..  */
2028		return (0);
2029	}
2030	if (!TAILQ_EMPTY(&tp->snd_holes)) {
2031		/* We have sack holes on our scoreboard */
2032		return (0);
2033	}
2034	/* Ok if we reach here, we can process a fast-ack */
2035
2036	/* Did the window get updated? */
2037	if (tiwin != tp->snd_wnd) {
2038		/* keep track of pure window updates */
2039		if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
2040			winup_only = 1;
2041			TCPSTAT_INC(tcps_rcvwinupd);
2042		}
2043		tp->snd_wnd = tiwin;
2044		tp->snd_wl1 = th->th_seq;
2045		if (tp->snd_wnd > tp->max_sndwnd)
2046			tp->max_sndwnd = tp->snd_wnd;
2047	}
2048	/*
2049	 * Pull snd_wl2 up to prevent seq wrap relative
2050	 * to th_ack.
2051	 */
2052	tp->snd_wl2 = th->th_ack;
2053	/*
2054	 * If last ACK falls within this segment's sequence numbers,
2055	 * record the timestamp.
2056	 * NOTE that the test is modified according to the latest
2057	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
2058	 */
2059	if ((to->to_flags & TOF_TS) != 0 &&
2060	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
2061		tp->ts_recent_age = tcp_ts_getticks();
2062		tp->ts_recent = to->to_tsval;
2063	}
2064	/*
2065	 * This is a pure ack for outstanding data.
2066	 */
2067	if (ti_locked == TI_RLOCKED) {
2068		INP_INFO_RUNLOCK(&V_tcbinfo);
2069	}
2070	ti_locked = TI_UNLOCKED;
2071
2072	TCPSTAT_INC(tcps_predack);
2073
2074	/*
2075	 * "bad retransmit" recovery.
2076	 */
2077	if (tp->t_rxtshift == 1 &&
2078	    tp->t_flags & TF_PREVVALID &&
2079	    (int)(ticks - tp->t_badrxtwin) < 0) {
2080		cc_cong_signal(tp, th, CC_RTO_ERR);
2081	}
2082
2083	/*
2084	 * Recalculate the transmit timer / rtt.
2085	 *
2086	 * Some boxes send broken timestamp replies
2087	 * during the SYN+ACK phase, ignore
2088	 * timestamps of 0 or we could calculate a
2089	 * huge RTT and blow up the retransmit timer.
2090	 */
2091	if ((to->to_flags & TOF_TS) != 0 &&
2092	    to->to_tsecr) {
2093		u_int t;
2094
2095		t = tcp_ts_getticks() - to->to_tsecr;
2096		if (!tp->t_rttlow || tp->t_rttlow > t)
2097			tp->t_rttlow = t;
2098		tcp_xmit_timer(tp,
2099			       TCP_TS_TO_TICKS(t) + 1);
2100	} else if (tp->t_rtttime &&
2101		   SEQ_GT(th->th_ack, tp->t_rtseq)) {
2102		if (!tp->t_rttlow ||
2103		    tp->t_rttlow > ticks - tp->t_rtttime)
2104			tp->t_rttlow = ticks - tp->t_rtttime;
2105		tcp_xmit_timer(tp,
2106			       ticks - tp->t_rtttime);
2107	}
2108	if (winup_only == 0) {
2109		acked = BYTES_THIS_ACK(tp, th);
2110
2111		/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
2112		hhook_run_tcp_est_in(tp, th, to);
2113
2114		TCPSTAT_ADD(tcps_rcvackbyte, acked);
2115		sbdrop(&so->so_snd, acked);
2116		if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
2117		    SEQ_LEQ(th->th_ack, tp->snd_recover))
2118			tp->snd_recover = th->th_ack - 1;
2119
2120		/*
2121		 * Let the congestion control algorithm update
2122		 * congestion control related information. This
2123		 * typically means increasing the congestion
2124		 * window.
2125		 */
2126		cc_ack_received(tp, th, CC_ACK);
2127
2128		tp->snd_una = th->th_ack;
2129		tp->t_dupacks = 0;
2130
2131		/*
2132		 * If all outstanding data are acked, stop
2133		 * retransmit timer, otherwise restart timer
2134		 * using current (possibly backed-off) value.
2135		 * If process is waiting for space,
2136		 * wakeup/selwakeup/signal.  If data
2137		 * are ready to send, let tcp_output
2138		 * decide between more output or persist.
2139		 */
2140#ifdef TCPDEBUG
2141		if (so->so_options & SO_DEBUG)
2142			tcp_trace(TA_INPUT, ostate, tp,
2143				  (void *)tcp_saveipgen,
2144				  &tcp_savetcp, 0);
2145#endif
2146		TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
2147		m_freem(m);
2148		if (tp->snd_una == tp->snd_max)
2149			tcp_timer_activate(tp, TT_REXMT, 0);
2150		else if (!tcp_timer_active(tp, TT_PERSIST))
2151			tcp_timer_activate(tp, TT_REXMT,
2152					   tp->t_rxtcur);
2153		/* Wake up the socket if we have room to write more */
2154		sowwakeup(so);
2155	} else {
2156		/*
2157		 * Window update only, just free the mbufs and
2158		 * send out whatever we can.
2159		 */
2160		m_freem(m);
2161	}
2162	if (sbavail(&so->so_snd))
2163		(void) tcp_output(tp);
2164	KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
2165					    __func__, ti_locked));
2166	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
2167	INP_WLOCK_ASSERT(tp->t_inpcb);
2168
2169	if (tp->t_flags & TF_DELACK) {
2170		tp->t_flags &= ~TF_DELACK;
2171		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
2172	}
2173	INP_WUNLOCK(tp->t_inpcb);
2174	return (1);
2175}
2176
2177/*
2178 * This tcp-do-segment concentrates on making the fastest
2179 * ack processing path. It does not have a fast-path for
2180 * data (it possibly could which would then eliminate the
2181 * need for fast-slow above). For a content distributor having
2182 * large outgoing elephants and very very little coming in
2183 * having no fastpath for data does not really help (since you
2184 * don't get much data in). The most important thing is
2185 * processing ack's quickly and getting the rest of the data
2186 * output to the peer as quickly as possible. This routine
2187 * seems to be about an overall 3% faster then the old
2188 * tcp_do_segment and keeps us in the fast-path for packets
2189 * much more (by allowing window updates to also stay in the fastpath).
2190 */
2191void
2192tcp_do_segment_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
2193		       struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
2194		       int ti_locked)
2195{
2196	int thflags;
2197	u_long tiwin;
2198	char *s;
2199	struct in_conninfo *inc;
2200	struct tcpopt to;
2201
2202	thflags = th->th_flags;
2203	tp->sackhint.last_sack_ack = 0;
2204	inc = &tp->t_inpcb->inp_inc;
2205	/*
2206	 * If this is either a state-changing packet or current state isn't
2207	 * established, we require a write lock on tcbinfo.  Otherwise, we
2208	 * allow the tcbinfo to be in either alocked or unlocked, as the
2209	 * caller may have unnecessarily acquired a write lock due to a race.
2210	 */
2211	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
2212	    tp->t_state != TCPS_ESTABLISHED) {
2213		KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
2214						  "SYN/FIN/RST/!EST", __func__, ti_locked));
2215		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
2216	} else {
2217#ifdef INVARIANTS
2218		if (ti_locked == TI_RLOCKED) {
2219			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
2220		} else {
2221			KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
2222							   "ti_locked: %d", __func__, ti_locked));
2223			INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
2224		}
2225#endif
2226	}
2227	INP_WLOCK_ASSERT(tp->t_inpcb);
2228	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
2229					    __func__));
2230	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
2231						__func__));
2232
2233	/*
2234	 * Segment received on connection.
2235	 * Reset idle time and keep-alive timer.
2236	 * XXX: This should be done after segment
2237	 * validation to ignore broken/spoofed segs.
2238	 */
2239	tp->t_rcvtime = ticks;
2240	if (TCPS_HAVEESTABLISHED(tp->t_state))
2241		tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
2242
2243	/*
2244	 * Unscale the window into a 32-bit value.
2245	 * For the SYN_SENT state the scale is zero.
2246	 */
2247	tiwin = th->th_win << tp->snd_scale;
2248
2249	/*
2250	 * TCP ECN processing.
2251	 */
2252	if (tp->t_flags & TF_ECN_PERMIT) {
2253		if (thflags & TH_CWR)
2254			tp->t_flags &= ~TF_ECN_SND_ECE;
2255		switch (iptos & IPTOS_ECN_MASK) {
2256		case IPTOS_ECN_CE:
2257			tp->t_flags |= TF_ECN_SND_ECE;
2258			TCPSTAT_INC(tcps_ecn_ce);
2259			break;
2260		case IPTOS_ECN_ECT0:
2261			TCPSTAT_INC(tcps_ecn_ect0);
2262			break;
2263		case IPTOS_ECN_ECT1:
2264			TCPSTAT_INC(tcps_ecn_ect1);
2265			break;
2266		}
2267		/* Congestion experienced. */
2268		if (thflags & TH_ECE) {
2269			cc_cong_signal(tp, th, CC_ECN);
2270		}
2271	}
2272
2273	/*
2274	 * Parse options on any incoming segment.
2275	 */
2276	tcp_dooptions(&to, (u_char *)(th + 1),
2277		      (th->th_off << 2) - sizeof(struct tcphdr),
2278		      (thflags & TH_SYN) ? TO_SYN : 0);
2279
2280	/*
2281	 * If echoed timestamp is later than the current time,
2282	 * fall back to non RFC1323 RTT calculation.  Normalize
2283	 * timestamp if syncookies were used when this connection
2284	 * was established.
2285	 */
2286	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
2287		to.to_tsecr -= tp->ts_offset;
2288		if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
2289			to.to_tsecr = 0;
2290	}
2291	/*
2292	 * If timestamps were negotiated during SYN/ACK they should
2293	 * appear on every segment during this session and vice versa.
2294	 */
2295	if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
2296		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
2297			log(LOG_DEBUG, "%s; %s: Timestamp missing, "
2298			    "no action\n", s, __func__);
2299			free(s, M_TCPLOG);
2300		}
2301	}
2302	if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
2303		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
2304			log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
2305			    "no action\n", s, __func__);
2306			free(s, M_TCPLOG);
2307		}
2308	}
2309
2310	/*
2311	 * Process options only when we get SYN/ACK back. The SYN case
2312	 * for incoming connections is handled in tcp_syncache.
2313	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
2314	 * or <SYN,ACK>) segment itself is never scaled.
2315	 * XXX this is traditional behavior, may need to be cleaned up.
2316	 */
2317	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
2318		if ((to.to_flags & TOF_SCALE) &&
2319		    (tp->t_flags & TF_REQ_SCALE)) {
2320			tp->t_flags |= TF_RCVD_SCALE;
2321			tp->snd_scale = to.to_wscale;
2322		}
2323		/*
2324		 * Initial send window.  It will be updated with
2325		 * the next incoming segment to the scaled value.
2326		 */
2327		tp->snd_wnd = th->th_win;
2328		if (to.to_flags & TOF_TS) {
2329			tp->t_flags |= TF_RCVD_TSTMP;
2330			tp->ts_recent = to.to_tsval;
2331			tp->ts_recent_age = tcp_ts_getticks();
2332		}
2333		if (to.to_flags & TOF_MSS)
2334			tcp_mss(tp, to.to_mss);
2335		if ((tp->t_flags & TF_SACK_PERMIT) &&
2336		    (to.to_flags & TOF_SACKPERM) == 0)
2337			tp->t_flags &= ~TF_SACK_PERMIT;
2338	}
2339	/*
2340	 * Header prediction: check for the two common cases
2341	 * of a uni-directional data xfer.  If the packet has
2342	 * no control flags, is in-sequence, the window didn't
2343	 * change and we're not retransmitting, it's a
2344	 * candidate.  If the length is zero and the ack moved
2345	 * forward, we're the sender side of the xfer.  Just
2346	 * free the data acked & wake any higher level process
2347	 * that was blocked waiting for space.  If the length
2348	 * is non-zero and the ack didn't move, we're the
2349	 * receiver side.  If we're getting packets in-order
2350	 * (the reassembly queue is empty), add the data to
2351	 * the socket buffer and note that we need a delayed ack.
2352	 * Make sure that the hidden state-flags are also off.
2353	 * Since we check for TCPS_ESTABLISHED first, it can only
2354	 * be TH_NEEDSYN.
2355	 */
2356	if (__predict_true(tp->t_state == TCPS_ESTABLISHED) &&
2357	    __predict_true(((to.to_flags & TOF_SACK) == 0)) &&
2358	    __predict_true(tlen == 0) &&
2359	    __predict_true((thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK) &&
2360	    __predict_true(LIST_EMPTY(&tp->t_segq)) &&
2361	    __predict_true(th->th_seq == tp->rcv_nxt)) {
2362		    if (tcp_fastack(m, th, so, tp, &to, drop_hdrlen, tlen,
2363				    ti_locked, tiwin)) {
2364			    return;
2365		    }
2366	}
2367	tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen,
2368			ti_locked, tiwin, thflags);
2369}
2370
2371struct tcp_function_block __tcp_fastslow = {
2372	.tfb_tcp_block_name = "fastslow",
2373	.tfb_tcp_output = tcp_output,
2374	.tfb_tcp_do_segment = tcp_do_segment_fastslow,
2375	.tfb_tcp_ctloutput = tcp_default_ctloutput,
2376};
2377
2378struct tcp_function_block __tcp_fastack = {
2379	.tfb_tcp_block_name = "fastack",
2380	.tfb_tcp_output = tcp_output,
2381	.tfb_tcp_do_segment = tcp_do_segment_fastack,
2382	.tfb_tcp_ctloutput = tcp_default_ctloutput
2383};
2384
2385static int
2386tcp_addfastpaths(module_t mod, int type, void *data)
2387{
2388	int err=0;
2389
2390	switch (type) {
2391	case MOD_LOAD:
2392		err = register_tcp_functions(&__tcp_fastack, M_WAITOK);
2393		if (err) {
2394			printf("Failed to register fastack module -- err:%d\n", err);
2395			return(err);
2396		}
2397		err = register_tcp_functions(&__tcp_fastslow, M_WAITOK);
2398		if (err) {
2399			printf("Failed to register fastslow module -- err:%d\n", err);
2400			deregister_tcp_functions(&__tcp_fastack);
2401			return(err);
2402		}
2403		break;
2404	case MOD_QUIESCE:
2405		if ((__tcp_fastslow.tfb_refcnt) ||( __tcp_fastack.tfb_refcnt)) {
2406			return(EBUSY);
2407		}
2408		break;
2409	case MOD_UNLOAD:
2410		err = deregister_tcp_functions(&__tcp_fastack);
2411		if (err == EBUSY)
2412			break;
2413		err = deregister_tcp_functions(&__tcp_fastslow);
2414		if (err == EBUSY)
2415			break;
2416		err = 0;
2417		break;
2418	default:
2419		return (EOPNOTSUPP);
2420	}
2421	return (err);
2422}
2423
2424static moduledata_t new_tcp_fastpaths = {
2425	.name = "tcp_fastpaths",
2426	.evhand = tcp_addfastpaths,
2427	.priv = 0
2428};
2429
2430MODULE_VERSION(kern_tcpfastpaths, 1);
2431DECLARE_MODULE(kern_tcpfastpaths, new_tcp_fastpaths, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
2432