/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2007-2008,2010
 *	Swinburne University of Technology, Melbourne, Australia.
 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * Copyright (c) 2010-2011 Juniper Networks, Inc.
 * All rights reserved.
 *
 * Portions of this software were developed at the Centre for Advanced Internet
 * Architectures, Swinburne University of Technology, by Lawrence Stewart,
 * James Healy and David Hayes, made possible in part by a grant from the Cisco
 * University Research Program Fund at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Portions of this software were developed by Robert N. M. Watson under
 * contract to Juniper Networks, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/arb.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/protosw.h>
#include <sys/qmath.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/stats.h>

#include <machine/cpu.h>	/* before tcp_seq.h, for tcp_random18() */

#include <vm/uma.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_rss.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_fastopen.h>
#ifdef TCPPCAP
#include <netinet/tcp_pcap.h>
#endif
#include <netinet/tcp_syncache.h>
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#include <netinet/tcp_ecn.h>
#include <netinet/udp.h>

#include <netipsec/ipsec_support.h>

#include <machine/in_cksum.h>

#include <security/mac/mac_framework.h>

const int tcprexmtthresh = 3;

VNET_DEFINE(int, tcp_log_in_vain) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_log_in_vain), 0,
    "Log all incoming TCP segments to closed ports");

VNET_DEFINE(int, blackhole) = 0;
#define	V_blackhole		VNET(blackhole)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(blackhole), 0,
    "Do not send RST on segments to closed ports");

VNET_DEFINE(bool, blackhole_local) = false;
#define	V_blackhole_local	VNET(blackhole_local)
SYSCTL_BOOL(_net_inet_tcp, OID_AUTO, blackhole_local, CTLFLAG_VNET |
    CTLFLAG_RW, &VNET_NAME(blackhole_local), false,
    "Enforce net.inet.tcp.blackhole for locally originated packets");

VNET_DEFINE(int, tcp_delack_enabled) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_delack_enabled), 0,
    "Delay ACK to try and piggyback it onto a data packet");

VNET_DEFINE(int, drop_synfin) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(drop_synfin), 0,
    "Drop TCP packets with SYN+FIN set");

VNET_DEFINE(int, tcp_do_prr) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_prr, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_prr), 1,
    "Enable Proportional Rate Reduction per RFC 6937");

VNET_DEFINE(int, tcp_do_newcwv) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, newcwv, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_newcwv), 0,
    "Enable New Congestion Window Validation per RFC7661");

VNET_DEFINE(int, tcp_do_rfc3042) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_rfc3042), 0,
    "Enable RFC 3042 (Limited Transmit)");

VNET_DEFINE(int, tcp_do_rfc3390) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_rfc3390), 0,
    "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");

VNET_DEFINE(int, tcp_initcwnd_segments) = 10;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, initcwnd_segments,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_initcwnd_segments), 0,
    "Slow-start flight size (initial congestion window) in number of segments");

VNET_DEFINE(int, tcp_do_rfc3465) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_rfc3465), 0,
    "Enable RFC 3465 (Appropriate Byte Counting)");

VNET_DEFINE(int, tcp_abc_l_var) = 2;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_abc_l_var), 2,
    "Cap the max cwnd increment during slow-start to this number of segments");

VNET_DEFINE(int, tcp_insecure_syn) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_syn, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_insecure_syn), 0,
    "Follow RFC793 instead of RFC5961 criteria for accepting SYN packets");

VNET_DEFINE(int, tcp_insecure_rst) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_insecure_rst), 0,
    "Follow RFC793 instead of RFC5961 criteria for accepting RST packets");

VNET_DEFINE(int, tcp_recvspace) = 1024*64;
#define	V_tcp_recvspace	VNET(tcp_recvspace)
SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_recvspace), 0, "Initial receive socket buffer size");

VNET_DEFINE(int, tcp_do_autorcvbuf) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_autorcvbuf), 0,
    "Enable automatic receive buffer sizing");

VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_autorcvbuf_max), 0,
    "Max size of automatic receive buffer");

VNET_DEFINE(struct inpcbinfo, tcbinfo);

/*
 * TCP statistics are stored in an array of counter(9)s, whose size matches
 * the size of struct tcpstat.  The TCP running connection count is a
 * regular array.
 */
VNET_PCPUSTAT_DEFINE(struct tcpstat, tcpstat);
SYSCTL_VNET_PCPUSTAT(_net_inet_tcp, TCPCTL_STATS, stats, struct tcpstat,
    tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
VNET_DEFINE(counter_u64_t, tcps_states[TCP_NSTATES]);
SYSCTL_COUNTER_U64_ARRAY(_net_inet_tcp, TCPCTL_STATES, states, CTLFLAG_RD |
    CTLFLAG_VNET, &VNET_NAME(tcps_states)[0], TCP_NSTATES,
    "TCP connection counts by TCP state");

/*
 * Kernel module interface for updating tcpstat.  The first argument is an index
 * into tcpstat treated as an array.
 */
void
kmod_tcpstat_add(int statnum, int val)
{

	counter_u64_add(VNET(tcpstat)[statnum], val);
}
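
/*
 * Usage sketch (an assumption for illustration; the exact wrapper macro
 * names live in netinet/tcp_var.h): loadable TCP modules normally reach
 * this function through KMOD_TCPSTAT_* macros that turn a struct tcpstat
 * member into an array index, roughly:
 *
 *	KMOD_TCPSTAT_INC(tcps_rcvtotal);
 *	--> kmod_tcpstat_add(offsetof(struct tcpstat, tcps_rcvtotal) /
 *	        sizeof(uint64_t), 1);
 */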

/*
 * Make sure that we only start a SACK loss recovery when
 * receiving a duplicate ACK with a SACK block, and also
 * complete SACK loss recovery in case the other end
 * reneges.
 */
static bool inline
tcp_is_sack_recovery(struct tcpcb *tp, struct tcpopt *to)
{
	return ((tp->t_flags & TF_SACK_PERMIT) &&
		((to->to_flags & TOF_SACK) ||
		(!TAILQ_EMPTY(&tp->snd_holes))));
}
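
/*
 * Usage sketch (illustrative, not a complete caller): the duplicate-ACK
 * path in tcp_do_segment() consults this predicate to choose between
 * SACK-based and NewReno-style loss recovery, e.g.:
 *
 *	if (tcp_is_sack_recovery(tp, &to))
 *		... drive retransmissions from the SACK scoreboard ...
 *	else
 *		... fall back to counting duplicate ACKs ...
 */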

#ifdef TCP_HHOOK
/*
 * Wrapper for the TCP established input helper hook.
 */
void
hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
{
	struct tcp_hhook_data hhook_data;

	if (V_tcp_hhh[HHOOK_TCP_EST_IN]->hhh_nhooks > 0) {
		hhook_data.tp = tp;
		hhook_data.th = th;
		hhook_data.to = to;

		hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_IN], &hhook_data,
		    &tp->t_osd);
	}
}
#endif

/*
 * CC wrapper hook functions
 */
void
cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs,
    uint16_t type)
{
#ifdef STATS
	int32_t gput;
#endif

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	tp->t_ccv.nsegs = nsegs;
	tp->t_ccv.bytes_this_ack = BYTES_THIS_ACK(tp, th);
	if ((!V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd)) ||
	    (V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd) &&
	     (tp->snd_cwnd < (tcp_compute_pipe(tp) * 2))))
		tp->t_ccv.flags |= CCF_CWND_LIMITED;
	else
		tp->t_ccv.flags &= ~CCF_CWND_LIMITED;

	if (type == CC_ACK) {
#ifdef STATS
		stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
		    ((int32_t)tp->snd_cwnd) - tp->snd_wnd);
		if (!IN_RECOVERY(tp->t_flags))
			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_ACKLEN,
			   tp->t_ccv.bytes_this_ack / (tcp_maxseg(tp) * nsegs));
		if ((tp->t_flags & TF_GPUTINPROG) &&
		    SEQ_GEQ(th->th_ack, tp->gput_ack)) {
			/*
			 * Compute goodput in bits per millisecond.
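			 * For example (illustrative numbers): 125000 bytes
			 * acknowledged over a 100-tick interval give
			 * (125000 << 3) / 100 = 10000 bits/ms, i.e. about
			 * 10 Mbit/s.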
			 */
			gput = (((int64_t)SEQ_SUB(th->th_ack, tp->gput_seq)) << 3) /
			    max(1, tcp_ts_getticks() - tp->gput_ts);
			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
			    gput);
			/*
			 * XXXLAS: This is a temporary hack, and should be
			 * chained off VOI_TCP_GPUT when stats(9) grows an API
			 * to deal with chained VOIs.
			 */
			if (tp->t_stats_gput_prev > 0)
				stats_voi_update_abs_s32(tp->t_stats,
				    VOI_TCP_GPUT_ND,
				    ((gput - tp->t_stats_gput_prev) * 100) /
				    tp->t_stats_gput_prev);
			tp->t_flags &= ~TF_GPUTINPROG;
			tp->t_stats_gput_prev = gput;
		}
#endif /* STATS */
		if (tp->snd_cwnd > tp->snd_ssthresh) {
			tp->t_bytes_acked += tp->t_ccv.bytes_this_ack;
			if (tp->t_bytes_acked >= tp->snd_cwnd) {
				tp->t_bytes_acked -= tp->snd_cwnd;
				tp->t_ccv.flags |= CCF_ABC_SENTAWND;
			}
		} else {
			tp->t_ccv.flags &= ~CCF_ABC_SENTAWND;
			tp->t_bytes_acked = 0;
		}
	}

	if (CC_ALGO(tp)->ack_received != NULL) {
		/* XXXLAS: Find a way to live without this */
		tp->t_ccv.curack = th->th_ack;
		CC_ALGO(tp)->ack_received(&tp->t_ccv, type);
	}
#ifdef STATS
	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
#endif
}

void
cc_conn_init(struct tcpcb *tp)
{
	struct hc_metrics_lite metrics;
	struct inpcb *inp = tptoinpcb(tp);
	u_int maxseg;
	int rtt;

	INP_WLOCK_ASSERT(inp);

	tcp_hc_get(&inp->inp_inc, &metrics);
	maxseg = tcp_maxseg(tp);

	if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
		tp->t_srtt = rtt;
		TCPSTAT_INC(tcps_usedrtt);
		if (metrics.rmx_rttvar) {
			tp->t_rttvar = metrics.rmx_rttvar;
			TCPSTAT_INC(tcps_usedrttvar);
		} else {
			/* default variation is +- 1 rtt */
			tp->t_rttvar =
			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
		}
		TCPT_RANGESET(tp->t_rxtcur,
		    ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
		    tp->t_rttmin, TCPTV_REXMTMAX);
	}
	if (metrics.rmx_ssthresh) {
		/*
		 * There's some sort of gateway or interface
		 * buffer limit on the path.  Use this to set
		 * the slow start threshold, but set the
		 * threshold to no less than 2*mss.
		 */
		tp->snd_ssthresh = max(2 * maxseg, metrics.rmx_ssthresh);
		TCPSTAT_INC(tcps_usedssthresh);
	}

	/*
	 * Set the initial slow-start flight size.
	 *
	 * If a SYN or SYN/ACK was lost and retransmitted, we have to
	 * reduce the initial CWND to one segment as congestion is likely
	 * requiring us to be cautious.
	 */
	if (tp->snd_cwnd == 1)
		tp->snd_cwnd = maxseg;		/* SYN(-ACK) lost */
	else
		tp->snd_cwnd = tcp_compute_initwnd(maxseg);

	if (CC_ALGO(tp)->conn_init != NULL)
		CC_ALGO(tp)->conn_init(&tp->t_ccv);
}
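
/*
 * Illustrative numbers for the initial window above (assuming the
 * defaults net.inet.tcp.initcwnd_segments=10 and a 1460-byte MSS):
 * tcp_compute_initwnd() yields on the order of 10 * 1460 = 14600 bytes,
 * while a retransmitted SYN(-ACK) collapses the initial cwnd to a
 * single 1460-byte segment.
 */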

void inline
cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
{
	INP_WLOCK_ASSERT(tptoinpcb(tp));

#ifdef STATS
	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type);
#endif

	switch (type) {
	case CC_NDUPACK:
		if (!IN_FASTRECOVERY(tp->t_flags)) {
			tp->snd_recover = tp->snd_max;
			if (tp->t_flags2 & TF2_ECN_PERMIT)
				tp->t_flags2 |= TF2_ECN_SND_CWR;
		}
		break;
	case CC_ECN:
		if (!IN_CONGRECOVERY(tp->t_flags) ||
		    /*
		     * Allow ECN reaction on ACK to CWR, if
		     * that data segment was also CE marked.
		     */
		    SEQ_GEQ(th->th_ack, tp->snd_recover)) {
			EXIT_CONGRECOVERY(tp->t_flags);
			TCPSTAT_INC(tcps_ecn_rcwnd);
			tp->snd_recover = tp->snd_max + 1;
			if (tp->t_flags2 & TF2_ECN_PERMIT)
				tp->t_flags2 |= TF2_ECN_SND_CWR;
		}
		break;
	case CC_RTO:
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		if ((tp->t_rxtshift > 1) ||
		    !((tp->t_flags & TF_SACK_PERMIT) &&
		      (!TAILQ_EMPTY(&tp->snd_holes))))
			EXIT_RECOVERY(tp->t_flags);
		if (tp->t_flags2 & TF2_ECN_PERMIT)
			tp->t_flags2 |= TF2_ECN_SND_CWR;
		break;
	case CC_RTO_ERR:
		TCPSTAT_INC(tcps_sndrexmitbad);
		/* RTO was unnecessary, so reset everything. */
		tp->snd_cwnd = tp->snd_cwnd_prev;
		tp->snd_ssthresh = tp->snd_ssthresh_prev;
		tp->snd_recover = tp->snd_recover_prev;
		if (tp->t_flags & TF_WASFRECOVERY)
			ENTER_FASTRECOVERY(tp->t_flags);
		if (tp->t_flags & TF_WASCRECOVERY)
			ENTER_CONGRECOVERY(tp->t_flags);
		tp->snd_nxt = tp->snd_max;
		tp->t_flags &= ~TF_PREVVALID;
		tp->t_badrxtwin = 0;
		break;
	}
	if (SEQ_LT(tp->snd_fack, tp->snd_una) ||
	    SEQ_GT(tp->snd_fack, tp->snd_max)) {
		tp->snd_fack = tp->snd_una;
	}

	if (CC_ALGO(tp)->cong_signal != NULL) {
		if (th != NULL)
			tp->t_ccv.curack = th->th_ack;
		CC_ALGO(tp)->cong_signal(&tp->t_ccv, type);
	}
}

void inline
cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
{
	INP_WLOCK_ASSERT(tptoinpcb(tp));

	if (CC_ALGO(tp)->post_recovery != NULL) {
		if (SEQ_LT(tp->snd_fack, th->th_ack) ||
		    SEQ_GT(tp->snd_fack, tp->snd_max)) {
			tp->snd_fack = th->th_ack;
		}
		tp->t_ccv.curack = th->th_ack;
		CC_ALGO(tp)->post_recovery(&tp->t_ccv);
	}
	EXIT_RECOVERY(tp->t_flags);

	tp->t_bytes_acked = 0;
	tp->sackhint.delivered_data = 0;
	tp->sackhint.prr_delivered = 0;
	tp->sackhint.prr_out = 0;
}
/*
 * Indicate whether this ack should be delayed.  We can delay the ack if
 * the following conditions are met:
 *	- There is no delayed ack timer in progress.
 *	- Our last ack wasn't a 0-sized window.  We never want to delay
 *	  the ack that opens up a 0-sized window.
 *	- LRO wasn't used for this segment.  We make sure by checking that the
 *	  segment size is not larger than the MSS.
 */
#define DELAY_ACK(tp, tlen)						\
	((!tcp_timer_active(tp, TT_DELACK) &&				\
	    (tp->t_flags & TF_RXWIN0SENT) == 0) &&			\
	    (tlen <= tp->t_maxseg) &&					\
	    (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
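
/*
 * Usage sketch (illustrative): the data-receive path applies the macro
 * to choose between scheduling a delayed ACK and forcing an immediate
 * one, e.g.:
 *
 *	if (DELAY_ACK(tp, tlen))
 *		tp->t_flags |= TF_DELACK;
 *	else
 *		tp->t_flags |= TF_ACKNOW;
 */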

void inline
cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos)
{
	INP_WLOCK_ASSERT(tptoinpcb(tp));

	if (CC_ALGO(tp)->ecnpkt_handler != NULL) {
		switch (iptos & IPTOS_ECN_MASK) {
		case IPTOS_ECN_CE:
			tp->t_ccv.flags |= CCF_IPHDR_CE;
			break;
		case IPTOS_ECN_ECT0:
			/* FALLTHROUGH */
		case IPTOS_ECN_ECT1:
			/* FALLTHROUGH */
		case IPTOS_ECN_NOTECT:
			tp->t_ccv.flags &= ~CCF_IPHDR_CE;
			break;
		}

		if (flags & TH_CWR)
			tp->t_ccv.flags |= CCF_TCPHDR_CWR;
		else
			tp->t_ccv.flags &= ~CCF_TCPHDR_CWR;

		CC_ALGO(tp)->ecnpkt_handler(&tp->t_ccv);

		if (tp->t_ccv.flags & CCF_ACKNOW) {
			tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
			tp->t_flags |= TF_ACKNOW;
		}
	}
}

void inline
cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
{
	cc_ecnpkt_handler_flags(tp, tcp_get_flags(th), iptos);
}

/*
 * TCP input handling is split into multiple parts:
 *   tcp6_input is a thin wrapper around tcp_input for the extended
 *	ip6_protox[] call format in ip6_input
 *   tcp_input handles primary segment validation, inpcb lookup and
 *	SYN processing on listen sockets
 *   tcp_do_segment processes the ACK and text of the segment for
 *	establishing, established and closing connections
 */
#ifdef INET6
int
tcp6_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
{
	struct mbuf *m;
	struct in6_ifaddr *ia6;
	struct ip6_hdr *ip6;

	m = *mp;
	if (m->m_len < *offp + sizeof(struct tcphdr)) {
		m = m_pullup(m, *offp + sizeof(struct tcphdr));
		if (m == NULL) {
			*mp = m;
			TCPSTAT_INC(tcps_rcvshort);
			return (IPPROTO_DONE);
		}
	}

	/*
	 * draft-itojun-ipv6-tcp-to-anycast
	 * better place to put this in?
	 */
	ip6 = mtod(m, struct ip6_hdr *);
	ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false);
	if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) {
		icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
			    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
		*mp = NULL;
		return (IPPROTO_DONE);
	}

	*mp = m;
	return (tcp_input_with_port(mp, offp, proto, port));
}

int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{

	return (tcp6_input_with_port(mp, offp, proto, 0));
}
#endif /* INET6 */

int
tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
{
	struct mbuf *m = *mp;
	struct tcphdr *th = NULL;
	struct ip *ip = NULL;
	struct inpcb *inp = NULL;
	struct tcpcb *tp = NULL;
	struct socket *so = NULL;
	u_char *optp = NULL;
	int off0;
	int optlen = 0;
#ifdef INET
	int len;
	uint8_t ipttl;
#endif
	int tlen = 0, off;
	int drop_hdrlen;
	int thflags;
	int rstreason = 0;	/* For badport_bandlim accounting purposes */
	int lookupflag;
	uint8_t iptos;
	struct m_tag *fwd_tag = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
	int isipv6;
#else
	const void *ip6 = NULL;
#endif /* INET6 */
	struct tcpopt to;		/* options in this segment */
	char *s = NULL;			/* address and port logging */

	NET_EPOCH_ASSERT();

#ifdef INET6
	isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
#endif

	off0 = *offp;
	m = *mp;
	*mp = NULL;
	to.to_flags = 0;
	TCPSTAT_INC(tcps_rcvtotal);

	m->m_pkthdr.tcp_tun_port = port;
#ifdef INET6
	if (isipv6) {
		ip6 = mtod(m, struct ip6_hdr *);
		th = (struct tcphdr *)((caddr_t)ip6 + off0);
		tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
		if (port)
			goto skip6_csum;
		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
				th->th_sum = m->m_pkthdr.csum_data;
			else
				th->th_sum = in6_cksum_pseudo(ip6, tlen,
				    IPPROTO_TCP, m->m_pkthdr.csum_data);
			th->th_sum ^= 0xffff;
		} else
			th->th_sum = in6_cksum(m, IPPROTO_TCP, off0, tlen);
		if (th->th_sum) {
			TCPSTAT_INC(tcps_rcvbadsum);
			goto drop;
		}
	skip6_csum:
		/*
		 * Be proactive about an unspecified IPv6 address in the
		 * source.  As we use all-zero to indicate an unbound or
		 * unconnected pcb, an unspecified IPv6 address can be used
		 * to confuse us.
		 *
		 * Note that packets with an unspecified IPv6 destination
		 * are already dropped in ip6_input.
		 */
		KASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst),
		    ("%s: unspecified destination v6 address", __func__));
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			IP6STAT_INC(ip6s_badscope); /* XXX */
			goto drop;
		}
		iptos = IPV6_TRAFFIC_CLASS(ip6);
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		/*
		 * Get IP and TCP header together in first mbuf.
		 * Note: IP leaves IP header in first mbuf.
		 */
		if (off0 > sizeof (struct ip)) {
			ip_stripoptions(m);
			off0 = sizeof(struct ip);
		}
		if (m->m_len < sizeof (struct tcpiphdr)) {
			if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
			    == NULL) {
				TCPSTAT_INC(tcps_rcvshort);
				return (IPPROTO_DONE);
			}
		}
		ip = mtod(m, struct ip *);
		th = (struct tcphdr *)((caddr_t)ip + off0);
		tlen = ntohs(ip->ip_len) - off0;

		iptos = ip->ip_tos;
		if (port)
			goto skip_csum;
		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
				th->th_sum = m->m_pkthdr.csum_data;
			else
				th->th_sum = in_pseudo(ip->ip_src.s_addr,
				    ip->ip_dst.s_addr,
				    htonl(m->m_pkthdr.csum_data + tlen +
				    IPPROTO_TCP));
			th->th_sum ^= 0xffff;
		} else {
			struct ipovly *ipov = (struct ipovly *)ip;

			/*
			 * Checksum extended TCP header and data.
			 */
			len = off0 + tlen;
			ipttl = ip->ip_ttl;
			bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
			ipov->ih_len = htons(tlen);
			th->th_sum = in_cksum(m, len);
			/* Reset length for SDT probes. */
			ip->ip_len = htons(len);
			/* Reset TOS bits */
			ip->ip_tos = iptos;
			/* Re-initialization for later version check */
			ip->ip_ttl = ipttl;
			ip->ip_v = IPVERSION;
			ip->ip_hl = off0 >> 2;
		}
	skip_csum:
		if (th->th_sum && (port == 0)) {
			TCPSTAT_INC(tcps_rcvbadsum);
			goto drop;
		}
		KASSERT(ip->ip_dst.s_addr != INADDR_ANY,
		    ("%s: unspecified destination v4 address", __func__));
		if (__predict_false(ip->ip_src.s_addr == INADDR_ANY)) {
			IPSTAT_INC(ips_badaddr);
			goto drop;
		}
	}
#endif /* INET */

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.		XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof (struct tcphdr) || off > tlen) {
		TCPSTAT_INC(tcps_rcvbadoff);
		goto drop;
	}
	tlen -= off;	/* tlen is used instead of ti->ti_len */
	if (off > sizeof (struct tcphdr)) {
#ifdef INET6
		if (isipv6) {
			if (m->m_len < off0 + off) {
				m = m_pullup(m, off0 + off);
				if (m == NULL) {
					TCPSTAT_INC(tcps_rcvshort);
					return (IPPROTO_DONE);
				}
			}
			ip6 = mtod(m, struct ip6_hdr *);
			th = (struct tcphdr *)((caddr_t)ip6 + off0);
		}
#endif
#if defined(INET) && defined(INET6)
		else
#endif
#ifdef INET
		{
			if (m->m_len < sizeof(struct ip) + off) {
				if ((m = m_pullup(m, sizeof (struct ip) + off))
				    == NULL) {
					TCPSTAT_INC(tcps_rcvshort);
					return (IPPROTO_DONE);
				}
				ip = mtod(m, struct ip *);
				th = (struct tcphdr *)((caddr_t)ip + off0);
			}
		}
#endif
		optlen = off - sizeof (struct tcphdr);
		optp = (u_char *)(th + 1);
	}
	thflags = tcp_get_flags(th);

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	tcp_fields_to_host(th);

	/*
	 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options.
	 */
	drop_hdrlen = off0 + off;

	/*
	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
	 */
	if (
#ifdef INET6
	    (isipv6 && (m->m_flags & M_IP6_NEXTHOP))
#ifdef INET
	    || (!isipv6 && (m->m_flags & M_IP_NEXTHOP))
#endif
#endif
#if defined(INET) && !defined(INET6)
	    (m->m_flags & M_IP_NEXTHOP)
#endif
	    )
		fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);

	/*
	 * For initial SYN packets we don't need write lock on matching
	 * PCB, be it a listening one or a synchronized one.  The packet
	 * shall not modify its state.
	 */
	lookupflag = INPLOOKUP_WILDCARD |
	    ((thflags & (TH_ACK|TH_SYN)) == TH_SYN ?
	    INPLOOKUP_RLOCKPCB : INPLOOKUP_WLOCKPCB);
findpcb:
	tp = NULL;
#ifdef INET6
	if (isipv6 && fwd_tag != NULL) {
		struct sockaddr_in6 *next_hop6;

		next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1);
		/*
		 * Transparently forwarded. Pretend to be the destination.
		 * Already got one like this?
		 */
		inp = in6_pcblookup_mbuf(&V_tcbinfo,
		    &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport,
		    lookupflag & ~INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif, m);
		if (!inp) {
			/*
			 * It's new.  Try to find the ambushing socket.
			 * Because we've rewritten the destination address,
			 * any hardware-generated hash is ignored.
			 */
			inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src,
			    th->th_sport, &next_hop6->sin6_addr,
			    next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) :
			    th->th_dport, lookupflag, m->m_pkthdr.rcvif);
		}
	} else if (isipv6) {
		inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src,
		    th->th_sport, &ip6->ip6_dst, th->th_dport, lookupflag,
		    m->m_pkthdr.rcvif, m);
	}
#endif /* INET6 */
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET
	if (fwd_tag != NULL) {
		struct sockaddr_in *next_hop;

		next_hop = (struct sockaddr_in *)(fwd_tag + 1);
		/*
		 * Transparently forwarded. Pretend to be the destination.
		 * Already got one like this?
		 */
		inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport,
		    ip->ip_dst, th->th_dport, lookupflag & ~INPLOOKUP_WILDCARD,
		    m->m_pkthdr.rcvif, m);
		if (!inp) {
			/*
			 * It's new.  Try to find the ambushing socket.
			 * Because we've rewritten the destination address,
			 * any hardware-generated hash is ignored.
			 */
			inp = in_pcblookup(&V_tcbinfo, ip->ip_src,
			    th->th_sport, next_hop->sin_addr,
			    next_hop->sin_port ? ntohs(next_hop->sin_port) :
			    th->th_dport, lookupflag, m->m_pkthdr.rcvif);
		}
	} else
		inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src,
		    th->th_sport, ip->ip_dst, th->th_dport, lookupflag,
		    m->m_pkthdr.rcvif, m);
#endif /* INET */

	/*
	 * If the INPCB does not exist then all data in the incoming
	 * segment is discarded and an appropriate RST is sent back.
	 * XXX MRT Send RST using which routing table?
	 */
	if (inp == NULL) {
		if (rstreason != 0) {
			/* We came here after second (safety) lookup. */
			MPASS((lookupflag & INPLOOKUP_WILDCARD) == 0);
			goto dropwithreset;
		}
		/*
		 * Log communication attempts to ports that are not
		 * in use.
		 */
		if ((V_tcp_log_in_vain == 1 && (thflags & TH_SYN)) ||
		    V_tcp_log_in_vain == 2) {
			if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6)))
				log(LOG_INFO, "%s; %s: Connection attempt "
				    "to closed port\n", s, __func__);
		}
		rstreason = BANDLIM_RST_CLOSEDPORT;
		goto dropwithreset;
	}
	INP_LOCK_ASSERT(inp);

	if ((inp->inp_flowtype == M_HASHTYPE_NONE) &&
	    !SOLISTENING(inp->inp_socket)) {
		if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
			inp->inp_flowid = m->m_pkthdr.flowid;
			inp->inp_flowtype = M_HASHTYPE_GET(m);
#ifdef	RSS
		} else {
			/* assign flowid by software RSS hash */
#ifdef INET6
			if (isipv6) {
				rss_proto_software_hash_v6(&inp->in6p_faddr,
				    &inp->in6p_laddr,
				    inp->inp_fport,
				    inp->inp_lport,
				    IPPROTO_TCP,
				    &inp->inp_flowid,
				    &inp->inp_flowtype);
			} else
#endif	/* INET6 */
			{
				rss_proto_software_hash_v4(inp->inp_faddr,
				    inp->inp_laddr,
				    inp->inp_fport,
				    inp->inp_lport,
				    IPPROTO_TCP,
				    &inp->inp_flowid,
				    &inp->inp_flowtype);
			}
#endif	/* RSS */
		}
	}
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#ifdef INET6
	if (isipv6 && IPSEC_ENABLED(ipv6) &&
	    IPSEC_CHECK_POLICY(ipv6, m, inp) != 0) {
		goto dropunlock;
	}
#ifdef INET
	else
#endif
#endif /* INET6 */
#ifdef INET
	if (IPSEC_ENABLED(ipv4) &&
	    IPSEC_CHECK_POLICY(ipv4, m, inp) != 0) {
		goto dropunlock;
	}
#endif /* INET */
#endif /* IPSEC */

	/*
	 * Check the minimum TTL for socket.
	 */
	if (inp->inp_ip_minttl != 0) {
#ifdef INET6
		if (isipv6) {
			if (inp->inp_ip_minttl > ip6->ip6_hlim)
				goto dropunlock;
		} else
#endif
		if (inp->inp_ip_minttl > ip->ip_ttl)
			goto dropunlock;
	}

	tp = intotcpcb(inp);
	switch (tp->t_state) {
	case TCPS_TIME_WAIT:
		/*
		 * A previous connection in TIMEWAIT state is supposed to catch
		 * stray or duplicate segments arriving late.  If this segment
		 * was a legitimate new connection attempt, the old INPCB gets
		 * removed and we can try again to find a listening socket.
		 */
		tcp_dooptions(&to, optp, optlen,
		    (thflags & TH_SYN) ? TO_SYN : 0);
		/*
		 * tcp_twcheck unlocks the inp always, and frees the m if fails.
		 */
		if (tcp_twcheck(inp, &to, th, m, tlen))
			goto findpcb;
		return (IPPROTO_DONE);
	case TCPS_CLOSED:
		/*
		 * The TCPCB may no longer exist if the connection is winding
		 * down or it is in the CLOSED state.  Either way we drop the
		 * segment and send an appropriate response.
		 */
		rstreason = BANDLIM_RST_CLOSEDPORT;
		goto dropwithreset;
	}

	if ((tp->t_port != port) && (tp->t_state > TCPS_LISTEN)) {
		rstreason = BANDLIM_RST_CLOSEDPORT;
		goto dropwithreset;
	}

#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE) {
		tcp_offload_input(tp, m);
		m = NULL;	/* consumed by the TOE driver */
		goto dropunlock;
	}
#endif

#ifdef MAC
	if (mac_inpcb_check_deliver(inp, m))
		goto dropunlock;
#endif
	so = inp->inp_socket;
	KASSERT(so != NULL, ("%s: so == NULL", __func__));
	/*
	 * When the socket is accepting connections (the INPCB is in LISTEN
	 * state) we look into the SYN cache if this is a new connection
	 * attempt or the completion of a previous one.
	 */
	KASSERT(tp->t_state == TCPS_LISTEN || !SOLISTENING(so),
	    ("%s: so accepting but tp %p not listening", __func__, tp));
	if (tp->t_state == TCPS_LISTEN && SOLISTENING(so)) {
		struct in_conninfo inc;

		bzero(&inc, sizeof(inc));
#ifdef INET6
		if (isipv6) {
			inc.inc_flags |= INC_ISIPV6;
			if (inp->inp_inc.inc_flags & INC_IPV6MINMTU)
				inc.inc_flags |= INC_IPV6MINMTU;
			inc.inc6_faddr = ip6->ip6_src;
			inc.inc6_laddr = ip6->ip6_dst;
		} else
#endif
		{
			inc.inc_faddr = ip->ip_src;
			inc.inc_laddr = ip->ip_dst;
		}
		inc.inc_fport = th->th_sport;
		inc.inc_lport = th->th_dport;
		inc.inc_fibnum = so->so_fibnum;

		/*
		 * Check for an existing connection attempt in syncache if
		 * the flag is only ACK.  A successful lookup creates a new
		 * socket appended to the listen queue in SYN_RECEIVED state.
		 */
		if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
			/*
			 * Parse the TCP options here because
			 * syncookies need access to the reflected
			 * timestamp.
			 */
			tcp_dooptions(&to, optp, optlen, 0);
			/*
			 * NB: syncache_expand() doesn't unlock inp.
			 */
			rstreason = syncache_expand(&inc, &to, th, &so, m, port);
			if (rstreason < 0) {
				/*
				 * A failing TCP MD5 signature comparison
				 * must result in the segment being dropped
				 * and must not produce any response back
				 * to the sender.
				 */
				goto dropunlock;
			} else if (rstreason == 0) {
				/*
				 * No syncache entry, or ACK was not for our
				 * SYN/ACK.  Do our protection against double
				 * ACK.  If peer sent us 2 ACKs, then for the
				 * first one syncache_expand() successfully
				 * converted syncache entry into a socket,
				 * while we were waiting on the inpcb lock.  We
				 * don't want to send RST for the second ACK,
				 * so we perform second lookup without wildcard
				 * match, hoping to find the new socket.  If
				 * the ACK is indeed stray, rstreason would
				 * hint the above code that the lookup was a
				 * second attempt.
				 *
				 * NB: syncache did its own logging
				 * of the failure cause.
				 */
				INP_WUNLOCK(inp);
				rstreason = BANDLIM_RST_OPENPORT;
				lookupflag &= ~INPLOOKUP_WILDCARD;
				goto findpcb;
			}
tfo_socket_result:
			if (so == NULL) {
				/*
				 * We completed the 3-way handshake
				 * but could not allocate a socket
				 * either due to memory shortage,
				 * listen queue length limits or
				 * global socket limits.  Send RST
				 * or wait and have the remote end
				 * retransmit the ACK for another
				 * try.
				 */
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
					log(LOG_DEBUG, "%s; %s: Listen socket: "
					    "Socket allocation failed due to "
					    "limits or memory shortage, %s\n",
					    s, __func__,
					    V_tcp_sc_rst_sock_fail ?
					    "sending RST" : "try again");
				if (V_tcp_sc_rst_sock_fail) {
					rstreason = BANDLIM_UNLIMITED;
					goto dropwithreset;
				} else
					goto dropunlock;
			}
			/*
			 * Socket is created in state SYN_RECEIVED.
			 * Unlock the listen socket, lock the newly
			 * created socket and update the tp variable.
			 * If we came here via the jump to tfo_socket_result,
			 * then the listening socket is read-locked.
			 */
			INP_UNLOCK(inp);	/* listen socket */
			inp = sotoinpcb(so);
			/*
			 * New connection inpcb is already locked by
			 * syncache_expand().
			 */
			INP_WLOCK_ASSERT(inp);
			tp = intotcpcb(inp);
			KASSERT(tp->t_state == TCPS_SYN_RECEIVED,
			    ("%s: ", __func__));
			/*
			 * Process the segment and the data it
			 * contains.  tcp_do_segment() consumes
			 * the mbuf chain and unlocks the inpcb.
			 */
			TCP_PROBE5(receive, NULL, tp, m, tp, th);
			tp->t_fb->tfb_tcp_do_segment(tp, m, th, drop_hdrlen,
			    tlen, iptos);
			return (IPPROTO_DONE);
		}
		/*
		 * Segment flag validation for new connection attempts:
		 *
		 * Our (SYN|ACK) response was rejected.
		 * Check with syncache and remove entry to prevent
		 * retransmits.
		 *
		 * NB: syncache_chkrst does its own logging of failure
		 * causes.
		 */
		if (thflags & TH_RST) {
			syncache_chkrst(&inc, th, m, port);
			goto dropunlock;
		}
		/*
		 * We can't do anything without SYN.
		 */
		if ((thflags & TH_SYN) == 0) {
			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				log(LOG_DEBUG, "%s; %s: Listen socket: "
				    "SYN is missing, segment ignored\n",
				    s, __func__);
			TCPSTAT_INC(tcps_badsyn);
			goto dropunlock;
		}
		/*
		 * (SYN|ACK) is bogus on a listen socket.
		 */
		if (thflags & TH_ACK) {
			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				log(LOG_DEBUG, "%s; %s: Listen socket: "
				    "SYN|ACK invalid, segment rejected\n",
				    s, __func__);
			syncache_badack(&inc, port);	/* XXX: Not needed! */
			TCPSTAT_INC(tcps_badsyn);
			rstreason = BANDLIM_RST_OPENPORT;
			goto dropwithreset;
		}
		/*
		 * If the drop_synfin option is enabled, drop all
		 * segments with both the SYN and FIN bits set.
		 * This prevents e.g. nmap from identifying the
		 * TCP/IP stack.
		 * XXX: Poor reasoning.  nmap has other methods
		 * and is constantly refining its stack detection
		 * strategies.
		 * XXX: This is a violation of the TCP specification
		 * and was used by RFC1644.
		 */
		if ((thflags & TH_FIN) && V_drop_synfin) {
			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				log(LOG_DEBUG, "%s; %s: Listen socket: "
				    "SYN|FIN segment ignored (based on "
				    "sysctl setting)\n", s, __func__);
			TCPSTAT_INC(tcps_badsyn);
			goto dropunlock;
		}
		/*
		 * Segment's flags are (SYN) or (SYN|FIN).
		 *
		 * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored
		 * as they do not affect the state of the TCP FSM.
		 * The data pointed to by TH_URG and th_urp is ignored.
		 */
		KASSERT((thflags & (TH_RST|TH_ACK)) == 0,
		    ("%s: Listen socket: TH_RST or TH_ACK set", __func__));
		KASSERT(thflags & (TH_SYN),
		    ("%s: Listen socket: TH_SYN not set", __func__));
		INP_RLOCK_ASSERT(inp);
#ifdef INET6
		/*
		 * If deprecated address is forbidden,
		 * we do not accept SYN to deprecated interface
		 * address to prevent any new inbound connection from
		 * getting established.
		 * When we do not accept SYN, we send a TCP RST,
		 * with deprecated source address (instead of dropping
		 * it).  We compromise it as it is much better for peer
		 * to send a RST, and RST will be the final packet
		 * for the exchange.
		 *
		 * If we do not forbid deprecated addresses, we accept
		 * the SYN packet.  RFC2462 does not suggest dropping
		 * SYN in this case.
		 * If we decipher RFC2462 5.5.4, it says like this:
		 * 1. use of deprecated addr with existing
		 *    communication is okay - "SHOULD continue to be
		 *    used"
		 * 2. use of it with new communication:
		 *   (2a) "SHOULD NOT be used if alternate address
		 *        with sufficient scope is available"
		 *   (2b) nothing mentioned otherwise.
		 * Here we fall into (2b) case as we have no choice in
		 * our source address selection - we must obey the peer.
		 *
		 * The wording in RFC2462 is confusing, and there are
		 * multiple description text for deprecated address
		 * handling - worse, they are not exactly the same.
		 * I believe 5.5.4 is the best one, so we follow 5.5.4.
		 */
		if (isipv6 && !V_ip6_use_deprecated) {
			struct in6_ifaddr *ia6;

			ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false);
			if (ia6 != NULL &&
			    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				    log(LOG_DEBUG, "%s; %s: Listen socket: "
					"Connection attempt to deprecated "
					"IPv6 address rejected\n",
					s, __func__);
				rstreason = BANDLIM_RST_OPENPORT;
				goto dropwithreset;
			}
		}
#endif /* INET6 */
		/*
		 * Basic sanity checks on incoming SYN requests:
		 *   Don't respond if the destination is a link layer
		 *	broadcast according to RFC1122 4.2.3.10, p. 104.
		 *   If it is from this socket it must be forged.
		 *   Don't respond if the source or destination is a
		 *	global or subnet broad- or multicast address.
		 *   Note that it is quite possible to receive unicast
		 *	link-layer packets with a broadcast IP address. Use
		 *	in_broadcast() to find them.
		 */
		if (m->m_flags & (M_BCAST|M_MCAST)) {
			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
			    log(LOG_DEBUG, "%s; %s: Listen socket: "
				"Connection attempt from broad- or multicast "
				"link layer address ignored\n", s, __func__);
			goto dropunlock;
		}
#ifdef INET6
		if (isipv6) {
			if (th->th_dport == th->th_sport &&
			    IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) {
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				    log(LOG_DEBUG, "%s; %s: Listen socket: "
					"Connection attempt to/from self "
					"ignored\n", s, __func__);
				goto dropunlock;
			}
			if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
			    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				    log(LOG_DEBUG, "%s; %s: Listen socket: "
					"Connection attempt from/to multicast "
					"address ignored\n", s, __func__);
				goto dropunlock;
			}
		}
#endif
#if defined(INET) && defined(INET6)
		else
#endif
#ifdef INET
		{
			if (th->th_dport == th->th_sport &&
			    ip->ip_dst.s_addr == ip->ip_src.s_addr) {
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				    log(LOG_DEBUG, "%s; %s: Listen socket: "
					"Connection attempt from/to self "
					"ignored\n", s, __func__);
				goto dropunlock;
			}
			if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
			    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
			    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
			    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) {
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				    log(LOG_DEBUG, "%s; %s: Listen socket: "
					"Connection attempt from/to broad- "
					"or multicast address ignored\n",
					s, __func__);
				goto dropunlock;
			}
		}
#endif
		/*
		 * SYN appears to be valid.  Create compressed TCP state
		 * for syncache.
		 */
		TCP_PROBE3(debug__input, tp, th, m);
		tcp_dooptions(&to, optp, optlen, TO_SYN);
		if ((so = syncache_add(&inc, &to, th, inp, so, m, NULL, NULL,
		    iptos, port)) != NULL)
			goto tfo_socket_result;

		/*
		 * Entry added to syncache and mbuf consumed.
		 * Only the listen socket is unlocked by syncache_add().
		 */
		return (IPPROTO_DONE);
	} else if (tp->t_state == TCPS_LISTEN) {
		/*
		 * When a listen socket is torn down the SO_ACCEPTCONN
		 * flag is removed first while connections are drained
		 * from the accept queue in an unlock/lock cycle of the
		 * ACCEPT_LOCK, opening a race condition allowing a SYN
		 * attempt to go through unhandled.
		 */
		goto dropunlock;
	}
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
	if (tp->t_flags & TF_SIGNATURE) {
		tcp_dooptions(&to, optp, optlen, thflags);
		if ((to.to_flags & TOF_SIGNATURE) == 0) {
			TCPSTAT_INC(tcps_sig_err_nosigopt);
			goto dropunlock;
		}
		if (!TCPMD5_ENABLED() ||
		    TCPMD5_INPUT(m, th, to.to_signature) != 0)
			goto dropunlock;
	}
#endif
	TCP_PROBE5(receive, NULL, tp, m, tp, th);

	/*
	 * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later
	 * state.  tcp_do_segment() always consumes the mbuf chain, unlocks
	 * the inpcb, and unlocks pcbinfo.
	 *
	 * XXXGL: in case of a pure SYN arriving on existing connection
	 * TCP stacks won't need to modify the PCB, they would either drop
	 * the segment silently, or send a challenge ACK.  However, we try
	 * to upgrade the lock, because calling convention for stacks is
	 * write-lock on PCB.  If upgrade fails, drop the SYN.
	 */
	if ((lookupflag & INPLOOKUP_RLOCKPCB) && INP_TRY_UPGRADE(inp) == 0)
		goto dropunlock;

	tp->t_fb->tfb_tcp_do_segment(tp, m, th, drop_hdrlen, tlen, iptos);
	return (IPPROTO_DONE);

dropwithreset:
	/*
	 * When blackholing do not respond with a RST but
	 * completely ignore the segment and drop it.
	 */
	if (((rstreason == BANDLIM_RST_OPENPORT && V_blackhole == 3) ||
	    (rstreason == BANDLIM_RST_CLOSEDPORT &&
	    ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole > 1))) &&
	    (V_blackhole_local || (
#ifdef INET6
	    isipv6 ? !in6_localaddr(&ip6->ip6_src) :
#endif
#ifdef INET
	    !in_localip(ip->ip_src)
#else
	    true
#endif
	    )))
		goto dropunlock;
	TCP_PROBE5(receive, NULL, tp, m, tp, th);
	tcp_dropwithreset(m, th, tp, tlen, rstreason);
	m = NULL;	/* mbuf chain got consumed. */

dropunlock:
	if (m != NULL)
		TCP_PROBE5(receive, NULL, tp, m, tp, th);

	if (inp != NULL)
		INP_UNLOCK(inp);

drop:
	if (s != NULL)
		free(s, M_TCPLOG);
	if (m != NULL)
		m_freem(m);
	return (IPPROTO_DONE);
}

/*
 * Automatic sizing of receive socket buffer.  Often the send
 * buffer size is not optimally adjusted to the actual network
 * conditions at hand (delay bandwidth product).  Setting the
 * buffer size too small limits throughput on links with high
 * bandwidth and high delay (e.g. trans-continental/oceanic links).
 *
 * On the receive side the socket buffer memory is only rarely
 * used to any significant extent.  This allows us to be much
 * more aggressive in scaling the receive socket buffer.  For
 * the case that the buffer space is actually used to a large
 * extent and we run out of kernel memory we can simply drop
 * the new segments; TCP on the sender will just retransmit them
 * later.  Setting the buffer size too big may only consume too
 * much kernel memory if the application doesn't read() from
 * the socket or packet loss or reordering makes use of the
 * reassembly queue.
 *
 * The criteria to step up the receive buffer one notch are:
 *  1. The application has not set the receive buffer size with
 *     SO_RCVBUF.  Setting SO_RCVBUF clears SB_AUTOSIZE.
 *  2. The number of bytes received during 1/2 of an sRTT
 *     is at least 3/8 of the current socket buffer size.
 *  3. The receive buffer size has not hit the maximal automatic size.
 *
 * If all of the criteria are met, we increase the socket buffer
 * by 1/2 (bounded by the max).  This allows us to keep ahead
 * of slow-start while also ensuring our peer never gets limited
 * by our rwnd, which we would then have to open up, causing a burst.
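 *
 * Worked example (illustrative numbers): with sb_hiwat = 64 kB, the
 * step-up threshold is 3/8 * 64 kB = 24 kB received within half an
 * sRTT; the step then grows the buffer to min(64 kB + 32 kB,
 * V_tcp_autorcvbuf_max), i.e. 96 kB under the default 2 MB cap.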
 *
 * This algorithm does two steps per RTT at most and only if
 * we receive a bulk stream w/o packet losses or reorderings.
 * Shrinking the buffer during idle times is not necessary as
 * it doesn't consume any memory when idle.
 *
 * TODO: Only step up if the application is actually serving
 * the buffer to better manage the socket buffer resources.
 */
int
tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, int tlen)
{
	int newsize = 0;

	if (V_tcp_do_autorcvbuf && (so->so_rcv.sb_flags & SB_AUTOSIZE) &&
	    tp->t_srtt != 0 && tp->rfbuf_ts != 0 &&
	    TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) >
	    ((tp->t_srtt >> TCP_RTT_SHIFT) / 2)) {
		if (tp->rfbuf_cnt > ((so->so_rcv.sb_hiwat / 2) / 4 * 3) &&
		    so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) {
			newsize = min((so->so_rcv.sb_hiwat +
			    (so->so_rcv.sb_hiwat / 2)), V_tcp_autorcvbuf_max);
		}
		TCP_PROBE6(receive__autoresize, NULL, tp, m, tp, th, newsize);

		/* Start over with next RTT. */
		tp->rfbuf_ts = 0;
		tp->rfbuf_cnt = 0;
	} else {
		tp->rfbuf_cnt += tlen;	/* add up */
	}
	return (newsize);
}

int
tcp_input(struct mbuf **mp, int *offp, int proto)
{
	return (tcp_input_with_port(mp, offp, proto, 0));
}

static void
tcp_handle_wakeup(struct tcpcb *tp)
{

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	if (tp->t_flags & TF_WAKESOR) {
		struct socket *so = tptosocket(tp);

		tp->t_flags &= ~TF_WAKESOR;
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		sorwakeup_locked(so);
	}
}

void
tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
    int drop_hdrlen, int tlen, uint8_t iptos)
{
	uint16_t thflags;
	int acked, ourfinisacked, needoutput = 0;
	sackstatus_t sack_changed;
	int rstreason, todrop, win, incforsyn = 0;
	uint32_t tiwin;
	uint16_t nsegs;
	char *s;
	struct inpcb *inp = tptoinpcb(tp);
	struct socket *so = tptosocket(tp);
	struct in_conninfo *inc = &inp->inp_inc;
	struct mbuf *mfree;
	struct tcpopt to;
	int tfo_syn;
	u_int maxseg = 0;

	thflags = tcp_get_flags(th);
	tp->sackhint.last_sack_ack = 0;
	sack_changed = SACK_NOCHANGE;
	nsegs = max(1, m->m_pkthdr.lro_nsegs);

	NET_EPOCH_ASSERT();
	INP_WLOCK_ASSERT(inp);
	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
	    __func__));
	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
	    __func__));

#ifdef TCPPCAP
	/* Save segment, if requested. */
	tcp_pcap_add(th, m, &(tp->t_inpkts));
#endif
	TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
	    tlen, NULL, true);

	if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
			log(LOG_DEBUG, "%s; %s: "
			    "SYN|FIN segment ignored (based on "
			    "sysctl setting)\n", s, __func__);
			free(s, M_TCPLOG);
		}
		goto drop;
	}

	/*
	 * If a segment with the ACK-bit set arrives in the SYN-SENT state
	 * check SEQ.ACK first.
	 */
	if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
	    (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
		rstreason = BANDLIM_UNLIMITED;
		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
		goto dropwithreset;
	}

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 * XXX: This should be done after segment
	 * validation to ignore broken/spoofed segs.
	 */
	if (tp->t_idle_reduce &&
	    (tp->snd_max == tp->snd_una) &&
	    ((ticks - tp->t_rcvtime) >= tp->t_rxtcur))
		cc_after_idle(tp);
	tp->t_rcvtime = ticks;

	if (thflags & TH_FIN)
		tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN);
	/*
	 * Scale up the window into a 32-bit value.
	 * For the SYN_SENT state the scale is zero.
	 */
	tiwin = th->th_win << tp->snd_scale;
#ifdef STATS
	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
#endif

	/*
	 * TCP ECN processing.
	 */
	if (tcp_ecn_input_segment(tp, thflags, tlen,
	    tcp_packets_this_ack(tp, th->th_ack),
	    iptos))
		cc_cong_signal(tp, th, CC_ECN);

	/*
	 * Parse options on any incoming segment.
	 */
	tcp_dooptions(&to, (u_char *)(th + 1),
	    (th->th_off << 2) - sizeof(struct tcphdr),
	    (thflags & TH_SYN) ? TO_SYN : 0);
	if (tp->t_flags2 & TF2_PROC_SACK_PROHIBIT) {
		/*
		 * We don't process SACK options from the
		 * peer, because the MSS is too small, which
		 * can subject us to an attack.
		 */
1620		to.to_flags &= ~TOF_SACK;
1621	}
1622#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
1623	if ((tp->t_flags & TF_SIGNATURE) != 0 &&
1624	    (to.to_flags & TOF_SIGNATURE) == 0) {
1625		TCPSTAT_INC(tcps_sig_err_sigopt);
1626		/* XXX: should drop? */
1627	}
1628#endif
1629	/*
1630	 * If echoed timestamp is later than the current time,
1631	 * fall back to non RFC1323 RTT calculation.  Normalize
1632	 * timestamp if syncookies were used when this connection
1633	 * was established.
1634	 */
1635	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
1636		to.to_tsecr -= tp->ts_offset;
1637		if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) {
1638			to.to_tsecr = 0;
1639		} else if (tp->t_rxtshift == 1 &&
1640			 tp->t_flags & TF_PREVVALID &&
1641			 tp->t_badrxtwin != 0 &&
1642			 TSTMP_LT(to.to_tsecr, tp->t_badrxtwin)) {
1643			cc_cong_signal(tp, th, CC_RTO_ERR);
1644		}
1645	}
1646	/*
1647	 * Process options only when we get SYN/ACK back. The SYN case
1648	 * for incoming connections is handled in tcp_syncache.
1649	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
1650	 * or <SYN,ACK>) segment itself is never scaled.
1651	 * XXX this is traditional behavior, may need to be cleaned up.
1652	 */
1653	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
1654		/* Handle parallel SYN for ECN */
1655		tcp_ecn_input_parallel_syn(tp, thflags, iptos);
1656		if ((to.to_flags & TOF_SCALE) &&
1657		    (tp->t_flags & TF_REQ_SCALE) &&
1658		    !(tp->t_flags & TF_NOOPT)) {
1659			tp->t_flags |= TF_RCVD_SCALE;
1660			tp->snd_scale = to.to_wscale;
1661		} else {
1662			tp->t_flags &= ~TF_REQ_SCALE;
1663		}
1664		/*
1665		 * Initial send window.  It will be updated with
1666		 * the next incoming segment to the scaled value.
1667		 */
1668		tp->snd_wnd = th->th_win;
1669		if ((to.to_flags & TOF_TS) &&
1670		    (tp->t_flags & TF_REQ_TSTMP) &&
1671		    !(tp->t_flags & TF_NOOPT)) {
1672			tp->t_flags |= TF_RCVD_TSTMP;
1673			tp->ts_recent = to.to_tsval;
1674			tp->ts_recent_age = tcp_ts_getticks();
1675		} else {
1676			tp->t_flags &= ~TF_REQ_TSTMP;
1677		}
1678		if (to.to_flags & TOF_MSS) {
1679			tcp_mss(tp, to.to_mss);
1680		}
1681		if ((tp->t_flags & TF_SACK_PERMIT) &&
1682		    (!(to.to_flags & TOF_SACKPERM) ||
1683		    (tp->t_flags & TF_NOOPT))) {
1684			tp->t_flags &= ~TF_SACK_PERMIT;
1685		}
1686		if (tp->t_flags & TF_FASTOPEN) {
1687			if ((to.to_flags & TOF_FASTOPEN) &&
1688			    !(tp->t_flags & TF_NOOPT)) {
1689				uint16_t mss;
1690
1691				if (to.to_flags & TOF_MSS) {
1692					mss = to.to_mss;
1693				} else {
1694					if ((inp->inp_vflag & INP_IPV6) != 0) {
1695						mss = TCP6_MSS;
1696					} else {
1697						mss = TCP_MSS;
1698					}
1699				}
1700				tcp_fastopen_update_cache(tp, mss,
1701				    to.to_tfo_len, to.to_tfo_cookie);
1702			} else {
1703				tcp_fastopen_disable_path(tp);
1704			}
1705		}
1706	}
1707
1708	/*
1709	 * If timestamps were negotiated during SYN/ACK and a
1710	 * segment without a timestamp is received, silently drop
1711	 * the segment, unless it is a RST segment or missing timestamps are
1712	 * tolerated.
1713	 * See section 3.2 of RFC 7323.
1714	 */
1715	if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
1716		if (((thflags & TH_RST) != 0) || V_tcp_tolerate_missing_ts) {
1717			if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
1718				log(LOG_DEBUG, "%s; %s: Timestamp missing, "
1719				    "segment processed normally\n",
1720				    s, __func__);
1721				free(s, M_TCPLOG);
1722			}
1723		} else {
1724			if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
1725				log(LOG_DEBUG, "%s; %s: Timestamp missing, "
1726				    "segment silently dropped\n", s, __func__);
1727				free(s, M_TCPLOG);
1728			}
1729			goto drop;
1730		}
1731	}
1732	/*
1733	 * If timestamps were not negotiated during SYN/ACK and a
1734	 * segment with a timestamp is received, ignore the
1735	 * timestamp and process the packet normally.
1736	 * See section 3.2 of RFC 7323.
1737	 */
1738	if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
1739		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
1740			log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
1741			    "segment processed normally\n", s, __func__);
1742			free(s, M_TCPLOG);
1743		}
1744	}
1745
1746	/*
1747	 * Header prediction: check for the two common cases
1748	 * of a uni-directional data xfer.  If the packet has
1749	 * no control flags, is in-sequence, the window didn't
1750	 * change and we're not retransmitting, it's a
1751	 * candidate.  If the length is zero and the ack moved
1752	 * forward, we're the sender side of the xfer.  Just
1753	 * free the data acked & wake any higher level process
1754	 * that was blocked waiting for space.  If the length
1755	 * is non-zero and the ack didn't move, we're the
1756	 * receiver side.  If we're getting packets in-order
1757	 * (the reassembly queue is empty), add the data to
1758	 * the socket buffer and note that we need a delayed ack.
1759	 * Make sure that the hidden state-flags are also off.
1760	 * Since we check for TCPS_ESTABLISHED first, it can only
	 * be TF_NEEDSYN.
1762	 */
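	/*
	 * For example, during a bulk transfer over a loss-free path
	 * nearly every segment takes one of these two fast paths,
	 * skipping the full state machine below.
	 */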
1763	if (tp->t_state == TCPS_ESTABLISHED &&
1764	    th->th_seq == tp->rcv_nxt &&
1765	    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
1766	    tp->snd_nxt == tp->snd_max &&
1767	    tiwin && tiwin == tp->snd_wnd &&
1768	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
1769	    SEGQ_EMPTY(tp) &&
1770	    ((to.to_flags & TOF_TS) == 0 ||
	     TSTMP_GEQ(to.to_tsval, tp->ts_recent))) {
1772		/*
1773		 * If last ACK falls within this segment's sequence numbers,
1774		 * record the timestamp.
1775		 * NOTE that the test is modified according to the latest
1776		 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
1777		 */
1778		if ((to.to_flags & TOF_TS) != 0 &&
1779		    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
1780			tp->ts_recent_age = tcp_ts_getticks();
1781			tp->ts_recent = to.to_tsval;
1782		}
1783
1784		if (tlen == 0) {
1785			if (SEQ_GT(th->th_ack, tp->snd_una) &&
1786			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
1787			    !IN_RECOVERY(tp->t_flags) &&
1788			    (to.to_flags & TOF_SACK) == 0 &&
1789			    TAILQ_EMPTY(&tp->snd_holes)) {
1790				/*
1791				 * This is a pure ack for outstanding data.
1792				 */
1793				TCPSTAT_INC(tcps_predack);
1794
1795				/*
1796				 * "bad retransmit" recovery without timestamps.
1797				 */
1798				if ((to.to_flags & TOF_TS) == 0 &&
1799				    tp->t_rxtshift == 1 &&
1800				    tp->t_flags & TF_PREVVALID &&
1801				    tp->t_badrxtwin != 0 &&
1802				    TSTMP_LT(ticks, tp->t_badrxtwin)) {
1803					cc_cong_signal(tp, th, CC_RTO_ERR);
1804				}
1805
1806				/*
1807				 * Recalculate the transmit timer / rtt.
1808				 *
1809				 * Some boxes send broken timestamp replies
				 * during the SYN+ACK phase; ignore
				 * timestamps of 0, or we could calculate a
1812				 * huge RTT and blow up the retransmit timer.
1813				 */
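				/*
				 * Example (assuming the common 1 ms
				 * timestamp granularity): if the echoed
				 * value was taken 120 ms ago, t == 120
				 * and TCP_TS_TO_TICKS(t) + 1 is the RTT
				 * sample in kernel ticks; the +1 avoids
				 * feeding a zero sample to
				 * tcp_xmit_timer().
				 */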
1814				if ((to.to_flags & TOF_TS) != 0 &&
1815				    to.to_tsecr) {
1816					uint32_t t;
1817
1818					t = tcp_ts_getticks() - to.to_tsecr;
1819					if (!tp->t_rttlow || tp->t_rttlow > t)
1820						tp->t_rttlow = t;
1821					tcp_xmit_timer(tp,
1822					    TCP_TS_TO_TICKS(t) + 1);
1823				} else if (tp->t_rtttime &&
1824				    SEQ_GT(th->th_ack, tp->t_rtseq)) {
1825					if (!tp->t_rttlow ||
1826					    tp->t_rttlow > ticks - tp->t_rtttime)
1827						tp->t_rttlow = ticks - tp->t_rtttime;
1828					tcp_xmit_timer(tp,
1829							ticks - tp->t_rtttime);
1830				}
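				/*
				 * BYTES_THIS_ACK() is simply
				 * th->th_ack - tp->snd_una, i.e. the
				 * number of bytes newly acknowledged by
				 * this segment.
				 */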
1831				acked = BYTES_THIS_ACK(tp, th);
1832
1833#ifdef TCP_HHOOK
1834				/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
1835				hhook_run_tcp_est_in(tp, th, &to);
1836#endif
1837
1838				TCPSTAT_ADD(tcps_rcvackpack, nsegs);
1839				TCPSTAT_ADD(tcps_rcvackbyte, acked);
1840				sbdrop(&so->so_snd, acked);
1841				if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
1842				    SEQ_LEQ(th->th_ack, tp->snd_recover))
1843					tp->snd_recover = th->th_ack - 1;
1844
1845				/*
1846				 * Let the congestion control algorithm update
1847				 * congestion control related information. This
1848				 * typically means increasing the congestion
1849				 * window.
1850				 */
1851				cc_ack_received(tp, th, nsegs, CC_ACK);
1852
1853				tp->snd_una = th->th_ack;
1854				/*
1855				 * Pull snd_wl2 up to prevent seq wrap relative
1856				 * to th_ack.
1857				 */
1858				tp->snd_wl2 = th->th_ack;
1859				tp->t_dupacks = 0;
1860				m_freem(m);
1861
1862				/*
1863				 * If all outstanding data are acked, stop
1864				 * retransmit timer, otherwise restart timer
1865				 * using current (possibly backed-off) value.
1866				 * If process is waiting for space,
1867				 * wakeup/selwakeup/signal.  If data
1868				 * are ready to send, let tcp_output
1869				 * decide between more output or persist.
1870				 */
1871				TCP_PROBE3(debug__input, tp, th, m);
1872				/*
1873				 * Clear t_acktime if remote side has ACKd
1874				 * all data in the socket buffer.
1875				 * Otherwise, update t_acktime if we received
1876				 * a sufficiently large ACK.
1877				 */
1878				if (sbavail(&so->so_snd) == 0)
1879					tp->t_acktime = 0;
1880				else if (acked > 1)
1881					tp->t_acktime = ticks;
1882				if (tp->snd_una == tp->snd_max)
1883					tcp_timer_activate(tp, TT_REXMT, 0);
1884				else if (!tcp_timer_active(tp, TT_PERSIST))
1885					tcp_timer_activate(tp, TT_REXMT,
1886					    TP_RXTCUR(tp));
1887				sowwakeup(so);
1888				/*
1889				 * Only call tcp_output when there
1890				 * is new data available to be sent
1891				 * or we need to send an ACK.
1892				 */
1893				if ((tp->t_flags & TF_ACKNOW) ||
1894				    (sbavail(&so->so_snd) >=
1895				     SEQ_SUB(tp->snd_max, tp->snd_una))) {
1896					(void) tcp_output(tp);
1897				}
1898				goto check_delack;
1899			}
1900		} else if (th->th_ack == tp->snd_una &&
1901		    tlen <= sbspace(&so->so_rcv)) {
1902			int newsize = 0;	/* automatic sockbuf scaling */
1903
1904			/*
1905			 * This is a pure, in-sequence data packet with
1906			 * nothing on the reassembly queue and we have enough
1907			 * buffer space to take it.
1908			 */
1909			/* Clean receiver SACK report if present */
1910			if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
1911				tcp_clean_sackreport(tp);
1912			TCPSTAT_INC(tcps_preddat);
1913			tp->rcv_nxt += tlen;
1914			if (tlen &&
1915			    ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
1916			    (tp->t_fbyte_in == 0)) {
1917				tp->t_fbyte_in = ticks;
1918				if (tp->t_fbyte_in == 0)
1919					tp->t_fbyte_in = 1;
1920				if (tp->t_fbyte_out && tp->t_fbyte_in)
1921					tp->t_flags2 |= TF2_FBYTES_COMPLETE;
1922			}
1923			/*
1924			 * Pull snd_wl1 up to prevent seq wrap relative to
1925			 * th_seq.
1926			 */
1927			tp->snd_wl1 = th->th_seq;
1928			/*
1929			 * Pull rcv_up up to prevent seq wrap relative to
1930			 * rcv_nxt.
1931			 */
1932			tp->rcv_up = tp->rcv_nxt;
1933			TCPSTAT_ADD(tcps_rcvpack, nsegs);
1934			TCPSTAT_ADD(tcps_rcvbyte, tlen);
1935			TCP_PROBE3(debug__input, tp, th, m);
1936
1937			newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
1938
1939			/* Add data to socket buffer. */
1940			SOCKBUF_LOCK(&so->so_rcv);
1941			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1942				m_freem(m);
1943			} else {
1944				/*
1945				 * Set new socket buffer size.
1946				 * Give up when limit is reached.
1947				 */
1948				if (newsize)
1949					if (!sbreserve_locked(so, SO_RCV,
1950					    newsize, NULL))
1951						so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
1952				m_adj(m, drop_hdrlen);	/* delayed header drop */
1953				sbappendstream_locked(&so->so_rcv, m, 0);
1954			}
1955			/* NB: sorwakeup_locked() does an implicit unlock. */
1956			sorwakeup_locked(so);
1957			if (DELAY_ACK(tp, tlen)) {
1958				tp->t_flags |= TF_DELACK;
1959			} else {
1960				tp->t_flags |= TF_ACKNOW;
1961				(void) tcp_output(tp);
1962			}
1963			goto check_delack;
1964		}
1965	}
1966
1967	/*
1968	 * Calculate amount of space in receive window,
1969	 * and then do TCP input processing.
1970	 * Receive window is amount of space in rcv queue,
1971	 * but not less than advertised window.
1972	 */
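	/*
	 * Note that rcv_adv - rcv_nxt is the window we have previously
	 * advertised; taking the maximum ensures we never shrink an
	 * offered window, which the TCP specification strongly
	 * discourages.
	 */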
1973	win = sbspace(&so->so_rcv);
1974	if (win < 0)
1975		win = 0;
1976	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
1977
1978	switch (tp->t_state) {
1979	/*
1980	 * If the state is SYN_RECEIVED:
1981	 *	if seg contains an ACK, but not for our SYN/ACK, send a RST.
1982	 */
1983	case TCPS_SYN_RECEIVED:
1984		if (thflags & TH_RST) {
1985			/* Handle RST segments later. */
1986			break;
1987		}
1988		if ((thflags & TH_ACK) &&
1989		    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
1990		     SEQ_GT(th->th_ack, tp->snd_max))) {
			rstreason = BANDLIM_RST_OPENPORT;
			tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
			goto dropwithreset;
1994		}
1995		if (tp->t_flags & TF_FASTOPEN) {
1996			/*
1997			 * When a TFO connection is in SYN_RECEIVED, the
1998			 * only valid packets are the initial SYN, a
1999			 * retransmit/copy of the initial SYN (possibly with
2000			 * a subset of the original data), a valid ACK, a
2001			 * FIN, or a RST.
2002			 */
2003			if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) {
2004				rstreason = BANDLIM_RST_OPENPORT;
2005				tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
2006				goto dropwithreset;
2007			} else if (thflags & TH_SYN) {
2008				/* non-initial SYN is ignored */
2009				if ((tcp_timer_active(tp, TT_DELACK) ||
2010				     tcp_timer_active(tp, TT_REXMT)))
2011					goto drop;
2012			} else if (!(thflags & (TH_ACK|TH_FIN|TH_RST))) {
2013				goto drop;
2014			}
2015		}
2016		break;
2017
2018	/*
2019	 * If the state is SYN_SENT:
2020	 *	if seg contains a RST with valid ACK (SEQ.ACK has already
2021	 *	    been verified), then drop the connection.
2022	 *	if seg contains a RST without an ACK, drop the seg.
2023	 *	if seg does not contain SYN, then drop the seg.
2024	 * Otherwise this is an acceptable SYN segment
2025	 *	initialize tp->rcv_nxt and tp->irs
2026	 *	if seg contains ack then advance tp->snd_una
2027	 *	if seg contains an ECE and ECN support is enabled, the stream
2028	 *	    is ECN capable.
2029	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
2030	 *	arrange for segment to be acked (eventually)
2031	 *	continue processing rest of data/controls, beginning with URG
2032	 */
2033	case TCPS_SYN_SENT:
2034		if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) {
2035			TCP_PROBE5(connect__refused, NULL, tp,
2036			    m, tp, th);
2037			tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
2038			tp = tcp_drop(tp, ECONNREFUSED);
2039		}
2040		if (thflags & TH_RST)
2041			goto drop;
2042		if (!(thflags & TH_SYN))
2043			goto drop;
2044
2045		tp->irs = th->th_seq;
2046		tcp_rcvseqinit(tp);
2047		if (thflags & TH_ACK) {
2048			int tfo_partial_ack = 0;
2049
2050			TCPSTAT_INC(tcps_connects);
2051			soisconnected(so);
2052#ifdef MAC
2053			mac_socketpeer_set_from_mbuf(m, so);
2054#endif
2055			/* Do window scaling on this connection? */
2056			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2057				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
2058				tp->rcv_scale = tp->request_r_scale;
2059			}
2060			tp->rcv_adv += min(tp->rcv_wnd,
2061			    TCP_MAXWIN << tp->rcv_scale);
2062			tp->snd_una++;		/* SYN is acked */
2063			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
2064				tp->snd_nxt = tp->snd_una;
2065			/*
2066			 * If not all the data that was sent in the TFO SYN
2067			 * has been acked, resend the remainder right away.
2068			 */
2069			if ((tp->t_flags & TF_FASTOPEN) &&
2070			    (tp->snd_una != tp->snd_max)) {
2071				tp->snd_nxt = th->th_ack;
2072				tfo_partial_ack = 1;
2073			}
2074			/*
			 * If there's data, delay ACK; if there's also a FIN,
			 * ACKNOW will be turned on later.
2077			 */
2078			if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial_ack)
2079				tcp_timer_activate(tp, TT_DELACK,
2080				    tcp_delacktime);
2081			else
2082				tp->t_flags |= TF_ACKNOW;
2083
2084			tcp_ecn_input_syn_sent(tp, thflags, iptos);
2085
2086			/*
2087			 * Received <SYN,ACK> in SYN_SENT[*] state.
2088			 * Transitions:
2089			 *	SYN_SENT  --> ESTABLISHED
2090			 *	SYN_SENT* --> FIN_WAIT_1
2091			 */
2092			tp->t_starttime = ticks;
2093			if (tp->t_flags & TF_NEEDFIN) {
2094				tp->t_acktime = ticks;
2095				tcp_state_change(tp, TCPS_FIN_WAIT_1);
2096				tp->t_flags &= ~TF_NEEDFIN;
2097				thflags &= ~TH_SYN;
2098			} else {
2099				tcp_state_change(tp, TCPS_ESTABLISHED);
2100				TCP_PROBE5(connect__established, NULL, tp,
2101				    m, tp, th);
2102				cc_conn_init(tp);
2103				tcp_timer_activate(tp, TT_KEEP,
2104				    TP_KEEPIDLE(tp));
2105			}
2106		} else {
2107			/*
2108			 * Received initial SYN in SYN-SENT[*] state =>
2109			 * simultaneous open.
			 * If it succeeds, connection is half-synchronized.
2111			 * Otherwise, do 3-way handshake:
2112			 *        SYN-SENT -> SYN-RECEIVED
2113			 *        SYN-SENT* -> SYN-RECEIVED*
2114			 */
2115			tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN | TF_SONOTCONN);
2116			tcp_timer_activate(tp, TT_REXMT, 0);
2117			tcp_state_change(tp, TCPS_SYN_RECEIVED);
2118		}
2119
2120		/*
2121		 * Advance th->th_seq to correspond to first data byte.
2122		 * If data, trim to stay within window,
2123		 * dropping FIN if necessary.
2124		 */
2125		th->th_seq++;
2126		if (tlen > tp->rcv_wnd) {
2127			todrop = tlen - tp->rcv_wnd;
2128			m_adj(m, -todrop);
2129			tlen = tp->rcv_wnd;
2130			thflags &= ~TH_FIN;
2131			TCPSTAT_INC(tcps_rcvpackafterwin);
2132			TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
2133		}
2134		tp->snd_wl1 = th->th_seq - 1;
2135		tp->rcv_up = th->th_seq;
2136		/*
2137		 * Client side of transaction: already sent SYN and data.
2138		 * If the remote host used T/TCP to validate the SYN,
2139		 * our data will be ACK'd; if so, enter normal data segment
2140		 * processing in the middle of step 5, ack processing.
2141		 * Otherwise, goto step 6.
2142		 */
2143		if (thflags & TH_ACK)
2144			goto process_ACK;
2145
2146		goto step6;
2147	}
2148
2149	/*
2150	 * States other than LISTEN or SYN_SENT.
2151	 * First check the RST flag and sequence number since reset segments
2152	 * are exempt from the timestamp and connection count tests.  This
2153	 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
2154	 * below which allowed reset segments in half the sequence space
	 * to fall through and be processed (which gives forged reset
2156	 * segments with a random sequence number a 50 percent chance of
2157	 * killing a connection).
2158	 * Then check timestamp, if present.
2159	 * Then check the connection count, if present.
2160	 * Then check that at least some bytes of segment are within
2161	 * receive window.  If segment begins before rcv_nxt,
2162	 * drop leading data (and SYN); if nothing left, just ack.
2163	 */
2164	if (thflags & TH_RST) {
2165		/*
2166		 * RFC5961 Section 3.2
2167		 *
2168		 * - RST drops connection only if SEG.SEQ == RCV.NXT.
2169		 * - If RST is in window, we send challenge ACK.
2170		 *
2171		 * Note: to take into account delayed ACKs, we should
2172		 *   test against last_ack_sent instead of rcv_nxt.
2173		 * Note 2: we handle special case of closed window, not
2174		 *   covered by the RFC.
2175		 */
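		/*
		 * In effect, a blind attacker must match last_ack_sent
		 * exactly to tear the connection down (unless the
		 * V_tcp_insecure_rst knob is enabled); a RST that is
		 * merely somewhere within the receive window only
		 * elicits a challenge ACK, and anything outside the
		 * window is dropped silently.
		 */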
2176		if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
2177		    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
2178		    (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
2179			KASSERT(tp->t_state != TCPS_SYN_SENT,
2180			    ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
2181			    __func__, th, tp));
2182
2183			if (V_tcp_insecure_rst ||
2184			    tp->last_ack_sent == th->th_seq) {
2185				TCPSTAT_INC(tcps_drops);
2186				/* Drop the connection. */
2187				switch (tp->t_state) {
2188				case TCPS_SYN_RECEIVED:
2189					so->so_error = ECONNREFUSED;
2190					goto close;
2191				case TCPS_ESTABLISHED:
2192				case TCPS_FIN_WAIT_1:
2193				case TCPS_FIN_WAIT_2:
2194				case TCPS_CLOSE_WAIT:
2195				case TCPS_CLOSING:
2196				case TCPS_LAST_ACK:
2197					so->so_error = ECONNRESET;
2198				close:
2199					/* FALLTHROUGH */
2200				default:
2201					tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_RST);
2202					tp = tcp_close(tp);
2203				}
2204			} else {
2205				TCPSTAT_INC(tcps_badrst);
2206				/* Send challenge ACK. */
2207				tcp_respond(tp, mtod(m, void *), th, m,
2208				    tp->rcv_nxt, tp->snd_nxt, TH_ACK);
2209				tp->last_ack_sent = tp->rcv_nxt;
2210				m = NULL;
2211			}
2212		}
2213		goto drop;
2214	}
2215
2216	/*
2217	 * RFC5961 Section 4.2
2218	 * Send challenge ACK for any SYN in synchronized state.
2219	 */
2220	if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT &&
2221	    tp->t_state != TCPS_SYN_RECEIVED) {
2222		TCPSTAT_INC(tcps_badsyn);
2223		if (V_tcp_insecure_syn &&
2224		    SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
2225		    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
2226			tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
2227			tp = tcp_drop(tp, ECONNRESET);
2228			rstreason = BANDLIM_UNLIMITED;
2229		} else {
2230			tcp_ecn_input_syn_sent(tp, thflags, iptos);
2231			/* Send challenge ACK. */
2232			tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
2233			    tp->snd_nxt, TH_ACK);
2234			tp->last_ack_sent = tp->rcv_nxt;
2235			m = NULL;
2236		}
2237		goto drop;
2238	}
2239
2240	/*
2241	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
2242	 * and it's less than ts_recent, drop it.
2243	 */
2244	if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
2245	    TSTMP_LT(to.to_tsval, tp->ts_recent)) {
2246		/* Check to see if ts_recent is over 24 days old.  */
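		/*
		 * With the common 1 ms timestamp clock, the 31-bit
		 * signed comparison space of TSTMP_LT() wraps after
		 * roughly 24.8 days, hence the 24 day cutoff here.
		 */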
2247		if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
2248			/*
2249			 * Invalidate ts_recent.  If this segment updates
2250			 * ts_recent, the age will be reset later and ts_recent
2251			 * will get a valid value.  If it does not, setting
2252			 * ts_recent to zero will at least satisfy the
2253			 * requirement that zero be placed in the timestamp
2254			 * echo reply when ts_recent isn't valid.  The
2255			 * age isn't reset until we get a valid ts_recent
2256			 * because we don't want out-of-order segments to be
2257			 * dropped when ts_recent is old.
2258			 */
2259			tp->ts_recent = 0;
2260		} else {
2261			TCPSTAT_INC(tcps_rcvduppack);
2262			TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
2263			TCPSTAT_INC(tcps_pawsdrop);
2264			if (tlen)
2265				goto dropafterack;
2266			goto drop;
2267		}
2268	}
2269
2270	/*
2271	 * In the SYN-RECEIVED state, validate that the packet belongs to
2272	 * this connection before trimming the data to fit the receive
2273	 * window.  Check the sequence number versus IRS since we know
2274	 * the sequence numbers haven't wrapped.  This is a partial fix
2275	 * for the "LAND" DoS attack.
2276	 */
2277	if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
2278		rstreason = BANDLIM_RST_OPENPORT;
2279		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
2280		goto dropwithreset;
2281	}
2282
2283	todrop = tp->rcv_nxt - th->th_seq;
2284	if (todrop > 0) {
2285		if (thflags & TH_SYN) {
2286			thflags &= ~TH_SYN;
2287			th->th_seq++;
2288			if (th->th_urp > 1)
2289				th->th_urp--;
2290			else
2291				thflags &= ~TH_URG;
2292			todrop--;
2293		}
2294		/*
2295		 * Following if statement from Stevens, vol. 2, p. 960.
2296		 */
		if (todrop > tlen ||
		    (todrop == tlen && (thflags & TH_FIN) == 0)) {
2299			/*
2300			 * Any valid FIN must be to the left of the window.
2301			 * At this point the FIN must be a duplicate or out
2302			 * of sequence; drop it.
2303			 */
2304			thflags &= ~TH_FIN;
2305
2306			/*
2307			 * Send an ACK to resynchronize and drop any data.
2308			 * But keep on processing for RST or ACK.
2309			 */
2310			tp->t_flags |= TF_ACKNOW;
2311			todrop = tlen;
2312			TCPSTAT_INC(tcps_rcvduppack);
2313			TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
2314		} else {
2315			TCPSTAT_INC(tcps_rcvpartduppack);
2316			TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
2317		}
2318		/*
2319		 * DSACK - add SACK block for dropped range
2320		 */
2321		if ((todrop > 0) && (tp->t_flags & TF_SACK_PERMIT)) {
2322			tcp_update_sack_list(tp, th->th_seq,
2323			    th->th_seq + todrop);
2324			/*
2325			 * ACK now, as the next in-sequence segment
2326			 * will clear the DSACK block again
2327			 */
2328			tp->t_flags |= TF_ACKNOW;
2329		}
2330		drop_hdrlen += todrop;	/* drop from the top afterwards */
2331		th->th_seq += todrop;
2332		tlen -= todrop;
2333		if (th->th_urp > todrop)
2334			th->th_urp -= todrop;
2335		else {
2336			thflags &= ~TH_URG;
2337			th->th_urp = 0;
2338		}
2339	}
2340
2341	/*
2342	 * If new data are received on a connection after the
2343	 * user processes are gone, then RST the other end if
2344	 * no FIN has been processed.
2345	 */
2346	if ((tp->t_flags & TF_CLOSED) && tlen > 0 &&
2347	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2348		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
2349			log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data "
2350			    "after socket was closed, "
2351			    "sending RST and removing tcpcb\n",
2352			    s, __func__, tcpstates[tp->t_state], tlen);
2353			free(s, M_TCPLOG);
2354		}
2355		tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
		/* tcp_close() will kill the inp, so pre-log the reset. */
2357		tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
2358		tp = tcp_close(tp);
2359		TCPSTAT_INC(tcps_rcvafterclose);
2360		rstreason = BANDLIM_UNLIMITED;
2361		goto dropwithreset;
2362	}
2363
2364	/*
2365	 * If segment ends after window, drop trailing data
2366	 * (and PUSH and FIN); if nothing left, just ACK.
2367	 */
2368	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
2369	if (todrop > 0) {
2370		TCPSTAT_INC(tcps_rcvpackafterwin);
2371		if (todrop >= tlen) {
2372			TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
2373			/*
2374			 * If window is closed can only take segments at
2375			 * window edge, and have to drop data and PUSH from
2376			 * incoming segments.  Continue processing, but
2377			 * remember to ack.  Otherwise, drop segment
2378			 * and ack.
2379			 */
2380			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
2381				tp->t_flags |= TF_ACKNOW;
2382				TCPSTAT_INC(tcps_rcvwinprobe);
2383			} else
2384				goto dropafterack;
2385		} else
2386			TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
2387		m_adj(m, -todrop);
2388		tlen -= todrop;
2389		thflags &= ~(TH_PUSH|TH_FIN);
2390	}
2391
2392	/*
2393	 * If last ACK falls within this segment's sequence numbers,
2394	 * record its timestamp.
2395	 * NOTE:
2396	 * 1) That the test incorporates suggestions from the latest
2397	 *    proposal of the tcplw@cray.com list (Braden 1993/04/26).
2398	 * 2) That updating only on newer timestamps interferes with
2399	 *    our earlier PAWS tests, so this check should be solely
2400	 *    predicated on the sequence space of this segment.
2401	 * 3) That we modify the segment boundary check to be
2402	 *        Last.ACK.Sent <= SEG.SEQ + SEG.Len
2403	 *    instead of RFC1323's
	 *        Last.ACK.Sent < SEG.SEQ + SEG.Len.
2405	 *    This modified check allows us to overcome RFC1323's
2406	 *    limitations as described in Stevens TCP/IP Illustrated
2407	 *    Vol. 2 p.869. In such cases, we can still calculate the
2408	 *    RTT correctly when RCV.NXT == Last.ACK.Sent.
2409	 */
2410	if ((to.to_flags & TOF_TS) != 0 &&
2411	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
2412	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
2413		((thflags & (TH_SYN|TH_FIN)) != 0))) {
2414		tp->ts_recent_age = tcp_ts_getticks();
2415		tp->ts_recent = to.to_tsval;
2416	}
2417
2418	/*
2419	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN
2420	 * flag is on (half-synchronized state), then queue data for
2421	 * later processing; else drop segment and return.
2422	 */
2423	if ((thflags & TH_ACK) == 0) {
2424		if (tp->t_state == TCPS_SYN_RECEIVED ||
2425		    (tp->t_flags & TF_NEEDSYN)) {
2426			if (tp->t_state == TCPS_SYN_RECEIVED &&
2427			    (tp->t_flags & TF_FASTOPEN)) {
2428				tp->snd_wnd = tiwin;
2429				cc_conn_init(tp);
2430			}
2431			goto step6;
2432		} else if (tp->t_flags & TF_ACKNOW)
2433			goto dropafterack;
2434		else
2435			goto drop;
2436	}
2437
2438	/*
2439	 * Ack processing.
2440	 */
2441	switch (tp->t_state) {
2442	/*
2443	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
2444	 * ESTABLISHED state and continue processing.
2445	 * The ACK was checked above.
2446	 */
2447	case TCPS_SYN_RECEIVED:
2448
2449		TCPSTAT_INC(tcps_connects);
2450		if (tp->t_flags & TF_SONOTCONN) {
2451			/*
2452			 * Usually SYN_RECEIVED had been created from a LISTEN,
2453			 * and solisten_enqueue() has already marked the socket
2454			 * layer as connected.  If it didn't, which can happen
2455			 * only with an accept_filter(9), then the tp is marked
2456			 * with TF_SONOTCONN.  The other reason for this mark
2457			 * to be set is a simultaneous open, a SYN_RECEIVED
2458			 * that had been created from SYN_SENT.
2459			 */
2460			tp->t_flags &= ~TF_SONOTCONN;
2461			soisconnected(so);
2462		}
2463		/* Do window scaling? */
2464		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2465			(TF_RCVD_SCALE|TF_REQ_SCALE)) {
2466			tp->rcv_scale = tp->request_r_scale;
2467		}
2468		tp->snd_wnd = tiwin;
2469		/*
2470		 * Make transitions:
2471		 *      SYN-RECEIVED  -> ESTABLISHED
2472		 *      SYN-RECEIVED* -> FIN-WAIT-1
2473		 */
2474		tp->t_starttime = ticks;
2475		if ((tp->t_flags & TF_FASTOPEN) && tp->t_tfo_pending) {
2476			tcp_fastopen_decrement_counter(tp->t_tfo_pending);
2477			tp->t_tfo_pending = NULL;
2478		}
2479		if (tp->t_flags & TF_NEEDFIN) {
2480			tp->t_acktime = ticks;
2481			tcp_state_change(tp, TCPS_FIN_WAIT_1);
2482			tp->t_flags &= ~TF_NEEDFIN;
2483		} else {
2484			tcp_state_change(tp, TCPS_ESTABLISHED);
2485			TCP_PROBE5(accept__established, NULL, tp,
2486			    m, tp, th);
2487			/*
2488			 * TFO connections call cc_conn_init() during SYN
2489			 * processing.  Calling it again here for such
2490			 * connections is not harmless as it would undo the
2491			 * snd_cwnd reduction that occurs when a TFO SYN|ACK
2492			 * is retransmitted.
2493			 */
2494			if (!(tp->t_flags & TF_FASTOPEN))
2495				cc_conn_init(tp);
2496			tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
2497		}
2498		/*
2499		 * Account for the ACK of our SYN prior to
2500		 * regular ACK processing below, except for
2501		 * simultaneous SYN, which is handled later.
2502		 */
2503		if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN))
2504			incforsyn = 1;
2505		/*
2506		 * If segment contains data or ACK, will call tcp_reass()
2507		 * later; if not, do so now to pass queued data to user.
2508		 */
2509		if (tlen == 0 && (thflags & TH_FIN) == 0) {
2510			(void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
2511			    (struct mbuf *)0);
2512			tcp_handle_wakeup(tp);
2513		}
2514		tp->snd_wl1 = th->th_seq - 1;
2515		/* FALLTHROUGH */
2516
2517	/*
2518	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
2519	 * ACKs.  If the ack is in the range
2520	 *	tp->snd_una < th->th_ack <= tp->snd_max
2521	 * then advance tp->snd_una to th->th_ack and drop
2522	 * data from the retransmission queue.  If this ACK reflects
2523	 * more up to date window information we update our window information.
2524	 */
2525	case TCPS_ESTABLISHED:
2526	case TCPS_FIN_WAIT_1:
2527	case TCPS_FIN_WAIT_2:
2528	case TCPS_CLOSE_WAIT:
2529	case TCPS_CLOSING:
2530	case TCPS_LAST_ACK:
2531		if (SEQ_GT(th->th_ack, tp->snd_max)) {
2532			TCPSTAT_INC(tcps_rcvacktoomuch);
2533			goto dropafterack;
2534		}
2535		if (tcp_is_sack_recovery(tp, &to)) {
2536			sack_changed = tcp_sack_doack(tp, &to, th->th_ack);
2537			if ((sack_changed != SACK_NOCHANGE) &&
2538			    (tp->t_flags & TF_LRD)) {
2539				tcp_sack_lost_retransmission(tp, th);
2540			}
		} else {
			/*
			 * Reset the value so that the previous (valid)
			 * value from the last ACK with SACK doesn't get
			 * used.
			 */
			tp->sackhint.sacked_bytes = 0;
		}
2547
2548#ifdef TCP_HHOOK
2549		/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
2550		hhook_run_tcp_est_in(tp, th, &to);
2551#endif
2552
2553		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
2554			maxseg = tcp_maxseg(tp);
2555			if (tlen == 0 &&
2556			    (tiwin == tp->snd_wnd ||
2557			    (tp->t_flags & TF_SACK_PERMIT))) {
2558				/*
2559				 * If this is the first time we've seen a
2560				 * FIN from the remote, this is not a
2561				 * duplicate and it needs to be processed
2562				 * normally.  This happens during a
2563				 * simultaneous close.
2564				 */
2565				if ((thflags & TH_FIN) &&
2566				    (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
2567					tp->t_dupacks = 0;
2568					break;
2569				}
2570				TCPSTAT_INC(tcps_rcvdupack);
2571				/*
2572				 * If we have outstanding data (other than
2573				 * a window probe), this is a completely
2574				 * duplicate ack (ie, window info didn't
2575				 * change and FIN isn't set),
2576				 * the ack is the biggest we've
2577				 * seen and we've seen exactly our rexmt
2578				 * threshold of them, assume a packet
2579				 * has been dropped and retransmit it.
2580				 * Kludge snd_nxt & the congestion
2581				 * window so we send only this one
2582				 * packet.
2583				 *
2584				 * We know we're losing at the current
2585				 * window size so do congestion avoidance
2586				 * (set ssthresh to half the current window
2587				 * and pull our congestion window back to
2588				 * the new ssthresh).
2589				 *
2590				 * Dup acks mean that packets have left the
2591				 * network (they're now cached at the receiver)
2592				 * so bump cwnd by the amount in the receiver
2593				 * to keep a constant cwnd packets in the
2594				 * network.
2595				 *
2596				 * When using TCP ECN, notify the peer that
2597				 * we reduced the cwnd.
2598				 */
2599				/*
				 * The following two kinds of ACKs should not
				 * affect dupack counting:
2602				 * 1) Old acks
2603				 * 2) Acks with SACK but without any new SACK
2604				 * information in them. These could result from
2605				 * any anomaly in the network like a switch
2606				 * duplicating packets or a possible DoS attack.
2607				 */
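				/*
				 * The branches below implement, in
				 * order: dupack counter reset when no
				 * retransmit timer is pending, continued
				 * transmission while already in fast
				 * recovery, entry into fast recovery at
				 * tcprexmtthresh (3 by default) dupacks,
				 * and RFC 3042 limited transmit for the
				 * first two dupacks.
				 */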
2608				if (th->th_ack != tp->snd_una ||
2609				    (tcp_is_sack_recovery(tp, &to) &&
2610				    (sack_changed == SACK_NOCHANGE))) {
2611					break;
2612				} else if (!tcp_timer_active(tp, TT_REXMT)) {
2613					tp->t_dupacks = 0;
2614				} else if (++tp->t_dupacks > tcprexmtthresh ||
2615					    IN_FASTRECOVERY(tp->t_flags)) {
2616					cc_ack_received(tp, th, nsegs,
2617					    CC_DUPACK);
2618					if (V_tcp_do_prr &&
2619					    IN_FASTRECOVERY(tp->t_flags) &&
2620					    (tp->t_flags & TF_SACK_PERMIT)) {
2621						tcp_do_prr_ack(tp, th, &to,
2622						    sack_changed, &maxseg);
2623					} else if (tcp_is_sack_recovery(tp, &to) &&
2624						    IN_FASTRECOVERY(tp->t_flags)) {
2625						int awnd;
2626
2627						/*
2628						 * Compute the amount of data in flight first.
2629						 * We can inject new data into the pipe iff
2630						 * we have less than 1/2 the original window's
2631						 * worth of data in flight.
2632						 */
2633						if (V_tcp_do_newsack) {
2634							awnd = tcp_compute_pipe(tp);
2635						} else {
2636							awnd = (tp->snd_nxt - tp->snd_fack) +
2637								tp->sackhint.sack_bytes_rexmit;
2638						}
2639						if (awnd < tp->snd_ssthresh) {
2640							tp->snd_cwnd += maxseg;
2641							if (tp->snd_cwnd > tp->snd_ssthresh)
2642								tp->snd_cwnd = tp->snd_ssthresh;
2643						}
2644					} else {
2645						tp->snd_cwnd += maxseg;
2646					}
2647					(void) tcp_output(tp);
2648					goto drop;
2649				} else if (tp->t_dupacks == tcprexmtthresh ||
2650					    (tp->t_flags & TF_SACK_PERMIT &&
2651					     V_tcp_do_newsack &&
2652					     tp->sackhint.sacked_bytes >
2653					     (tcprexmtthresh - 1) * maxseg)) {
2654enter_recovery:
2655					/*
2656					 * Above is the RFC6675 trigger condition of
2657					 * more than (dupthresh-1)*maxseg sacked data.
2658					 * If the count of holes in the
2659					 * scoreboard is >= dupthresh, we could
2660					 * also enter loss recovery, but don't
2661					 * have that value readily available.
2662					 */
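					/*
					 * For example, with maxseg 1460
					 * and tcprexmtthresh 3, more
					 * than 2920 SACKed bytes put us
					 * into loss recovery.
					 */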
2663					tp->t_dupacks = tcprexmtthresh;
2664					tcp_seq onxt = tp->snd_nxt;
2665
2666					/*
2667					 * If we're doing sack, or prr, check
2668					 * to see if we're already in sack
2669					 * recovery. If we're not doing sack,
2670					 * check to see if we're in newreno
2671					 * recovery.
2672					 */
2673					if (V_tcp_do_prr ||
2674					    (tp->t_flags & TF_SACK_PERMIT)) {
2675						if (IN_FASTRECOVERY(tp->t_flags)) {
2676							tp->t_dupacks = 0;
2677							break;
2678						}
2679					} else {
2680						if (SEQ_LEQ(th->th_ack,
2681						    tp->snd_recover)) {
2682							tp->t_dupacks = 0;
2683							break;
2684						}
2685					}
2686					/* Congestion signal before ack. */
2687					cc_cong_signal(tp, th, CC_NDUPACK);
2688					cc_ack_received(tp, th, nsegs,
2689					    CC_DUPACK);
2690					tcp_timer_activate(tp, TT_REXMT, 0);
2691					tp->t_rtttime = 0;
2692					if (V_tcp_do_prr) {
2693						/*
2694						 * snd_ssthresh is already updated by
2695						 * cc_cong_signal.
2696						 */
2697						if (tcp_is_sack_recovery(tp, &to)) {
2698							/*
2699							 * Exclude Limited Transmit
2700							 * segments here
2701							 */
2702							tp->sackhint.prr_delivered =
2703							    maxseg;
2704						} else {
2705							tp->sackhint.prr_delivered =
2706							    imin(tp->snd_max - tp->snd_una,
2707							    imin(INT_MAX / 65536,
2708								tp->t_dupacks) * maxseg);
2709						}
2710						tp->sackhint.recover_fs = max(1,
2711						    tp->snd_nxt - tp->snd_una);
2712					}
2713					if (tcp_is_sack_recovery(tp, &to)) {
2714						TCPSTAT_INC(tcps_sack_recovery_episode);
2715						tp->snd_recover = tp->snd_nxt;
2716						tp->snd_cwnd = maxseg;
2717						(void) tcp_output(tp);
2718						if (SEQ_GT(th->th_ack, tp->snd_una)) {
2719							goto resume_partialack;
2720						}
2721						goto drop;
2722					}
2723					tp->snd_nxt = th->th_ack;
2724					tp->snd_cwnd = maxseg;
2725					(void) tcp_output(tp);
2726					KASSERT(tp->snd_limited <= 2,
2727					    ("%s: tp->snd_limited too big",
2728					    __func__));
2729					tp->snd_cwnd = tp->snd_ssthresh +
2730					     maxseg *
2731					     (tp->t_dupacks - tp->snd_limited);
2732					if (SEQ_GT(onxt, tp->snd_nxt))
2733						tp->snd_nxt = onxt;
2734					goto drop;
2735				} else if (V_tcp_do_rfc3042) {
2736					/*
2737					 * Process first and second duplicate
2738					 * ACKs. Each indicates a segment
2739					 * leaving the network, creating room
2740					 * for more. Make sure we can send a
2741					 * packet on reception of each duplicate
2742					 * ACK by increasing snd_cwnd by one
2743					 * segment. Restore the original
2744					 * snd_cwnd after packet transmission.
2745					 */
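					/*
					 * E.g. on the first dupack, cwnd
					 * is set to the data in flight
					 * (snd_nxt - snd_una) plus one
					 * maxseg, so at most one new
					 * segment can leave; the
					 * inflation is undone once it
					 * has been sent.
					 */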
2746					cc_ack_received(tp, th, nsegs, CC_DUPACK);
2747					uint32_t oldcwnd = tp->snd_cwnd;
2748					tcp_seq oldsndmax = tp->snd_max;
2749					u_int sent;
2750					int avail;
2751
2752					KASSERT(tp->t_dupacks == 1 ||
2753					    tp->t_dupacks == 2,
2754					    ("%s: dupacks not 1 or 2",
2755					    __func__));
2756					if (tp->t_dupacks == 1)
2757						tp->snd_limited = 0;
2758					tp->snd_cwnd =
2759					    (tp->snd_nxt - tp->snd_una) +
2760					    (tp->t_dupacks - tp->snd_limited) *
2761					    maxseg;
2762					/*
2763					 * Only call tcp_output when there
2764					 * is new data available to be sent
2765					 * or we need to send an ACK.
2766					 */
2767					SOCKBUF_LOCK(&so->so_snd);
2768					avail = sbavail(&so->so_snd);
2769					SOCKBUF_UNLOCK(&so->so_snd);
2770					if (tp->t_flags & TF_ACKNOW ||
2771					    (avail >=
2772					     SEQ_SUB(tp->snd_nxt, tp->snd_una))) {
2773						(void) tcp_output(tp);
2774					}
2775					sent = SEQ_SUB(tp->snd_max, oldsndmax);
2776					if (sent > maxseg) {
2777						KASSERT((tp->t_dupacks == 2 &&
2778						    tp->snd_limited == 0) ||
2779						   (sent == maxseg + 1 &&
2780						    tp->t_flags & TF_SENTFIN),
2781						    ("%s: sent too much",
2782						    __func__));
2783						tp->snd_limited = 2;
2784					} else if (sent > 0) {
2785						++tp->snd_limited;
2786					}
2787					tp->snd_cwnd = oldcwnd;
2788					goto drop;
2789				}
2790			}
2791			break;
2792		} else {
2793			/*
2794			 * This ack is advancing the left edge, reset the
2795			 * counter.
2796			 */
2797			tp->t_dupacks = 0;
2798			/*
2799			 * If this ack also has new SACK info, increment the
2800			 * counter as per rfc6675. The variable
2801			 * sack_changed tracks all changes to the SACK
2802			 * scoreboard, including when partial ACKs without
2803			 * SACK options are received, and clear the scoreboard
2804			 * from the left side. Such partial ACKs should not be
2805			 * counted as dupacks here.
2806			 */
2807			if (tcp_is_sack_recovery(tp, &to) &&
2808			    (sack_changed != SACK_NOCHANGE)) {
2809				tp->t_dupacks++;
2810				/* limit overhead by setting maxseg last */
2811				if (!IN_FASTRECOVERY(tp->t_flags) &&
2812				    (tp->sackhint.sacked_bytes >
2813				    ((tcprexmtthresh - 1) *
2814				    (maxseg = tcp_maxseg(tp))))) {
2815					goto enter_recovery;
2816				}
2817			}
2818		}
2819
2820resume_partialack:
2821		KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
2822		    ("%s: th_ack <= snd_una", __func__));
2823
2824		/*
2825		 * If the congestion window was inflated to account
2826		 * for the other side's cached packets, retract it.
2827		 */
2828		if (SEQ_LT(th->th_ack, tp->snd_recover)) {
2829			if (IN_FASTRECOVERY(tp->t_flags)) {
2830				if (tp->t_flags & TF_SACK_PERMIT) {
2831					if (V_tcp_do_prr &&
2832					    (to.to_flags & TOF_SACK)) {
2833						tcp_timer_activate(tp,
2834						    TT_REXMT, 0);
2835						tp->t_rtttime = 0;
2836						tcp_do_prr_ack(tp, th, &to,
2837						    sack_changed, &maxseg);
2838						tp->t_flags |= TF_ACKNOW;
2839						(void) tcp_output(tp);
2840					} else {
2841						tcp_sack_partialack(tp, th,
2842						    &maxseg);
2843					}
2844				} else {
2845					tcp_newreno_partial_ack(tp, th);
2846				}
2847			} else if (IN_CONGRECOVERY(tp->t_flags) &&
2848				    (V_tcp_do_prr)) {
2849				tp->sackhint.delivered_data =
2850				    BYTES_THIS_ACK(tp, th);
2851				tp->snd_fack = th->th_ack;
2852				/*
2853				 * During ECN cwnd reduction
2854				 * always use PRR-SSRB
2855				 */
2856				tcp_do_prr_ack(tp, th, &to, SACK_CHANGE,
2857				    &maxseg);
2858				(void) tcp_output(tp);
2859			}
2860		}
2861		/*
2862		 * If we reach this point, ACK is not a duplicate,
2863		 *     i.e., it ACKs something we sent.
2864		 */
2865		if (tp->t_flags & TF_NEEDSYN) {
2866			/*
2867			 * T/TCP: Connection was half-synchronized, and our
2868			 * SYN has been ACK'd (so connection is now fully
2869			 * synchronized).  Go to non-starred state,
2870			 * increment snd_una for ACK of SYN, and check if
2871			 * we can do window scaling.
2872			 */
2873			tp->t_flags &= ~TF_NEEDSYN;
2874			tp->snd_una++;
2875			/* Do window scaling? */
2876			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2877				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
2878				tp->rcv_scale = tp->request_r_scale;
2879				/* Send window already scaled. */
2880			}
2881		}
2882
2883process_ACK:
2884		INP_WLOCK_ASSERT(inp);
2885
2886		/*
2887		 * Adjust for the SYN bit in sequence space,
2888		 * but don't account for it in cwnd calculations.
2889		 * This is for the SYN_RECEIVED, non-simultaneous
2890		 * SYN case. SYN_SENT and simultaneous SYN are
2891		 * treated elsewhere.
2892		 */
2893		if (incforsyn)
2894			tp->snd_una++;
2895		acked = BYTES_THIS_ACK(tp, th);
		KASSERT(acked >= 0, ("%s: acked unexpectedly negative "
2897		    "(tp->snd_una=%u, th->th_ack=%u, tp=%p, m=%p)", __func__,
2898		    tp->snd_una, th->th_ack, tp, m));
2899		TCPSTAT_ADD(tcps_rcvackpack, nsegs);
2900		TCPSTAT_ADD(tcps_rcvackbyte, acked);
2901
2902		/*
2903		 * If we just performed our first retransmit, and the ACK
2904		 * arrives within our recovery window, then it was a mistake
2905		 * to do the retransmit in the first place.  Recover our
2906		 * original cwnd and ssthresh, and proceed to transmit where
2907		 * we left off.
2908		 */
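		/*
		 * Concretely: if the echoed timestamp predates
		 * t_badrxtwin, the ACK was generated for the original
		 * transmission, not the retransmission, so the RTO was
		 * spurious.
		 */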
2909		if (tp->t_rxtshift == 1 &&
2910		    tp->t_flags & TF_PREVVALID &&
2911		    tp->t_badrxtwin != 0 &&
2912		    to.to_flags & TOF_TS &&
2913		    to.to_tsecr != 0 &&
2914		    TSTMP_LT(to.to_tsecr, tp->t_badrxtwin))
2915			cc_cong_signal(tp, th, CC_RTO_ERR);
2916
2917		/*
2918		 * If we have a timestamp reply, update smoothed
2919		 * round trip time.  If no timestamp is present but
2920		 * transmit timer is running and timed sequence
2921		 * number was acked, update smoothed round trip time.
2922		 * Since we now have an rtt measurement, cancel the
2923		 * timer backoff (cf., Phil Karn's retransmit alg.).
2924		 * Recompute the initial retransmit timer.
2925		 *
2926		 * Some boxes send broken timestamp replies
		 * during the SYN+ACK phase; ignore
		 * timestamps of 0, or we could calculate a
2929		 * huge RTT and blow up the retransmit timer.
2930		 */
2931		if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) {
2932			uint32_t t;
2933
2934			t = tcp_ts_getticks() - to.to_tsecr;
2935			if (!tp->t_rttlow || tp->t_rttlow > t)
2936				tp->t_rttlow = t;
2937			tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1);
2938		} else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
2939			if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
2940				tp->t_rttlow = ticks - tp->t_rtttime;
2941			tcp_xmit_timer(tp, ticks - tp->t_rtttime);
2942		}
2943
2944		SOCKBUF_LOCK(&so->so_snd);
2945		/*
2946		 * Clear t_acktime if remote side has ACKd all data in the
2947		 * socket buffer and FIN (if applicable).
2948		 * Otherwise, update t_acktime if we received a sufficiently
2949		 * large ACK.
2950		 */
2951		if ((tp->t_state <= TCPS_CLOSE_WAIT &&
2952		    acked == sbavail(&so->so_snd)) ||
2953		    acked > sbavail(&so->so_snd))
2954			tp->t_acktime = 0;
2955		else if (acked > 1)
2956			tp->t_acktime = ticks;
2957
2958		/*
2959		 * If all outstanding data is acked, stop retransmit
2960		 * timer and remember to restart (more output or persist).
2961		 * If there is more data to be acked, restart retransmit
2962		 * timer, using current (possibly backed-off) value.
2963		 */
2964		if (th->th_ack == tp->snd_max) {
2965			tcp_timer_activate(tp, TT_REXMT, 0);
2966			needoutput = 1;
2967		} else if (!tcp_timer_active(tp, TT_PERSIST))
2968			tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp));
2969
2970		/*
2971		 * If no data (only SYN) was ACK'd,
2972		 *    skip rest of ACK processing.
2973		 */
2974		if (acked == 0) {
2975			SOCKBUF_UNLOCK(&so->so_snd);
2976			goto step6;
2977		}
2978
2979		/*
2980		 * Let the congestion control algorithm update congestion
2981		 * control related information. This typically means increasing
2982		 * the congestion window.
2983		 */
2984		cc_ack_received(tp, th, nsegs, CC_ACK);
2985
2986		if (acked > sbavail(&so->so_snd)) {
2987			if (tp->snd_wnd >= sbavail(&so->so_snd))
2988				tp->snd_wnd -= sbavail(&so->so_snd);
2989			else
2990				tp->snd_wnd = 0;
2991			mfree = sbcut_locked(&so->so_snd,
2992			    (int)sbavail(&so->so_snd));
2993			ourfinisacked = 1;
2994		} else {
2995			mfree = sbcut_locked(&so->so_snd, acked);
2996			if (tp->snd_wnd >= (uint32_t) acked)
2997				tp->snd_wnd -= acked;
2998			else
2999				tp->snd_wnd = 0;
3000			ourfinisacked = 0;
3001		}
3002		/* NB: sowwakeup_locked() does an implicit unlock. */
3003		sowwakeup_locked(so);
3004		m_freem(mfree);
3005		/* Detect una wraparound. */
3006		if (!IN_RECOVERY(tp->t_flags) &&
3007		    SEQ_GT(tp->snd_una, tp->snd_recover) &&
3008		    SEQ_LEQ(th->th_ack, tp->snd_recover))
3009			tp->snd_recover = th->th_ack - 1;
3010		tp->snd_una = th->th_ack;
3011		if (IN_RECOVERY(tp->t_flags) &&
3012		    SEQ_GEQ(th->th_ack, tp->snd_recover)) {
3013			cc_post_recovery(tp, th);
3014		}
3015		if (tp->t_flags & TF_SACK_PERMIT) {
3016			if (SEQ_GT(tp->snd_una, tp->snd_recover))
3017				tp->snd_recover = tp->snd_una;
3018		}
3019		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
3020			tp->snd_nxt = tp->snd_una;
3021
3022		switch (tp->t_state) {
3023		/*
3024		 * In FIN_WAIT_1 STATE in addition to the processing
3025		 * for the ESTABLISHED state if our FIN is now acknowledged
3026		 * then enter FIN_WAIT_2.
3027		 */
3028		case TCPS_FIN_WAIT_1:
3029			if (ourfinisacked) {
3030				/*
3031				 * If we can't receive any more
3032				 * data, then closing user can proceed.
3033				 * Starting the timer is contrary to the
3034				 * specification, but if we don't get a FIN
3035				 * we'll hang forever.
3036				 */
3037				if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3038					tcp_free_sackholes(tp);
3039					soisdisconnected(so);
3040					tcp_timer_activate(tp, TT_2MSL,
3041					    (tcp_fast_finwait2_recycle ?
3042					    tcp_finwait2_timeout :
3043					    TP_MAXIDLE(tp)));
3044				}
3045				tcp_state_change(tp, TCPS_FIN_WAIT_2);
3046			}
3047			break;
3048
3049		/*
3050		 * In CLOSING STATE in addition to the processing for
3051		 * the ESTABLISHED state if the ACK acknowledges our FIN
3052		 * then enter the TIME-WAIT state, otherwise ignore
3053		 * the segment.
3054		 */
3055		case TCPS_CLOSING:
3056			if (ourfinisacked) {
3057				tcp_twstart(tp);
3058				m_freem(m);
3059				return;
3060			}
3061			break;
3062
3063		/*
3064		 * In LAST_ACK, we may still be waiting for data to drain
3065		 * and/or to be acked, as well as for the ack of our FIN.
3066		 * If our FIN is now acknowledged, delete the TCB,
3067		 * enter the closed state and return.
3068		 */
3069		case TCPS_LAST_ACK:
3070			if (ourfinisacked) {
3071				tp = tcp_close(tp);
3072				goto drop;
3073			}
3074			break;
3075		}
3076	}
3077
3078step6:
3079	INP_WLOCK_ASSERT(inp);
3080
3081	/*
3082	 * Update window information.
3083	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
3084	 */
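	/*
	 * snd_wl1 and snd_wl2 record the SEQ and ACK of the segment
	 * that last updated the window, so the check below (the
	 * RFC 793 acceptability test) only takes window updates from
	 * segments no older than that one; at equal SEQ and ACK the
	 * window is only allowed to grow.
	 */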
3085	if ((thflags & TH_ACK) &&
3086	    (SEQ_LT(tp->snd_wl1, th->th_seq) ||
3087	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
3088	     (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
3089		/* keep track of pure window updates */
3090		if (tlen == 0 &&
3091		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
3092			TCPSTAT_INC(tcps_rcvwinupd);
3093		tp->snd_wnd = tiwin;
3094		tp->snd_wl1 = th->th_seq;
3095		tp->snd_wl2 = th->th_ack;
3096		if (tp->snd_wnd > tp->max_sndwnd)
3097			tp->max_sndwnd = tp->snd_wnd;
3098		needoutput = 1;
3099	}
3100
3101	/*
3102	 * Process segments with URG.
3103	 */
3104	if ((thflags & TH_URG) && th->th_urp &&
3105	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
3106		/*
3107		 * This is a kludge, but if we receive and accept
3108		 * random urgent pointers, we'll crash in
3109		 * soreceive.  It's hard to imagine someone
3110		 * actually wanting to send this much urgent data.
3111		 */
3112		SOCKBUF_LOCK(&so->so_rcv);
3113		if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
3114			th->th_urp = 0;			/* XXX */
3115			thflags &= ~TH_URG;		/* XXX */
3116			SOCKBUF_UNLOCK(&so->so_rcv);	/* XXX */
3117			goto dodata;			/* XXX */
3118		}
3119		/*
3120		 * If this segment advances the known urgent pointer,
3121		 * then mark the data stream.  This should not happen
3122		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
3123		 * a FIN has been received from the remote side.
3124		 * In these states we ignore the URG.
3125		 *
3126		 * According to RFC961 (Assigned Protocols),
3127		 * the urgent pointer points to the last octet
3128		 * of urgent data.  We continue, however,
3129		 * to consider it to indicate the first octet
3130		 * of data past the urgent section as the original
3131		 * spec states (in one of two places).
3132		 */
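		/*
		 * so_oobmark then becomes the offset of the urgent
		 * byte within the receive buffer: data already queued
		 * plus the distance from rcv_nxt to the (adjusted)
		 * urgent pointer, minus one.
		 */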
3133		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
3134			tp->rcv_up = th->th_seq + th->th_urp;
3135			so->so_oobmark = sbavail(&so->so_rcv) +
3136			    (tp->rcv_up - tp->rcv_nxt) - 1;
3137			if (so->so_oobmark == 0)
3138				so->so_rcv.sb_state |= SBS_RCVATMARK;
3139			sohasoutofband(so);
3140			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
3141		}
3142		SOCKBUF_UNLOCK(&so->so_rcv);
3143		/*
		 * Remove out of band data so it doesn't get presented
		 * to the user.
3145		 * This can happen independent of advancing the URG pointer,
3146		 * but if two URG's are pending at once, some out-of-band
3147		 * data may creep in... ick.
3148		 */
3149		if (th->th_urp <= (uint32_t)tlen &&
3150		    !(so->so_options & SO_OOBINLINE)) {
3151			/* hdr drop is delayed */
3152			tcp_pulloutofband(so, th, m, drop_hdrlen);
3153		}
3154	} else {
3155		/*
3156		 * If no out of band data is expected,
3157		 * pull receive urgent pointer along
3158		 * with the receive window.
3159		 */
3160		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
3161			tp->rcv_up = tp->rcv_nxt;
3162	}
3163dodata:							/* XXX */
3164	INP_WLOCK_ASSERT(inp);
3165
3166	/*
3167	 * Process the segment text, merging it into the TCP sequencing queue,
3168	 * and arranging for acknowledgment of receipt if necessary.
3169	 * This process logically involves adjusting tp->rcv_wnd as data
3170	 * is presented to the user (this happens in tcp_usrreq.c,
3171	 * case PRU_RCVD).  If a FIN has already been received on this
3172	 * connection then we just ignore the text.
3173	 */
3174	tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
3175	    (tp->t_flags & TF_FASTOPEN));
3176	if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) &&
3177	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
3178		tcp_seq save_start = th->th_seq;
3179		tcp_seq save_rnxt  = tp->rcv_nxt;
3180		int     save_tlen  = tlen;
3181		m_adj(m, drop_hdrlen);	/* delayed header drop */
3182		/*
3183		 * Insert segment which includes th into TCP reassembly queue
3184		 * with control block tp.  Set thflags to whether reassembly now
3185		 * includes a segment with FIN.  This handles the common case
3186		 * inline (segment is the next to be received on an established
3187		 * connection, and the queue is empty), avoiding linkage into
3188		 * and removal from the queue and repetition of various
3189		 * conversions.
3190		 * Set DELACK for segments received in order, but ack
3191		 * immediately when segments are out of order (so
3192		 * fast retransmit can work).
3193		 */
3194		if (th->th_seq == tp->rcv_nxt &&
3195		    SEGQ_EMPTY(tp) &&
3196		    (TCPS_HAVEESTABLISHED(tp->t_state) ||
3197		     tfo_syn)) {
3198			if (DELAY_ACK(tp, tlen) || tfo_syn)
3199				tp->t_flags |= TF_DELACK;
3200			else
3201				tp->t_flags |= TF_ACKNOW;
3202			tp->rcv_nxt += tlen;
3203			if (tlen &&
3204			    ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
3205			    (tp->t_fbyte_in == 0)) {
3206				tp->t_fbyte_in = ticks;
3207				if (tp->t_fbyte_in == 0)
3208					tp->t_fbyte_in = 1;
3209				if (tp->t_fbyte_out && tp->t_fbyte_in)
3210					tp->t_flags2 |= TF2_FBYTES_COMPLETE;
3211			}
3212			thflags = tcp_get_flags(th) & TH_FIN;
3213			TCPSTAT_INC(tcps_rcvpack);
3214			TCPSTAT_ADD(tcps_rcvbyte, tlen);
3215			SOCKBUF_LOCK(&so->so_rcv);
3216			if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
3217				m_freem(m);
3218			else
3219				sbappendstream_locked(&so->so_rcv, m, 0);
3220			tp->t_flags |= TF_WAKESOR;
3221		} else {
3222			/*
3223			 * XXX: Due to the header drop above "th" is
3224			 * theoretically invalid by now.  Fortunately
			 * m_adj() doesn't actually free any mbufs
3226			 * when trimming from the head.
3227			 */
3228			tcp_seq temp = save_start;
3229
3230			thflags = tcp_reass(tp, th, &temp, &tlen, m);
3231			tp->t_flags |= TF_ACKNOW;
3232		}
3233		if ((tp->t_flags & TF_SACK_PERMIT) &&
3234		    (save_tlen > 0) &&
3235		    TCPS_HAVEESTABLISHED(tp->t_state)) {
3236			if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
3237				/*
				 * DSACK was already handled in the
				 * fast path above.
3240				 */
3241				tcp_update_sack_list(tp, save_start,
3242				    save_start + save_tlen);
3243			} else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
3244				if ((tp->rcv_numsacks >= 1) &&
3245				    (tp->sackblks[0].end == save_start)) {
3246					/*
3247					 * Partial overlap, recorded at todrop
3248					 * above.
3249					 */
3250					tcp_update_sack_list(tp,
3251					    tp->sackblks[0].start,
3252					    tp->sackblks[0].end);
3253				} else {
3254					tcp_update_dsack_list(tp, save_start,
3255					    save_start + save_tlen);
3256				}
3257			} else if (tlen >= save_tlen) {
3258				/* Update of sackblks. */
3259				tcp_update_dsack_list(tp, save_start,
3260				    save_start + save_tlen);
3261			} else if (tlen > 0) {
3262				tcp_update_dsack_list(tp, save_start,
3263				    save_start + tlen);
3264			}
3265		}
3266		tcp_handle_wakeup(tp);
3267#if 0
3268		/*
3269		 * Note the amount of data that peer has sent into
3270		 * our window, in order to estimate the sender's
3271		 * buffer size.
3272		 * XXX: Unused.
3273		 */
3274		if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
3275			len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
3276		else
3277			len = so->so_rcv.sb_hiwat;
3278#endif
3279	} else {
3280		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
3281			if (tlen > 0) {
3282				if ((thflags & TH_FIN) != 0) {
3283					log(LOG_DEBUG, "%s; %s: %s: "
3284					    "Received %d bytes of data and FIN "
3285					    "after having received a FIN, "
3286					    "just dropping both\n",
3287					    s, __func__,
3288					    tcpstates[tp->t_state], tlen);
3289				} else {
3290					log(LOG_DEBUG, "%s; %s: %s: "
3291					    "Received %d bytes of data "
3292					    "after having received a FIN, "
3293					    "just dropping it\n",
3294					    s, __func__,
3295					    tcpstates[tp->t_state], tlen);
3296				}
3297			} else {
3298				if ((thflags & TH_FIN) != 0) {
3299					log(LOG_DEBUG, "%s; %s: %s: "
3300					    "Received FIN "
3301					    "after having received a FIN, "
3302					    "just dropping it\n",
3303					    s, __func__,
3304					    tcpstates[tp->t_state]);
3305				}
3306			}
3307			free(s, M_TCPLOG);
3308		}
3309		m_freem(m);
3310		thflags &= ~TH_FIN;
3311	}
3312
3313	/*
3314	 * If FIN is received ACK the FIN and let the user know
3315	 * that the connection is closing.
3316	 */
3317	if (thflags & TH_FIN) {
3318		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
3319			/* The socket upcall is handled by socantrcvmore. */
3320			socantrcvmore(so);
3321			/*
3322			 * If connection is half-synchronized
3323			 * (ie NEEDSYN flag on) then delay ACK,
3324			 * so it may be piggybacked when SYN is sent.
3325			 * Otherwise, since we received a FIN then no
3326			 * more input can be expected, send ACK now.
3327			 */
3328			if (tp->t_flags & TF_NEEDSYN)
3329				tp->t_flags |= TF_DELACK;
3330			else
3331				tp->t_flags |= TF_ACKNOW;
3332			tp->rcv_nxt++;
3333		}
3334		switch (tp->t_state) {
3335		/*
3336		 * In SYN_RECEIVED and ESTABLISHED STATES
3337		 * enter the CLOSE_WAIT state.
3338		 */
3339		case TCPS_SYN_RECEIVED:
3340			tp->t_starttime = ticks;
3341			/* FALLTHROUGH */
3342		case TCPS_ESTABLISHED:
3343			tcp_state_change(tp, TCPS_CLOSE_WAIT);
3344			break;
3345
3346		/*
3347		 * If still in FIN_WAIT_1 STATE FIN has not been acked so
3348		 * enter the CLOSING state.
3349		 */
3350		case TCPS_FIN_WAIT_1:
3351			tcp_state_change(tp, TCPS_CLOSING);
3352			break;
3353
3354		/*
3355		 * In FIN_WAIT_2 state enter the TIME_WAIT state,
3356		 * starting the time-wait timer, turning off the other
3357		 * standard timers.
3358		 */
3359		case TCPS_FIN_WAIT_2:
3360			tcp_twstart(tp);
3361			return;
3362		}
3363	}
3364	TCP_PROBE3(debug__input, tp, th, m);
3365
3366	/*
3367	 * Return any desired output.
3368	 */
3369	if (needoutput || (tp->t_flags & TF_ACKNOW)) {
3370		(void) tcp_output(tp);
3371	}
3372check_delack:
3373	INP_WLOCK_ASSERT(inp);
3374
3375	if (tp->t_flags & TF_DELACK) {
3376		tp->t_flags &= ~TF_DELACK;
3377		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
3378	}
3379	INP_WUNLOCK(inp);
3380	return;
3381
3382dropafterack:
	/*
	 * Generate an ACK and drop the incoming segment if it occupies
	 * sequence space; the ACK reflects our state.
	 *
	 * We can now skip the test for the RST flag, since all
	 * paths to this code happen after packets containing
	 * RST have been dropped.
	 *
	 * In the SYN-RECEIVED state, don't send an ACK unless the
	 * segment we received passes the SYN-RECEIVED ACK test.
	 * If it fails, send a RST.  This breaks the loop in the
	 * "LAND" DoS attack, and also prevents an ACK storm
	 * between two listening ports that have been sent forged
	 * SYN segments, each with the source address of the other.
	 */
3398	if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
3399	    (SEQ_GT(tp->snd_una, th->th_ack) ||
	     SEQ_GT(th->th_ack, tp->snd_max))) {
3401		rstreason = BANDLIM_RST_OPENPORT;
3402		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
3403		goto dropwithreset;
3404	}
3405	TCP_PROBE3(debug__input, tp, th, m);
3406	tp->t_flags |= TF_ACKNOW;
3407	(void) tcp_output(tp);
3408	INP_WUNLOCK(inp);
3409	m_freem(m);
3410	return;
3411
3412dropwithreset:
3413	if (tp != NULL) {
3414		tcp_dropwithreset(m, th, tp, tlen, rstreason);
3415		INP_WUNLOCK(inp);
3416	} else
3417		tcp_dropwithreset(m, th, NULL, tlen, rstreason);
3418	return;
3419
3420drop:
3421	/*
3422	 * Drop space held by incoming segment and return.
3423	 */
3424	TCP_PROBE3(debug__input, tp, th, m);
3425	if (tp != NULL) {
3426		INP_WUNLOCK(inp);
3427	}
3428	m_freem(m);
3429}
3430
/*
 * Issue a RST and make the ACK acceptable to the originator of the
 * segment.  The mbuf must still include the original packet header.
 * tp may be NULL.
 */
3436void
3437tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
3438    int tlen, int rstreason)
3439{
3440#ifdef INET
3441	struct ip *ip;
3442#endif
3443#ifdef INET6
3444	struct ip6_hdr *ip6;
3445#endif
3446
3447	if (tp != NULL) {
3448		INP_LOCK_ASSERT(tptoinpcb(tp));
3449	}
3450
	/*
	 * Don't reply to a RST, and don't bother if the destination
	 * was broadcast/multicast.
	 */
3452	if ((tcp_get_flags(th) & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
3453		goto drop;
3454#ifdef INET6
3455	if (mtod(m, struct ip *)->ip_v == 6) {
3456		ip6 = mtod(m, struct ip6_hdr *);
3457		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
3458		    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
3459			goto drop;
3460		/* IPv6 anycast check is done at tcp6_input() */
3461	}
3462#endif
3463#if defined(INET) && defined(INET6)
3464	else
3465#endif
3466#ifdef INET
3467	{
3468		ip = mtod(m, struct ip *);
3469		if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
3470		    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
3471		    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
3472		    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
3473			goto drop;
3474	}
3475#endif
3476
3477	/* Perform bandwidth limiting. */
3478	if (badport_bandlim(rstreason) < 0)
3479		goto drop;
3480
3481	/* tcp_respond consumes the mbuf chain. */
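	/*
	 * A sketch of the RFC 793 reset-generation rules applied here:
	 * if the offending segment carried an ACK, the RST borrows that
	 * ACK value as its own sequence number and carries no ACK;
	 * otherwise we send RST|ACK, acknowledging everything the
	 * segment occupied (SYN and FIN each count as one sequence
	 * number, hence the tlen adjustments).
	 */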
3482	if (tcp_get_flags(th) & TH_ACK) {
3483		tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0,
3484		    th->th_ack, TH_RST);
3485	} else {
3486		if (tcp_get_flags(th) & TH_SYN)
3487			tlen++;
3488		if (tcp_get_flags(th) & TH_FIN)
3489			tlen++;
3490		tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
3491		    (tcp_seq)0, TH_RST|TH_ACK);
3492	}
3493	return;
3494drop:
3495	m_freem(m);
3496}
3497
/*
 * Parse TCP options and place them in tcpopt.
 */
3501void
3502tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
3503{
3504	int opt, optlen;
3505
3506	to->to_flags = 0;
3507	for (; cnt > 0; cnt -= optlen, cp += optlen) {
3508		opt = cp[0];
3509		if (opt == TCPOPT_EOL)
3510			break;
3511		if (opt == TCPOPT_NOP)
3512			optlen = 1;
3513		else {
3514			if (cnt < 2)
3515				break;
3516			optlen = cp[1];
3517			if (optlen < 2 || optlen > cnt)
3518				break;
3519		}
3520		switch (opt) {
3521		case TCPOPT_MAXSEG:
3522			if (optlen != TCPOLEN_MAXSEG)
3523				continue;
3524			if (!(flags & TO_SYN))
3525				continue;
3526			to->to_flags |= TOF_MSS;
3527			bcopy((char *)cp + 2,
3528			    (char *)&to->to_mss, sizeof(to->to_mss));
3529			to->to_mss = ntohs(to->to_mss);
3530			break;
3531		case TCPOPT_WINDOW:
3532			if (optlen != TCPOLEN_WINDOW)
3533				continue;
3534			if (!(flags & TO_SYN))
3535				continue;
3536			to->to_flags |= TOF_SCALE;
3537			to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT);
3538			break;
3539		case TCPOPT_TIMESTAMP:
3540			if (optlen != TCPOLEN_TIMESTAMP)
3541				continue;
3542			to->to_flags |= TOF_TS;
3543			bcopy((char *)cp + 2,
3544			    (char *)&to->to_tsval, sizeof(to->to_tsval));
3545			to->to_tsval = ntohl(to->to_tsval);
3546			bcopy((char *)cp + 6,
3547			    (char *)&to->to_tsecr, sizeof(to->to_tsecr));
3548			to->to_tsecr = ntohl(to->to_tsecr);
3549			break;
3550		case TCPOPT_SIGNATURE:
			/*
			 * In order to reply to a host which has set the
			 * TCP_SIGNATURE option in its initial SYN, we have
			 * to record here that the option was observed, so
			 * that the syncache code can construct the correct
			 * response.
			 */
3558			if (optlen != TCPOLEN_SIGNATURE)
3559				continue;
3560			to->to_flags |= TOF_SIGNATURE;
3561			to->to_signature = cp + 2;
3562			break;
3563		case TCPOPT_SACK_PERMITTED:
3564			if (optlen != TCPOLEN_SACK_PERMITTED)
3565				continue;
3566			if (!(flags & TO_SYN))
3567				continue;
3568			if (!V_tcp_do_sack)
3569				continue;
3570			to->to_flags |= TOF_SACKPERM;
3571			break;
3572		case TCPOPT_SACK:
3573			if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
3574				continue;
3575			if (flags & TO_SYN)
3576				continue;
3577			to->to_flags |= TOF_SACK;
3578			to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
3579			to->to_sacks = cp + 2;
3580			TCPSTAT_INC(tcps_sack_rcv_blocks);
3581			break;
3582		case TCPOPT_FAST_OPEN:
3583			/*
3584			 * Cookie length validation is performed by the
3585			 * server side cookie checking code or the client
3586			 * side cookie cache update code.
3587			 */
3588			if (!(flags & TO_SYN))
3589				continue;
3590			if (!V_tcp_fastopen_client_enable &&
3591			    !V_tcp_fastopen_server_enable)
3592				continue;
3593			to->to_flags |= TOF_FASTOPEN;
3594			to->to_tfo_len = optlen - 2;
3595			to->to_tfo_cookie = to->to_tfo_len ? cp + 2 : NULL;
3596			break;
3597		default:
3598			continue;
3599		}
3600	}
3601}
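
/*
 * An illustrative (not normative) usage sketch: a caller points cp
 * just past the fixed TCP header and passes TO_SYN when the segment
 * carries a SYN, e.g.:
 *
 *	struct tcpopt to;
 *
 *	tcp_dooptions(&to, (u_char *)(th + 1),
 *	    (th->th_off << 2) - sizeof(struct tcphdr),
 *	    (thflags & TH_SYN) ? TO_SYN : 0);
 *	if (to.to_flags & TOF_MSS)
 *		peermss = to.to_mss;
 *
 * Here "peermss" and "thflags" stand in for locals of the caller.
 */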
3602
/*
 * Pull the out-of-band byte out of a segment so that it doesn't
 * appear in the user's data queue.  It is still reflected in the
 * segment length for sequencing purposes.
 */
3609void
3610tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
3611    int off)
3612{
3613	int cnt = off + th->th_urp - 1;
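	/*
	 * For illustration: with off == 20 (the data starts 20 bytes
	 * into the chain) and th_urp == 1, cnt == 20, i.e. the urgent
	 * byte is the first data byte of the segment.
	 */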
3614
3615	while (cnt >= 0) {
3616		if (m->m_len > cnt) {
3617			char *cp = mtod(m, caddr_t) + cnt;
3618			struct tcpcb *tp = sototcpcb(so);
3619
3620			INP_WLOCK_ASSERT(tptoinpcb(tp));
3621
3622			tp->t_iobc = *cp;
3623			tp->t_oobflags |= TCPOOB_HAVEDATA;
3624			bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
3625			m->m_len--;
3626			if (m->m_flags & M_PKTHDR)
3627				m->m_pkthdr.len--;
3628			return;
3629		}
3630		cnt -= m->m_len;
3631		m = m->m_next;
3632		if (m == NULL)
3633			break;
3634	}
3635	panic("tcp_pulloutofband");
3636}
3637
/*
 * Collect a new round-trip time estimate
 * and update the averages and current timeout.
 */
3642void
3643tcp_xmit_timer(struct tcpcb *tp, int rtt)
3644{
3645	int delta;
3646
3647	INP_WLOCK_ASSERT(tptoinpcb(tp));
3648
3649	TCPSTAT_INC(tcps_rttupdated);
3650	if (tp->t_rttupdated < UCHAR_MAX)
3651		tp->t_rttupdated++;
3652#ifdef STATS
3653	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT,
3654	    imax(0, rtt * 1000 / hz));
3655#endif
3656	if ((tp->t_srtt != 0) && (tp->t_rxtshift <= TCP_RTT_INVALIDATE)) {
		/*
		 * srtt is stored as fixed point with 5 bits after the
		 * binary point (i.e., scaled by 32).  The following magic
		 * is equivalent to the smoothing algorithm in rfc793 with
		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
		 * point).  Adjust rtt to origin 0.
		 */
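		/*
		 * Illustrative numbers (assuming TCP_RTT_SHIFT == 5 and
		 * TCP_DELTA_SHIFT == 2): with t_srtt == 320 (10 ticks
		 * scaled by 32) and rtt == 18 ticks, delta = (17 << 2) -
		 * (320 >> 3) = 68 - 40 = 28, so t_srtt becomes 348, i.e.
		 * 10.875 ticks: the old 10 plus 1/8 of the 7-tick error,
		 * as the alpha of .875 prescribes.
		 */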
3664		delta = ((rtt - 1) << TCP_DELTA_SHIFT)
3665			- (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
3666
3667		if ((tp->t_srtt += delta) <= 0)
3668			tp->t_srtt = 1;
3669
3670		/*
3671		 * We accumulate a smoothed rtt variance (actually, a
3672		 * smoothed mean difference), then set the retransmit
3673		 * timer to smoothed rtt + 4 times the smoothed variance.
3674		 * rttvar is stored as fixed point with 4 bits after the
3675		 * binary point (scaled by 16).  The following is
3676		 * equivalent to rfc793 smoothing with an alpha of .75
3677		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
3678		 * rfc793's wired-in beta.
3679		 */
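		/*
		 * Continuing the example above (TCP_RTTVAR_SHIFT == 4):
		 * with t_rttvar == 48 (3 ticks scaled by 16) and the
		 * delta of 28 computed above, |delta| - (48 >> 2) == 16,
		 * so t_rttvar becomes 64, i.e. 4 ticks: 3/4 of the old
		 * 3 ticks plus 1/4 of the 7-tick mean deviation.
		 */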
3680		if (delta < 0)
3681			delta = -delta;
3682		delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
3683		if ((tp->t_rttvar += delta) <= 0)
3684			tp->t_rttvar = 1;
3685	} else {
3686		/*
3687		 * No rtt measurement yet - use the unsmoothed rtt.
3688		 * Set the variance to half the rtt (so our first
3689		 * retransmit happens at 3*rtt).
3690		 */
3691		tp->t_srtt = rtt << TCP_RTT_SHIFT;
3692		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
3693	}
3694	tp->t_rtttime = 0;
3695	tp->t_rxtshift = 0;
3696
	/*
	 * The retransmit should happen at rtt + 4 * rttvar.
	 * Because of the way we do the smoothing, srtt and rttvar
	 * will each average +1/2 tick of bias.  When we compute
	 * the retransmit timer, we want 1/2 tick of rounding and
	 * 1 extra tick because of the +-1/2 tick uncertainty in the
	 * firing of the timer.  The bias will give us exactly the
	 * 1.5 ticks we need.  But, because the bias is
	 * statistical, we have to test that we don't drop below
	 * the minimum feasible timer (which is 2 ticks).
	 */
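	/*
	 * A worked example under the same assumptions as above: the core
	 * of TCP_REXMTVAL() (a sketch of its usual definition) is
	 * ((t_srtt >> 3) + t_rttvar) >> TCP_DELTA_SHIFT, which
	 * approximates srtt + 4 * rttvar in ticks.  With t_srtt == 348
	 * and t_rttvar == 64 this gives ((348 >> 3) + 64) >> 2 = 26
	 * ticks, which TCPT_RANGESET() then clamps into
	 * [max(t_rttmin, rtt + 2), TCPTV_REXMTMAX].
	 */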
3708	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
3709		      max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
3710
3711	/*
3712	 * We received an ack for a packet that wasn't retransmitted;
3713	 * it is probably safe to discard any error indications we've
3714	 * received recently.  This isn't quite right, but close enough
3715	 * for now (a route might have failed after we sent a segment,
3716	 * and the return path might not be symmetrical).
3717	 */
3718	tp->t_softerror = 0;
3719}
3720
3721/*
3722 * Determine a reasonable value for maxseg size.
3723 * If the route is known, check route for mtu.
3724 * If none, use an mss that can be handled on the outgoing interface
3725 * without forcing IP to fragment.  If no route is found, route has no mtu,
3726 * or the destination isn't local, use a default, hopefully conservative
3727 * size (usually 512 or the default IP max size, but no more than the mtu
3728 * of the interface), as we can't discover anything about intervening
3729 * gateways or networks.  We also initialize the congestion/slow start
3730 * window to be a single segment if the destination isn't local.
3731 * While looking at the routing entry, we also initialize other path-dependent
3732 * parameters from pre-set or cached values in the routing entry.
3733 *
3734 * NOTE that resulting t_maxseg doesn't include space for TCP options or
3735 * IP options, e.g. IPSEC data, since length of this data may vary, and
3736 * thus it is calculated for every segment separately in tcp_output().
3737 *
3738 * NOTE that this routine is only called when we process an incoming
 * segment, or an ICMP "fragmentation needed" datagram.  Outgoing SYN/ACK
 * MSS settings are handled in tcp_mssopt().
3741 */
3742void
3743tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
3744    struct hc_metrics_lite *metricptr, struct tcp_ifcap *cap)
3745{
3746	int mss = 0;
3747	uint32_t maxmtu = 0;
3748	struct inpcb *inp = tptoinpcb(tp);
3749	struct hc_metrics_lite metrics;
3750#ifdef INET6
3751	int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
3752	size_t min_protoh = isipv6 ?
3753			    sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
3754			    sizeof (struct tcpiphdr);
3755#else
	size_t min_protoh = sizeof(struct tcpiphdr);
3757#endif
3758
3759	INP_WLOCK_ASSERT(inp);
3760
3761	if (tp->t_port)
3762		min_protoh += V_tcp_udp_tunneling_overhead;
3763	if (mtuoffer != -1) {
3764		KASSERT(offer == -1, ("%s: conflict", __func__));
3765		offer = mtuoffer - min_protoh;
3766	}
3767
3768	/* Initialize. */
3769#ifdef INET6
3770	if (isipv6) {
3771		maxmtu = tcp_maxmtu6(&inp->inp_inc, cap);
3772		tp->t_maxseg = V_tcp_v6mssdflt;
3773	}
3774#endif
3775#if defined(INET) && defined(INET6)
3776	else
3777#endif
3778#ifdef INET
3779	{
3780		maxmtu = tcp_maxmtu(&inp->inp_inc, cap);
3781		tp->t_maxseg = V_tcp_mssdflt;
3782	}
3783#endif
3784
3785	/*
3786	 * No route to sender, stay with default mss and return.
3787	 */
3788	if (maxmtu == 0) {
3789		/*
3790		 * In case we return early we need to initialize metrics
3791		 * to a defined state as tcp_hc_get() would do for us
3792		 * if there was no cache hit.
3793		 */
3794		if (metricptr != NULL)
3795			bzero(metricptr, sizeof(struct hc_metrics_lite));
3796		return;
3797	}
3798
3799	/* What have we got? */
3800	switch (offer) {
3801		case 0:
			/*
			 * Offer == 0 means that there was no MSS on the SYN
			 * segment; in this case we use tcp_mssdflt as
			 * already assigned to t_maxseg above.
			 */
3807			offer = tp->t_maxseg;
3808			break;
3809
3810		case -1:
			/*
			 * Offer == -1 means that we didn't receive a SYN
			 * yet.
			 */
3814			/* FALLTHROUGH */
3815
3816		default:
			/*
			 * Prevent a DoS attack with a too-small MSS.  Round
			 * it up to at least minmss.
			 */
3821			offer = max(offer, V_tcp_minmss);
3822	}
3823
3824	/*
3825	 * rmx information is now retrieved from tcp_hostcache.
3826	 */
3827	tcp_hc_get(&inp->inp_inc, &metrics);
3828	if (metricptr != NULL)
3829		bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite));
3830
	/*
	 * If there's a discovered MTU in the tcp hostcache, use it.
	 * Otherwise, use the link MTU.
	 */
3835	if (metrics.rmx_mtu)
3836		mss = min(metrics.rmx_mtu, maxmtu) - min_protoh;
3837	else {
3838#ifdef INET6
3839		if (isipv6) {
3840			mss = maxmtu - min_protoh;
3841			if (!V_path_mtu_discovery &&
3842			    !in6_localaddr(&inp->in6p_faddr))
3843				mss = min(mss, V_tcp_v6mssdflt);
3844		}
3845#endif
3846#if defined(INET) && defined(INET6)
3847		else
3848#endif
3849#ifdef INET
3850		{
3851			mss = maxmtu - min_protoh;
3852			if (!V_path_mtu_discovery &&
3853			    !in_localaddr(inp->inp_faddr))
3854				mss = min(mss, V_tcp_mssdflt);
3855		}
3856#endif
3857		/*
3858		 * XXX - The above conditional (mss = maxmtu - min_protoh)
3859		 * probably violates the TCP spec.
3860		 * The problem is that, since we don't know the
3861		 * other end's MSS, we are supposed to use a conservative
3862		 * default.  But, if we do that, then MTU discovery will
3863		 * never actually take place, because the conservative
3864		 * default is much less than the MTUs typically seen
3865		 * on the Internet today.  For the moment, we'll sweep
3866		 * this under the carpet.
3867		 *
3868		 * The conservative default might not actually be a problem
3869		 * if the only case this occurs is when sending an initial
3870		 * SYN with options and data to a host we've never talked
3871		 * to before.  Then, they will reply with an MSS value which
3872		 * will get recorded and the new parameters should get
3873		 * recomputed.  For Further Study.
3874		 */
3875	}
3876	mss = min(mss, offer);
3877
	/*
	 * Sanity check: make sure that maxseg will be large
	 * enough to allow some data on segments even if all
	 * the option space is used (40 bytes).  Otherwise
	 * funny things may happen in tcp_output.
	 *
	 * XXXGL: shouldn't we reserve space for IP/IPv6 options?
	 */
3886	mss = max(mss, 64);
3887
3888	tp->t_maxseg = mss;
3889	if (tp->t_maxseg < V_tcp_mssdflt) {
3890		/*
3891		 * The MSS is so small we should not process incoming
3892		 * SACK's since we are subject to attack in such a
3893		 * case.
3894		 */
3895		tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT;
3896	} else {
		tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT;
	}
}
3901
3902void
3903tcp_mss(struct tcpcb *tp, int offer)
3904{
3905	int mss;
3906	uint32_t bufsize;
3907	struct inpcb *inp = tptoinpcb(tp);
3908	struct socket *so;
3909	struct hc_metrics_lite metrics;
3910	struct tcp_ifcap cap;
3911
3912	KASSERT(tp != NULL, ("%s: tp == NULL", __func__));
3913
3914	bzero(&cap, sizeof(cap));
3915	tcp_mss_update(tp, offer, -1, &metrics, &cap);
3916
3917	mss = tp->t_maxseg;
3918
	/*
	 * If there's a pipesize, change the socket buffer to that size,
	 * but don't change it if sb_hiwat differs from the default (then
	 * it has been changed on purpose with setsockopt).
	 * Make the socket buffers an integral number of mss units;
	 * if the mss is larger than the socket buffer, decrease the mss.
	 */
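	/*
	 * For illustration (hypothetical values): with mss == 1448 and a
	 * 32768-byte send buffer, bufsize is rounded up to 23 * 1448 =
	 * 33304 bytes, subject to the sb_max cap below.
	 */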
3926	so = inp->inp_socket;
3927	SOCKBUF_LOCK(&so->so_snd);
3928	if ((so->so_snd.sb_hiwat == V_tcp_sendspace) && metrics.rmx_sendpipe)
3929		bufsize = metrics.rmx_sendpipe;
3930	else
3931		bufsize = so->so_snd.sb_hiwat;
3932	if (bufsize < mss)
3933		mss = bufsize;
3934	else {
3935		bufsize = roundup(bufsize, mss);
3936		if (bufsize > sb_max)
3937			bufsize = sb_max;
3938		if (bufsize > so->so_snd.sb_hiwat)
3939			(void)sbreserve_locked(so, SO_SND, bufsize, NULL);
3940	}
3941	SOCKBUF_UNLOCK(&so->so_snd);
	/*
	 * Sanity check: make sure that maxseg will be large
	 * enough to allow some data on segments even if all
	 * the option space is used (40 bytes).  Otherwise
	 * funny things may happen in tcp_output.
	 *
	 * XXXGL: shouldn't we reserve space for IP/IPv6 options?
	 */
3950	tp->t_maxseg = max(mss, 64);
3951	if (tp->t_maxseg < V_tcp_mssdflt) {
3952		/*
3953		 * The MSS is so small we should not process incoming
3954		 * SACK's since we are subject to attack in such a
3955		 * case.
3956		 */
3957		tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT;
3958	} else {
3959		tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT;
3960	}
3961
3962	SOCKBUF_LOCK(&so->so_rcv);
3963	if ((so->so_rcv.sb_hiwat == V_tcp_recvspace) && metrics.rmx_recvpipe)
3964		bufsize = metrics.rmx_recvpipe;
3965	else
3966		bufsize = so->so_rcv.sb_hiwat;
3967	if (bufsize > mss) {
3968		bufsize = roundup(bufsize, mss);
3969		if (bufsize > sb_max)
3970			bufsize = sb_max;
3971		if (bufsize > so->so_rcv.sb_hiwat)
3972			(void)sbreserve_locked(so, SO_RCV, bufsize, NULL);
3973	}
3974	SOCKBUF_UNLOCK(&so->so_rcv);
3975
3976	/* Check the interface for TSO capabilities. */
3977	if (cap.ifcap & CSUM_TSO) {
3978		tp->t_flags |= TF_TSO;
3979		tp->t_tsomax = cap.tsomax;
3980		tp->t_tsomaxsegcount = cap.tsomaxsegcount;
3981		tp->t_tsomaxsegsize = cap.tsomaxsegsize;
3982	}
3983}
3984
3985/*
3986 * Determine the MSS option to send on an outgoing SYN.
3987 */
3988int
3989tcp_mssopt(struct in_conninfo *inc)
3990{
3991	int mss = 0;
3992	uint32_t thcmtu = 0;
3993	uint32_t maxmtu = 0;
3994	size_t min_protoh;
3995
3996	KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));
3997
3998#ifdef INET6
3999	if (inc->inc_flags & INC_ISIPV6) {
4000		mss = V_tcp_v6mssdflt;
4001		maxmtu = tcp_maxmtu6(inc, NULL);
4002		min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
4003	}
4004#endif
4005#if defined(INET) && defined(INET6)
4006	else
4007#endif
4008#ifdef INET
4009	{
4010		mss = V_tcp_mssdflt;
4011		maxmtu = tcp_maxmtu(inc, NULL);
4012		min_protoh = sizeof(struct tcpiphdr);
4013	}
4014#endif
4015#if defined(INET6) || defined(INET)
4016	thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
4017#endif
4018
4019	if (maxmtu && thcmtu)
4020		mss = min(maxmtu, thcmtu) - min_protoh;
4021	else if (maxmtu || thcmtu)
4022		mss = max(maxmtu, thcmtu) - min_protoh;
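	/*
	 * For illustration, on IPv4 (min_protoh == 40): with a link
	 * maxmtu of 1500 and a cached thcmtu of 1280, mss = 1280 - 40 =
	 * 1240; with no hostcache entry (thcmtu == 0), mss = 1500 - 40 =
	 * 1460.
	 */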
4023
4024	return (mss);
4025}
4026
4027void
4028tcp_do_prr_ack(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to,
4029    sackstatus_t sack_changed, u_int *maxsegp)
4030{
4031	int snd_cnt = 0, limit = 0, del_data = 0, pipe = 0;
4032	u_int maxseg;
4033
4034	INP_WLOCK_ASSERT(tptoinpcb(tp));
4035
4036	if (*maxsegp == 0) {
4037		*maxsegp = tcp_maxseg(tp);
4038	}
4039	maxseg = *maxsegp;
4040	/*
4041	 * Compute the amount of data that this ACK is indicating
4042	 * (del_data) and an estimate of how many bytes are in the
4043	 * network.
4044	 */
4045	if (tcp_is_sack_recovery(tp, to) ||
4046	    (IN_CONGRECOVERY(tp->t_flags) &&
4047	     !IN_FASTRECOVERY(tp->t_flags))) {
4048		del_data = tp->sackhint.delivered_data;
4049		if (V_tcp_do_newsack)
4050			pipe = tcp_compute_pipe(tp);
4051		else
4052			pipe = (tp->snd_nxt - tp->snd_fack) +
4053				tp->sackhint.sack_bytes_rexmit;
4054	} else {
4055		if (tp->sackhint.prr_delivered < (tcprexmtthresh * maxseg +
4056					     tp->snd_recover - tp->snd_una)) {
4057			del_data = maxseg;
4058		}
4059		pipe = imax(0, tp->snd_max - tp->snd_una -
4060			    imin(INT_MAX / 65536, tp->t_dupacks) * maxseg);
4061	}
4062	tp->sackhint.prr_delivered += del_data;
	/*
	 * Proportional Rate Reduction (RFC 6937): while more than
	 * ssthresh bytes are estimated to be in the network, clock out
	 * new data at a rate of ssthresh / recover_fs per delivered
	 * byte.
	 */
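	/*
	 * A worked example with illustrative values (maxseg == 1000,
	 * snd_ssthresh == 10000, recover_fs == 20000, prr_delivered ==
	 * 2000, prr_out == 0): snd_cnt = howmany(2000 * 10000, 20000) -
	 * 0 + 1000 - 1 = 1999 bytes, i.e. one full segment once divided
	 * by maxseg below; the running prr_delivered/prr_out totals keep
	 * the cumulative send rate at ssthresh / recover_fs of the
	 * delivery rate.
	 */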
4066	if (pipe >= tp->snd_ssthresh) {
4067		if (tp->sackhint.recover_fs == 0)
4068			tp->sackhint.recover_fs =
4069			    imax(1, tp->snd_nxt - tp->snd_una);
4070		snd_cnt = howmany((long)tp->sackhint.prr_delivered *
4071			    tp->snd_ssthresh, tp->sackhint.recover_fs) -
4072			    tp->sackhint.prr_out + maxseg - 1;
4073	} else {
		/*
		 * PRR 6937bis heuristic:
		 * - A partial ack without a SACK block beneath
		 * snd_recover indicates further loss.
		 * - A SACK scoreboard update adding a new hole indicates
		 * further loss, so be conservative and send at most one
		 * segment.
		 * - Prevent ACK splitting attacks by being conservative
		 * when no new data is acked.
		 */
4084		if ((sack_changed == SACK_NEWLOSS) || (del_data == 0)) {
4085			limit = tp->sackhint.prr_delivered -
4086				tp->sackhint.prr_out;
4087		} else {
4088			limit = imax(tp->sackhint.prr_delivered -
4089				    tp->sackhint.prr_out, del_data) +
4090				    maxseg;
4091		}
4092		snd_cnt = imin((tp->snd_ssthresh - pipe), limit);
4093	}
4094	snd_cnt = imax(snd_cnt, 0) / maxseg;
4095	/*
4096	 * Send snd_cnt new data into the network in response to this ack.
4097	 * If there is going to be a SACK retransmission, adjust snd_cwnd
4098	 * accordingly.
4099	 */
4100	if (IN_FASTRECOVERY(tp->t_flags)) {
4101		if (tcp_is_sack_recovery(tp, to)) {
4102			tp->snd_cwnd = tp->snd_nxt - tp->snd_recover +
4103					    tp->sackhint.sack_bytes_rexmit +
4104					    (snd_cnt * maxseg);
4105		} else {
4106			tp->snd_cwnd = (tp->snd_max - tp->snd_una) +
4107					    (snd_cnt * maxseg);
4108		}
4109	} else if (IN_CONGRECOVERY(tp->t_flags)) {
4110		tp->snd_cwnd = pipe - del_data + (snd_cnt * maxseg);
4111	}
4112	tp->snd_cwnd = imax(maxseg, tp->snd_cwnd);
4113}
4114
/*
 * When a partial ack arrives, force the retransmission of the
 * next unacknowledged segment.  Do not clear tp->t_dupacks.
 * By setting snd_nxt to th_ack, this forces the retransmission timer
 * to be started again.
 */
4121void
4122tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
4123{
4124	tcp_seq onxt = tp->snd_nxt;
4125	uint32_t ocwnd = tp->snd_cwnd;
4126	u_int maxseg = tcp_maxseg(tp);
4127
4128	INP_WLOCK_ASSERT(tptoinpcb(tp));
4129
4130	tcp_timer_activate(tp, TT_REXMT, 0);
4131	tp->t_rtttime = 0;
4132	tp->snd_nxt = th->th_ack;
4133	/*
4134	 * Set snd_cwnd to one segment beyond acknowledged offset.
4135	 * (tp->snd_una has not yet been updated when this function is called.)
4136	 */
4137	tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th);
4138	tp->t_flags |= TF_ACKNOW;
4139	(void) tcp_output(tp);
4140	tp->snd_cwnd = ocwnd;
4141	if (SEQ_GT(onxt, tp->snd_nxt))
4142		tp->snd_nxt = onxt;
	/*
	 * Partial window deflation.  Relies on the fact that
	 * tp->snd_una has not been updated yet.
	 */
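	/*
	 * For illustration: if ocwnd was 10 * maxseg and this ACK newly
	 * covers 3 * maxseg, snd_cwnd becomes 10 * maxseg - 3 * maxseg +
	 * maxseg = 8 * maxseg.
	 */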
4147	if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th))
4148		tp->snd_cwnd -= BYTES_THIS_ACK(tp, th);
4149	else
4150		tp->snd_cwnd = 0;
4151	tp->snd_cwnd += maxseg;
4152}
4153
4154int
4155tcp_compute_pipe(struct tcpcb *tp)
4156{
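	/*
	 * The default below is an RFC 6675-style "pipe" estimate of the
	 * bytes outstanding in the network: snd_max - snd_una, plus the
	 * bytes retransmitted during SACK recovery, minus the bytes
	 * reported SACKed or deemed lost.  A TCP function block may
	 * supply its own estimator instead.
	 */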
4157	if (tp->t_fb->tfb_compute_pipe == NULL) {
4158		return (tp->snd_max - tp->snd_una +
4159			tp->sackhint.sack_bytes_rexmit -
4160			tp->sackhint.sacked_bytes -
4161			tp->sackhint.lost_bytes);
4162	} else {
		return ((*tp->t_fb->tfb_compute_pipe)(tp));
4164	}
4165}
4166
4167uint32_t
4168tcp_compute_initwnd(uint32_t maxseg)
4169{
	/*
	 * Calculate the Initial Window, also used as the Restart Window.
	 *
	 * RFC5681 Section 3.1 specifies the default conservative values.
	 * RFC3390 specifies slightly more aggressive values.
	 * RFC6928 increases it to ten segments.
	 * A user-specified value for the initial flight size takes
	 * precedence over all of these.
	 */
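	/*
	 * For a common maxseg of 1460 (illustrative): the RFC3390 branch
	 * yields min(5840, max(2920, 4380)) = 4380 bytes (three
	 * segments), the RFC5681 fallback likewise yields 3 * 1460 =
	 * 4380, and tcp_initcwnd_segments == 10 yields min(14600,
	 * max(2920, 14600)) = 14600 bytes.
	 */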
	if (V_tcp_initcwnd_segments)
		return (min(V_tcp_initcwnd_segments * maxseg,
		    max(2 * maxseg, V_tcp_initcwnd_segments * 1460)));
	else if (V_tcp_do_rfc3390)
		return (min(4 * maxseg, max(2 * maxseg, 4380)));
4183	else {
4184		/* Per RFC5681 Section 3.1 */
4185		if (maxseg > 2190)
4186			return (2 * maxseg);
4187		else if (maxseg > 1095)
4188			return (3 * maxseg);
4189		else
4190			return (4 * maxseg);
4191	}
4192}
4193