/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/netinet/tcp_timer.c 331722 2018-03-29 02:50:57Z eadler $");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_tcpdebug.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/if.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>
#include <net/netisr.h>

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif

int	tcp_persmin;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval");

int	tcp_persmax;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval");

int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");

int	tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin");

int	tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes");

int	tcp_delacktime;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
    "Time before a delayed ACK is sent");

int	tcp_msl;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");

int	tcp_rexmit_min;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
    "Minimum Retransmission Timeout");

int	tcp_rexmit_slop;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
    "Retransmission Timer Slop");

int	tcp_always_keepalive = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
    &tcp_always_keepalive, 0, "Assume SO_KEEPALIVE on all TCP connections");
__strong_reference(tcp_always_keepalive, always_keepalive);

int	tcp_fast_finwait2_recycle = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
    &tcp_fast_finwait2_recycle, 0,
    "Recycle closed FIN_WAIT_2 connections faster");

int	tcp_finwait2_timeout;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");

int	tcp_keepcnt = TCPTV_KEEPCNT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
    "Number of keepalive probes to send");

int	tcp_maxpersistidle;		/* max idle probes */

static int	tcp_rexmit_drop_options = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
    &tcp_rexmit_drop_options, 0,
    "Drop TCP options from 3rd and later retransmitted SYN");

static VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
#define	V_tcp_pmtud_blackhole_detect	VNET(tcp_pmtud_blackhole_detect)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
    "Path MTU Discovery Black Hole Detection Enabled");

static VNET_DEFINE(int, tcp_pmtud_blackhole_activated);
#define	V_tcp_pmtud_blackhole_activated \
    VNET(tcp_pmtud_blackhole_activated)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated,
    CTLFLAG_RD|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_activated), 0,
    "Path MTU Discovery Black Hole Detection, Activation Count");

static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss);
#define	V_tcp_pmtud_blackhole_activated_min_mss \
    VNET(tcp_pmtud_blackhole_activated_min_mss)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss,
    CTLFLAG_RD|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0,
    "Path MTU Discovery Black Hole Detection, Activation Count at min MSS");

static VNET_DEFINE(int, tcp_pmtud_blackhole_failed);
#define	V_tcp_pmtud_blackhole_failed	VNET(tcp_pmtud_blackhole_failed)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed,
    CTLFLAG_RD|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_failed), 0,
    "Path MTU Discovery Black Hole Detection, Failure Count");

#ifdef INET
static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
#define	V_tcp_pmtud_blackhole_mss	VNET(tcp_pmtud_blackhole_mss)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
    "Path MTU Discovery Black Hole Detection lowered MSS");
#endif

#ifdef INET6
static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
#define	V_tcp_v6pmtud_blackhole_mss	VNET(tcp_v6pmtud_blackhole_mss)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
    "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
#endif
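
/*
 * Note (illustrative): the IPv6 default of 1220 above is the IPv6
 * minimum MTU of 1280 less 40 bytes of IPv6 header and 20 bytes of TCP
 * header; the IPv4 default of 1200 is a conservative value, presumably
 * chosen to fit through common tunnel encapsulations.
 */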

#ifdef	RSS
static int	per_cpu_timers = 1;
#else
static int	per_cpu_timers = 0;
#endif
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
    &per_cpu_timers, 0, "run tcp timers on all cpus");

#if 0
#define	INP_CPU(inp)	(per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
		((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
#endif

/*
 * Map the given inp to a CPU id.
 *
 * This queries RSS if it's compiled in, else it defaults to the current
 * CPU ID.
 */
static inline int
inp_to_cpuid(struct inpcb *inp)
{
	u_int cpuid;

#ifdef	RSS
	if (per_cpu_timers) {
		cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
		if (cpuid == NETISR_CPUID_NONE)
			return (curcpu);	/* XXX */
		else
			return (cpuid);
	}
#else
	/* Legacy, pre-RSS behaviour */
	if (per_cpu_timers) {
		/*
		 * We don't have a flowid -> cpuid mapping, so cheat and
		 * just map unknown cpuids to curcpu.  Not the best, but
		 * apparently better than defaulting to swi 0.
		 */
		cpuid = inp->inp_flowid % (mp_maxid + 1);
		if (!CPU_ABSENT(cpuid))
			return (cpuid);
		return (curcpu);
	}
#endif
	/* Default for RSS and non-RSS - cpuid 0 */
	return (0);
}

/*
 * TCP protocol timeout routine called every 500 ms.
 * Updates timestamps used for TCP and causes finite state machine
 * actions if timers expire.
 */
void
tcp_slowtimo(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		(void) tcp_tw_2msl_scan(0);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}

int	tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

static int tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */
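
/*
 * Illustration only, not compiled: tcp_totbackoff must stay equal to
 * the sum of tcp_backoff[] (1 + 2 + 4 + ... + 512 + 512 + 512 + 512 =
 * 2559).  A hypothetical self-check could look like this sketch:
 */
#if 0
static void
tcp_check_totbackoff(void)
{
	int i, sum;

	sum = 0;
	for (i = 0; i <= TCP_MAXRXTSHIFT; i++)
		sum += tcp_backoff[i];
	KASSERT(sum == tcp_totbackoff,
	    ("tcp_totbackoff %d != %d", tcp_totbackoff, sum));
}
#endif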

/*
 * TCP timer processing.
 */

void
tcp_timer_delack(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);

	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_delack) ||
	    !callout_active(&tp->t_timers->tt_delack)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_delack);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0,
		("%s: tp %p delack callout should be running", __func__, tp));

	tp->t_flags |= TF_ACKNOW;
	TCPSTAT_INC(tcps_delack);
	(void) tp->t_fb->tfb_tcp_output(tp);
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
}

void
tcp_timer_2msl(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	tcp_free_sackholes(tp);
	if (callout_pending(&tp->t_timers->tt_2msl) ||
	    !callout_active(&tp->t_timers->tt_2msl)) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_2msl);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0,
		("%s: tp %p 2msl callout should be running", __func__, tp));
	/*
	 * The 2 MSL timeout in shutdown went off.  If we're closed but
	 * still waiting for the peer to close and the connection has been
	 * idle too long, delete the connection control block.  Otherwise,
	 * check again in a bit.
	 *
	 * If in TIME_WAIT state, just ignore this timeout; it is handled
	 * in tcp_tw_2msl_scan().
	 *
	 * If fast recycling of FIN_WAIT_2 connections is enabled, we are
	 * in FIN_WAIT_2, and the receiver has closed, there's no point in
	 * hanging onto this socket.  Just close it, ignoring the fact that
	 * there were recent incoming segments.
	 */
	if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
	    tp->t_inpcb && tp->t_inpcb->inp_socket &&
	    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
		TCPSTAT_INC(tcps_finwait2_drops);
		tp = tcp_close(tp);
	} else {
		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
			if (!callout_reset(&tp->t_timers->tt_2msl,
			    TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) {
				tp->t_timers->tt_flags &= ~TT_2MSL_RST;
			}
		} else
			tp = tcp_close(tp);
	}

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);

	if (tp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}

void
tcp_timer_keep(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct tcptemp *t_template;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_keep) ||
	    !callout_active(&tp->t_timers->tt_keep)) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_keep);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0,
		("%s: tp %p keep callout should be running", __func__, tp));
	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	TCPSTAT_INC(tcps_keeptimeo);
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
	if ((tcp_always_keepalive ||
	    (inp->inp_socket->so_options & SO_KEEPALIVE)) &&
	    tp->t_state <= TCPS_CLOSING) {
		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		TCPSTAT_INC(tcps_keepprobe);
		t_template = tcpip_maketemplate(inp);
		if (t_template) {
			tcp_respond(tp, t_template->tt_ipgen,
				    &t_template->tt_t, (struct mbuf *)NULL,
				    tp->rcv_nxt, tp->snd_una - 1, 0);
			free(t_template, M_TEMP);
		}
		if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
		    tcp_timer_keep, tp)) {
			tp->t_timers->tt_flags &= ~TT_KEEP_RST;
		}
	} else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
	    tcp_timer_keep, tp)) {
		tp->t_timers->tt_flags &= ~TT_KEEP_RST;
	}

#ifdef TCPDEBUG
	if (inp->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
	return;

dropit:
	TCPSTAT_INC(tcps_keepdrops);
	tp = tcp_drop(tp, ETIMEDOUT);

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	if (tp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}

void
tcp_timer_persist(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_persist) ||
	    !callout_active(&tp->t_timers->tt_persist)) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_persist);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0,
		("%s: tp %p persist callout should be running", __func__, tp));
	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	TCPSTAT_INC(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
	     ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
		TCPSTAT_INC(tcps_persistdrop);
		tp = tcp_drop(tp, ETIMEDOUT);
		goto out;
	}
	/*
	 * If the user has closed the socket then drop a persisting
	 * connection after a much reduced timeout.
	 */
	if (tp->t_state > TCPS_CLOSE_WAIT &&
	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
		TCPSTAT_INC(tcps_persistdrop);
		tp = tcp_drop(tp, ETIMEDOUT);
		goto out;
	}
	tcp_setpersist(tp);
	tp->t_flags |= TF_FORCEDATA;
	(void) tp->t_fb->tfb_tcp_output(tp);
	tp->t_flags &= ~TF_FORCEDATA;

out:
#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	if (tp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}

void
tcp_timer_rexmt(void *xtp)
{
	struct tcpcb *tp = xtp;
	int rexmt;
	int headlocked;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif

	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_rexmt) ||
	    !callout_active(&tp->t_timers->tt_rexmt)) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_rexmt);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0,
		("%s: tp %p rexmt callout should be running", __func__, tp));
	tcp_free_sackholes(tp);
	if (tp->t_fb->tfb_tcp_rexmit_tmr) {
		/* The stack has a timer action too. */
		(*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
	}
	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 */
	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
		tp->t_rxtshift = TCP_MAXRXTSHIFT;
		TCPSTAT_INC(tcps_timeoutdrop);

		tp = tcp_drop(tp, tp->t_softerror ?
			      tp->t_softerror : ETIMEDOUT);
		headlocked = 1;
		goto out;
	}
	INP_INFO_RUNLOCK(&V_tcbinfo);
	headlocked = 0;
	if (tp->t_state == TCPS_SYN_SENT) {
		/*
		 * If the SYN was retransmitted, indicate that CWND should
		 * be limited to 1 segment in cc_conn_init().
		 */
		tp->snd_cwnd = 1;
	} else if (tp->t_rxtshift == 1) {
		/*
		 * First retransmit; record ssthresh and cwnd so they can
		 * be recovered if this turns out to be a "bad" retransmit.
		 * A retransmit is considered "bad" if an ACK for this
		 * segment is received within RTT/2 interval; the assumption
		 * here is that the ACK was already in flight.  See
		 * "On Estimating End-to-End Network Path Properties" by
		 * Allman and Paxson for more details.
		 */
		tp->snd_cwnd_prev = tp->snd_cwnd;
		tp->snd_ssthresh_prev = tp->snd_ssthresh;
		tp->snd_recover_prev = tp->snd_recover;
		if (IN_FASTRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASFRECOVERY;
		else
			tp->t_flags &= ~TF_WASFRECOVERY;
		if (IN_CONGRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASCRECOVERY;
		else
			tp->t_flags &= ~TF_WASCRECOVERY;
		tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
		tp->t_flags |= TF_PREVVALID;
	} else
		tp->t_flags &= ~TF_PREVVALID;
	TCPSTAT_INC(tcps_rexmttimeo);
	if ((tp->t_state == TCPS_SYN_SENT) ||
	    (tp->t_state == TCPS_SYN_RECEIVED))
		rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift];
	else
		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
	TCPT_RANGESET(tp->t_rxtcur, rexmt,
		      tp->t_rttmin, TCPTV_REXMTMAX);
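
	/*
	 * Illustration (editorial note): with the tables above, an
	 * established connection that has timed out six times in a row
	 * waits TCP_REXMTVAL(tp) * tcp_backoff[6] (a 64x backoff) before
	 * the next retransmit, clamped to [t_rttmin, TCPTV_REXMTMAX].
	 */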

	/*
	 * We enter the path for PLMTUD if the connection is established or
	 * if the connection is in FIN_WAIT_1 state.  The reason for the
	 * latter is that if the amount of data we send is very small, we
	 * could send it in a couple of packets and proceed straight to FIN;
	 * in that case we won't catch the ESTABLISHED state.
	 */
	if (V_tcp_pmtud_blackhole_detect && ((tp->t_state == TCPS_ESTABLISHED) ||
	    (tp->t_state == TCPS_FIN_WAIT_1))) {
#ifdef INET6
		int isipv6;
#endif

		/*
		 * The idea here is that each stage of the MTU probe
		 * (usually 1448 -> 1188 -> 524) should be given two chances
		 * to recover before clamping down further.
		 * 'tp->t_rxtshift % 2 == 0' takes care of that.
		 */
		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
		    (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) {
			/*
			 * Enter Path MTU Black-hole Detection mechanism:
			 * - Disable Path MTU Discovery (IP "DF" bit).
			 * - Reduce MTU to lower value than what we
			 *   negotiated with peer.
			 */
			/* Record that we may have found a black hole. */
			tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;

			/* Keep track of previous MSS. */
			tp->t_pmtud_saved_maxseg = tp->t_maxseg;

			/*
			 * Reduce the MSS to blackhole value or to the default
			 * in an attempt to retransmit.
			 */
#ifdef INET6
			isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
			if (isipv6 &&
			    tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
				/* Use the sysctl tunable blackhole MSS. */
				tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
				V_tcp_pmtud_blackhole_activated++;
			} else if (isipv6) {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_v6mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch to
				 * minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				V_tcp_pmtud_blackhole_activated_min_mss++;
			}
#endif
#if defined(INET6) && defined(INET)
			else
#endif
#ifdef INET
			if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
				/* Use the sysctl tunable blackhole MSS. */
				tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
				V_tcp_pmtud_blackhole_activated++;
			} else {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch to
				 * minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				V_tcp_pmtud_blackhole_activated_min_mss++;
			}
#endif
			/*
			 * Reset the slow-start flight size
			 * as it may depend on the new MSS.
			 */
			if (CC_ALGO(tp)->conn_init != NULL)
				CC_ALGO(tp)->conn_init(tp->ccv);
		} else {
			/*
			 * If further retransmissions are still unsuccessful
			 * with a lowered MTU, maybe this isn't a blackhole,
			 * so restore the previous MSS and blackhole detection
			 * flags.
			 * The limit '6' is determined by giving each probe
			 * stage (1448, 1188, 524) 2 chances to recover.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
			    (tp->t_rxtshift > 6)) {
				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
				tp->t_maxseg = tp->t_pmtud_saved_maxseg;
				V_tcp_pmtud_blackhole_failed++;
				/*
				 * Reset the slow-start flight size as it
				 * may depend on the new MSS.
				 */
				if (CC_ALGO(tp)->conn_init != NULL)
					CC_ALGO(tp)->conn_init(tp->ccv);
			}
		}
	}

	/*
	 * Disable RFC1323 and SACK if we haven't got any response to
	 * our third SYN to work around some broken terminal servers
	 * (most of which have hopefully been retired) that have bad VJ
	 * header compression code which trashes TCP segments containing
	 * unknown-to-them TCP options.
	 */
	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
	    (tp->t_rxtshift == 3))
		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
	/*
	 * If we backed off this far, our srtt estimate is probably bogus.
	 * Clobber it so we'll take the next rtt measurement as our srtt;
	 * move the current srtt into rttvar to keep the current
	 * retransmit times until then.
	 */
	if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
#ifdef INET6
		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
			in6_losing(tp->t_inpcb);
		else
#endif
			in_losing(tp->t_inpcb);
		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
		tp->t_srtt = 0;
	}
	tp->snd_nxt = tp->snd_una;
	tp->snd_recover = tp->snd_max;
	/*
	 * Force a segment to be sent.
	 */
	tp->t_flags |= TF_ACKNOW;
	/*
	 * If timing a segment in this window, stop the timer.
	 */
	tp->t_rtttime = 0;

	cc_cong_signal(tp, NULL, CC_RTO);

	(void) tp->t_fb->tfb_tcp_output(tp);

out:
#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	if (tp != NULL)
		INP_WUNLOCK(inp);
	if (headlocked)
		INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}

void
tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
{
	struct callout *t_callout;
	timeout_t *f_callout;
	struct inpcb *inp = tp->t_inpcb;
	int cpu = inp_to_cpuid(inp);
	uint32_t f_reset;

#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE)
		return;
#endif

	if (tp->t_timers->tt_flags & TT_STOPPED)
		return;

	switch (timer_type) {
		case TT_DELACK:
			t_callout = &tp->t_timers->tt_delack;
			f_callout = tcp_timer_delack;
			f_reset = TT_DELACK_RST;
			break;
		case TT_REXMT:
			t_callout = &tp->t_timers->tt_rexmt;
			f_callout = tcp_timer_rexmt;
			f_reset = TT_REXMT_RST;
			break;
		case TT_PERSIST:
			t_callout = &tp->t_timers->tt_persist;
			f_callout = tcp_timer_persist;
			f_reset = TT_PERSIST_RST;
			break;
		case TT_KEEP:
			t_callout = &tp->t_timers->tt_keep;
			f_callout = tcp_timer_keep;
			f_reset = TT_KEEP_RST;
			break;
		case TT_2MSL:
			t_callout = &tp->t_timers->tt_2msl;
			f_callout = tcp_timer_2msl;
			f_reset = TT_2MSL_RST;
			break;
		default:
			if (tp->t_fb->tfb_tcp_timer_activate) {
				tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta);
				return;
			}
			panic("tp %p bad timer_type %#x", tp, timer_type);
	}
	if (delta == 0) {
		if ((tp->t_timers->tt_flags & timer_type) &&
		    (callout_stop(t_callout) > 0) &&
		    (tp->t_timers->tt_flags & f_reset)) {
			tp->t_timers->tt_flags &= ~(timer_type | f_reset);
		}
	} else {
		if ((tp->t_timers->tt_flags & timer_type) == 0) {
			tp->t_timers->tt_flags |= (timer_type | f_reset);
			callout_reset_on(t_callout, delta, f_callout, tp, cpu);
		} else {
			/* Reset already running callout on the same CPU. */
			if (!callout_reset(t_callout, delta, f_callout, tp)) {
				/*
				 * Callout not cancelled, consider it as not
				 * properly restarted.
				 */
				tp->t_timers->tt_flags &= ~f_reset;
			}
		}
	}
}
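
/*
 * Usage note (illustrative): callers arm a timer by passing a tick
 * count and disarm it by passing a delta of 0, e.g. the output path
 * restarts the retransmit timer with the current RTO:
 *
 *	tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 *	tcp_timer_activate(tp, TT_REXMT, 0);
 */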

int
tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;

	switch (timer_type) {
		case TT_DELACK:
			t_callout = &tp->t_timers->tt_delack;
			break;
		case TT_REXMT:
			t_callout = &tp->t_timers->tt_rexmt;
			break;
		case TT_PERSIST:
			t_callout = &tp->t_timers->tt_persist;
			break;
		case TT_KEEP:
			t_callout = &tp->t_timers->tt_keep;
			break;
		case TT_2MSL:
			t_callout = &tp->t_timers->tt_2msl;
			break;
		default:
			if (tp->t_fb->tfb_tcp_timer_active) {
				return (tp->t_fb->tfb_tcp_timer_active(tp, timer_type));
			}
			panic("tp %p bad timer_type %#x", tp, timer_type);
	}
	return (callout_active(t_callout));
}

void
tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;
	uint32_t f_reset;

	tp->t_timers->tt_flags |= TT_STOPPED;

	switch (timer_type) {
		case TT_DELACK:
			t_callout = &tp->t_timers->tt_delack;
			f_reset = TT_DELACK_RST;
			break;
		case TT_REXMT:
			t_callout = &tp->t_timers->tt_rexmt;
			f_reset = TT_REXMT_RST;
			break;
		case TT_PERSIST:
			t_callout = &tp->t_timers->tt_persist;
			f_reset = TT_PERSIST_RST;
			break;
		case TT_KEEP:
			t_callout = &tp->t_timers->tt_keep;
			f_reset = TT_KEEP_RST;
			break;
		case TT_2MSL:
			t_callout = &tp->t_timers->tt_2msl;
			f_reset = TT_2MSL_RST;
			break;
		default:
			if (tp->t_fb->tfb_tcp_timer_stop) {
				/*
				 * XXXrrs we need to look at this with the
				 * stop case below (flags).
				 */
				tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
				return;
			}
			panic("tp %p bad timer_type %#x", tp, timer_type);
	}

	if (tp->t_timers->tt_flags & timer_type) {
		if (callout_async_drain(t_callout, tcp_timer_discard) == 0) {
			/*
			 * Can't stop the callout; defer the actual tcpcb
			 * deletion to the last timer.  We do this using the
			 * async drain function and incrementing the count
			 * in tt_draincnt.
			 */
			tp->t_timers->tt_draincnt++;
		}
	}
}

#define	ticks_to_msecs(t)	(1000*(t) / hz)
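
/*
 * For example, with hz = 1000 the conversion above is an identity
 * (one tick per millisecond), while with hz = 100 each tick is 10 ms.
 */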

void
tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
    struct xtcp_timer *xtimer)
{
	sbintime_t now;

	bzero(xtimer, sizeof(*xtimer));
	if (timer == NULL)
		return;
	now = getsbinuptime();
	if (callout_active(&timer->tt_delack))
		xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS;
	if (callout_active(&timer->tt_rexmt))
		xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS;
	if (callout_active(&timer->tt_persist))
		xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS;
	if (callout_active(&timer->tt_keep))
		xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS;
	if (callout_active(&timer->tt_2msl))
		xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS;
	xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
}