1139823Simp/*-
210965Swollman * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
31541Srgrimes *	The Regents of the University of California.  All rights reserved.
41541Srgrimes *
51541Srgrimes * Redistribution and use in source and binary forms, with or without
61541Srgrimes * modification, are permitted provided that the following conditions
71541Srgrimes * are met:
81541Srgrimes * 1. Redistributions of source code must retain the above copyright
91541Srgrimes *    notice, this list of conditions and the following disclaimer.
101541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
111541Srgrimes *    notice, this list of conditions and the following disclaimer in the
121541Srgrimes *    documentation and/or other materials provided with the distribution.
131541Srgrimes * 4. Neither the name of the University nor the names of its contributors
141541Srgrimes *    may be used to endorse or promote products derived from this software
151541Srgrimes *    without specific prior written permission.
161541Srgrimes *
171541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
181541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
191541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
201541Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
211541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
221541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
231541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
241541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
251541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
261541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
271541Srgrimes * SUCH DAMAGE.
281541Srgrimes *
2910965Swollman *	@(#)tcp_output.c	8.4 (Berkeley) 5/24/95
301541Srgrimes */
311541Srgrimes
32172467Ssilby#include <sys/cdefs.h>
33172467Ssilby__FBSDID("$FreeBSD$");
34172467Ssilby
35125680Sbms#include "opt_inet.h"
3655679Sshin#include "opt_inet6.h"
3756041Sshin#include "opt_ipsec.h"
38254889Smarkj#include "opt_kdtrace.h"
3929514Sjoerg#include "opt_tcpdebug.h"
4029514Sjoerg
411541Srgrimes#include <sys/param.h>
421541Srgrimes#include <sys/systm.h>
4376166Smarkm#include <sys/domain.h>
44216758Slstewart#include <sys/hhook.h>
4547547Sdg#include <sys/kernel.h>
4676166Smarkm#include <sys/lock.h>
471541Srgrimes#include <sys/mbuf.h>
4876166Smarkm#include <sys/mutex.h>
491541Srgrimes#include <sys/protosw.h>
50254889Smarkj#include <sys/sdt.h>
511541Srgrimes#include <sys/socket.h>
521541Srgrimes#include <sys/socketvar.h>
5376166Smarkm#include <sys/sysctl.h>
541541Srgrimes
55185571Sbz#include <net/if.h>
561541Srgrimes#include <net/route.h>
57196019Srwatson#include <net/vnet.h>
581541Srgrimes
59215166Slstewart#include <netinet/cc.h>
601541Srgrimes#include <netinet/in.h>
61254889Smarkj#include <netinet/in_kdtrace.h>
621541Srgrimes#include <netinet/in_systm.h>
631541Srgrimes#include <netinet/ip.h>
641541Srgrimes#include <netinet/in_pcb.h>
6562587Sitojun#include <netinet/ip_var.h>
66152592Sandre#include <netinet/ip_options.h>
6755679Sshin#ifdef INET6
6855679Sshin#include <netinet6/in6_pcb.h>
6962587Sitojun#include <netinet/ip6.h>
7055679Sshin#include <netinet6/ip6_var.h>
7155679Sshin#endif
721541Srgrimes#define	TCPOUTFLAGS
731541Srgrimes#include <netinet/tcp_fsm.h>
741541Srgrimes#include <netinet/tcp_seq.h>
751541Srgrimes#include <netinet/tcp_timer.h>
761541Srgrimes#include <netinet/tcp_var.h>
771541Srgrimes#include <netinet/tcpip.h>
782788Sdg#ifdef TCPDEBUG
791541Srgrimes#include <netinet/tcp_debug.h>
802788Sdg#endif
81237263Snp#ifdef TCP_OFFLOAD
82237263Snp#include <netinet/tcp_offload.h>
83237263Snp#endif
841541Srgrimes
85171167Sgnn#ifdef IPSEC
86105199Ssam#include <netipsec/ipsec.h>
87171167Sgnn#endif /*IPSEC*/
88105199Ssam
8958698Sjlemon#include <machine/in_cksum.h>
9058698Sjlemon
91163606Srwatson#include <security/mac/mac_framework.h>
92163606Srwatson
93207369SbzVNET_DEFINE(int, path_mtu_discovery) = 1;
94195699SrwatsonSYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
95195699Srwatson	&VNET_NAME(path_mtu_discovery), 1,
96195699Srwatson	"Enable Path MTU Discovery");
971541Srgrimes
98207369SbzVNET_DEFINE(int, tcp_do_tso) = 1;
99207369Sbz#define	V_tcp_do_tso		VNET(tcp_do_tso)
100195699SrwatsonSYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW,
101195699Srwatson	&VNET_NAME(tcp_do_tso), 0,
102195699Srwatson	"Enable TCP Segmentation Offload");
103162110Sandre
104226448SandreVNET_DEFINE(int, tcp_sendspace) = 1024*32;
105226448Sandre#define	V_tcp_sendspace	VNET(tcp_sendspace)
106227034SpluknetSYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
107226448Sandre	&VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size");
108226448Sandre
109207369SbzVNET_DEFINE(int, tcp_do_autosndbuf) = 1;
110207369Sbz#define	V_tcp_do_autosndbuf	VNET(tcp_do_autosndbuf)
111195699SrwatsonSYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW,
112195699Srwatson	&VNET_NAME(tcp_do_autosndbuf), 0,
113195699Srwatson	"Enable automatic send buffer sizing");
114166405Sandre
115207369SbzVNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024;
116207369Sbz#define	V_tcp_autosndbuf_inc	VNET(tcp_autosndbuf_inc)
117195699SrwatsonSYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW,
118195699Srwatson	&VNET_NAME(tcp_autosndbuf_inc), 0,
119183550Szec	"Incrementor step size of automatic send buffer");
120166405Sandre
121225169SbzVNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024;
122207369Sbz#define	V_tcp_autosndbuf_max	VNET(tcp_autosndbuf_max)
123195699SrwatsonSYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
124195699Srwatson	&VNET_NAME(tcp_autosndbuf_max), 0,
125183550Szec	"Max size of automatic send buffer");
126166405Sandre
/* Forward declarations of the file-local inline helpers. */
static inline void	cc_after_idle(struct tcpcb *tp);
static inline void	hhook_run_tcp_est_out(struct tcpcb *tp,
			    struct tcphdr *th, struct tcpopt *to,
			    long len, int tso);
131166405Sandre
1321541Srgrimes/*
133242692Skevlo * Wrapper for the TCP established output helper hook.
134216758Slstewart */
135216758Slstewartstatic void inline
136216758Slstewarthhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th,
137216758Slstewart    struct tcpopt *to, long len, int tso)
138216758Slstewart{
139216758Slstewart	struct tcp_hhook_data hhook_data;
140216758Slstewart
141216758Slstewart	if (V_tcp_hhh[HHOOK_TCP_EST_OUT]->hhh_nhooks > 0) {
142216758Slstewart		hhook_data.tp = tp;
143216758Slstewart		hhook_data.th = th;
144216758Slstewart		hhook_data.to = to;
145216758Slstewart		hhook_data.len = len;
146216758Slstewart		hhook_data.tso = tso;
147216758Slstewart
148216758Slstewart		hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_OUT], &hhook_data,
149216758Slstewart		    tp->osd);
150216758Slstewart	}
151216758Slstewart}
152216758Slstewart
153216758Slstewart/*
154215166Slstewart * CC wrapper hook functions
155215166Slstewart */
156215166Slstewartstatic void inline
157215166Slstewartcc_after_idle(struct tcpcb *tp)
158215166Slstewart{
159215166Slstewart	INP_WLOCK_ASSERT(tp->t_inpcb);
160215166Slstewart
161215166Slstewart	if (CC_ALGO(tp)->after_idle != NULL)
162215166Slstewart		CC_ALGO(tp)->after_idle(tp->ccv);
163215166Slstewart}
164215166Slstewart
165215166Slstewart/*
1661541Srgrimes * Tcp output routine: figure out what should be sent and send it.
1671541Srgrimes */
1681541Srgrimesint
16998704Sluigitcp_output(struct tcpcb *tp)
1701541Srgrimes{
17198704Sluigi	struct socket *so = tp->t_inpcb->inp_socket;
172124849Sandre	long len, recwin, sendwin;
173221250Sbz	int off, flags, error = 0;	/* Keep compiler happy */
17498704Sluigi	struct mbuf *m;
17555679Sshin	struct ip *ip = NULL;
17698704Sluigi	struct ipovly *ipov = NULL;
17798704Sluigi	struct tcphdr *th;
1786283Swollman	u_char opt[TCP_MAXOLEN];
17936335Sfenner	unsigned ipoptlen, optlen, hdrlen;
180173835Sbz#ifdef IPSEC
181173835Sbz	unsigned ipsec_optlen = 0;
182173835Sbz#endif
1831541Srgrimes	int idle, sendalot;
184167606Sandre	int sack_rxmit, sack_bytes_rxmt;
185130989Sps	struct sackhole *p;
186238516Sglebius	int tso, mtu;
187167606Sandre	struct tcpopt to;
18887193Sdillon#if 0
18960067Sjlemon	int maxburst = TCP_MAXBURST;
19087193Sdillon#endif
19155679Sshin#ifdef INET6
19298704Sluigi	struct ip6_hdr *ip6 = NULL;
19355679Sshin	int isipv6;
1941541Srgrimes
19555679Sshin	isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
19655679Sshin#endif
19755679Sshin
198178285Srwatson	INP_WLOCK_ASSERT(tp->t_inpcb);
199101713Sjennifer
200237263Snp#ifdef TCP_OFFLOAD
201237263Snp	if (tp->t_flags & TF_TOE)
202237263Snp		return (tcp_offload_output(tp));
203237263Snp#endif
204237263Snp
2051541Srgrimes	/*
2061541Srgrimes	 * Determine length of data that should be transmitted,
2071541Srgrimes	 * and flags that will be used.
2081541Srgrimes	 * If there is some data or critical controls (SYN, RST)
2091541Srgrimes	 * to send, then transmit; otherwise, investigate further.
2101541Srgrimes	 */
21184564Sjayanth	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
212216105Slstewart	if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur)
213216105Slstewart		cc_after_idle(tp);
21484564Sjayanth	tp->t_flags &= ~TF_LASTIDLE;
21584564Sjayanth	if (idle) {
21684564Sjayanth		if (tp->t_flags & TF_MORETOCOME) {
21784564Sjayanth			tp->t_flags |= TF_LASTIDLE;
21884564Sjayanth			idle = 0;
21984564Sjayanth		}
22084564Sjayanth	}
2211541Srgrimesagain:
222130989Sps	/*
223130989Sps	 * If we've recently taken a timeout, snd_max will be greater than
224130989Sps	 * snd_nxt.  There may be SACK information that allows us to avoid
225130989Sps	 * resending already delivered data.  Adjust snd_nxt accordingly.
226130989Sps	 */
227169317Sandre	if ((tp->t_flags & TF_SACK_PERMIT) &&
228169317Sandre	    SEQ_LT(tp->snd_nxt, tp->snd_max))
229130989Sps		tcp_sack_adjust(tp);
2301541Srgrimes	sendalot = 0;
231211317Sandre	tso = 0;
232238516Sglebius	mtu = 0;
2331541Srgrimes	off = tp->snd_nxt - tp->snd_una;
234124849Sandre	sendwin = min(tp->snd_wnd, tp->snd_cwnd);
2351541Srgrimes
2361541Srgrimes	flags = tcp_outflags[tp->t_state];
2371541Srgrimes	/*
238130989Sps	 * Send any SACK-generated retransmissions.  If we're explicitly trying
239130989Sps	 * to send out new data (when sendalot is 1), bypass this function.
240130989Sps	 * If we retransmit in fast recovery mode, decrement snd_cwnd, since
241130989Sps	 * we're replacing a (future) new transmission with a retransmission
242130989Sps	 * now, and we previously incremented snd_cwnd in tcp_input().
243130989Sps	 */
244133874Srwatson	/*
245130989Sps	 * Still in sack recovery, reset rxmit flag to zero.
246130989Sps	 */
247130989Sps	sack_rxmit = 0;
248136151Sps	sack_bytes_rxmt = 0;
249130989Sps	len = 0;
250130989Sps	p = NULL;
251215166Slstewart	if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) &&
252136151Sps	    (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
253136151Sps		long cwin;
254136151Sps
255136151Sps		cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
256136151Sps		if (cwin < 0)
257136151Sps			cwin = 0;
258132417Sjayanth		/* Do not retransmit SACK segments beyond snd_recover */
259132417Sjayanth		if (SEQ_GT(p->end, tp->snd_recover)) {
260132417Sjayanth			/*
261133874Srwatson			 * (At least) part of sack hole extends beyond
262133874Srwatson			 * snd_recover. Check to see if we can rexmit data
263132417Sjayanth			 * for this hole.
264132417Sjayanth			 */
265132417Sjayanth			if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
266133874Srwatson				/*
267132417Sjayanth				 * Can't rexmit any more data for this hole.
268133874Srwatson				 * That data will be rexmitted in the next
269133874Srwatson				 * sack recovery episode, when snd_recover
270132417Sjayanth				 * moves past p->rxmit.
271132417Sjayanth				 */
272132417Sjayanth				p = NULL;
273132417Sjayanth				goto after_sack_rexmit;
274132417Sjayanth			} else
275132417Sjayanth				/* Can rexmit part of the current hole */
276136151Sps				len = ((long)ulmin(cwin,
277136151Sps						   tp->snd_recover - p->rxmit));
278132417Sjayanth		} else
279136151Sps			len = ((long)ulmin(cwin, p->end - p->rxmit));
280130989Sps		off = p->rxmit - tp->snd_una;
281133874Srwatson		KASSERT(off >= 0,("%s: sack block to the left of una : %d",
282132417Sjayanth		    __func__, off));
283130989Sps		if (len > 0) {
284137066Srwatson			sack_rxmit = 1;
285137066Srwatson			sendalot = 1;
286190948Srwatson			TCPSTAT_INC(tcps_sack_rexmits);
287190948Srwatson			TCPSTAT_ADD(tcps_sack_rexmit_bytes,
288190948Srwatson			    min(len, tp->t_maxseg));
289130989Sps		}
290130989Sps	}
291132417Sjayanthafter_sack_rexmit:
292130989Sps	/*
2936283Swollman	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
2946283Swollman	 * state flags.
2956283Swollman	 */
2966283Swollman	if (tp->t_flags & TF_NEEDFIN)
2976283Swollman		flags |= TH_FIN;
2986283Swollman	if (tp->t_flags & TF_NEEDSYN)
2996283Swollman		flags |= TH_SYN;
3006283Swollman
301136327Srwatson	SOCKBUF_LOCK(&so->so_snd);
3026283Swollman	/*
3031541Srgrimes	 * If in persist timeout with window of 0, send 1 byte.
3041541Srgrimes	 * Otherwise, if window is small but nonzero
3051541Srgrimes	 * and timer expired, we will send what we can
3061541Srgrimes	 * and go to transmit state.
3071541Srgrimes	 */
308146463Sps	if (tp->t_flags & TF_FORCEDATA) {
309124849Sandre		if (sendwin == 0) {
3101541Srgrimes			/*
3111541Srgrimes			 * If we still have some data to send, then
3121541Srgrimes			 * clear the FIN bit.  Usually this would
3131541Srgrimes			 * happen below when it realizes that we
3141541Srgrimes			 * aren't sending all the data.  However,
31545439Sjulian			 * if we have exactly 1 byte of unsent data,
3161541Srgrimes			 * then it won't clear the FIN bit below,
3171541Srgrimes			 * and if we are in persist state, we wind
3181541Srgrimes			 * up sending the packet without recording
3191541Srgrimes			 * that we sent the FIN bit.
3201541Srgrimes			 *
3211541Srgrimes			 * We can't just blindly clear the FIN bit,
3221541Srgrimes			 * because if we don't have any more data
3231541Srgrimes			 * to send then the probe will be the FIN
3241541Srgrimes			 * itself.
3251541Srgrimes			 */
3261541Srgrimes			if (off < so->so_snd.sb_cc)
3271541Srgrimes				flags &= ~TH_FIN;
328124849Sandre			sendwin = 1;
3291541Srgrimes		} else {
330168615Sandre			tcp_timer_activate(tp, TT_PERSIST, 0);
3311541Srgrimes			tp->t_rxtshift = 0;
3321541Srgrimes		}
3331541Srgrimes	}
3341541Srgrimes
335104815Sdillon	/*
336133874Srwatson	 * If snd_nxt == snd_max and we have transmitted a FIN, the
337104815Sdillon	 * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
338124849Sandre	 * a negative length.  This can also occur when TCP opens up
339104815Sdillon	 * its congestion window while receiving additional duplicate
340104815Sdillon	 * acks after fast-retransmit because TCP will reset snd_nxt
341104815Sdillon	 * to snd_max after the fast-retransmit.
342104815Sdillon	 *
343104815Sdillon	 * In the normal retransmit-FIN-only case, however, snd_nxt will
344104815Sdillon	 * be set to snd_una, the offset will be 0, and the length may
345104815Sdillon	 * wind up 0.
346133874Srwatson	 *
347130989Sps	 * If sack_rxmit is true we are retransmitting from the scoreboard
348133874Srwatson	 * in which case len is already set.
349104815Sdillon	 */
350136151Sps	if (sack_rxmit == 0) {
351136151Sps		if (sack_bytes_rxmt == 0)
352136151Sps			len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off);
353136151Sps		else {
354136151Sps			long cwin;
3551541Srgrimes
356136151Sps                        /*
357136151Sps			 * We are inside of a SACK recovery episode and are
358136151Sps			 * sending new data, having retransmitted all the
359136151Sps			 * data possible in the scoreboard.
360136151Sps			 */
361138199Sps			len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd)
362138199Sps			       - off);
363140138Sps			/*
364140138Sps			 * Don't remove this (len > 0) check !
365140138Sps			 * We explicitly check for len > 0 here (although it
366140138Sps			 * isn't really necessary), to work around a gcc
367140138Sps			 * optimization issue - to force gcc to compute
368140138Sps			 * len above. Without this check, the computation
369140138Sps			 * of len is bungled by the optimizer.
370140138Sps			 */
371140138Sps			if (len > 0) {
372140138Sps				cwin = tp->snd_cwnd -
373140138Sps					(tp->snd_nxt - tp->sack_newdata) -
374140138Sps					sack_bytes_rxmt;
375140138Sps				if (cwin < 0)
376140138Sps					cwin = 0;
377140138Sps				len = lmin(len, cwin);
378140138Sps			}
379136151Sps		}
380136151Sps	}
381136151Sps
3826283Swollman	/*
3836283Swollman	 * Lop off SYN bit if it has already been sent.  However, if this
3846283Swollman	 * is SYN-SENT state and if segment contains data and if we don't
3856283Swollman	 * know that foreign host supports TAO, suppress sending segment.
3866283Swollman	 */
3876283Swollman	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
388155961Sqingli		if (tp->t_state != TCPS_SYN_RECEIVED)
389155961Sqingli			flags &= ~TH_SYN;
3906283Swollman		off--, len++;
3916283Swollman	}
3926283Swollman
39313475Solah	/*
394137139Sandre	 * Be careful not to send data and/or FIN on SYN segments.
39513475Solah	 * This measure is needed to prevent interoperability problems
39613475Solah	 * with not fully conformant TCP implementations.
39713475Solah	 */
398137139Sandre	if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
39913475Solah		len = 0;
40013475Solah		flags &= ~TH_FIN;
40113475Solah	}
40213475Solah
403285780Sdelphij	if (len <= 0) {
4041541Srgrimes		/*
4051541Srgrimes		 * If FIN has been sent but not acked,
4061541Srgrimes		 * but we haven't been called to retransmit,
407104815Sdillon		 * len will be < 0.  Otherwise, window shrank
4081541Srgrimes		 * after we sent into it.  If window shrank to 0,
40915262Sdg		 * cancel pending retransmit, pull snd_nxt back
41015262Sdg		 * to (closed) window, and set the persist timer
41115262Sdg		 * if it isn't already going.  If the window didn't
41215262Sdg		 * close completely, just wait for an ACK.
413285780Sdelphij		 *
414285780Sdelphij		 * We also do a general check here to ensure that
415285780Sdelphij		 * we will set the persist timer when we have data
416285780Sdelphij		 * to send, but a 0-byte window. This makes sure
417285780Sdelphij		 * the persist timer is set even if the packet
418285780Sdelphij		 * hits one of the "goto send" lines below.
4191541Srgrimes		 */
4201541Srgrimes		len = 0;
421285780Sdelphij		if ((sendwin == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) &&
422285780Sdelphij			(off < (int) so->so_snd.sb_cc)) {
423168615Sandre			tcp_timer_activate(tp, TT_REXMT, 0);
42415262Sdg			tp->t_rxtshift = 0;
4251541Srgrimes			tp->snd_nxt = tp->snd_una;
426168615Sandre			if (!tcp_timer_active(tp, TT_PERSIST))
42715262Sdg				tcp_setpersist(tp);
4281541Srgrimes		}
4291541Srgrimes	}
430104815Sdillon
431166405Sandre	/* len will be >= 0 after this point. */
432182841Sbz	KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
433166405Sandre
434104815Sdillon	/*
435166405Sandre	 * Automatic sizing of send socket buffer.  Often the send buffer
436166405Sandre	 * size is not optimally adjusted to the actual network conditions
437166405Sandre	 * at hand (delay bandwidth product).  Setting the buffer size too
438166405Sandre	 * small limits throughput on links with high bandwidth and high
439166405Sandre	 * delay (eg. trans-continental/oceanic links).  Setting the
440166405Sandre	 * buffer size too big consumes too much real kernel memory,
441166405Sandre	 * especially with many connections on busy servers.
442162110Sandre	 *
443166405Sandre	 * The criteria to step up the send buffer one notch are:
444166405Sandre	 *  1. receive window of remote host is larger than send buffer
445166405Sandre	 *     (with a fudge factor of 5/4th);
446166405Sandre	 *  2. send buffer is filled to 7/8th with data (so we actually
447166405Sandre	 *     have data to make use of it);
448166405Sandre	 *  3. send buffer fill has not hit maximal automatic size;
449166405Sandre	 *  4. our send window (slow start and congestion controlled) is
450166405Sandre	 *     larger than sent but unacknowledged data in send buffer.
451166405Sandre	 *
452166405Sandre	 * The remote host receive window scaling factor may limit the
453166405Sandre	 * growing of the send buffer before it reaches its allowed
454166405Sandre	 * maximum.
455166405Sandre	 *
456166405Sandre	 * It scales directly with slow start or congestion window
457166405Sandre	 * and does at most one step per received ACK.  This fast
458166405Sandre	 * scaling has the drawback of growing the send buffer beyond
459166405Sandre	 * what is strictly necessary to make full use of a given
460166405Sandre	 * delay*bandwidth product.  However testing has shown this not
461166405Sandre	 * to be much of a problem.  At worst we are trading wasting
462166405Sandre	 * of available bandwidth (the non-use of it) for wasting some
463166405Sandre	 * socket buffer memory.
464166405Sandre	 *
465166405Sandre	 * TODO: Shrink send buffer during idle periods together
466166405Sandre	 * with congestion window.  Requires another timer.  Has to
467166405Sandre	 * wait for upcoming tcp timer rewrite.
468166405Sandre	 */
469181803Sbz	if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
470166405Sandre		if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
471166405Sandre		    so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
472181803Sbz		    so->so_snd.sb_cc < V_tcp_autosndbuf_max &&
473166405Sandre		    sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
474166405Sandre			if (!sbreserve_locked(&so->so_snd,
475181803Sbz			    min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc,
476181803Sbz			     V_tcp_autosndbuf_max), so, curthread))
477166405Sandre				so->so_snd.sb_flags &= ~SB_AUTOSIZE;
478166405Sandre		}
479166405Sandre	}
480166405Sandre
481166405Sandre	/*
482212803Sandre	 * Decide if we can use TCP Segmentation Offloading (if supported by
483212803Sandre	 * hardware).
484166405Sandre	 *
485162110Sandre	 * TSO may only be used if we are in a pure bulk sending state.  The
486162110Sandre	 * presence of TCP-MD5, SACK retransmits, SACK advertizements and
487162110Sandre	 * IP options prevent using TSO.  With TSO the TCP header is the same
488162110Sandre	 * (except for the sequence number) for all generated packets.  This
489162110Sandre	 * makes it impossible to transmit any options which vary per generated
490162110Sandre	 * segment or packet.
491104815Sdillon	 */
492173835Sbz#ifdef IPSEC
493173835Sbz	/*
494173835Sbz	 * Pre-calculate here as we save another lookup into the darknesses
495173835Sbz	 * of IPsec that way and can actually decide if TSO is ok.
496173835Sbz	 */
497173835Sbz	ipsec_optlen = ipsec_hdrsiz_tcp(tp);
498173835Sbz#endif
499212803Sandre	if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
500212803Sandre	    ((tp->t_flags & TF_SIGNATURE) == 0) &&
501212803Sandre	    tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
502173835Sbz#ifdef IPSEC
503212803Sandre	    ipsec_optlen == 0 &&
504173835Sbz#endif
505212803Sandre	    tp->t_inpcb->inp_options == NULL &&
506212803Sandre	    tp->t_inpcb->in6p_options == NULL)
507212803Sandre		tso = 1;
508211317Sandre
509132717Sjayanth	if (sack_rxmit) {
510132717Sjayanth		if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
511132717Sjayanth			flags &= ~TH_FIN;
512133874Srwatson	} else {
513132717Sjayanth		if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
514132717Sjayanth			flags &= ~TH_FIN;
515132717Sjayanth	}
5161541Srgrimes
517124849Sandre	recwin = sbspace(&so->so_rcv);
5181541Srgrimes
5191541Srgrimes	/*
52087193Sdillon	 * Sender silly window avoidance.   We transmit under the following
52187193Sdillon	 * conditions when len is non-zero:
52287193Sdillon	 *
523162110Sandre	 *	- We have a full segment (or more with TSO)
52487779Sjlemon	 *	- This is the last buffer in a write()/send() and we are
52587779Sjlemon	 *	  either idle or running NODELAY
52687779Sjlemon	 *	- we've timed out (e.g. persist timer)
52787779Sjlemon	 *	- we have more than 1/2 the maximum send window's worth of
52887779Sjlemon	 *	  data (receiver may have limited the window size)
52987779Sjlemon	 *	- we need to retransmit
5301541Srgrimes	 */
5311541Srgrimes	if (len) {
532162110Sandre		if (len >= tp->t_maxseg)
5331541Srgrimes			goto send;
53487193Sdillon		/*
53587193Sdillon		 * NOTE! on localhost connections an 'ack' from the remote
53687193Sdillon		 * end may occur synchronously with the output and cause
53787193Sdillon		 * us to flush a buffer queued with moretocome.  XXX
53887193Sdillon		 *
53987193Sdillon		 * note: the len + off check is almost certainly unnecessary.
54087193Sdillon		 */
54187779Sjlemon		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
54287193Sdillon		    (idle || (tp->t_flags & TF_NODELAY)) &&
54387193Sdillon		    len + off >= so->so_snd.sb_cc &&
54487193Sdillon		    (tp->t_flags & TF_NOPUSH) == 0) {
5451541Srgrimes			goto send;
54687193Sdillon		}
547146463Sps		if (tp->t_flags & TF_FORCEDATA)		/* typ. timeout case */
5481541Srgrimes			goto send;
5496283Swollman		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
5501541Srgrimes			goto send;
55187193Sdillon		if (SEQ_LT(tp->snd_nxt, tp->snd_max))	/* retransmit case */
5521541Srgrimes			goto send;
553130989Sps		if (sack_rxmit)
554130989Sps			goto send;
5551541Srgrimes	}
5561541Srgrimes
5571541Srgrimes	/*
558242252Sandre	 * Sending of standalone window updates.
559242251Sandre	 *
560242311Sandre	 * Window updates are important when we close our window due to a
561242311Sandre	 * full socket buffer and are opening it again after the application
562242252Sandre	 * reads data from it.  Once the window has opened again and the
563242252Sandre	 * remote end starts to send again the ACK clock takes over and
564242252Sandre	 * provides the most current window information.
565242252Sandre	 *
566242311Sandre	 * We must avoid the silly window syndrome whereas every read
567242252Sandre	 * from the receive buffer, no matter how small, causes a window
568242252Sandre	 * update to be sent.  We also should avoid sending a flurry of
569242252Sandre	 * window updates when the socket buffer had queued a lot of data
570242252Sandre	 * and the application is doing small reads.
571242252Sandre	 *
572242252Sandre	 * Prevent a flurry of pointless window updates by only sending
573242252Sandre	 * an update when we can increase the advertized window by more
574242252Sandre	 * than 1/4th of the socket buffer capacity.  When the buffer is
575242252Sandre	 * getting full or is very small be more aggressive and send an
576242252Sandre	 * update whenever we can increase by two mss sized segments.
577242252Sandre	 * In all other situations the ACK's to new incoming data will
578242252Sandre	 * carry further window increases.
579242252Sandre	 *
580242251Sandre	 * Don't send an independent window update if a delayed
581242251Sandre	 * ACK is pending (it will get piggy-backed on it) or the
582242251Sandre	 * remote side already has done a half-close and won't send
583242252Sandre	 * more data.  Skip this if the connection is in T/TCP
584242252Sandre	 * half-open state.
5851541Srgrimes	 */
586170467Sandre	if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
587242251Sandre	    !(tp->t_flags & TF_DELACK) &&
588170467Sandre	    !TCPS_HAVERCVDFIN(tp->t_state)) {
5898876Srgrimes		/*
590242252Sandre		 * "adv" is the amount we could increase the window,
5911541Srgrimes		 * taking into account that we are limited by
5921541Srgrimes		 * TCP_MAXWIN << tp->rcv_scale.
5931541Srgrimes		 */
594221346Sjhb		long adv;
595221346Sjhb		int oldwin;
5961541Srgrimes
597221346Sjhb		adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale);
598221346Sjhb		if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
599221346Sjhb			oldwin = (tp->rcv_adv - tp->rcv_nxt);
600221346Sjhb			adv -= oldwin;
601221346Sjhb		} else
602221346Sjhb			oldwin = 0;
603221346Sjhb
604220794Sjhb		/*
605220794Sjhb		 * If the new window size ends up being the same as the old
606220794Sjhb		 * size when it is scaled, then don't force a window update.
607220794Sjhb		 */
608221346Sjhb		if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
609220794Sjhb			goto dontupdate;
610242252Sandre
611242252Sandre		if (adv >= (long)(2 * tp->t_maxseg) &&
612242252Sandre		    (adv >= (long)(so->so_rcv.sb_hiwat / 4) ||
613242252Sandre		     recwin <= (long)(so->so_rcv.sb_hiwat / 8) ||
614242252Sandre		     so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg))
6151541Srgrimes			goto send;
6161541Srgrimes	}
617220794Sjhbdontupdate:
6181541Srgrimes
6191541Srgrimes	/*
620104815Sdillon	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
621104815Sdillon	 * is also a catch-all for the retransmit timer timeout case.
6221541Srgrimes	 */
6231541Srgrimes	if (tp->t_flags & TF_ACKNOW)
6241541Srgrimes		goto send;
6256283Swollman	if ((flags & TH_RST) ||
6266283Swollman	    ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
6271541Srgrimes		goto send;
6281541Srgrimes	if (SEQ_GT(tp->snd_up, tp->snd_una))
6291541Srgrimes		goto send;
6301541Srgrimes	/*
6311541Srgrimes	 * If our state indicates that FIN should be sent
632104815Sdillon	 * and we have not yet done so, then we need to send.
6331541Srgrimes	 */
6341541Srgrimes	if (flags & TH_FIN &&
6351541Srgrimes	    ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
6361541Srgrimes		goto send;
6371541Srgrimes	/*
638130989Sps	 * In SACK, it is possible for tcp_output to fail to send a segment
639130989Sps	 * after the retransmission timer has been turned off.  Make sure
640130989Sps	 * that the retransmission timer is set.
641130989Sps	 */
642169317Sandre	if ((tp->t_flags & TF_SACK_PERMIT) &&
643169317Sandre	    SEQ_GT(tp->snd_max, tp->snd_una) &&
644168615Sandre	    !tcp_timer_active(tp, TT_REXMT) &&
645168615Sandre	    !tcp_timer_active(tp, TT_PERSIST)) {
646168615Sandre		tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
647136327Srwatson		goto just_return;
648133874Srwatson	}
649130989Sps	/*
6501541Srgrimes	 * TCP window updates are not reliable, rather a polling protocol
6511541Srgrimes	 * using ``persist'' packets is used to insure receipt of window
6521541Srgrimes	 * updates.  The three ``states'' for the output side are:
6531541Srgrimes	 *	idle			not doing retransmits or persists
6541541Srgrimes	 *	persisting		to move a small or zero window
6551541Srgrimes	 *	(re)transmitting	and thereby not persisting
6561541Srgrimes	 *
657168615Sandre	 * tcp_timer_active(tp, TT_PERSIST)
65850673Sjlemon	 *	is true when we are in persist state.
659146463Sps	 * (tp->t_flags & TF_FORCEDATA)
6601541Srgrimes	 *	is set when we are called to send a persist packet.
661168615Sandre	 * tcp_timer_active(tp, TT_REXMT)
6621541Srgrimes	 *	is set when we are retransmitting
6631541Srgrimes	 * The output side is idle when both timers are zero.
6641541Srgrimes	 *
6651541Srgrimes	 * If send window is too small, there is data to transmit, and no
6661541Srgrimes	 * retransmit or persist is pending, then go to persist state.
6671541Srgrimes	 * If nothing happens soon, send when timer expires:
6681541Srgrimes	 * if window is nonzero, transmit what we can,
6691541Srgrimes	 * otherwise force out a byte.
6701541Srgrimes	 */
671168615Sandre	if (so->so_snd.sb_cc && !tcp_timer_active(tp, TT_REXMT) &&
672168615Sandre	    !tcp_timer_active(tp, TT_PERSIST)) {
6731541Srgrimes		tp->t_rxtshift = 0;
6741541Srgrimes		tcp_setpersist(tp);
6751541Srgrimes	}
6761541Srgrimes
6771541Srgrimes	/*
6781541Srgrimes	 * No reason to send a segment, just return.
6791541Srgrimes	 */
680136327Srwatsonjust_return:
681136327Srwatson	SOCKBUF_UNLOCK(&so->so_snd);
6821541Srgrimes	return (0);
6831541Srgrimes
6841541Srgrimessend:
685136327Srwatson	SOCKBUF_LOCK_ASSERT(&so->so_snd);
6861541Srgrimes	/*
6871541Srgrimes	 * Before ESTABLISHED, force sending of initial options
6881541Srgrimes	 * unless TCP set not to do any options.
6891541Srgrimes	 * NOTE: we assume that the IP/TCP header plus TCP options
6901541Srgrimes	 * always fit in a single mbuf, leaving room for a maximum
6911541Srgrimes	 * link header, i.e.
69278064Sume	 *	max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
6931541Srgrimes	 */
6941541Srgrimes	optlen = 0;
69555679Sshin#ifdef INET6
69655679Sshin	if (isipv6)
69755679Sshin		hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
69855679Sshin	else
69955679Sshin#endif
700221250Sbz		hdrlen = sizeof (struct tcpiphdr);
7011541Srgrimes
702167606Sandre	/*
703167606Sandre	 * Compute options for segment.
704167606Sandre	 * We only have to care about SYN and established connection
705167606Sandre	 * segments.  Options for SYN-ACK segments are handled in TCP
706167606Sandre	 * syncache.
707167606Sandre	 */
708293894Sglebius	to.to_flags = 0;
709167606Sandre	if ((tp->t_flags & TF_NOOPT) == 0) {
710167606Sandre		/* Maximum segment size. */
711167606Sandre		if (flags & TH_SYN) {
712167606Sandre			tp->snd_nxt = tp->iss;
713167606Sandre			to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc);
714167606Sandre			to.to_flags |= TOF_MSS;
715167606Sandre		}
716167606Sandre		/* Window scaling. */
717167606Sandre		if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
718167606Sandre			to.to_wscale = tp->request_r_scale;
719167606Sandre			to.to_flags |= TOF_SCALE;
720167606Sandre		}
721167606Sandre		/* Timestamps. */
722167606Sandre		if ((tp->t_flags & TF_RCVD_TSTMP) ||
723167606Sandre		    ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
724231767Sbz			to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
725167606Sandre			to.to_tsecr = tp->ts_recent;
726167606Sandre			to.to_flags |= TOF_TS;
727167606Sandre			/* Set receive buffer autosizing timestamp. */
728167606Sandre			if (tp->rfbuf_ts == 0 &&
729167606Sandre			    (so->so_rcv.sb_flags & SB_AUTOSIZE))
730231767Sbz				tp->rfbuf_ts = tcp_ts_getticks();
731167606Sandre		}
732167606Sandre		/* Selective ACK's. */
733169317Sandre		if (tp->t_flags & TF_SACK_PERMIT) {
734167606Sandre			if (flags & TH_SYN)
735167606Sandre				to.to_flags |= TOF_SACKPERM;
736167606Sandre			else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
737167606Sandre			    (tp->t_flags & TF_SACK_PERMIT) &&
738167606Sandre			    tp->rcv_numsacks > 0) {
739167606Sandre				to.to_flags |= TOF_SACK;
740167606Sandre				to.to_nsacks = tp->rcv_numsacks;
741167606Sandre				to.to_sacks = (u_char *)tp->sackblks;
7421541Srgrimes			}
7431541Srgrimes		}
744125680Sbms#ifdef TCP_SIGNATURE
745167606Sandre		/* TCP-MD5 (RFC2385). */
746167606Sandre		if (tp->t_flags & TF_SIGNATURE)
747167606Sandre			to.to_flags |= TOF_SIGNATURE;
748125680Sbms#endif /* TCP_SIGNATURE */
749125680Sbms
750167606Sandre		/* Processing the options. */
751174023Sbz		hdrlen += optlen = tcp_addoptions(&to, opt);
752145372Sps	}
753145372Sps
75455679Sshin#ifdef INET6
75555679Sshin	if (isipv6)
75655679Sshin		ipoptlen = ip6_optlen(tp->t_inpcb);
75755679Sshin	else
75855679Sshin#endif
75998704Sluigi	if (tp->t_inpcb->inp_options)
76036335Sfenner		ipoptlen = tp->t_inpcb->inp_options->m_len -
76136335Sfenner				offsetof(struct ipoption, ipopt_list);
76298704Sluigi	else
76336335Sfenner		ipoptlen = 0;
764171167Sgnn#ifdef IPSEC
765173835Sbz	ipoptlen += ipsec_optlen;
76655679Sshin#endif
76736335Sfenner
7681541Srgrimes	/*
7691541Srgrimes	 * Adjust data length if insertion of options will
7706283Swollman	 * bump the packet length beyond the t_maxopd length.
7716283Swollman	 * Clear the FIN bit because we cut off the tail of
7726283Swollman	 * the segment.
7731541Srgrimes	 */
77436335Sfenner	if (len + optlen + ipoptlen > tp->t_maxopd) {
7755802Sdg		flags &= ~TH_FIN;
776212803Sandre
777162110Sandre		if (tso) {
778212803Sandre			KASSERT(ipoptlen == 0,
779212803Sandre			    ("%s: TSO can't do IP options", __func__));
780212803Sandre
781212803Sandre			/*
782251296Sandre			 * Limit a burst to t_tsomax minus IP,
783212803Sandre			 * TCP and options length to keep ip->ip_len
784251296Sandre			 * from overflowing or exceeding the maximum
785251296Sandre			 * length allowed by the network interface.
786212803Sandre			 */
787251296Sandre			if (len > tp->t_tsomax - hdrlen) {
788251296Sandre				len = tp->t_tsomax - hdrlen;
789162110Sandre				sendalot = 1;
790212803Sandre			}
791212803Sandre
792212803Sandre			/*
793212803Sandre			 * Prevent the last segment from being
794212803Sandre			 * fractional unless the send sockbuf can
795212803Sandre			 * be emptied.
796212803Sandre			 */
797212803Sandre			if (sendalot && off + len < so->so_snd.sb_cc) {
798212803Sandre				len -= len % (tp->t_maxopd - optlen);
799162110Sandre				sendalot = 1;
800212803Sandre			}
801212803Sandre
802212803Sandre			/*
803212803Sandre			 * Send the FIN in a separate segment
804212803Sandre			 * after the bulk sending is done.
805212803Sandre			 * We don't trust the TSO implementations
806212803Sandre			 * to clear the FIN flag on all but the
807212803Sandre			 * last segment.
808212803Sandre			 */
809212803Sandre			if (tp->t_flags & TF_NEEDFIN)
810212803Sandre				sendalot = 1;
811212803Sandre
812162110Sandre		} else {
813162110Sandre			len = tp->t_maxopd - optlen - ipoptlen;
814162110Sandre			sendalot = 1;
815162110Sandre		}
816212803Sandre	} else
817212803Sandre		tso = 0;
8181541Srgrimes
819212803Sandre	KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
820212803Sandre	    ("%s: len > IP_MAXPACKET", __func__));
821212803Sandre
8226283Swollman/*#ifdef DIAGNOSTIC*/
82357068Sshin#ifdef INET6
824133874Srwatson	if (max_linkhdr + hdrlen > MCLBYTES)
82557068Sshin#else
826133874Srwatson	if (max_linkhdr + hdrlen > MHLEN)
82798704Sluigi#endif
8281541Srgrimes		panic("tcphdr too big");
8296283Swollman/*#endif*/
8301541Srgrimes
8311541Srgrimes	/*
832182841Sbz	 * This KASSERT is here to catch edge cases at a well defined place.
833182841Sbz	 * Before, those had triggered (random) panic conditions further down.
834182841Sbz	 */
835182841Sbz	KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
836182841Sbz
837182841Sbz	/*
8381541Srgrimes	 * Grab a header mbuf, attaching a copy of data to
8391541Srgrimes	 * be transmitted, and initialize the header from
8401541Srgrimes	 * the template for sends on this connection.
8411541Srgrimes	 */
8421541Srgrimes	if (len) {
843167715Sandre		struct mbuf *mb;
844167715Sandre		u_int moff;
845167715Sandre
846146463Sps		if ((tp->t_flags & TF_FORCEDATA) && len == 1)
847190948Srwatson			TCPSTAT_INC(tcps_sndprobe);
848169682Sjhb		else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
849215434Sgnn			tp->t_sndrexmitpack++;
850190948Srwatson			TCPSTAT_INC(tcps_sndrexmitpack);
851190948Srwatson			TCPSTAT_ADD(tcps_sndrexmitbyte, len);
8521541Srgrimes		} else {
853190948Srwatson			TCPSTAT_INC(tcps_sndpack);
854190948Srwatson			TCPSTAT_ADD(tcps_sndbyte, len);
8551541Srgrimes		}
856248323Sglebius#ifdef INET6
857248323Sglebius		if (MHLEN < hdrlen + max_linkhdr)
858248323Sglebius			m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
859248323Sglebius		else
860248323Sglebius#endif
861248323Sglebius			m = m_gethdr(M_NOWAIT, MT_DATA);
862248323Sglebius
8631541Srgrimes		if (m == NULL) {
864136327Srwatson			SOCKBUF_UNLOCK(&so->so_snd);
8651541Srgrimes			error = ENOBUFS;
866249372Sglebius			sack_rxmit = 0;
8671541Srgrimes			goto out;
8681541Srgrimes		}
869248323Sglebius
8701541Srgrimes		m->m_data += max_linkhdr;
8711541Srgrimes		m->m_len = hdrlen;
872167715Sandre
873167715Sandre		/*
874167715Sandre		 * Start the m_copy functions from the closest mbuf
875167715Sandre		 * to the offset in the socket buffer chain.
876167715Sandre		 */
877167715Sandre		mb = sbsndptr(&so->so_snd, off, len, &moff);
878167715Sandre
8791541Srgrimes		if (len <= MHLEN - hdrlen - max_linkhdr) {
880167715Sandre			m_copydata(mb, moff, (int)len,
8811541Srgrimes			    mtod(m, caddr_t) + hdrlen);
8821541Srgrimes			m->m_len += len;
8831541Srgrimes		} else {
884167715Sandre			m->m_next = m_copy(mb, moff, (int)len);
885167715Sandre			if (m->m_next == NULL) {
886136327Srwatson				SOCKBUF_UNLOCK(&so->so_snd);
88710965Swollman				(void) m_free(m);
88810712Swollman				error = ENOBUFS;
889249372Sglebius				sack_rxmit = 0;
89010712Swollman				goto out;
89110712Swollman			}
8921541Srgrimes		}
893223799Scperciva
8941541Srgrimes		/*
8951541Srgrimes		 * If we're sending everything we've got, set PUSH.
8961541Srgrimes		 * (This will keep happy those implementations which only
8971541Srgrimes		 * give data to the user when a buffer fills or
8981541Srgrimes		 * a PUSH comes in.)
8991541Srgrimes		 */
9001541Srgrimes		if (off + len == so->so_snd.sb_cc)
9011541Srgrimes			flags |= TH_PUSH;
902136327Srwatson		SOCKBUF_UNLOCK(&so->so_snd);
9031541Srgrimes	} else {
904136327Srwatson		SOCKBUF_UNLOCK(&so->so_snd);
9051541Srgrimes		if (tp->t_flags & TF_ACKNOW)
906190948Srwatson			TCPSTAT_INC(tcps_sndacks);
9071541Srgrimes		else if (flags & (TH_SYN|TH_FIN|TH_RST))
908190948Srwatson			TCPSTAT_INC(tcps_sndctrl);
9091541Srgrimes		else if (SEQ_GT(tp->snd_up, tp->snd_una))
910190948Srwatson			TCPSTAT_INC(tcps_sndurg);
9111541Srgrimes		else
912190948Srwatson			TCPSTAT_INC(tcps_sndwinup);
9131541Srgrimes
914248373Sglebius		m = m_gethdr(M_NOWAIT, MT_DATA);
9151541Srgrimes		if (m == NULL) {
9161541Srgrimes			error = ENOBUFS;
917249372Sglebius			sack_rxmit = 0;
9181541Srgrimes			goto out;
9191541Srgrimes		}
92055679Sshin#ifdef INET6
92155679Sshin		if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
92255679Sshin		    MHLEN >= hdrlen) {
92355679Sshin			MH_ALIGN(m, hdrlen);
92455679Sshin		} else
92555679Sshin#endif
9261541Srgrimes		m->m_data += max_linkhdr;
9271541Srgrimes		m->m_len = hdrlen;
9281541Srgrimes	}
929136327Srwatson	SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
9301541Srgrimes	m->m_pkthdr.rcvif = (struct ifnet *)0;
931101106Srwatson#ifdef MAC
932172930Srwatson	mac_inpcb_create_mbuf(tp->t_inpcb, m);
933101106Srwatson#endif
93455679Sshin#ifdef INET6
93555679Sshin	if (isipv6) {
93655679Sshin		ip6 = mtod(m, struct ip6_hdr *);
93755679Sshin		th = (struct tcphdr *)(ip6 + 1);
938111144Sjlemon		tcpip_fillheaders(tp->t_inpcb, ip6, th);
93955679Sshin	} else
94055679Sshin#endif /* INET6 */
941133874Srwatson	{
942133874Srwatson		ip = mtod(m, struct ip *);
943133874Srwatson		ipov = (struct ipovly *)ip;
944133874Srwatson		th = (struct tcphdr *)(ip + 1);
945133874Srwatson		tcpip_fillheaders(tp->t_inpcb, ip, th);
946133874Srwatson	}
9471541Srgrimes
9481541Srgrimes	/*
9491541Srgrimes	 * Fill in fields, remembering maximum advertised
9501541Srgrimes	 * window for use in delaying messages about window sizes.
9511541Srgrimes	 * If resending a FIN, be sure not to use a new sequence number.
9521541Srgrimes	 */
9538876Srgrimes	if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
9541541Srgrimes	    tp->snd_nxt == tp->snd_max)
9551541Srgrimes		tp->snd_nxt--;
9561541Srgrimes	/*
957181056Srpaulo	 * If we are starting a connection, send ECN setup
958181056Srpaulo	 * SYN packet. If we are on a retransmit, we may
959181056Srpaulo	 * resend those bits a number of times as per
960181056Srpaulo	 * RFC 3168.
961181056Srpaulo	 */
962181803Sbz	if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) {
963181056Srpaulo		if (tp->t_rxtshift >= 1) {
964181803Sbz			if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
965181056Srpaulo				flags |= TH_ECE|TH_CWR;
966181056Srpaulo		} else
967181056Srpaulo			flags |= TH_ECE|TH_CWR;
968181056Srpaulo	}
969181056Srpaulo
970181056Srpaulo	if (tp->t_state == TCPS_ESTABLISHED &&
971181056Srpaulo	    (tp->t_flags & TF_ECN_PERMIT)) {
972181056Srpaulo		/*
973181056Srpaulo		 * If the peer has ECN, mark data packets with
974181056Srpaulo		 * ECN capable transmission (ECT).
975181056Srpaulo		 * Ignore pure ack packets, retransmissions and window probes.
976181056Srpaulo		 */
977181056Srpaulo		if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
978181056Srpaulo		    !((tp->t_flags & TF_FORCEDATA) && len == 1)) {
979181056Srpaulo#ifdef INET6
980181056Srpaulo			if (isipv6)
981181056Srpaulo				ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
982181056Srpaulo			else
983181056Srpaulo#endif
984181056Srpaulo				ip->ip_tos |= IPTOS_ECN_ECT0;
985190948Srwatson			TCPSTAT_INC(tcps_ecn_ect0);
986181056Srpaulo		}
987181056Srpaulo
988181056Srpaulo		/*
989181056Srpaulo		 * Reply with proper ECN notifications.
990181056Srpaulo		 */
991181056Srpaulo		if (tp->t_flags & TF_ECN_SND_CWR) {
992181056Srpaulo			flags |= TH_CWR;
993181056Srpaulo			tp->t_flags &= ~TF_ECN_SND_CWR;
994181056Srpaulo		}
995181056Srpaulo		if (tp->t_flags & TF_ECN_SND_ECE)
996181056Srpaulo			flags |= TH_ECE;
997181056Srpaulo	}
998181056Srpaulo
999181056Srpaulo	/*
10001541Srgrimes	 * If we are doing retransmissions, then snd_nxt will
10011541Srgrimes	 * not reflect the first unsent octet.  For ACK only
10021541Srgrimes	 * packets, we do not want the sequence number of the
10031541Srgrimes	 * retransmitted packet, we want the sequence number
10041541Srgrimes	 * of the next unsent octet.  So, if there is no data
10051541Srgrimes	 * (and no SYN or FIN), use snd_max instead of snd_nxt
10061541Srgrimes	 * when filling in ti_seq.  But if we are in persist
10071541Srgrimes	 * state, snd_max might reflect one byte beyond the
10081541Srgrimes	 * right edge of the window, so use snd_nxt in that
10091541Srgrimes	 * case, since we know we aren't doing a retransmission.
10101541Srgrimes	 * (retransmit and persist are mutually exclusive...)
10111541Srgrimes	 */
1012136151Sps	if (sack_rxmit == 0) {
1013168615Sandre		if (len || (flags & (TH_SYN|TH_FIN)) ||
1014168615Sandre		    tcp_timer_active(tp, TT_PERSIST))
1015136151Sps			th->th_seq = htonl(tp->snd_nxt);
1016136151Sps		else
1017136151Sps			th->th_seq = htonl(tp->snd_max);
1018136151Sps	} else {
1019130989Sps		th->th_seq = htonl(p->rxmit);
1020130989Sps		p->rxmit += len;
1021146123Sps		tp->sackhint.sack_bytes_rexmit += len;
1022130989Sps	}
102355679Sshin	th->th_ack = htonl(tp->rcv_nxt);
10241541Srgrimes	if (optlen) {
102555679Sshin		bcopy(opt, th + 1, optlen);
102655679Sshin		th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
10271541Srgrimes	}
102855679Sshin	th->th_flags = flags;
10291541Srgrimes	/*
10301541Srgrimes	 * Calculate receive window.  Don't shrink window,
10311541Srgrimes	 * but avoid silly window syndrome.
10321541Srgrimes	 */
1033124849Sandre	if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
1034124849Sandre	    recwin < (long)tp->t_maxseg)
1035124849Sandre		recwin = 0;
1036221346Sjhb	if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
1037221346Sjhb	    recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
1038124849Sandre		recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
1039124849Sandre	if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
1040124849Sandre		recwin = (long)TCP_MAXWIN << tp->rcv_scale;
104187193Sdillon
1042170470Sandre	/*
1043170470Sandre	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
1044170470Sandre	 * or <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK>
1045170470Sandre	 * case is handled in syncache.
1046170470Sandre	 */
1047170470Sandre	if (flags & TH_SYN)
1048170470Sandre		th->th_win = htons((u_short)
1049170470Sandre				(min(sbspace(&so->so_rcv), TCP_MAXWIN)));
1050170470Sandre	else
1051170470Sandre		th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
105287193Sdillon
105387193Sdillon	/*
105487193Sdillon	 * Adjust the RXWIN0SENT flag - indicate that we have advertised
105587193Sdillon	 * a 0 window.  This may cause the remote transmitter to stall.  This
105687193Sdillon	 * flag tells soreceive() to disable delayed acknowledgements when
105787193Sdillon	 * draining the buffer.  This can occur if the receiver is attempting
1058180535Srpaulo	 * to read more data than can be buffered prior to transmitting on
105987193Sdillon	 * the connection.
106087193Sdillon	 */
1061215434Sgnn	if (th->th_win == 0) {
1062215434Sgnn		tp->t_sndzerowin++;
106387193Sdillon		tp->t_flags |= TF_RXWIN0SENT;
1064215434Sgnn	} else
106587193Sdillon		tp->t_flags &= ~TF_RXWIN0SENT;
10661541Srgrimes	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
106755679Sshin		th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
106855679Sshin		th->th_flags |= TH_URG;
10691541Srgrimes	} else
10701541Srgrimes		/*
10711541Srgrimes		 * If no urgent pointer to send, then we pull
10721541Srgrimes		 * the urgent pointer to the left edge of the send window
10731541Srgrimes		 * so that it doesn't drift into the send window on sequence
10741541Srgrimes		 * number wraparound.
10751541Srgrimes		 */
10761541Srgrimes		tp->snd_up = tp->snd_una;		/* drag it along */
10771541Srgrimes
1078125680Sbms#ifdef TCP_SIGNATURE
1079293894Sglebius	if (to.to_flags & TOF_SIGNATURE) {
1080174120Sbz		int sigoff = to.to_signature - opt;
1081183001Sbz		tcp_signature_compute(m, 0, len, optlen,
1082125680Sbms		    (u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND);
1083174120Sbz	}
1084125783Sbms#endif
1085125680Sbms
10861541Srgrimes	/*
10871541Srgrimes	 * Put TCP length in extended header, and then
10881541Srgrimes	 * checksum extended header and data.
10891541Srgrimes	 */
109055679Sshin	m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
1091235961Sbz	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
109255679Sshin#ifdef INET6
1093235961Sbz	if (isipv6) {
109455679Sshin		/*
109555679Sshin		 * ip6_plen does not need to be filled in now; it will be
109655679Sshin		 * filled in by ip6_output().
109755679Sshin		 */
1098236170Sbz		m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
1099235961Sbz		th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
1100235961Sbz		    optlen + len, IPPROTO_TCP, 0);
1101235961Sbz	}
1102235961Sbz#endif
1103235961Sbz#if defined(INET6) && defined(INET)
110455679Sshin	else
1105235961Sbz#endif
1106235961Sbz#ifdef INET
1107133874Srwatson	{
1108236170Sbz		m->m_pkthdr.csum_flags = CSUM_TCP;
1109133874Srwatson		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
1110133874Srwatson		    htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen));
111158698Sjlemon
1112133874Srwatson		/* IP version must be set here for ipv4/ipv6 checking later */
1113133874Srwatson		KASSERT(ip->ip_v == IPVERSION,
1114133874Srwatson		    ("%s: IP version incorrect: %d", __func__, ip->ip_v));
1115133874Srwatson	}
1116235961Sbz#endif
11171541Srgrimes
11181541Srgrimes	/*
1119162110Sandre	 * Enable TSO and specify the size of the segments.
1120162110Sandre	 * The TCP pseudo header checksum is always provided.
1121162110Sandre	 * XXX: Fixme: This is currently not the case for IPv6.
1122162110Sandre	 */
1123162110Sandre	if (tso) {
1124211317Sandre		KASSERT(len > tp->t_maxopd - optlen,
1125211317Sandre		    ("%s: len <= tso_segsz", __func__));
1126206844Sken		m->m_pkthdr.csum_flags |= CSUM_TSO;
1127162110Sandre		m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen;
1128162110Sandre	}
1129162110Sandre
1130223326Sbz#ifdef IPSEC
1131223326Sbz	KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL),
1132223326Sbz	    ("%s: mbuf chain shorter than expected: %ld + %u + %u - %u != %u",
1133223326Sbz	    __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL)));
1134223326Sbz#else
1135212803Sandre	KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL),
1136223326Sbz	    ("%s: mbuf chain shorter than expected: %ld + %u + %u != %u",
1137223326Sbz	    __func__, len, hdrlen, ipoptlen, m_length(m, NULL)));
1138223326Sbz#endif
1139212803Sandre
1140216758Slstewart	/* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
1141216758Slstewart	hhook_run_tcp_est_out(tp, th, &to, len, tso);
1142216758Slstewart
11432788Sdg#ifdef TCPDEBUG
11441541Srgrimes	/*
11451541Srgrimes	 * Trace.
11461541Srgrimes	 */
1147118862Sharti	if (so->so_options & SO_DEBUG) {
1148130683Sbms		u_short save = 0;
1149130666Sbms#ifdef INET6
1150130666Sbms		if (!isipv6)
1151130666Sbms#endif
1152130666Sbms		{
1153130666Sbms			save = ipov->ih_len;
1154130666Sbms			ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */);
1155130666Sbms		}
115655679Sshin		tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
1157130666Sbms#ifdef INET6
1158130666Sbms		if (!isipv6)
1159130666Sbms#endif
1160118862Sharti		ipov->ih_len = save;
1161118862Sharti	}
1162221250Sbz#endif /* TCPDEBUG */
11631541Srgrimes
11641541Srgrimes	/*
11651541Srgrimes	 * Fill in IP length and desired time to live and
11661541Srgrimes	 * send to IP level.  There should be a better way
11671541Srgrimes	 * to handle ttl and tos; we could keep them in
11681541Srgrimes	 * the template, but need a way to checksum without them.
11691541Srgrimes	 */
117055679Sshin	/*
117155679Sshin	 * m->m_pkthdr.len should have been set before checksum calculation,
117255679Sshin	 * because in6_cksum() needs it.
117355679Sshin	 */
117455679Sshin#ifdef INET6
117555679Sshin	if (isipv6) {
1176238516Sglebius		struct route_in6 ro;
1177238516Sglebius
1178238516Sglebius		bzero(&ro, sizeof(ro));
117962587Sitojun		/*
118055679Sshin		 * we separately set hoplimit for every segment, since the
118155679Sshin		 * user might want to change the value via setsockopt.
118255679Sshin		 * Also, desired default hop limit might be changed via
118362587Sitojun		 * Neighbor Discovery.
118462587Sitojun		 */
1185122922Sandre		ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);
118655679Sshin
1187254889Smarkj		/*
1188254889Smarkj		 * Set the packet size here for the benefit of DTrace probes.
1189254889Smarkj		 * ip6_output() will set it properly; it's supposed to include
1190254889Smarkj		 * the option header lengths as well.
1191254889Smarkj		 */
1192254889Smarkj		ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
1193254889Smarkj
1194254889Smarkj		if (tp->t_state == TCPS_SYN_SENT)
1195260817Savg			TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
1196254889Smarkj
1197254889Smarkj		TCP_PROBE5(send, NULL, tp, ip6, tp, th);
1198254889Smarkj
119955679Sshin		/* TODO: IPv6 IP6TOS_ECT bit on */
1200238516Sglebius		error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &ro,
1201238516Sglebius		    ((so->so_options & SO_DONTROUTE) ?  IP_ROUTETOIF : 0),
1202238516Sglebius		    NULL, NULL, tp->t_inpcb);
1203238516Sglebius
1204238516Sglebius		if (error == EMSGSIZE && ro.ro_rt != NULL)
1205263478Sglebius			mtu = ro.ro_rt->rt_mtu;
1206238516Sglebius		RO_RTFREE(&ro);
1207221250Sbz	}
120855679Sshin#endif /* INET6 */
1209221250Sbz#if defined(INET) && defined(INET6)
1210221250Sbz	else
1211221250Sbz#endif
1212221250Sbz#ifdef INET
12131541Srgrimes    {
1214238516Sglebius	struct route ro;
1215238516Sglebius
1216238516Sglebius	bzero(&ro, sizeof(ro));
1217241913Sglebius	ip->ip_len = htons(m->m_pkthdr.len);
121862587Sitojun#ifdef INET6
1219185371Sbz	if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO)
1220133874Srwatson		ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL);
122162587Sitojun#endif /* INET6 */
122210930Swollman	/*
1223122922Sandre	 * If we do path MTU discovery, then we set DF on every packet.
1224122922Sandre	 * This might not be the best thing to do according to RFC3390
1225122922Sandre	 * Section 2. However the tcp hostcache mitigates the problem
1226122922Sandre	 * so it affects only the first tcp connection with a host.
1227211333Sandre	 *
1228211333Sandre	 * NB: Don't set DF on small MTU/MSS to have a safe fallback.
122910930Swollman	 */
1230211333Sandre	if (V_path_mtu_discovery && tp->t_maxopd > V_tcp_minmss)
1231241913Sglebius		ip->ip_off |= htons(IP_DF);
1232122922Sandre
1233254889Smarkj	if (tp->t_state == TCPS_SYN_SENT)
1234260817Savg		TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
1235254889Smarkj
1236254889Smarkj	TCP_PROBE5(send, NULL, tp, ip, tp, th);
1237254889Smarkj
1238238516Sglebius	error = ip_output(m, tp->t_inpcb->inp_options, &ro,
1239134793Sjmg	    ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
1240134793Sjmg	    tp->t_inpcb);
1241238516Sglebius
1242238516Sglebius	if (error == EMSGSIZE && ro.ro_rt != NULL)
1243263478Sglebius		mtu = ro.ro_rt->rt_mtu;
1244238516Sglebius	RO_RTFREE(&ro);
12451541Srgrimes    }
1246221250Sbz#endif /* INET */
1247249372Sglebius
1248249372Sglebiusout:
1249249372Sglebius	/*
1250249372Sglebius	 * In transmit state, time the transmission and arrange for
1251249372Sglebius	 * the retransmit.  In persist state, just set snd_max.
1252249372Sglebius	 */
1253249372Sglebius	if ((tp->t_flags & TF_FORCEDATA) == 0 ||
1254249372Sglebius	    !tcp_timer_active(tp, TT_PERSIST)) {
1255249372Sglebius		tcp_seq startseq = tp->snd_nxt;
1256249372Sglebius
1257249372Sglebius		/*
1258249372Sglebius		 * Advance snd_nxt over sequence space of this segment.
1259249372Sglebius		 */
1260249372Sglebius		if (flags & (TH_SYN|TH_FIN)) {
1261249372Sglebius			if (flags & TH_SYN)
1262249372Sglebius				tp->snd_nxt++;
1263249372Sglebius			if (flags & TH_FIN) {
1264249372Sglebius				tp->snd_nxt++;
1265249372Sglebius				tp->t_flags |= TF_SENTFIN;
1266249372Sglebius			}
1267249372Sglebius		}
1268249372Sglebius		if (sack_rxmit)
1269249372Sglebius			goto timer;
1270249372Sglebius		tp->snd_nxt += len;
1271249372Sglebius		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
1272249372Sglebius			tp->snd_max = tp->snd_nxt;
1273249372Sglebius			/*
1274249372Sglebius			 * Time this transmission if not a retransmission and
1275249372Sglebius			 * not currently timing anything.
1276249372Sglebius			 */
1277249372Sglebius			if (tp->t_rtttime == 0) {
1278249372Sglebius				tp->t_rtttime = ticks;
1279249372Sglebius				tp->t_rtseq = startseq;
1280249372Sglebius				TCPSTAT_INC(tcps_segstimed);
1281249372Sglebius			}
1282249372Sglebius		}
1283249372Sglebius
1284249372Sglebius		/*
1285249372Sglebius		 * Set retransmit timer if not currently set,
1286249372Sglebius		 * and not doing a pure ack or a keep-alive probe.
1287249372Sglebius		 * Initial value for retransmit timer is smoothed
1288249372Sglebius		 * round-trip time + 2 * round-trip time variance.
1289249372Sglebius		 * Initialize shift counter which is used for backoff
1290249372Sglebius		 * of retransmit time.
1291249372Sglebius		 */
1292249372Sglebiustimer:
1293249372Sglebius		if (!tcp_timer_active(tp, TT_REXMT) &&
1294249372Sglebius		    ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
1295249372Sglebius		     (tp->snd_nxt != tp->snd_una))) {
1296249372Sglebius			if (tcp_timer_active(tp, TT_PERSIST)) {
1297249372Sglebius				tcp_timer_activate(tp, TT_PERSIST, 0);
1298249372Sglebius				tp->t_rxtshift = 0;
1299249372Sglebius			}
1300249372Sglebius			tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
1301249372Sglebius		}
1302249372Sglebius	} else {
1303249372Sglebius		/*
1304249372Sglebius		 * Persist case, update snd_max but since we are in
1305249372Sglebius		 * persist mode (no window) we do not update snd_nxt.
1306249372Sglebius		 */
1307249372Sglebius		int xlen = len;
1308249372Sglebius		if (flags & TH_SYN)
1309249372Sglebius			++xlen;
1310249372Sglebius		if (flags & TH_FIN) {
1311249372Sglebius			++xlen;
1312249372Sglebius			tp->t_flags |= TF_SENTFIN;
1313249372Sglebius		}
1314249372Sglebius		if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
1315249372Sglebius			tp->snd_max = tp->snd_nxt + len;
1316249372Sglebius	}
1317249372Sglebius
13181541Srgrimes	if (error) {
131964213Sarchie
132064213Sarchie		/*
132164213Sarchie		 * We know that the packet was lost, so back out the
132264213Sarchie		 * sequence number advance, if any.
1323162739Sandre		 *
1324162739Sandre		 * If the error is EPERM the packet got blocked by the
1325162739Sandre		 * local firewall.  Normally we should terminate the
1326162739Sandre		 * connection but the blocking may have been spurious
1327162739Sandre		 * due to a firewall reconfiguration cycle.  So we treat
1328162739Sandre		 * it like a packet loss and let the retransmit timer and
1329162739Sandre		 * timeouts do their work over time.
1330162739Sandre		 * XXX: It is a POLA question whether calling tcp_drop right
1331162739Sandre		 * away would be the really correct behavior instead.
133264213Sarchie		 */
1333167106Sglebius		if (((tp->t_flags & TF_FORCEDATA) == 0 ||
1334168615Sandre		    !tcp_timer_active(tp, TT_PERSIST)) &&
1335167106Sglebius		    ((flags & TH_SYN) == 0) &&
1336167106Sglebius		    (error != EPERM)) {
1337167106Sglebius			if (sack_rxmit) {
1338167106Sglebius				p->rxmit -= len;
1339167106Sglebius				tp->sackhint.sack_bytes_rexmit -= len;
1340167106Sglebius				KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
1341167106Sglebius				    ("sackhint bytes rtx >= 0"));
1342167106Sglebius			} else
1343167106Sglebius				tp->snd_nxt -= len;
134464213Sarchie		}
1345167106Sglebius		SOCKBUF_UNLOCK_ASSERT(&so->so_snd);	/* Check gotos. */
1346167106Sglebius		switch (error) {
1347167106Sglebius		case EPERM:
1348162739Sandre			tp->t_softerror = error;
1349162739Sandre			return (error);
1350167106Sglebius		case ENOBUFS:
1351168615Sandre	                if (!tcp_timer_active(tp, TT_REXMT) &&
1352168615Sandre			    !tcp_timer_active(tp, TT_PERSIST))
1353168615Sandre	                        tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
1354145355Sandre			tp->snd_cwnd = tp->t_maxseg;
13551541Srgrimes			return (0);
1356167106Sglebius		case EMSGSIZE:
135711537Swollman			/*
1358162110Sandre			 * For some reason the interface we used initially
1359162110Sandre			 * to send segments changed to another or lowered
1360162110Sandre			 * its MTU.
1361162110Sandre			 * If TSO was active we either got an interface
1362162110Sandre			 * without TSO capabilities or TSO was turned off.
1363238516Sglebius			 * If we obtained mtu from ip_output() then update
1364238516Sglebius			 * it and try again.
136511537Swollman			 */
1366162110Sandre			if (tso)
1367162110Sandre				tp->t_flags &= ~TF_TSO;
1368238516Sglebius			if (mtu != 0) {
1369238516Sglebius				tcp_mss_update(tp, -1, mtu, NULL, NULL);
1370238516Sglebius				goto again;
1371238516Sglebius			}
1372238516Sglebius			return (error);
1373167107Sglebius		case EHOSTDOWN:
1374167106Sglebius		case EHOSTUNREACH:
1375167106Sglebius		case ENETDOWN:
1376167107Sglebius		case ENETUNREACH:
1377167106Sglebius			if (TCPS_HAVERCVDSYN(tp->t_state)) {
1378167106Sglebius				tp->t_softerror = error;
1379167106Sglebius				return (0);
1380167106Sglebius			}
1381167106Sglebius			/* FALLTHROUGH */
1382167106Sglebius		default:
1383167106Sglebius			return (error);
13841541Srgrimes		}
13851541Srgrimes	}
1386190948Srwatson	TCPSTAT_INC(tcps_sndtotal);
13871541Srgrimes
13881541Srgrimes	/*
13891541Srgrimes	 * Data sent (as far as we can tell).
13901541Srgrimes	 * If this advertises a larger window than any other segment,
13911541Srgrimes	 * then remember the size of the advertised window.
13921541Srgrimes	 * Any pending ACK has now been sent.
13931541Srgrimes	 */
1394223049Sjhb	if (recwin >= 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
1395124849Sandre		tp->rcv_adv = tp->rcv_nxt + recwin;
13961541Srgrimes	tp->last_ack_sent = tp->rcv_nxt;
1397111139Sjlemon	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
1398168615Sandre	if (tcp_timer_active(tp, TT_DELACK))
1399168615Sandre		tcp_timer_activate(tp, TT_DELACK, 0);
140087145Sdillon#if 0
140187145Sdillon	/*
140287145Sdillon	 * This completely breaks TCP if newreno is turned on.  What happens
140387145Sdillon	 * is that if delayed-acks are turned on on the receiver, this code
140487145Sdillon	 * on the transmitter effectively destroys the TCP window, forcing
140587145Sdillon	 * it to four packets (1.5Kx4 = 6K window).
140687145Sdillon	 */
1407215166Slstewart	if (sendalot && --maxburst)
14081541Srgrimes		goto again;
140987145Sdillon#endif
141087145Sdillon	if (sendalot)
141187145Sdillon		goto again;
14121541Srgrimes	return (0);
14131541Srgrimes}
14141541Srgrimes
14151541Srgrimesvoid
1416167785Sandretcp_setpersist(struct tcpcb *tp)
14171541Srgrimes{
141850673Sjlemon	int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
141950673Sjlemon	int tt;
14201541Srgrimes
1421221209Sjhb	tp->t_flags &= ~TF_PREVVALID;
1422168615Sandre	if (tcp_timer_active(tp, TT_REXMT))
142350673Sjlemon		panic("tcp_setpersist: retransmit pending");
14241541Srgrimes	/*
14251541Srgrimes	 * Start/restart persistance timer.
14261541Srgrimes	 */
142750673Sjlemon	TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
142850673Sjlemon		      TCPTV_PERSMIN, TCPTV_PERSMAX);
1429168615Sandre	tcp_timer_activate(tp, TT_PERSIST, tt);
14301541Srgrimes	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
14311541Srgrimes		tp->t_rxtshift++;
14321541Srgrimes}
1433167606Sandre
1434167606Sandre/*
1435167606Sandre * Insert TCP options according to the supplied parameters to the place
1436167606Sandre * optp in a consistent way.  Can handle unaligned destinations.
1437167606Sandre *
1438167606Sandre * The order of the option processing is crucial for optimal packing and
1439167606Sandre * alignment for the scarce option space.
1440167606Sandre *
1441167606Sandre * The optimal order for a SYN/SYN-ACK segment is:
1442167606Sandre *   MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) +
1443167606Sandre *   Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40.
1444167606Sandre *
1445167606Sandre * The SACK options should be last.  SACK blocks consume 8*n+2 bytes.
1446167606Sandre * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks).
1447167606Sandre * At minimum we need 10 bytes (to generate 1 SACK block).  If both
1448167606Sandre * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present,
1449167606Sandre * we only have 10 bytes for SACK options (40 - (12 + 18)).
1450167606Sandre */
1451167606Sandreint
1452167606Sandretcp_addoptions(struct tcpopt *to, u_char *optp)
1453167606Sandre{
1454167606Sandre	u_int mask, optlen = 0;
1455167606Sandre
1456167606Sandre	for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) {
1457167606Sandre		if ((to->to_flags & mask) != mask)
1458167606Sandre			continue;
1459177988Sandre		if (optlen == TCP_MAXOLEN)
1460177988Sandre			break;
1461167606Sandre		switch (to->to_flags & mask) {
1462167606Sandre		case TOF_MSS:
1463167606Sandre			while (optlen % 4) {
1464167606Sandre				optlen += TCPOLEN_NOP;
1465167606Sandre				*optp++ = TCPOPT_NOP;
1466167606Sandre			}
1467177988Sandre			if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG)
1468177988Sandre				continue;
1469167606Sandre			optlen += TCPOLEN_MAXSEG;
1470167606Sandre			*optp++ = TCPOPT_MAXSEG;
1471167606Sandre			*optp++ = TCPOLEN_MAXSEG;
1472167606Sandre			to->to_mss = htons(to->to_mss);
1473167606Sandre			bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss));
1474167606Sandre			optp += sizeof(to->to_mss);
1475167606Sandre			break;
1476167606Sandre		case TOF_SCALE:
1477167606Sandre			while (!optlen || optlen % 2 != 1) {
1478167606Sandre				optlen += TCPOLEN_NOP;
1479167606Sandre				*optp++ = TCPOPT_NOP;
1480167606Sandre			}
1481177988Sandre			if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW)
1482177988Sandre				continue;
1483167606Sandre			optlen += TCPOLEN_WINDOW;
1484167606Sandre			*optp++ = TCPOPT_WINDOW;
1485167606Sandre			*optp++ = TCPOLEN_WINDOW;
1486167606Sandre			*optp++ = to->to_wscale;
1487167606Sandre			break;
1488167606Sandre		case TOF_SACKPERM:
1489167606Sandre			while (optlen % 2) {
1490167606Sandre				optlen += TCPOLEN_NOP;
1491167606Sandre				*optp++ = TCPOPT_NOP;
1492167606Sandre			}
1493177988Sandre			if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED)
1494177988Sandre				continue;
1495167606Sandre			optlen += TCPOLEN_SACK_PERMITTED;
1496167606Sandre			*optp++ = TCPOPT_SACK_PERMITTED;
1497167606Sandre			*optp++ = TCPOLEN_SACK_PERMITTED;
1498167606Sandre			break;
1499167606Sandre		case TOF_TS:
1500167606Sandre			while (!optlen || optlen % 4 != 2) {
1501167606Sandre				optlen += TCPOLEN_NOP;
1502167606Sandre				*optp++ = TCPOPT_NOP;
1503167606Sandre			}
1504177988Sandre			if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP)
1505177988Sandre				continue;
1506167606Sandre			optlen += TCPOLEN_TIMESTAMP;
1507167606Sandre			*optp++ = TCPOPT_TIMESTAMP;
1508167606Sandre			*optp++ = TCPOLEN_TIMESTAMP;
1509167606Sandre			to->to_tsval = htonl(to->to_tsval);
1510167606Sandre			to->to_tsecr = htonl(to->to_tsecr);
1511167606Sandre			bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval));
1512167606Sandre			optp += sizeof(to->to_tsval);
1513167606Sandre			bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr));
1514167606Sandre			optp += sizeof(to->to_tsecr);
1515167606Sandre			break;
1516293894Sglebius#ifdef TCP_SIGNATURE
1517167606Sandre		case TOF_SIGNATURE:
1518167606Sandre			{
1519167606Sandre			int siglen = TCPOLEN_SIGNATURE - 2;
1520167606Sandre
1521167606Sandre			while (!optlen || optlen % 4 != 2) {
1522167606Sandre				optlen += TCPOLEN_NOP;
1523167606Sandre				*optp++ = TCPOPT_NOP;
1524167606Sandre			}
1525168904Sandre			if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE)
1526167606Sandre				continue;
1527167606Sandre			optlen += TCPOLEN_SIGNATURE;
1528167606Sandre			*optp++ = TCPOPT_SIGNATURE;
1529167606Sandre			*optp++ = TCPOLEN_SIGNATURE;
1530167606Sandre			to->to_signature = optp;
1531167606Sandre			while (siglen--)
1532167606Sandre				 *optp++ = 0;
1533167606Sandre			break;
1534167606Sandre			}
1535293894Sglebius#endif
1536167606Sandre		case TOF_SACK:
1537167606Sandre			{
1538167606Sandre			int sackblks = 0;
1539167606Sandre			struct sackblk *sack = (struct sackblk *)to->to_sacks;
1540167606Sandre			tcp_seq sack_seq;
1541167606Sandre
1542167606Sandre			while (!optlen || optlen % 4 != 2) {
1543167606Sandre				optlen += TCPOLEN_NOP;
1544167606Sandre				*optp++ = TCPOPT_NOP;
1545167606Sandre			}
1546177988Sandre			if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK)
1547167606Sandre				continue;
1548167606Sandre			optlen += TCPOLEN_SACKHDR;
1549167606Sandre			*optp++ = TCPOPT_SACK;
1550167606Sandre			sackblks = min(to->to_nsacks,
1551168904Sandre					(TCP_MAXOLEN - optlen) / TCPOLEN_SACK);
1552167606Sandre			*optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK;
1553167606Sandre			while (sackblks--) {
1554167606Sandre				sack_seq = htonl(sack->start);
1555167606Sandre				bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
1556167606Sandre				optp += sizeof(sack_seq);
1557167606Sandre				sack_seq = htonl(sack->end);
1558167606Sandre				bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
1559167606Sandre				optp += sizeof(sack_seq);
1560167606Sandre				optlen += TCPOLEN_SACK;
1561167606Sandre				sack++;
1562167606Sandre			}
1563190948Srwatson			TCPSTAT_INC(tcps_sack_send_blocks);
1564167606Sandre			break;
1565167606Sandre			}
1566167606Sandre		default:
1567167606Sandre			panic("%s: unknown TCP option type", __func__);
1568167606Sandre			break;
1569167606Sandre		}
1570167606Sandre	}
1571167606Sandre
1572167606Sandre	/* Terminate and pad TCP options to a 4 byte boundary. */
1573167606Sandre	if (optlen % 4) {
1574167606Sandre		optlen += TCPOLEN_EOL;
1575167606Sandre		*optp++ = TCPOPT_EOL;
1576167606Sandre	}
1577176978Sbz	/*
1578176978Sbz	 * According to RFC 793 (STD0007):
1579176978Sbz	 *   "The content of the header beyond the End-of-Option option
1580176978Sbz	 *    must be header padding (i.e., zero)."
1581176978Sbz	 *   and later: "The padding is composed of zeros."
1582176978Sbz	 */
1583167606Sandre	while (optlen % 4) {
1584177986Sandre		optlen += TCPOLEN_PAD;
1585177986Sandre		*optp++ = TCPOPT_PAD;
1586167606Sandre	}
1587167606Sandre
1588168904Sandre	KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__));
1589167606Sandre	return (optlen);
1590167606Sandre}
1591