1139823Simp/*- 210965Swollman * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 31541Srgrimes * The Regents of the University of California. All rights reserved. 41541Srgrimes * 51541Srgrimes * Redistribution and use in source and binary forms, with or without 61541Srgrimes * modification, are permitted provided that the following conditions 71541Srgrimes * are met: 81541Srgrimes * 1. Redistributions of source code must retain the above copyright 91541Srgrimes * notice, this list of conditions and the following disclaimer. 101541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 111541Srgrimes * notice, this list of conditions and the following disclaimer in the 121541Srgrimes * documentation and/or other materials provided with the distribution. 131541Srgrimes * 4. Neither the name of the University nor the names of its contributors 141541Srgrimes * may be used to endorse or promote products derived from this software 151541Srgrimes * without specific prior written permission. 161541Srgrimes * 171541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 181541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 191541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 201541Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 211541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 221541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 231541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 241541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 251541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 261541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 271541Srgrimes * SUCH DAMAGE. 
281541Srgrimes * 2910965Swollman * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 301541Srgrimes */ 311541Srgrimes 32172467Ssilby#include <sys/cdefs.h> 33172467Ssilby__FBSDID("$FreeBSD$"); 34172467Ssilby 35125680Sbms#include "opt_inet.h" 3655679Sshin#include "opt_inet6.h" 3756041Sshin#include "opt_ipsec.h" 38254889Smarkj#include "opt_kdtrace.h" 3929514Sjoerg#include "opt_tcpdebug.h" 4029514Sjoerg 411541Srgrimes#include <sys/param.h> 421541Srgrimes#include <sys/systm.h> 4376166Smarkm#include <sys/domain.h> 44216758Slstewart#include <sys/hhook.h> 4547547Sdg#include <sys/kernel.h> 4676166Smarkm#include <sys/lock.h> 471541Srgrimes#include <sys/mbuf.h> 4876166Smarkm#include <sys/mutex.h> 491541Srgrimes#include <sys/protosw.h> 50254889Smarkj#include <sys/sdt.h> 511541Srgrimes#include <sys/socket.h> 521541Srgrimes#include <sys/socketvar.h> 5376166Smarkm#include <sys/sysctl.h> 541541Srgrimes 55185571Sbz#include <net/if.h> 561541Srgrimes#include <net/route.h> 57196019Srwatson#include <net/vnet.h> 581541Srgrimes 59215166Slstewart#include <netinet/cc.h> 601541Srgrimes#include <netinet/in.h> 61254889Smarkj#include <netinet/in_kdtrace.h> 621541Srgrimes#include <netinet/in_systm.h> 631541Srgrimes#include <netinet/ip.h> 641541Srgrimes#include <netinet/in_pcb.h> 6562587Sitojun#include <netinet/ip_var.h> 66152592Sandre#include <netinet/ip_options.h> 6755679Sshin#ifdef INET6 6855679Sshin#include <netinet6/in6_pcb.h> 6962587Sitojun#include <netinet/ip6.h> 7055679Sshin#include <netinet6/ip6_var.h> 7155679Sshin#endif 721541Srgrimes#define TCPOUTFLAGS 731541Srgrimes#include <netinet/tcp_fsm.h> 741541Srgrimes#include <netinet/tcp_seq.h> 751541Srgrimes#include <netinet/tcp_timer.h> 761541Srgrimes#include <netinet/tcp_var.h> 771541Srgrimes#include <netinet/tcpip.h> 782788Sdg#ifdef TCPDEBUG 791541Srgrimes#include <netinet/tcp_debug.h> 802788Sdg#endif 81237263Snp#ifdef TCP_OFFLOAD 82237263Snp#include <netinet/tcp_offload.h> 83237263Snp#endif 841541Srgrimes 85171167Sgnn#ifdef IPSEC 
86105199Ssam#include <netipsec/ipsec.h> 87171167Sgnn#endif /*IPSEC*/ 88105199Ssam 8958698Sjlemon#include <machine/in_cksum.h> 9058698Sjlemon 91163606Srwatson#include <security/mac/mac_framework.h> 92163606Srwatson 93207369SbzVNET_DEFINE(int, path_mtu_discovery) = 1; 94195699SrwatsonSYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, 95195699Srwatson &VNET_NAME(path_mtu_discovery), 1, 96195699Srwatson "Enable Path MTU Discovery"); 971541Srgrimes 98207369SbzVNET_DEFINE(int, tcp_do_tso) = 1; 99207369Sbz#define V_tcp_do_tso VNET(tcp_do_tso) 100195699SrwatsonSYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, 101195699Srwatson &VNET_NAME(tcp_do_tso), 0, 102195699Srwatson "Enable TCP Segmentation Offload"); 103162110Sandre 104226448SandreVNET_DEFINE(int, tcp_sendspace) = 1024*32; 105226448Sandre#define V_tcp_sendspace VNET(tcp_sendspace) 106227034SpluknetSYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, 107226448Sandre &VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size"); 108226448Sandre 109207369SbzVNET_DEFINE(int, tcp_do_autosndbuf) = 1; 110207369Sbz#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) 111195699SrwatsonSYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW, 112195699Srwatson &VNET_NAME(tcp_do_autosndbuf), 0, 113195699Srwatson "Enable automatic send buffer sizing"); 114166405Sandre 115207369SbzVNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024; 116207369Sbz#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) 117195699SrwatsonSYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW, 118195699Srwatson &VNET_NAME(tcp_autosndbuf_inc), 0, 119183550Szec "Incrementor step size of automatic send buffer"); 120166405Sandre 121225169SbzVNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024; 122207369Sbz#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) 123195699SrwatsonSYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW, 124195699Srwatson &VNET_NAME(tcp_autosndbuf_max), 0, 
125183550Szec "Max size of automatic send buffer"); 126166405Sandre 127216758Slstewartstatic void inline hhook_run_tcp_est_out(struct tcpcb *tp, 128216758Slstewart struct tcphdr *th, struct tcpopt *to, 129216758Slstewart long len, int tso); 130215166Slstewartstatic void inline cc_after_idle(struct tcpcb *tp); 131166405Sandre 1321541Srgrimes/* 133242692Skevlo * Wrapper for the TCP established output helper hook. 134216758Slstewart */ 135216758Slstewartstatic void inline 136216758Slstewarthhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, 137216758Slstewart struct tcpopt *to, long len, int tso) 138216758Slstewart{ 139216758Slstewart struct tcp_hhook_data hhook_data; 140216758Slstewart 141216758Slstewart if (V_tcp_hhh[HHOOK_TCP_EST_OUT]->hhh_nhooks > 0) { 142216758Slstewart hhook_data.tp = tp; 143216758Slstewart hhook_data.th = th; 144216758Slstewart hhook_data.to = to; 145216758Slstewart hhook_data.len = len; 146216758Slstewart hhook_data.tso = tso; 147216758Slstewart 148216758Slstewart hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_OUT], &hhook_data, 149216758Slstewart tp->osd); 150216758Slstewart } 151216758Slstewart} 152216758Slstewart 153216758Slstewart/* 154215166Slstewart * CC wrapper hook functions 155215166Slstewart */ 156215166Slstewartstatic void inline 157215166Slstewartcc_after_idle(struct tcpcb *tp) 158215166Slstewart{ 159215166Slstewart INP_WLOCK_ASSERT(tp->t_inpcb); 160215166Slstewart 161215166Slstewart if (CC_ALGO(tp)->after_idle != NULL) 162215166Slstewart CC_ALGO(tp)->after_idle(tp->ccv); 163215166Slstewart} 164215166Slstewart 165215166Slstewart/* 1661541Srgrimes * Tcp output routine: figure out what should be sent and send it. 
1671541Srgrimes */ 1681541Srgrimesint 16998704Sluigitcp_output(struct tcpcb *tp) 1701541Srgrimes{ 17198704Sluigi struct socket *so = tp->t_inpcb->inp_socket; 172124849Sandre long len, recwin, sendwin; 173221250Sbz int off, flags, error = 0; /* Keep compiler happy */ 17498704Sluigi struct mbuf *m; 17555679Sshin struct ip *ip = NULL; 17698704Sluigi struct ipovly *ipov = NULL; 17798704Sluigi struct tcphdr *th; 1786283Swollman u_char opt[TCP_MAXOLEN]; 17936335Sfenner unsigned ipoptlen, optlen, hdrlen; 180173835Sbz#ifdef IPSEC 181173835Sbz unsigned ipsec_optlen = 0; 182173835Sbz#endif 1831541Srgrimes int idle, sendalot; 184167606Sandre int sack_rxmit, sack_bytes_rxmt; 185130989Sps struct sackhole *p; 186238516Sglebius int tso, mtu; 187167606Sandre struct tcpopt to; 18887193Sdillon#if 0 18960067Sjlemon int maxburst = TCP_MAXBURST; 19087193Sdillon#endif 19155679Sshin#ifdef INET6 19298704Sluigi struct ip6_hdr *ip6 = NULL; 19355679Sshin int isipv6; 1941541Srgrimes 19555679Sshin isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 19655679Sshin#endif 19755679Sshin 198178285Srwatson INP_WLOCK_ASSERT(tp->t_inpcb); 199101713Sjennifer 200237263Snp#ifdef TCP_OFFLOAD 201237263Snp if (tp->t_flags & TF_TOE) 202237263Snp return (tcp_offload_output(tp)); 203237263Snp#endif 204237263Snp 2051541Srgrimes /* 2061541Srgrimes * Determine length of data that should be transmitted, 2071541Srgrimes * and flags that will be used. 2081541Srgrimes * If there is some data or critical controls (SYN, RST) 2091541Srgrimes * to send, then transmit; otherwise, investigate further. 
2101541Srgrimes */ 21184564Sjayanth idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); 212216105Slstewart if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur) 213216105Slstewart cc_after_idle(tp); 21484564Sjayanth tp->t_flags &= ~TF_LASTIDLE; 21584564Sjayanth if (idle) { 21684564Sjayanth if (tp->t_flags & TF_MORETOCOME) { 21784564Sjayanth tp->t_flags |= TF_LASTIDLE; 21884564Sjayanth idle = 0; 21984564Sjayanth } 22084564Sjayanth } 2211541Srgrimesagain: 222130989Sps /* 223130989Sps * If we've recently taken a timeout, snd_max will be greater than 224130989Sps * snd_nxt. There may be SACK information that allows us to avoid 225130989Sps * resending already delivered data. Adjust snd_nxt accordingly. 226130989Sps */ 227169317Sandre if ((tp->t_flags & TF_SACK_PERMIT) && 228169317Sandre SEQ_LT(tp->snd_nxt, tp->snd_max)) 229130989Sps tcp_sack_adjust(tp); 2301541Srgrimes sendalot = 0; 231211317Sandre tso = 0; 232238516Sglebius mtu = 0; 2331541Srgrimes off = tp->snd_nxt - tp->snd_una; 234124849Sandre sendwin = min(tp->snd_wnd, tp->snd_cwnd); 2351541Srgrimes 2361541Srgrimes flags = tcp_outflags[tp->t_state]; 2371541Srgrimes /* 238130989Sps * Send any SACK-generated retransmissions. If we're explicitly trying 239130989Sps * to send out new data (when sendalot is 1), bypass this function. 240130989Sps * If we retransmit in fast recovery mode, decrement snd_cwnd, since 241130989Sps * we're replacing a (future) new transmission with a retransmission 242130989Sps * now, and we previously incremented snd_cwnd in tcp_input(). 243130989Sps */ 244133874Srwatson /* 245130989Sps * Still in sack recovery , reset rxmit flag to zero. 
246130989Sps */ 247130989Sps sack_rxmit = 0; 248136151Sps sack_bytes_rxmt = 0; 249130989Sps len = 0; 250130989Sps p = NULL; 251215166Slstewart if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) && 252136151Sps (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { 253136151Sps long cwin; 254136151Sps 255136151Sps cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt; 256136151Sps if (cwin < 0) 257136151Sps cwin = 0; 258132417Sjayanth /* Do not retransmit SACK segments beyond snd_recover */ 259132417Sjayanth if (SEQ_GT(p->end, tp->snd_recover)) { 260132417Sjayanth /* 261133874Srwatson * (At least) part of sack hole extends beyond 262133874Srwatson * snd_recover. Check to see if we can rexmit data 263132417Sjayanth * for this hole. 264132417Sjayanth */ 265132417Sjayanth if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { 266133874Srwatson /* 267132417Sjayanth * Can't rexmit any more data for this hole. 268133874Srwatson * That data will be rexmitted in the next 269133874Srwatson * sack recovery episode, when snd_recover 270132417Sjayanth * moves past p->rxmit. 
271132417Sjayanth */ 272132417Sjayanth p = NULL; 273132417Sjayanth goto after_sack_rexmit; 274132417Sjayanth } else 275132417Sjayanth /* Can rexmit part of the current hole */ 276136151Sps len = ((long)ulmin(cwin, 277136151Sps tp->snd_recover - p->rxmit)); 278132417Sjayanth } else 279136151Sps len = ((long)ulmin(cwin, p->end - p->rxmit)); 280130989Sps off = p->rxmit - tp->snd_una; 281133874Srwatson KASSERT(off >= 0,("%s: sack block to the left of una : %d", 282132417Sjayanth __func__, off)); 283130989Sps if (len > 0) { 284137066Srwatson sack_rxmit = 1; 285137066Srwatson sendalot = 1; 286190948Srwatson TCPSTAT_INC(tcps_sack_rexmits); 287190948Srwatson TCPSTAT_ADD(tcps_sack_rexmit_bytes, 288190948Srwatson min(len, tp->t_maxseg)); 289130989Sps } 290130989Sps } 291132417Sjayanthafter_sack_rexmit: 292130989Sps /* 2936283Swollman * Get standard flags, and add SYN or FIN if requested by 'hidden' 2946283Swollman * state flags. 2956283Swollman */ 2966283Swollman if (tp->t_flags & TF_NEEDFIN) 2976283Swollman flags |= TH_FIN; 2986283Swollman if (tp->t_flags & TF_NEEDSYN) 2996283Swollman flags |= TH_SYN; 3006283Swollman 301136327Srwatson SOCKBUF_LOCK(&so->so_snd); 3026283Swollman /* 3031541Srgrimes * If in persist timeout with window of 0, send 1 byte. 3041541Srgrimes * Otherwise, if window is small but nonzero 3051541Srgrimes * and timer expired, we will send what we can 3061541Srgrimes * and go to transmit state. 3071541Srgrimes */ 308146463Sps if (tp->t_flags & TF_FORCEDATA) { 309124849Sandre if (sendwin == 0) { 3101541Srgrimes /* 3111541Srgrimes * If we still have some data to send, then 3121541Srgrimes * clear the FIN bit. Usually this would 3131541Srgrimes * happen below when it realizes that we 3141541Srgrimes * aren't sending all the data. 
However, 31545439Sjulian * if we have exactly 1 byte of unsent data, 3161541Srgrimes * then it won't clear the FIN bit below, 3171541Srgrimes * and if we are in persist state, we wind 3181541Srgrimes * up sending the packet without recording 3191541Srgrimes * that we sent the FIN bit. 3201541Srgrimes * 3211541Srgrimes * We can't just blindly clear the FIN bit, 3221541Srgrimes * because if we don't have any more data 3231541Srgrimes * to send then the probe will be the FIN 3241541Srgrimes * itself. 3251541Srgrimes */ 3261541Srgrimes if (off < so->so_snd.sb_cc) 3271541Srgrimes flags &= ~TH_FIN; 328124849Sandre sendwin = 1; 3291541Srgrimes } else { 330168615Sandre tcp_timer_activate(tp, TT_PERSIST, 0); 3311541Srgrimes tp->t_rxtshift = 0; 3321541Srgrimes } 3331541Srgrimes } 3341541Srgrimes 335104815Sdillon /* 336133874Srwatson * If snd_nxt == snd_max and we have transmitted a FIN, the 337104815Sdillon * offset will be > 0 even if so_snd.sb_cc is 0, resulting in 338124849Sandre * a negative length. This can also occur when TCP opens up 339104815Sdillon * its congestion window while receiving additional duplicate 340104815Sdillon * acks after fast-retransmit because TCP will reset snd_nxt 341104815Sdillon * to snd_max after the fast-retransmit. 342104815Sdillon * 343104815Sdillon * In the normal retransmit-FIN-only case, however, snd_nxt will 344104815Sdillon * be set to snd_una, the offset will be 0, and the length may 345104815Sdillon * wind up 0. 346133874Srwatson * 347130989Sps * If sack_rxmit is true we are retransmitting from the scoreboard 348133874Srwatson * in which case len is already set. 
349104815Sdillon */ 350136151Sps if (sack_rxmit == 0) { 351136151Sps if (sack_bytes_rxmt == 0) 352136151Sps len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off); 353136151Sps else { 354136151Sps long cwin; 3551541Srgrimes 356136151Sps /* 357136151Sps * We are inside of a SACK recovery episode and are 358136151Sps * sending new data, having retransmitted all the 359136151Sps * data possible in the scoreboard. 360136151Sps */ 361138199Sps len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd) 362138199Sps - off); 363140138Sps /* 364140138Sps * Don't remove this (len > 0) check ! 365140138Sps * We explicitly check for len > 0 here (although it 366140138Sps * isn't really necessary), to work around a gcc 367140138Sps * optimization issue - to force gcc to compute 368140138Sps * len above. Without this check, the computation 369140138Sps * of len is bungled by the optimizer. 370140138Sps */ 371140138Sps if (len > 0) { 372140138Sps cwin = tp->snd_cwnd - 373140138Sps (tp->snd_nxt - tp->sack_newdata) - 374140138Sps sack_bytes_rxmt; 375140138Sps if (cwin < 0) 376140138Sps cwin = 0; 377140138Sps len = lmin(len, cwin); 378140138Sps } 379136151Sps } 380136151Sps } 381136151Sps 3826283Swollman /* 3836283Swollman * Lop off SYN bit if it has already been sent. However, if this 3846283Swollman * is SYN-SENT state and if segment contains data and if we don't 3856283Swollman * know that foreign host supports TAO, suppress sending segment. 3866283Swollman */ 3876283Swollman if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { 388155961Sqingli if (tp->t_state != TCPS_SYN_RECEIVED) 389155961Sqingli flags &= ~TH_SYN; 3906283Swollman off--, len++; 3916283Swollman } 3926283Swollman 39313475Solah /* 394137139Sandre * Be careful not to send data and/or FIN on SYN segments. 39513475Solah * This measure is needed to prevent interoperability problems 39613475Solah * with not fully conformant TCP implementations. 
39713475Solah */ 398137139Sandre if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 39913475Solah len = 0; 40013475Solah flags &= ~TH_FIN; 40113475Solah } 40213475Solah 403285780Sdelphij if (len <= 0) { 4041541Srgrimes /* 4051541Srgrimes * If FIN has been sent but not acked, 4061541Srgrimes * but we haven't been called to retransmit, 407104815Sdillon * len will be < 0. Otherwise, window shrank 4081541Srgrimes * after we sent into it. If window shrank to 0, 40915262Sdg * cancel pending retransmit, pull snd_nxt back 41015262Sdg * to (closed) window, and set the persist timer 41115262Sdg * if it isn't already going. If the window didn't 41215262Sdg * close completely, just wait for an ACK. 413285780Sdelphij * 414285780Sdelphij * We also do a general check here to ensure that 415285780Sdelphij * we will set the persist timer when we have data 416285780Sdelphij * to send, but a 0-byte window. This makes sure 417285780Sdelphij * the persist timer is set even if the packet 418285780Sdelphij * hits one of the "goto send" lines below. 4191541Srgrimes */ 4201541Srgrimes len = 0; 421285780Sdelphij if ((sendwin == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) && 422285780Sdelphij (off < (int) so->so_snd.sb_cc)) { 423168615Sandre tcp_timer_activate(tp, TT_REXMT, 0); 42415262Sdg tp->t_rxtshift = 0; 4251541Srgrimes tp->snd_nxt = tp->snd_una; 426168615Sandre if (!tcp_timer_active(tp, TT_PERSIST)) 42715262Sdg tcp_setpersist(tp); 4281541Srgrimes } 4291541Srgrimes } 430104815Sdillon 431166405Sandre /* len will be >= 0 after this point. */ 432182841Sbz KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 433166405Sandre 434104815Sdillon /* 435166405Sandre * Automatic sizing of send socket buffer. Often the send buffer 436166405Sandre * size is not optimally adjusted to the actual network conditions 437166405Sandre * at hand (delay bandwidth product). 
	 * Setting the buffer size too
	 * small limits throughput on links with high bandwidth and high
	 * delay (e.g. trans-continental/oceanic links). Setting the
	 * buffer size too big consumes too much real kernel memory,
	 * especially with many connections on busy servers.
	 *
	 * The criteria to step up the send buffer one notch are:
	 *  1. receive window of remote host is larger than send buffer
	 *     (with a fudge factor of 5/4th);
	 *  2. send buffer is filled to 7/8th with data (so we actually
	 *     have data to make use of it);
	 *  3. send buffer fill has not hit maximal automatic size;
	 *  4. our send window (slow start and congestion controlled) is
	 *     larger than sent but unacknowledged data in send buffer.
	 *
	 * The remote host receive window scaling factor may limit the
	 * growing of the send buffer before it reaches its allowed
	 * maximum.
	 *
	 * It scales directly with slow start or congestion window
	 * and does at most one step per received ACK. This fast
	 * scaling has the drawback of growing the send buffer beyond
	 * what is strictly necessary to make full use of a given
	 * delay*bandwidth product. However testing has shown this not
	 * to be much of a problem. At worst we are trading wasting
	 * of available bandwidth (the non-use of it) for wasting some
	 * socket buffer memory.
	 *
	 * TODO: Shrink send buffer during idle periods together
	 * with congestion window. Requires another timer. Has to
	 * wait for upcoming tcp timer rewrite.
468166405Sandre */ 469181803Sbz if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { 470166405Sandre if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && 471166405Sandre so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) && 472181803Sbz so->so_snd.sb_cc < V_tcp_autosndbuf_max && 473166405Sandre sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) { 474166405Sandre if (!sbreserve_locked(&so->so_snd, 475181803Sbz min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc, 476181803Sbz V_tcp_autosndbuf_max), so, curthread)) 477166405Sandre so->so_snd.sb_flags &= ~SB_AUTOSIZE; 478166405Sandre } 479166405Sandre } 480166405Sandre 481166405Sandre /* 482212803Sandre * Decide if we can use TCP Segmentation Offloading (if supported by 483212803Sandre * hardware). 484166405Sandre * 485162110Sandre * TSO may only be used if we are in a pure bulk sending state. The 486162110Sandre * presence of TCP-MD5, SACK retransmits, SACK advertizements and 487162110Sandre * IP options prevent using TSO. With TSO the TCP header is the same 488162110Sandre * (except for the sequence number) for all generated packets. This 489162110Sandre * makes it impossible to transmit any options which vary per generated 490162110Sandre * segment or packet. 491104815Sdillon */ 492173835Sbz#ifdef IPSEC 493173835Sbz /* 494173835Sbz * Pre-calculate here as we save another lookup into the darknesses 495173835Sbz * of IPsec that way and can actually decide if TSO is ok. 
496173835Sbz */ 497173835Sbz ipsec_optlen = ipsec_hdrsiz_tcp(tp); 498173835Sbz#endif 499212803Sandre if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && 500212803Sandre ((tp->t_flags & TF_SIGNATURE) == 0) && 501212803Sandre tp->rcv_numsacks == 0 && sack_rxmit == 0 && 502173835Sbz#ifdef IPSEC 503212803Sandre ipsec_optlen == 0 && 504173835Sbz#endif 505212803Sandre tp->t_inpcb->inp_options == NULL && 506212803Sandre tp->t_inpcb->in6p_options == NULL) 507212803Sandre tso = 1; 508211317Sandre 509132717Sjayanth if (sack_rxmit) { 510132717Sjayanth if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc)) 511132717Sjayanth flags &= ~TH_FIN; 512133874Srwatson } else { 513132717Sjayanth if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) 514132717Sjayanth flags &= ~TH_FIN; 515132717Sjayanth } 5161541Srgrimes 517124849Sandre recwin = sbspace(&so->so_rcv); 5181541Srgrimes 5191541Srgrimes /* 52087193Sdillon * Sender silly window avoidance. We transmit under the following 52187193Sdillon * conditions when len is non-zero: 52287193Sdillon * 523162110Sandre * - We have a full segment (or more with TSO) 52487779Sjlemon * - This is the last buffer in a write()/send() and we are 52587779Sjlemon * either idle or running NODELAY 52687779Sjlemon * - we've timed out (e.g. persist timer) 52787779Sjlemon * - we have more then 1/2 the maximum send window's worth of 52887779Sjlemon * data (receiver may be limited the window size) 52987779Sjlemon * - we need to retransmit 5301541Srgrimes */ 5311541Srgrimes if (len) { 532162110Sandre if (len >= tp->t_maxseg) 5331541Srgrimes goto send; 53487193Sdillon /* 53587193Sdillon * NOTE! on localhost connections an 'ack' from the remote 53687193Sdillon * end may occur synchronously with the output and cause 53787193Sdillon * us to flush a buffer queued with moretocome. XXX 53887193Sdillon * 53987193Sdillon * note: the len + off check is almost certainly unnecessary. 
54087193Sdillon */ 54187779Sjlemon if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ 54287193Sdillon (idle || (tp->t_flags & TF_NODELAY)) && 54387193Sdillon len + off >= so->so_snd.sb_cc && 54487193Sdillon (tp->t_flags & TF_NOPUSH) == 0) { 5451541Srgrimes goto send; 54687193Sdillon } 547146463Sps if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */ 5481541Srgrimes goto send; 5496283Swollman if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) 5501541Srgrimes goto send; 55187193Sdillon if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ 5521541Srgrimes goto send; 553130989Sps if (sack_rxmit) 554130989Sps goto send; 5551541Srgrimes } 5561541Srgrimes 5571541Srgrimes /* 558242252Sandre * Sending of standalone window updates. 559242251Sandre * 560242311Sandre * Window updates are important when we close our window due to a 561242311Sandre * full socket buffer and are opening it again after the application 562242252Sandre * reads data from it. Once the window has opened again and the 563242252Sandre * remote end starts to send again the ACK clock takes over and 564242252Sandre * provides the most current window information. 565242252Sandre * 566242311Sandre * We must avoid the silly window syndrome whereas every read 567242252Sandre * from the receive buffer, no matter how small, causes a window 568242252Sandre * update to be sent. We also should avoid sending a flurry of 569242252Sandre * window updates when the socket buffer had queued a lot of data 570242252Sandre * and the application is doing small reads. 571242252Sandre * 572242252Sandre * Prevent a flurry of pointless window updates by only sending 573242252Sandre * an update when we can increase the advertized window by more 574242252Sandre * than 1/4th of the socket buffer capacity. When the buffer is 575242252Sandre * getting full or is very small be more aggressive and send an 576242252Sandre * update whenever we can increase by two mss sized segments. 
577242252Sandre * In all other situations the ACK's to new incoming data will 578242252Sandre * carry further window increases. 579242252Sandre * 580242251Sandre * Don't send an independent window update if a delayed 581242251Sandre * ACK is pending (it will get piggy-backed on it) or the 582242251Sandre * remote side already has done a half-close and won't send 583242252Sandre * more data. Skip this if the connection is in T/TCP 584242252Sandre * half-open state. 5851541Srgrimes */ 586170467Sandre if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && 587242251Sandre !(tp->t_flags & TF_DELACK) && 588170467Sandre !TCPS_HAVERCVDFIN(tp->t_state)) { 5898876Srgrimes /* 590242252Sandre * "adv" is the amount we could increase the window, 5911541Srgrimes * taking into account that we are limited by 5921541Srgrimes * TCP_MAXWIN << tp->rcv_scale. 5931541Srgrimes */ 594221346Sjhb long adv; 595221346Sjhb int oldwin; 5961541Srgrimes 597221346Sjhb adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale); 598221346Sjhb if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { 599221346Sjhb oldwin = (tp->rcv_adv - tp->rcv_nxt); 600221346Sjhb adv -= oldwin; 601221346Sjhb } else 602221346Sjhb oldwin = 0; 603221346Sjhb 604220794Sjhb /* 605220794Sjhb * If the new window size ends up being the same as the old 606220794Sjhb * size when it is scaled, then don't force a window update. 607220794Sjhb */ 608221346Sjhb if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale) 609220794Sjhb goto dontupdate; 610242252Sandre 611242252Sandre if (adv >= (long)(2 * tp->t_maxseg) && 612242252Sandre (adv >= (long)(so->so_rcv.sb_hiwat / 4) || 613242252Sandre recwin <= (long)(so->so_rcv.sb_hiwat / 8) || 614242252Sandre so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) 6151541Srgrimes goto send; 6161541Srgrimes } 617220794Sjhbdontupdate: 6181541Srgrimes 6191541Srgrimes /* 620104815Sdillon * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW 621104815Sdillon * is also a catch-all for the retransmit timer timeout case. 
6221541Srgrimes */ 6231541Srgrimes if (tp->t_flags & TF_ACKNOW) 6241541Srgrimes goto send; 6256283Swollman if ((flags & TH_RST) || 6266283Swollman ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) 6271541Srgrimes goto send; 6281541Srgrimes if (SEQ_GT(tp->snd_up, tp->snd_una)) 6291541Srgrimes goto send; 6301541Srgrimes /* 6311541Srgrimes * If our state indicates that FIN should be sent 632104815Sdillon * and we have not yet done so, then we need to send. 6331541Srgrimes */ 6341541Srgrimes if (flags & TH_FIN && 6351541Srgrimes ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) 6361541Srgrimes goto send; 6371541Srgrimes /* 638130989Sps * In SACK, it is possible for tcp_output to fail to send a segment 639130989Sps * after the retransmission timer has been turned off. Make sure 640130989Sps * that the retransmission timer is set. 641130989Sps */ 642169317Sandre if ((tp->t_flags & TF_SACK_PERMIT) && 643169317Sandre SEQ_GT(tp->snd_max, tp->snd_una) && 644168615Sandre !tcp_timer_active(tp, TT_REXMT) && 645168615Sandre !tcp_timer_active(tp, TT_PERSIST)) { 646168615Sandre tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); 647136327Srwatson goto just_return; 648133874Srwatson } 649130989Sps /* 6501541Srgrimes * TCP window updates are not reliable, rather a polling protocol 6511541Srgrimes * using ``persist'' packets is used to insure receipt of window 6521541Srgrimes * updates. The three ``states'' for the output side are: 6531541Srgrimes * idle not doing retransmits or persists 6541541Srgrimes * persisting to move a small or zero window 6551541Srgrimes * (re)transmitting and thereby not persisting 6561541Srgrimes * 657168615Sandre * tcp_timer_active(tp, TT_PERSIST) 65850673Sjlemon * is true when we are in persist state. 659146463Sps * (tp->t_flags & TF_FORCEDATA) 6601541Srgrimes * is set when we are called to send a persist packet. 
661168615Sandre * tcp_timer_active(tp, TT_REXMT) 6621541Srgrimes * is set when we are retransmitting 6631541Srgrimes * The output side is idle when both timers are zero. 6641541Srgrimes * 6651541Srgrimes * If send window is too small, there is data to transmit, and no 6661541Srgrimes * retransmit or persist is pending, then go to persist state. 6671541Srgrimes * If nothing happens soon, send when timer expires: 6681541Srgrimes * if window is nonzero, transmit what we can, 6691541Srgrimes * otherwise force out a byte. 6701541Srgrimes */ 671168615Sandre if (so->so_snd.sb_cc && !tcp_timer_active(tp, TT_REXMT) && 672168615Sandre !tcp_timer_active(tp, TT_PERSIST)) { 6731541Srgrimes tp->t_rxtshift = 0; 6741541Srgrimes tcp_setpersist(tp); 6751541Srgrimes } 6761541Srgrimes 6771541Srgrimes /* 6781541Srgrimes * No reason to send a segment, just return. 6791541Srgrimes */ 680136327Srwatsonjust_return: 681136327Srwatson SOCKBUF_UNLOCK(&so->so_snd); 6821541Srgrimes return (0); 6831541Srgrimes 6841541Srgrimessend: 685136327Srwatson SOCKBUF_LOCK_ASSERT(&so->so_snd); 6861541Srgrimes /* 6871541Srgrimes * Before ESTABLISHED, force sending of initial options 6881541Srgrimes * unless TCP set not to do any options. 6891541Srgrimes * NOTE: we assume that the IP/TCP header plus TCP options 6901541Srgrimes * always fit in a single mbuf, leaving room for a maximum 6911541Srgrimes * link header, i.e. 69278064Sume * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES 6931541Srgrimes */ 6941541Srgrimes optlen = 0; 69555679Sshin#ifdef INET6 69655679Sshin if (isipv6) 69755679Sshin hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); 69855679Sshin else 69955679Sshin#endif 700221250Sbz hdrlen = sizeof (struct tcpiphdr); 7011541Srgrimes 702167606Sandre /* 703167606Sandre * Compute options for segment. 704167606Sandre * We only have to care about SYN and established connection 705167606Sandre * segments. Options for SYN-ACK segments are handled in TCP 706167606Sandre * syncache. 
707167606Sandre */ 708293894Sglebius to.to_flags = 0; 709167606Sandre if ((tp->t_flags & TF_NOOPT) == 0) { 710167606Sandre /* Maximum segment size. */ 711167606Sandre if (flags & TH_SYN) { 712167606Sandre tp->snd_nxt = tp->iss; 713167606Sandre to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc); 714167606Sandre to.to_flags |= TOF_MSS; 715167606Sandre } 716167606Sandre /* Window scaling. */ 717167606Sandre if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 718167606Sandre to.to_wscale = tp->request_r_scale; 719167606Sandre to.to_flags |= TOF_SCALE; 720167606Sandre } 721167606Sandre /* Timestamps. */ 722167606Sandre if ((tp->t_flags & TF_RCVD_TSTMP) || 723167606Sandre ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 724231767Sbz to.to_tsval = tcp_ts_getticks() + tp->ts_offset; 725167606Sandre to.to_tsecr = tp->ts_recent; 726167606Sandre to.to_flags |= TOF_TS; 727167606Sandre /* Set receive buffer autosizing timestamp. */ 728167606Sandre if (tp->rfbuf_ts == 0 && 729167606Sandre (so->so_rcv.sb_flags & SB_AUTOSIZE)) 730231767Sbz tp->rfbuf_ts = tcp_ts_getticks(); 731167606Sandre } 732167606Sandre /* Selective ACK's. */ 733169317Sandre if (tp->t_flags & TF_SACK_PERMIT) { 734167606Sandre if (flags & TH_SYN) 735167606Sandre to.to_flags |= TOF_SACKPERM; 736167606Sandre else if (TCPS_HAVEESTABLISHED(tp->t_state) && 737167606Sandre (tp->t_flags & TF_SACK_PERMIT) && 738167606Sandre tp->rcv_numsacks > 0) { 739167606Sandre to.to_flags |= TOF_SACK; 740167606Sandre to.to_nsacks = tp->rcv_numsacks; 741167606Sandre to.to_sacks = (u_char *)tp->sackblks; 7421541Srgrimes } 7431541Srgrimes } 744125680Sbms#ifdef TCP_SIGNATURE 745167606Sandre /* TCP-MD5 (RFC2385). */ 746167606Sandre if (tp->t_flags & TF_SIGNATURE) 747167606Sandre to.to_flags |= TOF_SIGNATURE; 748125680Sbms#endif /* TCP_SIGNATURE */ 749125680Sbms 750167606Sandre /* Processing the options. 
*/ 751174023Sbz hdrlen += optlen = tcp_addoptions(&to, opt); 752145372Sps } 753145372Sps 75455679Sshin#ifdef INET6 75555679Sshin if (isipv6) 75655679Sshin ipoptlen = ip6_optlen(tp->t_inpcb); 75755679Sshin else 75855679Sshin#endif 75998704Sluigi if (tp->t_inpcb->inp_options) 76036335Sfenner ipoptlen = tp->t_inpcb->inp_options->m_len - 76136335Sfenner offsetof(struct ipoption, ipopt_list); 76298704Sluigi else 76336335Sfenner ipoptlen = 0; 764171167Sgnn#ifdef IPSEC 765173835Sbz ipoptlen += ipsec_optlen; 76655679Sshin#endif 76736335Sfenner 7681541Srgrimes /* 7691541Srgrimes * Adjust data length if insertion of options will 7706283Swollman * bump the packet length beyond the t_maxopd length. 7716283Swollman * Clear the FIN bit because we cut off the tail of 7726283Swollman * the segment. 7731541Srgrimes */ 77436335Sfenner if (len + optlen + ipoptlen > tp->t_maxopd) { 7755802Sdg flags &= ~TH_FIN; 776212803Sandre 777162110Sandre if (tso) { 778212803Sandre KASSERT(ipoptlen == 0, 779212803Sandre ("%s: TSO can't do IP options", __func__)); 780212803Sandre 781212803Sandre /* 782251296Sandre * Limit a burst to t_tsomax minus IP, 783212803Sandre * TCP and options length to keep ip->ip_len 784251296Sandre * from overflowing or exceeding the maximum 785251296Sandre * length allowed by the network interface. 786212803Sandre */ 787251296Sandre if (len > tp->t_tsomax - hdrlen) { 788251296Sandre len = tp->t_tsomax - hdrlen; 789162110Sandre sendalot = 1; 790212803Sandre } 791212803Sandre 792212803Sandre /* 793212803Sandre * Prevent the last segment from being 794212803Sandre * fractional unless the send sockbuf can 795212803Sandre * be emptied. 796212803Sandre */ 797212803Sandre if (sendalot && off + len < so->so_snd.sb_cc) { 798212803Sandre len -= len % (tp->t_maxopd - optlen); 799162110Sandre sendalot = 1; 800212803Sandre } 801212803Sandre 802212803Sandre /* 803212803Sandre * Send the FIN in a separate segment 804212803Sandre * after the bulk sending is done. 
805212803Sandre * We don't trust the TSO implementations 806212803Sandre * to clear the FIN flag on all but the 807212803Sandre * last segment. 808212803Sandre */ 809212803Sandre if (tp->t_flags & TF_NEEDFIN) 810212803Sandre sendalot = 1; 811212803Sandre 812162110Sandre } else { 813162110Sandre len = tp->t_maxopd - optlen - ipoptlen; 814162110Sandre sendalot = 1; 815162110Sandre } 816212803Sandre } else 817212803Sandre tso = 0; 8181541Srgrimes 819212803Sandre KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 820212803Sandre ("%s: len > IP_MAXPACKET", __func__)); 821212803Sandre 8226283Swollman/*#ifdef DIAGNOSTIC*/ 82357068Sshin#ifdef INET6 824133874Srwatson if (max_linkhdr + hdrlen > MCLBYTES) 82557068Sshin#else 826133874Srwatson if (max_linkhdr + hdrlen > MHLEN) 82798704Sluigi#endif 8281541Srgrimes panic("tcphdr too big"); 8296283Swollman/*#endif*/ 8301541Srgrimes 8311541Srgrimes /* 832182841Sbz * This KASSERT is here to catch edge cases at a well defined place. 833182841Sbz * Before, those had triggered (random) panic conditions further down. 834182841Sbz */ 835182841Sbz KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 836182841Sbz 837182841Sbz /* 8381541Srgrimes * Grab a header mbuf, attaching a copy of data to 8391541Srgrimes * be transmitted, and initialize the header from 8401541Srgrimes * the template for sends on this connection. 
8411541Srgrimes */ 8421541Srgrimes if (len) { 843167715Sandre struct mbuf *mb; 844167715Sandre u_int moff; 845167715Sandre 846146463Sps if ((tp->t_flags & TF_FORCEDATA) && len == 1) 847190948Srwatson TCPSTAT_INC(tcps_sndprobe); 848169682Sjhb else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { 849215434Sgnn tp->t_sndrexmitpack++; 850190948Srwatson TCPSTAT_INC(tcps_sndrexmitpack); 851190948Srwatson TCPSTAT_ADD(tcps_sndrexmitbyte, len); 8521541Srgrimes } else { 853190948Srwatson TCPSTAT_INC(tcps_sndpack); 854190948Srwatson TCPSTAT_ADD(tcps_sndbyte, len); 8551541Srgrimes } 856248323Sglebius#ifdef INET6 857248323Sglebius if (MHLEN < hdrlen + max_linkhdr) 858248323Sglebius m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 859248323Sglebius else 860248323Sglebius#endif 861248323Sglebius m = m_gethdr(M_NOWAIT, MT_DATA); 862248323Sglebius 8631541Srgrimes if (m == NULL) { 864136327Srwatson SOCKBUF_UNLOCK(&so->so_snd); 8651541Srgrimes error = ENOBUFS; 866249372Sglebius sack_rxmit = 0; 8671541Srgrimes goto out; 8681541Srgrimes } 869248323Sglebius 8701541Srgrimes m->m_data += max_linkhdr; 8711541Srgrimes m->m_len = hdrlen; 872167715Sandre 873167715Sandre /* 874167715Sandre * Start the m_copy functions from the closest mbuf 875167715Sandre * to the offset in the socket buffer chain. 
876167715Sandre */ 877167715Sandre mb = sbsndptr(&so->so_snd, off, len, &moff); 878167715Sandre 8791541Srgrimes if (len <= MHLEN - hdrlen - max_linkhdr) { 880167715Sandre m_copydata(mb, moff, (int)len, 8811541Srgrimes mtod(m, caddr_t) + hdrlen); 8821541Srgrimes m->m_len += len; 8831541Srgrimes } else { 884167715Sandre m->m_next = m_copy(mb, moff, (int)len); 885167715Sandre if (m->m_next == NULL) { 886136327Srwatson SOCKBUF_UNLOCK(&so->so_snd); 88710965Swollman (void) m_free(m); 88810712Swollman error = ENOBUFS; 889249372Sglebius sack_rxmit = 0; 89010712Swollman goto out; 89110712Swollman } 8921541Srgrimes } 893223799Scperciva 8941541Srgrimes /* 8951541Srgrimes * If we're sending everything we've got, set PUSH. 8961541Srgrimes * (This will keep happy those implementations which only 8971541Srgrimes * give data to the user when a buffer fills or 8981541Srgrimes * a PUSH comes in.) 8991541Srgrimes */ 9001541Srgrimes if (off + len == so->so_snd.sb_cc) 9011541Srgrimes flags |= TH_PUSH; 902136327Srwatson SOCKBUF_UNLOCK(&so->so_snd); 9031541Srgrimes } else { 904136327Srwatson SOCKBUF_UNLOCK(&so->so_snd); 9051541Srgrimes if (tp->t_flags & TF_ACKNOW) 906190948Srwatson TCPSTAT_INC(tcps_sndacks); 9071541Srgrimes else if (flags & (TH_SYN|TH_FIN|TH_RST)) 908190948Srwatson TCPSTAT_INC(tcps_sndctrl); 9091541Srgrimes else if (SEQ_GT(tp->snd_up, tp->snd_una)) 910190948Srwatson TCPSTAT_INC(tcps_sndurg); 9111541Srgrimes else 912190948Srwatson TCPSTAT_INC(tcps_sndwinup); 9131541Srgrimes 914248373Sglebius m = m_gethdr(M_NOWAIT, MT_DATA); 9151541Srgrimes if (m == NULL) { 9161541Srgrimes error = ENOBUFS; 917249372Sglebius sack_rxmit = 0; 9181541Srgrimes goto out; 9191541Srgrimes } 92055679Sshin#ifdef INET6 92155679Sshin if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 92255679Sshin MHLEN >= hdrlen) { 92355679Sshin MH_ALIGN(m, hdrlen); 92455679Sshin } else 92555679Sshin#endif 9261541Srgrimes m->m_data += max_linkhdr; 9271541Srgrimes m->m_len = hdrlen; 9281541Srgrimes } 929136327Srwatson 
SOCKBUF_UNLOCK_ASSERT(&so->so_snd); 9301541Srgrimes m->m_pkthdr.rcvif = (struct ifnet *)0; 931101106Srwatson#ifdef MAC 932172930Srwatson mac_inpcb_create_mbuf(tp->t_inpcb, m); 933101106Srwatson#endif 93455679Sshin#ifdef INET6 93555679Sshin if (isipv6) { 93655679Sshin ip6 = mtod(m, struct ip6_hdr *); 93755679Sshin th = (struct tcphdr *)(ip6 + 1); 938111144Sjlemon tcpip_fillheaders(tp->t_inpcb, ip6, th); 93955679Sshin } else 94055679Sshin#endif /* INET6 */ 941133874Srwatson { 942133874Srwatson ip = mtod(m, struct ip *); 943133874Srwatson ipov = (struct ipovly *)ip; 944133874Srwatson th = (struct tcphdr *)(ip + 1); 945133874Srwatson tcpip_fillheaders(tp->t_inpcb, ip, th); 946133874Srwatson } 9471541Srgrimes 9481541Srgrimes /* 9491541Srgrimes * Fill in fields, remembering maximum advertised 9501541Srgrimes * window for use in delaying messages about window sizes. 9511541Srgrimes * If resending a FIN, be sure not to use a new sequence number. 9521541Srgrimes */ 9538876Srgrimes if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 9541541Srgrimes tp->snd_nxt == tp->snd_max) 9551541Srgrimes tp->snd_nxt--; 9561541Srgrimes /* 957181056Srpaulo * If we are starting a connection, send ECN setup 958181056Srpaulo * SYN packet. If we are on a retransmit, we may 959181056Srpaulo * resend those bits a number of times as per 960181056Srpaulo * RFC 3168. 961181056Srpaulo */ 962181803Sbz if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { 963181056Srpaulo if (tp->t_rxtshift >= 1) { 964181803Sbz if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 965181056Srpaulo flags |= TH_ECE|TH_CWR; 966181056Srpaulo } else 967181056Srpaulo flags |= TH_ECE|TH_CWR; 968181056Srpaulo } 969181056Srpaulo 970181056Srpaulo if (tp->t_state == TCPS_ESTABLISHED && 971181056Srpaulo (tp->t_flags & TF_ECN_PERMIT)) { 972181056Srpaulo /* 973181056Srpaulo * If the peer has ECN, mark data packets with 974181056Srpaulo * ECN capable transmission (ECT). 
975181056Srpaulo * Ignore pure ack packets, retransmissions and window probes. 976181056Srpaulo */ 977181056Srpaulo if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && 978181056Srpaulo !((tp->t_flags & TF_FORCEDATA) && len == 1)) { 979181056Srpaulo#ifdef INET6 980181056Srpaulo if (isipv6) 981181056Srpaulo ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); 982181056Srpaulo else 983181056Srpaulo#endif 984181056Srpaulo ip->ip_tos |= IPTOS_ECN_ECT0; 985190948Srwatson TCPSTAT_INC(tcps_ecn_ect0); 986181056Srpaulo } 987181056Srpaulo 988181056Srpaulo /* 989181056Srpaulo * Reply with proper ECN notifications. 990181056Srpaulo */ 991181056Srpaulo if (tp->t_flags & TF_ECN_SND_CWR) { 992181056Srpaulo flags |= TH_CWR; 993181056Srpaulo tp->t_flags &= ~TF_ECN_SND_CWR; 994181056Srpaulo } 995181056Srpaulo if (tp->t_flags & TF_ECN_SND_ECE) 996181056Srpaulo flags |= TH_ECE; 997181056Srpaulo } 998181056Srpaulo 999181056Srpaulo /* 10001541Srgrimes * If we are doing retransmissions, then snd_nxt will 10011541Srgrimes * not reflect the first unsent octet. For ACK only 10021541Srgrimes * packets, we do not want the sequence number of the 10031541Srgrimes * retransmitted packet, we want the sequence number 10041541Srgrimes * of the next unsent octet. So, if there is no data 10051541Srgrimes * (and no SYN or FIN), use snd_max instead of snd_nxt 10061541Srgrimes * when filling in ti_seq. But if we are in persist 10071541Srgrimes * state, snd_max might reflect one byte beyond the 10081541Srgrimes * right edge of the window, so use snd_nxt in that 10091541Srgrimes * case, since we know we aren't doing a retransmission. 10101541Srgrimes * (retransmit and persist are mutually exclusive...) 
10111541Srgrimes */ 1012136151Sps if (sack_rxmit == 0) { 1013168615Sandre if (len || (flags & (TH_SYN|TH_FIN)) || 1014168615Sandre tcp_timer_active(tp, TT_PERSIST)) 1015136151Sps th->th_seq = htonl(tp->snd_nxt); 1016136151Sps else 1017136151Sps th->th_seq = htonl(tp->snd_max); 1018136151Sps } else { 1019130989Sps th->th_seq = htonl(p->rxmit); 1020130989Sps p->rxmit += len; 1021146123Sps tp->sackhint.sack_bytes_rexmit += len; 1022130989Sps } 102355679Sshin th->th_ack = htonl(tp->rcv_nxt); 10241541Srgrimes if (optlen) { 102555679Sshin bcopy(opt, th + 1, optlen); 102655679Sshin th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; 10271541Srgrimes } 102855679Sshin th->th_flags = flags; 10291541Srgrimes /* 10301541Srgrimes * Calculate receive window. Don't shrink window, 10311541Srgrimes * but avoid silly window syndrome. 10321541Srgrimes */ 1033124849Sandre if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && 1034124849Sandre recwin < (long)tp->t_maxseg) 1035124849Sandre recwin = 0; 1036221346Sjhb if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 1037221346Sjhb recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) 1038124849Sandre recwin = (long)(tp->rcv_adv - tp->rcv_nxt); 1039124849Sandre if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) 1040124849Sandre recwin = (long)TCP_MAXWIN << tp->rcv_scale; 104187193Sdillon 1042170470Sandre /* 1043170470Sandre * According to RFC1323 the window field in a SYN (i.e., a <SYN> 1044170470Sandre * or <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> 1045170470Sandre * case is handled in syncache. 1046170470Sandre */ 1047170470Sandre if (flags & TH_SYN) 1048170470Sandre th->th_win = htons((u_short) 1049170470Sandre (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 1050170470Sandre else 1051170470Sandre th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 105287193Sdillon 105387193Sdillon /* 105487193Sdillon * Adjust the RXWIN0SENT flag - indicate that we have advertised 105587193Sdillon * a 0 window. This may cause the remote transmitter to stall. 
This 105687193Sdillon * flag tells soreceive() to disable delayed acknowledgements when 105787193Sdillon * draining the buffer. This can occur if the receiver is attempting 1058180535Srpaulo * to read more data than can be buffered prior to transmitting on 105987193Sdillon * the connection. 106087193Sdillon */ 1061215434Sgnn if (th->th_win == 0) { 1062215434Sgnn tp->t_sndzerowin++; 106387193Sdillon tp->t_flags |= TF_RXWIN0SENT; 1064215434Sgnn } else 106587193Sdillon tp->t_flags &= ~TF_RXWIN0SENT; 10661541Srgrimes if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 106755679Sshin th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); 106855679Sshin th->th_flags |= TH_URG; 10691541Srgrimes } else 10701541Srgrimes /* 10711541Srgrimes * If no urgent pointer to send, then we pull 10721541Srgrimes * the urgent pointer to the left edge of the send window 10731541Srgrimes * so that it doesn't drift into the send window on sequence 10741541Srgrimes * number wraparound. 10751541Srgrimes */ 10761541Srgrimes tp->snd_up = tp->snd_una; /* drag it along */ 10771541Srgrimes 1078125680Sbms#ifdef TCP_SIGNATURE 1079293894Sglebius if (to.to_flags & TOF_SIGNATURE) { 1080174120Sbz int sigoff = to.to_signature - opt; 1081183001Sbz tcp_signature_compute(m, 0, len, optlen, 1082125680Sbms (u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND); 1083174120Sbz } 1084125783Sbms#endif 1085125680Sbms 10861541Srgrimes /* 10871541Srgrimes * Put TCP length in extended header, and then 10881541Srgrimes * checksum extended header and data. 10891541Srgrimes */ 109055679Sshin m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 1091235961Sbz m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 109255679Sshin#ifdef INET6 1093235961Sbz if (isipv6) { 109455679Sshin /* 109555679Sshin * ip6_plen is not need to be filled now, and will be filled 109655679Sshin * in ip6_output. 
109755679Sshin */ 1098236170Sbz m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 1099235961Sbz th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + 1100235961Sbz optlen + len, IPPROTO_TCP, 0); 1101235961Sbz } 1102235961Sbz#endif 1103235961Sbz#if defined(INET6) && defined(INET) 110455679Sshin else 1105235961Sbz#endif 1106235961Sbz#ifdef INET 1107133874Srwatson { 1108236170Sbz m->m_pkthdr.csum_flags = CSUM_TCP; 1109133874Srwatson th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 1110133874Srwatson htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); 111158698Sjlemon 1112133874Srwatson /* IP version must be set here for ipv4/ipv6 checking later */ 1113133874Srwatson KASSERT(ip->ip_v == IPVERSION, 1114133874Srwatson ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 1115133874Srwatson } 1116235961Sbz#endif 11171541Srgrimes 11181541Srgrimes /* 1119162110Sandre * Enable TSO and specify the size of the segments. 1120162110Sandre * The TCP pseudo header checksum is always provided. 1121162110Sandre * XXX: Fixme: This is currently not the case for IPv6. 1122162110Sandre */ 1123162110Sandre if (tso) { 1124211317Sandre KASSERT(len > tp->t_maxopd - optlen, 1125211317Sandre ("%s: len <= tso_segsz", __func__)); 1126206844Sken m->m_pkthdr.csum_flags |= CSUM_TSO; 1127162110Sandre m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen; 1128162110Sandre } 1129162110Sandre 1130223326Sbz#ifdef IPSEC 1131223326Sbz KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL), 1132223326Sbz ("%s: mbuf chain shorter than expected: %ld + %u + %u - %u != %u", 1133223326Sbz __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL))); 1134223326Sbz#else 1135212803Sandre KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL), 1136223326Sbz ("%s: mbuf chain shorter than expected: %ld + %u + %u != %u", 1137223326Sbz __func__, len, hdrlen, ipoptlen, m_length(m, NULL))); 1138223326Sbz#endif 1139212803Sandre 1140216758Slstewart /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. 
*/ 1141216758Slstewart hhook_run_tcp_est_out(tp, th, &to, len, tso); 1142216758Slstewart 11432788Sdg#ifdef TCPDEBUG 11441541Srgrimes /* 11451541Srgrimes * Trace. 11461541Srgrimes */ 1147118862Sharti if (so->so_options & SO_DEBUG) { 1148130683Sbms u_short save = 0; 1149130666Sbms#ifdef INET6 1150130666Sbms if (!isipv6) 1151130666Sbms#endif 1152130666Sbms { 1153130666Sbms save = ipov->ih_len; 1154130666Sbms ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */); 1155130666Sbms } 115655679Sshin tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); 1157130666Sbms#ifdef INET6 1158130666Sbms if (!isipv6) 1159130666Sbms#endif 1160118862Sharti ipov->ih_len = save; 1161118862Sharti } 1162221250Sbz#endif /* TCPDEBUG */ 11631541Srgrimes 11641541Srgrimes /* 11651541Srgrimes * Fill in IP length and desired time to live and 11661541Srgrimes * send to IP level. There should be a better way 11671541Srgrimes * to handle ttl and tos; we could keep them in 11681541Srgrimes * the template, but need a way to checksum without them. 11691541Srgrimes */ 117055679Sshin /* 117155679Sshin * m->m_pkthdr.len should have been set before cksum calcuration, 117255679Sshin * because in6_cksum() need it. 117355679Sshin */ 117455679Sshin#ifdef INET6 117555679Sshin if (isipv6) { 1176238516Sglebius struct route_in6 ro; 1177238516Sglebius 1178238516Sglebius bzero(&ro, sizeof(ro)); 117962587Sitojun /* 118055679Sshin * we separately set hoplimit for every segment, since the 118155679Sshin * user might want to change the value via setsockopt. 118255679Sshin * Also, desired default hop limit might be changed via 118362587Sitojun * Neighbor Discovery. 118462587Sitojun */ 1185122922Sandre ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL); 118655679Sshin 1187254889Smarkj /* 1188254889Smarkj * Set the packet size here for the benefit of DTrace probes. 1189254889Smarkj * ip6_output() will set it properly; it's supposed to include 1190254889Smarkj * the option header lengths as well. 
1191254889Smarkj */ 1192254889Smarkj ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 1193254889Smarkj 1194254889Smarkj if (tp->t_state == TCPS_SYN_SENT) 1195260817Savg TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); 1196254889Smarkj 1197254889Smarkj TCP_PROBE5(send, NULL, tp, ip6, tp, th); 1198254889Smarkj 119955679Sshin /* TODO: IPv6 IP6TOS_ECT bit on */ 1200238516Sglebius error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &ro, 1201238516Sglebius ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 1202238516Sglebius NULL, NULL, tp->t_inpcb); 1203238516Sglebius 1204238516Sglebius if (error == EMSGSIZE && ro.ro_rt != NULL) 1205263478Sglebius mtu = ro.ro_rt->rt_mtu; 1206238516Sglebius RO_RTFREE(&ro); 1207221250Sbz } 120855679Sshin#endif /* INET6 */ 1209221250Sbz#if defined(INET) && defined(INET6) 1210221250Sbz else 1211221250Sbz#endif 1212221250Sbz#ifdef INET 12131541Srgrimes { 1214238516Sglebius struct route ro; 1215238516Sglebius 1216238516Sglebius bzero(&ro, sizeof(ro)); 1217241913Sglebius ip->ip_len = htons(m->m_pkthdr.len); 121862587Sitojun#ifdef INET6 1219185371Sbz if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO) 1220133874Srwatson ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL); 122162587Sitojun#endif /* INET6 */ 122210930Swollman /* 1223122922Sandre * If we do path MTU discovery, then we set DF on every packet. 1224122922Sandre * This might not be the best thing to do according to RFC3390 1225122922Sandre * Section 2. However the tcp hostcache migitates the problem 1226122922Sandre * so it affects only the first tcp connection with a host. 1227211333Sandre * 1228211333Sandre * NB: Don't set DF on small MTU/MSS to have a safe fallback. 
122910930Swollman */ 1230211333Sandre if (V_path_mtu_discovery && tp->t_maxopd > V_tcp_minmss) 1231241913Sglebius ip->ip_off |= htons(IP_DF); 1232122922Sandre 1233254889Smarkj if (tp->t_state == TCPS_SYN_SENT) 1234260817Savg TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); 1235254889Smarkj 1236254889Smarkj TCP_PROBE5(send, NULL, tp, ip, tp, th); 1237254889Smarkj 1238238516Sglebius error = ip_output(m, tp->t_inpcb->inp_options, &ro, 1239134793Sjmg ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, 1240134793Sjmg tp->t_inpcb); 1241238516Sglebius 1242238516Sglebius if (error == EMSGSIZE && ro.ro_rt != NULL) 1243263478Sglebius mtu = ro.ro_rt->rt_mtu; 1244238516Sglebius RO_RTFREE(&ro); 12451541Srgrimes } 1246221250Sbz#endif /* INET */ 1247249372Sglebius 1248249372Sglebiusout: 1249249372Sglebius /* 1250249372Sglebius * In transmit state, time the transmission and arrange for 1251249372Sglebius * the retransmit. In persist state, just set snd_max. 1252249372Sglebius */ 1253249372Sglebius if ((tp->t_flags & TF_FORCEDATA) == 0 || 1254249372Sglebius !tcp_timer_active(tp, TT_PERSIST)) { 1255249372Sglebius tcp_seq startseq = tp->snd_nxt; 1256249372Sglebius 1257249372Sglebius /* 1258249372Sglebius * Advance snd_nxt over sequence space of this segment. 1259249372Sglebius */ 1260249372Sglebius if (flags & (TH_SYN|TH_FIN)) { 1261249372Sglebius if (flags & TH_SYN) 1262249372Sglebius tp->snd_nxt++; 1263249372Sglebius if (flags & TH_FIN) { 1264249372Sglebius tp->snd_nxt++; 1265249372Sglebius tp->t_flags |= TF_SENTFIN; 1266249372Sglebius } 1267249372Sglebius } 1268249372Sglebius if (sack_rxmit) 1269249372Sglebius goto timer; 1270249372Sglebius tp->snd_nxt += len; 1271249372Sglebius if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 1272249372Sglebius tp->snd_max = tp->snd_nxt; 1273249372Sglebius /* 1274249372Sglebius * Time this transmission if not a retransmission and 1275249372Sglebius * not currently timing anything. 
1276249372Sglebius */ 1277249372Sglebius if (tp->t_rtttime == 0) { 1278249372Sglebius tp->t_rtttime = ticks; 1279249372Sglebius tp->t_rtseq = startseq; 1280249372Sglebius TCPSTAT_INC(tcps_segstimed); 1281249372Sglebius } 1282249372Sglebius } 1283249372Sglebius 1284249372Sglebius /* 1285249372Sglebius * Set retransmit timer if not currently set, 1286249372Sglebius * and not doing a pure ack or a keep-alive probe. 1287249372Sglebius * Initial value for retransmit timer is smoothed 1288249372Sglebius * round-trip time + 2 * round-trip time variance. 1289249372Sglebius * Initialize shift counter which is used for backoff 1290249372Sglebius * of retransmit time. 1291249372Sglebius */ 1292249372Sglebiustimer: 1293249372Sglebius if (!tcp_timer_active(tp, TT_REXMT) && 1294249372Sglebius ((sack_rxmit && tp->snd_nxt != tp->snd_max) || 1295249372Sglebius (tp->snd_nxt != tp->snd_una))) { 1296249372Sglebius if (tcp_timer_active(tp, TT_PERSIST)) { 1297249372Sglebius tcp_timer_activate(tp, TT_PERSIST, 0); 1298249372Sglebius tp->t_rxtshift = 0; 1299249372Sglebius } 1300249372Sglebius tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); 1301249372Sglebius } 1302249372Sglebius } else { 1303249372Sglebius /* 1304249372Sglebius * Persist case, update snd_max but since we are in 1305249372Sglebius * persist mode (no window) we do not update snd_nxt. 1306249372Sglebius */ 1307249372Sglebius int xlen = len; 1308249372Sglebius if (flags & TH_SYN) 1309249372Sglebius ++xlen; 1310249372Sglebius if (flags & TH_FIN) { 1311249372Sglebius ++xlen; 1312249372Sglebius tp->t_flags |= TF_SENTFIN; 1313249372Sglebius } 1314249372Sglebius if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) 1315249372Sglebius tp->snd_max = tp->snd_nxt + len; 1316249372Sglebius } 1317249372Sglebius 13181541Srgrimes if (error) { 131964213Sarchie 132064213Sarchie /* 132164213Sarchie * We know that the packet was lost, so back out the 132264213Sarchie * sequence number advance, if any. 
1323162739Sandre * 1324162739Sandre * If the error is EPERM the packet got blocked by the 1325162739Sandre * local firewall. Normally we should terminate the 1326162739Sandre * connection but the blocking may have been spurious 1327162739Sandre * due to a firewall reconfiguration cycle. So we treat 1328162739Sandre * it like a packet loss and let the retransmit timer and 1329162739Sandre * timeouts do their work over time. 1330162739Sandre * XXX: It is a POLA question whether calling tcp_drop right 1331162739Sandre * away would be the really correct behavior instead. 133264213Sarchie */ 1333167106Sglebius if (((tp->t_flags & TF_FORCEDATA) == 0 || 1334168615Sandre !tcp_timer_active(tp, TT_PERSIST)) && 1335167106Sglebius ((flags & TH_SYN) == 0) && 1336167106Sglebius (error != EPERM)) { 1337167106Sglebius if (sack_rxmit) { 1338167106Sglebius p->rxmit -= len; 1339167106Sglebius tp->sackhint.sack_bytes_rexmit -= len; 1340167106Sglebius KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, 1341167106Sglebius ("sackhint bytes rtx >= 0")); 1342167106Sglebius } else 1343167106Sglebius tp->snd_nxt -= len; 134464213Sarchie } 1345167106Sglebius SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */ 1346167106Sglebius switch (error) { 1347167106Sglebius case EPERM: 1348162739Sandre tp->t_softerror = error; 1349162739Sandre return (error); 1350167106Sglebius case ENOBUFS: 1351168615Sandre if (!tcp_timer_active(tp, TT_REXMT) && 1352168615Sandre !tcp_timer_active(tp, TT_PERSIST)) 1353168615Sandre tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); 1354145355Sandre tp->snd_cwnd = tp->t_maxseg; 13551541Srgrimes return (0); 1356167106Sglebius case EMSGSIZE: 135711537Swollman /* 1358162110Sandre * For some reason the interface we used initially 1359162110Sandre * to send segments changed to another or lowered 1360162110Sandre * its MTU. 1361162110Sandre * If TSO was active we either got an interface 1362162110Sandre * without TSO capabilits or TSO was turned off. 
1363238516Sglebius * If we obtained mtu from ip_output() then update 1364238516Sglebius * it and try again. 136511537Swollman */ 1366162110Sandre if (tso) 1367162110Sandre tp->t_flags &= ~TF_TSO; 1368238516Sglebius if (mtu != 0) { 1369238516Sglebius tcp_mss_update(tp, -1, mtu, NULL, NULL); 1370238516Sglebius goto again; 1371238516Sglebius } 1372238516Sglebius return (error); 1373167107Sglebius case EHOSTDOWN: 1374167106Sglebius case EHOSTUNREACH: 1375167106Sglebius case ENETDOWN: 1376167107Sglebius case ENETUNREACH: 1377167106Sglebius if (TCPS_HAVERCVDSYN(tp->t_state)) { 1378167106Sglebius tp->t_softerror = error; 1379167106Sglebius return (0); 1380167106Sglebius } 1381167106Sglebius /* FALLTHROUGH */ 1382167106Sglebius default: 1383167106Sglebius return (error); 13841541Srgrimes } 13851541Srgrimes } 1386190948Srwatson TCPSTAT_INC(tcps_sndtotal); 13871541Srgrimes 13881541Srgrimes /* 13891541Srgrimes * Data sent (as far as we can tell). 13901541Srgrimes * If this advertises a larger window than any other segment, 13911541Srgrimes * then remember the size of the advertised window. 13921541Srgrimes * Any pending ACK has now been sent. 13931541Srgrimes */ 1394223049Sjhb if (recwin >= 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) 1395124849Sandre tp->rcv_adv = tp->rcv_nxt + recwin; 13961541Srgrimes tp->last_ack_sent = tp->rcv_nxt; 1397111139Sjlemon tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 1398168615Sandre if (tcp_timer_active(tp, TT_DELACK)) 1399168615Sandre tcp_timer_activate(tp, TT_DELACK, 0); 140087145Sdillon#if 0 140187145Sdillon /* 140287145Sdillon * This completely breaks TCP if newreno is turned on. What happens 140387145Sdillon * is that if delayed-acks are turned on on the receiver, this code 140487145Sdillon * on the transmitter effectively destroys the TCP window, forcing 140587145Sdillon * it to four packets (1.5Kx4 = 6K window). 
140687145Sdillon */ 1407215166Slstewart if (sendalot && --maxburst) 14081541Srgrimes goto again; 140987145Sdillon#endif 141087145Sdillon if (sendalot) 141187145Sdillon goto again; 14121541Srgrimes return (0); 14131541Srgrimes} 14141541Srgrimes 14151541Srgrimesvoid 1416167785Sandretcp_setpersist(struct tcpcb *tp) 14171541Srgrimes{ 141850673Sjlemon int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; 141950673Sjlemon int tt; 14201541Srgrimes 1421221209Sjhb tp->t_flags &= ~TF_PREVVALID; 1422168615Sandre if (tcp_timer_active(tp, TT_REXMT)) 142350673Sjlemon panic("tcp_setpersist: retransmit pending"); 14241541Srgrimes /* 14251541Srgrimes * Start/restart persistance timer. 14261541Srgrimes */ 142750673Sjlemon TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], 142850673Sjlemon TCPTV_PERSMIN, TCPTV_PERSMAX); 1429168615Sandre tcp_timer_activate(tp, TT_PERSIST, tt); 14301541Srgrimes if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 14311541Srgrimes tp->t_rxtshift++; 14321541Srgrimes} 1433167606Sandre 1434167606Sandre/* 1435167606Sandre * Insert TCP options according to the supplied parameters to the place 1436167606Sandre * optp in a consistent way. Can handle unaligned destinations. 1437167606Sandre * 1438167606Sandre * The order of the option processing is crucial for optimal packing and 1439167606Sandre * alignment for the scarce option space. 1440167606Sandre * 1441167606Sandre * The optimal order for a SYN/SYN-ACK segment is: 1442167606Sandre * MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) + 1443167606Sandre * Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40. 1444167606Sandre * 1445167606Sandre * The SACK options should be last. SACK blocks consume 8*n+2 bytes. 1446167606Sandre * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks). 1447167606Sandre * At minimum we need 10 bytes (to generate 1 SACK block). 
If both 1448167606Sandre * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present, 1449167606Sandre * we only have 10 bytes for SACK options (40 - (12 + 18)). 1450167606Sandre */ 1451167606Sandreint 1452167606Sandretcp_addoptions(struct tcpopt *to, u_char *optp) 1453167606Sandre{ 1454167606Sandre u_int mask, optlen = 0; 1455167606Sandre 1456167606Sandre for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) { 1457167606Sandre if ((to->to_flags & mask) != mask) 1458167606Sandre continue; 1459177988Sandre if (optlen == TCP_MAXOLEN) 1460177988Sandre break; 1461167606Sandre switch (to->to_flags & mask) { 1462167606Sandre case TOF_MSS: 1463167606Sandre while (optlen % 4) { 1464167606Sandre optlen += TCPOLEN_NOP; 1465167606Sandre *optp++ = TCPOPT_NOP; 1466167606Sandre } 1467177988Sandre if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG) 1468177988Sandre continue; 1469167606Sandre optlen += TCPOLEN_MAXSEG; 1470167606Sandre *optp++ = TCPOPT_MAXSEG; 1471167606Sandre *optp++ = TCPOLEN_MAXSEG; 1472167606Sandre to->to_mss = htons(to->to_mss); 1473167606Sandre bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss)); 1474167606Sandre optp += sizeof(to->to_mss); 1475167606Sandre break; 1476167606Sandre case TOF_SCALE: 1477167606Sandre while (!optlen || optlen % 2 != 1) { 1478167606Sandre optlen += TCPOLEN_NOP; 1479167606Sandre *optp++ = TCPOPT_NOP; 1480167606Sandre } 1481177988Sandre if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW) 1482177988Sandre continue; 1483167606Sandre optlen += TCPOLEN_WINDOW; 1484167606Sandre *optp++ = TCPOPT_WINDOW; 1485167606Sandre *optp++ = TCPOLEN_WINDOW; 1486167606Sandre *optp++ = to->to_wscale; 1487167606Sandre break; 1488167606Sandre case TOF_SACKPERM: 1489167606Sandre while (optlen % 2) { 1490167606Sandre optlen += TCPOLEN_NOP; 1491167606Sandre *optp++ = TCPOPT_NOP; 1492167606Sandre } 1493177988Sandre if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED) 1494177988Sandre continue; 1495167606Sandre optlen += TCPOLEN_SACK_PERMITTED; 1496167606Sandre *optp++ = 
TCPOPT_SACK_PERMITTED; 1497167606Sandre *optp++ = TCPOLEN_SACK_PERMITTED; 1498167606Sandre break; 1499167606Sandre case TOF_TS: 1500167606Sandre while (!optlen || optlen % 4 != 2) { 1501167606Sandre optlen += TCPOLEN_NOP; 1502167606Sandre *optp++ = TCPOPT_NOP; 1503167606Sandre } 1504177988Sandre if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP) 1505177988Sandre continue; 1506167606Sandre optlen += TCPOLEN_TIMESTAMP; 1507167606Sandre *optp++ = TCPOPT_TIMESTAMP; 1508167606Sandre *optp++ = TCPOLEN_TIMESTAMP; 1509167606Sandre to->to_tsval = htonl(to->to_tsval); 1510167606Sandre to->to_tsecr = htonl(to->to_tsecr); 1511167606Sandre bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval)); 1512167606Sandre optp += sizeof(to->to_tsval); 1513167606Sandre bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr)); 1514167606Sandre optp += sizeof(to->to_tsecr); 1515167606Sandre break; 1516293894Sglebius#ifdef TCP_SIGNATURE 1517167606Sandre case TOF_SIGNATURE: 1518167606Sandre { 1519167606Sandre int siglen = TCPOLEN_SIGNATURE - 2; 1520167606Sandre 1521167606Sandre while (!optlen || optlen % 4 != 2) { 1522167606Sandre optlen += TCPOLEN_NOP; 1523167606Sandre *optp++ = TCPOPT_NOP; 1524167606Sandre } 1525168904Sandre if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE) 1526167606Sandre continue; 1527167606Sandre optlen += TCPOLEN_SIGNATURE; 1528167606Sandre *optp++ = TCPOPT_SIGNATURE; 1529167606Sandre *optp++ = TCPOLEN_SIGNATURE; 1530167606Sandre to->to_signature = optp; 1531167606Sandre while (siglen--) 1532167606Sandre *optp++ = 0; 1533167606Sandre break; 1534167606Sandre } 1535293894Sglebius#endif 1536167606Sandre case TOF_SACK: 1537167606Sandre { 1538167606Sandre int sackblks = 0; 1539167606Sandre struct sackblk *sack = (struct sackblk *)to->to_sacks; 1540167606Sandre tcp_seq sack_seq; 1541167606Sandre 1542167606Sandre while (!optlen || optlen % 4 != 2) { 1543167606Sandre optlen += TCPOLEN_NOP; 1544167606Sandre *optp++ = TCPOPT_NOP; 1545167606Sandre } 1546177988Sandre if (TCP_MAXOLEN - 
optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK) 1547167606Sandre continue; 1548167606Sandre optlen += TCPOLEN_SACKHDR; 1549167606Sandre *optp++ = TCPOPT_SACK; 1550167606Sandre sackblks = min(to->to_nsacks, 1551168904Sandre (TCP_MAXOLEN - optlen) / TCPOLEN_SACK); 1552167606Sandre *optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK; 1553167606Sandre while (sackblks--) { 1554167606Sandre sack_seq = htonl(sack->start); 1555167606Sandre bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); 1556167606Sandre optp += sizeof(sack_seq); 1557167606Sandre sack_seq = htonl(sack->end); 1558167606Sandre bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); 1559167606Sandre optp += sizeof(sack_seq); 1560167606Sandre optlen += TCPOLEN_SACK; 1561167606Sandre sack++; 1562167606Sandre } 1563190948Srwatson TCPSTAT_INC(tcps_sack_send_blocks); 1564167606Sandre break; 1565167606Sandre } 1566167606Sandre default: 1567167606Sandre panic("%s: unknown TCP option type", __func__); 1568167606Sandre break; 1569167606Sandre } 1570167606Sandre } 1571167606Sandre 1572167606Sandre /* Terminate and pad TCP options to a 4 byte boundary. */ 1573167606Sandre if (optlen % 4) { 1574167606Sandre optlen += TCPOLEN_EOL; 1575167606Sandre *optp++ = TCPOPT_EOL; 1576167606Sandre } 1577176978Sbz /* 1578176978Sbz * According to RFC 793 (STD0007): 1579176978Sbz * "The content of the header beyond the End-of-Option option 1580176978Sbz * must be header padding (i.e., zero)." 1581176978Sbz * and later: "The padding is composed of zeros." 1582176978Sbz */ 1583167606Sandre while (optlen % 4) { 1584177986Sandre optlen += TCPOLEN_PAD; 1585177986Sandre *optp++ = TCPOPT_PAD; 1586167606Sandre } 1587167606Sandre 1588168904Sandre KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__)); 1589167606Sandre return (optlen); 1590167606Sandre} 1591