115885Sjulian/*- 215885Sjulian * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 315885Sjulian * The Regents of the University of California. 415885Sjulian * All rights reserved. 515885Sjulian * 615885Sjulian * Redistribution and use in source and binary forms, with or without 715885Sjulian * modification, are permitted provided that the following conditions 815885Sjulian * are met: 924204Sbde * 1. Redistributions of source code must retain the above copyright 1029024Sbde * notice, this list of conditions and the following disclaimer. 1115885Sjulian * 2. Redistributions in binary form must reproduce the above copyright 1215885Sjulian * notice, this list of conditions and the following disclaimer in the 1315885Sjulian * documentation and/or other materials provided with the distribution. 1415885Sjulian * 4. Neither the name of the University nor the names of its contributors 1515885Sjulian * may be used to endorse or promote products derived from this software 1615885Sjulian * without specific prior written permission. 1715885Sjulian * 1815885Sjulian * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 1918207Sbde * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 2018207Sbde * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 2115885Sjulian * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 2215885Sjulian * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 2329182Sbde * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2429182Sbde * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2517967Sjulian * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2617967Sjulian * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 2717254Sjulian * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 2817254Sjulian * SUCH DAMAGE. 2917254Sjulian * 3017254Sjulian * @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95 3117254Sjulian */ 3217254Sjulian 3315885Sjulian/*- 3415885Sjulian * @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995 3515885Sjulian * 3628845Sjulian * NRL grants permission for redistribution and use in source and binary 3715885Sjulian * forms, with or without modification, of the software and documentation 3815885Sjulian * created at NRL provided that the following conditions are met: 3915885Sjulian * 4015885Sjulian * 1. Redistributions of source code must retain the above copyright 4115885Sjulian * notice, this list of conditions and the following disclaimer. 4215885Sjulian * 2. Redistributions in binary form must reproduce the above copyright 4315885Sjulian * notice, this list of conditions and the following disclaimer in the 4425791Sjulian * documentation and/or other materials provided with the distribution. 4525791Sjulian * 3. All advertising materials mentioning features or use of this software 4615885Sjulian * must display the following acknowledgements: 4715885Sjulian * This product includes software developed by the University of 4815885Sjulian * California, Berkeley and its contributors. 4915885Sjulian * This product includes software developed at the Information 5015885Sjulian * Technology Division, US Naval Research Laboratory. 5115885Sjulian * 4. Neither the name of the NRL nor the names of its contributors 5215885Sjulian * may be used to endorse or promote products derived from this software 5318240Sjulian * without specific prior written permission. 5415885Sjulian * 5517921Sjulian * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 5617921Sjulian * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 5717921Sjulian * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 5815885Sjulian * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 5915885Sjulian * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 6015885Sjulian * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 6115885Sjulian * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 6215885Sjulian * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 6315885Sjulian * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 6417921Sjulian * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 6517921Sjulian * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 6617921Sjulian * 6717921Sjulian * The views and conclusions contained in the software and documentation 6817921Sjulian * are those of the authors and should not be interpreted as representing 6917921Sjulian * official policies, either expressed or implied, of the US Naval 7015885Sjulian * Research Laboratory (NRL). 7115885Sjulian */ 7215885Sjulian 7317921Sjulian#include <sys/cdefs.h> 7417921Sjulian__FBSDID("$FreeBSD$"); 7517921Sjulian 7617921Sjulian#include "opt_inet.h" 7717921Sjulian#include "opt_inet6.h" 7817921Sjulian#include "opt_tcpdebug.h" 7917921Sjulian 8015885Sjulian#include <sys/param.h> 8115885Sjulian#include <sys/systm.h> 8215885Sjulian#include <sys/kernel.h> 8315885Sjulian#include <sys/sysctl.h> 8415885Sjulian#include <sys/malloc.h> 8515885Sjulian#include <sys/mbuf.h> 8615885Sjulian#include <sys/proc.h> /* for proc0 declaration */ 8715885Sjulian#include <sys/protosw.h> 8817921Sjulian#include <sys/socket.h> 8917921Sjulian#include <sys/socketvar.h> 9017921Sjulian#include <sys/syslog.h> 9117921Sjulian#include <sys/systm.h> 9215885Sjulian 9315885Sjulian#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ 9415885Sjulian 9515885Sjulian#include <vm/uma.h> 9615885Sjulian 9715885Sjulian#include <net/if.h> 9817921Sjulian#include <net/route.h> 9917921Sjulian#include <net/vnet.h> 10017921Sjulian 10115885Sjulian#include <netinet/in.h> 10215885Sjulian#include <netinet/in_systm.h> 10315885Sjulian#include <netinet/ip.h> 10415885Sjulian#include <netinet/in_var.h> 10515885Sjulian#include <netinet/in_pcb.h> 10615885Sjulian#include <netinet/ip_var.h> 10715885Sjulian#include <netinet/ip6.h> 10817921Sjulian#include <netinet/icmp6.h> 10917921Sjulian#include <netinet6/nd6.h> 11017921Sjulian#include <netinet6/ip6_var.h> 11117921Sjulian#include <netinet6/in6_pcb.h> 11217921Sjulian#include <netinet/tcp.h> 11315885Sjulian#include <netinet/tcp_fsm.h> 11415885Sjulian#include <netinet/tcp_seq.h> 11515885Sjulian#include <netinet/tcp_timer.h> 11615885Sjulian#include <netinet/tcp_var.h> 11715885Sjulian#include <netinet6/tcp6_var.h> 11815885Sjulian#include <netinet/tcpip.h> 11915885Sjulian#ifdef TCPDEBUG 12017921Sjulian#include <netinet/tcp_debug.h> 12117921Sjulian#endif /* TCPDEBUG */ 12217921Sjulian 12317921Sjulian#include <machine/in_cksum.h> 12417921Sjulian 12515885SjulianVNET_DECLARE(struct uma_zone *, sack_hole_zone); 12615885Sjulian#define V_sack_hole_zone VNET(sack_hole_zone) 12715885Sjulian 12815885SjulianSYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW, 0, "TCP SACK"); 12915885SjulianVNET_DEFINE(int, tcp_do_sack) = 1; 13015885Sjulian#define V_tcp_do_sack VNET(tcp_do_sack) 13115885SjulianSYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_RW, 13215885Sjulian &VNET_NAME(tcp_do_sack), 0, "Enable/Disable TCP SACK support"); 13315885Sjulian 13415885SjulianVNET_DEFINE(int, tcp_sack_maxholes) = 128; 13517921Sjulian#define V_tcp_sack_maxholes VNET(tcp_sack_maxholes) 13617921SjulianSYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_RW, 13717921Sjulian &VNET_NAME(tcp_sack_maxholes), 0, 13817921Sjulian "Maximum number of TCP SACK holes allowed per connection"); 13915885Sjulian 14018240SjulianVNET_DEFINE(int, tcp_sack_globalmaxholes) = 65536; 14118244Sjulian#define V_tcp_sack_globalmaxholes VNET(tcp_sack_globalmaxholes) 14215885SjulianSYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, globalmaxholes, CTLFLAG_RW, 14315885Sjulian &VNET_NAME(tcp_sack_globalmaxholes), 0, 14415885Sjulian "Global maximum number of TCP SACK holes"); 14515885Sjulian 14615885SjulianVNET_DEFINE(int, tcp_sack_globalholes) = 0; 14717921Sjulian#define V_tcp_sack_globalholes VNET(tcp_sack_globalholes) 14817921SjulianSYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_RD, 14915885Sjulian &VNET_NAME(tcp_sack_globalholes), 0, 15015885Sjulian "Global number of TCP SACK holes currently allocated"); 15118240Sjulian 15215885Sjulian/* 15315885Sjulian * This function is called upon receipt of new valid data (while not in 15415885Sjulian * header prediction mode), and it updates the ordered list of sacks. 15515885Sjulian */ 15615885Sjulianvoid 15718240Sjuliantcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end) 15815885Sjulian{ 15915885Sjulian /* 16018240Sjulian * First reported block MUST be the most recent one. Subsequent 16115885Sjulian * blocks SHOULD be in the order in which they arrived at the 16218240Sjulian * receiver. These two conditions make the implementation fully 16318240Sjulian * compliant with RFC 2018. 16418240Sjulian */ 16518240Sjulian struct sackblk head_blk, saved_blks[MAX_SACK_BLKS]; 16618240Sjulian int num_head, num_saved, i; 16718240Sjulian 16818240Sjulian INP_WLOCK_ASSERT(tp->t_inpcb); 16915885Sjulian 17017921Sjulian /* Check arguments. */ 17117921Sjulian KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end")); 17217921Sjulian 17317921Sjulian /* SACK block for the received segment. */ 17420407Swollman head_blk.start = rcv_start; 17520407Swollman head_blk.end = rcv_end; 17620407Swollman 17718240Sjulian /* 17818240Sjulian * Merge updated SACK blocks into head_blk, and save unchanged SACK 17918240Sjulian * blocks into saved_blks[]. num_saved will have the number of the 18020407Swollman * saved SACK blocks. 18115885Sjulian */ 18217921Sjulian num_saved = 0; 18317921Sjulian for (i = 0; i < tp->rcv_numsacks; i++) { 18417921Sjulian tcp_seq start = tp->sackblks[i].start; 18517921Sjulian tcp_seq end = tp->sackblks[i].end; 18620407Swollman if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) { 18720407Swollman /* 18820407Swollman * Discard this SACK block. 18915885Sjulian */ 19015885Sjulian } else if (SEQ_LEQ(head_blk.start, end) && 19115885Sjulian SEQ_GEQ(head_blk.end, start)) { 19215885Sjulian /* 19315885Sjulian * Merge this SACK block into head_blk. This SACK 19415885Sjulian * block itself will be discarded. 19515885Sjulian */ 19615885Sjulian if (SEQ_GT(head_blk.start, start)) 19715885Sjulian head_blk.start = start; 19817921Sjulian if (SEQ_LT(head_blk.end, end)) 19917921Sjulian head_blk.end = end; 20017921Sjulian } else { 20117921Sjulian /* 20215885Sjulian * Save this SACK block. 20315885Sjulian */ 20417921Sjulian saved_blks[num_saved].start = start; 20517921Sjulian saved_blks[num_saved].end = end; 20617921Sjulian num_saved++; 20715885Sjulian } 20815885Sjulian } 20915885Sjulian 21015885Sjulian /* 21115885Sjulian * Update SACK list in tp->sackblks[]. 21215885Sjulian */ 21315885Sjulian num_head = 0; 21415885Sjulian if (SEQ_GT(head_blk.start, tp->rcv_nxt)) { 21517921Sjulian /* 21617921Sjulian * The received data segment is an out-of-order segment. Put 21717921Sjulian * head_blk at the top of SACK list. 21817921Sjulian */ 21915885Sjulian tp->sackblks[0] = head_blk; 22015885Sjulian num_head = 1; 22115885Sjulian /* 22215885Sjulian * If the number of saved SACK blocks exceeds its limit, 22315885Sjulian * discard the last SACK block. 22415885Sjulian */ 22517921Sjulian if (num_saved >= MAX_SACK_BLKS) 22617921Sjulian num_saved--; 22717921Sjulian } 22817921Sjulian if (num_saved > 0) { 22915885Sjulian /* 23015885Sjulian * Copy the saved SACK blocks back. 23115885Sjulian */ 23215885Sjulian bcopy(saved_blks, &tp->sackblks[num_head], 23315885Sjulian sizeof(struct sackblk) * num_saved); 23415885Sjulian } 23515885Sjulian 23615885Sjulian /* Save the number of SACK blocks. */ 23715885Sjulian tp->rcv_numsacks = num_head + num_saved; 23815885Sjulian} 23915885Sjulian 24015885Sjulian/* 24117921Sjulian * Delete all receiver-side SACK information. 24217921Sjulian */ 24317921Sjulianvoid 24417921Sjuliantcp_clean_sackreport(struct tcpcb *tp) 24515885Sjulian{ 24615885Sjulian int i; 24717921Sjulian 24817921Sjulian INP_WLOCK_ASSERT(tp->t_inpcb); 24917921Sjulian tp->rcv_numsacks = 0; 25017921Sjulian for (i = 0; i < MAX_SACK_BLKS; i++) 25117254Sjulian tp->sackblks[i].start = tp->sackblks[i].end=0; 25217254Sjulian} 25317921Sjulian 25417921Sjulian/* 25517921Sjulian * Allocate struct sackhole. 25617921Sjulian */ 25717254Sjulianstatic struct sackhole * 25817254Sjuliantcp_sackhole_alloc(struct tcpcb *tp, tcp_seq start, tcp_seq end) 25917254Sjulian{ 26017254Sjulian struct sackhole *hole; 26115885Sjulian 26215885Sjulian if (tp->snd_numholes >= V_tcp_sack_maxholes || 26315885Sjulian V_tcp_sack_globalholes >= V_tcp_sack_globalmaxholes) { 26415885Sjulian TCPSTAT_INC(tcps_sack_sboverflow); 26515885Sjulian return NULL; 26615885Sjulian } 26715885Sjulian 26815885Sjulian hole = (struct sackhole *)uma_zalloc(V_sack_hole_zone, M_NOWAIT); 26915885Sjulian if (hole == NULL) 27015885Sjulian return NULL; 27115885Sjulian 27215885Sjulian hole->start = start; 27317921Sjulian hole->end = end; 27417921Sjulian hole->rxmit = start; 27517921Sjulian 27615885Sjulian tp->snd_numholes++; 27717921Sjulian atomic_add_int(&V_tcp_sack_globalholes, 1); 27817921Sjulian 27917921Sjulian return hole; 28017921Sjulian} 28118240Sjulian 28220407Swollman/* 28317921Sjulian * Free struct sackhole. 28418240Sjulian */ 28518240Sjulianstatic void 28618240Sjuliantcp_sackhole_free(struct tcpcb *tp, struct sackhole *hole) 28718240Sjulian{ 28818240Sjulian 28915885Sjulian uma_zfree(V_sack_hole_zone, hole); 29017921Sjulian 29117921Sjulian tp->snd_numholes--; 29217921Sjulian atomic_subtract_int(&V_tcp_sack_globalholes, 1); 29317921Sjulian 29415885Sjulian KASSERT(tp->snd_numholes >= 0, ("tp->snd_numholes >= 0")); 29515885Sjulian KASSERT(V_tcp_sack_globalholes >= 0, ("tcp_sack_globalholes >= 0")); 29615885Sjulian} 29715885Sjulian 29815885Sjulian/* 29915885Sjulian * Insert new SACK hole into scoreboard. 30015885Sjulian */ 30117921Sjulianstatic struct sackhole * 30217921Sjuliantcp_sackhole_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end, 30317921Sjulian struct sackhole *after) 30417921Sjulian{ 30515885Sjulian struct sackhole *hole; 30615885Sjulian 30715885Sjulian /* Allocate a new SACK hole. */ 30815885Sjulian hole = tcp_sackhole_alloc(tp, start, end); 30915885Sjulian if (hole == NULL) 31015885Sjulian return NULL; 31117921Sjulian 31217921Sjulian /* Insert the new SACK hole into scoreboard. */ 31318240Sjulian if (after != NULL) 31418240Sjulian TAILQ_INSERT_AFTER(&tp->snd_holes, after, hole, scblink); 31518240Sjulian else 31618240Sjulian TAILQ_INSERT_TAIL(&tp->snd_holes, hole, scblink); 31718240Sjulian 31817921Sjulian /* Update SACK hint. */ 31918240Sjulian if (tp->sackhint.nexthole == NULL) 32015885Sjulian tp->sackhint.nexthole = hole; 32115885Sjulian 32215885Sjulian return hole; 32315885Sjulian} 32415885Sjulian 32515885Sjulian/* 32615885Sjulian * Remove SACK hole from scoreboard. 32715885Sjulian */ 32815885Sjulianstatic void 32917254Sjuliantcp_sackhole_remove(struct tcpcb *tp, struct sackhole *hole) 33017921Sjulian{ 33117921Sjulian 33217921Sjulian /* Update SACK hint. */ 33317921Sjulian if (tp->sackhint.nexthole == hole) 33417921Sjulian tp->sackhint.nexthole = TAILQ_NEXT(hole, scblink); 33517921Sjulian 33615885Sjulian /* Remove this SACK hole. */ 33715885Sjulian TAILQ_REMOVE(&tp->snd_holes, hole, scblink); 33815885Sjulian 33915885Sjulian /* Free this SACK hole. */ 34015885Sjulian tcp_sackhole_free(tp, hole); 34115885Sjulian} 34215885Sjulian 34315885Sjulian/* 34417967Sjulian * Process cumulative ACK and the TCP SACK option to update the scoreboard. 34518005Sjulian * tp->snd_holes is an ordered list of holes (oldest to newest, in terms of 34618005Sjulian * the sequence space). 34718005Sjulian * Returns 1 if incoming ACK has previously unknown SACK information, 34818005Sjulian * 0 otherwise. Note: We treat (snd_una, th_ack) as a sack block so any changes 34918005Sjulian * to that (i.e. left edge moving) would also be considered a change in SACK 35017967Sjulian * information which is slightly different than rfc6675. 35117967Sjulian */ 35217967Sjulianint 35317967Sjuliantcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) 35417967Sjulian{ 35517967Sjulian struct sackhole *cur, *temp; 35617967Sjulian struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp; 35717967Sjulian int i, j, num_sack_blks, sack_changed; 35815885Sjulian 35915885Sjulian INP_WLOCK_ASSERT(tp->t_inpcb); 36015885Sjulian 36115885Sjulian num_sack_blks = 0; 36215885Sjulian sack_changed = 0; 36315885Sjulian /* 36415885Sjulian * If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist, 36517921Sjulian * treat [SND.UNA, SEG.ACK) as if it is a SACK block. 36617921Sjulian */ 36717921Sjulian if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) { 36817921Sjulian sack_blocks[num_sack_blks].start = tp->snd_una; 36915885Sjulian sack_blocks[num_sack_blks++].end = th_ack; 37015885Sjulian } 37115885Sjulian /* 37215885Sjulian * Append received valid SACK blocks to sack_blocks[], but only if we 37315885Sjulian * received new blocks from the other side. 37415885Sjulian */ 37515885Sjulian if (to->to_flags & TOF_SACK) { 37615885Sjulian tp->sackhint.sacked_bytes = 0; /* reset */ 37717254Sjulian for (i = 0; i < to->to_nsacks; i++) { 37817967Sjulian bcopy((to->to_sacks + i * TCPOLEN_SACK), 37915885Sjulian &sack, sizeof(sack)); 38015885Sjulian sack.start = ntohl(sack.start); 38117921Sjulian sack.end = ntohl(sack.end); 38217921Sjulian if (SEQ_GT(sack.end, sack.start) && 38317921Sjulian SEQ_GT(sack.start, tp->snd_una) && 38415885Sjulian SEQ_GT(sack.start, th_ack) && 38517921Sjulian SEQ_LT(sack.start, tp->snd_max) && 38617921Sjulian SEQ_GT(sack.end, tp->snd_una) && 38717921Sjulian SEQ_LEQ(sack.end, tp->snd_max)) { 38817921Sjulian sack_blocks[num_sack_blks++] = sack; 38917921Sjulian tp->sackhint.sacked_bytes += 39017921Sjulian (sack.end-sack.start); 39117921Sjulian } 39217921Sjulian } 39315885Sjulian } 39415885Sjulian /* 39517254Sjulian * Return if SND.UNA is not advanced and no valid SACK block is 39615885Sjulian * received. 39715885Sjulian */ 39815885Sjulian if (num_sack_blks == 0) 39915885Sjulian return (sack_changed); 40017254Sjulian 40117964Sjulian /* 40217254Sjulian * Sort the SACK blocks so we can update the scoreboard with just one 40317254Sjulian * pass. The overhead of sorting upto 4+1 elements is less than 40417254Sjulian * making upto 4+1 passes over the scoreboard. 40517254Sjulian */ 40617254Sjulian for (i = 0; i < num_sack_blks; i++) { 40717964Sjulian for (j = i + 1; j < num_sack_blks; j++) { 40817254Sjulian if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 40915885Sjulian sack = sack_blocks[i]; 41015885Sjulian sack_blocks[i] = sack_blocks[j]; 41115885Sjulian sack_blocks[j] = sack; 41215885Sjulian } 41315885Sjulian } 41415885Sjulian } 41515885Sjulian if (TAILQ_EMPTY(&tp->snd_holes)) 41615885Sjulian /* 41715885Sjulian * Empty scoreboard. Need to initialize snd_fack (it may be 41815885Sjulian * uninitialized or have a bogus value). Scoreboard holes 41915885Sjulian * (from the sack blocks received) are created later below 42017254Sjulian * (in the logic that adds holes to the tail of the 42117254Sjulian * scoreboard). 42217254Sjulian */ 42317921Sjulian tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack); 42417921Sjulian /* 42517921Sjulian * In the while-loop below, incoming SACK blocks (sack_blocks[]) and 42617921Sjulian * SACK holes (snd_holes) are traversed from their tails with just 42717254Sjulian * one pass in order to reduce the number of compares especially when 42815885Sjulian * the bandwidth-delay product is large. 42917921Sjulian * 43017921Sjulian * Note: Typically, in the first RTT of SACK recovery, the highest 43117921Sjulian * three or four SACK blocks with the same ack number are received. 43217921Sjulian * In the second RTT, if retransmitted data segments are not lost, 43317921Sjulian * the highest three or four SACK blocks with ack number advancing 43417921Sjulian * are received. 43517921Sjulian */ 43617921Sjulian sblkp = &sack_blocks[num_sack_blks - 1]; /* Last SACK block */ 43717921Sjulian tp->sackhint.last_sack_ack = sblkp->end; 43817921Sjulian if (SEQ_LT(tp->snd_fack, sblkp->start)) { 43915885Sjulian /* 44015885Sjulian * The highest SACK block is beyond fack. Append new SACK 44115885Sjulian * hole at the tail. If the second or later highest SACK 44215885Sjulian * blocks are also beyond the current fack, they will be 44317921Sjulian * inserted by way of hole splitting in the while-loop below. 44417921Sjulian */ 44517921Sjulian temp = tcp_sackhole_insert(tp, tp->snd_fack,sblkp->start,NULL); 44617921Sjulian if (temp != NULL) { 44717921Sjulian tp->snd_fack = sblkp->end; 44815885Sjulian /* Go to the previous sack block. */ 44915885Sjulian sblkp--; 45015885Sjulian sack_changed = 1; 45115885Sjulian } else { 45215885Sjulian /* 45315885Sjulian * We failed to add a new hole based on the current 45417921Sjulian * sack block. Skip over all the sack blocks that 45517921Sjulian * fall completely to the right of snd_fack and 45617921Sjulian * proceed to trim the scoreboard based on the 45717921Sjulian * remaining sack blocks. This also trims the 45817921Sjulian * scoreboard for th_ack (which is sack_blocks[0]). 45915885Sjulian */ 46015885Sjulian while (sblkp >= sack_blocks && 46115885Sjulian SEQ_LT(tp->snd_fack, sblkp->start)) 46215885Sjulian sblkp--; 46315885Sjulian if (sblkp >= sack_blocks && 46418005Sjulian SEQ_LT(tp->snd_fack, sblkp->end)) 46515885Sjulian tp->snd_fack = sblkp->end; 46615885Sjulian } 46717921Sjulian } else if (SEQ_LT(tp->snd_fack, sblkp->end)) { 46817921Sjulian /* fack is advanced. */ 46917921Sjulian tp->snd_fack = sblkp->end; 47015885Sjulian sack_changed = 1; 47115885Sjulian } 47215885Sjulian /* We must have at least one SACK hole in scoreboard. */ 47317921Sjulian KASSERT(!TAILQ_EMPTY(&tp->snd_holes), 47417921Sjulian ("SACK scoreboard must not be empty")); 47517921Sjulian cur = TAILQ_LAST(&tp->snd_holes, sackhole_head); /* Last SACK hole. */ 47617921Sjulian /* 47715885Sjulian * Since the incoming sack blocks are sorted, we can process them 47815885Sjulian * making one sweep of the scoreboard. 47915885Sjulian */ 48017921Sjulian while (sblkp >= sack_blocks && cur != NULL) { 48117921Sjulian if (SEQ_GEQ(sblkp->start, cur->end)) { 48217921Sjulian /* 48317921Sjulian * SACKs data beyond the current hole. Go to the 48417921Sjulian * previous sack block. 48515885Sjulian */ 48615885Sjulian sblkp--; 48715885Sjulian continue; 48815885Sjulian } 48915885Sjulian if (SEQ_LEQ(sblkp->end, cur->start)) { 49015885Sjulian /* 49117921Sjulian * SACKs data before the current hole. Go to the 49228845Sjulian * previous hole. 49328845Sjulian */ 49428845Sjulian cur = TAILQ_PREV(cur, sackhole_head, scblink); 49528845Sjulian continue; 49628845Sjulian } 49728845Sjulian tp->sackhint.sack_bytes_rexmit -= (cur->rxmit - cur->start); 49817921Sjulian KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, 49917921Sjulian ("sackhint bytes rtx >= 0")); 50017921Sjulian sack_changed = 1; 50115885Sjulian if (SEQ_LEQ(sblkp->start, cur->start)) { 50215885Sjulian /* Data acks at least the beginning of hole. */ 50315885Sjulian if (SEQ_GEQ(sblkp->end, cur->end)) { 50415885Sjulian /* Acks entire hole, so delete hole. */ 50517921Sjulian temp = cur; 50617921Sjulian cur = TAILQ_PREV(cur, sackhole_head, scblink); 50717921Sjulian tcp_sackhole_remove(tp, temp); 50817921Sjulian /* 50917921Sjulian * The sack block may ack all or part of the 51017921Sjulian * next hole too, so continue onto the next 51115885Sjulian * hole. 51215885Sjulian */ 51315885Sjulian continue; 51415885Sjulian } else { 51515885Sjulian /* Move start of hole forward. */ 51615885Sjulian cur->start = sblkp->end; 51715885Sjulian cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 51817921Sjulian } 51917921Sjulian } else { 52017921Sjulian /* Data acks at least the end of hole. */ 52117921Sjulian if (SEQ_GEQ(sblkp->end, cur->end)) { 52217921Sjulian /* Move end of hole backward. */ 52329681Sgibbs cur->end = sblkp->start; 52417254Sjulian cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 52517921Sjulian } else { 52617921Sjulian /* 52717921Sjulian * ACKs some data in middle of a hole; need 52817921Sjulian * to split current hole 52917254Sjulian */ 53015885Sjulian temp = tcp_sackhole_insert(tp, sblkp->end, 53115885Sjulian cur->end, cur); 53215885Sjulian if (temp != NULL) { 53318005Sjulian if (SEQ_GT(cur->rxmit, temp->rxmit)) { 53415885Sjulian temp->rxmit = cur->rxmit; 53515885Sjulian tp->sackhint.sack_bytes_rexmit 53617921Sjulian += (temp->rxmit 53717921Sjulian - temp->start); 53817921Sjulian } 53917921Sjulian cur->end = sblkp->start; 54017921Sjulian cur->rxmit = SEQ_MIN(cur->rxmit, 54117921Sjulian cur->end); 54217921Sjulian } 54315885Sjulian } 54415885Sjulian } 54515885Sjulian tp->sackhint.sack_bytes_rexmit += (cur->rxmit - cur->start); 54615885Sjulian /* 54717921Sjulian * Testing sblkp->start against cur->start tells us whether 54817921Sjulian * we're done with the sack block or the sack hole. 54917921Sjulian * Accordingly, we advance one or the other. 55017921Sjulian */ 55115885Sjulian if (SEQ_LEQ(sblkp->start, cur->start)) 55215885Sjulian cur = TAILQ_PREV(cur, sackhole_head, scblink); 55315885Sjulian else 55415885Sjulian sblkp--; 55515885Sjulian } 55615885Sjulian return (sack_changed); 55715885Sjulian} 55817921Sjulian 55917921Sjulian/* 56017921Sjulian * Free all SACK holes to clear the scoreboard. 56117921Sjulian */ 56217921Sjulianvoid 56315885Sjuliantcp_free_sackholes(struct tcpcb *tp) 56415885Sjulian{ 56515885Sjulian struct sackhole *q; 56615885Sjulian 56715885Sjulian INP_WLOCK_ASSERT(tp->t_inpcb); 56815885Sjulian while ((q = TAILQ_FIRST(&tp->snd_holes)) != NULL) 56915885Sjulian tcp_sackhole_remove(tp, q); 57015885Sjulian tp->sackhint.sack_bytes_rexmit = 0; 57115885Sjulian 57217921Sjulian KASSERT(tp->snd_numholes == 0, ("tp->snd_numholes == 0")); 57317921Sjulian KASSERT(tp->sackhint.nexthole == NULL, 57417921Sjulian ("tp->sackhint.nexthole == NULL")); 57517921Sjulian} 57615885Sjulian 57715885Sjulian/* 57817921Sjulian * Partial ack handling within a sack recovery episode. Keeping this very 57917921Sjulian * simple for now. When a partial ack is received, force snd_cwnd to a value 58017921Sjulian * that will allow the sender to transmit no more than 2 segments. If 58117921Sjulian * necessary, a better scheme can be adopted at a later point, but for now, 58215885Sjulian * the goal is to prevent the sender from bursting a large amount of data in 58315885Sjulian * the midst of sack recovery. 58415885Sjulian */ 58517254Sjulianvoid 58615885Sjuliantcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) 58715885Sjulian{ 58815885Sjulian int num_segs = 1; 58917921Sjulian 59017921Sjulian INP_WLOCK_ASSERT(tp->t_inpcb); 59117921Sjulian tcp_timer_activate(tp, TT_REXMT, 0); 59217967Sjulian tp->t_rtttime = 0; 59317921Sjulian /* Send one or 2 segments based on how much new data was acked. */ 59417254Sjulian if ((BYTES_THIS_ACK(tp, th) / tp->t_maxseg) >= 2) 59517254Sjulian num_segs = 2; 59615885Sjulian tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit + 59717661Sjulian (tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_maxseg); 59817661Sjulian if (tp->snd_cwnd > tp->snd_ssthresh) 59917921Sjulian tp->snd_cwnd = tp->snd_ssthresh; 60015885Sjulian tp->t_flags |= TF_ACKNOW; 60117921Sjulian (void) tcp_output(tp); 60217921Sjulian} 60317921Sjulian 60417265Sjulian#if 0 60517265Sjulian/* 60617265Sjulian * Debug version of tcp_sack_output() that walks the scoreboard. Used for 60717265Sjulian * now to sanity check the hint. 60817254Sjulian */ 60917254Sjulianstatic struct sackhole * 61017254Sjuliantcp_sack_output_debug(struct tcpcb *tp, int *sack_bytes_rexmt) 61117254Sjulian{ 61217921Sjulian struct sackhole *p; 61317967Sjulian 61417967Sjulian INP_WLOCK_ASSERT(tp->t_inpcb); 61517967Sjulian *sack_bytes_rexmt = 0; 61617921Sjulian TAILQ_FOREACH(p, &tp->snd_holes, scblink) { 61717921Sjulian if (SEQ_LT(p->rxmit, p->end)) { 61817967Sjulian if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */ 61917967Sjulian continue; 62017967Sjulian } 62117967Sjulian *sack_bytes_rexmt += (p->rxmit - p->start); 62217967Sjulian break; 62317921Sjulian } 62417967Sjulian *sack_bytes_rexmt += (p->rxmit - p->start); 62517254Sjulian } 62617967Sjulian return (p); 62717254Sjulian} 62817254Sjulian#endif 62917254Sjulian 63017254Sjulian/* 63117254Sjulian * Returns the next hole to retransmit and the number of retransmitted bytes 63217254Sjulian * from the scoreboard. We store both the next hole and the number of 63317254Sjulian * retransmitted bytes as hints (and recompute these on the fly upon SACK/ACK 63418005Sjulian * reception). This avoids scoreboard traversals completely. 63517254Sjulian * 63615885Sjulian * The loop here will traverse *at most* one link. Here's the argument. For 63717254Sjulian * the loop to traverse more than 1 link before finding the next hole to 63817254Sjulian * retransmit, we would need to have at least 1 node following the current 63917921Sjulian * hint with (rxmit == end). But, for all holes following the current hint, 64028845Sjulian * (start == rxmit), since we have not yet retransmitted from them. 64128845Sjulian * Therefore, in order to traverse more 1 link in the loop below, we need to 64228845Sjulian * have at least one node following the current hint with (start == rxmit == 64328845Sjulian * end). But that can't happen, (start == end) means that all the data in 64428845Sjulian * that hole has been sacked, in which case, the hole would have been removed 64517921Sjulian * from the scoreboard. 64617921Sjulian */ 64717921Sjulianstruct sackhole * 64815885Sjuliantcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt) 64917967Sjulian{ 65015885Sjulian struct sackhole *hole = NULL; 65115885Sjulian 65215885Sjulian INP_WLOCK_ASSERT(tp->t_inpcb); 65315885Sjulian *sack_bytes_rexmt = tp->sackhint.sack_bytes_rexmit; 65415885Sjulian hole = tp->sackhint.nexthole; 65515885Sjulian if (hole == NULL || SEQ_LT(hole->rxmit, hole->end)) 65615885Sjulian goto out; 65717921Sjulian while ((hole = TAILQ_NEXT(hole, scblink)) != NULL) { 65817921Sjulian if (SEQ_LT(hole->rxmit, hole->end)) { 65917921Sjulian tp->sackhint.nexthole = hole; 66015885Sjulian break; 66115885Sjulian } 66215885Sjulian } 66315885Sjulianout: 66415885Sjulian return (hole); 66515885Sjulian} 66617921Sjulian 66717921Sjulian/* 66817921Sjulian * After a timeout, the SACK list may be rebuilt. This SACK information 66915885Sjulian * should be used to avoid retransmitting SACKed data. This function 67015885Sjulian * traverses the SACK list to see if snd_nxt should be moved forward. 67115885Sjulian */ 67215885Sjulianvoid 67315885Sjuliantcp_sack_adjust(struct tcpcb *tp) 67415885Sjulian{ 67517921Sjulian struct sackhole *p, *cur = TAILQ_FIRST(&tp->snd_holes); 67617921Sjulian 67717921Sjulian INP_WLOCK_ASSERT(tp->t_inpcb); 67815885Sjulian if (cur == NULL) 67915885Sjulian return; /* No holes */ 68015885Sjulian if (SEQ_GEQ(tp->snd_nxt, tp->snd_fack)) 68117921Sjulian return; /* We're already beyond any SACKed blocks */ 68217921Sjulian /*- 68317921Sjulian * Two cases for which we want to advance snd_nxt: 68417921Sjulian * i) snd_nxt lies between end of one hole and beginning of another 68515885Sjulian * ii) snd_nxt lies between end of last hole and snd_fack 68615885Sjulian */ 68717921Sjulian while ((p = TAILQ_NEXT(cur, scblink)) != NULL) { 68817921Sjulian if (SEQ_LT(tp->snd_nxt, cur->end)) 68917921Sjulian return; 69017921Sjulian if (SEQ_GEQ(tp->snd_nxt, p->start)) 69117921Sjulian cur = p; 69217921Sjulian else { 69317921Sjulian tp->snd_nxt = p->start; 69417921Sjulian return; 69517921Sjulian } 69617921Sjulian } 69715885Sjulian if (SEQ_LT(tp->snd_nxt, cur->end)) 69815885Sjulian return; 69915885Sjulian tp->snd_nxt = tp->snd_fack; 70015885Sjulian} 70115885Sjulian