tcp_syncache.c revision 315514
1/*- 2 * Copyright (c) 2001 McAfee, Inc. 3 * Copyright (c) 2006,2013 Andre Oppermann, Internet Business Solutions AG 4 * All rights reserved. 5 * 6 * This software was developed for the FreeBSD Project by Jonathan Lemon 7 * and McAfee Research, the Security Research Division of McAfee, Inc. under 8 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the 9 * DARPA CHATS research program. [2001 McAfee, Inc.] 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
31 */ 32 33#include <sys/cdefs.h> 34__FBSDID("$FreeBSD: stable/11/sys/netinet/tcp_syncache.c 315514 2017-03-18 22:04:20Z ae $"); 35 36#include "opt_inet.h" 37#include "opt_inet6.h" 38#include "opt_ipsec.h" 39#include "opt_pcbgroup.h" 40 41#include <sys/param.h> 42#include <sys/systm.h> 43#include <sys/hash.h> 44#include <sys/refcount.h> 45#include <sys/kernel.h> 46#include <sys/sysctl.h> 47#include <sys/limits.h> 48#include <sys/lock.h> 49#include <sys/mutex.h> 50#include <sys/malloc.h> 51#include <sys/mbuf.h> 52#include <sys/proc.h> /* for proc0 declaration */ 53#include <sys/random.h> 54#include <sys/socket.h> 55#include <sys/socketvar.h> 56#include <sys/syslog.h> 57#include <sys/ucred.h> 58 59#include <sys/md5.h> 60#include <crypto/siphash/siphash.h> 61 62#include <vm/uma.h> 63 64#include <net/if.h> 65#include <net/if_var.h> 66#include <net/route.h> 67#include <net/vnet.h> 68 69#include <netinet/in.h> 70#include <netinet/in_systm.h> 71#include <netinet/ip.h> 72#include <netinet/in_var.h> 73#include <netinet/in_pcb.h> 74#include <netinet/ip_var.h> 75#include <netinet/ip_options.h> 76#ifdef INET6 77#include <netinet/ip6.h> 78#include <netinet/icmp6.h> 79#include <netinet6/nd6.h> 80#include <netinet6/ip6_var.h> 81#include <netinet6/in6_pcb.h> 82#endif 83#include <netinet/tcp.h> 84#ifdef TCP_RFC7413 85#include <netinet/tcp_fastopen.h> 86#endif 87#include <netinet/tcp_fsm.h> 88#include <netinet/tcp_seq.h> 89#include <netinet/tcp_timer.h> 90#include <netinet/tcp_var.h> 91#include <netinet/tcp_syncache.h> 92#ifdef INET6 93#include <netinet6/tcp6_var.h> 94#endif 95#ifdef TCP_OFFLOAD 96#include <netinet/toecore.h> 97#endif 98 99#include <netipsec/ipsec_support.h> 100 101#include <machine/in_cksum.h> 102 103#include <security/mac/mac_framework.h> 104 105static VNET_DEFINE(int, tcp_syncookies) = 1; 106#define V_tcp_syncookies VNET(tcp_syncookies) 107SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_VNET | CTLFLAG_RW, 108 &VNET_NAME(tcp_syncookies), 0, 109 "Use TCP 
SYN cookies if the syncache overflows"); 110 111static VNET_DEFINE(int, tcp_syncookiesonly) = 0; 112#define V_tcp_syncookiesonly VNET(tcp_syncookiesonly) 113SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_VNET | CTLFLAG_RW, 114 &VNET_NAME(tcp_syncookiesonly), 0, 115 "Use only TCP SYN cookies"); 116 117#ifdef TCP_OFFLOAD 118#define ADDED_BY_TOE(sc) ((sc)->sc_tod != NULL) 119#endif 120 121static void syncache_drop(struct syncache *, struct syncache_head *); 122static void syncache_free(struct syncache *); 123static void syncache_insert(struct syncache *, struct syncache_head *); 124static int syncache_respond(struct syncache *, struct syncache_head *, int, 125 const struct mbuf *); 126static struct socket *syncache_socket(struct syncache *, struct socket *, 127 struct mbuf *m); 128static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, 129 int docallout); 130static void syncache_timer(void *); 131 132static uint32_t syncookie_mac(struct in_conninfo *, tcp_seq, uint8_t, 133 uint8_t *, uintptr_t); 134static tcp_seq syncookie_generate(struct syncache_head *, struct syncache *); 135static struct syncache 136 *syncookie_lookup(struct in_conninfo *, struct syncache_head *, 137 struct syncache *, struct tcphdr *, struct tcpopt *, 138 struct socket *); 139static void syncookie_reseed(void *); 140#ifdef INVARIANTS 141static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, 142 struct syncache *sc, struct tcphdr *th, struct tcpopt *to, 143 struct socket *lso); 144#endif 145 146/* 147 * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies. 148 * 3 retransmits corresponds to a timeout of 3 * (1 + 2 + 4 + 8) == 45 seconds, 149 * the odds are that the user has given up attempting to connect by then. 
150 */ 151#define SYNCACHE_MAXREXMTS 3 152 153/* Arbitrary values */ 154#define TCP_SYNCACHE_HASHSIZE 512 155#define TCP_SYNCACHE_BUCKETLIMIT 30 156 157static VNET_DEFINE(struct tcp_syncache, tcp_syncache); 158#define V_tcp_syncache VNET(tcp_syncache) 159 160static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, 161 "TCP SYN cache"); 162 163SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_VNET | CTLFLAG_RDTUN, 164 &VNET_NAME(tcp_syncache.bucket_limit), 0, 165 "Per-bucket hash limit for syncache"); 166 167SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_VNET | CTLFLAG_RDTUN, 168 &VNET_NAME(tcp_syncache.cache_limit), 0, 169 "Overall entry limit for syncache"); 170 171SYSCTL_UMA_CUR(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_VNET, 172 &VNET_NAME(tcp_syncache.zone), "Current number of entries in syncache"); 173 174SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN, 175 &VNET_NAME(tcp_syncache.hashsize), 0, 176 "Size of TCP syncache hashtable"); 177 178SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_VNET | CTLFLAG_RW, 179 &VNET_NAME(tcp_syncache.rexmt_limit), 0, 180 "Limit on SYN/ACK retransmissions"); 181 182VNET_DEFINE(int, tcp_sc_rst_sock_fail) = 1; 183SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail, 184 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sc_rst_sock_fail), 0, 185 "Send reset on socket allocation failure"); 186 187static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache"); 188 189#define SCH_LOCK(sch) mtx_lock(&(sch)->sch_mtx) 190#define SCH_UNLOCK(sch) mtx_unlock(&(sch)->sch_mtx) 191#define SCH_LOCK_ASSERT(sch) mtx_assert(&(sch)->sch_mtx, MA_OWNED) 192 193/* 194 * Requires the syncache entry to be already removed from the bucket list. 
195 */ 196static void 197syncache_free(struct syncache *sc) 198{ 199 200 if (sc->sc_ipopts) 201 (void) m_free(sc->sc_ipopts); 202 if (sc->sc_cred) 203 crfree(sc->sc_cred); 204#ifdef MAC 205 mac_syncache_destroy(&sc->sc_label); 206#endif 207 208 uma_zfree(V_tcp_syncache.zone, sc); 209} 210 211void 212syncache_init(void) 213{ 214 int i; 215 216 V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; 217 V_tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT; 218 V_tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS; 219 V_tcp_syncache.hash_secret = arc4random(); 220 221 TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize", 222 &V_tcp_syncache.hashsize); 223 TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit", 224 &V_tcp_syncache.bucket_limit); 225 if (!powerof2(V_tcp_syncache.hashsize) || 226 V_tcp_syncache.hashsize == 0) { 227 printf("WARNING: syncache hash size is not a power of 2.\n"); 228 V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; 229 } 230 V_tcp_syncache.hashmask = V_tcp_syncache.hashsize - 1; 231 232 /* Set limits. */ 233 V_tcp_syncache.cache_limit = 234 V_tcp_syncache.hashsize * V_tcp_syncache.bucket_limit; 235 TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit", 236 &V_tcp_syncache.cache_limit); 237 238 /* Allocate the hash table. */ 239 V_tcp_syncache.hashbase = malloc(V_tcp_syncache.hashsize * 240 sizeof(struct syncache_head), M_SYNCACHE, M_WAITOK | M_ZERO); 241 242#ifdef VIMAGE 243 V_tcp_syncache.vnet = curvnet; 244#endif 245 246 /* Initialize the hash buckets. */ 247 for (i = 0; i < V_tcp_syncache.hashsize; i++) { 248 TAILQ_INIT(&V_tcp_syncache.hashbase[i].sch_bucket); 249 mtx_init(&V_tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head", 250 NULL, MTX_DEF); 251 callout_init_mtx(&V_tcp_syncache.hashbase[i].sch_timer, 252 &V_tcp_syncache.hashbase[i].sch_mtx, 0); 253 V_tcp_syncache.hashbase[i].sch_length = 0; 254 V_tcp_syncache.hashbase[i].sch_sc = &V_tcp_syncache; 255 } 256 257 /* Create the syncache entry zone. 
*/ 258 V_tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache), 259 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 260 V_tcp_syncache.cache_limit = uma_zone_set_max(V_tcp_syncache.zone, 261 V_tcp_syncache.cache_limit); 262 263 /* Start the SYN cookie reseeder callout. */ 264 callout_init(&V_tcp_syncache.secret.reseed, 1); 265 arc4rand(V_tcp_syncache.secret.key[0], SYNCOOKIE_SECRET_SIZE, 0); 266 arc4rand(V_tcp_syncache.secret.key[1], SYNCOOKIE_SECRET_SIZE, 0); 267 callout_reset(&V_tcp_syncache.secret.reseed, SYNCOOKIE_LIFETIME * hz, 268 syncookie_reseed, &V_tcp_syncache); 269} 270 271#ifdef VIMAGE 272void 273syncache_destroy(void) 274{ 275 struct syncache_head *sch; 276 struct syncache *sc, *nsc; 277 int i; 278 279 /* 280 * Stop the re-seed timer before freeing resources. No need to 281 * possibly schedule it another time. 282 */ 283 callout_drain(&V_tcp_syncache.secret.reseed); 284 285 /* Cleanup hash buckets: stop timers, free entries, destroy locks. */ 286 for (i = 0; i < V_tcp_syncache.hashsize; i++) { 287 288 sch = &V_tcp_syncache.hashbase[i]; 289 callout_drain(&sch->sch_timer); 290 291 SCH_LOCK(sch); 292 TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) 293 syncache_drop(sc, sch); 294 SCH_UNLOCK(sch); 295 KASSERT(TAILQ_EMPTY(&sch->sch_bucket), 296 ("%s: sch->sch_bucket not empty", __func__)); 297 KASSERT(sch->sch_length == 0, ("%s: sch->sch_length %d not 0", 298 __func__, sch->sch_length)); 299 mtx_destroy(&sch->sch_mtx); 300 } 301 302 KASSERT(uma_zone_get_cur(V_tcp_syncache.zone) == 0, 303 ("%s: cache_count not 0", __func__)); 304 305 /* Free the allocated global resources. */ 306 uma_zdestroy(V_tcp_syncache.zone); 307 free(V_tcp_syncache.hashbase, M_SYNCACHE); 308} 309#endif 310 311/* 312 * Inserts a syncache entry into the specified bucket row. 313 * Locks and unlocks the syncache_head autonomously. 
314 */ 315static void 316syncache_insert(struct syncache *sc, struct syncache_head *sch) 317{ 318 struct syncache *sc2; 319 320 SCH_LOCK(sch); 321 322 /* 323 * Make sure that we don't overflow the per-bucket limit. 324 * If the bucket is full, toss the oldest element. 325 */ 326 if (sch->sch_length >= V_tcp_syncache.bucket_limit) { 327 KASSERT(!TAILQ_EMPTY(&sch->sch_bucket), 328 ("sch->sch_length incorrect")); 329 sc2 = TAILQ_LAST(&sch->sch_bucket, sch_head); 330 syncache_drop(sc2, sch); 331 TCPSTAT_INC(tcps_sc_bucketoverflow); 332 } 333 334 /* Put it into the bucket. */ 335 TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash); 336 sch->sch_length++; 337 338#ifdef TCP_OFFLOAD 339 if (ADDED_BY_TOE(sc)) { 340 struct toedev *tod = sc->sc_tod; 341 342 tod->tod_syncache_added(tod, sc->sc_todctx); 343 } 344#endif 345 346 /* Reinitialize the bucket row's timer. */ 347 if (sch->sch_length == 1) 348 sch->sch_nextc = ticks + INT_MAX; 349 syncache_timeout(sc, sch, 1); 350 351 SCH_UNLOCK(sch); 352 353 TCPSTATES_INC(TCPS_SYN_RECEIVED); 354 TCPSTAT_INC(tcps_sc_added); 355} 356 357/* 358 * Remove and free entry from syncache bucket row. 359 * Expects locked syncache head. 360 */ 361static void 362syncache_drop(struct syncache *sc, struct syncache_head *sch) 363{ 364 365 SCH_LOCK_ASSERT(sch); 366 367 TCPSTATES_DEC(TCPS_SYN_RECEIVED); 368 TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); 369 sch->sch_length--; 370 371#ifdef TCP_OFFLOAD 372 if (ADDED_BY_TOE(sc)) { 373 struct toedev *tod = sc->sc_tod; 374 375 tod->tod_syncache_removed(tod, sc->sc_todctx); 376 } 377#endif 378 379 syncache_free(sc); 380} 381 382/* 383 * Engage/reengage time on bucket row. 
384 */ 385static void 386syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout) 387{ 388 sc->sc_rxttime = ticks + 389 TCPTV_RTOBASE * (tcp_syn_backoff[sc->sc_rxmits]); 390 sc->sc_rxmits++; 391 if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) { 392 sch->sch_nextc = sc->sc_rxttime; 393 if (docallout) 394 callout_reset(&sch->sch_timer, sch->sch_nextc - ticks, 395 syncache_timer, (void *)sch); 396 } 397} 398 399/* 400 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 401 * If we have retransmitted an entry the maximum number of times, expire it. 402 * One separate timer for each bucket row. 403 */ 404static void 405syncache_timer(void *xsch) 406{ 407 struct syncache_head *sch = (struct syncache_head *)xsch; 408 struct syncache *sc, *nsc; 409 int tick = ticks; 410 char *s; 411 412 CURVNET_SET(sch->sch_sc->vnet); 413 414 /* NB: syncache_head has already been locked by the callout. */ 415 SCH_LOCK_ASSERT(sch); 416 417 /* 418 * In the following cycle we may remove some entries and/or 419 * advance some timeouts, so re-initialize the bucket timer. 420 */ 421 sch->sch_nextc = tick + INT_MAX; 422 423 TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) { 424 /* 425 * We do not check if the listen socket still exists 426 * and accept the case where the listen socket may be 427 * gone by the time we resend the SYN/ACK. We do 428 * not expect this to happens often. If it does, 429 * then the RST will be sent by the time the remote 430 * host does the SYN/ACK->ACK. 
431 */ 432 if (TSTMP_GT(sc->sc_rxttime, tick)) { 433 if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) 434 sch->sch_nextc = sc->sc_rxttime; 435 continue; 436 } 437 if (sc->sc_rxmits > V_tcp_syncache.rexmt_limit) { 438 if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { 439 log(LOG_DEBUG, "%s; %s: Retransmits exhausted, " 440 "giving up and removing syncache entry\n", 441 s, __func__); 442 free(s, M_TCPLOG); 443 } 444 syncache_drop(sc, sch); 445 TCPSTAT_INC(tcps_sc_stale); 446 continue; 447 } 448 if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { 449 log(LOG_DEBUG, "%s; %s: Response timeout, " 450 "retransmitting (%u) SYN|ACK\n", 451 s, __func__, sc->sc_rxmits); 452 free(s, M_TCPLOG); 453 } 454 455 syncache_respond(sc, sch, 1, NULL); 456 TCPSTAT_INC(tcps_sc_retransmitted); 457 syncache_timeout(sc, sch, 0); 458 } 459 if (!TAILQ_EMPTY(&(sch)->sch_bucket)) 460 callout_reset(&(sch)->sch_timer, (sch)->sch_nextc - tick, 461 syncache_timer, (void *)(sch)); 462 CURVNET_RESTORE(); 463} 464 465/* 466 * Find an entry in the syncache. 467 * Returns always with locked syncache_head plus a matching entry or NULL. 468 */ 469static struct syncache * 470syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp) 471{ 472 struct syncache *sc; 473 struct syncache_head *sch; 474 uint32_t hash; 475 476 /* 477 * The hash is built on foreign port + local port + foreign address. 478 * We rely on the fact that struct in_conninfo starts with 16 bits 479 * of foreign port, then 16 bits of local port then followed by 128 480 * bits of foreign address. In case of IPv4 address, the first 3 481 * 32-bit words of the address always are zeroes. 482 */ 483 hash = jenkins_hash32((uint32_t *)&inc->inc_ie, 5, 484 V_tcp_syncache.hash_secret) & V_tcp_syncache.hashmask; 485 486 sch = &V_tcp_syncache.hashbase[hash]; 487 *schp = sch; 488 SCH_LOCK(sch); 489 490 /* Circle through bucket row to find matching entry. 
*/ 491 TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) 492 if (bcmp(&inc->inc_ie, &sc->sc_inc.inc_ie, 493 sizeof(struct in_endpoints)) == 0) 494 break; 495 496 return (sc); /* Always returns with locked sch. */ 497} 498 499/* 500 * This function is called when we get a RST for a 501 * non-existent connection, so that we can see if the 502 * connection is in the syn cache. If it is, zap it. 503 */ 504void 505syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th) 506{ 507 struct syncache *sc; 508 struct syncache_head *sch; 509 char *s = NULL; 510 511 sc = syncache_lookup(inc, &sch); /* returns locked sch */ 512 SCH_LOCK_ASSERT(sch); 513 514 /* 515 * Any RST to our SYN|ACK must not carry ACK, SYN or FIN flags. 516 * See RFC 793 page 65, section SEGMENT ARRIVES. 517 */ 518 if (th->th_flags & (TH_ACK|TH_SYN|TH_FIN)) { 519 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) 520 log(LOG_DEBUG, "%s; %s: Spurious RST with ACK, SYN or " 521 "FIN flag set, segment ignored\n", s, __func__); 522 TCPSTAT_INC(tcps_badrst); 523 goto done; 524 } 525 526 /* 527 * No corresponding connection was found in syncache. 528 * If syncookies are enabled and possibly exclusively 529 * used, or we are under memory pressure, a valid RST 530 * may not find a syncache entry. In that case we're 531 * done and no SYN|ACK retransmissions will happen. 532 * Otherwise the RST was misdirected or spoofed. 533 */ 534 if (sc == NULL) { 535 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) 536 log(LOG_DEBUG, "%s; %s: Spurious RST without matching " 537 "syncache entry (possibly syncookie only), " 538 "segment ignored\n", s, __func__); 539 TCPSTAT_INC(tcps_badrst); 540 goto done; 541 } 542 543 /* 544 * If the RST bit is set, check the sequence number to see 545 * if this is a valid reset segment. 546 * RFC 793 page 37: 547 * In all states except SYN-SENT, all reset (RST) segments 548 * are validated by checking their SEQ-fields. A reset is 549 * valid if its sequence number is in the window. 
550 * 551 * The sequence number in the reset segment is normally an 552 * echo of our outgoing acknowlegement numbers, but some hosts 553 * send a reset with the sequence number at the rightmost edge 554 * of our receive window, and we have to handle this case. 555 */ 556 if (SEQ_GEQ(th->th_seq, sc->sc_irs) && 557 SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) { 558 syncache_drop(sc, sch); 559 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) 560 log(LOG_DEBUG, "%s; %s: Our SYN|ACK was rejected, " 561 "connection attempt aborted by remote endpoint\n", 562 s, __func__); 563 TCPSTAT_INC(tcps_sc_reset); 564 } else { 565 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) 566 log(LOG_DEBUG, "%s; %s: RST with invalid SEQ %u != " 567 "IRS %u (+WND %u), segment ignored\n", 568 s, __func__, th->th_seq, sc->sc_irs, sc->sc_wnd); 569 TCPSTAT_INC(tcps_badrst); 570 } 571 572done: 573 if (s != NULL) 574 free(s, M_TCPLOG); 575 SCH_UNLOCK(sch); 576} 577 578void 579syncache_badack(struct in_conninfo *inc) 580{ 581 struct syncache *sc; 582 struct syncache_head *sch; 583 584 sc = syncache_lookup(inc, &sch); /* returns locked sch */ 585 SCH_LOCK_ASSERT(sch); 586 if (sc != NULL) { 587 syncache_drop(sc, sch); 588 TCPSTAT_INC(tcps_sc_badack); 589 } 590 SCH_UNLOCK(sch); 591} 592 593void 594syncache_unreach(struct in_conninfo *inc, struct tcphdr *th) 595{ 596 struct syncache *sc; 597 struct syncache_head *sch; 598 599 sc = syncache_lookup(inc, &sch); /* returns locked sch */ 600 SCH_LOCK_ASSERT(sch); 601 if (sc == NULL) 602 goto done; 603 604 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 605 if (ntohl(th->th_seq) != sc->sc_iss) 606 goto done; 607 608 /* 609 * If we've rertransmitted 3 times and this is our second error, 610 * we remove the entry. Otherwise, we allow it to continue on. 611 * This prevents us from incorrectly nuking an entry during a 612 * spurious network outage. 613 * 614 * See tcp_notify(). 
615 */ 616 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxmits < 3 + 1) { 617 sc->sc_flags |= SCF_UNREACH; 618 goto done; 619 } 620 syncache_drop(sc, sch); 621 TCPSTAT_INC(tcps_sc_unreach); 622done: 623 SCH_UNLOCK(sch); 624} 625 626/* 627 * Build a new TCP socket structure from a syncache entry. 628 * 629 * On success return the newly created socket with its underlying inp locked. 630 */ 631static struct socket * 632syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) 633{ 634 struct tcp_function_block *blk; 635 struct inpcb *inp = NULL; 636 struct socket *so; 637 struct tcpcb *tp; 638 int error; 639 char *s; 640 641 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 642 643 /* 644 * Ok, create the full blown connection, and set things up 645 * as they would have been set up if we had created the 646 * connection when the SYN arrived. If we can't create 647 * the connection, abort it. 648 */ 649 so = sonewconn(lso, 0); 650 if (so == NULL) { 651 /* 652 * Drop the connection; we will either send a RST or 653 * have the peer retransmit its SYN again after its 654 * RTO and try again. 655 */ 656 TCPSTAT_INC(tcps_listendrop); 657 if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { 658 log(LOG_DEBUG, "%s; %s: Socket create failed " 659 "due to limits or memory shortage\n", 660 s, __func__); 661 free(s, M_TCPLOG); 662 } 663 goto abort2; 664 } 665#ifdef MAC 666 mac_socketpeer_set_from_mbuf(m, so); 667#endif 668 669 inp = sotoinpcb(so); 670 inp->inp_inc.inc_fibnum = so->so_fibnum; 671 INP_WLOCK(inp); 672 /* 673 * Exclusive pcbinfo lock is not required in syncache socket case even 674 * if two inpcb locks can be acquired simultaneously: 675 * - the inpcb in LISTEN state, 676 * - the newly created inp. 677 * 678 * In this case, an inp cannot be at same time in LISTEN state and 679 * just created by an accept() call. 680 */ 681 INP_HASH_WLOCK(&V_tcbinfo); 682 683 /* Insert new socket into PCB hash list. 
*/ 684 inp->inp_inc.inc_flags = sc->sc_inc.inc_flags; 685#ifdef INET6 686 if (sc->sc_inc.inc_flags & INC_ISIPV6) { 687 inp->in6p_laddr = sc->sc_inc.inc6_laddr; 688 } else { 689 inp->inp_vflag &= ~INP_IPV6; 690 inp->inp_vflag |= INP_IPV4; 691#endif 692 inp->inp_laddr = sc->sc_inc.inc_laddr; 693#ifdef INET6 694 } 695#endif 696 697 /* 698 * If there's an mbuf and it has a flowid, then let's initialise the 699 * inp with that particular flowid. 700 */ 701 if (m != NULL && M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 702 inp->inp_flowid = m->m_pkthdr.flowid; 703 inp->inp_flowtype = M_HASHTYPE_GET(m); 704 } 705 706 /* 707 * Install in the reservation hash table for now, but don't yet 708 * install a connection group since the full 4-tuple isn't yet 709 * configured. 710 */ 711 inp->inp_lport = sc->sc_inc.inc_lport; 712 if ((error = in_pcbinshash_nopcbgroup(inp)) != 0) { 713 /* 714 * Undo the assignments above if we failed to 715 * put the PCB on the hash lists. 716 */ 717#ifdef INET6 718 if (sc->sc_inc.inc_flags & INC_ISIPV6) 719 inp->in6p_laddr = in6addr_any; 720 else 721#endif 722 inp->inp_laddr.s_addr = INADDR_ANY; 723 inp->inp_lport = 0; 724 if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { 725 log(LOG_DEBUG, "%s; %s: in_pcbinshash failed " 726 "with error %i\n", 727 s, __func__, error); 728 free(s, M_TCPLOG); 729 } 730 INP_HASH_WUNLOCK(&V_tcbinfo); 731 goto abort; 732 } 733#ifdef INET6 734 if (sc->sc_inc.inc_flags & INC_ISIPV6) { 735 struct inpcb *oinp = sotoinpcb(lso); 736 struct in6_addr laddr6; 737 struct sockaddr_in6 sin6; 738 /* 739 * Inherit socket options from the listening socket. 740 * Note that in6p_inputopts are not (and should not be) 741 * copied, since it stores previously received options and is 742 * used to detect if each new option is different than the 743 * previous one and hence should be passed to a user. 744 * If we copied in6p_inputopts, a user would not be able to 745 * receive options just after calling the accept system call. 
746 */ 747 inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS; 748 if (oinp->in6p_outputopts) 749 inp->in6p_outputopts = 750 ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT); 751 752 sin6.sin6_family = AF_INET6; 753 sin6.sin6_len = sizeof(sin6); 754 sin6.sin6_addr = sc->sc_inc.inc6_faddr; 755 sin6.sin6_port = sc->sc_inc.inc_fport; 756 sin6.sin6_flowinfo = sin6.sin6_scope_id = 0; 757 laddr6 = inp->in6p_laddr; 758 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) 759 inp->in6p_laddr = sc->sc_inc.inc6_laddr; 760 if ((error = in6_pcbconnect_mbuf(inp, (struct sockaddr *)&sin6, 761 thread0.td_ucred, m)) != 0) { 762 inp->in6p_laddr = laddr6; 763 if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { 764 log(LOG_DEBUG, "%s; %s: in6_pcbconnect failed " 765 "with error %i\n", 766 s, __func__, error); 767 free(s, M_TCPLOG); 768 } 769 INP_HASH_WUNLOCK(&V_tcbinfo); 770 goto abort; 771 } 772 /* Override flowlabel from in6_pcbconnect. */ 773 inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; 774 inp->inp_flow |= sc->sc_flowlabel; 775 } 776#endif /* INET6 */ 777#if defined(INET) && defined(INET6) 778 else 779#endif 780#ifdef INET 781 { 782 struct in_addr laddr; 783 struct sockaddr_in sin; 784 785 inp->inp_options = (m) ? 
ip_srcroute(m) : NULL; 786 787 if (inp->inp_options == NULL) { 788 inp->inp_options = sc->sc_ipopts; 789 sc->sc_ipopts = NULL; 790 } 791 792 sin.sin_family = AF_INET; 793 sin.sin_len = sizeof(sin); 794 sin.sin_addr = sc->sc_inc.inc_faddr; 795 sin.sin_port = sc->sc_inc.inc_fport; 796 bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero)); 797 laddr = inp->inp_laddr; 798 if (inp->inp_laddr.s_addr == INADDR_ANY) 799 inp->inp_laddr = sc->sc_inc.inc_laddr; 800 if ((error = in_pcbconnect_mbuf(inp, (struct sockaddr *)&sin, 801 thread0.td_ucred, m)) != 0) { 802 inp->inp_laddr = laddr; 803 if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { 804 log(LOG_DEBUG, "%s; %s: in_pcbconnect failed " 805 "with error %i\n", 806 s, __func__, error); 807 free(s, M_TCPLOG); 808 } 809 INP_HASH_WUNLOCK(&V_tcbinfo); 810 goto abort; 811 } 812 } 813#endif /* INET */ 814#if defined(IPSEC) || defined(IPSEC_SUPPORT) 815 /* Copy old policy into new socket's. */ 816 if (ipsec_copy_pcbpolicy(sotoinpcb(lso), inp) != 0) 817 printf("syncache_socket: could not copy policy\n"); 818#endif 819 INP_HASH_WUNLOCK(&V_tcbinfo); 820 tp = intotcpcb(inp); 821 tcp_state_change(tp, TCPS_SYN_RECEIVED); 822 tp->iss = sc->sc_iss; 823 tp->irs = sc->sc_irs; 824 tcp_rcvseqinit(tp); 825 tcp_sendseqinit(tp); 826 blk = sototcpcb(lso)->t_fb; 827 if (blk != tp->t_fb) { 828 /* 829 * Our parents t_fb was not the default, 830 * we need to release our ref on tp->t_fb and 831 * pickup one on the new entry. 
832 */ 833 struct tcp_function_block *rblk; 834 835 rblk = find_and_ref_tcp_fb(blk); 836 KASSERT(rblk != NULL, 837 ("cannot find blk %p out of syncache?", blk)); 838 if (tp->t_fb->tfb_tcp_fb_fini) 839 (*tp->t_fb->tfb_tcp_fb_fini)(tp); 840 refcount_release(&tp->t_fb->tfb_refcnt); 841 tp->t_fb = rblk; 842 if (tp->t_fb->tfb_tcp_fb_init) { 843 (*tp->t_fb->tfb_tcp_fb_init)(tp); 844 } 845 } 846 tp->snd_wl1 = sc->sc_irs; 847 tp->snd_max = tp->iss + 1; 848 tp->snd_nxt = tp->iss + 1; 849 tp->rcv_up = sc->sc_irs + 1; 850 tp->rcv_wnd = sc->sc_wnd; 851 tp->rcv_adv += tp->rcv_wnd; 852 tp->last_ack_sent = tp->rcv_nxt; 853 854 tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY); 855 if (sc->sc_flags & SCF_NOOPT) 856 tp->t_flags |= TF_NOOPT; 857 else { 858 if (sc->sc_flags & SCF_WINSCALE) { 859 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; 860 tp->snd_scale = sc->sc_requested_s_scale; 861 tp->request_r_scale = sc->sc_requested_r_scale; 862 } 863 if (sc->sc_flags & SCF_TIMESTAMP) { 864 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; 865 tp->ts_recent = sc->sc_tsreflect; 866 tp->ts_recent_age = tcp_ts_getticks(); 867 tp->ts_offset = sc->sc_tsoff; 868 } 869#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 870 if (sc->sc_flags & SCF_SIGNATURE) 871 tp->t_flags |= TF_SIGNATURE; 872#endif 873 if (sc->sc_flags & SCF_SACK) 874 tp->t_flags |= TF_SACK_PERMIT; 875 } 876 877 if (sc->sc_flags & SCF_ECN) 878 tp->t_flags |= TF_ECN_PERMIT; 879 880 /* 881 * Set up MSS and get cached values from tcp_hostcache. 882 * This might overwrite some of the defaults we just set. 883 */ 884 tcp_mss(tp, sc->sc_peer_mss); 885 886 /* 887 * If the SYN,ACK was retransmitted, indicate that CWND to be 888 * limited to one segment in cc_conn_init(). 889 * NB: sc_rxmits counts all SYN,ACK transmits, not just retransmits. 890 */ 891 if (sc->sc_rxmits > 1) 892 tp->snd_cwnd = 1; 893 894#ifdef TCP_OFFLOAD 895 /* 896 * Allow a TOE driver to install its hooks. 
Note that we hold the 897 * pcbinfo lock too and that prevents tcp_usr_accept from accepting a 898 * new connection before the TOE driver has done its thing. 899 */ 900 if (ADDED_BY_TOE(sc)) { 901 struct toedev *tod = sc->sc_tod; 902 903 tod->tod_offload_socket(tod, sc->sc_todctx, so); 904 } 905#endif 906 /* 907 * Copy and activate timers. 908 */ 909 tp->t_keepinit = sototcpcb(lso)->t_keepinit; 910 tp->t_keepidle = sototcpcb(lso)->t_keepidle; 911 tp->t_keepintvl = sototcpcb(lso)->t_keepintvl; 912 tp->t_keepcnt = sototcpcb(lso)->t_keepcnt; 913 tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); 914 915 TCPSTAT_INC(tcps_accepts); 916 return (so); 917 918abort: 919 INP_WUNLOCK(inp); 920abort2: 921 if (so != NULL) 922 soabort(so); 923 return (NULL); 924} 925 926/* 927 * This function gets called when we receive an ACK for a 928 * socket in the LISTEN state. We look up the connection 929 * in the syncache, and if its there, we pull it out of 930 * the cache and turn it into a full-blown connection in 931 * the SYN-RECEIVED state. 932 * 933 * On syncache_socket() success the newly created socket 934 * has its underlying inp locked. 935 */ 936int 937syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, 938 struct socket **lsop, struct mbuf *m) 939{ 940 struct syncache *sc; 941 struct syncache_head *sch; 942 struct syncache scs; 943 char *s; 944 945 /* 946 * Global TCP locks are held because we manipulate the PCB lists 947 * and create a new socket. 948 */ 949 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 950 KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK, 951 ("%s: can handle only ACK", __func__)); 952 953 sc = syncache_lookup(inc, &sch); /* returns locked sch */ 954 SCH_LOCK_ASSERT(sch); 955 956#ifdef INVARIANTS 957 /* 958 * Test code for syncookies comparing the syncache stored 959 * values with the reconstructed values from the cookie. 
960 */ 961 if (sc != NULL) 962 syncookie_cmp(inc, sch, sc, th, to, *lsop); 963#endif 964 965 if (sc == NULL) { 966 /* 967 * There is no syncache entry, so see if this ACK is 968 * a returning syncookie. To do this, first: 969 * A. See if this socket has had a syncache entry dropped in 970 * the past. We don't want to accept a bogus syncookie 971 * if we've never received a SYN. 972 * B. check that the syncookie is valid. If it is, then 973 * cobble up a fake syncache entry, and return. 974 */ 975 if (!V_tcp_syncookies) { 976 SCH_UNLOCK(sch); 977 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) 978 log(LOG_DEBUG, "%s; %s: Spurious ACK, " 979 "segment rejected (syncookies disabled)\n", 980 s, __func__); 981 goto failed; 982 } 983 bzero(&scs, sizeof(scs)); 984 sc = syncookie_lookup(inc, sch, &scs, th, to, *lsop); 985 SCH_UNLOCK(sch); 986 if (sc == NULL) { 987 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) 988 log(LOG_DEBUG, "%s; %s: Segment failed " 989 "SYNCOOKIE authentication, segment rejected " 990 "(probably spoofed)\n", s, __func__); 991 goto failed; 992 } 993#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 994 /* If received ACK has MD5 signature, check it. */ 995 if ((to->to_flags & TOF_SIGNATURE) != 0 && 996 (!TCPMD5_ENABLED() || 997 TCPMD5_INPUT(m, th, to->to_signature) != 0)) { 998 /* Drop the ACK. */ 999 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 1000 log(LOG_DEBUG, "%s; %s: Segment rejected, " 1001 "MD5 signature doesn't match.\n", 1002 s, __func__); 1003 free(s, M_TCPLOG); 1004 } 1005 TCPSTAT_INC(tcps_sig_err_sigopt); 1006 return (-1); /* Do not send RST */ 1007 } 1008#endif /* TCP_SIGNATURE */ 1009 } else { 1010#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 1011 /* 1012 * If listening socket requested TCP digests, check that 1013 * received ACK has signature and it is correct. 1014 * If not, drop the ACK and leave sc entry in th cache, 1015 * because SYN was received with correct signature. 
1016 */ 1017 if (sc->sc_flags & SCF_SIGNATURE) { 1018 if ((to->to_flags & TOF_SIGNATURE) == 0) { 1019 /* No signature */ 1020 TCPSTAT_INC(tcps_sig_err_nosigopt); 1021 SCH_UNLOCK(sch); 1022 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 1023 log(LOG_DEBUG, "%s; %s: Segment " 1024 "rejected, MD5 signature wasn't " 1025 "provided.\n", s, __func__); 1026 free(s, M_TCPLOG); 1027 } 1028 return (-1); /* Do not send RST */ 1029 } 1030 if (!TCPMD5_ENABLED() || 1031 TCPMD5_INPUT(m, th, to->to_signature) != 0) { 1032 /* Doesn't match or no SA */ 1033 SCH_UNLOCK(sch); 1034 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 1035 log(LOG_DEBUG, "%s; %s: Segment " 1036 "rejected, MD5 signature doesn't " 1037 "match.\n", s, __func__); 1038 free(s, M_TCPLOG); 1039 } 1040 return (-1); /* Do not send RST */ 1041 } 1042 } 1043#endif /* TCP_SIGNATURE */ 1044 /* 1045 * Pull out the entry to unlock the bucket row. 1046 * 1047 * NOTE: We must decrease TCPS_SYN_RECEIVED count here, not 1048 * tcp_state_change(). The tcpcb is not existent at this 1049 * moment. A new one will be allocated via syncache_socket-> 1050 * sonewconn->tcp_usr_attach in TCPS_CLOSED state, then 1051 * syncache_socket() will change it to TCPS_SYN_RECEIVED. 1052 */ 1053 TCPSTATES_DEC(TCPS_SYN_RECEIVED); 1054 TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); 1055 sch->sch_length--; 1056#ifdef TCP_OFFLOAD 1057 if (ADDED_BY_TOE(sc)) { 1058 struct toedev *tod = sc->sc_tod; 1059 1060 tod->tod_syncache_removed(tod, sc->sc_todctx); 1061 } 1062#endif 1063 SCH_UNLOCK(sch); 1064 } 1065 1066 /* 1067 * Segment validation: 1068 * ACK must match our initial sequence number + 1 (the SYN|ACK). 
1069 */ 1070 if (th->th_ack != sc->sc_iss + 1) { 1071 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) 1072 log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment " 1073 "rejected\n", s, __func__, th->th_ack, sc->sc_iss); 1074 goto failed; 1075 } 1076 1077 /* 1078 * The SEQ must fall in the window starting at the received 1079 * initial receive sequence number + 1 (the SYN). 1080 */ 1081 if (SEQ_LEQ(th->th_seq, sc->sc_irs) || 1082 SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) { 1083 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) 1084 log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment " 1085 "rejected\n", s, __func__, th->th_seq, sc->sc_irs); 1086 goto failed; 1087 } 1088 1089 /* 1090 * If timestamps were not negotiated during SYN/ACK they 1091 * must not appear on any segment during this session. 1092 */ 1093 if (!(sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) { 1094 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) 1095 log(LOG_DEBUG, "%s; %s: Timestamp not expected, " 1096 "segment rejected\n", s, __func__); 1097 goto failed; 1098 } 1099 1100 /* 1101 * If timestamps were negotiated during SYN/ACK they should 1102 * appear on every segment during this session. 1103 * XXXAO: This is only informal as there have been unverified 1104 * reports of non-compliants stacks. 1105 */ 1106 if ((sc->sc_flags & SCF_TIMESTAMP) && !(to->to_flags & TOF_TS)) { 1107 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 1108 log(LOG_DEBUG, "%s; %s: Timestamp missing, " 1109 "no action\n", s, __func__); 1110 free(s, M_TCPLOG); 1111 s = NULL; 1112 } 1113 } 1114 1115 /* 1116 * If timestamps were negotiated, the reflected timestamp 1117 * must be equal to what we actually sent in the SYN|ACK 1118 * except in the case of 0. Some boxes are known for sending 1119 * broken timestamp replies during the 3whs (and potentially 1120 * during the connection also). 1121 * 1122 * Accept the final ACK of 3whs with reflected timestamp of 0 1123 * instead of sending a RST and deleting the syncache entry. 
1124 */ 1125 if ((to->to_flags & TOF_TS) && to->to_tsecr && 1126 to->to_tsecr != sc->sc_ts) { 1127 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) 1128 log(LOG_DEBUG, "%s; %s: TSECR %u != TS %u, " 1129 "segment rejected\n", 1130 s, __func__, to->to_tsecr, sc->sc_ts); 1131 goto failed; 1132 } 1133 1134 *lsop = syncache_socket(sc, *lsop, m); 1135 1136 if (*lsop == NULL) 1137 TCPSTAT_INC(tcps_sc_aborted); 1138 else 1139 TCPSTAT_INC(tcps_sc_completed); 1140 1141/* how do we find the inp for the new socket? */ 1142 if (sc != &scs) 1143 syncache_free(sc); 1144 return (1); 1145failed: 1146 if (sc != NULL && sc != &scs) 1147 syncache_free(sc); 1148 if (s != NULL) 1149 free(s, M_TCPLOG); 1150 *lsop = NULL; 1151 return (0); 1152} 1153 1154#ifdef TCP_RFC7413 1155static void 1156syncache_tfo_expand(struct syncache *sc, struct socket **lsop, struct mbuf *m, 1157 uint64_t response_cookie) 1158{ 1159 struct inpcb *inp; 1160 struct tcpcb *tp; 1161 unsigned int *pending_counter; 1162 1163 /* 1164 * Global TCP locks are held because we manipulate the PCB lists 1165 * and create a new socket. 1166 */ 1167 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1168 1169 pending_counter = intotcpcb(sotoinpcb(*lsop))->t_tfo_pending; 1170 *lsop = syncache_socket(sc, *lsop, m); 1171 if (*lsop == NULL) { 1172 TCPSTAT_INC(tcps_sc_aborted); 1173 atomic_subtract_int(pending_counter, 1); 1174 } else { 1175 inp = sotoinpcb(*lsop); 1176 tp = intotcpcb(inp); 1177 tp->t_flags |= TF_FASTOPEN; 1178 tp->t_tfo_cookie = response_cookie; 1179 tp->snd_max = tp->iss; 1180 tp->snd_nxt = tp->iss; 1181 tp->t_tfo_pending = pending_counter; 1182 TCPSTAT_INC(tcps_sc_completed); 1183 } 1184} 1185#endif /* TCP_RFC7413 */ 1186 1187/* 1188 * Given a LISTEN socket and an inbound SYN request, add 1189 * this to the syn cache, and send back a segment: 1190 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 1191 * to the source. 1192 * 1193 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 
1194 * Doing so would require that we hold onto the data and deliver it 1195 * to the application. However, if we are the target of a SYN-flood 1196 * DoS attack, an attacker could send data which would eventually 1197 * consume all available buffer space if it were ACKed. By not ACKing 1198 * the data, we avoid this DoS scenario. 1199 * 1200 * The exception to the above is when a SYN with a valid TCP Fast Open (TFO) 1201 * cookie is processed, V_tcp_fastopen_enabled set to true, and the 1202 * TCP_FASTOPEN socket option is set. In this case, a new socket is created 1203 * and returned via lsop, the mbuf is not freed so that tcp_input() can 1204 * queue its data to the socket, and 1 is returned to indicate the 1205 * TFO-socket-creation path was taken. 1206 */ 1207int 1208syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, 1209 struct inpcb *inp, struct socket **lsop, struct mbuf *m, void *tod, 1210 void *todctx) 1211{ 1212 struct tcpcb *tp; 1213 struct socket *so; 1214 struct syncache *sc = NULL; 1215 struct syncache_head *sch; 1216 struct mbuf *ipopts = NULL; 1217 u_int ltflags; 1218 int win, sb_hiwat, ip_ttl, ip_tos; 1219 char *s; 1220 int rv = 0; 1221#ifdef INET6 1222 int autoflowlabel = 0; 1223#endif 1224#ifdef MAC 1225 struct label *maclabel; 1226#endif 1227 struct syncache scs; 1228 struct ucred *cred; 1229#ifdef TCP_RFC7413 1230 uint64_t tfo_response_cookie; 1231 int tfo_cookie_valid = 0; 1232 int tfo_response_cookie_valid = 0; 1233#endif 1234 1235 INP_WLOCK_ASSERT(inp); /* listen socket */ 1236 KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN, 1237 ("%s: unexpected tcp flags", __func__)); 1238 1239 /* 1240 * Combine all so/tp operations very early to drop the INP lock as 1241 * soon as possible. 
1242 */ 1243 so = *lsop; 1244 tp = sototcpcb(so); 1245 cred = crhold(so->so_cred); 1246 1247#ifdef INET6 1248 if ((inc->inc_flags & INC_ISIPV6) && 1249 (inp->inp_flags & IN6P_AUTOFLOWLABEL)) 1250 autoflowlabel = 1; 1251#endif 1252 ip_ttl = inp->inp_ip_ttl; 1253 ip_tos = inp->inp_ip_tos; 1254 win = sbspace(&so->so_rcv); 1255 sb_hiwat = so->so_rcv.sb_hiwat; 1256 ltflags = (tp->t_flags & (TF_NOOPT | TF_SIGNATURE)); 1257 1258#ifdef TCP_RFC7413 1259 if (V_tcp_fastopen_enabled && (tp->t_flags & TF_FASTOPEN) && 1260 (tp->t_tfo_pending != NULL) && (to->to_flags & TOF_FASTOPEN)) { 1261 /* 1262 * Limit the number of pending TFO connections to 1263 * approximately half of the queue limit. This prevents TFO 1264 * SYN floods from starving the service by filling the 1265 * listen queue with bogus TFO connections. 1266 */ 1267 if (atomic_fetchadd_int(tp->t_tfo_pending, 1) <= 1268 (so->so_qlimit / 2)) { 1269 int result; 1270 1271 result = tcp_fastopen_check_cookie(inc, 1272 to->to_tfo_cookie, to->to_tfo_len, 1273 &tfo_response_cookie); 1274 tfo_cookie_valid = (result > 0); 1275 tfo_response_cookie_valid = (result >= 0); 1276 } else 1277 atomic_subtract_int(tp->t_tfo_pending, 1); 1278 } 1279#endif 1280 1281 /* By the time we drop the lock these should no longer be used. */ 1282 so = NULL; 1283 tp = NULL; 1284 1285#ifdef MAC 1286 if (mac_syncache_init(&maclabel) != 0) { 1287 INP_WUNLOCK(inp); 1288 goto done; 1289 } else 1290 mac_syncache_create(maclabel, inp); 1291#endif 1292#ifdef TCP_RFC7413 1293 if (!tfo_cookie_valid) 1294#endif 1295 INP_WUNLOCK(inp); 1296 1297 /* 1298 * Remember the IP options, if any. 1299 */ 1300#ifdef INET6 1301 if (!(inc->inc_flags & INC_ISIPV6)) 1302#endif 1303#ifdef INET 1304 ipopts = (m) ? ip_srcroute(m) : NULL; 1305#else 1306 ipopts = NULL; 1307#endif 1308 1309#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 1310 /* 1311 * If listening socket requested TCP digests, check that received 1312 * SYN has signature and it is correct. 
If signature doesn't match 1313 * or TCP_SIGNATURE support isn't enabled, drop the packet. 1314 */ 1315 if (ltflags & TF_SIGNATURE) { 1316 if ((to->to_flags & TOF_SIGNATURE) == 0) { 1317 TCPSTAT_INC(tcps_sig_err_nosigopt); 1318 goto done; 1319 } 1320 if (!TCPMD5_ENABLED() || 1321 TCPMD5_INPUT(m, th, to->to_signature) != 0) 1322 goto done; 1323 } 1324#endif /* TCP_SIGNATURE */ 1325 /* 1326 * See if we already have an entry for this connection. 1327 * If we do, resend the SYN,ACK, and reset the retransmit timer. 1328 * 1329 * XXX: should the syncache be re-initialized with the contents 1330 * of the new SYN here (which may have different options?) 1331 * 1332 * XXX: We do not check the sequence number to see if this is a 1333 * real retransmit or a new connection attempt. The question is 1334 * how to handle such a case; either ignore it as spoofed, or 1335 * drop the current entry and create a new one? 1336 */ 1337 sc = syncache_lookup(inc, &sch); /* returns locked entry */ 1338 SCH_LOCK_ASSERT(sch); 1339 if (sc != NULL) { 1340#ifdef TCP_RFC7413 1341 if (tfo_cookie_valid) 1342 INP_WUNLOCK(inp); 1343#endif 1344 TCPSTAT_INC(tcps_sc_dupsyn); 1345 if (ipopts) { 1346 /* 1347 * If we were remembering a previous source route, 1348 * forget it and use the new one we've been given. 1349 */ 1350 if (sc->sc_ipopts) 1351 (void) m_free(sc->sc_ipopts); 1352 sc->sc_ipopts = ipopts; 1353 } 1354 /* 1355 * Update timestamp if present. 1356 */ 1357 if ((sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) 1358 sc->sc_tsreflect = to->to_tsval; 1359 else 1360 sc->sc_flags &= ~SCF_TIMESTAMP; 1361#ifdef MAC 1362 /* 1363 * Since we have already unconditionally allocated label 1364 * storage, free it up. The syncache entry will already 1365 * have an initialized label we can use. 1366 */ 1367 mac_syncache_destroy(&maclabel); 1368#endif 1369 /* Retransmit SYN|ACK and reset retransmit count. 
*/ 1370 if ((s = tcp_log_addrs(&sc->sc_inc, th, NULL, NULL))) { 1371 log(LOG_DEBUG, "%s; %s: Received duplicate SYN, " 1372 "resetting timer and retransmitting SYN|ACK\n", 1373 s, __func__); 1374 free(s, M_TCPLOG); 1375 } 1376 if (syncache_respond(sc, sch, 1, m) == 0) { 1377 sc->sc_rxmits = 0; 1378 syncache_timeout(sc, sch, 1); 1379 TCPSTAT_INC(tcps_sndacks); 1380 TCPSTAT_INC(tcps_sndtotal); 1381 } 1382 SCH_UNLOCK(sch); 1383 goto done; 1384 } 1385 1386#ifdef TCP_RFC7413 1387 if (tfo_cookie_valid) { 1388 bzero(&scs, sizeof(scs)); 1389 sc = &scs; 1390 goto skip_alloc; 1391 } 1392#endif 1393 1394 sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); 1395 if (sc == NULL) { 1396 /* 1397 * The zone allocator couldn't provide more entries. 1398 * Treat this as if the cache was full; drop the oldest 1399 * entry and insert the new one. 1400 */ 1401 TCPSTAT_INC(tcps_sc_zonefail); 1402 if ((sc = TAILQ_LAST(&sch->sch_bucket, sch_head)) != NULL) 1403 syncache_drop(sc, sch); 1404 sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); 1405 if (sc == NULL) { 1406 if (V_tcp_syncookies) { 1407 bzero(&scs, sizeof(scs)); 1408 sc = &scs; 1409 } else { 1410 SCH_UNLOCK(sch); 1411 if (ipopts) 1412 (void) m_free(ipopts); 1413 goto done; 1414 } 1415 } 1416 } 1417 1418#ifdef TCP_RFC7413 1419skip_alloc: 1420 if (!tfo_cookie_valid && tfo_response_cookie_valid) 1421 sc->sc_tfo_cookie = &tfo_response_cookie; 1422#endif 1423 1424 /* 1425 * Fill in the syncache values. 
1426 */ 1427#ifdef MAC 1428 sc->sc_label = maclabel; 1429#endif 1430 sc->sc_cred = cred; 1431 cred = NULL; 1432 sc->sc_ipopts = ipopts; 1433 bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); 1434#ifdef INET6 1435 if (!(inc->inc_flags & INC_ISIPV6)) 1436#endif 1437 { 1438 sc->sc_ip_tos = ip_tos; 1439 sc->sc_ip_ttl = ip_ttl; 1440 } 1441#ifdef TCP_OFFLOAD 1442 sc->sc_tod = tod; 1443 sc->sc_todctx = todctx; 1444#endif 1445 sc->sc_irs = th->th_seq; 1446 sc->sc_iss = arc4random(); 1447 sc->sc_flags = 0; 1448 sc->sc_flowlabel = 0; 1449 1450 /* 1451 * Initial receive window: clip sbspace to [0 .. TCP_MAXWIN]. 1452 * win was derived from socket earlier in the function. 1453 */ 1454 win = imax(win, 0); 1455 win = imin(win, TCP_MAXWIN); 1456 sc->sc_wnd = win; 1457 1458 if (V_tcp_do_rfc1323) { 1459 /* 1460 * A timestamp received in a SYN makes 1461 * it ok to send timestamp requests and replies. 1462 */ 1463 if (to->to_flags & TOF_TS) { 1464 sc->sc_tsreflect = to->to_tsval; 1465 sc->sc_ts = tcp_ts_getticks(); 1466 sc->sc_flags |= SCF_TIMESTAMP; 1467 } 1468 if (to->to_flags & TOF_SCALE) { 1469 int wscale = 0; 1470 1471 /* 1472 * Pick the smallest possible scaling factor that 1473 * will still allow us to scale up to sb_max, aka 1474 * kern.ipc.maxsockbuf. 1475 * 1476 * We do this because there are broken firewalls that 1477 * will corrupt the window scale option, leading to 1478 * the other endpoint believing that our advertised 1479 * window is unscaled. At scale factors larger than 1480 * 5 the unscaled window will drop below 1500 bytes, 1481 * leading to serious problems when traversing these 1482 * broken firewalls. 1483 * 1484 * With the default maxsockbuf of 256K, a scale factor 1485 * of 3 will be chosen by this algorithm. Those who 1486 * choose a larger maxsockbuf should watch out 1487 * for the compatibility problems mentioned above. 1488 * 1489 * RFC1323: The Window field in a SYN (i.e., a <SYN> 1490 * or <SYN,ACK>) segment itself is never scaled. 
1491 */ 1492 while (wscale < TCP_MAX_WINSHIFT && 1493 (TCP_MAXWIN << wscale) < sb_max) 1494 wscale++; 1495 sc->sc_requested_r_scale = wscale; 1496 sc->sc_requested_s_scale = to->to_wscale; 1497 sc->sc_flags |= SCF_WINSCALE; 1498 } 1499 } 1500#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 1501 /* 1502 * If listening socket requested TCP digests, flag this in the 1503 * syncache so that syncache_respond() will do the right thing 1504 * with the SYN+ACK. 1505 */ 1506 if (ltflags & TF_SIGNATURE) 1507 sc->sc_flags |= SCF_SIGNATURE; 1508#endif /* TCP_SIGNATURE */ 1509 if (to->to_flags & TOF_SACKPERM) 1510 sc->sc_flags |= SCF_SACK; 1511 if (to->to_flags & TOF_MSS) 1512 sc->sc_peer_mss = to->to_mss; /* peer mss may be zero */ 1513 if (ltflags & TF_NOOPT) 1514 sc->sc_flags |= SCF_NOOPT; 1515 if ((th->th_flags & (TH_ECE|TH_CWR)) && V_tcp_do_ecn) 1516 sc->sc_flags |= SCF_ECN; 1517 1518 if (V_tcp_syncookies) 1519 sc->sc_iss = syncookie_generate(sch, sc); 1520#ifdef INET6 1521 if (autoflowlabel) { 1522 if (V_tcp_syncookies) 1523 sc->sc_flowlabel = sc->sc_iss; 1524 else 1525 sc->sc_flowlabel = ip6_randomflowlabel(); 1526 sc->sc_flowlabel = htonl(sc->sc_flowlabel) & IPV6_FLOWLABEL_MASK; 1527 } 1528#endif 1529 SCH_UNLOCK(sch); 1530 1531#ifdef TCP_RFC7413 1532 if (tfo_cookie_valid) { 1533 syncache_tfo_expand(sc, lsop, m, tfo_response_cookie); 1534 /* INP_WUNLOCK(inp) will be performed by the called */ 1535 rv = 1; 1536 goto tfo_done; 1537 } 1538#endif 1539 1540 /* 1541 * Do a standard 3-way handshake. 
1542 */ 1543 if (syncache_respond(sc, sch, 0, m) == 0) { 1544 if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs) 1545 syncache_free(sc); 1546 else if (sc != &scs) 1547 syncache_insert(sc, sch); /* locks and unlocks sch */ 1548 TCPSTAT_INC(tcps_sndacks); 1549 TCPSTAT_INC(tcps_sndtotal); 1550 } else { 1551 if (sc != &scs) 1552 syncache_free(sc); 1553 TCPSTAT_INC(tcps_sc_dropped); 1554 } 1555 1556done: 1557 if (m) { 1558 *lsop = NULL; 1559 m_freem(m); 1560 } 1561#ifdef TCP_RFC7413 1562tfo_done: 1563#endif 1564 if (cred != NULL) 1565 crfree(cred); 1566#ifdef MAC 1567 if (sc == &scs) 1568 mac_syncache_destroy(&maclabel); 1569#endif 1570 return (rv); 1571} 1572 1573/* 1574 * Send SYN|ACK to the peer. Either in response to the peer's SYN, 1575 * i.e. m0 != NULL, or upon 3WHS ACK timeout, i.e. m0 == NULL. 1576 */ 1577static int 1578syncache_respond(struct syncache *sc, struct syncache_head *sch, int locked, 1579 const struct mbuf *m0) 1580{ 1581 struct ip *ip = NULL; 1582 struct mbuf *m; 1583 struct tcphdr *th = NULL; 1584 int optlen, error = 0; /* Make compiler happy */ 1585 u_int16_t hlen, tlen, mssopt; 1586 struct tcpopt to; 1587#ifdef INET6 1588 struct ip6_hdr *ip6 = NULL; 1589#endif 1590 hlen = 1591#ifdef INET6 1592 (sc->sc_inc.inc_flags & INC_ISIPV6) ? sizeof(struct ip6_hdr) : 1593#endif 1594 sizeof(struct ip); 1595 tlen = hlen + sizeof(struct tcphdr); 1596 1597 /* Determine MSS we advertize to other end of connection. */ 1598 mssopt = tcp_mssopt(&sc->sc_inc); 1599 if (sc->sc_peer_mss) 1600 mssopt = max( min(sc->sc_peer_mss, mssopt), V_tcp_minmss); 1601 1602 /* XXX: Assume that the entire packet will fit in a header mbuf. */ 1603 KASSERT(max_linkhdr + tlen + TCP_MAXOLEN <= MHLEN, 1604 ("syncache: mbuf too small")); 1605 1606 /* Create the IP+TCP header from scratch. 
*/ 1607 m = m_gethdr(M_NOWAIT, MT_DATA); 1608 if (m == NULL) 1609 return (ENOBUFS); 1610#ifdef MAC 1611 mac_syncache_create_mbuf(sc->sc_label, m); 1612#endif 1613 m->m_data += max_linkhdr; 1614 m->m_len = tlen; 1615 m->m_pkthdr.len = tlen; 1616 m->m_pkthdr.rcvif = NULL; 1617 1618#ifdef INET6 1619 if (sc->sc_inc.inc_flags & INC_ISIPV6) { 1620 ip6 = mtod(m, struct ip6_hdr *); 1621 ip6->ip6_vfc = IPV6_VERSION; 1622 ip6->ip6_nxt = IPPROTO_TCP; 1623 ip6->ip6_src = sc->sc_inc.inc6_laddr; 1624 ip6->ip6_dst = sc->sc_inc.inc6_faddr; 1625 ip6->ip6_plen = htons(tlen - hlen); 1626 /* ip6_hlim is set after checksum */ 1627 ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK; 1628 ip6->ip6_flow |= sc->sc_flowlabel; 1629 1630 th = (struct tcphdr *)(ip6 + 1); 1631 } 1632#endif 1633#if defined(INET6) && defined(INET) 1634 else 1635#endif 1636#ifdef INET 1637 { 1638 ip = mtod(m, struct ip *); 1639 ip->ip_v = IPVERSION; 1640 ip->ip_hl = sizeof(struct ip) >> 2; 1641 ip->ip_len = htons(tlen); 1642 ip->ip_id = 0; 1643 ip->ip_off = 0; 1644 ip->ip_sum = 0; 1645 ip->ip_p = IPPROTO_TCP; 1646 ip->ip_src = sc->sc_inc.inc_laddr; 1647 ip->ip_dst = sc->sc_inc.inc_faddr; 1648 ip->ip_ttl = sc->sc_ip_ttl; 1649 ip->ip_tos = sc->sc_ip_tos; 1650 1651 /* 1652 * See if we should do MTU discovery. 
Route lookups are 1653 * expensive, so we will only unset the DF bit if: 1654 * 1655 * 1) path_mtu_discovery is disabled 1656 * 2) the SCF_UNREACH flag has been set 1657 */ 1658 if (V_path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0)) 1659 ip->ip_off |= htons(IP_DF); 1660 1661 th = (struct tcphdr *)(ip + 1); 1662 } 1663#endif /* INET */ 1664 th->th_sport = sc->sc_inc.inc_lport; 1665 th->th_dport = sc->sc_inc.inc_fport; 1666 1667 th->th_seq = htonl(sc->sc_iss); 1668 th->th_ack = htonl(sc->sc_irs + 1); 1669 th->th_off = sizeof(struct tcphdr) >> 2; 1670 th->th_x2 = 0; 1671 th->th_flags = TH_SYN|TH_ACK; 1672 th->th_win = htons(sc->sc_wnd); 1673 th->th_urp = 0; 1674 1675 if (sc->sc_flags & SCF_ECN) { 1676 th->th_flags |= TH_ECE; 1677 TCPSTAT_INC(tcps_ecn_shs); 1678 } 1679 1680 /* Tack on the TCP options. */ 1681 if ((sc->sc_flags & SCF_NOOPT) == 0) { 1682 to.to_flags = 0; 1683 1684 to.to_mss = mssopt; 1685 to.to_flags = TOF_MSS; 1686 if (sc->sc_flags & SCF_WINSCALE) { 1687 to.to_wscale = sc->sc_requested_r_scale; 1688 to.to_flags |= TOF_SCALE; 1689 } 1690 if (sc->sc_flags & SCF_TIMESTAMP) { 1691 /* Virgin timestamp or TCP cookie enhanced one. */ 1692 to.to_tsval = sc->sc_ts; 1693 to.to_tsecr = sc->sc_tsreflect; 1694 to.to_flags |= TOF_TS; 1695 } 1696 if (sc->sc_flags & SCF_SACK) 1697 to.to_flags |= TOF_SACKPERM; 1698#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 1699 if (sc->sc_flags & SCF_SIGNATURE) 1700 to.to_flags |= TOF_SIGNATURE; 1701#endif 1702#ifdef TCP_RFC7413 1703 if (sc->sc_tfo_cookie) { 1704 to.to_flags |= TOF_FASTOPEN; 1705 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; 1706 to.to_tfo_cookie = sc->sc_tfo_cookie; 1707 /* don't send cookie again when retransmitting response */ 1708 sc->sc_tfo_cookie = NULL; 1709 } 1710#endif 1711 optlen = tcp_addoptions(&to, (u_char *)(th + 1)); 1712 1713 /* Adjust headers by option size. 
*/ 1714 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 1715 m->m_len += optlen; 1716 m->m_pkthdr.len += optlen; 1717#ifdef INET6 1718 if (sc->sc_inc.inc_flags & INC_ISIPV6) 1719 ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) + optlen); 1720 else 1721#endif 1722 ip->ip_len = htons(ntohs(ip->ip_len) + optlen); 1723#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 1724 if (sc->sc_flags & SCF_SIGNATURE) { 1725 KASSERT(to.to_flags & TOF_SIGNATURE, 1726 ("tcp_addoptions() didn't set tcp_signature")); 1727 1728 /* NOTE: to.to_signature is inside of mbuf */ 1729 if (!TCPMD5_ENABLED() || 1730 TCPMD5_OUTPUT(m, th, to.to_signature) != 0) { 1731 m_freem(m); 1732 return (EACCES); 1733 } 1734 } 1735#endif 1736 } else 1737 optlen = 0; 1738 1739 M_SETFIB(m, sc->sc_inc.inc_fibnum); 1740 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 1741 /* 1742 * If we have peer's SYN and it has a flowid, then let's assign it to 1743 * our SYN|ACK. ip6_output() and ip_output() will not assign flowid 1744 * to SYN|ACK due to lack of inp here. 
1745 */ 1746 if (m0 != NULL && M_HASHTYPE_GET(m0) != M_HASHTYPE_NONE) { 1747 m->m_pkthdr.flowid = m0->m_pkthdr.flowid; 1748 M_HASHTYPE_SET(m, M_HASHTYPE_GET(m0)); 1749 } 1750#ifdef INET6 1751 if (sc->sc_inc.inc_flags & INC_ISIPV6) { 1752 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 1753 th->th_sum = in6_cksum_pseudo(ip6, tlen + optlen - hlen, 1754 IPPROTO_TCP, 0); 1755 ip6->ip6_hlim = in6_selecthlim(NULL, NULL); 1756#ifdef TCP_OFFLOAD 1757 if (ADDED_BY_TOE(sc)) { 1758 struct toedev *tod = sc->sc_tod; 1759 1760 error = tod->tod_syncache_respond(tod, sc->sc_todctx, m); 1761 1762 return (error); 1763 } 1764#endif 1765 error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); 1766 } 1767#endif 1768#if defined(INET6) && defined(INET) 1769 else 1770#endif 1771#ifdef INET 1772 { 1773 m->m_pkthdr.csum_flags = CSUM_TCP; 1774 th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 1775 htons(tlen + optlen - hlen + IPPROTO_TCP)); 1776#ifdef TCP_OFFLOAD 1777 if (ADDED_BY_TOE(sc)) { 1778 struct toedev *tod = sc->sc_tod; 1779 1780 error = tod->tod_syncache_respond(tod, sc->sc_todctx, m); 1781 1782 return (error); 1783 } 1784#endif 1785 error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL); 1786 } 1787#endif 1788 return (error); 1789} 1790 1791/* 1792 * The purpose of syncookies is to handle spoofed SYN flooding DoS attacks 1793 * that exceed the capacity of the syncache by avoiding the storage of any 1794 * of the SYNs we receive. Syncookies defend against blind SYN flooding 1795 * attacks where the attacker does not have access to our responses. 1796 * 1797 * Syncookies encode and include all necessary information about the 1798 * connection setup within the SYN|ACK that we send back. That way we 1799 * can avoid keeping any local state until the ACK to our SYN|ACK returns 1800 * (if ever). Normally the syncache and syncookies are running in parallel 1801 * with the latter taking over when the former is exhausted. 
When matching
 * syncache entry is found the syncookie is ignored.
 *
 * The only reliable information persisting the 3WHS is our initial sequence
 * number ISS of 32 bits.  Syncookies embed a cryptographically sufficient
 * strong hash (MAC) value and a few bits of TCP SYN options in the ISS
 * of our SYN|ACK.  The MAC can be recomputed when the ACK to our SYN|ACK
 * returns and signifies a legitimate connection if it matches the ACK.
 *
 * The available space of 32 bits to store the hash and to encode the SYN
 * option information is very tight and we should have at least 24 bits for
 * the MAC to keep the number of guesses by blind spoofing reasonably high.
 *
 * SYN option information we have to encode to fully restore a connection:
 * MSS: is important to choose an optimal segment size to avoid IP level
 *   fragmentation along the path.  The common MSS values can be encoded
 *   in a 3-bit table.  Uncommon values are captured by the next lower value
 *   in the table leading to a slight increase in packetization overhead.
 * WSCALE: is necessary to allow large windows to be used for high delay-
 *   bandwidth product links.  Not scaling the window when it was initially
 *   negotiated is bad for performance as lack of scaling further decreases
 *   the apparent available send window.  We only need to encode the WSCALE
 *   we received from the remote end.  Our end can be recalculated at any
 *   time.  The common WSCALE values can be encoded in a 3-bit table.
 *   Uncommon values are captured by the next lower value in the table
 *   making us under-estimate the available window size halving our
 *   theoretically possible maximum throughput for that connection.
 * SACK: Greatly assists in packet loss recovery and requires 1 bit.
 * TIMESTAMP and SIGNATURE are not encoded because they are permanent options
 * that are included in all segments on a connection.
We enable them when
 * the ACK has them.
 *
 * Security of syncookies and attack vectors:
 *
 * The MAC is computed over (faddr||laddr||fport||lport||irs||flags||secmod)
 * together with the global secret to make it unique per connection attempt.
 * Thus any change of any of those parameters results in a different MAC output
 * in an unpredictable way unless a collision is encountered.  24 bits of the
 * MAC are embedded into the ISS.
 *
 * To prevent replay attacks two rotating global secrets are updated with a
 * new random value every 15 seconds.  The life-time of a syncookie is thus
 * 15-30 seconds.
 *
 * Vector 1: Attacking the secret.  This requires finding a weakness in the
 * MAC itself or the way it is used here.  The attacker can do a chosen plain
 * text attack by varying and testing all the parameters under his control.
 * The strength depends on the size and randomness of the secret, and the
 * cryptographic security of the MAC function.  Due to the constant updating
 * of the secret the attacker has at most 29.999 seconds to find the secret
 * and launch spoofed connections.  After that he has to start all over again.
 *
 * Vector 2: Collision attack on the MAC of a single ACK.  With a 24 bit MAC
 * size an average of 4,823 attempts are required for a 50% chance of success
 * to spoof a single syncookie (birthday collision paradox).  However the
 * attacker is blind and doesn't know if one of his attempts succeeded unless
 * he has a side channel to infer success from.  A single connection setup
 * success average of 90% requires 8,790 packets, 99.99% requires 17,578 packets.
 * This many attempts are required for each one blind spoofed connection.  For
 * every additional spoofed connection he has to launch another N attempts.
1861 * Thus for a sustained rate 100 spoofed connections per second approximately 1862 * 1,800,000 packets per second would have to be sent. 1863 * 1864 * NB: The MAC function should be fast so that it doesn't become a CPU 1865 * exhaustion attack vector itself. 1866 * 1867 * References: 1868 * RFC4987 TCP SYN Flooding Attacks and Common Mitigations 1869 * SYN cookies were first proposed by cryptographer Dan J. Bernstein in 1996 1870 * http://cr.yp.to/syncookies.html (overview) 1871 * http://cr.yp.to/syncookies/archive (details) 1872 * 1873 * 1874 * Schematic construction of a syncookie enabled Initial Sequence Number: 1875 * 0 1 2 3 1876 * 12345678901234567890123456789012 1877 * |xxxxxxxxxxxxxxxxxxxxxxxxWWWMMMSP| 1878 * 1879 * x 24 MAC (truncated) 1880 * W 3 Send Window Scale index 1881 * M 3 MSS index 1882 * S 1 SACK permitted 1883 * P 1 Odd/even secret 1884 */ 1885 1886/* 1887 * Distribution and probability of certain MSS values. Those in between are 1888 * rounded down to the next lower one. 1889 * [An Analysis of TCP Maximum Segment Sizes, S. Alcock and R. Nelson, 2011] 1890 * .2% .3% 5% 7% 7% 20% 15% 45% 1891 */ 1892static int tcp_sc_msstab[] = { 216, 536, 1200, 1360, 1400, 1440, 1452, 1460 }; 1893 1894/* 1895 * Distribution and probability of certain WSCALE values. We have to map the 1896 * (send) window scale (shift) option with a range of 0-14 from 4 bits into 3 1897 * bits based on prevalence of certain values. Where we don't have an exact 1898 * match for are rounded down to the next lower one letting us under-estimate 1899 * the true available window. At the moment this would happen only for the 1900 * very uncommon values 3, 5 and those above 8 (more than 16MB socket buffer 1901 * and window size). The absence of the WSCALE option (no scaling in either 1902 * direction) is encoded with index zero. 
1903 * [WSCALE values histograms, Allman, 2012] 1904 * X 10 10 35 5 6 14 10% by host 1905 * X 11 4 5 5 18 49 3% by connections 1906 */ 1907static int tcp_sc_wstab[] = { 0, 0, 1, 2, 4, 6, 7, 8 }; 1908 1909/* 1910 * Compute the MAC for the SYN cookie. SIPHASH-2-4 is chosen for its speed 1911 * and good cryptographic properties. 1912 */ 1913static uint32_t 1914syncookie_mac(struct in_conninfo *inc, tcp_seq irs, uint8_t flags, 1915 uint8_t *secbits, uintptr_t secmod) 1916{ 1917 SIPHASH_CTX ctx; 1918 uint32_t siphash[2]; 1919 1920 SipHash24_Init(&ctx); 1921 SipHash_SetKey(&ctx, secbits); 1922 switch (inc->inc_flags & INC_ISIPV6) { 1923#ifdef INET 1924 case 0: 1925 SipHash_Update(&ctx, &inc->inc_faddr, sizeof(inc->inc_faddr)); 1926 SipHash_Update(&ctx, &inc->inc_laddr, sizeof(inc->inc_laddr)); 1927 break; 1928#endif 1929#ifdef INET6 1930 case INC_ISIPV6: 1931 SipHash_Update(&ctx, &inc->inc6_faddr, sizeof(inc->inc6_faddr)); 1932 SipHash_Update(&ctx, &inc->inc6_laddr, sizeof(inc->inc6_laddr)); 1933 break; 1934#endif 1935 } 1936 SipHash_Update(&ctx, &inc->inc_fport, sizeof(inc->inc_fport)); 1937 SipHash_Update(&ctx, &inc->inc_lport, sizeof(inc->inc_lport)); 1938 SipHash_Update(&ctx, &irs, sizeof(irs)); 1939 SipHash_Update(&ctx, &flags, sizeof(flags)); 1940 SipHash_Update(&ctx, &secmod, sizeof(secmod)); 1941 SipHash_Final((u_int8_t *)&siphash, &ctx); 1942 1943 return (siphash[0] ^ siphash[1]); 1944} 1945 1946static tcp_seq 1947syncookie_generate(struct syncache_head *sch, struct syncache *sc) 1948{ 1949 u_int i, mss, secbit, wscale; 1950 uint32_t iss, hash; 1951 uint8_t *secbits; 1952 union syncookie cookie; 1953 1954 SCH_LOCK_ASSERT(sch); 1955 1956 cookie.cookie = 0; 1957 1958 /* Map our computed MSS into the 3-bit index. 
*/ 1959 mss = min(tcp_mssopt(&sc->sc_inc), max(sc->sc_peer_mss, V_tcp_minmss)); 1960 for (i = nitems(tcp_sc_msstab) - 1; tcp_sc_msstab[i] > mss && i > 0; 1961 i--) 1962 ; 1963 cookie.flags.mss_idx = i; 1964 1965 /* 1966 * Map the send window scale into the 3-bit index but only if 1967 * the wscale option was received. 1968 */ 1969 if (sc->sc_flags & SCF_WINSCALE) { 1970 wscale = sc->sc_requested_s_scale; 1971 for (i = nitems(tcp_sc_wstab) - 1; 1972 tcp_sc_wstab[i] > wscale && i > 0; 1973 i--) 1974 ; 1975 cookie.flags.wscale_idx = i; 1976 } 1977 1978 /* Can we do SACK? */ 1979 if (sc->sc_flags & SCF_SACK) 1980 cookie.flags.sack_ok = 1; 1981 1982 /* Which of the two secrets to use. */ 1983 secbit = sch->sch_sc->secret.oddeven & 0x1; 1984 cookie.flags.odd_even = secbit; 1985 1986 secbits = sch->sch_sc->secret.key[secbit]; 1987 hash = syncookie_mac(&sc->sc_inc, sc->sc_irs, cookie.cookie, secbits, 1988 (uintptr_t)sch); 1989 1990 /* 1991 * Put the flags into the hash and XOR them to get better ISS number 1992 * variance. This doesn't enhance the cryptographic strength and is 1993 * done to prevent the 8 cookie bits from showing up directly on the 1994 * wire. 1995 */ 1996 iss = hash & ~0xff; 1997 iss |= cookie.cookie ^ (hash >> 24); 1998 1999 /* Randomize the timestamp. */ 2000 if (sc->sc_flags & SCF_TIMESTAMP) { 2001 sc->sc_ts = arc4random(); 2002 sc->sc_tsoff = sc->sc_ts - tcp_ts_getticks(); 2003 } 2004 2005 TCPSTAT_INC(tcps_sc_sendcookie); 2006 return (iss); 2007} 2008 2009static struct syncache * 2010syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, 2011 struct syncache *sc, struct tcphdr *th, struct tcpopt *to, 2012 struct socket *lso) 2013{ 2014 uint32_t hash; 2015 uint8_t *secbits; 2016 tcp_seq ack, seq; 2017 int wnd, wscale = 0; 2018 union syncookie cookie; 2019 2020 SCH_LOCK_ASSERT(sch); 2021 2022 /* 2023 * Pull information out of SYN-ACK/ACK and revert sequence number 2024 * advances. 
2025 */ 2026 ack = th->th_ack - 1; 2027 seq = th->th_seq - 1; 2028 2029 /* 2030 * Unpack the flags containing enough information to restore the 2031 * connection. 2032 */ 2033 cookie.cookie = (ack & 0xff) ^ (ack >> 24); 2034 2035 /* Which of the two secrets to use. */ 2036 secbits = sch->sch_sc->secret.key[cookie.flags.odd_even]; 2037 2038 hash = syncookie_mac(inc, seq, cookie.cookie, secbits, (uintptr_t)sch); 2039 2040 /* The recomputed hash matches the ACK if this was a genuine cookie. */ 2041 if ((ack & ~0xff) != (hash & ~0xff)) 2042 return (NULL); 2043 2044 /* Fill in the syncache values. */ 2045 sc->sc_flags = 0; 2046 bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); 2047 sc->sc_ipopts = NULL; 2048 2049 sc->sc_irs = seq; 2050 sc->sc_iss = ack; 2051 2052 switch (inc->inc_flags & INC_ISIPV6) { 2053#ifdef INET 2054 case 0: 2055 sc->sc_ip_ttl = sotoinpcb(lso)->inp_ip_ttl; 2056 sc->sc_ip_tos = sotoinpcb(lso)->inp_ip_tos; 2057 break; 2058#endif 2059#ifdef INET6 2060 case INC_ISIPV6: 2061 if (sotoinpcb(lso)->inp_flags & IN6P_AUTOFLOWLABEL) 2062 sc->sc_flowlabel = sc->sc_iss & IPV6_FLOWLABEL_MASK; 2063 break; 2064#endif 2065 } 2066 2067 sc->sc_peer_mss = tcp_sc_msstab[cookie.flags.mss_idx]; 2068 2069 /* We can simply recompute receive window scale we sent earlier. */ 2070 while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < sb_max) 2071 wscale++; 2072 2073 /* Only use wscale if it was enabled in the orignal SYN. 
*/ 2074 if (cookie.flags.wscale_idx > 0) { 2075 sc->sc_requested_r_scale = wscale; 2076 sc->sc_requested_s_scale = tcp_sc_wstab[cookie.flags.wscale_idx]; 2077 sc->sc_flags |= SCF_WINSCALE; 2078 } 2079 2080 wnd = sbspace(&lso->so_rcv); 2081 wnd = imax(wnd, 0); 2082 wnd = imin(wnd, TCP_MAXWIN); 2083 sc->sc_wnd = wnd; 2084 2085 if (cookie.flags.sack_ok) 2086 sc->sc_flags |= SCF_SACK; 2087 2088 if (to->to_flags & TOF_TS) { 2089 sc->sc_flags |= SCF_TIMESTAMP; 2090 sc->sc_tsreflect = to->to_tsval; 2091 sc->sc_ts = to->to_tsecr; 2092 sc->sc_tsoff = to->to_tsecr - tcp_ts_getticks(); 2093 } 2094 2095 if (to->to_flags & TOF_SIGNATURE) 2096 sc->sc_flags |= SCF_SIGNATURE; 2097 2098 sc->sc_rxmits = 0; 2099 2100 TCPSTAT_INC(tcps_sc_recvcookie); 2101 return (sc); 2102} 2103 2104#ifdef INVARIANTS 2105static int 2106syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, 2107 struct syncache *sc, struct tcphdr *th, struct tcpopt *to, 2108 struct socket *lso) 2109{ 2110 struct syncache scs, *scx; 2111 char *s; 2112 2113 bzero(&scs, sizeof(scs)); 2114 scx = syncookie_lookup(inc, sch, &scs, th, to, lso); 2115 2116 if ((s = tcp_log_addrs(inc, th, NULL, NULL)) == NULL) 2117 return (0); 2118 2119 if (scx != NULL) { 2120 if (sc->sc_peer_mss != scx->sc_peer_mss) 2121 log(LOG_DEBUG, "%s; %s: mss different %i vs %i\n", 2122 s, __func__, sc->sc_peer_mss, scx->sc_peer_mss); 2123 2124 if (sc->sc_requested_r_scale != scx->sc_requested_r_scale) 2125 log(LOG_DEBUG, "%s; %s: rwscale different %i vs %i\n", 2126 s, __func__, sc->sc_requested_r_scale, 2127 scx->sc_requested_r_scale); 2128 2129 if (sc->sc_requested_s_scale != scx->sc_requested_s_scale) 2130 log(LOG_DEBUG, "%s; %s: swscale different %i vs %i\n", 2131 s, __func__, sc->sc_requested_s_scale, 2132 scx->sc_requested_s_scale); 2133 2134 if ((sc->sc_flags & SCF_SACK) != (scx->sc_flags & SCF_SACK)) 2135 log(LOG_DEBUG, "%s; %s: SACK different\n", s, __func__); 2136 } 2137 2138 if (s != NULL) 2139 free(s, M_TCPLOG); 2140 return (0); 
2141} 2142#endif /* INVARIANTS */ 2143 2144static void 2145syncookie_reseed(void *arg) 2146{ 2147 struct tcp_syncache *sc = arg; 2148 uint8_t *secbits; 2149 int secbit; 2150 2151 /* 2152 * Reseeding the secret doesn't have to be protected by a lock. 2153 * It only must be ensured that the new random values are visible 2154 * to all CPUs in a SMP environment. The atomic with release 2155 * semantics ensures that. 2156 */ 2157 secbit = (sc->secret.oddeven & 0x1) ? 0 : 1; 2158 secbits = sc->secret.key[secbit]; 2159 arc4rand(secbits, SYNCOOKIE_SECRET_SIZE, 0); 2160 atomic_add_rel_int(&sc->secret.oddeven, 1); 2161 2162 /* Reschedule ourself. */ 2163 callout_schedule(&sc->secret.reseed, SYNCOOKIE_LIFETIME * hz); 2164} 2165 2166/* 2167 * Exports the syncache entries to userland so that netstat can display 2168 * them alongside the other sockets. This function is intended to be 2169 * called only from tcp_pcblist. 2170 * 2171 * Due to concurrency on an active system, the number of pcbs exported 2172 * may have no relation to max_pcbs. max_pcbs merely indicates the 2173 * amount of space the caller allocated for this function to use. 
2174 */ 2175int 2176syncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported) 2177{ 2178 struct xtcpcb xt; 2179 struct syncache *sc; 2180 struct syncache_head *sch; 2181 int count, error, i; 2182 2183 for (count = 0, error = 0, i = 0; i < V_tcp_syncache.hashsize; i++) { 2184 sch = &V_tcp_syncache.hashbase[i]; 2185 SCH_LOCK(sch); 2186 TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { 2187 if (count >= max_pcbs) { 2188 SCH_UNLOCK(sch); 2189 goto exit; 2190 } 2191 if (cr_cansee(req->td->td_ucred, sc->sc_cred) != 0) 2192 continue; 2193 bzero(&xt, sizeof(xt)); 2194 xt.xt_len = sizeof(xt); 2195 if (sc->sc_inc.inc_flags & INC_ISIPV6) 2196 xt.xt_inp.inp_vflag = INP_IPV6; 2197 else 2198 xt.xt_inp.inp_vflag = INP_IPV4; 2199 bcopy(&sc->sc_inc, &xt.xt_inp.inp_inc, sizeof (struct in_conninfo)); 2200 xt.xt_tp.t_inpcb = &xt.xt_inp; 2201 xt.xt_tp.t_state = TCPS_SYN_RECEIVED; 2202 xt.xt_socket.xso_protocol = IPPROTO_TCP; 2203 xt.xt_socket.xso_len = sizeof (struct xsocket); 2204 xt.xt_socket.so_type = SOCK_STREAM; 2205 xt.xt_socket.so_state = SS_ISCONNECTING; 2206 error = SYSCTL_OUT(req, &xt, sizeof xt); 2207 if (error) { 2208 SCH_UNLOCK(sch); 2209 goto exit; 2210 } 2211 count++; 2212 } 2213 SCH_UNLOCK(sch); 2214 } 2215exit: 2216 *pcbs_exported = count; 2217 return error; 2218} 2219