1/*- 2 * Copyright (c) 2007, Myricom Inc. 3 * Copyright (c) 2008, Intel Corporation. 4 * Copyright (c) 2012 The FreeBSD Foundation 5 * All rights reserved. 6 * 7 * Portions of this software were developed by Bjoern Zeeb 8 * under sponsorship from the FreeBSD Foundation. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32#include <sys/cdefs.h> 33__FBSDID("$FreeBSD$"); 34 35#include "opt_inet.h" 36#include "opt_inet6.h" 37 38#include <sys/param.h> 39#include <sys/systm.h> 40#include <sys/mbuf.h> 41#include <sys/kernel.h> 42#include <sys/socket.h> 43#include <sys/sysctl.h> 44 45#include <net/if.h> 46#include <net/if_var.h> 47#include <net/ethernet.h> 48#include <net/vnet.h> 49 50#include <netinet/in_systm.h> 51#include <netinet/in.h> 52#include <netinet/ip6.h> 53#include <netinet/ip.h> 54#include <netinet/ip_var.h> 55#include <netinet/tcp.h> 56#include <netinet/tcp_lro.h> 57#include <netinet/tcp_var.h> 58 59#include <netinet6/ip6_var.h> 60 61#include <machine/in_cksum.h> 62 63#ifndef LRO_ENTRIES 64#define LRO_ENTRIES 8 /* # of LRO entries per RX queue. */ 65#endif 66 67#define TCP_LRO_UPDATE_CSUM 1 68#ifndef TCP_LRO_UPDATE_CSUM 69#define TCP_LRO_INVALID_CSUM 0x0000 70#endif 71 72SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 73 "TCP LRO"); 74 75static unsigned tcp_lro_entries = LRO_ENTRIES; 76SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries, 77 CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0, 78 "default number of LRO entries"); 79 80int 81tcp_lro_init(struct lro_ctrl *lc) 82{ 83 struct lro_entry *le; 84 int error, i; 85 86 lc->lro_bad_csum = 0; 87 lc->lro_queued = 0; 88 lc->lro_flushed = 0; 89 lc->lro_cnt = 0; 90 SLIST_INIT(&lc->lro_free); 91 SLIST_INIT(&lc->lro_active); 92 93 error = 0; 94 for (i = 0; i < tcp_lro_entries; i++) { 95 le = (struct lro_entry *)malloc(sizeof(*le), M_DEVBUF, 96 M_NOWAIT | M_ZERO); 97 if (le == NULL) { 98 if (i == 0) 99 error = ENOMEM; 100 break; 101 } 102 lc->lro_cnt = i + 1; 103 SLIST_INSERT_HEAD(&lc->lro_free, le, next); 104 } 105 106 return (error); 107} 108 109void 110tcp_lro_free(struct lro_ctrl *lc) 111{ 112 struct lro_entry *le; 113 114 while (!SLIST_EMPTY(&lc->lro_free)) { 115 le = SLIST_FIRST(&lc->lro_free); 116 SLIST_REMOVE_HEAD(&lc->lro_free, next); 117 free(le, M_DEVBUF); 118 } 119} 120 121#ifdef TCP_LRO_UPDATE_CSUM 122static uint16_t 123tcp_lro_csum_th(struct tcphdr *th) 124{ 125 uint32_t ch; 126 uint16_t *p, l; 127 128 ch = th->th_sum = 0x0000; 129 l = th->th_off; 130 p = (uint16_t *)th; 131 while (l > 0) { 132 ch += *p; 133 p++; 134 ch += *p; 135 p++; 136 l--; 137 } 138 while (ch > 0xffff) 139 ch = (ch >> 16) + (ch & 0xffff); 140 141 return (ch & 0xffff); 142} 143 144static uint16_t 145tcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th, 146 uint16_t tcp_data_len, uint16_t csum) 147{ 148 uint32_t c; 149 uint16_t cs; 150 151 c = csum; 152 153 /* Remove length from checksum. */ 154 switch (le->eh_type) { 155#ifdef INET6 156 case ETHERTYPE_IPV6: 157 { 158 struct ip6_hdr *ip6; 159 160 ip6 = (struct ip6_hdr *)l3hdr; 161 if (le->append_cnt == 0) 162 cs = ip6->ip6_plen; 163 else { 164 uint32_t cx; 165 166 cx = ntohs(ip6->ip6_plen); 167 cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0); 168 } 169 break; 170 } 171#endif 172#ifdef INET 173 case ETHERTYPE_IP: 174 { 175 struct ip *ip4; 176 177 ip4 = (struct ip *)l3hdr; 178 if (le->append_cnt == 0) 179 cs = ip4->ip_len; 180 else { 181 cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4), 182 IPPROTO_TCP); 183 cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr, 184 htons(cs)); 185 } 186 break; 187 } 188#endif 189 default: 190 cs = 0; /* Keep compiler happy. */ 191 } 192 193 cs = ~cs; 194 c += cs; 195 196 /* Remove TCP header csum. */ 197 cs = ~tcp_lro_csum_th(th); 198 c += cs; 199 while (c > 0xffff) 200 c = (c >> 16) + (c & 0xffff); 201 202 return (c & 0xffff); 203} 204#endif 205 206void 207tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout) 208{ 209 struct lro_entry *le, *le_tmp; 210 struct timeval tv; 211 212 if (SLIST_EMPTY(&lc->lro_active)) 213 return; 214 215 getmicrotime(&tv); 216 timevalsub(&tv, timeout); 217 SLIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) { 218 if (timevalcmp(&tv, &le->mtime, >=)) { 219 SLIST_REMOVE(&lc->lro_active, le, lro_entry, next); 220 tcp_lro_flush(lc, le); 221 } 222 } 223} 224 225void 226tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le) 227{ 228 229 if (le->append_cnt > 0) { 230 struct tcphdr *th; 231 uint16_t p_len; 232 233 p_len = htons(le->p_len); 234 switch (le->eh_type) { 235#ifdef INET6 236 case ETHERTYPE_IPV6: 237 { 238 struct ip6_hdr *ip6; 239 240 ip6 = le->le_ip6; 241 ip6->ip6_plen = p_len; 242 th = (struct tcphdr *)(ip6 + 1); 243 le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | 244 CSUM_PSEUDO_HDR; 245 le->p_len += ETHER_HDR_LEN + sizeof(*ip6); 246 break; 247 } 248#endif 249#ifdef INET 250 case ETHERTYPE_IP: 251 { 252 struct ip *ip4; 253#ifdef TCP_LRO_UPDATE_CSUM 254 uint32_t cl; 255 uint16_t c; 256#endif 257 258 ip4 = le->le_ip4; 259#ifdef TCP_LRO_UPDATE_CSUM 260 /* Fix IP header checksum for new length. */ 261 c = ~ip4->ip_sum; 262 cl = c; 263 c = ~ip4->ip_len; 264 cl += c + p_len; 265 while (cl > 0xffff) 266 cl = (cl >> 16) + (cl & 0xffff); 267 c = cl; 268 ip4->ip_sum = ~c; 269#else 270 ip4->ip_sum = TCP_LRO_INVALID_CSUM; 271#endif 272 ip4->ip_len = p_len; 273 th = (struct tcphdr *)(ip4 + 1); 274 le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | 275 CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID; 276 le->p_len += ETHER_HDR_LEN; 277 break; 278 } 279#endif 280 default: 281 th = NULL; /* Keep compiler happy. */ 282 } 283 le->m_head->m_pkthdr.csum_data = 0xffff; 284 le->m_head->m_pkthdr.len = le->p_len; 285 286 /* Incorporate the latest ACK into the TCP header. */ 287 th->th_ack = le->ack_seq; 288 th->th_win = le->window; 289 /* Incorporate latest timestamp into the TCP header. */ 290 if (le->timestamp != 0) { 291 uint32_t *ts_ptr; 292 293 ts_ptr = (uint32_t *)(th + 1); 294 ts_ptr[1] = htonl(le->tsval); 295 ts_ptr[2] = le->tsecr; 296 } 297#ifdef TCP_LRO_UPDATE_CSUM 298 /* Update the TCP header checksum. */ 299 le->ulp_csum += p_len; 300 le->ulp_csum += tcp_lro_csum_th(th); 301 while (le->ulp_csum > 0xffff) 302 le->ulp_csum = (le->ulp_csum >> 16) + 303 (le->ulp_csum & 0xffff); 304 th->th_sum = (le->ulp_csum & 0xffff); 305 th->th_sum = ~th->th_sum; 306#else 307 th->th_sum = TCP_LRO_INVALID_CSUM; 308#endif 309 } 310 311 (*lc->ifp->if_input)(lc->ifp, le->m_head); 312 lc->lro_queued += le->append_cnt + 1; 313 lc->lro_flushed++; 314 bzero(le, sizeof(*le)); 315 SLIST_INSERT_HEAD(&lc->lro_free, le, next); 316} 317 318#ifdef INET6 319static int 320tcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6, 321 struct tcphdr **th) 322{ 323 324 /* XXX-BZ we should check the flow-label. */ 325 326 /* XXX-BZ We do not yet support ext. hdrs. */ 327 if (ip6->ip6_nxt != IPPROTO_TCP) 328 return (TCP_LRO_NOT_SUPPORTED); 329 330 /* Find the TCP header. */ 331 *th = (struct tcphdr *)(ip6 + 1); 332 333 return (0); 334} 335#endif 336 337#ifdef INET 338static int 339tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4, 340 struct tcphdr **th) 341{ 342 int csum_flags; 343 uint16_t csum; 344 345 if (ip4->ip_p != IPPROTO_TCP) 346 return (TCP_LRO_NOT_SUPPORTED); 347 348 /* Ensure there are no options. */ 349 if ((ip4->ip_hl << 2) != sizeof (*ip4)) 350 return (TCP_LRO_CANNOT); 351 352 /* .. and the packet is not fragmented. */ 353 if (ip4->ip_off & htons(IP_MF|IP_OFFMASK)) 354 return (TCP_LRO_CANNOT); 355 356 /* Legacy IP has a header checksum that needs to be correct. */ 357 csum_flags = m->m_pkthdr.csum_flags; 358 if (csum_flags & CSUM_IP_CHECKED) { 359 if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) { 360 lc->lro_bad_csum++; 361 return (TCP_LRO_CANNOT); 362 } 363 } else { 364 csum = in_cksum_hdr(ip4); 365 if (__predict_false((csum) != 0)) { 366 lc->lro_bad_csum++; 367 return (TCP_LRO_CANNOT); 368 } 369 } 370 371 /* Find the TCP header (we assured there are no IP options). */ 372 *th = (struct tcphdr *)(ip4 + 1); 373 374 return (0); 375} 376#endif 377 378int 379tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum) 380{ 381 struct lro_entry *le; 382 struct ether_header *eh; 383#ifdef INET6 384 struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ 385#endif 386#ifdef INET 387 struct ip *ip4 = NULL; /* Keep compiler happy. */ 388#endif 389 struct tcphdr *th; 390 void *l3hdr = NULL; /* Keep compiler happy. */ 391 uint32_t *ts_ptr; 392 tcp_seq seq; 393 int error, ip_len, l; 394 uint16_t eh_type, tcp_data_len; 395 int force_flush = 0; 396 397 /* We expect a contiguous header [eh, ip, tcp]. */ 398 399 eh = mtod(m, struct ether_header *); 400 eh_type = ntohs(eh->ether_type); 401 switch (eh_type) { 402#ifdef INET6 403 case ETHERTYPE_IPV6: 404 { 405 CURVNET_SET(lc->ifp->if_vnet); 406 if (V_ip6_forwarding != 0) { 407 /* XXX-BZ stats but changing lro_ctrl is a problem. */ 408 CURVNET_RESTORE(); 409 return (TCP_LRO_CANNOT); 410 } 411 CURVNET_RESTORE(); 412 l3hdr = ip6 = (struct ip6_hdr *)(eh + 1); 413 error = tcp_lro_rx_ipv6(lc, m, ip6, &th); 414 if (error != 0) 415 return (error); 416 tcp_data_len = ntohs(ip6->ip6_plen); 417 ip_len = sizeof(*ip6) + tcp_data_len; 418 break; 419 } 420#endif 421#ifdef INET 422 case ETHERTYPE_IP: 423 { 424 CURVNET_SET(lc->ifp->if_vnet); 425 if (V_ipforwarding != 0) { 426 /* XXX-BZ stats but changing lro_ctrl is a problem. */ 427 CURVNET_RESTORE(); 428 return (TCP_LRO_CANNOT); 429 } 430 CURVNET_RESTORE(); 431 l3hdr = ip4 = (struct ip *)(eh + 1); 432 error = tcp_lro_rx_ipv4(lc, m, ip4, &th); 433 if (error != 0) 434 return (error); 435 ip_len = ntohs(ip4->ip_len); 436 tcp_data_len = ip_len - sizeof(*ip4); 437 break; 438 } 439#endif 440 /* XXX-BZ what happens in case of VLAN(s)? */ 441 default: 442 return (TCP_LRO_NOT_SUPPORTED); 443 } 444 445 /* 446 * If the frame is padded beyond the end of the IP packet, then we must 447 * trim the extra bytes off. 448 */ 449 l = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len); 450 if (l != 0) { 451 if (l < 0) 452 /* Truncated packet. */ 453 return (TCP_LRO_CANNOT); 454 455 m_adj(m, -l); 456 } 457 458 /* 459 * Check TCP header constraints. 460 */ 461 /* Ensure no bits set besides ACK or PSH. */ 462 if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) { 463 if (th->th_flags & TH_SYN) 464 return (TCP_LRO_CANNOT); 465 /* 466 * Make sure that previously seen segements/ACKs are delivered 467 * before this segement, e.g. FIN. 468 */ 469 force_flush = 1; 470 } 471 472 /* XXX-BZ We lose a ACK|PUSH flag concatenating multiple segments. */ 473 /* XXX-BZ Ideally we'd flush on PUSH? */ 474 475 /* 476 * Check for timestamps. 477 * Since the only option we handle are timestamps, we only have to 478 * handle the simple case of aligned timestamps. 479 */ 480 l = (th->th_off << 2); 481 tcp_data_len -= l; 482 l -= sizeof(*th); 483 ts_ptr = (uint32_t *)(th + 1); 484 if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) || 485 (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| 486 TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) { 487 /* 488 * Make sure that previously seen segements/ACKs are delivered 489 * before this segement. 490 */ 491 force_flush = 1; 492 } 493 494 /* If the driver did not pass in the checksum, set it now. */ 495 if (csum == 0x0000) 496 csum = th->th_sum; 497 498 seq = ntohl(th->th_seq); 499 500 /* Try to find a matching previous segment. */ 501 SLIST_FOREACH(le, &lc->lro_active, next) { 502 if (le->eh_type != eh_type) 503 continue; 504 if (le->source_port != th->th_sport || 505 le->dest_port != th->th_dport) 506 continue; 507 switch (eh_type) { 508#ifdef INET6 509 case ETHERTYPE_IPV6: 510 if (bcmp(&le->source_ip6, &ip6->ip6_src, 511 sizeof(struct in6_addr)) != 0 || 512 bcmp(&le->dest_ip6, &ip6->ip6_dst, 513 sizeof(struct in6_addr)) != 0) 514 continue; 515 break; 516#endif 517#ifdef INET 518 case ETHERTYPE_IP: 519 if (le->source_ip4 != ip4->ip_src.s_addr || 520 le->dest_ip4 != ip4->ip_dst.s_addr) 521 continue; 522 break; 523#endif 524 } 525 526 if (force_flush) { 527 /* Timestamps mismatch; this is a FIN, etc */ 528 SLIST_REMOVE(&lc->lro_active, le, lro_entry, next); 529 tcp_lro_flush(lc, le); 530 return (TCP_LRO_CANNOT); 531 } 532 533 /* Flush now if appending will result in overflow. */ 534 if (le->p_len > (65535 - tcp_data_len)) { 535 SLIST_REMOVE(&lc->lro_active, le, lro_entry, next); 536 tcp_lro_flush(lc, le); 537 break; 538 } 539 540 /* Try to append the new segment. */ 541 if (__predict_false(seq != le->next_seq || 542 (tcp_data_len == 0 && le->ack_seq == th->th_ack))) { 543 /* Out of order packet or duplicate ACK. */ 544 SLIST_REMOVE(&lc->lro_active, le, lro_entry, next); 545 tcp_lro_flush(lc, le); 546 return (TCP_LRO_CANNOT); 547 } 548 549 if (l != 0) { 550 uint32_t tsval = ntohl(*(ts_ptr + 1)); 551 /* Make sure timestamp values are increasing. */ 552 /* XXX-BZ flip and use TSTMP_GEQ macro for this? */ 553 if (__predict_false(le->tsval > tsval || 554 *(ts_ptr + 2) == 0)) 555 return (TCP_LRO_CANNOT); 556 le->tsval = tsval; 557 le->tsecr = *(ts_ptr + 2); 558 } 559 560 le->next_seq += tcp_data_len; 561 le->ack_seq = th->th_ack; 562 le->window = th->th_win; 563 le->append_cnt++; 564 565#ifdef TCP_LRO_UPDATE_CSUM 566 le->ulp_csum += tcp_lro_rx_csum_fixup(le, l3hdr, th, 567 tcp_data_len, ~csum); 568#endif 569 570 if (tcp_data_len == 0) { 571 m_freem(m); 572 return (0); 573 } 574 575 le->p_len += tcp_data_len; 576 577 /* 578 * Adjust the mbuf so that m_data points to the first byte of 579 * the ULP payload. Adjust the mbuf to avoid complications and 580 * append new segment to existing mbuf chain. 581 */ 582 m_adj(m, m->m_pkthdr.len - tcp_data_len); 583 m->m_flags &= ~M_PKTHDR; 584 585 le->m_tail->m_next = m; 586 le->m_tail = m_last(m); 587 588 /* 589 * If a possible next full length packet would cause an 590 * overflow, pro-actively flush now. 591 */ 592 if (le->p_len > (65535 - lc->ifp->if_mtu)) { 593 SLIST_REMOVE(&lc->lro_active, le, lro_entry, next); 594 tcp_lro_flush(lc, le); 595 } else 596 getmicrotime(&le->mtime); 597 598 return (0); 599 } 600 601 if (force_flush) { 602 /* 603 * Nothing to flush, but this segment can not be further 604 * aggregated/delayed. 605 */ 606 return (TCP_LRO_CANNOT); 607 } 608 609 /* Try to find an empty slot. */ 610 if (SLIST_EMPTY(&lc->lro_free)) 611 return (TCP_LRO_NO_ENTRIES); 612 613 /* Start a new segment chain. */ 614 le = SLIST_FIRST(&lc->lro_free); 615 SLIST_REMOVE_HEAD(&lc->lro_free, next); 616 SLIST_INSERT_HEAD(&lc->lro_active, le, next); 617 getmicrotime(&le->mtime); 618 619 /* Start filling in details. */ 620 switch (eh_type) { 621#ifdef INET6 622 case ETHERTYPE_IPV6: 623 le->le_ip6 = ip6; 624 le->source_ip6 = ip6->ip6_src; 625 le->dest_ip6 = ip6->ip6_dst; 626 le->eh_type = eh_type; 627 le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6); 628 break; 629#endif 630#ifdef INET 631 case ETHERTYPE_IP: 632 le->le_ip4 = ip4; 633 le->source_ip4 = ip4->ip_src.s_addr; 634 le->dest_ip4 = ip4->ip_dst.s_addr; 635 le->eh_type = eh_type; 636 le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN; 637 break; 638#endif 639 } 640 le->source_port = th->th_sport; 641 le->dest_port = th->th_dport; 642 643 le->next_seq = seq + tcp_data_len; 644 le->ack_seq = th->th_ack; 645 le->window = th->th_win; 646 if (l != 0) { 647 le->timestamp = 1; 648 le->tsval = ntohl(*(ts_ptr + 1)); 649 le->tsecr = *(ts_ptr + 2); 650 } 651 652#ifdef TCP_LRO_UPDATE_CSUM 653 /* 654 * Do not touch the csum of the first packet. However save the 655 * "adjusted" checksum of just the source and destination addresses, 656 * the next header and the TCP payload. The length and TCP header 657 * parts may change, so we remove those from the saved checksum and 658 * re-add with final values on tcp_lro_flush() if needed. 659 */ 660 KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n", 661 __func__, le, le->ulp_csum)); 662 663 le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len, 664 ~csum); 665 th->th_sum = csum; /* Restore checksum on first packet. */ 666#endif 667 668 le->m_head = m; 669 le->m_tail = m_last(m); 670 671 return (0); 672} 673 674/* end */ 675