sfxge_rx.c revision 284555
/*-
 * Copyright (c) 2010-2015 Solarflare Communications Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/dev/sfxge/sfxge_rx.c 284555 2015-06-18 15:46:39Z arybchik $");

#include <sys/types.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"

#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)

#ifdef SFXGE_LRO

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
	    "Large receive offload (LRO) parameters");

#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

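/*
 * Each LRO parameter below is exposed both as a loader tunable and as a
 * read-only sysctl under hw.sfxge.lro.*, e.g. hw.sfxge.lro.table_size=256
 * in loader.conf (assuming the usual "hw.sfxge" prefix supplied by the
 * driver's SFXGE_PARAM() macro).
 */
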
/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
	    &lro_table_size, 0,
	    "Size of the LRO hash table (must be a power of 2)");

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
	    &lro_chain_max, 0,
	    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks;	/* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
	    &lro_idle_ticks, 0,
	    "The maximum time (in ticks) that a connection can be idle "
	    "before its LRO state is discarded");

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
	    &lro_slow_start_packets, 0,
	    "Number of packets with payload that must arrive in-order before "
	    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
	    &lro_loss_packets, 0,
	    "Number of packets with payload that must arrive in-order "
	    "following loss before a connection is eligible for LRO");

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN 0x4000
#define	SFXGE_LRO_L2_ID_IPV6 0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

#endif	/* SFXGE_LRO */

void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}

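/*
 * RSS hash key used for Toeplitz hashing.  It is programmed into the
 * controller by sfxge_rx_start() via efx_rx_scale_key_set().
 */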
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	magic = SFXGE_MAGIC_RX_QREFILL | index;

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}

static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
			     sfxge_rx_post_refill, rxq);
}

static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
{
	struct mb_args args;
	struct mbuf *m;

	/* Allocate mbuf structure */
	args.flags = M_PKTHDR;
	args.type = MT_DATA;
	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);

	/* Allocate (and attach) packet buffer */
	if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
		uma_zfree(zone_mbuf, m);
		m = NULL;
	}

	return (m);
}

#define	SFXGE_REFILL_BATCH	64

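/*
 * Post up to "target" new receive descriptors.  Buffers are allocated from
 * the zone selected in sfxge_rx_start(), DMA-mapped and handed to the common
 * code in batches of SFXGE_REFILL_BATCH; if allocation or posting falls
 * short, a retry is scheduled via the refill callout.
 */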
static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
		if (m == NULL)
			break;

		/* m_len specifies length of area to be mapped for DMA */
		m->m_len = mblksize;
		m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
		m->m_data += sc->rx_buffer_align;

		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
			BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);

	/* The queue could still be empty if no descriptors were actually
	 * pushed, in which case there will be no event to cause the next
	 * refill, so we must schedule a refill ourselves.
	 */
	if (rxq->pushed == rxq->completed) {
		sfxge_rx_schedule_refill(rxq, retrying);
	}
}

void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}

static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
	struct mbuf *m = rx_desc->mbuf;
	int flags = rx_desc->flags;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (flags & EFX_CKSUM_IPV4) ?
		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	/* The hash covers a 4-tuple for TCP only */
	if (flags & EFX_PKT_TCP) {
		m->m_pkthdr.flowid =
			efx_psuedo_hdr_hash_get(sc->enp,
						EFX_RX_HASHALG_TOEPLITZ,
						mtod(m, uint8_t *));
		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
	}
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}

#ifdef SFXGE_LRO

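/*
 * Deliver a coalesced LRO packet to the stack: restore the IP length field
 * to network order, recompute the IPv4 header checksum, copy the latest TCP
 * window/ACK (and any options) from the last merged segment, and pass the
 * chain to __sfxge_rx_deliver().
 */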
static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
			      CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

	m->m_pkthdr.flowid = c->conn_hash;
	M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);

	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq->sc, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
		("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}

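/*
 * Append an in-order segment to an existing LRO chain: link the new mbuf
 * onto the tail, grow the recorded IP payload length, and deliver the chain
 * early if one more full-sized segment could overflow the IP length field.
 */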
static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}

static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return (1);

 deliver_buf_out:
	sfxge_rx_deliver(rxq->sc, rx_buf);
	return (1);
}

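/*
 * Start tracking a new connection: reuse an entry from the free list if
 * possible (otherwise allocate one) and insert it at the head of its hash
 * bucket, subject to the lro_chain_max limit.
 */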
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has.  Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}

/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
					    EFX_RX_HASHALG_TOEPLITZ,
					    mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;

		KASSERT(iph->ip_p == IPPROTO_TCP,
		    ("IPv4 protocol is not TCP, but packet marker is set"));
		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;

		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
		    ("IPv6 next header is not TCP, but packet marker is set"));
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

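	/*
	 * Look the flow up in its hash bucket.  The comparisons below are
	 * written branch-free (subtract and OR), in the same style as
	 * ipv6_addr_cmp() above.
	 */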
	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
	sfxge_rx_deliver(sc, rx_buf);
}

static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf != NULL)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}

#else	/* !SFXGE_LRO */

static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}

static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */

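/*
 * Process completed receive descriptors: fix up checksum flags according to
 * the interface capabilities, drop loopback and discarded packets, and pass
 * each packet to LRO or directly to the stack.  Finally, top up the queue if
 * it has drained below the refill threshold.
 */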
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int if_capenable = sc->ifnet->if_capenable;
	int lro_enabled = if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		/* Read the length from the pseudo header if required */
		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
			uint16_t tmp_size;
			int rc;
			rc = efx_psuedo_hdr_pkt_length_get(sc->enp,
							   mtod(m, uint8_t *),
							   &tmp_size);
			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
		}

		prefetch_read_many(mtod(m, caddr_t));

		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		case EFX_PKT_IPV4:
			if (~if_capenable & IFCAP_RXCSUM)
				rx_desc->flags &=
				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
			break;
		case EFX_PKT_IPV6:
			if (~if_capenable & IFCAP_RXCSUM_IPV6)
				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
			break;
		case 0:
			/* Check for loopback packets */
			{
				struct ether_header *etherhp;

				/*LINTED*/
				etherhp = mtod(m, struct ether_header *);

				if (etherhp->ether_type ==
				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
					EFSYS_PROBE(loopback);

					rxq->loopback++;
					goto discard;
				}
			}
			break;
		default:
			KASSERT(B_FALSE,
			    ("Rx descriptor with both IPv4 and IPv6 flags"));
			goto discard;
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled &&
			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(sc, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled &&
		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(sc, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}

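/*
 * Stop a receive queue: flush it through the common code (retrying up to
 * three times, polling for the flush event), complete any outstanding
 * descriptors, then destroy the common-code queue and clear its buffer-table
 * entries.
 */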
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;
	unsigned int retry = 3;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK(evq);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
		rxq->flush_state = SFXGE_FLUSH_PENDING;

		SFXGE_EVQ_UNLOCK(evq);

		/* Flush the receive queue */
		if (efx_rx_qflush(rxq->common) != 0) {
			SFXGE_EVQ_LOCK(evq);
			rxq->flush_state = SFXGE_FLUSH_FAILED;
			break;
		}

		count = 0;
		do {
			/* Spin for 100 ms */
			DELAY(100000);

			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
				break;

		} while (++count < 20);

		SFXGE_EVQ_LOCK(evq);

		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
			/* Flush timeout - neither done nor failed */
			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
			    device_get_nameunit(sc->dev), index);
			rxq->flush_state = SFXGE_FLUSH_DONE;
		}
		retry--;
	}
	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
		    device_get_nameunit(sc->dev), index);
		rxq->flush_state = SFXGE_FLUSH_DONE;
	}

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pushed = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
			       EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
}

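/*
 * Start a receive queue: program its buffer-table entries, create and enable
 * the common-code queue, then perform the initial fill from the mbuf pool.
 */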
static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
		return (rc);

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)
		goto fail;

	SFXGE_EVQ_LOCK(evq);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;
	rxq->flush_state = SFXGE_FLUSH_REQUIRED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	SFXGE_EVQ_UNLOCK(evq);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));
	return (rc);
}

void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	int index;

	efx_mac_filter_default_rxq_clear(sc->enp);

	/* Stop the receive queue(s) */
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}

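/*
 * Bring up the receive side: initialise the common-code RX module, work out
 * the packet buffer size, alignment and mbuf zone, program the RSS
 * indirection table and hash key, start every queue and point the default
 * MAC filter at queue 0.
 */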
int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	const efx_nic_cfg_t *encp;
	size_t hdrlen, align, reserved;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	encp = efx_nic_cfg_get(sc->enp);
	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = encp->enc_rx_prefix_size;

	/* Ensure IP headers are 32bit aligned */
	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
	sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;

	sc->rx_buffer_size += sc->rx_buffer_align;

	/* Align end of packet buffer for RX DMA end padding */
	align = MAX(1, encp->enc_rx_buf_align_end);
	EFSYS_ASSERT(ISP2(align));
	sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);

	/*
	 * Standard mbuf zones only guarantee pointer-size alignment;
	 * we need extra space to align to the cache line
	 */
	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;

	/* Select zone for packet buffers */
	if (reserved <= MCLBYTES)
		sc->rx_buffer_zone = zone_clust;
	else if (reserved <= MJUMPAGESIZE)
		sc->rx_buffer_zone = zone_jumbop;
	else if (reserved <= MJUM9BYTES)
		sc->rx_buffer_zone = zone_jumbo9;
	else
		sc->rx_buffer_zone = zone_jumbo16;

	/*
	 * Set up the scale table.  Enable all hash types and hash insertion.
	 */
	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
		sc->rx_indir_table[index] = index % sc->rxq_count;
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
				       SFXGE_RX_SCALE_MAX)) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);

	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
				       sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
					    sc->intr.n_alloc > 1);
	if (rc != 0)
		goto fail3;

	return (0);

fail3:
fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}

#ifdef SFXGE_LRO

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
		("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
			   M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
			     M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

#else

static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}

static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}

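/*
 * Allocate and initialise the software state for one receive queue: the DMA
 * ring memory, buffer-table entries, descriptor context array, LRO state and
 * the refill callout.
 */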
static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
		return (rc);

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
				 &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
			    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, B_TRUE);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}

static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
#ifdef SFXGE_LRO
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};

static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->rxq_count; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
		SYSCTL_ADD_PROC(
			ctx, stat_list,
			OID_AUTO, sfxge_rx_stats[id].name,
			CTLTYPE_UINT|CTLFLAG_RD,
			sc, id, sfxge_rx_stat_handler, "IU",
			"");
	}
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	int index;

	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

#ifdef SFXGE_LRO
	if (!ISP2(lro_table_size)) {
		log(LOG_ERR, "%s=%u must be power of 2",
		    SFXGE_LRO_PARAM(table_size), lro_table_size);
		rc = EINVAL;
		goto fail_lro_table_size;
	}

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1;	/* 100 ms */
#endif

	intr = &sc->intr;

	sc->rxq_count = intr->n_alloc;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;

#ifdef SFXGE_LRO
fail_lro_table_size:
#endif
	return (rc);
}