sfxge_rx.c revision 330897
1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2010-2016 Solarflare Communications Inc.
5 * All rights reserved.
6 *
7 * This software was developed in part by Philip Paeps under contract for
8 * Solarflare Communications, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright notice,
14 *    this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright notice,
16 *    this list of conditions and the following disclaimer in the documentation
17 *    and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
31 * The views and conclusions contained in the software and documentation are
32 * those of the authors and should not be interpreted as representing official
33 * policies, either expressed or implied, of the FreeBSD Project.
34 */
35
36#include <sys/cdefs.h>
37__FBSDID("$FreeBSD: stable/11/sys/dev/sfxge/sfxge_rx.c 330897 2018-03-14 03:19:51Z eadler $");
38
39#include "opt_rss.h"
40
41#include <sys/param.h>
42#include <sys/malloc.h>
43#include <sys/mbuf.h>
44#include <sys/smp.h>
45#include <sys/socket.h>
46#include <sys/sysctl.h>
47#include <sys/syslog.h>
48#include <sys/limits.h>
50
51#include <net/ethernet.h>
52#include <net/if.h>
53#include <net/if_vlan_var.h>
54
55#include <netinet/in.h>
56#include <netinet/ip.h>
57#include <netinet/ip6.h>
58#include <netinet/tcp.h>
59
60#include <machine/in_cksum.h>
61
62#ifdef RSS
63#include <net/rss_config.h>
64#endif
65
66#include "common/efx.h"
67
68
69#include "sfxge.h"
70#include "sfxge_rx.h"
71
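/*
 * Refill threshold: top the queue back up once the number of outstanding
 * descriptors drops below 90% of the queue limit (see sfxge_rx_qcomplete()).
 */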
72#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
73
74#ifdef SFXGE_LRO
75
76SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
77	    "Large receive offload (LRO) parameters");
78
79#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)
80
81/* Size of the LRO hash table.  Must be a power of 2.  A larger table
82 * means we can accelerate a larger number of streams.
83 */
84static unsigned lro_table_size = 128;
85TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
86SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
87	    &lro_table_size, 0,
88	    "Size of the LRO hash table (must be a power of 2)");
89
90/* Maximum length of a hash chain.  If chains get too long then the lookup
91 * time increases and may exceed the benefit of LRO.
92 */
93static unsigned lro_chain_max = 20;
94TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
95SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
96	    &lro_chain_max, 0,
97	    "The maximum length of a hash chain");
98
99/* Maximum time (in ticks) that a connection can be idle before its LRO
100 * state is discarded.
101 */
102static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
103TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
104SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
105	    &lro_idle_ticks, 0,
106	    "The maximum time (in ticks) that a connection can be idle "
107	    "before its LRO state is discarded");
108
109/* Number of packets with payload that must arrive in-order before a
110 * connection is eligible for LRO.  The idea is we should avoid coalescing
111 * segments when the sender is in slow-start because reducing the ACK rate
112 * can damage performance.
113 */
114static int lro_slow_start_packets = 2000;
115TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
116SYSCTL_INT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
117	    &lro_slow_start_packets, 0,
118	    "Number of packets with payload that must arrive in-order before "
119	    "a connection is eligible for LRO");
120
121/* Number of packets with payload that must arrive in-order following loss
122 * before a connection is eligible for LRO.  The idea is we should avoid
123 * coalescing segments when the sender is recovering from loss, because
124 * reducing the ACK rate can damage performance.
125 */
126static int lro_loss_packets = 20;
127TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
128SYSCTL_INT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
129	    &lro_loss_packets, 0,
130	    "Number of packets with payload that must arrive in-order "
131	    "following loss before a connection is eligible for LRO");
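
/*
 * Illustrative sketch (not compiled with the driver): the read-only OIDs
 * declared above live under hw.sfxge.lro and can be inspected from a
 * userland program; the tunables themselves are set from loader.conf.
 * This is a minimal example, assuming the sfxge module is loaded and was
 * built with SFXGE_LRO.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdio.h>

int
main(void)
{
	unsigned int val;
	size_t len;

	len = sizeof(val);
	if (sysctlbyname("hw.sfxge.lro.table_size", &val, &len, NULL, 0) == 0)
		printf("LRO hash table size: %u\n", val);

	len = sizeof(val);
	if (sysctlbyname("hw.sfxge.lro.chain_max", &val, &len, NULL, 0) == 0)
		printf("LRO maximum chain length: %u\n", val);

	return (0);
}
#endif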
132
133/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
134#define	SFXGE_LRO_L2_ID_VLAN 0x4000
135#define	SFXGE_LRO_L2_ID_IPV6 0x8000
136#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
137#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
138
139/* Compare IPv6 addresses, avoiding conditional branches */
140static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
141				   const struct in6_addr *right)
142{
143#if LONG_BIT == 64
144	const uint64_t *left64 = (const uint64_t *)left;
145	const uint64_t *right64 = (const uint64_t *)right;
146	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
147#else
148	return (left->s6_addr32[0] - right->s6_addr32[0]) |
149	       (left->s6_addr32[1] - right->s6_addr32[1]) |
150	       (left->s6_addr32[2] - right->s6_addr32[2]) |
151	       (left->s6_addr32[3] - right->s6_addr32[3]);
152#endif
153}
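
/*
 * Minimal standalone check of the branch-free comparison above (an
 * illustrative sketch, not compiled with the driver; it assumes
 * ipv6_addr_cmp() is available in the same translation unit): the result
 * is zero if and only if the two addresses are identical, matching
 * memcmp() over the full 128 bits.
 */
#if 0
#include <sys/types.h>
#include <netinet/in.h>

#include <assert.h>
#include <string.h>

int
main(void)
{
	struct in6_addr a = IN6ADDR_LOOPBACK_INIT;
	struct in6_addr b = IN6ADDR_LOOPBACK_INIT;

	assert(ipv6_addr_cmp(&a, &b) == 0);

	b.s6_addr[15] ^= 1;	/* flip one bit of the second address */
	assert((ipv6_addr_cmp(&a, &b) != 0) ==
	       (memcmp(&a, &b, sizeof(a)) != 0));

	return (0);
}
#endif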
154
155#endif	/* SFXGE_LRO */
156
157void
158sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
159{
160
161	rxq->flush_state = SFXGE_FLUSH_DONE;
162}
163
164void
165sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
166{
167
168	rxq->flush_state = SFXGE_FLUSH_FAILED;
169}
170
171#ifdef RSS
172static uint8_t toep_key[RSS_KEYSIZE];
173#else
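/*
 * Default Toeplitz hash key used when the kernel RSS option is not
 * configured; this is the well-known example key from the Microsoft RSS
 * specification.
 */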
174static uint8_t toep_key[] = {
175	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
176	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
177	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
178	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
179	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
180};
181#endif
182
183static void
184sfxge_rx_post_refill(void *arg)
185{
186	struct sfxge_rxq *rxq = arg;
187	struct sfxge_softc *sc;
188	unsigned int index;
189	struct sfxge_evq *evq;
190	uint16_t magic;
191
192	sc = rxq->sc;
193	index = rxq->index;
194	evq = sc->evq[index];
195	magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);
196
197	/* This is guaranteed due to the start/stop order of rx and ev */
198	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
199	    ("evq not started"));
200	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
201	    ("rxq not started"));
202	efx_ev_qpost(evq->common, magic);
203}
204
205static void
206sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
207{
208	/* Initially retry after 100 ms, but back off in case of
209	 * repeated failures as we probably have to wait for the
210	 * administrator to raise the pool limit. */
211	if (retrying)
212		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
213	else
214		rxq->refill_delay = hz / 10;
215
216	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
217			     sfxge_rx_post_refill, rxq);
218}
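
/*
 * Sketch of the retry schedule this produces (illustrative only, not
 * compiled with the driver), assuming hz = 1000: the first retry fires
 * after hz / 10 = 100 ticks and each further failure doubles the delay,
 * capped at 10 * hz.
 */
#if 0
#include <stdio.h>

int
main(void)
{
	const int hz = 1000;	/* assumed tick rate for illustration */
	int delay = hz / 10;	/* initial delay: 100 ms */
	int attempt;

	for (attempt = 0; attempt < 8; attempt++) {
		printf("retry %d after %d ticks\n", attempt, delay);
		delay = (delay * 2 < 10 * hz) ? delay * 2 : 10 * hz;
	}
	return (0);
}
#endif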
219
220#define	SFXGE_REFILL_BATCH  64
221
222static void
223sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
224{
225	struct sfxge_softc *sc;
226	unsigned int index;
227	struct sfxge_evq *evq;
228	unsigned int batch;
229	unsigned int rxfill;
230	unsigned int mblksize;
231	int ntodo;
232	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
233
234	sc = rxq->sc;
235	index = rxq->index;
236	evq = sc->evq[index];
237
238	prefetch_read_many(sc->enp);
239	prefetch_read_many(rxq->common);
240
241	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
242
243	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
244		return;
245
246	rxfill = rxq->added - rxq->completed;
247	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
248	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
249	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
250	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
251	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
252
253	if (ntodo == 0)
254		return;
255
256	batch = 0;
257	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
258	while (ntodo-- > 0) {
259		unsigned int id;
260		struct sfxge_rx_sw_desc *rx_desc;
261		bus_dma_segment_t seg;
262		struct mbuf *m;
263
264		id = (rxq->added + batch) & rxq->ptr_mask;
265		rx_desc = &rxq->queue[id];
266		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
267
268		rx_desc->flags = EFX_DISCARD;
269		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
270		    sc->rx_cluster_size);
271		if (m == NULL)
272			break;
273
274		/* m_len specifies length of area to be mapped for DMA */
275		m->m_len  = mblksize;
276		m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
277		m->m_data += sc->rx_buffer_align;
278
279		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
280		addr[batch++] = seg.ds_addr;
281
282		if (batch == SFXGE_REFILL_BATCH) {
283			efx_rx_qpost(rxq->common, addr, mblksize, batch,
284			    rxq->completed, rxq->added);
285			rxq->added += batch;
286			batch = 0;
287		}
288	}
289
290	if (ntodo != 0)
291		sfxge_rx_schedule_refill(rxq, retrying);
292
293	if (batch != 0) {
294		efx_rx_qpost(rxq->common, addr, mblksize, batch,
295		    rxq->completed, rxq->added);
296		rxq->added += batch;
297	}
298
299	/* Make the descriptors visible to the hardware */
300	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
301			BUS_DMASYNC_PREWRITE);
302
303	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
304
305	/* The queue could still be empty if no descriptors were actually
306	 * pushed, in which case there will be no event to cause the next
307	 * refill, so we must schedule a refill ourselves.
308	 */
309	if (rxq->pushed == rxq->completed) {
310		sfxge_rx_schedule_refill(rxq, retrying);
311	}
312}
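
/*
 * Minimal model (not compiled with the driver) of the refill batching used
 * above: buffer addresses are staged in a fixed-size array and handed over
 * in chunks of SFXGE_REFILL_BATCH, with a final flush for any partial
 * batch.  post_batch() and alloc_buf() are hypothetical stand-ins for
 * efx_rx_qpost() and the mbuf allocation/mapping step.
 */
#if 0
#include <stddef.h>
#include <stdint.h>

#define	EXAMPLE_BATCH	64

static void
example_refill(unsigned int ntodo,
    uint64_t (*alloc_buf)(void),
    void (*post_batch)(const uint64_t *addrs, unsigned int count))
{
	uint64_t staged[EXAMPLE_BATCH];
	unsigned int batch = 0;

	while (ntodo-- > 0) {
		staged[batch++] = alloc_buf();
		if (batch == EXAMPLE_BATCH) {
			post_batch(staged, batch);
			batch = 0;
		}
	}
	if (batch != 0)
		post_batch(staged, batch);	/* flush the partial batch */
}
#endif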
313
314void
315sfxge_rx_qrefill(struct sfxge_rxq *rxq)
316{
317
318	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
319		return;
320
321	/* Make sure the queue is full */
322	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
323}
324
325static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
326{
327	struct ifnet *ifp = sc->ifnet;
328
329	m->m_pkthdr.rcvif = ifp;
330	m->m_pkthdr.csum_data = 0xffff;
331	ifp->if_input(ifp, m);
332}
333
334static void
335sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
336{
337	struct sfxge_softc *sc = rxq->sc;
338	struct mbuf *m = rx_desc->mbuf;
339	int flags = rx_desc->flags;
340	int csum_flags;
341
342	/* Convert checksum flags */
343	csum_flags = (flags & EFX_CKSUM_IPV4) ?
344		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
345	if (flags & EFX_CKSUM_TCPUDP)
346		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
347
348	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
349		m->m_pkthdr.flowid =
350			efx_pseudo_hdr_hash_get(rxq->common,
351						EFX_RX_HASHALG_TOEPLITZ,
352						mtod(m, uint8_t *));
353		/* The hash covers a 4-tuple for TCP only */
354		M_HASHTYPE_SET(m,
355		    (flags & EFX_PKT_IPV4) ?
356			((flags & EFX_PKT_TCP) ?
357			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
358			((flags & EFX_PKT_TCP) ?
359			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
360	}
361	m->m_data += sc->rx_prefix_size;
362	m->m_len = rx_desc->size - sc->rx_prefix_size;
363	m->m_pkthdr.len = m->m_len;
364	m->m_pkthdr.csum_flags = csum_flags;
365	__sfxge_rx_deliver(sc, rx_desc->mbuf);
366
367	rx_desc->flags = EFX_DISCARD;
368	rx_desc->mbuf = NULL;
369}
370
371#ifdef SFXGE_LRO
372
373static void
374sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
375{
376	struct sfxge_softc *sc = st->sc;
377	struct mbuf *m = c->mbuf;
378	struct tcphdr *c_th;
379	int csum_flags;
380
381	KASSERT(m, ("no mbuf to deliver"));
382
383	++st->n_bursts;
384
385	/* Finish off packet munging and recalculate IP header checksum. */
386	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
387		struct ip *iph = c->nh;
388		iph->ip_len = htons(iph->ip_len);
389		iph->ip_sum = 0;
390		iph->ip_sum = in_cksum_hdr(iph);
391		c_th = (struct tcphdr *)(iph + 1);
392		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
393			      CSUM_IP_CHECKED | CSUM_IP_VALID);
394	} else {
395		struct ip6_hdr *iph = c->nh;
396		iph->ip6_plen = htons(iph->ip6_plen);
397		c_th = (struct tcphdr *)(iph + 1);
398		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
399	}
400
401	c_th->th_win = c->th_last->th_win;
402	c_th->th_ack = c->th_last->th_ack;
403	if (c_th->th_off == c->th_last->th_off) {
404		/* Copy TCP options (take care to avoid going negative). */
405		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
406		memcpy(c_th + 1, c->th_last + 1, optlen);
407	}
408
409	m->m_pkthdr.flowid = c->conn_hash;
410	M_HASHTYPE_SET(m,
411	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
412		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
413
414	m->m_pkthdr.csum_flags = csum_flags;
415	__sfxge_rx_deliver(sc, m);
416
417	c->mbuf = NULL;
418	c->delivered = 1;
419}
420
421/* Drop the given connection, and add it to the free list. */
422static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
423{
424	unsigned bucket;
425
426	KASSERT(!c->mbuf, ("found orphaned mbuf"));
427
428	if (c->next_buf.mbuf != NULL) {
429		sfxge_rx_deliver(rxq, &c->next_buf);
430		LIST_REMOVE(c, active_link);
431	}
432
433	bucket = c->conn_hash & rxq->lro.conns_mask;
434	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
435	--rxq->lro.conns_n[bucket];
436	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
437	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
438}
439
440/* Stop tracking connections that have gone idle in order to keep hash
441 * chains short.
442 */
443static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
444{
445	struct sfxge_lro_conn *c;
446	unsigned i;
447
448	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
449		("found active connections"));
450
451	rxq->lro.last_purge_ticks = now;
452	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
453		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
454			continue;
455
456		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
457		if (now - c->last_pkt_ticks > lro_idle_ticks) {
458			++rxq->lro.n_drop_idle;
459			sfxge_lro_drop(rxq, c);
460		}
461	}
462}
463
464static void
465sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
466		struct mbuf *mbuf, struct tcphdr *th)
467{
468	struct tcphdr *c_th;
469
470	/* Tack the new mbuf onto the chain. */
471	KASSERT(!mbuf->m_next, ("mbuf already chained"));
472	c->mbuf_tail->m_next = mbuf;
473	c->mbuf_tail = mbuf;
474
475	/* Increase length appropriately */
476	c->mbuf->m_pkthdr.len += mbuf->m_len;
477
478	/* Update the connection state flags */
479	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
480		struct ip *iph = c->nh;
481		iph->ip_len += mbuf->m_len;
482		c_th = (struct tcphdr *)(iph + 1);
483	} else {
484		struct ip6_hdr *iph = c->nh;
485		iph->ip6_plen += mbuf->m_len;
486		c_th = (struct tcphdr *)(iph + 1);
487	}
488	c_th->th_flags |= (th->th_flags & TH_PUSH);
489	c->th_last = th;
490	++st->n_merges;
491
492	/* Pass packet up now if another segment could overflow the IP
493	 * length.
494	 */
495	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
496		sfxge_lro_deliver(st, c);
497}
498
499static void
500sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
501		struct mbuf *mbuf, void *nh, struct tcphdr *th)
502{
503	/* Start the chain */
504	c->mbuf = mbuf;
505	c->mbuf_tail = c->mbuf;
506	c->nh = nh;
507	c->th_last = th;
508
509	mbuf->m_pkthdr.len = mbuf->m_len;
510
511	/* Mangle header fields for later processing */
512	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
513		struct ip *iph = nh;
514		iph->ip_len = ntohs(iph->ip_len);
515	} else {
516		struct ip6_hdr *iph = nh;
517		iph->ip6_plen = ntohs(iph->ip6_plen);
518	}
519}
520
521/* Try to merge or otherwise hold or deliver (as appropriate) the
522 * packet buffered for this connection (c->next_buf).  Return a flag
523 * indicating whether the connection is still active for LRO purposes.
524 */
525static int
526sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
527{
528	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
529	char *eh = c->next_eh;
530	int data_length, hdr_length, dont_merge;
531	unsigned th_seq, pkt_length;
532	struct tcphdr *th;
533	unsigned now;
534
535	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
536		struct ip *iph = c->next_nh;
537		th = (struct tcphdr *)(iph + 1);
538		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
539	} else {
540		struct ip6_hdr *iph = c->next_nh;
541		th = (struct tcphdr *)(iph + 1);
542		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
543	}
544
545	hdr_length = (char *) th + th->th_off * 4 - eh;
546	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
547		       hdr_length);
548	th_seq = ntohl(th->th_seq);
549	dont_merge = ((data_length <= 0)
550		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
551
552	/* Check for options other than aligned timestamp. */
553	if (th->th_off != 5) {
554		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
555		if (th->th_off == 8 &&
556		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
557					(TCPOPT_NOP << 16) |
558					(TCPOPT_TIMESTAMP << 8) |
559					TCPOLEN_TIMESTAMP)) {
560			/* timestamp option -- okay */
561		} else {
562			dont_merge = 1;
563		}
564	}
565
566	if (__predict_false(th_seq != c->next_seq)) {
567		/* Out-of-order, so start counting again. */
568		if (c->mbuf != NULL)
569			sfxge_lro_deliver(&rxq->lro, c);
570		c->n_in_order_pkts -= lro_loss_packets;
571		c->next_seq = th_seq + data_length;
572		++rxq->lro.n_misorder;
573		goto deliver_buf_out;
574	}
575	c->next_seq = th_seq + data_length;
576
577	now = ticks;
578	if (now - c->last_pkt_ticks > lro_idle_ticks) {
579		++rxq->lro.n_drop_idle;
580		if (c->mbuf != NULL)
581			sfxge_lro_deliver(&rxq->lro, c);
582		sfxge_lro_drop(rxq, c);
583		return (0);
584	}
585	c->last_pkt_ticks = ticks;
586
587	if (c->n_in_order_pkts < lro_slow_start_packets) {
588		/* May be in slow-start, so don't merge. */
589		++rxq->lro.n_slow_start;
590		++c->n_in_order_pkts;
591		goto deliver_buf_out;
592	}
593
594	if (__predict_false(dont_merge)) {
595		if (c->mbuf != NULL)
596			sfxge_lro_deliver(&rxq->lro, c);
597		if (th->th_flags & (TH_FIN | TH_RST)) {
598			++rxq->lro.n_drop_closed;
599			sfxge_lro_drop(rxq, c);
600			return (0);
601		}
602		goto deliver_buf_out;
603	}
604
605	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
606
607	if (__predict_true(c->mbuf != NULL)) {
608		/* Remove headers and any padding */
609		rx_buf->mbuf->m_data += hdr_length;
610		rx_buf->mbuf->m_len = data_length;
611
612		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
613	} else {
614		/* Remove any padding */
615		rx_buf->mbuf->m_len = pkt_length;
616
617		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
618	}
619
620	rx_buf->mbuf = NULL;
621	return (1);
622
623 deliver_buf_out:
624	sfxge_rx_deliver(rxq, rx_buf);
625	return (1);
626}
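
/*
 * The option word tested in sfxge_lro_try_merge() above corresponds to the
 * common "NOP, NOP, timestamp" layout; in host order it is 0x0101080a
 * (TCPOPT_NOP = 1, TCPOPT_TIMESTAMP = 8, TCPOLEN_TIMESTAMP = 10).  A tiny
 * standalone check, not compiled with the driver:
 */
#if 0
#include <sys/types.h>
#include <netinet/tcp.h>

#include <assert.h>

int
main(void)
{
	assert(((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
	    (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP) == 0x0101080a);
	return (0);
}
#endif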
627
628static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
629			       uint16_t l2_id, void *nh, struct tcphdr *th)
630{
631	unsigned bucket = conn_hash & st->conns_mask;
632	struct sfxge_lro_conn *c;
633
634	if (st->conns_n[bucket] >= lro_chain_max) {
635		++st->n_too_many;
636		return;
637	}
638
639	if (!TAILQ_EMPTY(&st->free_conns)) {
640		c = TAILQ_FIRST(&st->free_conns);
641		TAILQ_REMOVE(&st->free_conns, c, link);
642	} else {
643		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
644		if (c == NULL)
645			return;
646		c->mbuf = NULL;
647		c->next_buf.mbuf = NULL;
648	}
649
650	/* Create the connection tracking data */
651	++st->conns_n[bucket];
652	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
653	c->l2_id = l2_id;
654	c->conn_hash = conn_hash;
655	c->source = th->th_sport;
656	c->dest = th->th_dport;
657	c->n_in_order_pkts = 0;
658	c->last_pkt_ticks = *(volatile int *)&ticks;
659	c->delivered = 0;
660	++st->n_new_stream;
661	/* NB. We don't initialise c->next_seq, and it doesn't matter what
662	 * value it has.  Most likely the next packet received for this
663	 * connection will not match -- no harm done.
664	 */
665}
666
667/* Process mbuf and decide whether to dispatch it to the stack now or
668 * later.
669 */
670static void
671sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
672{
673	struct sfxge_softc *sc = rxq->sc;
674	struct mbuf *m = rx_buf->mbuf;
675	struct ether_header *eh;
676	struct sfxge_lro_conn *c;
677	uint16_t l2_id;
678	uint16_t l3_proto;
679	void *nh;
680	struct tcphdr *th;
681	uint32_t conn_hash;
682	unsigned bucket;
683
684	/* Get the hardware hash */
685	conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
686					    EFX_RX_HASHALG_TOEPLITZ,
687					    mtod(m, uint8_t *));
688
689	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
690	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
691		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
692		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
693			SFXGE_LRO_L2_ID_VLAN;
694		l3_proto = veh->evl_proto;
695		nh = veh + 1;
696	} else {
697		l2_id = 0;
698		l3_proto = eh->ether_type;
699		nh = eh + 1;
700	}
701
702	/* Check whether this is a suitable packet (unfragmented
703	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
704	 * length, and compute a hash if necessary.  If not, return.
705	 */
706	if (l3_proto == htons(ETHERTYPE_IP)) {
707		struct ip *iph = nh;
708
709		KASSERT(iph->ip_p == IPPROTO_TCP,
710		    ("IPv4 protocol is not TCP, but packet marker is set"));
711		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
712		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
713			goto deliver_now;
714		th = (struct tcphdr *)(iph + 1);
715	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
716		struct ip6_hdr *iph = nh;
717
718		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
719		    ("IPv6 next header is not TCP, but packet marker is set"));
720		l2_id |= SFXGE_LRO_L2_ID_IPV6;
721		th = (struct tcphdr *)(iph + 1);
722	} else {
723		goto deliver_now;
724	}
725
726	bucket = conn_hash & rxq->lro.conns_mask;
727
728	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
729		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
730			continue;
731		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
732			continue;
733		if (c->mbuf != NULL) {
734			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
735				struct ip *c_iph, *iph = nh;
736				c_iph = c->nh;
737				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
738				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
739					continue;
740			} else {
741				struct ip6_hdr *c_iph, *iph = nh;
742				c_iph = c->nh;
743				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
744				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
745					continue;
746			}
747		}
748
749		/* Re-insert at head of list to reduce lookup time. */
750		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
751		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
752
753		if (c->next_buf.mbuf != NULL) {
754			if (!sfxge_lro_try_merge(rxq, c))
755				goto deliver_now;
756		} else {
757			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
758			    active_link);
759		}
760		c->next_buf = *rx_buf;
761		c->next_eh = eh;
762		c->next_nh = nh;
763
764		rx_buf->mbuf = NULL;
765		rx_buf->flags = EFX_DISCARD;
766		return;
767	}
768
769	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
770 deliver_now:
771	sfxge_rx_deliver(rxq, rx_buf);
772}
773
774static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
775{
776	struct sfxge_lro_state *st = &rxq->lro;
777	struct sfxge_lro_conn *c;
778	unsigned t;
779
780	while (!LIST_EMPTY(&st->active_conns)) {
781		c = LIST_FIRST(&st->active_conns);
782		if (!c->delivered && c->mbuf != NULL)
783			sfxge_lro_deliver(st, c);
784		if (sfxge_lro_try_merge(rxq, c)) {
785			if (c->mbuf != NULL)
786				sfxge_lro_deliver(st, c);
787			LIST_REMOVE(c, active_link);
788		}
789		c->delivered = 0;
790	}
791
792	t = *(volatile int *)&ticks;
793	if (__predict_false(t != st->last_purge_ticks))
794		sfxge_lro_purge_idle(rxq, t);
795}
796
797#else	/* !SFXGE_LRO */
798
799static void
800sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
801{
802}
803
804static void
805sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
806{
807}
808
809#endif	/* SFXGE_LRO */
810
811void
812sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
813{
814	struct sfxge_softc *sc = rxq->sc;
815	int if_capenable = sc->ifnet->if_capenable;
816	int lro_enabled = if_capenable & IFCAP_LRO;
817	unsigned int index;
818	struct sfxge_evq *evq;
819	unsigned int completed;
820	unsigned int level;
821	struct mbuf *m;
822	struct sfxge_rx_sw_desc *prev = NULL;
823
824	index = rxq->index;
825	evq = sc->evq[index];
826
827	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
828
829	completed = rxq->completed;
830	while (completed != rxq->pending) {
831		unsigned int id;
832		struct sfxge_rx_sw_desc *rx_desc;
833
834		id = completed++ & rxq->ptr_mask;
835		rx_desc = &rxq->queue[id];
836		m = rx_desc->mbuf;
837
838		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
839			goto discard;
840
841		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
842			goto discard;
843
844		/* Read the length from the pseudo header if required */
845		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
846			uint16_t tmp_size;
847			int rc;
848			rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
849							   mtod(m, uint8_t *),
850							   &tmp_size);
851			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
852			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
853		}
854
855		prefetch_read_many(mtod(m, caddr_t));
856
857		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
858		case EFX_PKT_IPV4:
859			if (~if_capenable & IFCAP_RXCSUM)
860				rx_desc->flags &=
861				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
862			break;
863		case EFX_PKT_IPV6:
864			if (~if_capenable & IFCAP_RXCSUM_IPV6)
865				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
866			break;
867		case 0:
868			/* Check for loopback packets */
869			{
870				struct ether_header *etherhp;
871
872				/*LINTED*/
873				etherhp = mtod(m, struct ether_header *);
874
875				if (etherhp->ether_type ==
876				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
877					EFSYS_PROBE(loopback);
878
879					rxq->loopback++;
880					goto discard;
881				}
882			}
883			break;
884		default:
885			KASSERT(B_FALSE,
886			    ("Rx descriptor with both IPv4 and IPv6 flags"));
887			goto discard;
888		}
889
890		/* Pass packet up the stack or into LRO (pipelined) */
891		if (prev != NULL) {
892			if (lro_enabled &&
893			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
894			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
895				sfxge_lro(rxq, prev);
896			else
897				sfxge_rx_deliver(rxq, prev);
898		}
899		prev = rx_desc;
900		continue;
901
902discard:
903		/* Return the packet to the pool */
904		m_free(m);
905		rx_desc->mbuf = NULL;
906	}
907	rxq->completed = completed;
908
909	level = rxq->added - rxq->completed;
910
911	/* Pass last packet up the stack or into LRO */
912	if (prev != NULL) {
913		if (lro_enabled &&
914		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
915		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
916			sfxge_lro(rxq, prev);
917		else
918			sfxge_rx_deliver(rxq, prev);
919	}
920
921	/*
922	 * If there are any pending flows and this is the end of the
923	 * poll then they must be completed.
924	 */
925	if (eop)
926		sfxge_lro_end_of_burst(rxq);
927
928	/* Top up the queue if necessary */
929	if (level < rxq->refill_threshold)
930		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
931}
932
933static void
934sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
935{
936	struct sfxge_rxq *rxq;
937	struct sfxge_evq *evq;
938	unsigned int count;
939	unsigned int retry = 3;
940
941	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
942
943	rxq = sc->rxq[index];
944	evq = sc->evq[index];
945
946	SFXGE_EVQ_LOCK(evq);
947
948	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
949	    ("rxq not started"));
950
951	rxq->init_state = SFXGE_RXQ_INITIALIZED;
952
953	callout_stop(&rxq->refill_callout);
954
955	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
956		rxq->flush_state = SFXGE_FLUSH_PENDING;
957
958		SFXGE_EVQ_UNLOCK(evq);
959
960		/* Flush the receive queue */
961		if (efx_rx_qflush(rxq->common) != 0) {
962			SFXGE_EVQ_LOCK(evq);
963			rxq->flush_state = SFXGE_FLUSH_FAILED;
964			break;
965		}
966
967		count = 0;
968		do {
969			/* Spin for 100 ms */
970			DELAY(100000);
971
972			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
973				break;
974
975		} while (++count < 20);
976
977		SFXGE_EVQ_LOCK(evq);
978
979		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
980			/* Flush timeout - neither done nor failed */
981			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
982			    device_get_nameunit(sc->dev), index);
983			rxq->flush_state = SFXGE_FLUSH_DONE;
984		}
985		retry--;
986	}
987	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
988		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
989		    device_get_nameunit(sc->dev), index);
990		rxq->flush_state = SFXGE_FLUSH_DONE;
991	}
992
993	rxq->pending = rxq->added;
994	sfxge_rx_qcomplete(rxq, B_TRUE);
995
996	KASSERT(rxq->completed == rxq->pending,
997	    ("rxq->completed != rxq->pending"));
998
999	rxq->added = 0;
1000	rxq->pushed = 0;
1001	rxq->pending = 0;
1002	rxq->completed = 0;
1003	rxq->loopback = 0;
1004
1005	/* Destroy the common code receive queue. */
1006	efx_rx_qdestroy(rxq->common);
1007
1008	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1009	    EFX_RXQ_NBUFS(sc->rxq_entries));
1010
1011	SFXGE_EVQ_UNLOCK(evq);
1012}
1013
1014static int
1015sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1016{
1017	struct sfxge_rxq *rxq;
1018	efsys_mem_t *esmp;
1019	struct sfxge_evq *evq;
1020	int rc;
1021
1022	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1023
1024	rxq = sc->rxq[index];
1025	esmp = &rxq->mem;
1026	evq = sc->evq[index];
1027
1028	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1029	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1030	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1031	    ("evq->init_state != SFXGE_EVQ_STARTED"));
1032
1033	/* Program the buffer table. */
1034	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1035	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1036		return (rc);
1037
1038	/* Create the common code receive queue. */
1039	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
1040	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
1041	    &rxq->common)) != 0)
1042		goto fail;
1043
1044	SFXGE_EVQ_LOCK(evq);
1045
1046	/* Enable the receive queue. */
1047	efx_rx_qenable(rxq->common);
1048
1049	rxq->init_state = SFXGE_RXQ_STARTED;
1050	rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1051
1052	/* Try to fill the queue from the pool. */
1053	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1054
1055	SFXGE_EVQ_UNLOCK(evq);
1056
1057	return (0);
1058
1059fail:
1060	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1061	    EFX_RXQ_NBUFS(sc->rxq_entries));
1062	return (rc);
1063}
1064
1065void
1066sfxge_rx_stop(struct sfxge_softc *sc)
1067{
1068	int index;
1069
1070	efx_mac_filter_default_rxq_clear(sc->enp);
1071
1072	/* Stop the receive queue(s) */
1073	index = sc->rxq_count;
1074	while (--index >= 0)
1075		sfxge_rx_qstop(sc, index);
1076
1077	sc->rx_prefix_size = 0;
1078	sc->rx_buffer_size = 0;
1079
1080	efx_rx_fini(sc->enp);
1081}
1082
1083int
1084sfxge_rx_start(struct sfxge_softc *sc)
1085{
1086	struct sfxge_intr *intr;
1087	const efx_nic_cfg_t *encp;
1088	size_t hdrlen, align, reserved;
1089	int index;
1090	int rc;
1091
1092	intr = &sc->intr;
1093
1094	/* Initialize the common code receive module. */
1095	if ((rc = efx_rx_init(sc->enp)) != 0)
1096		return (rc);
1097
1098	encp = efx_nic_cfg_get(sc->enp);
1099	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1100
1101	/* Calculate the receive packet buffer size. */
1102	sc->rx_prefix_size = encp->enc_rx_prefix_size;
1103
1104	/* Ensure IP headers are 32-bit aligned */
1105	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1106	sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
1107
1108	sc->rx_buffer_size += sc->rx_buffer_align;
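	/*
	 * Worked example with illustrative numbers: a 16-byte RX prefix plus
	 * the 14-byte Ethernet header gives hdrlen = 30, P2ROUNDUP(30, 4) is
	 * 32, so rx_buffer_align = 2 and the IP header that follows ends up
	 * on a 32-bit boundary.  The real prefix size comes from
	 * enc_rx_prefix_size and depends on the controller.
	 */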
1109
1110	/* Align end of packet buffer for RX DMA end padding */
1111	align = MAX(1, encp->enc_rx_buf_align_end);
1112	EFSYS_ASSERT(ISP2(align));
1113	sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);
1114
1115	/*
1116	 * Standard mbuf zones only guarantee pointer-size alignment;
1117	 * we need extra space to align to the cache line
1118	 */
1119	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1120
1121	/* Select zone for packet buffers */
1122	if (reserved <= MCLBYTES)
1123		sc->rx_cluster_size = MCLBYTES;
1124	else if (reserved <= MJUMPAGESIZE)
1125		sc->rx_cluster_size = MJUMPAGESIZE;
1126	else if (reserved <= MJUM9BYTES)
1127		sc->rx_cluster_size = MJUM9BYTES;
1128	else
1129		sc->rx_cluster_size = MJUM16BYTES;
1130
1131	/*
1132	 * Set up the scale table.  Enable all hash types and hash insertion.
1133	 */
1134	for (index = 0; index < nitems(sc->rx_indir_table); index++)
1135#ifdef RSS
1136		sc->rx_indir_table[index] =
1137			rss_get_indirection_to_bucket(index) % sc->rxq_count;
1138#else
1139		sc->rx_indir_table[index] = index % sc->rxq_count;
1140#endif
1141	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1142				       nitems(sc->rx_indir_table))) != 0)
1143		goto fail;
1144	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1145	    EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
1146	    EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);
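
	/*
	 * With the non-RSS fill above, entry i of rx_indir_table simply holds
	 * i % sc->rxq_count, so received flows are spread round-robin across
	 * the RX queues according to which indirection table entry their
	 * Toeplitz hash selects.
	 */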
1147
1148#ifdef RSS
1149	rss_getkey(toep_key);
1150#endif
1151	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
1152				       sizeof(toep_key))) != 0)
1153		goto fail;
1154
1155	/* Start the receive queue(s). */
1156	for (index = 0; index < sc->rxq_count; index++) {
1157		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1158			goto fail2;
1159	}
1160
1161	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1162					    sc->intr.n_alloc > 1);
1163	if (rc != 0)
1164		goto fail3;
1165
1166	return (0);
1167
1168fail3:
1169fail2:
1170	while (--index >= 0)
1171		sfxge_rx_qstop(sc, index);
1172
1173fail:
1174	efx_rx_fini(sc->enp);
1175
1176	return (rc);
1177}
1178
1179#ifdef SFXGE_LRO
1180
1181static void sfxge_lro_init(struct sfxge_rxq *rxq)
1182{
1183	struct sfxge_lro_state *st = &rxq->lro;
1184	unsigned i;
1185
1186	st->conns_mask = lro_table_size - 1;
1187	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1188		("lro_table_size must be a power of 2"));
1189	st->sc = rxq->sc;
1190	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1191			   M_SFXGE, M_WAITOK);
1192	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1193			     M_SFXGE, M_WAITOK);
1194	for (i = 0; i <= st->conns_mask; ++i) {
1195		TAILQ_INIT(&st->conns[i]);
1196		st->conns_n[i] = 0;
1197	}
1198	LIST_INIT(&st->active_conns);
1199	TAILQ_INIT(&st->free_conns);
1200}
1201
1202static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1203{
1204	struct sfxge_lro_state *st = &rxq->lro;
1205	struct sfxge_lro_conn *c;
1206	unsigned i;
1207
1208	/* Return cleanly if sfxge_lro_init() has not been called. */
1209	if (st->conns == NULL)
1210		return;
1211
1212	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1213
1214	for (i = 0; i <= st->conns_mask; ++i) {
1215		while (!TAILQ_EMPTY(&st->conns[i])) {
1216			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1217			sfxge_lro_drop(rxq, c);
1218		}
1219	}
1220
1221	while (!TAILQ_EMPTY(&st->free_conns)) {
1222		c = TAILQ_FIRST(&st->free_conns);
1223		TAILQ_REMOVE(&st->free_conns, c, link);
1224		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1225		free(c, M_SFXGE);
1226	}
1227
1228	free(st->conns_n, M_SFXGE);
1229	free(st->conns, M_SFXGE);
1230	st->conns = NULL;
1231}
1232
1233#else
1234
1235static void
1236sfxge_lro_init(struct sfxge_rxq *rxq)
1237{
1238}
1239
1240static void
1241sfxge_lro_fini(struct sfxge_rxq *rxq)
1242{
1243}
1244
1245#endif	/* SFXGE_LRO */
1246
1247static void
1248sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1249{
1250	struct sfxge_rxq *rxq;
1251
1252	rxq = sc->rxq[index];
1253
1254	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1255	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1256
1257	/* Free the context array and the flow table. */
1258	free(rxq->queue, M_SFXGE);
1259	sfxge_lro_fini(rxq);
1260
1261	/* Release DMA memory. */
1262	sfxge_dma_free(&rxq->mem);
1263
1264	sc->rxq[index] = NULL;
1265
1266	free(rxq, M_SFXGE);
1267}
1268
1269static int
1270sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1271{
1272	struct sfxge_rxq *rxq;
1273	struct sfxge_evq *evq;
1274	efsys_mem_t *esmp;
1275	int rc;
1276
1277	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1278
1279	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1280	rxq->sc = sc;
1281	rxq->index = index;
1282	rxq->entries = sc->rxq_entries;
1283	rxq->ptr_mask = rxq->entries - 1;
1284	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1285
1286	sc->rxq[index] = rxq;
1287	esmp = &rxq->mem;
1288
1289	evq = sc->evq[index];
1290
1291	/* Allocate and zero DMA space. */
1292	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1293		return (rc);
1294
1295	/* Allocate buffer table entries. */
1296	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1297				 &rxq->buf_base_id);
1298
1299	/* Allocate the context array and the flow table. */
1300	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1301	    M_SFXGE, M_WAITOK | M_ZERO);
1302	sfxge_lro_init(rxq);
1303
1304	callout_init(&rxq->refill_callout, 1);
1305
1306	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1307
1308	return (0);
1309}
1310
1311static const struct {
1312	const char *name;
1313	size_t offset;
1314} sfxge_rx_stats[] = {
1315#define	SFXGE_RX_STAT(name, member) \
1316	{ #name, offsetof(struct sfxge_rxq, member) }
1317#ifdef SFXGE_LRO
1318	SFXGE_RX_STAT(lro_merges, lro.n_merges),
1319	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1320	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1321	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1322	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1323	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1324	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1325	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1326#endif
1327};
1328
1329static int
1330sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1331{
1332	struct sfxge_softc *sc = arg1;
1333	unsigned int id = arg2;
1334	unsigned int sum, index;
1335
1336	/* Sum across all RX queues */
1337	sum = 0;
1338	for (index = 0; index < sc->rxq_count; index++)
1339		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1340					 sfxge_rx_stats[id].offset);
1341
1342	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1343}
1344
1345static void
1346sfxge_rx_stat_init(struct sfxge_softc *sc)
1347{
1348	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1349	struct sysctl_oid_list *stat_list;
1350	unsigned int id;
1351
1352	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1353
1354	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1355		SYSCTL_ADD_PROC(
1356			ctx, stat_list,
1357			OID_AUTO, sfxge_rx_stats[id].name,
1358			CTLTYPE_UINT|CTLFLAG_RD,
1359			sc, id, sfxge_rx_stat_handler, "IU",
1360			"");
1361	}
1362}
1363
1364void
1365sfxge_rx_fini(struct sfxge_softc *sc)
1366{
1367	int index;
1368
1369	index = sc->rxq_count;
1370	while (--index >= 0)
1371		sfxge_rx_qfini(sc, index);
1372
1373	sc->rxq_count = 0;
1374}
1375
1376int
1377sfxge_rx_init(struct sfxge_softc *sc)
1378{
1379	struct sfxge_intr *intr;
1380	int index;
1381	int rc;
1382
1383#ifdef SFXGE_LRO
1384	if (!ISP2(lro_table_size)) {
1385		log(LOG_ERR, "%s=%u must be power of 2",
1386		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1387		rc = EINVAL;
1388		goto fail_lro_table_size;
1389	}
1390
1391	if (lro_idle_ticks == 0)
1392		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1393#endif
1394
1395	intr = &sc->intr;
1396
1397	sc->rxq_count = intr->n_alloc;
1398
1399	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1400	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1401
1402	/* Initialize the receive queue(s) - one per interrupt. */
1403	for (index = 0; index < sc->rxq_count; index++) {
1404		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1405			goto fail;
1406	}
1407
1408	sfxge_rx_stat_init(sc);
1409
1410	return (0);
1411
1412fail:
1413	/* Tear down the receive queue(s). */
1414	while (--index >= 0)
1415		sfxge_rx_qfini(sc, index);
1416
1417	sc->rxq_count = 0;
1418
1419#ifdef SFXGE_LRO
1420fail_lro_table_size:
1421#endif
1422	return (rc);
1423}
1424