sfxge_rx.c revision 284555
1/*-
2 * Copyright (c) 2010-2015 Solarflare Communications Inc.
3 * All rights reserved.
4 *
5 * This software was developed in part by Philip Paeps under contract for
6 * Solarflare Communications, Inc.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright notice,
12 *    this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 *    this list of conditions and the following disclaimer in the documentation
15 *    and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
19 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
24 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
27 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 *
29 * The views and conclusions contained in the software and documentation are
30 * those of the authors and should not be interpreted as representing official
31 * policies, either expressed or implied, of the FreeBSD Project.
32 */
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD: stable/10/sys/dev/sfxge/sfxge_rx.c 284555 2015-06-18 15:46:39Z arybchik $");
36
37#include <sys/types.h>
38#include <sys/mbuf.h>
39#include <sys/smp.h>
40#include <sys/socket.h>
41#include <sys/sysctl.h>
42#include <sys/syslog.h>
43#include <sys/limits.h>
45
46#include <net/ethernet.h>
47#include <net/if.h>
48#include <net/if_vlan_var.h>
49
50#include <netinet/in.h>
51#include <netinet/ip.h>
52#include <netinet/ip6.h>
53#include <netinet/tcp.h>
54
55#include <machine/in_cksum.h>
56
57#include "common/efx.h"
58
59
60#include "sfxge.h"
61#include "sfxge_rx.h"
62
63#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
64
65#ifdef SFXGE_LRO
66
67SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
68	    "Large receive offload (LRO) parameters");
69
70#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)
71
72/* Size of the LRO hash table.  Must be a power of 2.  A larger table
73 * means we can accelerate a larger number of streams.
74 */
75static unsigned lro_table_size = 128;
76TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
77SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
78	    &lro_table_size, 0,
79	    "Size of the LRO hash table (must be a power of 2)");
80
81/* Maximum length of a hash chain.  If chains get too long then the lookup
82 * time increases and may exceed the benefit of LRO.
83 */
84static unsigned lro_chain_max = 20;
85TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
86SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
87	    &lro_chain_max, 0,
88	    "The maximum length of a hash chain");
89
90/* Maximum time (in ticks) that a connection can be idle before its LRO
91 * state is discarded.
92 */
93static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
94TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
95SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
96	    &lro_idle_ticks, 0,
97	    "The maximum time (in ticks) that a connection can be idle "
98	    "before its LRO state is discarded");
99
100/* Number of packets with payload that must arrive in-order before a
101 * connection is eligible for LRO.  The idea is that we should avoid coalescing
102 * segments when the sender is in slow-start because reducing the ACK rate
103 * can damage performance.
104 */
105static int lro_slow_start_packets = 2000;
106TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
107SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
108	    &lro_slow_start_packets, 0,
109	    "Number of packets with payload that must arrive in-order before "
110	    "a connection is eligible for LRO");
111
112/* Number of packets with payload that must arrive in-order following loss
113 * before a connection is eligible for LRO.  The idea is that we should avoid
114 * coalescing segments when the sender is recovering from loss, because
115 * reducing the ACK rate can damage performance.
116 */
117static int lro_loss_packets = 20;
118TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
119SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
120	    &lro_loss_packets, 0,
121	    "Number of packets with payload that must arrive in-order "
122	    "following loss before a connection is eligible for LRO");
123
124/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
125#define	SFXGE_LRO_L2_ID_VLAN 0x4000
126#define	SFXGE_LRO_L2_ID_IPV6 0x8000
127#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
128#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
129
130/* Compare IPv6 addresses, avoiding conditional branches */
131static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
132				   const struct in6_addr *right)
133{
134#if LONG_BIT == 64
135	const uint64_t *left64 = (const uint64_t *)left;
136	const uint64_t *right64 = (const uint64_t *)right;
137	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
138#else
139	return (left->s6_addr32[0] - right->s6_addr32[0]) |
140	       (left->s6_addr32[1] - right->s6_addr32[1]) |
141	       (left->s6_addr32[2] - right->s6_addr32[2]) |
142	       (left->s6_addr32[3] - right->s6_addr32[3]);
143#endif
144}
145
146#endif	/* SFXGE_LRO */
147
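/* Flush-state callbacks: record the outcome of an RX queue flush so that
 * sfxge_rx_qstop(), which polls rxq->flush_state, can stop waiting.
 */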
148void
149sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
150{
151
152	rxq->flush_state = SFXGE_FLUSH_DONE;
153}
154
155void
156sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
157{
158
159	rxq->flush_state = SFXGE_FLUSH_FAILED;
160}
161
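/* Toeplitz hash key used for receive-side scaling (RSS); programmed into the
 * controller via efx_rx_scale_key_set() in sfxge_rx_start().
 */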
162static uint8_t toep_key[] = {
163	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
164	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
165	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
166	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
167	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
168};
169
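/* Callout handler: post a software (magic) event to the RX queue's event
 * queue so that the queue refill is retried from event processing context.
 */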
170static void
171sfxge_rx_post_refill(void *arg)
172{
173	struct sfxge_rxq *rxq = arg;
174	struct sfxge_softc *sc;
175	unsigned int index;
176	struct sfxge_evq *evq;
177	uint16_t magic;
178
179	sc = rxq->sc;
180	index = rxq->index;
181	evq = sc->evq[index];
182
183	magic = SFXGE_MAGIC_RX_QREFILL | index;
184
185	/* This is guaranteed due to the start/stop order of rx and ev */
186	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
187	    ("evq not started"));
188	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
189	    ("rxq not started"));
190	efx_ev_qpost(evq->common, magic);
191}
192
193static void
194sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
195{
196	/* Initially retry after 100 ms, but back off in case of
197	 * repeated failures as we probably have to wait for the
198	 * administrator to raise the pool limit. */
199	if (retrying)
200		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
201	else
202		rxq->refill_delay = hz / 10;
203
204	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
205			     sfxge_rx_post_refill, rxq);
206}
207
208static struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
209{
210	struct mb_args args;
211	struct mbuf *m;
212
213	/* Allocate mbuf structure */
214	args.flags = M_PKTHDR;
215	args.type = MT_DATA;
216	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT);
217
218	/* Allocate (and attach) packet buffer */
219	if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) {
220		uma_zfree(zone_mbuf, m);
221		m = NULL;
222	}
223
224	return (m);
225}
226
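/* Number of receive descriptors posted to the hardware in a single
 * efx_rx_qpost() batch.
 */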
227#define	SFXGE_REFILL_BATCH  64
228
229static void
230sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
231{
232	struct sfxge_softc *sc;
233	unsigned int index;
234	struct sfxge_evq *evq;
235	unsigned int batch;
236	unsigned int rxfill;
237	unsigned int mblksize;
238	int ntodo;
239	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
240
241	sc = rxq->sc;
242	index = rxq->index;
243	evq = sc->evq[index];
244
245	prefetch_read_many(sc->enp);
246	prefetch_read_many(rxq->common);
247
248	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
249
250	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
251		return;
252
253	rxfill = rxq->added - rxq->completed;
254	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
255	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
256	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
257	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
258	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
259
260	if (ntodo == 0)
261		return;
262
263	batch = 0;
264	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
265	while (ntodo-- > 0) {
266		unsigned int id;
267		struct sfxge_rx_sw_desc *rx_desc;
268		bus_dma_segment_t seg;
269		struct mbuf *m;
270
271		id = (rxq->added + batch) & rxq->ptr_mask;
272		rx_desc = &rxq->queue[id];
273		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
274
275		rx_desc->flags = EFX_DISCARD;
276		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
277		if (m == NULL)
278			break;
279
280		/* m_len specifies length of area to be mapped for DMA */
281		m->m_len  = mblksize;
282		m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
283		m->m_data += sc->rx_buffer_align;
284
285		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
286		addr[batch++] = seg.ds_addr;
287
288		if (batch == SFXGE_REFILL_BATCH) {
289			efx_rx_qpost(rxq->common, addr, mblksize, batch,
290			    rxq->completed, rxq->added);
291			rxq->added += batch;
292			batch = 0;
293		}
294	}
295
296	if (ntodo != 0)
297		sfxge_rx_schedule_refill(rxq, retrying);
298
299	if (batch != 0) {
300		efx_rx_qpost(rxq->common, addr, mblksize, batch,
301		    rxq->completed, rxq->added);
302		rxq->added += batch;
303	}
304
305	/* Make the descriptors visible to the hardware */
306	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
307			BUS_DMASYNC_PREWRITE);
308
309	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
310
311	/* The queue could still be empty if no descriptors were actually
312	 * pushed, in which case there will be no event to cause the next
313	 * refill, so we must schedule a refill ourselves.
314	 */
315	if (rxq->pushed == rxq->completed) {
316		sfxge_rx_schedule_refill(rxq, retrying);
317	}
318}
319
320void
321sfxge_rx_qrefill(struct sfxge_rxq *rxq)
322{
323
324	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
325		return;
326
327	/* Make sure the queue is full */
328	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
329}
330
331static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
332{
333	struct ifnet *ifp = sc->ifnet;
334
335	m->m_pkthdr.rcvif = ifp;
336	m->m_pkthdr.csum_data = 0xffff;
337	ifp->if_input(ifp, m);
338}
339
340static void
341sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
342{
343	struct mbuf *m = rx_desc->mbuf;
344	int flags = rx_desc->flags;
345	int csum_flags;
346
347	/* Convert checksum flags */
348	csum_flags = (flags & EFX_CKSUM_IPV4) ?
349		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
350	if (flags & EFX_CKSUM_TCPUDP)
351		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
352
353	/* The hash covers a 4-tuple for TCP only */
354	if (flags & EFX_PKT_TCP) {
355		m->m_pkthdr.flowid =
356			efx_psuedo_hdr_hash_get(sc->enp,
357						EFX_RX_HASHALG_TOEPLITZ,
358						mtod(m, uint8_t *));
359		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
360	}
361	m->m_data += sc->rx_prefix_size;
362	m->m_len = rx_desc->size - sc->rx_prefix_size;
363	m->m_pkthdr.len = m->m_len;
364	m->m_pkthdr.csum_flags = csum_flags;
365	__sfxge_rx_deliver(sc, rx_desc->mbuf);
366
367	rx_desc->flags = EFX_DISCARD;
368	rx_desc->mbuf = NULL;
369}
370
371#ifdef SFXGE_LRO
372
373static void
374sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
375{
376	struct sfxge_softc *sc = st->sc;
377	struct mbuf *m = c->mbuf;
378	struct tcphdr *c_th;
379	int csum_flags;
380
381	KASSERT(m, ("no mbuf to deliver"));
382
383	++st->n_bursts;
384
385	/* Finish off packet munging and recalculate IP header checksum. */
386	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
387		struct ip *iph = c->nh;
388		iph->ip_len = htons(iph->ip_len);
389		iph->ip_sum = 0;
390		iph->ip_sum = in_cksum_hdr(iph);
391		c_th = (struct tcphdr *)(iph + 1);
392		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
393			      CSUM_IP_CHECKED | CSUM_IP_VALID);
394	} else {
395		struct ip6_hdr *iph = c->nh;
396		iph->ip6_plen = htons(iph->ip6_plen);
397		c_th = (struct tcphdr *)(iph + 1);
398		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
399	}
400
401	c_th->th_win = c->th_last->th_win;
402	c_th->th_ack = c->th_last->th_ack;
403	if (c_th->th_off == c->th_last->th_off) {
404		/* Copy TCP options (take care to avoid going negative). */
405		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
406		memcpy(c_th + 1, c->th_last + 1, optlen);
407	}
408
409	m->m_pkthdr.flowid = c->conn_hash;
410	M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
411
412	m->m_pkthdr.csum_flags = csum_flags;
413	__sfxge_rx_deliver(sc, m);
414
415	c->mbuf = NULL;
416	c->delivered = 1;
417}
418
419/* Drop the given connection, and add it to the free list. */
420static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
421{
422	unsigned bucket;
423
424	KASSERT(!c->mbuf, ("found orphaned mbuf"));
425
426	if (c->next_buf.mbuf != NULL) {
427		sfxge_rx_deliver(rxq->sc, &c->next_buf);
428		LIST_REMOVE(c, active_link);
429	}
430
431	bucket = c->conn_hash & rxq->lro.conns_mask;
432	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
433	--rxq->lro.conns_n[bucket];
434	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
435	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
436}
437
438/* Stop tracking connections that have gone idle in order to keep hash
439 * chains short.
440 */
441static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
442{
443	struct sfxge_lro_conn *c;
444	unsigned i;
445
446	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
447		("found active connections"));
448
449	rxq->lro.last_purge_ticks = now;
450	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
451		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
452			continue;
453
454		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
455		if (now - c->last_pkt_ticks > lro_idle_ticks) {
456			++rxq->lro.n_drop_idle;
457			sfxge_lro_drop(rxq, c);
458		}
459	}
460}
461
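/* Append the buffered in-order segment to the coalesced packet, updating the
 * IP length and TCP state, and deliver early if another segment could
 * overflow the IP length field.
 */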
462static void
463sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
464		struct mbuf *mbuf, struct tcphdr *th)
465{
466	struct tcphdr *c_th;
467
468	/* Tack the new mbuf onto the chain. */
469	KASSERT(!mbuf->m_next, ("mbuf already chained"));
470	c->mbuf_tail->m_next = mbuf;
471	c->mbuf_tail = mbuf;
472
473	/* Increase length appropriately */
474	c->mbuf->m_pkthdr.len += mbuf->m_len;
475
476	/* Update the connection state flags */
477	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
478		struct ip *iph = c->nh;
479		iph->ip_len += mbuf->m_len;
480		c_th = (struct tcphdr *)(iph + 1);
481	} else {
482		struct ip6_hdr *iph = c->nh;
483		iph->ip6_plen += mbuf->m_len;
484		c_th = (struct tcphdr *)(iph + 1);
485	}
486	c_th->th_flags |= (th->th_flags & TH_PUSH);
487	c->th_last = th;
488	++st->n_merges;
489
490	/* Pass packet up now if another segment could overflow the IP
491	 * length.
492	 */
493	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
494		sfxge_lro_deliver(st, c);
495}
496
497static void
498sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
499		struct mbuf *mbuf, void *nh, struct tcphdr *th)
500{
501	/* Start the chain */
502	c->mbuf = mbuf;
503	c->mbuf_tail = c->mbuf;
504	c->nh = nh;
505	c->th_last = th;
506
507	mbuf->m_pkthdr.len = mbuf->m_len;
508
509	/* Mangle header fields for later processing */
510	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
511		struct ip *iph = nh;
512		iph->ip_len = ntohs(iph->ip_len);
513	} else {
514		struct ip6_hdr *iph = nh;
515		iph->ip6_plen = ntohs(iph->ip6_plen);
516	}
517}
518
519/* Try to merge or otherwise hold or deliver (as appropriate) the
520 * packet buffered for this connection (c->next_buf).  Return a flag
521 * indicating whether the connection is still active for LRO purposes.
522 */
523static int
524sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
525{
526	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
527	char *eh = c->next_eh;
528	int data_length, hdr_length, dont_merge;
529	unsigned th_seq, pkt_length;
530	struct tcphdr *th;
531	unsigned now;
532
533	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
534		struct ip *iph = c->next_nh;
535		th = (struct tcphdr *)(iph + 1);
536		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
537	} else {
538		struct ip6_hdr *iph = c->next_nh;
539		th = (struct tcphdr *)(iph + 1);
540		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
541	}
542
543	hdr_length = (char *) th + th->th_off * 4 - eh;
544	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
545		       hdr_length);
546	th_seq = ntohl(th->th_seq);
547	dont_merge = ((data_length <= 0)
548		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
549
550	/* Check for options other than aligned timestamp. */
551	if (th->th_off != 5) {
552		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
553		if (th->th_off == 8 &&
554		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
555					(TCPOPT_NOP << 16) |
556					(TCPOPT_TIMESTAMP << 8) |
557					TCPOLEN_TIMESTAMP)) {
558			/* timestamp option -- okay */
559		} else {
560			dont_merge = 1;
561		}
562	}
563
564	if (__predict_false(th_seq != c->next_seq)) {
565		/* Out-of-order, so start counting again. */
566		if (c->mbuf != NULL)
567			sfxge_lro_deliver(&rxq->lro, c);
568		c->n_in_order_pkts -= lro_loss_packets;
569		c->next_seq = th_seq + data_length;
570		++rxq->lro.n_misorder;
571		goto deliver_buf_out;
572	}
573	c->next_seq = th_seq + data_length;
574
575	now = ticks;
576	if (now - c->last_pkt_ticks > lro_idle_ticks) {
577		++rxq->lro.n_drop_idle;
578		if (c->mbuf != NULL)
579			sfxge_lro_deliver(&rxq->lro, c);
580		sfxge_lro_drop(rxq, c);
581		return (0);
582	}
583	c->last_pkt_ticks = ticks;
584
585	if (c->n_in_order_pkts < lro_slow_start_packets) {
586		/* May be in slow-start, so don't merge. */
587		++rxq->lro.n_slow_start;
588		++c->n_in_order_pkts;
589		goto deliver_buf_out;
590	}
591
592	if (__predict_false(dont_merge)) {
593		if (c->mbuf != NULL)
594			sfxge_lro_deliver(&rxq->lro, c);
595		if (th->th_flags & (TH_FIN | TH_RST)) {
596			++rxq->lro.n_drop_closed;
597			sfxge_lro_drop(rxq, c);
598			return (0);
599		}
600		goto deliver_buf_out;
601	}
602
603	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
604
605	if (__predict_true(c->mbuf != NULL)) {
606		/* Remove headers and any padding */
607		rx_buf->mbuf->m_data += hdr_length;
608		rx_buf->mbuf->m_len = data_length;
609
610		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
611	} else {
612		/* Remove any padding */
613		rx_buf->mbuf->m_len = pkt_length;
614
615		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
616	}
617
618	rx_buf->mbuf = NULL;
619	return (1);
620
621 deliver_buf_out:
622	sfxge_rx_deliver(rxq->sc, rx_buf);
623	return (1);
624}
625
626static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
627			       uint16_t l2_id, void *nh, struct tcphdr *th)
628{
629	unsigned bucket = conn_hash & st->conns_mask;
630	struct sfxge_lro_conn *c;
631
632	if (st->conns_n[bucket] >= lro_chain_max) {
633		++st->n_too_many;
634		return;
635	}
636
637	if (!TAILQ_EMPTY(&st->free_conns)) {
638		c = TAILQ_FIRST(&st->free_conns);
639		TAILQ_REMOVE(&st->free_conns, c, link);
640	} else {
641		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
642		if (c == NULL)
643			return;
644		c->mbuf = NULL;
645		c->next_buf.mbuf = NULL;
646	}
647
648	/* Create the connection tracking data */
649	++st->conns_n[bucket];
650	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
651	c->l2_id = l2_id;
652	c->conn_hash = conn_hash;
653	c->source = th->th_sport;
654	c->dest = th->th_dport;
655	c->n_in_order_pkts = 0;
656	c->last_pkt_ticks = *(volatile int *)&ticks;
657	c->delivered = 0;
658	++st->n_new_stream;
659	/* NB. We don't initialise c->next_seq, and it doesn't matter what
660	 * value it has.  Most likely the next packet received for this
661	 * connection will not match -- no harm done.
662	 */
663}
664
665/* Process mbuf and decide whether to dispatch it to the stack now or
666 * later.
667 */
668static void
669sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
670{
671	struct sfxge_softc *sc = rxq->sc;
672	struct mbuf *m = rx_buf->mbuf;
673	struct ether_header *eh;
674	struct sfxge_lro_conn *c;
675	uint16_t l2_id;
676	uint16_t l3_proto;
677	void *nh;
678	struct tcphdr *th;
679	uint32_t conn_hash;
680	unsigned bucket;
681
682	/* Get the hardware hash */
683	conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
684					    EFX_RX_HASHALG_TOEPLITZ,
685					    mtod(m, uint8_t *));
686
687	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
688	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
689		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
690		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
691			SFXGE_LRO_L2_ID_VLAN;
692		l3_proto = veh->evl_proto;
693		nh = veh + 1;
694	} else {
695		l2_id = 0;
696		l3_proto = eh->ether_type;
697		nh = eh + 1;
698	}
699
700	/* Check whether this is a suitable packet (unfragmented
701	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
702	 * length, and compute a hash if necessary.  If not, return.
703	 */
704	if (l3_proto == htons(ETHERTYPE_IP)) {
705		struct ip *iph = nh;
706
707		KASSERT(iph->ip_p == IPPROTO_TCP,
708		    ("IPv4 protocol is not TCP, but packet marker is set"));
709		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
710		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
711			goto deliver_now;
712		th = (struct tcphdr *)(iph + 1);
713	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
714		struct ip6_hdr *iph = nh;
715
716		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
717		    ("IPv6 next header is not TCP, but packet marker is set"));
718		l2_id |= SFXGE_LRO_L2_ID_IPV6;
719		th = (struct tcphdr *)(iph + 1);
720	} else {
721		goto deliver_now;
722	}
723
724	bucket = conn_hash & rxq->lro.conns_mask;
725
726	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
727		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
728			continue;
729		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
730			continue;
731		if (c->mbuf != NULL) {
732			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
733				struct ip *c_iph, *iph = nh;
734				c_iph = c->nh;
735				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
736				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
737					continue;
738			} else {
739				struct ip6_hdr *c_iph, *iph = nh;
740				c_iph = c->nh;
741				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
742				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
743					continue;
744			}
745		}
746
747		/* Re-insert at head of list to reduce lookup time. */
748		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
749		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
750
751		if (c->next_buf.mbuf != NULL) {
752			if (!sfxge_lro_try_merge(rxq, c))
753				goto deliver_now;
754		} else {
755			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
756			    active_link);
757		}
758		c->next_buf = *rx_buf;
759		c->next_eh = eh;
760		c->next_nh = nh;
761
762		rx_buf->mbuf = NULL;
763		rx_buf->flags = EFX_DISCARD;
764		return;
765	}
766
767	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
768 deliver_now:
769	sfxge_rx_deliver(sc, rx_buf);
770}
771
772static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
773{
774	struct sfxge_lro_state *st = &rxq->lro;
775	struct sfxge_lro_conn *c;
776	unsigned t;
777
778	while (!LIST_EMPTY(&st->active_conns)) {
779		c = LIST_FIRST(&st->active_conns);
780		if (!c->delivered && c->mbuf != NULL)
781			sfxge_lro_deliver(st, c);
782		if (sfxge_lro_try_merge(rxq, c)) {
783			if (c->mbuf != NULL)
784				sfxge_lro_deliver(st, c);
785			LIST_REMOVE(c, active_link);
786		}
787		c->delivered = 0;
788	}
789
790	t = *(volatile int *)&ticks;
791	if (__predict_false(t != st->last_purge_ticks))
792		sfxge_lro_purge_idle(rxq, t);
793}
794
795#else	/* !SFXGE_LRO */
796
797static void
798sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
799{
800}
801
802static void
803sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
804{
805}
806
807#endif	/* SFXGE_LRO */
808
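/*
 * Process RX completions: for each descriptor between rxq->completed and
 * rxq->pending, validate the packet and hand it to LRO (if enabled and the
 * packet is TCP with a good checksum) or deliver it directly to the stack,
 * then top up the queue if it has dropped below the refill threshold.
 */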
809void
810sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
811{
812	struct sfxge_softc *sc = rxq->sc;
813	int if_capenable = sc->ifnet->if_capenable;
814	int lro_enabled = if_capenable & IFCAP_LRO;
815	unsigned int index;
816	struct sfxge_evq *evq;
817	unsigned int completed;
818	unsigned int level;
819	struct mbuf *m;
820	struct sfxge_rx_sw_desc *prev = NULL;
821
822	index = rxq->index;
823	evq = sc->evq[index];
824
825	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
826
827	completed = rxq->completed;
828	while (completed != rxq->pending) {
829		unsigned int id;
830		struct sfxge_rx_sw_desc *rx_desc;
831
832		id = completed++ & rxq->ptr_mask;
833		rx_desc = &rxq->queue[id];
834		m = rx_desc->mbuf;
835
836		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
837			goto discard;
838
839		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
840			goto discard;
841
842		/* Read the length from the pseudo header if required */
843		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
844			uint16_t tmp_size;
845			int rc;
846			rc = efx_psuedo_hdr_pkt_length_get(sc->enp,
847							   mtod(m, uint8_t *),
848							   &tmp_size);
849			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
850			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
851		}
852
853		prefetch_read_many(mtod(m, caddr_t));
854
855		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
856		case EFX_PKT_IPV4:
857			if (~if_capenable & IFCAP_RXCSUM)
858				rx_desc->flags &=
859				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
860			break;
861		case EFX_PKT_IPV6:
862			if (~if_capenable & IFCAP_RXCSUM_IPV6)
863				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
864			break;
865		case 0:
866			/* Check for loopback packets */
867			{
868				struct ether_header *etherhp;
869
870				/*LINTED*/
871				etherhp = mtod(m, struct ether_header *);
872
873				if (etherhp->ether_type ==
874				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
875					EFSYS_PROBE(loopback);
876
877					rxq->loopback++;
878					goto discard;
879				}
880			}
881			break;
882		default:
883			KASSERT(B_FALSE,
884			    ("Rx descriptor with both IPv4 and IPv6 flags"));
885			goto discard;
886		}
887
888		/* Pass packet up the stack or into LRO (pipelined) */
889		if (prev != NULL) {
890			if (lro_enabled &&
891			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
892			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
893				sfxge_lro(rxq, prev);
894			else
895				sfxge_rx_deliver(sc, prev);
896		}
897		prev = rx_desc;
898		continue;
899
900discard:
901		/* Return the packet to the pool */
902		m_free(m);
903		rx_desc->mbuf = NULL;
904	}
905	rxq->completed = completed;
906
907	level = rxq->added - rxq->completed;
908
909	/* Pass last packet up the stack or into LRO */
910	if (prev != NULL) {
911		if (lro_enabled &&
912		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
913		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
914			sfxge_lro(rxq, prev);
915		else
916			sfxge_rx_deliver(sc, prev);
917	}
918
919	/*
920	 * If there are any pending flows and this is the end of the
921	 * poll then they must be completed.
922	 */
923	if (eop)
924		sfxge_lro_end_of_burst(rxq);
925
926	/* Top up the queue if necessary */
927	if (level < rxq->refill_threshold)
928		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
929}
930
931static void
932sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
933{
934	struct sfxge_rxq *rxq;
935	struct sfxge_evq *evq;
936	unsigned int count;
937	unsigned int retry = 3;
938
939	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
940
941	rxq = sc->rxq[index];
942	evq = sc->evq[index];
943
944	SFXGE_EVQ_LOCK(evq);
945
946	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
947	    ("rxq not started"));
948
949	rxq->init_state = SFXGE_RXQ_INITIALIZED;
950
951	callout_stop(&rxq->refill_callout);
952
953	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
954		rxq->flush_state = SFXGE_FLUSH_PENDING;
955
956		SFXGE_EVQ_UNLOCK(evq);
957
958		/* Flush the receive queue */
959		if (efx_rx_qflush(rxq->common) != 0) {
960			SFXGE_EVQ_LOCK(evq);
961			rxq->flush_state = SFXGE_FLUSH_FAILED;
962			break;
963		}
964
965		count = 0;
966		do {
967			/* Spin for 100 ms */
968			DELAY(100000);
969
970			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
971				break;
972
973		} while (++count < 20);
974
975		SFXGE_EVQ_LOCK(evq);
976
977		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
978			/* Flush timeout - neither done nor failed */
979			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
980			    device_get_nameunit(sc->dev), index);
981			rxq->flush_state = SFXGE_FLUSH_DONE;
982		}
983		retry--;
984	}
985	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
986		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
987		    device_get_nameunit(sc->dev), index);
988		rxq->flush_state = SFXGE_FLUSH_DONE;
989	}
990
991	rxq->pending = rxq->added;
992	sfxge_rx_qcomplete(rxq, B_TRUE);
993
994	KASSERT(rxq->completed == rxq->pending,
995	    ("rxq->completed != rxq->pending"));
996
997	rxq->added = 0;
998	rxq->pushed = 0;
999	rxq->pending = 0;
1000	rxq->completed = 0;
1001	rxq->loopback = 0;
1002
1003	/* Destroy the common code receive queue. */
1004	efx_rx_qdestroy(rxq->common);
1005
1006	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1007	    EFX_RXQ_NBUFS(sc->rxq_entries));
1008
1009	SFXGE_EVQ_UNLOCK(evq);
1010}
1011
1012static int
1013sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
1014{
1015	struct sfxge_rxq *rxq;
1016	efsys_mem_t *esmp;
1017	struct sfxge_evq *evq;
1018	int rc;
1019
1020	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
1021
1022	rxq = sc->rxq[index];
1023	esmp = &rxq->mem;
1024	evq = sc->evq[index];
1025
1026	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1027	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1028	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1029	    ("evq->init_state != SFXGE_EVQ_STARTED"));
1030
1031	/* Program the buffer table. */
1032	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
1033	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
1034		return (rc);
1035
1036	/* Create the common code receive queue. */
1037	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
1038	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
1039	    &rxq->common)) != 0)
1040		goto fail;
1041
1042	SFXGE_EVQ_LOCK(evq);
1043
1044	/* Enable the receive queue. */
1045	efx_rx_qenable(rxq->common);
1046
1047	rxq->init_state = SFXGE_RXQ_STARTED;
1048	rxq->flush_state = SFXGE_FLUSH_REQUIRED;
1049
1050	/* Try to fill the queue from the pool. */
1051	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
1052
1053	SFXGE_EVQ_UNLOCK(evq);
1054
1055	return (0);
1056
1057fail:
1058	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
1059	    EFX_RXQ_NBUFS(sc->rxq_entries));
1060	return (rc);
1061}
1062
1063void
1064sfxge_rx_stop(struct sfxge_softc *sc)
1065{
1066	int index;
1067
1068	efx_mac_filter_default_rxq_clear(sc->enp);
1069
1070	/* Stop the receive queue(s) */
1071	index = sc->rxq_count;
1072	while (--index >= 0)
1073		sfxge_rx_qstop(sc, index);
1074
1075	sc->rx_prefix_size = 0;
1076	sc->rx_buffer_size = 0;
1077
1078	efx_rx_fini(sc->enp);
1079}
1080
1081int
1082sfxge_rx_start(struct sfxge_softc *sc)
1083{
1084	struct sfxge_intr *intr;
1085	const efx_nic_cfg_t *encp;
1086	size_t hdrlen, align, reserved;
1087	int index;
1088	int rc;
1089
1090	intr = &sc->intr;
1091
1092	/* Initialize the common code receive module. */
1093	if ((rc = efx_rx_init(sc->enp)) != 0)
1094		return (rc);
1095
1096	encp = efx_nic_cfg_get(sc->enp);
1097	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
1098
1099	/* Calculate the receive packet buffer size. */
1100	sc->rx_prefix_size = encp->enc_rx_prefix_size;
1101
1102	/* Ensure IP headers are 32-bit aligned */
1103	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
1104	sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
1105
1106	sc->rx_buffer_size += sc->rx_buffer_align;
1107
1108	/* Align end of packet buffer for RX DMA end padding */
1109	align = MAX(1, encp->enc_rx_buf_align_end);
1110	EFSYS_ASSERT(ISP2(align));
1111	sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);
1112
1113	/*
1114	 * Standard mbuf zones only guarantee pointer-size alignment;
1115	 * we need extra space to align to the cache line
1116	 */
1117	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
1118
1119	/* Select zone for packet buffers */
1120	if (reserved <= MCLBYTES)
1121		sc->rx_buffer_zone = zone_clust;
1122	else if (reserved <= MJUMPAGESIZE)
1123		sc->rx_buffer_zone = zone_jumbop;
1124	else if (reserved <= MJUM9BYTES)
1125		sc->rx_buffer_zone = zone_jumbo9;
1126	else
1127		sc->rx_buffer_zone = zone_jumbo16;
1128
1129	/*
1130	 * Set up the scale table.  Enable all hash types and hash insertion.
1131	 */
1132	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
1133		sc->rx_indir_table[index] = index % sc->rxq_count;
1134	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
1135				       SFXGE_RX_SCALE_MAX)) != 0)
1136		goto fail;
1137	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
1138	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
1139	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
1140
1141	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
1142				       sizeof(toep_key))) != 0)
1143		goto fail;
1144
1145	/* Start the receive queue(s). */
1146	for (index = 0; index < sc->rxq_count; index++) {
1147		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
1148			goto fail2;
1149	}
1150
1151	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
1152					    sc->intr.n_alloc > 1);
1153	if (rc != 0)
1154		goto fail3;
1155
1156	return (0);
1157
1158fail3:
1159fail2:
1160	while (--index >= 0)
1161		sfxge_rx_qstop(sc, index);
1162
1163fail:
1164	efx_rx_fini(sc->enp);
1165
1166	return (rc);
1167}
1168
1169#ifdef SFXGE_LRO
1170
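/* Allocate the per-RX-queue LRO hash table (lro_table_size buckets) and
 * initialise the connection lists.
 */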
1171static void sfxge_lro_init(struct sfxge_rxq *rxq)
1172{
1173	struct sfxge_lro_state *st = &rxq->lro;
1174	unsigned i;
1175
1176	st->conns_mask = lro_table_size - 1;
1177	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
1178		("lro_table_size must be a power of 2"));
1179	st->sc = rxq->sc;
1180	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
1181			   M_SFXGE, M_WAITOK);
1182	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
1183			     M_SFXGE, M_WAITOK);
1184	for (i = 0; i <= st->conns_mask; ++i) {
1185		TAILQ_INIT(&st->conns[i]);
1186		st->conns_n[i] = 0;
1187	}
1188	LIST_INIT(&st->active_conns);
1189	TAILQ_INIT(&st->free_conns);
1190}
1191
1192static void sfxge_lro_fini(struct sfxge_rxq *rxq)
1193{
1194	struct sfxge_lro_state *st = &rxq->lro;
1195	struct sfxge_lro_conn *c;
1196	unsigned i;
1197
1198	/* Return cleanly if sfxge_lro_init() has not been called. */
1199	if (st->conns == NULL)
1200		return;
1201
1202	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
1203
1204	for (i = 0; i <= st->conns_mask; ++i) {
1205		while (!TAILQ_EMPTY(&st->conns[i])) {
1206			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
1207			sfxge_lro_drop(rxq, c);
1208		}
1209	}
1210
1211	while (!TAILQ_EMPTY(&st->free_conns)) {
1212		c = TAILQ_FIRST(&st->free_conns);
1213		TAILQ_REMOVE(&st->free_conns, c, link);
1214		KASSERT(!c->mbuf, ("found orphaned mbuf"));
1215		free(c, M_SFXGE);
1216	}
1217
1218	free(st->conns_n, M_SFXGE);
1219	free(st->conns, M_SFXGE);
1220	st->conns = NULL;
1221}
1222
1223#else
1224
1225static void
1226sfxge_lro_init(struct sfxge_rxq *rxq)
1227{
1228}
1229
1230static void
1231sfxge_lro_fini(struct sfxge_rxq *rxq)
1232{
1233}
1234
1235#endif	/* SFXGE_LRO */
1236
1237static void
1238sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
1239{
1240	struct sfxge_rxq *rxq;
1241
1242	rxq = sc->rxq[index];
1243
1244	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
1245	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
1246
1247	/* Free the context array and the flow table. */
1248	free(rxq->queue, M_SFXGE);
1249	sfxge_lro_fini(rxq);
1250
1251	/* Release DMA memory. */
1252	sfxge_dma_free(&rxq->mem);
1253
1254	sc->rxq[index] = NULL;
1255
1256	free(rxq, M_SFXGE);
1257}
1258
1259static int
1260sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
1261{
1262	struct sfxge_rxq *rxq;
1263	struct sfxge_evq *evq;
1264	efsys_mem_t *esmp;
1265	int rc;
1266
1267	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
1268
1269	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
1270	rxq->sc = sc;
1271	rxq->index = index;
1272	rxq->entries = sc->rxq_entries;
1273	rxq->ptr_mask = rxq->entries - 1;
1274	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
1275
1276	sc->rxq[index] = rxq;
1277	esmp = &rxq->mem;
1278
1279	evq = sc->evq[index];
1280
1281	/* Allocate and zero DMA space. */
1282	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
1283		return (rc);
1284
1285	/* Allocate buffer table entries. */
1286	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
1287				 &rxq->buf_base_id);
1288
1289	/* Allocate the context array and the flow table. */
1290	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
1291	    M_SFXGE, M_WAITOK | M_ZERO);
1292	sfxge_lro_init(rxq);
1293
1294	callout_init(&rxq->refill_callout, B_TRUE);
1295
1296	rxq->init_state = SFXGE_RXQ_INITIALIZED;
1297
1298	return (0);
1299}
1300
1301static const struct {
1302	const char *name;
1303	size_t offset;
1304} sfxge_rx_stats[] = {
1305#define	SFXGE_RX_STAT(name, member) \
1306	{ #name, offsetof(struct sfxge_rxq, member) }
1307#ifdef SFXGE_LRO
1308	SFXGE_RX_STAT(lro_merges, lro.n_merges),
1309	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
1310	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
1311	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
1312	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
1313	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
1314	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
1315	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
1316#endif
1317};
1318
1319static int
1320sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1321{
1322	struct sfxge_softc *sc = arg1;
1323	unsigned int id = arg2;
1324	unsigned int sum, index;
1325
1326	/* Sum across all RX queues */
1327	sum = 0;
1328	for (index = 0; index < sc->rxq_count; index++)
1329		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1330					 sfxge_rx_stats[id].offset);
1331
1332	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1333}
1334
1335static void
1336sfxge_rx_stat_init(struct sfxge_softc *sc)
1337{
1338	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1339	struct sysctl_oid_list *stat_list;
1340	unsigned int id;
1341
1342	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1343
1344	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
1345		SYSCTL_ADD_PROC(
1346			ctx, stat_list,
1347			OID_AUTO, sfxge_rx_stats[id].name,
1348			CTLTYPE_UINT|CTLFLAG_RD,
1349			sc, id, sfxge_rx_stat_handler, "IU",
1350			"");
1351	}
1352}
1353
1354void
1355sfxge_rx_fini(struct sfxge_softc *sc)
1356{
1357	int index;
1358
1359	index = sc->rxq_count;
1360	while (--index >= 0)
1361		sfxge_rx_qfini(sc, index);
1362
1363	sc->rxq_count = 0;
1364}
1365
1366int
1367sfxge_rx_init(struct sfxge_softc *sc)
1368{
1369	struct sfxge_intr *intr;
1370	int index;
1371	int rc;
1372
1373#ifdef SFXGE_LRO
1374	if (!ISP2(lro_table_size)) {
1375		log(LOG_ERR, "%s=%u must be a power of 2\n",
1376		    SFXGE_LRO_PARAM(table_size), lro_table_size);
1377		rc = EINVAL;
1378		goto fail_lro_table_size;
1379	}
1380
1381	if (lro_idle_ticks == 0)
1382		lro_idle_ticks = hz / 10 + 1; /* 100 ms */
1383#endif
1384
1385	intr = &sc->intr;
1386
1387	sc->rxq_count = intr->n_alloc;
1388
1389	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1390	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1391
1392	/* Initialize the receive queue(s) - one per interrupt. */
1393	for (index = 0; index < sc->rxq_count; index++) {
1394		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
1395			goto fail;
1396	}
1397
1398	sfxge_rx_stat_init(sc);
1399
1400	return (0);
1401
1402fail:
1403	/* Tear down the receive queue(s). */
1404	while (--index >= 0)
1405		sfxge_rx_qfini(sc, index);
1406
1407	sc->rxq_count = 0;
1408
1409#ifdef SFXGE_LRO
1410fail_lro_table_size:
1411#endif
1412	return (rc);
1413}
1414