sfxge_tx.c revision 281955
1/*-
2 * Copyright (c) 2010-2011 Solarflare Communications, Inc.
3 * All rights reserved.
4 *
5 * This software was developed in part by Philip Paeps under contract for
6 * Solarflare Communications, Inc.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30/* Theory of operation:
31 *
32 * Tx queues allocation and mapping
33 *
34 * One Tx queue with checksum offload enabled is allocated per Rx channel
35 * (event queue).  In addition, 2 Tx queues (one without checksum offload
36 * and one with IP checksum offload only) are allocated and bound to event
37 * queue 0.  sfxge_txq_type is used as the Tx queue label.
38 *
39 * So, the mapping from (event queue, label) to Tx queue index is:
40 *	if event queue index is 0, TxQ-index = TxQ-label (in [0..SFXGE_TXQ_NTYPES))
41 *	else TxQ-index = SFXGE_TXQ_NTYPES + EvQ-index - 1
42 * See sfxge_get_txq_by_label() in sfxge_ev.c
43 */
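
/*
 * Illustrative sketch only (not part of the driver), assuming the three
 * queue types defined in sfxge_tx.h (i.e. SFXGE_TXQ_NTYPES == 3):
 *
 *	static inline unsigned int
 *	sfxge_txq_index(unsigned int evq_index, unsigned int label)
 *	{
 *		return ((evq_index == 0) ?
 *		    label : SFXGE_TXQ_NTYPES + evq_index - 1);
 *	}
 *
 * That is, event queue 0 owns Tx queues 0..2 (one per type) and every
 * other event queue N owns the single checksum-offload Tx queue N + 2.
 */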
44
45#include <sys/cdefs.h>
46__FBSDID("$FreeBSD: stable/10/sys/dev/sfxge/sfxge_tx.c 281955 2015-04-24 23:26:44Z hiren $");
47
48#include <sys/types.h>
49#include <sys/mbuf.h>
50#include <sys/smp.h>
51#include <sys/socket.h>
52#include <sys/sysctl.h>
53#include <sys/syslog.h>
54
55#include <net/bpf.h>
56#include <net/ethernet.h>
57#include <net/if.h>
58#include <net/if_vlan_var.h>
59
60#include <netinet/in.h>
61#include <netinet/ip.h>
62#include <netinet/ip6.h>
63#include <netinet/tcp.h>
64
65#include "common/efx.h"
66
67#include "sfxge.h"
68#include "sfxge_tx.h"
69
70/*
71 * Estimate the maximum number of Tx descriptors required for a TSO packet.
72 * With the minimum MSS and maximum mbuf length we might need more (even
73 * more than a ring-full of descriptors), but this should not happen in
74 * practice except due to deliberate attack.  In that case we will
75 * truncate the output at a packet boundary.
76 */
77#define	SFXGE_TSO_MAX_DESC						\
78	(SFXGE_TSO_MAX_SEGS * 2 + SFXGE_TX_MAPPING_MAX_SEG - 1)
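
/*
 * The bound above appears to allow one header descriptor plus one payload
 * descriptor per output segment (2 * SFXGE_TSO_MAX_SEGS), plus up to
 * SFXGE_TX_MAPPING_MAX_SEG - 1 extra payload descriptors where the DMA
 * mapping of the input mbuf chain is split across segment boundaries.
 */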
79
80/*
81 * Set the block level to ensure there is space to generate a
82 * large number of descriptors for TSO.
83 */
84#define	SFXGE_TXQ_BLOCK_LEVEL(_entries)					\
85	(EFX_TXQ_LIMIT(_entries) - SFXGE_TSO_MAX_DESC)
86
87
88#define	SFXGE_PARAM_TX_DPL_GET_MAX	SFXGE_PARAM(tx_dpl_get_max)
89static int sfxge_tx_dpl_get_max = SFXGE_TX_DPL_GET_PKT_LIMIT_DEFAULT;
90TUNABLE_INT(SFXGE_PARAM_TX_DPL_GET_MAX, &sfxge_tx_dpl_get_max);
91SYSCTL_INT(_hw_sfxge, OID_AUTO, tx_dpl_get_max, CTLFLAG_RDTUN,
92	   &sfxge_tx_dpl_get_max, 0,
93	   "Maximum number of packets (any type) in deferred packet get-list");
94
95#define	SFXGE_PARAM_TX_DPL_GET_NON_TCP_MAX \
96	SFXGE_PARAM(tx_dpl_get_non_tcp_max)
97static int sfxge_tx_dpl_get_non_tcp_max =
98	SFXGE_TX_DPL_GET_NON_TCP_PKT_LIMIT_DEFAULT;
99TUNABLE_INT(SFXGE_PARAM_TX_DPL_GET_NON_TCP_MAX, &sfxge_tx_dpl_get_non_tcp_max);
100SYSCTL_INT(_hw_sfxge, OID_AUTO, tx_dpl_get_non_tcp_max, CTLFLAG_RDTUN,
101	   &sfxge_tx_dpl_get_non_tcp_max, 0,
102	   "Maximum number of non-TCP packets in deferred packet get-list");
103
104#define	SFXGE_PARAM_TX_DPL_PUT_MAX	SFXGE_PARAM(tx_dpl_put_max)
105static int sfxge_tx_dpl_put_max = SFXGE_TX_DPL_PUT_PKT_LIMIT_DEFAULT;
106TUNABLE_INT(SFXGE_PARAM_TX_DPL_PUT_MAX, &sfxge_tx_dpl_put_max);
107SYSCTL_INT(_hw_sfxge, OID_AUTO, tx_dpl_put_max, CTLFLAG_RDTUN,
108	   &sfxge_tx_dpl_put_max, 0,
109	   "Maximum number of packets (any type) in deferred packet put-list");
110
111
112static const struct {
113	const char *name;
114	size_t offset;
115} sfxge_tx_stats[] = {
116#define	SFXGE_TX_STAT(name, member) \
117	{ #name, offsetof(struct sfxge_txq, member) }
118	SFXGE_TX_STAT(tso_bursts, tso_bursts),
119	SFXGE_TX_STAT(tso_packets, tso_packets),
120	SFXGE_TX_STAT(tso_long_headers, tso_long_headers),
121	SFXGE_TX_STAT(tso_pdrop_too_many, tso_pdrop_too_many),
122	SFXGE_TX_STAT(tso_pdrop_no_rsrc, tso_pdrop_no_rsrc),
123	SFXGE_TX_STAT(tx_collapses, collapses),
124	SFXGE_TX_STAT(tx_drops, drops),
125	SFXGE_TX_STAT(tx_get_overflow, get_overflow),
126	SFXGE_TX_STAT(tx_get_non_tcp_overflow, get_non_tcp_overflow),
127	SFXGE_TX_STAT(tx_put_overflow, put_overflow),
128	SFXGE_TX_STAT(tx_netdown_drops, netdown_drops),
129};
130
131
132/* Forward declarations. */
133static void sfxge_tx_qdpl_service(struct sfxge_txq *txq);
134static void sfxge_tx_qlist_post(struct sfxge_txq *txq);
135static void sfxge_tx_qunblock(struct sfxge_txq *txq);
136static int sfxge_tx_queue_tso(struct sfxge_txq *txq, struct mbuf *mbuf,
137			      const bus_dma_segment_t *dma_seg, int n_dma_seg);
138
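/*
 * Process Tx completions: release the DMA mappings and free the mbufs or
 * heap-allocated TSO headers for descriptors the hardware has finished
 * with, then unblock the queue if enough space has been freed.  Called
 * with the event queue lock held.
 */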
139void
140sfxge_tx_qcomplete(struct sfxge_txq *txq, struct sfxge_evq *evq)
141{
142	unsigned int completed;
143
144	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
145
146	completed = txq->completed;
147	while (completed != txq->pending) {
148		struct sfxge_tx_mapping *stmp;
149		unsigned int id;
150
151		id = completed++ & txq->ptr_mask;
152
153		stmp = &txq->stmp[id];
154		if (stmp->flags & TX_BUF_UNMAP) {
155			bus_dmamap_unload(txq->packet_dma_tag, stmp->map);
156			if (stmp->flags & TX_BUF_MBUF) {
157				struct mbuf *m = stmp->u.mbuf;
158				do
159					m = m_free(m);
160				while (m != NULL);
161			} else {
162				free(stmp->u.heap_buf, M_SFXGE);
163			}
164			stmp->flags = 0;
165		}
166	}
167	txq->completed = completed;
168
169	/* Check whether we need to unblock the queue. */
170	mb();
171	if (txq->blocked) {
172		unsigned int level;
173
174		level = txq->added - txq->completed;
175		if (level <= SFXGE_TXQ_UNBLOCK_LEVEL(txq->entries))
176			sfxge_tx_qunblock(txq);
177	}
178}
179
180static unsigned int
181sfxge_is_mbuf_non_tcp(struct mbuf *mbuf)
182{
183	/* Absence of TCP checksum flags does not mean the packet is non-TCP,
184	 * but it should be true if the user wants to achieve high throughput.
185	 */
186	return (!(mbuf->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)));
187}
188
189/*
190 * Reorder the put list and append it to the get list.
191 */
192static void
193sfxge_tx_qdpl_swizzle(struct sfxge_txq *txq)
194{
195	struct sfxge_tx_dpl *stdp;
196	struct mbuf *mbuf, *get_next, **get_tailp;
197	volatile uintptr_t *putp;
198	uintptr_t put;
199	unsigned int count;
200	unsigned int non_tcp_count;
201
202	SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);
203
204	stdp = &txq->dpl;
205
206	/* Acquire the put list. */
207	putp = &stdp->std_put;
208	put = atomic_readandclear_ptr(putp);
209	mbuf = (void *)put;
210
211	if (mbuf == NULL)
212		return;
213
214	/* Reverse the put list. */
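	/*
	 * The put list is built by pushing at the head (LIFO), so packets
	 * queued as A, B, C arrive here as C -> B -> A; reversing restores
	 * the original order before the list is appended to the get list.
	 */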
215	get_tailp = &mbuf->m_nextpkt;
216	get_next = NULL;
217
218	count = 0;
219	non_tcp_count = 0;
220	do {
221		struct mbuf *put_next;
222
223		non_tcp_count += sfxge_is_mbuf_non_tcp(mbuf);
224		put_next = mbuf->m_nextpkt;
225		mbuf->m_nextpkt = get_next;
226		get_next = mbuf;
227		mbuf = put_next;
228
229		count++;
230	} while (mbuf != NULL);
231
232	if (count > stdp->std_put_hiwat)
233		stdp->std_put_hiwat = count;
234
235	/* Append the reversed put list to the get list. */
236	KASSERT(*get_tailp == NULL, ("*get_tailp != NULL"));
237	*stdp->std_getp = get_next;
238	stdp->std_getp = get_tailp;
239	stdp->std_get_count += count;
240	stdp->std_get_non_tcp_count += non_tcp_count;
241}
242
243static void
244sfxge_tx_qreap(struct sfxge_txq *txq)
245{
246	SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);
247
248	txq->reaped = txq->completed;
249}
250
251static void
252sfxge_tx_qlist_post(struct sfxge_txq *txq)
253{
254	unsigned int old_added;
255	unsigned int level;
256	int rc;
257
258	SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);
259
260	KASSERT(txq->n_pend_desc != 0, ("txq->n_pend_desc == 0"));
261	KASSERT(txq->n_pend_desc <= SFXGE_TSO_MAX_DESC,
262		("txq->n_pend_desc too large"));
263	KASSERT(!txq->blocked, ("txq->blocked"));
264
265	old_added = txq->added;
266
267	/* Post the fragment list. */
268	rc = efx_tx_qpost(txq->common, txq->pend_desc, txq->n_pend_desc,
269			  txq->reaped, &txq->added);
270	KASSERT(rc == 0, ("efx_tx_qpost() failed"));
271
272	/* If efx_tx_qpost() had to refragment, our information about
273	 * buffers to free may be associated with the wrong
274	 * descriptors.
275	 */
276	KASSERT(txq->added - old_added == txq->n_pend_desc,
277		("efx_tx_qpost() refragmented descriptors"));
278
279	level = txq->added - txq->reaped;
280	KASSERT(level <= txq->entries, ("overfilled TX queue"));
281
282	/* Clear the fragment list. */
283	txq->n_pend_desc = 0;
284
285	/* Have we reached the block level? */
286	if (level < SFXGE_TXQ_BLOCK_LEVEL(txq->entries))
287		return;
288
289	/* Reap, and check again */
290	sfxge_tx_qreap(txq);
291	level = txq->added - txq->reaped;
292	if (level < SFXGE_TXQ_BLOCK_LEVEL(txq->entries))
293		return;
294
295	txq->blocked = 1;
296
297	/*
298	 * Avoid a race with completion interrupt handling that could leave
299	 * the queue blocked.
300	 */
301	mb();
302	sfxge_tx_qreap(txq);
303	level = txq->added - txq->reaped;
304	if (level < SFXGE_TXQ_BLOCK_LEVEL(txq->entries)) {
305		mb();
306		txq->blocked = 0;
307	}
308}
309
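/*
 * Map an mbuf chain for DMA (collapsing it first if it has too many
 * segments), build the pending descriptor list (via the TSO path if
 * requested) and post it to the hardware queue.  On failure the mbuf is
 * freed and the drop counter is incremented.  Called with the txq lock
 * held.
 */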
310static int sfxge_tx_queue_mbuf(struct sfxge_txq *txq, struct mbuf *mbuf)
311{
312	bus_dmamap_t *used_map;
313	bus_dmamap_t map;
314	bus_dma_segment_t dma_seg[SFXGE_TX_MAPPING_MAX_SEG];
315	unsigned int id;
316	struct sfxge_tx_mapping *stmp;
317	efx_buffer_t *desc;
318	int n_dma_seg;
319	int rc;
320	int i;
321
322	KASSERT(!txq->blocked, ("txq->blocked"));
323
324	if (mbuf->m_pkthdr.csum_flags & CSUM_TSO)
325		prefetch_read_many(mbuf->m_data);
326
327	if (__predict_false(txq->init_state != SFXGE_TXQ_STARTED)) {
328		rc = EINTR;
329		goto reject;
330	}
331
332	/* Load the packet for DMA. */
333	id = txq->added & txq->ptr_mask;
334	stmp = &txq->stmp[id];
335	rc = bus_dmamap_load_mbuf_sg(txq->packet_dma_tag, stmp->map,
336				     mbuf, dma_seg, &n_dma_seg, 0);
337	if (rc == EFBIG) {
338		/* Try again. */
339		struct mbuf *new_mbuf = m_collapse(mbuf, M_NOWAIT,
340						   SFXGE_TX_MAPPING_MAX_SEG);
341		if (new_mbuf == NULL)
342			goto reject;
343		++txq->collapses;
344		mbuf = new_mbuf;
345		rc = bus_dmamap_load_mbuf_sg(txq->packet_dma_tag,
346					     stmp->map, mbuf,
347					     dma_seg, &n_dma_seg, 0);
348	}
349	if (rc != 0)
350		goto reject;
351
352	/* Make the packet visible to the hardware. */
353	bus_dmamap_sync(txq->packet_dma_tag, stmp->map, BUS_DMASYNC_PREWRITE);
354
355	used_map = &stmp->map;
356
357	if (mbuf->m_pkthdr.csum_flags & CSUM_TSO) {
358		rc = sfxge_tx_queue_tso(txq, mbuf, dma_seg, n_dma_seg);
359		if (rc < 0)
360			goto reject_mapped;
361		stmp = &txq->stmp[rc];
362	} else {
363		/* Add the mapping to the fragment list, and set flags
364		 * for the buffer.
365		 */
366		i = 0;
367		for (;;) {
368			desc = &txq->pend_desc[i];
369			desc->eb_addr = dma_seg[i].ds_addr;
370			desc->eb_size = dma_seg[i].ds_len;
371			if (i == n_dma_seg - 1) {
372				desc->eb_eop = 1;
373				break;
374			}
375			desc->eb_eop = 0;
376			i++;
377
378			stmp->flags = 0;
379			if (__predict_false(stmp ==
380					    &txq->stmp[txq->ptr_mask]))
381				stmp = &txq->stmp[0];
382			else
383				stmp++;
384		}
385		txq->n_pend_desc = n_dma_seg;
386	}
387
388	/*
389	 * If the mapping required more than one descriptor
390	 * then we need to associate the DMA map with the last
391	 * descriptor, not the first.
392	 */
393	if (used_map != &stmp->map) {
394		map = stmp->map;
395		stmp->map = *used_map;
396		*used_map = map;
397	}
398
399	stmp->u.mbuf = mbuf;
400	stmp->flags = TX_BUF_UNMAP | TX_BUF_MBUF;
401
402	/* Post the fragment list. */
403	sfxge_tx_qlist_post(txq);
404
405	return (0);
406
407reject_mapped:
408	bus_dmamap_unload(txq->packet_dma_tag, *used_map);
409reject:
410	/* Drop the packet on the floor. */
411	m_freem(mbuf);
412	++txq->drops;
413
414	return (rc);
415}
416
417/*
418 * Drain the deferred packet list into the transmit queue.
419 */
420static void
421sfxge_tx_qdpl_drain(struct sfxge_txq *txq)
422{
423	struct sfxge_softc *sc;
424	struct sfxge_tx_dpl *stdp;
425	struct mbuf *mbuf, *next;
426	unsigned int count;
427	unsigned int non_tcp_count;
428	unsigned int pushed;
429	int rc;
430
431	SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);
432
433	sc = txq->sc;
434	stdp = &txq->dpl;
435	pushed = txq->added;
436
437	if (__predict_true(txq->init_state == SFXGE_TXQ_STARTED)) {
438		prefetch_read_many(sc->enp);
439		prefetch_read_many(txq->common);
440	}
441
442	mbuf = stdp->std_get;
443	count = stdp->std_get_count;
444	non_tcp_count = stdp->std_get_non_tcp_count;
445
446	if (count > stdp->std_get_hiwat)
447		stdp->std_get_hiwat = count;
448
449	while (count != 0) {
450		KASSERT(mbuf != NULL, ("mbuf == NULL"));
451
452		next = mbuf->m_nextpkt;
453		mbuf->m_nextpkt = NULL;
454
455		ETHER_BPF_MTAP(sc->ifnet, mbuf); /* packet capture */
456
457		if (next != NULL)
458			prefetch_read_many(next);
459
460		rc = sfxge_tx_queue_mbuf(txq, mbuf);
461		--count;
462		non_tcp_count -= sfxge_is_mbuf_non_tcp(mbuf);
463		mbuf = next;
464		if (rc != 0)
465			continue;
466
467		if (txq->blocked)
468			break;
469
470		/* Push the fragments to the hardware in batches. */
471		if (txq->added - pushed >= SFXGE_TX_BATCH) {
472			efx_tx_qpush(txq->common, txq->added);
473			pushed = txq->added;
474		}
475	}
476
477	if (count == 0) {
478		KASSERT(mbuf == NULL, ("mbuf != NULL"));
479		KASSERT(non_tcp_count == 0,
480			("inconsistent TCP/non-TCP detection"));
481		stdp->std_get = NULL;
482		stdp->std_get_count = 0;
483		stdp->std_get_non_tcp_count = 0;
484		stdp->std_getp = &stdp->std_get;
485	} else {
486		stdp->std_get = mbuf;
487		stdp->std_get_count = count;
488		stdp->std_get_non_tcp_count = non_tcp_count;
489	}
490
491	if (txq->added != pushed)
492		efx_tx_qpush(txq->common, txq->added);
493
494	KASSERT(txq->blocked || stdp->std_get_count == 0,
495		("queue unblocked but count is non-zero"));
496}
497
498#define	SFXGE_TX_QDPL_PENDING(_txq)					\
499	((_txq)->dpl.std_put != 0)
500
501/*
502 * Service the deferred packet list.
503 *
504 * NOTE: drops the txq mutex!
505 */
506static void
507sfxge_tx_qdpl_service(struct sfxge_txq *txq)
508{
509	SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);
510
511	do {
512		if (SFXGE_TX_QDPL_PENDING(txq))
513			sfxge_tx_qdpl_swizzle(txq);
514
515		if (!txq->blocked)
516			sfxge_tx_qdpl_drain(txq);
517
518		SFXGE_TXQ_UNLOCK(txq);
519	} while (SFXGE_TX_QDPL_PENDING(txq) &&
520		 SFXGE_TXQ_TRYLOCK(txq));
521}
522
523/*
524 * Put a packet on the deferred packet list.
525 *
526 * If we are called with the txq lock held, we put the packet on the "get
527 * list", otherwise we atomically push it on the "put list".  The swizzle
528 * function takes care of ordering.
529 *
530 * The length of the put list is bounded by std_put_max (the tx_dpl_put_max
531 * tunable).  We overload the mbuf's csum_data field to keep track of this
532 * length because there is no cheap alternative to avoid races.
533 */
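/*
 * For example: if the put list already holds three packets, the head
 * mbuf's csum_data is 3; a newly pushed packet becomes the head with
 * csum_data 4, and once the recorded length reaches std_put_max further
 * packets are rejected with ENOBUFS.
 */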
534static int
535sfxge_tx_qdpl_put(struct sfxge_txq *txq, struct mbuf *mbuf, int locked)
536{
537	struct sfxge_tx_dpl *stdp;
538
539	stdp = &txq->dpl;
540
541	KASSERT(mbuf->m_nextpkt == NULL, ("mbuf->m_nextpkt != NULL"));
542
543	if (locked) {
544		SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);
545
546		sfxge_tx_qdpl_swizzle(txq);
547
548		if (stdp->std_get_count >= stdp->std_get_max) {
549			txq->get_overflow++;
550			return (ENOBUFS);
551		}
552		if (sfxge_is_mbuf_non_tcp(mbuf)) {
553			if (stdp->std_get_non_tcp_count >=
554			    stdp->std_get_non_tcp_max) {
555				txq->get_non_tcp_overflow++;
556				return (ENOBUFS);
557			}
558			stdp->std_get_non_tcp_count++;
559		}
560
561		*(stdp->std_getp) = mbuf;
562		stdp->std_getp = &mbuf->m_nextpkt;
563		stdp->std_get_count++;
564	} else {
565		volatile uintptr_t *putp;
566		uintptr_t old;
567		uintptr_t new;
568		unsigned old_len;
569
570		putp = &stdp->std_put;
571		new = (uintptr_t)mbuf;
572
573		do {
574			old = *putp;
575			if (old != 0) {
576				struct mbuf *mp = (struct mbuf *)old;
577				old_len = mp->m_pkthdr.csum_data;
578			} else
579				old_len = 0;
580			if (old_len >= stdp->std_put_max) {
581				atomic_add_long(&txq->put_overflow, 1);
582				return (ENOBUFS);
583			}
584			mbuf->m_pkthdr.csum_data = old_len + 1;
585			mbuf->m_nextpkt = (void *)old;
586		} while (atomic_cmpset_ptr(putp, old, new) == 0);
587	}
588
589	return (0);
590}
591
592/*
593 * Called from if_transmit - tries to grab the txq lock; if it succeeds the
594 * packet is appended to the get list, otherwise it is pushed onto the put list.
595 */
596int
597sfxge_tx_packet_add(struct sfxge_txq *txq, struct mbuf *m)
598{
599	int locked;
600	int rc;
601
602	if (!SFXGE_LINK_UP(txq->sc)) {
603		rc = ENETDOWN;
604		atomic_add_long(&txq->netdown_drops, 1);
605		goto fail;
606	}
607
608	/*
609	 * Try to grab the txq lock.  If we are able to get the lock,
610	 * the packet will be appended to the "get list" of the deferred
611	 * packet list.  Otherwise, it will be pushed on the "put list".
612	 */
613	locked = SFXGE_TXQ_TRYLOCK(txq);
614
615	if (sfxge_tx_qdpl_put(txq, m, locked) != 0) {
616		if (locked)
617			SFXGE_TXQ_UNLOCK(txq);
618		rc = ENOBUFS;
619		goto fail;
620	}
621
622	/*
623	 * Try to grab the lock again.
624	 *
625	 * If we are able to get the lock, we need to process the deferred
626	 * packet list.  If we are not able to get the lock, another thread
627	 * is processing the list.
628	 */
629	if (!locked)
630		locked = SFXGE_TXQ_TRYLOCK(txq);
631
632	if (locked) {
633		/* Try to service the list. */
634		sfxge_tx_qdpl_service(txq);
635		/* Lock has been dropped. */
636	}
637
638	return (0);
639
640fail:
641	m_freem(m);
642	return (rc);
643}
644
645static void
646sfxge_tx_qdpl_flush(struct sfxge_txq *txq)
647{
648	struct sfxge_tx_dpl *stdp = &txq->dpl;
649	struct mbuf *mbuf, *next;
650
651	SFXGE_TXQ_LOCK(txq);
652
653	sfxge_tx_qdpl_swizzle(txq);
654	for (mbuf = stdp->std_get; mbuf != NULL; mbuf = next) {
655		next = mbuf->m_nextpkt;
656		m_freem(mbuf);
657	}
658	stdp->std_get = NULL;
659	stdp->std_get_count = 0;
660	stdp->std_get_non_tcp_count = 0;
661	stdp->std_getp = &stdp->std_get;
662
663	SFXGE_TXQ_UNLOCK(txq);
664}
665
666void
667sfxge_if_qflush(struct ifnet *ifp)
668{
669	struct sfxge_softc *sc;
670	unsigned int i;
671
672	sc = ifp->if_softc;
673
674	for (i = 0; i < sc->txq_count; i++)
675		sfxge_tx_qdpl_flush(sc->txq[i]);
676}
677
678/*
679 * TX start -- called by the stack.
680 */
681int
682sfxge_if_transmit(struct ifnet *ifp, struct mbuf *m)
683{
684	struct sfxge_softc *sc;
685	struct sfxge_txq *txq;
686	int rc;
687
688	sc = (struct sfxge_softc *)ifp->if_softc;
689
690	/*
691	 * Transmit may be called when the interface is up from the kernel
692	 * point of view, but not yet up (bring-up still in progress) from the
693	 * driver point of view, e.g. during link aggregation bring-up.
694	 * It may also be called when the interface is up from the driver
695	 * point of view, but already down from the kernel point of view,
696	 * e.g. while interface shutdown is in progress.
697	 */
698	KASSERT((ifp->if_flags & IFF_UP) || (sc->if_flags & IFF_UP),
699		("interface not up"));
700
701	/* Pick the desired transmit queue. */
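	/*
	 * Packets that request TCP/UDP checksum offload or TSO are spread
	 * over the per-event-queue Tx queues using the RSS indirection
	 * table keyed by the flowid, so a flow's transmit completions
	 * should be handled on the same event queue (and hence CPU) as its
	 * receive traffic.  IP-checksum-only and non-offload packets use
	 * the two queues bound to event queue 0.
	 */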
702	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_TSO)) {
703		int index = 0;
704
705		/* check if flowid is set */
706		if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
707			uint32_t hash = m->m_pkthdr.flowid;
708
709			index = sc->rx_indir_table[hash % SFXGE_RX_SCALE_MAX];
710		}
711		txq = sc->txq[SFXGE_TXQ_IP_TCP_UDP_CKSUM + index];
712	} else if (m->m_pkthdr.csum_flags & CSUM_DELAY_IP) {
713		txq = sc->txq[SFXGE_TXQ_IP_CKSUM];
714	} else {
715		txq = sc->txq[SFXGE_TXQ_NON_CKSUM];
716	}
717
718	rc = sfxge_tx_packet_add(txq, m);
719
720	return (rc);
721}
722
723/*
724 * Software "TSO".  Not quite as good as doing it in hardware, but
725 * still faster than segmenting in the stack.
726 */
727
728struct sfxge_tso_state {
729	/* Output position */
730	unsigned out_len;	/* Remaining length in current segment */
731	unsigned seqnum;	/* Current sequence number */
732	unsigned packet_space;	/* Remaining space in current packet */
733
734	/* Input position */
735	uint64_t dma_addr;	/* DMA address of current position */
736	unsigned in_len;	/* Remaining length in current mbuf */
737
738	const struct mbuf *mbuf; /* Input mbuf (head of chain) */
739	u_short protocol;	/* Network protocol (after VLAN decap) */
740	ssize_t nh_off;		/* Offset of network header */
741	ssize_t tcph_off;	/* Offset of TCP header */
742	unsigned header_len;	/* Number of bytes of header */
743	unsigned seg_size;	/* TCP segment size */
744};
745
746static const struct ip *tso_iph(const struct sfxge_tso_state *tso)
747{
748	KASSERT(tso->protocol == htons(ETHERTYPE_IP),
749		("tso_iph() in non-IPv4 state"));
750	return (const struct ip *)(tso->mbuf->m_data + tso->nh_off);
751}
752static __unused const struct ip6_hdr *tso_ip6h(const struct sfxge_tso_state *tso)
753{
754	KASSERT(tso->protocol == htons(ETHERTYPE_IPV6),
755		("tso_ip6h() in non-IPv6 state"));
756	return (const struct ip6_hdr *)(tso->mbuf->m_data + tso->nh_off);
757}
758static const struct tcphdr *tso_tcph(const struct sfxge_tso_state *tso)
759{
760	return (const struct tcphdr *)(tso->mbuf->m_data + tso->tcph_off);
761}
762
763/* Size of preallocated TSO header buffers.  Larger blocks must be
764 * allocated from the heap.
765 */
766#define	TSOH_STD_SIZE	128
767
768/* At most half the descriptors in the queue at any time will refer to
769 * a TSO header buffer, since they must always be followed by a
770 * payload descriptor referring to an mbuf.
771 */
772#define	TSOH_COUNT(_txq_entries)	((_txq_entries) / 2u)
773#define	TSOH_PER_PAGE	(PAGE_SIZE / TSOH_STD_SIZE)
774#define	TSOH_PAGE_COUNT(_txq_entries)	\
775	((TSOH_COUNT(_txq_entries) + TSOH_PER_PAGE - 1) / TSOH_PER_PAGE)
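
/*
 * For example, with the usual 4 KiB PAGE_SIZE: TSOH_PER_PAGE is 32, so a
 * 1024-entry Tx queue needs TSOH_COUNT = 512 standard header buffers,
 * i.e. TSOH_PAGE_COUNT = 16 pages.
 */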
776
777static int tso_init(struct sfxge_txq *txq)
778{
779	struct sfxge_softc *sc = txq->sc;
780	unsigned int tsoh_page_count = TSOH_PAGE_COUNT(sc->txq_entries);
781	int i, rc;
782
783	/* Allocate TSO header buffers */
784	txq->tsoh_buffer = malloc(tsoh_page_count * sizeof(txq->tsoh_buffer[0]),
785				  M_SFXGE, M_WAITOK);
786
787	for (i = 0; i < tsoh_page_count; i++) {
788		rc = sfxge_dma_alloc(sc, PAGE_SIZE, &txq->tsoh_buffer[i]);
789		if (rc != 0)
790			goto fail;
791	}
792
793	return (0);
794
795fail:
796	while (i-- > 0)
797		sfxge_dma_free(&txq->tsoh_buffer[i]);
798	free(txq->tsoh_buffer, M_SFXGE);
799	txq->tsoh_buffer = NULL;
800	return (rc);
801}
802
803static void tso_fini(struct sfxge_txq *txq)
804{
805	int i;
806
807	if (txq->tsoh_buffer != NULL) {
808		for (i = 0; i < TSOH_PAGE_COUNT(txq->sc->txq_entries); i++)
809			sfxge_dma_free(&txq->tsoh_buffer[i]);
810		free(txq->tsoh_buffer, M_SFXGE);
811	}
812}
813
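/*
 * Parse the Ethernet, IP/IPv6 and TCP headers of the mbuf and initialise
 * the TSO state: header length, TCP segment size and starting sequence
 * number.  The input position (dma_addr/in_len) is set up by the caller.
 */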
814static void tso_start(struct sfxge_tso_state *tso, struct mbuf *mbuf)
815{
816	struct ether_header *eh = mtod(mbuf, struct ether_header *);
817	const struct tcphdr *th;
818	struct tcphdr th_copy;
819
820	tso->mbuf = mbuf;
821
822	/* Find network protocol and header */
823	tso->protocol = eh->ether_type;
824	if (tso->protocol == htons(ETHERTYPE_VLAN)) {
825		struct ether_vlan_header *veh =
826			mtod(mbuf, struct ether_vlan_header *);
827		tso->protocol = veh->evl_proto;
828		tso->nh_off = sizeof(*veh);
829	} else {
830		tso->nh_off = sizeof(*eh);
831	}
832
833	/* Find TCP header */
834	if (tso->protocol == htons(ETHERTYPE_IP)) {
835		KASSERT(tso_iph(tso)->ip_p == IPPROTO_TCP,
836			("TSO required on non-TCP packet"));
837		tso->tcph_off = tso->nh_off + 4 * tso_iph(tso)->ip_hl;
838	} else {
839		KASSERT(tso->protocol == htons(ETHERTYPE_IPV6),
840			("TSO required on non-IP packet"));
841		KASSERT(tso_ip6h(tso)->ip6_nxt == IPPROTO_TCP,
842			("TSO required on non-TCP packet"));
843		tso->tcph_off = tso->nh_off + sizeof(struct ip6_hdr);
844	}
845
846	KASSERT(mbuf->m_len >= tso->tcph_off,
847		("network header is fragmented in mbuf"));
848	/* We need the TCP header up to and including flags (window comes next) */
849	if (mbuf->m_len < tso->tcph_off + offsetof(struct tcphdr, th_win)) {
850		m_copydata(tso->mbuf, tso->tcph_off, sizeof(th_copy),
851			   (caddr_t)&th_copy);
852		th = &th_copy;
853	} else {
854		th = tso_tcph(tso);
855	}
856
857	tso->header_len = tso->tcph_off + 4 * th->th_off;
858	tso->seg_size = mbuf->m_pkthdr.tso_segsz;
859
860	tso->seqnum = ntohl(th->th_seq);
861
862	/* These flags must not be duplicated */
863	KASSERT(!(th->th_flags & (TH_URG | TH_SYN | TH_RST)),
864		("incompatible TCP flag on TSO packet"));
865
866	tso->out_len = mbuf->m_pkthdr.len - tso->header_len;
867}
868
869/*
870 * tso_fill_packet_with_fragment - form descriptors for the current fragment
871 *
872 * Form descriptors for the current fragment, until we reach the end
873 * of fragment or end-of-packet.  Return 0 on success, 1 if not enough
874 * space.
875 */
876static void tso_fill_packet_with_fragment(struct sfxge_txq *txq,
877					  struct sfxge_tso_state *tso)
878{
879	efx_buffer_t *desc;
880	int n;
881
882	if (tso->in_len == 0 || tso->packet_space == 0)
883		return;
884
885	KASSERT(tso->in_len > 0, ("TSO input length went negative"));
886	KASSERT(tso->packet_space > 0, ("TSO packet space went negative"));
887
888	n = min(tso->in_len, tso->packet_space);
889
890	tso->packet_space -= n;
891	tso->out_len -= n;
892	tso->in_len -= n;
893
894	desc = &txq->pend_desc[txq->n_pend_desc++];
895	desc->eb_addr = tso->dma_addr;
896	desc->eb_size = n;
897	desc->eb_eop = tso->out_len == 0 || tso->packet_space == 0;
898
899	tso->dma_addr += n;
900}
901
902/* Callback from bus_dmamap_load() for long TSO headers. */
903static void tso_map_long_header(void *dma_addr_ret,
904				bus_dma_segment_t *segs, int nseg,
905				int error)
906{
907	*(uint64_t *)dma_addr_ret = ((__predict_true(error == 0) &&
908				      __predict_true(nseg == 1)) ?
909				     segs->ds_addr : 0);
910}
911
912/*
913 * tso_start_new_packet - generate a new header and prepare for the new packet
914 *
915 * Generate a new header and prepare for the new packet.  Return 0 on
916 * success, or an error code if failed to alloc header.
917 */
918static int tso_start_new_packet(struct sfxge_txq *txq,
919				struct sfxge_tso_state *tso,
920				unsigned int id)
921{
922	struct sfxge_tx_mapping *stmp = &txq->stmp[id];
923	struct tcphdr *tsoh_th;
924	unsigned ip_length;
925	caddr_t header;
926	uint64_t dma_addr;
927	bus_dmamap_t map;
928	efx_buffer_t *desc;
929	int rc;
930
931	/* Allocate a DMA-mapped header buffer. */
932	if (__predict_true(tso->header_len <= TSOH_STD_SIZE)) {
933		unsigned int page_index = (id / 2) / TSOH_PER_PAGE;
934		unsigned int buf_index = (id / 2) % TSOH_PER_PAGE;
935
936		header = (txq->tsoh_buffer[page_index].esm_base +
937			  buf_index * TSOH_STD_SIZE);
938		dma_addr = (txq->tsoh_buffer[page_index].esm_addr +
939			    buf_index * TSOH_STD_SIZE);
940		map = txq->tsoh_buffer[page_index].esm_map;
941
942		stmp->flags = 0;
943	} else {
944		/* We cannot use bus_dmamem_alloc() as that may sleep */
945		header = malloc(tso->header_len, M_SFXGE, M_NOWAIT);
946		if (__predict_false(!header))
947			return (ENOMEM);
948		rc = bus_dmamap_load(txq->packet_dma_tag, stmp->map,
949				     header, tso->header_len,
950				     tso_map_long_header, &dma_addr,
951				     BUS_DMA_NOWAIT);
952		if (__predict_false(dma_addr == 0)) {
953			if (rc == 0) {
954				/* Succeeded but got >1 segment */
955				bus_dmamap_unload(txq->packet_dma_tag,
956						  stmp->map);
957				rc = EINVAL;
958			}
959			free(header, M_SFXGE);
960			return (rc);
961		}
962		map = stmp->map;
963
964		txq->tso_long_headers++;
965		stmp->u.heap_buf = header;
966		stmp->flags = TX_BUF_UNMAP;
967	}
968
969	tsoh_th = (struct tcphdr *)(header + tso->tcph_off);
970
971	/* Copy and update the headers. */
972	m_copydata(tso->mbuf, 0, tso->header_len, header);
973
974	tsoh_th->th_seq = htonl(tso->seqnum);
975	tso->seqnum += tso->seg_size;
976	if (tso->out_len > tso->seg_size) {
977		/* This packet will not finish the TSO burst. */
978		ip_length = tso->header_len - tso->nh_off + tso->seg_size;
979		tsoh_th->th_flags &= ~(TH_FIN | TH_PUSH);
980	} else {
981		/* This packet will be the last in the TSO burst. */
982		ip_length = tso->header_len - tso->nh_off + tso->out_len;
983	}
984
985	if (tso->protocol == htons(ETHERTYPE_IP)) {
986		struct ip *tsoh_iph = (struct ip *)(header + tso->nh_off);
987		tsoh_iph->ip_len = htons(ip_length);
988		/* XXX We should increment ip_id, but FreeBSD doesn't
989		 * currently allocate extra IDs for multiple segments.
990		 */
991	} else {
992		struct ip6_hdr *tsoh_iph =
993			(struct ip6_hdr *)(header + tso->nh_off);
994		tsoh_iph->ip6_plen = htons(ip_length - sizeof(*tsoh_iph));
995	}
996
997	/* Make the header visible to the hardware. */
998	bus_dmamap_sync(txq->packet_dma_tag, map, BUS_DMASYNC_PREWRITE);
999
1000	tso->packet_space = tso->seg_size;
1001	txq->tso_packets++;
1002
1003	/* Form a descriptor for this header. */
1004	desc = &txq->pend_desc[txq->n_pend_desc++];
1005	desc->eb_addr = dma_addr;
1006	desc->eb_size = tso->header_len;
1007	desc->eb_eop = 0;
1008
1009	return (0);
1010}
1011
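/*
 * Segment a TSO packet: skip over DMA segments wholly contained in the
 * header, then emit a freshly built header plus payload descriptors for
 * each output packet.  Returns the index of the stmp entry to associate
 * with the mbuf (so it is unmapped and freed when the last descriptor
 * completes), or -1 if no header buffer could be allocated for the first
 * output packet.
 */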
1012static int
1013sfxge_tx_queue_tso(struct sfxge_txq *txq, struct mbuf *mbuf,
1014		   const bus_dma_segment_t *dma_seg, int n_dma_seg)
1015{
1016	struct sfxge_tso_state tso;
1017	unsigned int id, next_id;
1018	unsigned skipped = 0;
1019
1020	tso_start(&tso, mbuf);
1021
1022	while (dma_seg->ds_len + skipped <= tso.header_len) {
1023		skipped += dma_seg->ds_len;
1024		--n_dma_seg;
1025		KASSERT(n_dma_seg, ("no payload found in TSO packet"));
1026		++dma_seg;
1027	}
1028	tso.in_len = dma_seg->ds_len - (tso.header_len - skipped);
1029	tso.dma_addr = dma_seg->ds_addr + (tso.header_len - skipped);
1030
1031	id = txq->added & txq->ptr_mask;
1032	if (__predict_false(tso_start_new_packet(txq, &tso, id)))
1033		return (-1);
1034
1035	while (1) {
1036		id = (id + 1) & txq->ptr_mask;
1037		tso_fill_packet_with_fragment(txq, &tso);
1038
1039		/* Move onto the next fragment? */
1040		if (tso.in_len == 0) {
1041			--n_dma_seg;
1042			if (n_dma_seg == 0)
1043				break;
1044			++dma_seg;
1045			tso.in_len = dma_seg->ds_len;
1046			tso.dma_addr = dma_seg->ds_addr;
1047		}
1048
1049		/* End of packet? */
1050		if (tso.packet_space == 0) {
1051			/* If the queue is now full due to tiny MSS,
1052			 * or we can't create another header, discard
1053			 * the remainder of the input mbuf but do not
1054			 * roll back the work we have done.
1055			 */
1056			if (txq->n_pend_desc + 1 /* header */ + n_dma_seg >
1057			    SFXGE_TSO_MAX_DESC) {
1058				txq->tso_pdrop_too_many++;
1059				break;
1060			}
1061			next_id = (id + 1) & txq->ptr_mask;
1062			if (__predict_false(tso_start_new_packet(txq, &tso,
1063								 next_id))) {
1064				txq->tso_pdrop_no_rsrc++;
1065				break;
1066			}
1067			id = next_id;
1068		}
1069	}
1070
1071	txq->tso_bursts++;
1072	return (id);
1073}
1074
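/*
 * Unblock the queue once completions have freed enough descriptors, then
 * service the deferred packet list.  Called from sfxge_tx_qcomplete() with
 * the event queue lock held; takes the txq lock, which is dropped by the
 * deferred packet list service.
 */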
1075static void
1076sfxge_tx_qunblock(struct sfxge_txq *txq)
1077{
1078	struct sfxge_softc *sc;
1079	struct sfxge_evq *evq;
1080
1081	sc = txq->sc;
1082	evq = sc->evq[txq->evq_index];
1083
1084	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
1085
1086	if (__predict_false(txq->init_state != SFXGE_TXQ_STARTED))
1087		return;
1088
1089	SFXGE_TXQ_LOCK(txq);
1090
1091	if (txq->blocked) {
1092		unsigned int level;
1093
1094		level = txq->added - txq->completed;
1095		if (level <= SFXGE_TXQ_UNBLOCK_LEVEL(txq->entries)) {
1096			/* reaped must be in sync with blocked */
1097			sfxge_tx_qreap(txq);
1098			txq->blocked = 0;
1099		}
1100	}
1101
1102	sfxge_tx_qdpl_service(txq);
1103	/* note: lock has been dropped */
1104}
1105
1106void
1107sfxge_tx_qflush_done(struct sfxge_txq *txq)
1108{
1109
1110	txq->flush_state = SFXGE_FLUSH_DONE;
1111}
1112
1113static void
1114sfxge_tx_qstop(struct sfxge_softc *sc, unsigned int index)
1115{
1116	struct sfxge_txq *txq;
1117	struct sfxge_evq *evq;
1118	unsigned int count;
1119
1120	txq = sc->txq[index];
1121	evq = sc->evq[txq->evq_index];
1122
1123	SFXGE_TXQ_LOCK(txq);
1124
1125	KASSERT(txq->init_state == SFXGE_TXQ_STARTED,
1126	    ("txq->init_state != SFXGE_TXQ_STARTED"));
1127
1128	txq->init_state = SFXGE_TXQ_INITIALIZED;
1129	txq->flush_state = SFXGE_FLUSH_PENDING;
1130
1131	/* Flush the transmit queue. */
1132	efx_tx_qflush(txq->common);
1133
1134	SFXGE_TXQ_UNLOCK(txq);
1135
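	/* Wait for the flush done event: poll every 100ms, up to 2 seconds. */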
1136	count = 0;
1137	do {
1138		/* Spin for 100ms. */
1139		DELAY(100000);
1140
1141		if (txq->flush_state != SFXGE_FLUSH_PENDING)
1142			break;
1143	} while (++count < 20);
1144
1145	SFXGE_EVQ_LOCK(evq);
1146	SFXGE_TXQ_LOCK(txq);
1147
1148	KASSERT(txq->flush_state != SFXGE_FLUSH_FAILED,
1149	    ("txq->flush_state == SFXGE_FLUSH_FAILED"));
1150
1151	txq->flush_state = SFXGE_FLUSH_DONE;
1152
1153	txq->blocked = 0;
1154	txq->pending = txq->added;
1155
1156	sfxge_tx_qcomplete(txq, evq);
1157	KASSERT(txq->completed == txq->added,
1158	    ("txq->completed != txq->added"));
1159
1160	sfxge_tx_qreap(txq);
1161	KASSERT(txq->reaped == txq->completed,
1162	    ("txq->reaped != txq->completed"));
1163
1164	txq->added = 0;
1165	txq->pending = 0;
1166	txq->completed = 0;
1167	txq->reaped = 0;
1168
1169	/* Destroy the common code transmit queue. */
1170	efx_tx_qdestroy(txq->common);
1171	txq->common = NULL;
1172
1173	efx_sram_buf_tbl_clear(sc->enp, txq->buf_base_id,
1174	    EFX_TXQ_NBUFS(sc->txq_entries));
1175
1176	SFXGE_EVQ_UNLOCK(evq);
1177	SFXGE_TXQ_UNLOCK(txq);
1178}
1179
1180static int
1181sfxge_tx_qstart(struct sfxge_softc *sc, unsigned int index)
1182{
1183	struct sfxge_txq *txq;
1184	efsys_mem_t *esmp;
1185	uint16_t flags;
1186	struct sfxge_evq *evq;
1187	int rc;
1188
1189	txq = sc->txq[index];
1190	esmp = &txq->mem;
1191	evq = sc->evq[txq->evq_index];
1192
1193	KASSERT(txq->init_state == SFXGE_TXQ_INITIALIZED,
1194	    ("txq->init_state != SFXGE_TXQ_INITIALIZED"));
1195	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
1196	    ("evq->init_state != SFXGE_EVQ_STARTED"));
1197
1198	/* Program the buffer table. */
1199	if ((rc = efx_sram_buf_tbl_set(sc->enp, txq->buf_base_id, esmp,
1200	    EFX_TXQ_NBUFS(sc->txq_entries))) != 0)
1201		return (rc);
1202
1203	/* Determine the kind of queue we are creating. */
1204	switch (txq->type) {
1205	case SFXGE_TXQ_NON_CKSUM:
1206		flags = 0;
1207		break;
1208	case SFXGE_TXQ_IP_CKSUM:
1209		flags = EFX_CKSUM_IPV4;
1210		break;
1211	case SFXGE_TXQ_IP_TCP_UDP_CKSUM:
1212		flags = EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP;
1213		break;
1214	default:
1215		KASSERT(0, ("Impossible TX queue"));
1216		flags = 0;
1217		break;
1218	}
1219
1220	/* Create the common code transmit queue. */
1221	if ((rc = efx_tx_qcreate(sc->enp, index, txq->type, esmp,
1222	    sc->txq_entries, txq->buf_base_id, flags, evq->common,
1223	    &txq->common)) != 0)
1224		goto fail;
1225
1226	SFXGE_TXQ_LOCK(txq);
1227
1228	/* Enable the transmit queue. */
1229	efx_tx_qenable(txq->common);
1230
1231	txq->init_state = SFXGE_TXQ_STARTED;
1232
1233	SFXGE_TXQ_UNLOCK(txq);
1234
1235	return (0);
1236
1237fail:
1238	efx_sram_buf_tbl_clear(sc->enp, txq->buf_base_id,
1239	    EFX_TXQ_NBUFS(sc->txq_entries));
1240	return (rc);
1241}
1242
1243void
1244sfxge_tx_stop(struct sfxge_softc *sc)
1245{
1246	int index;
1247
1248	index = sc->txq_count;
1249	while (--index >= 0)
1250		sfxge_tx_qstop(sc, index);
1251
1252	/* Tear down the transmit module */
1253	efx_tx_fini(sc->enp);
1254}
1255
1256int
1257sfxge_tx_start(struct sfxge_softc *sc)
1258{
1259	int index;
1260	int rc;
1261
1262	/* Initialize the common code transmit module. */
1263	if ((rc = efx_tx_init(sc->enp)) != 0)
1264		return (rc);
1265
1266	for (index = 0; index < sc->txq_count; index++) {
1267		if ((rc = sfxge_tx_qstart(sc, index)) != 0)
1268			goto fail;
1269	}
1270
1271	return (0);
1272
1273fail:
1274	while (--index >= 0)
1275		sfxge_tx_qstop(sc, index);
1276
1277	efx_tx_fini(sc->enp);
1278
1279	return (rc);
1280}
1281
1282static int
1283sfxge_txq_stat_init(struct sfxge_txq *txq, struct sysctl_oid *txq_node)
1284{
1285	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(txq->sc->dev);
1286	struct sysctl_oid *stat_node;
1287	unsigned int id;
1288
1289	stat_node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(txq_node), OID_AUTO,
1290				    "stats", CTLFLAG_RD, NULL,
1291				    "Tx queue statistics");
1292	if (stat_node == NULL)
1293		return (ENOMEM);
1294
1295	for (id = 0; id < nitems(sfxge_tx_stats); id++) {
1296		SYSCTL_ADD_ULONG(
1297		    ctx, SYSCTL_CHILDREN(stat_node), OID_AUTO,
1298		    sfxge_tx_stats[id].name, CTLFLAG_RD | CTLFLAG_STATS,
1299		    (unsigned long *)((caddr_t)txq + sfxge_tx_stats[id].offset),
1300		    "");
1301	}
1302
1303	return (0);
1304}
1305
1306/**
1307 * Destroy a transmit queue.
1308 */
1309static void
1310sfxge_tx_qfini(struct sfxge_softc *sc, unsigned int index)
1311{
1312	struct sfxge_txq *txq;
1313	unsigned int nmaps;
1314
1315	txq = sc->txq[index];
1316
1317	KASSERT(txq->init_state == SFXGE_TXQ_INITIALIZED,
1318	    ("txq->init_state != SFXGE_TXQ_INITIALIZED"));
1319
1320	if (txq->type == SFXGE_TXQ_IP_TCP_UDP_CKSUM)
1321		tso_fini(txq);
1322
1323	/* Free the context arrays. */
1324	free(txq->pend_desc, M_SFXGE);
1325	nmaps = sc->txq_entries;
1326	while (nmaps-- != 0)
1327		bus_dmamap_destroy(txq->packet_dma_tag, txq->stmp[nmaps].map);
1328	free(txq->stmp, M_SFXGE);
1329
1330	/* Release DMA memory mapping. */
1331	sfxge_dma_free(&txq->mem);
1332
1333	sc->txq[index] = NULL;
1334
1335	SFXGE_TXQ_LOCK_DESTROY(txq);
1336
1337	free(txq, M_SFXGE);
1338}
1339
1340static int
1341sfxge_tx_qinit(struct sfxge_softc *sc, unsigned int txq_index,
1342    enum sfxge_txq_type type, unsigned int evq_index)
1343{
1344	char name[16];
1345	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1346	struct sysctl_oid *txq_node;
1347	struct sfxge_txq *txq;
1348	struct sfxge_evq *evq;
1349	struct sfxge_tx_dpl *stdp;
1350	struct sysctl_oid *dpl_node;
1351	efsys_mem_t *esmp;
1352	unsigned int nmaps;
1353	int rc;
1354
1355	txq = malloc(sizeof(struct sfxge_txq), M_SFXGE, M_ZERO | M_WAITOK);
1356	txq->sc = sc;
1357	txq->entries = sc->txq_entries;
1358	txq->ptr_mask = txq->entries - 1;
1359
1360	sc->txq[txq_index] = txq;
1361	esmp = &txq->mem;
1362
1363	evq = sc->evq[evq_index];
1364
1365	/* Allocate and zero DMA space for the descriptor ring. */
1366	if ((rc = sfxge_dma_alloc(sc, EFX_TXQ_SIZE(sc->txq_entries), esmp)) != 0)
1367		return (rc);
1368
1369	/* Allocate buffer table entries. */
1370	sfxge_sram_buf_tbl_alloc(sc, EFX_TXQ_NBUFS(sc->txq_entries),
1371				 &txq->buf_base_id);
1372
1373	/* Create a DMA tag for packet mappings. */
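	/*
	 * Roughly: no alignment constraint, no segment may cross a 4 KiB
	 * (0x1000) boundary, DMA addresses are limited to 46 bits, and a
	 * mapping may cover up to 0x11000 bytes in SFXGE_TX_MAPPING_MAX_SEG
	 * segments of at most 4 KiB each.  These limits appear to mirror
	 * what the controller can express in a single Tx descriptor.
	 */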
1374	if (bus_dma_tag_create(sc->parent_dma_tag, 1, 0x1000,
1375	    MIN(0x3FFFFFFFFFFFUL, BUS_SPACE_MAXADDR), BUS_SPACE_MAXADDR, NULL,
1376	    NULL, 0x11000, SFXGE_TX_MAPPING_MAX_SEG, 0x1000, 0, NULL, NULL,
1377	    &txq->packet_dma_tag) != 0) {
1378		device_printf(sc->dev, "Couldn't allocate txq DMA tag\n");
1379		rc = ENOMEM;
1380		goto fail;
1381	}
1382
1383	/* Allocate pending descriptor array for batching writes. */
1384	txq->pend_desc = malloc(sizeof(efx_buffer_t) * sc->txq_entries,
1385				M_SFXGE, M_ZERO | M_WAITOK);
1386
1387	/* Allocate and initialise mbuf DMA mapping array. */
1388	txq->stmp = malloc(sizeof(struct sfxge_tx_mapping) * sc->txq_entries,
1389	    M_SFXGE, M_ZERO | M_WAITOK);
1390	for (nmaps = 0; nmaps < sc->txq_entries; nmaps++) {
1391		rc = bus_dmamap_create(txq->packet_dma_tag, 0,
1392				       &txq->stmp[nmaps].map);
1393		if (rc != 0)
1394			goto fail2;
1395	}
1396
1397	snprintf(name, sizeof(name), "%u", txq_index);
1398	txq_node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->txqs_node),
1399				   OID_AUTO, name, CTLFLAG_RD, NULL, "");
1400	if (txq_node == NULL) {
1401		rc = ENOMEM;
1402		goto fail_txq_node;
1403	}
1404
1405	if (type == SFXGE_TXQ_IP_TCP_UDP_CKSUM &&
1406	    (rc = tso_init(txq)) != 0)
1407		goto fail3;
1408
1409	if (sfxge_tx_dpl_get_max <= 0) {
1410		log(LOG_ERR, "%s=%d must be greater than 0",
1411		    SFXGE_PARAM_TX_DPL_GET_MAX, sfxge_tx_dpl_get_max);
1412		rc = EINVAL;
1413		goto fail_tx_dpl_get_max;
1414	}
1415	if (sfxge_tx_dpl_get_non_tcp_max <= 0) {
1416		log(LOG_ERR, "%s=%d must be greater than 0",
1417		    SFXGE_PARAM_TX_DPL_GET_NON_TCP_MAX,
1418		    sfxge_tx_dpl_get_non_tcp_max);
1419		rc = EINVAL;
1420		goto fail_tx_dpl_get_max;
1421	}
1422	if (sfxge_tx_dpl_put_max < 0) {
1423		log(LOG_ERR, "%s=%d must be greater than or equal to 0",
1424		    SFXGE_PARAM_TX_DPL_PUT_MAX, sfxge_tx_dpl_put_max);
1425		rc = EINVAL;
1426		goto fail_tx_dpl_put_max;
1427	}
1428
1429	/* Initialize the deferred packet list. */
1430	stdp = &txq->dpl;
1431	stdp->std_put_max = sfxge_tx_dpl_put_max;
1432	stdp->std_get_max = sfxge_tx_dpl_get_max;
1433	stdp->std_get_non_tcp_max = sfxge_tx_dpl_get_non_tcp_max;
1434	stdp->std_getp = &stdp->std_get;
1435
1436	SFXGE_TXQ_LOCK_INIT(txq, device_get_nameunit(sc->dev), txq_index);
1437
1438	dpl_node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(txq_node), OID_AUTO,
1439				   "dpl", CTLFLAG_RD, NULL,
1440				   "Deferred packet list statistics");
1441	if (dpl_node == NULL) {
1442		rc = ENOMEM;
1443		goto fail_dpl_node;
1444	}
1445
1446	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(dpl_node), OID_AUTO,
1447			"get_count", CTLFLAG_RD | CTLFLAG_STATS,
1448			&stdp->std_get_count, 0, "");
1449	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(dpl_node), OID_AUTO,
1450			"get_non_tcp_count", CTLFLAG_RD | CTLFLAG_STATS,
1451			&stdp->std_get_non_tcp_count, 0, "");
1452	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(dpl_node), OID_AUTO,
1453			"get_hiwat", CTLFLAG_RD | CTLFLAG_STATS,
1454			&stdp->std_get_hiwat, 0, "");
1455	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(dpl_node), OID_AUTO,
1456			"put_hiwat", CTLFLAG_RD | CTLFLAG_STATS,
1457			&stdp->std_put_hiwat, 0, "");
1458
1459	rc = sfxge_txq_stat_init(txq, txq_node);
1460	if (rc != 0)
1461		goto fail_txq_stat_init;
1462
1463	txq->type = type;
1464	txq->evq_index = evq_index;
1465	txq->txq_index = txq_index;
1466	txq->init_state = SFXGE_TXQ_INITIALIZED;
1467
1468	return (0);
1469
1470fail_txq_stat_init:
1471fail_dpl_node:
1472fail_tx_dpl_put_max:
1473fail_tx_dpl_get_max:
1474fail3:
1475fail_txq_node:
1476	free(txq->pend_desc, M_SFXGE);
1477fail2:
1478	while (nmaps-- != 0)
1479		bus_dmamap_destroy(txq->packet_dma_tag, txq->stmp[nmaps].map);
1480	free(txq->stmp, M_SFXGE);
1481	bus_dma_tag_destroy(txq->packet_dma_tag);
1482
1483fail:
1484	sfxge_dma_free(esmp);
1485
1486	return (rc);
1487}
1488
1489static int
1490sfxge_tx_stat_handler(SYSCTL_HANDLER_ARGS)
1491{
1492	struct sfxge_softc *sc = arg1;
1493	unsigned int id = arg2;
1494	unsigned long sum;
1495	unsigned int index;
1496
1497	/* Sum across all TX queues */
1498	sum = 0;
1499	for (index = 0; index < sc->txq_count; index++)
1500		sum += *(unsigned long *)((caddr_t)sc->txq[index] +
1501					  sfxge_tx_stats[id].offset);
1502
1503	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
1504}
1505
1506static void
1507sfxge_tx_stat_init(struct sfxge_softc *sc)
1508{
1509	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
1510	struct sysctl_oid_list *stat_list;
1511	unsigned int id;
1512
1513	stat_list = SYSCTL_CHILDREN(sc->stats_node);
1514
1515	for (id = 0; id < nitems(sfxge_tx_stats); id++) {
1516		SYSCTL_ADD_PROC(
1517			ctx, stat_list,
1518			OID_AUTO, sfxge_tx_stats[id].name,
1519			CTLTYPE_ULONG|CTLFLAG_RD,
1520			sc, id, sfxge_tx_stat_handler, "LU",
1521			"");
1522	}
1523}
1524
1525void
1526sfxge_tx_fini(struct sfxge_softc *sc)
1527{
1528	int index;
1529
1530	index = sc->txq_count;
1531	while (--index >= 0)
1532		sfxge_tx_qfini(sc, index);
1533
1534	sc->txq_count = 0;
1535}
1536
1537
1538int
1539sfxge_tx_init(struct sfxge_softc *sc)
1540{
1541	struct sfxge_intr *intr;
1542	int index;
1543	int rc;
1544
1545	intr = &sc->intr;
1546
1547	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
1548	    ("intr->state != SFXGE_INTR_INITIALIZED"));
1549
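	/*
	 * One checksum-offload Tx queue per interrupt/event queue, plus the
	 * non-checksum and IP-checksum-only queues shared on event queue 0
	 * (see the theory of operation at the top of this file).
	 */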
1550	sc->txq_count = SFXGE_TXQ_NTYPES - 1 + sc->intr.n_alloc;
1551
1552	sc->txqs_node = SYSCTL_ADD_NODE(
1553		device_get_sysctl_ctx(sc->dev),
1554		SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)),
1555		OID_AUTO, "txq", CTLFLAG_RD, NULL, "Tx queues");
1556	if (sc->txqs_node == NULL) {
1557		rc = ENOMEM;
1558		goto fail_txq_node;
1559	}
1560
1561	/* Initialize the transmit queues */
1562	if ((rc = sfxge_tx_qinit(sc, SFXGE_TXQ_NON_CKSUM,
1563	    SFXGE_TXQ_NON_CKSUM, 0)) != 0)
1564		goto fail;
1565
1566	if ((rc = sfxge_tx_qinit(sc, SFXGE_TXQ_IP_CKSUM,
1567	    SFXGE_TXQ_IP_CKSUM, 0)) != 0)
1568		goto fail2;
1569
1570	for (index = 0;
1571	     index < sc->txq_count - SFXGE_TXQ_NTYPES + 1;
1572	     index++) {
1573		if ((rc = sfxge_tx_qinit(sc, SFXGE_TXQ_NTYPES - 1 + index,
1574		    SFXGE_TXQ_IP_TCP_UDP_CKSUM, index)) != 0)
1575			goto fail3;
1576	}
1577
1578	sfxge_tx_stat_init(sc);
1579
1580	return (0);
1581
1582fail3:
1583	while (--index >= 0)
1584		sfxge_tx_qfini(sc, SFXGE_TXQ_IP_TCP_UDP_CKSUM + index);
1585
1586	sfxge_tx_qfini(sc, SFXGE_TXQ_IP_CKSUM);
1587
1588fail2:
1589	sfxge_tx_qfini(sc, SFXGE_TXQ_NON_CKSUM);
1590
1591fail:
1592fail_txq_node:
1593	sc->txq_count = 0;
1594	return (rc);
1595}
1596