1/**************************************************************************
2
3Copyright (c) 2007-2009, Chelsio Inc.
4All rights reserved.
5
6Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10    this list of conditions and the following disclaimer.
11
12 2. Neither the name of the Chelsio Corporation nor the names of its
13    contributors may be used to endorse or promote products derived from
14    this software without specific prior written permission.
15
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26POSSIBILITY OF SUCH DAMAGE.
27
28***************************************************************************/
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: stable/10/sys/dev/cxgb/cxgb_sge.c 314667 2017-03-04 13:03:31Z avg $");
32
33#include "opt_inet6.h"
34#include "opt_inet.h"
35
36#include <sys/param.h>
37#include <sys/systm.h>
38#include <sys/kernel.h>
39#include <sys/module.h>
40#include <sys/bus.h>
41#include <sys/conf.h>
42#include <machine/bus.h>
43#include <machine/resource.h>
44#include <sys/bus_dma.h>
45#include <sys/rman.h>
46#include <sys/queue.h>
47#include <sys/sysctl.h>
48#include <sys/taskqueue.h>
49
50#include <sys/proc.h>
51#include <sys/sbuf.h>
52#include <sys/sched.h>
53#include <sys/smp.h>
54#include <sys/systm.h>
55#include <sys/syslog.h>
56#include <sys/socket.h>
57#include <sys/sglist.h>
58
59#include <net/bpf.h>
60#include <net/ethernet.h>
61#include <net/if.h>
62#include <net/if_vlan_var.h>
63
64#include <netinet/in_systm.h>
65#include <netinet/in.h>
66#include <netinet/ip.h>
67#include <netinet/ip6.h>
68#include <netinet/tcp.h>
69
70#include <dev/pci/pcireg.h>
71#include <dev/pci/pcivar.h>
72
73#include <vm/vm.h>
74#include <vm/pmap.h>
75
76#include <cxgb_include.h>
77#include <sys/mvec.h>
78
79int	txq_fills = 0;
80int	multiq_tx_enable = 1;
81
82#ifdef TCP_OFFLOAD
83CTASSERT(NUM_CPL_HANDLERS >= NUM_CPL_CMDS);
84#endif
85
86extern struct sysctl_oid_list sysctl__hw_cxgb_children;
87int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE;
88TUNABLE_INT("hw.cxgb.txq_mr_size", &cxgb_txq_buf_ring_size);
89SYSCTL_INT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0,
90    "size of per-queue mbuf ring");
91
92static int cxgb_tx_coalesce_force = 0;
93TUNABLE_INT("hw.cxgb.tx_coalesce_force", &cxgb_tx_coalesce_force);
94SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RW,
95    &cxgb_tx_coalesce_force, 0,
96    "coalesce small packets into a single work request regardless of ring state");
97
98#define	COALESCE_START_DEFAULT		TX_ETH_Q_SIZE>>1
99#define	COALESCE_START_MAX		(TX_ETH_Q_SIZE-(TX_ETH_Q_SIZE>>3))
100#define	COALESCE_STOP_DEFAULT		TX_ETH_Q_SIZE>>2
101#define	COALESCE_STOP_MIN		TX_ETH_Q_SIZE>>5
102#define	TX_RECLAIM_DEFAULT		TX_ETH_Q_SIZE>>5
103#define	TX_RECLAIM_MAX			TX_ETH_Q_SIZE>>2
104#define	TX_RECLAIM_MIN			TX_ETH_Q_SIZE>>6
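/*
 * For illustration only, assuming TX_ETH_Q_SIZE is 1024: coalescing starts
 * by default once 512 descriptors are in use (clamped to at most 896) and
 * stops once the queue drains to 256 (never below 32); the reclaim threshold
 * defaults to 32 and is clamped to the [16, 256] range.
 */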
105
106
107static int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT;
108TUNABLE_INT("hw.cxgb.tx_coalesce_enable_start",
109    &cxgb_tx_coalesce_enable_start);
110SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RW,
111    &cxgb_tx_coalesce_enable_start, 0,
112    "coalesce enable threshold");
113static int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT;
114TUNABLE_INT("hw.cxgb.tx_coalesce_enable_stop", &cxgb_tx_coalesce_enable_stop);
115SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RW,
116    &cxgb_tx_coalesce_enable_stop, 0,
117    "coalesce disable threshold");
118static int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
119TUNABLE_INT("hw.cxgb.tx_reclaim_threshold", &cxgb_tx_reclaim_threshold);
120SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RW,
121    &cxgb_tx_reclaim_threshold, 0,
122    "tx cleaning minimum threshold");
123
124/*
125 * XXX don't re-enable this until TOE stops assuming
126 * we have an m_ext
127 */
128static int recycle_enable = 0;
129
130extern int cxgb_use_16k_clusters;
131extern int nmbjumbop;
132extern int nmbjumbo9;
133extern int nmbjumbo16;
134
135#define USE_GTS 0
136
137#define SGE_RX_SM_BUF_SIZE	1536
138#define SGE_RX_DROP_THRES	16
139#define SGE_RX_COPY_THRES	128
140
141/*
142 * Period of the Tx buffer reclaim timer.  This timer does not need to run
143 * frequently as Tx buffers are usually reclaimed by new Tx packets.
144 */
145#define TX_RECLAIM_PERIOD       (hz >> 1)
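/* hz >> 1 ticks is half a second, i.e. the reclaim timer fires roughly twice per second. */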
146
147/*
148 * Values for sge_txq.flags
149 */
150enum {
151	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
152	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
153};
154
155struct tx_desc {
156	uint64_t	flit[TX_DESC_FLITS];
157} __packed;
158
159struct rx_desc {
160	uint32_t	addr_lo;
161	uint32_t	len_gen;
162	uint32_t	gen2;
163	uint32_t	addr_hi;
164} __packed;
165
166struct rsp_desc {               /* response queue descriptor */
167	struct rss_header	rss_hdr;
168	uint32_t		flags;
169	uint32_t		len_cq;
170	uint8_t			imm_data[47];
171	uint8_t			intr_gen;
172} __packed;
173
174#define RX_SW_DESC_MAP_CREATED	(1 << 0)
175#define TX_SW_DESC_MAP_CREATED	(1 << 1)
176#define RX_SW_DESC_INUSE        (1 << 3)
177#define TX_SW_DESC_MAPPED       (1 << 4)
178
179#define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
180#define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
181#define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
182#define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
183
184struct tx_sw_desc {                /* SW state per Tx descriptor */
185	struct mbuf	*m;
186	bus_dmamap_t	map;
187	int		flags;
188};
189
190struct rx_sw_desc {                /* SW state per Rx descriptor */
191	caddr_t		rxsd_cl;
192	struct mbuf	*m;
193	bus_dmamap_t	map;
194	int		flags;
195};
196
197struct txq_state {
198	unsigned int	compl;
199	unsigned int	gen;
200	unsigned int	pidx;
201};
202
203struct refill_fl_cb_arg {
204	int               error;
205	bus_dma_segment_t seg;
206	int               nseg;
207};
208
209
210/*
211 * Maps a number of flits to the number of Tx descriptors that can hold them.
212 * The formula is
213 *
214 * desc = 1 + (flits - 2) / (WR_FLITS - 1).
215 *
216 * HW allows up to 4 descriptors to be combined into a WR.
217 */
218static uint8_t flit_desc_map[] = {
219	0,
220#if SGE_NUM_GENBITS == 1
221	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
222	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
223	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
224	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
225#elif SGE_NUM_GENBITS == 2
226	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
227	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
228	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
229	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
230#else
231# error "SGE_NUM_GENBITS must be 1 or 2"
232#endif
233};
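/*
 * Worked example (reading the SGE_NUM_GENBITS == 2 table, which implies
 * WR_FLITS is 15 there): a work request of 20 flits needs
 * 1 + (20 - 2) / (15 - 1) = 2 descriptors, matching flit_desc_map[20] == 2.
 */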
234
235#define	TXQ_LOCK_ASSERT(qs)	mtx_assert(&(qs)->lock, MA_OWNED)
236#define	TXQ_TRYLOCK(qs)		mtx_trylock(&(qs)->lock)
237#define	TXQ_LOCK(qs)		mtx_lock(&(qs)->lock)
238#define	TXQ_UNLOCK(qs)		mtx_unlock(&(qs)->lock)
239#define	TXQ_RING_EMPTY(qs)	drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
240#define	TXQ_RING_NEEDS_ENQUEUE(qs)					\
241	drbr_needs_enqueue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
242#define	TXQ_RING_FLUSH(qs)	drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
243#define	TXQ_RING_DEQUEUE_COND(qs, func, arg)				\
244	drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg)
245#define	TXQ_RING_DEQUEUE(qs) \
246	drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
247
248int cxgb_debug = 0;
249
250static void sge_timer_cb(void *arg);
251static void sge_timer_reclaim(void *arg, int ncount);
252static void sge_txq_reclaim_handler(void *arg, int ncount);
253static void cxgb_start_locked(struct sge_qset *qs);
254
255/*
256 * XXX need to cope with bursty scheduling by looking at a wider
257 * window than we do now when determining the need for coalescing
258 *
259 */
260static __inline uint64_t
261check_pkt_coalesce(struct sge_qset *qs)
262{
263        struct adapter *sc;
264        struct sge_txq *txq;
265	uint8_t *fill;
266
267	if (__predict_false(cxgb_tx_coalesce_force))
268		return (1);
269	txq = &qs->txq[TXQ_ETH];
270        sc = qs->port->adapter;
271	fill = &sc->tunq_fill[qs->idx];
272
273	if (cxgb_tx_coalesce_enable_start > COALESCE_START_MAX)
274		cxgb_tx_coalesce_enable_start = COALESCE_START_MAX;
275	if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN)
276		cxgb_tx_coalesce_enable_stop = COALESCE_STOP_MIN;
277	/*
278	 * Once the hardware transmit queue fills past the (tunable) enable-start
279	 * threshold we mark the qset as coalescing; we drop back out of coalescing
280	 * only after it drains below the enable-stop threshold and there are no
281	 * packets enqueued, which provides some degree of hysteresis.
282	 */
283        if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
284	    TXQ_RING_EMPTY(qs) && (qs->coalescing == 0))
285                *fill = 0;
286        else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start))
287                *fill = 1;
288
289	return (sc->tunq_coalesce);
290}
291
292#ifdef __LP64__
293static void
294set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
295{
296	uint64_t wr_hilo;
297#if _BYTE_ORDER == _LITTLE_ENDIAN
298	wr_hilo = wr_hi;
299	wr_hilo |= (((uint64_t)wr_lo)<<32);
300#else
301	wr_hilo = wr_lo;
302	wr_hilo |= (((uint64_t)wr_hi)<<32);
303#endif
304	wrp->wrh_hilo = wr_hilo;
305}
306#else
307static void
308set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
309{
310
311	wrp->wrh_hi = wr_hi;
312	wmb();
313	wrp->wrh_lo = wr_lo;
314}
315#endif
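/*
 * Note on the two set_wr_hdr() variants above: on LP64 hosts the header is
 * published with a single 64-bit store (byte order chosen so wr_hi lands in
 * wrh_hi), so the SGE never sees a half-written work request header.  On
 * 32-bit hosts wrh_hi is stored first and the write barrier orders it ahead
 * of wrh_lo.
 */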
316
317struct coalesce_info {
318	int count;
319	int nbytes;
320};
321
322static int
323coalesce_check(struct mbuf *m, void *arg)
324{
325	struct coalesce_info *ci = arg;
326	int *count = &ci->count;
327	int *nbytes = &ci->nbytes;
328
329	if ((*nbytes == 0) || ((*nbytes + m->m_len <= 10500) &&
330		(*count < 7) && (m->m_next == NULL))) {
331		*count += 1;
332		*nbytes += m->m_len;
333		return (1);
334	}
335	return (0);
336}
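/*
 * coalesce_check() thus admits at most 7 packets and roughly 10500 bytes per
 * batch, and only packets that are contained in a single mbuf; this mirrors
 * the 7-entry cpl_tx_pkt_batch work request that t3_encap() builds for a
 * coalesced chain.
 */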
337
338static struct mbuf *
339cxgb_dequeue(struct sge_qset *qs)
340{
341	struct mbuf *m, *m_head, *m_tail;
342	struct coalesce_info ci;
343
344
345	if (check_pkt_coalesce(qs) == 0)
346		return TXQ_RING_DEQUEUE(qs);
347
348	m_head = m_tail = NULL;
349	ci.count = ci.nbytes = 0;
350	do {
351		m = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci);
352		if (m_head == NULL) {
353			m_tail = m_head = m;
354		} else if (m != NULL) {
355			m_tail->m_nextpkt = m;
356			m_tail = m;
357		}
358	} while (m != NULL);
359	if (ci.count > 7)
360		panic("trying to coalesce %d packets into one WR", ci.count);
361	return (m_head);
362}
363
364/**
365 *	reclaim_completed_tx - reclaims completed Tx descriptors
366 *	@qs: the queue set whose Tx queue is to be reclaimed
367 *	@reclaim_min: skip the reclaim unless at least this many descriptors are reclaimable
368 *	@queue: the index of the Tx queue within the queue set
369 *
370 *	Reclaims Tx descriptors that the SGE has indicated it has processed,
371 *	and frees the associated buffers if possible.  Called with the queue set's lock held.
372 */
373static __inline int
374reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue)
375{
376	struct sge_txq *q = &qs->txq[queue];
377	int reclaim = desc_reclaimable(q);
378
379	if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) ||
380	    (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN))
381		cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
382
383	if (reclaim < reclaim_min)
384		return (0);
385
386	mtx_assert(&qs->lock, MA_OWNED);
387	if (reclaim > 0) {
388		t3_free_tx_desc(qs, reclaim, queue);
389		q->cleaned += reclaim;
390		q->in_use -= reclaim;
391	}
392	if (isset(&qs->txq_stopped, TXQ_ETH))
393                clrbit(&qs->txq_stopped, TXQ_ETH);
394
395	return (reclaim);
396}
397
398/**
399 *	should_restart_tx - are there enough resources to restart a Tx queue?
400 *	@q: the Tx queue
401 *
402 *	Checks if there are enough descriptors to restart a suspended Tx queue.
403 */
404static __inline int
405should_restart_tx(const struct sge_txq *q)
406{
407	unsigned int r = q->processed - q->cleaned;
408
409	return q->in_use - r < (q->size >> 1);
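	/*
	 * Resume only if, after the descriptors the SGE has already processed
	 * are reclaimed, less than half of the queue would remain in use.
	 */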
410}
411
412/**
413 *	t3_sge_init - initialize SGE
414 *	@adap: the adapter
415 *	@p: the SGE parameters
416 *
417 *	Performs SGE initialization needed every time after a chip reset.
418 *	We do not initialize any of the queue sets here; instead the driver
419 *	top-level must request those individually.  We also do not enable DMA
420 *	here, that should be done after the queues have been set up.
421 */
422void
423t3_sge_init(adapter_t *adap, struct sge_params *p)
424{
425	u_int ctrl, ups;
426
427	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
428
429	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
430	       F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
431	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
432	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
433#if SGE_NUM_GENBITS == 1
434	ctrl |= F_EGRGENCTRL;
435#endif
436	if (adap->params.rev > 0) {
437		if (!(adap->flags & (USING_MSIX | USING_MSI)))
438			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
439	}
440	t3_write_reg(adap, A_SG_CONTROL, ctrl);
441	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
442		     V_LORCQDRBTHRSH(512));
443	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
444	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
445		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
446	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
447		     adap->params.rev < T3_REV_C ? 1000 : 500);
448	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
449	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
450	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
451	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
452	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
453}
454
455
456/**
457 *	sgl_len - calculates the size of an SGL of the given capacity
458 *	@n: the number of SGL entries
459 *
460 *	Calculates the number of flits needed for a scatter/gather list that
461 *	can hold the given number of entries.
462 */
463static __inline unsigned int
464sgl_len(unsigned int n)
465{
466	return ((3 * n) / 2 + (n & 1));
467}
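/*
 * Worked example: an SGL entry is an 8-byte address plus a 4-byte length, so
 * a pair of entries packs into three 8-byte flits (see make_sgl() below);
 * 5 entries therefore need (3 * 5) / 2 + 1 = 8 flits.
 */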
468
469/**
470 *	get_imm_packet - copy an immediate-data packet out of a response
471 *	@sc: the adapter
472 *	@resp: the response descriptor containing the packet data
473 *	@m: the mbuf that receives a copy of the response's immediate data
474 */
475static int
476get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m)
477{
478
479	if (resp->rss_hdr.opcode == CPL_RX_DATA) {
480		const struct cpl_rx_data *cpl = (const void *)&resp->imm_data[0];
481		m->m_len = sizeof(*cpl) + ntohs(cpl->len);
482	} else if (resp->rss_hdr.opcode == CPL_RX_PKT) {
483		const struct cpl_rx_pkt *cpl = (const void *)&resp->imm_data[0];
484		m->m_len = sizeof(*cpl) + ntohs(cpl->len);
485	} else
486		m->m_len = IMMED_PKT_SIZE;
487	m->m_ext.ext_buf = NULL;
488	m->m_ext.ext_type = 0;
489	memcpy(mtod(m, uint8_t *), resp->imm_data, m->m_len);
490	return (0);
491}
492
493static __inline u_int
494flits_to_desc(u_int n)
495{
496	return (flit_desc_map[n]);
497}
498
499#define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
500		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
501		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
502		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
503		    F_HIRCQPARITYERROR)
504#define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
505#define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
506		      F_RSPQDISABLED)
507
508/**
509 *	t3_sge_err_intr_handler - SGE async event interrupt handler
510 *	@adapter: the adapter
511 *
512 *	Interrupt handler for SGE asynchronous (non-data) events.
513 */
514void
515t3_sge_err_intr_handler(adapter_t *adapter)
516{
517	unsigned int v, status;
518
519	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
520	if (status & SGE_PARERR)
521		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
522			 status & SGE_PARERR);
523	if (status & SGE_FRAMINGERR)
524		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
525			 status & SGE_FRAMINGERR);
526	if (status & F_RSPQCREDITOVERFOW)
527		CH_ALERT(adapter, "SGE response queue credit overflow\n");
528
529	if (status & F_RSPQDISABLED) {
530		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
531
532		CH_ALERT(adapter,
533			 "packet delivered to disabled response queue (0x%x)\n",
534			 (v >> S_RSPQ0DISABLED) & 0xff);
535	}
536
537	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
538	if (status & SGE_FATALERR)
539		t3_fatal_err(adapter);
540}
541
542void
543t3_sge_prep(adapter_t *adap, struct sge_params *p)
544{
545	int i, nqsets, fl_q_size, jumbo_q_size, use_16k, jumbo_buf_size;
546
547	nqsets = min(SGE_QSETS / adap->params.nports, mp_ncpus);
548	nqsets *= adap->params.nports;
549
550	fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE);
551
552	while (!powerof2(fl_q_size))
553		fl_q_size--;
554
555	use_16k = cxgb_use_16k_clusters != -1 ? cxgb_use_16k_clusters :
556	    is_offload(adap);
557
558#if __FreeBSD_version >= 700111
559	if (use_16k) {
560		jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE);
561		jumbo_buf_size = MJUM16BYTES;
562	} else {
563		jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE);
564		jumbo_buf_size = MJUM9BYTES;
565	}
566#else
567	jumbo_q_size = min(nmbjumbop/(3*nqsets), JUMBO_Q_SIZE);
568	jumbo_buf_size = MJUMPAGESIZE;
569#endif
570	while (!powerof2(jumbo_q_size))
571		jumbo_q_size--;
572
573	if (fl_q_size < (FL_Q_SIZE / 4) || jumbo_q_size < (JUMBO_Q_SIZE / 2))
574		device_printf(adap->dev,
575		    "Insufficient clusters and/or jumbo buffers.\n");
576
577	p->max_pkt_size = jumbo_buf_size - sizeof(struct cpl_rx_data);
578
579	for (i = 0; i < SGE_QSETS; ++i) {
580		struct qset_params *q = p->qset + i;
581
582		if (adap->params.nports > 2) {
583			q->coalesce_usecs = 50;
584		} else {
585#ifdef INVARIANTS
586			q->coalesce_usecs = 10;
587#else
588			q->coalesce_usecs = 5;
589#endif
590		}
591		q->polling = 0;
592		q->rspq_size = RSPQ_Q_SIZE;
593		q->fl_size = fl_q_size;
594		q->jumbo_size = jumbo_q_size;
595		q->jumbo_buf_size = jumbo_buf_size;
596		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
597		q->txq_size[TXQ_OFLD] = is_offload(adap) ? TX_OFLD_Q_SIZE : 16;
598		q->txq_size[TXQ_CTRL] = TX_CTRL_Q_SIZE;
599		q->cong_thres = 0;
600	}
601}
602
603int
604t3_sge_alloc(adapter_t *sc)
605{
606
607	/* The parent tag. */
608	if (bus_dma_tag_create( bus_get_dma_tag(sc->dev),/* PCI parent */
609				1, 0,			/* algnmnt, boundary */
610				BUS_SPACE_MAXADDR,	/* lowaddr */
611				BUS_SPACE_MAXADDR,	/* highaddr */
612				NULL, NULL,		/* filter, filterarg */
613				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
614				BUS_SPACE_UNRESTRICTED, /* nsegments */
615				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
616				0,			/* flags */
617				NULL, NULL,		/* lock, lockarg */
618				&sc->parent_dmat)) {
619		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
620		return (ENOMEM);
621	}
622
623	/*
624	 * DMA tag for normal sized RX frames
625	 */
626	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
627		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
628		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
629		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
630		return (ENOMEM);
631	}
632
633	/*
634	 * DMA tag for jumbo sized RX frames.
635	 */
636	if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR,
637		BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES,
638		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
639		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
640		return (ENOMEM);
641	}
642
643	/*
644	 * DMA tag for TX frames.
645	 */
646	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
647		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
648		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
649		NULL, NULL, &sc->tx_dmat)) {
650		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
651		return (ENOMEM);
652	}
653
654	return (0);
655}
656
657int
658t3_sge_free(struct adapter * sc)
659{
660
661	if (sc->tx_dmat != NULL)
662		bus_dma_tag_destroy(sc->tx_dmat);
663
664	if (sc->rx_jumbo_dmat != NULL)
665		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
666
667	if (sc->rx_dmat != NULL)
668		bus_dma_tag_destroy(sc->rx_dmat);
669
670	if (sc->parent_dmat != NULL)
671		bus_dma_tag_destroy(sc->parent_dmat);
672
673	return (0);
674}
675
676void
677t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
678{
679
680	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);
681	qs->rspq.polling = 0 /* p->polling */;
682}
683
684#if !defined(__i386__) && !defined(__amd64__)
685static void
686refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
687{
688	struct refill_fl_cb_arg *cb_arg = arg;
689
690	cb_arg->error = error;
691	cb_arg->seg = segs[0];
692	cb_arg->nseg = nseg;
693
694}
695#endif
696/**
697 *	refill_fl - refill an SGE free-buffer list
698 *	@sc: the controller softc
699 *	@q: the free-list to refill
700 *	@n: the number of new buffers to allocate
701 *
702 *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
703 *	The caller must ensure that @n does not exceed the queue's capacity.
704 */
705static void
706refill_fl(adapter_t *sc, struct sge_fl *q, int n)
707{
708	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
709	struct rx_desc *d = &q->desc[q->pidx];
710	struct refill_fl_cb_arg cb_arg;
711	struct mbuf *m;
712	caddr_t cl;
713	int err;
714
715	cb_arg.error = 0;
716	while (n--) {
717		/*
718		 * We allocate an uninitialized mbuf + cluster; the mbuf is
719		 * initialized after rx.
720		 */
721		if (q->zone == zone_pack) {
722			if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL)
723				break;
724			cl = m->m_ext.ext_buf;
725		} else {
726			if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL)
727				break;
728			if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
729				uma_zfree(q->zone, cl);
730				break;
731			}
732		}
733		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
734			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
735				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
736				uma_zfree(q->zone, cl);
737				goto done;
738			}
739			sd->flags |= RX_SW_DESC_MAP_CREATED;
740		}
741#if !defined(__i386__) && !defined(__amd64__)
742		err = bus_dmamap_load(q->entry_tag, sd->map,
743		    cl, q->buf_size, refill_fl_cb, &cb_arg, 0);
744
745		if (err != 0 || cb_arg.error) {
746			if (q->zone == zone_pack)
747				uma_zfree(q->zone, cl);
748			m_free(m);
749			goto done;
750		}
751#else
752		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl);
753#endif
754		sd->flags |= RX_SW_DESC_INUSE;
755		sd->rxsd_cl = cl;
756		sd->m = m;
757		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
758		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
759		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
760		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
761
762		d++;
763		sd++;
764
765		if (++q->pidx == q->size) {
766			q->pidx = 0;
767			q->gen ^= 1;
768			sd = q->sdesc;
769			d = q->desc;
770		}
771		q->credits++;
772		q->db_pending++;
773	}
774
775done:
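	/*
	 * Ring the free-list doorbell only once at least 32 new buffers have
	 * been posted since the last ring, to limit doorbell writes.
	 */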
776	if (q->db_pending >= 32) {
777		q->db_pending = 0;
778		t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
779	}
780}
781
782
783/**
784 *	free_rx_bufs - free the Rx buffers on an SGE free list
785 *	@sc: the controller softc
786 *	@q: the SGE free list to clean up
787 *
788 *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
789 *	this queue should be stopped before calling this function.
790 */
791static void
792free_rx_bufs(adapter_t *sc, struct sge_fl *q)
793{
794	u_int cidx = q->cidx;
795
796	while (q->credits--) {
797		struct rx_sw_desc *d = &q->sdesc[cidx];
798
799		if (d->flags & RX_SW_DESC_INUSE) {
800			bus_dmamap_unload(q->entry_tag, d->map);
801			bus_dmamap_destroy(q->entry_tag, d->map);
802			if (q->zone == zone_pack) {
803				m_init(d->m, zone_pack, MCLBYTES,
804				    M_NOWAIT, MT_DATA, M_EXT);
805				uma_zfree(zone_pack, d->m);
806			} else {
807				m_init(d->m, zone_mbuf, MLEN,
808				    M_NOWAIT, MT_DATA, 0);
809				uma_zfree(zone_mbuf, d->m);
810				uma_zfree(q->zone, d->rxsd_cl);
811			}
812		}
813
814		d->rxsd_cl = NULL;
815		d->m = NULL;
816		if (++cidx == q->size)
817			cidx = 0;
818	}
819}
820
821static __inline void
822__refill_fl(adapter_t *adap, struct sge_fl *fl)
823{
824	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
825}
826
827static __inline void
828__refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
829{
830	uint32_t reclaimable = fl->size - fl->credits;
831
832	if (reclaimable > 0)
833		refill_fl(adap, fl, min(max, reclaimable));
834}
835
836/**
837 *	recycle_rx_buf - recycle a receive buffer
838 *	@adapter: the adapter
839 *	@q: the SGE free list
840 *	@idx: index of buffer to recycle
841 *
842 *	Recycles the specified buffer on the given free list by adding it at
843 *	the next available slot on the list.
844 */
845static void
846recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
847{
848	struct rx_desc *from = &q->desc[idx];
849	struct rx_desc *to   = &q->desc[q->pidx];
850
851	q->sdesc[q->pidx] = q->sdesc[idx];
852	to->addr_lo = from->addr_lo;        // already big endian
853	to->addr_hi = from->addr_hi;        // likewise
854	wmb();	/* necessary ? */
855	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
856	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
857	q->credits++;
858
859	if (++q->pidx == q->size) {
860		q->pidx = 0;
861		q->gen ^= 1;
862	}
863	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
864}
865
866static void
867alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
868{
869	uint32_t *addr;
870
871	addr = arg;
872	*addr = segs[0].ds_addr;
873}
874
875static int
876alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
877    bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
878    bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
879{
880	size_t len = nelem * elem_size;
881	void *s = NULL;
882	void *p = NULL;
883	int err;
884
885	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
886				      BUS_SPACE_MAXADDR_32BIT,
887				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
888				      len, 0, NULL, NULL, tag)) != 0) {
889		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
890		return (ENOMEM);
891	}
892
893	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
894				    map)) != 0) {
895		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
896		return (ENOMEM);
897	}
898
899	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
900	bzero(p, len);
901	*(void **)desc = p;
902
903	if (sw_size) {
904		len = nelem * sw_size;
905		s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
906		*(void **)sdesc = s;
907	}
908	if (parent_entry_tag == NULL)
909		return (0);
910
911	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
912				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
913		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
914				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
915		                      NULL, NULL, entry_tag)) != 0) {
916		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
917		return (ENOMEM);
918	}
919	return (0);
920}
921
922static void
923sge_slow_intr_handler(void *arg, int ncount)
924{
925	adapter_t *sc = arg;
926
927	t3_slow_intr_handler(sc);
928	t3_write_reg(sc, A_PL_INT_ENABLE0, sc->slow_intr_mask);
929	(void) t3_read_reg(sc, A_PL_INT_ENABLE0);
930}
931
932/**
933 *	sge_timer_cb - perform periodic maintenance of the adapter's SGE qsets
934 *	@arg: the adapter
935 *
936 *	Runs periodically from a timer to perform maintenance of the adapter's
937 *	SGE queue sets.  It performs the following tasks:
938 *
939 *	a) Cleans up any completed Tx descriptors that may still be pending.
940 *	Normal descriptor cleanup happens when new packets are added to a Tx
941 *	queue so this timer is relatively infrequent and does any cleanup only
942 *	if the Tx queue has not seen any new packets in a while.  We make a
943 *	best effort attempt to reclaim descriptors, in that we don't wait
944 *	around if we cannot get a queue's lock (which most likely is because
945 *	someone else is queueing new packets and so will also handle the clean
946 *	up).  Since control queues use immediate data exclusively we don't
947 *	bother cleaning them up here.
948 *
949 *	b) Replenishes Rx queues that have run out due to memory shortage.
950 *	Normally new Rx buffers are added when existing ones are consumed but
951 *	when out of memory a queue can become empty.  We try to add only a few
952 *	buffers here, the queue will be replenished fully as these new buffers
953 *	are used up if memory shortage has subsided.
954 *
955 *	c) Return coalesced response queue credits in case a response queue is
956 *	starved.
957 *
958 *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell
959 *	fifo overflows and the FW doesn't implement any recovery scheme yet.
960 */
961static void
962sge_timer_cb(void *arg)
963{
964	adapter_t *sc = arg;
965	if ((sc->flags & USING_MSIX) == 0) {
966
967		struct port_info *pi;
968		struct sge_qset *qs;
969		struct sge_txq  *txq;
970		int i, j;
971		int reclaim_ofl, refill_rx;
972
973		if (sc->open_device_map == 0)
974			return;
975
976		for (i = 0; i < sc->params.nports; i++) {
977			pi = &sc->port[i];
978			for (j = 0; j < pi->nqsets; j++) {
979				qs = &sc->sge.qs[pi->first_qset + j];
980				txq = &qs->txq[0];
981				reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
982				refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
983				    (qs->fl[1].credits < qs->fl[1].size));
984				if (reclaim_ofl || refill_rx) {
985					taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task);
986					break;
987				}
988			}
989		}
990	}
991
992	if (sc->params.nports > 2) {
993		int i;
994
995		for_each_port(sc, i) {
996			struct port_info *pi = &sc->port[i];
997
998			t3_write_reg(sc, A_SG_KDOORBELL,
999				     F_SELEGRCNTX |
1000				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
1001		}
1002	}
1003	if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) &&
1004	    sc->open_device_map != 0)
1005		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1006}
1007
1008/*
1009 * This is meant to be a catch-all function to keep sge state private
1010 * to sge.c
1011 *
1012 */
1013int
1014t3_sge_init_adapter(adapter_t *sc)
1015{
1016	callout_init(&sc->sge_timer_ch, 1);
1017	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1018	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
1019	return (0);
1020}
1021
1022int
1023t3_sge_reset_adapter(adapter_t *sc)
1024{
1025	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1026	return (0);
1027}
1028
1029int
1030t3_sge_init_port(struct port_info *pi)
1031{
1032	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
1033	return (0);
1034}
1035
1036/**
1037 *	refill_rspq - replenish an SGE response queue
1038 *	@adapter: the adapter
1039 *	@q: the response queue to replenish
1040 *	@credits: how many new responses to make available
1041 *
1042 *	Replenishes a response queue by making the supplied number of responses
1043 *	available to HW.
1044 */
1045static __inline void
1046refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
1047{
1048
1049	/* mbufs are allocated on demand when a rspq entry is processed. */
1050	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
1051		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
1052}
1053
1054static void
1055sge_txq_reclaim_handler(void *arg, int ncount)
1056{
1057	struct sge_qset *qs = arg;
1058	int i;
1059
1060	for (i = 0; i < 3; i++)
1061		reclaim_completed_tx(qs, 16, i);
1062}
1063
1064static void
1065sge_timer_reclaim(void *arg, int ncount)
1066{
1067	struct port_info *pi = arg;
1068	int i, nqsets = pi->nqsets;
1069	adapter_t *sc = pi->adapter;
1070	struct sge_qset *qs;
1071	struct mtx *lock;
1072
1073	KASSERT((sc->flags & USING_MSIX) == 0,
1074	    ("can't call timer reclaim for msi-x"));
1075
1076	for (i = 0; i < nqsets; i++) {
1077		qs = &sc->sge.qs[pi->first_qset + i];
1078
1079		reclaim_completed_tx(qs, 16, TXQ_OFLD);
1080		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
1081			    &sc->sge.qs[0].rspq.lock;
1082
1083		if (mtx_trylock(lock)) {
1084			/* XXX currently assume that we are *NOT* polling */
1085			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
1086
1087			if (qs->fl[0].credits < qs->fl[0].size - 16)
1088				__refill_fl(sc, &qs->fl[0]);
1089			if (qs->fl[1].credits < qs->fl[1].size - 16)
1090				__refill_fl(sc, &qs->fl[1]);
1091
1092			if (status & (1 << qs->rspq.cntxt_id)) {
1093				if (qs->rspq.credits) {
1094					refill_rspq(sc, &qs->rspq, 1);
1095					qs->rspq.credits--;
1096					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
1097					    1 << qs->rspq.cntxt_id);
1098				}
1099			}
1100			mtx_unlock(lock);
1101		}
1102	}
1103}
1104
1105/**
1106 *	init_qset_cntxt - initialize an SGE queue set context info
1107 *	@qs: the queue set
1108 *	@id: the queue set id
1109 *
1110 *	Initializes the TIDs and context ids for the queues of a queue set.
1111 */
1112static void
1113init_qset_cntxt(struct sge_qset *qs, u_int id)
1114{
1115
1116	qs->rspq.cntxt_id = id;
1117	qs->fl[0].cntxt_id = 2 * id;
1118	qs->fl[1].cntxt_id = 2 * id + 1;
1119	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
1120	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
1121	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
1122	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
1123	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
1124
1125	mbufq_init(&qs->txq[TXQ_ETH].sendq);
1126	mbufq_init(&qs->txq[TXQ_OFLD].sendq);
1127	mbufq_init(&qs->txq[TXQ_CTRL].sendq);
1128}
1129
1130
1131static void
1132txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
1133{
1134	txq->in_use += ndesc;
1135	/*
1136	 * XXX we don't handle stopping of queue
1137	 * presumably start handles this when we bump against the end
1138	 */
1139	txqs->gen = txq->gen;
1140	txq->unacked += ndesc;
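	/*
	 * Ask the SGE for a work-request completion whenever the running count
	 * of unacknowledged descriptors crosses a multiple of 32: bit 5 of the
	 * count is shifted into the WR_COMPL position and the count is then
	 * folded back below 32.
	 */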
1141	txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5);
1142	txq->unacked &= 31;
1143	txqs->pidx = txq->pidx;
1144	txq->pidx += ndesc;
1145#ifdef INVARIANTS
1146	if (((txqs->pidx > txq->cidx) &&
1147		(txq->pidx < txqs->pidx) &&
1148		(txq->pidx >= txq->cidx)) ||
1149	    ((txqs->pidx < txq->cidx) &&
1150		(txq->pidx >= txq-> cidx)) ||
1151	    ((txqs->pidx < txq->cidx) &&
1152		(txq->cidx < txqs->pidx)))
1153		panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
1154		    txqs->pidx, txq->pidx, txq->cidx);
1155#endif
1156	if (txq->pidx >= txq->size) {
1157		txq->pidx -= txq->size;
1158		txq->gen ^= 1;
1159	}
1160
1161}
1162
1163/**
1164 *	calc_tx_descs - calculate the number of Tx descriptors for a packet
1165 *	@m: the packet mbufs
1166 *      @nsegs: the number of segments
1167 *
1168 * 	Returns the number of Tx descriptors needed for the given Ethernet
1169 * 	packet.  Ethernet packets require addition of WR and CPL headers.
1170 */
1171static __inline unsigned int
1172calc_tx_descs(const struct mbuf *m, int nsegs)
1173{
1174	unsigned int flits;
1175
1176	if (m->m_pkthdr.len <= PIO_LEN)
1177		return 1;
1178
1179	flits = sgl_len(nsegs) + 2;
1180	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1181		flits++;
1182
1183	return flits_to_desc(flits);
1184}
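/*
 * Example of the arithmetic above (segment count assumed for illustration):
 * a TSO packet larger than PIO_LEN that maps to 4 DMA segments needs
 * sgl_len(4) = 6 SGL flits plus 2 WR/CPL flits plus 1 LSO flit, i.e. 9 flits,
 * which flits_to_desc() maps to a single descriptor.
 */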
1185
1186/**
1187 *	make_sgl - populate a scatter/gather list for a packet
1188 *	@sgp: the SGL to populate
1189 *	@segs: the packet dma segments
1190 *	@nsegs: the number of segments
1191 *
1192 *	Generates a scatter/gather list for the buffers that make up a packet
1193 *	in the hardware's packed format (two entries per three flits).  The
1194 *	caller must size the SGL appropriately; see sgl_len().
1195 */
1196static __inline void
1197make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1198{
1199	int i, idx;
1200
1201	for (idx = 0, i = 0; i < nsegs; i++) {
1202		/*
1203		 * firmware doesn't like empty segments
1204		 */
1205		if (segs[i].ds_len == 0)
1206			continue;
1207		if (i && idx == 0)
1208			++sgp;
1209
1210		sgp->len[idx] = htobe32(segs[i].ds_len);
1211		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1212		idx ^= 1;
1213	}
1214
1215	if (idx) {
1216		sgp->len[idx] = 0;
1217		sgp->addr[idx] = 0;
1218	}
1219}
1220
1221/**
1222 *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1223 *	@adap: the adapter
1224 *	@q: the Tx queue
1225 *
1226 *	Ring the doorbell if a Tx queue is asleep.  There is a natural race,
1227 *	where the HW may go to sleep just after we check; if that happens,
1228 *	the interrupt handler will detect the outstanding TX packet
1229 *	and ring the doorbell for us.
1230 *
1231 *	When GTS is disabled we unconditionally ring the doorbell.
1232 */
1233static __inline void
1234check_ring_tx_db(adapter_t *adap, struct sge_txq *q, int mustring)
1235{
1236#if USE_GTS
1237	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1238	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1239		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1240#ifdef T3_TRACE
1241		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1242			  q->cntxt_id);
1243#endif
1244		t3_write_reg(adap, A_SG_KDOORBELL,
1245			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1246	}
1247#else
1248	if (mustring || ++q->db_pending >= 32) {
1249		wmb();            /* write descriptors before telling HW */
1250		t3_write_reg(adap, A_SG_KDOORBELL,
1251		    F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1252		q->db_pending = 0;
1253	}
1254#endif
1255}
1256
1257static __inline void
1258wr_gen2(struct tx_desc *d, unsigned int gen)
1259{
1260#if SGE_NUM_GENBITS == 2
1261	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1262#endif
1263}
1264
1265/**
1266 *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1267 *	@ndesc: number of Tx descriptors spanned by the SGL
1268 *	@txd: first Tx descriptor to be written
1269 *	@txqs: txq state (generation and producer index)
1270 *	@txq: the SGE Tx queue
1271 *	@sgl: the SGL
1272 *	@flits: number of flits to the start of the SGL in the first descriptor
1273 *	@sgl_flits: the SGL size in flits
1274 *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1275 *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1276 *
1277 *	Write a work request header and an associated SGL.  If the SGL is
1278 *	small enough to fit into one Tx descriptor it has already been written
1279 *	and we just need to write the WR header.  Otherwise we distribute the
1280 *	SGL across the number of descriptors it spans.
1281 */
1282static void
1283write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1284    const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1285    unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1286{
1287
1288	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1289	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1290
1291	if (__predict_true(ndesc == 1)) {
1292		set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1293		    V_WR_SGLSFLT(flits)) | wr_hi,
1294		    htonl(V_WR_LEN(flits + sgl_flits) | V_WR_GEN(txqs->gen)) |
1295		    wr_lo);
1296
1297		wr_gen2(txd, txqs->gen);
1298
1299	} else {
1300		unsigned int ogen = txqs->gen;
1301		const uint64_t *fp = (const uint64_t *)sgl;
1302		struct work_request_hdr *wp = wrp;
1303
1304		wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1305		    V_WR_SGLSFLT(flits)) | wr_hi;
1306
1307		while (sgl_flits) {
1308			unsigned int avail = WR_FLITS - flits;
1309
1310			if (avail > sgl_flits)
1311				avail = sgl_flits;
1312			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1313			sgl_flits -= avail;
1314			ndesc--;
1315			if (!sgl_flits)
1316				break;
1317
1318			fp += avail;
1319			txd++;
1320			txsd++;
1321			if (++txqs->pidx == txq->size) {
1322				txqs->pidx = 0;
1323				txqs->gen ^= 1;
1324				txd = txq->desc;
1325				txsd = txq->sdesc;
1326			}
1327
1328			/*
1329			 * when the head of the mbuf chain
1330			 * is freed all clusters will be freed
1331			 * with it
1332			 */
1333			wrp = (struct work_request_hdr *)txd;
1334			wrp->wrh_hi = htonl(V_WR_DATATYPE(1) |
1335			    V_WR_SGLSFLT(1)) | wr_hi;
1336			wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS,
1337				    sgl_flits + 1)) |
1338			    V_WR_GEN(txqs->gen)) | wr_lo;
1339			wr_gen2(txd, txqs->gen);
1340			flits = 1;
1341		}
1342		wrp->wrh_hi |= htonl(F_WR_EOP);
1343		wmb();
1344		wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1345		wr_gen2((struct tx_desc *)wp, ogen);
1346	}
1347}
1348
1349/* sizeof(*eh) + sizeof(*ip) + sizeof(*tcp) */
1350#define TCPPKTHDRSIZE (ETHER_HDR_LEN + 20 + 20)
1351
1352#define GET_VTAG(cntrl, m) \
1353do { \
1354	if ((m)->m_flags & M_VLANTAG)					            \
1355		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
1356} while (0)
1357
1358static int
1359t3_encap(struct sge_qset *qs, struct mbuf **m)
1360{
1361	adapter_t *sc;
1362	struct mbuf *m0;
1363	struct sge_txq *txq;
1364	struct txq_state txqs;
1365	struct port_info *pi;
1366	unsigned int ndesc, flits, cntrl, mlen;
1367	int err, nsegs, tso_info = 0;
1368
1369	struct work_request_hdr *wrp;
1370	struct tx_sw_desc *txsd;
1371	struct sg_ent *sgp, *sgl;
1372	uint32_t wr_hi, wr_lo, sgl_flits;
1373	bus_dma_segment_t segs[TX_MAX_SEGS];
1374
1375	struct tx_desc *txd;
1376
1377	pi = qs->port;
1378	sc = pi->adapter;
1379	txq = &qs->txq[TXQ_ETH];
1380	txd = &txq->desc[txq->pidx];
1381	txsd = &txq->sdesc[txq->pidx];
1382	sgl = txq->txq_sgl;
1383
1384	prefetch(txd);
1385	m0 = *m;
1386
1387	mtx_assert(&qs->lock, MA_OWNED);
1388	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1389	KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n"));
1390
1391	if  (m0->m_nextpkt == NULL && m0->m_next != NULL &&
1392	    m0->m_pkthdr.csum_flags & (CSUM_TSO))
1393		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1394
1395	if (m0->m_nextpkt != NULL) {
1396		busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs);
1397		ndesc = 1;
1398		mlen = 0;
1399	} else {
1400		if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map,
1401		    &m0, segs, &nsegs))) {
1402			if (cxgb_debug)
1403				printf("failed ... err=%d\n", err);
1404			return (err);
1405		}
1406		mlen = m0->m_pkthdr.len;
1407		ndesc = calc_tx_descs(m0, nsegs);
1408	}
1409	txq_prod(txq, ndesc, &txqs);
1410
1411	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs));
1412	txsd->m = m0;
1413
1414	if (m0->m_nextpkt != NULL) {
1415		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
1416		int i, fidx;
1417
1418		if (nsegs > 7)
1419			panic("trying to coalesce %d packets into one WR", nsegs);
1420		txq->txq_coalesced += nsegs;
1421		wrp = (struct work_request_hdr *)txd;
1422		flits = nsegs*2 + 1;
1423
1424		for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) {
1425			struct cpl_tx_pkt_batch_entry *cbe;
1426			uint64_t flit;
1427			uint32_t *hflit = (uint32_t *)&flit;
1428			int cflags = m0->m_pkthdr.csum_flags;
1429
1430			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1431			GET_VTAG(cntrl, m0);
1432			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1433			if (__predict_false(!(cflags & CSUM_IP)))
1434				cntrl |= F_TXPKT_IPCSUM_DIS;
1435			if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP |
1436			    CSUM_UDP_IPV6 | CSUM_TCP_IPV6))))
1437				cntrl |= F_TXPKT_L4CSUM_DIS;
1438
1439			hflit[0] = htonl(cntrl);
1440			hflit[1] = htonl(segs[i].ds_len | 0x80000000);
1441			flit |= htobe64(1 << 24);
1442			cbe = &cpl_batch->pkt_entry[i];
1443			cbe->cntrl = hflit[0];
1444			cbe->len = hflit[1];
1445			cbe->addr = htobe64(segs[i].ds_addr);
1446		}
1447
1448		wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1449		    V_WR_SGLSFLT(flits)) |
1450		    htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1451		wr_lo = htonl(V_WR_LEN(flits) |
1452		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
1453		set_wr_hdr(wrp, wr_hi, wr_lo);
1454		wmb();
1455		ETHER_BPF_MTAP(pi->ifp, m0);
1456		wr_gen2(txd, txqs.gen);
1457		check_ring_tx_db(sc, txq, 0);
1458		return (0);
1459	} else if (tso_info) {
1460		uint16_t eth_type;
1461		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
1462		struct ether_header *eh;
1463		void *l3hdr;
1464		struct tcphdr *tcp;
1465
1466		txd->flit[2] = 0;
1467		GET_VTAG(cntrl, m0);
1468		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1469		hdr->cntrl = htonl(cntrl);
1470		hdr->len = htonl(mlen | 0x80000000);
1471
1472		if (__predict_false(mlen < TCPPKTHDRSIZE)) {
1473			printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%b,flags=%#x",
1474			    m0, mlen, m0->m_pkthdr.tso_segsz,
1475			    (int)m0->m_pkthdr.csum_flags, CSUM_BITS, m0->m_flags);
1476			panic("tx tso packet too small");
1477		}
1478
1479		/* Make sure that ether, ip, tcp headers are all in m0 */
1480		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
1481			m0 = m_pullup(m0, TCPPKTHDRSIZE);
1482			if (__predict_false(m0 == NULL)) {
1483				/* XXX panic probably an overreaction */
1484				panic("couldn't fit header into mbuf");
1485			}
1486		}
1487
1488		eh = mtod(m0, struct ether_header *);
1489		eth_type = eh->ether_type;
1490		if (eth_type == htons(ETHERTYPE_VLAN)) {
1491			struct ether_vlan_header *evh = (void *)eh;
1492
1493			tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II_VLAN);
1494			l3hdr = evh + 1;
1495			eth_type = evh->evl_proto;
1496		} else {
1497			tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II);
1498			l3hdr = eh + 1;
1499		}
1500
1501		if (eth_type == htons(ETHERTYPE_IP)) {
1502			struct ip *ip = l3hdr;
1503
1504			tso_info |= V_LSO_IPHDR_WORDS(ip->ip_hl);
1505			tcp = (struct tcphdr *)(ip + 1);
1506		} else if (eth_type == htons(ETHERTYPE_IPV6)) {
1507			struct ip6_hdr *ip6 = l3hdr;
1508
1509			KASSERT(ip6->ip6_nxt == IPPROTO_TCP,
1510			    ("%s: CSUM_TSO with ip6_nxt %d",
1511			    __func__, ip6->ip6_nxt));
1512
1513			tso_info |= F_LSO_IPV6;
1514			tso_info |= V_LSO_IPHDR_WORDS(sizeof(*ip6) >> 2);
1515			tcp = (struct tcphdr *)(ip6 + 1);
1516		} else
1517			panic("%s: CSUM_TSO but neither ip nor ip6", __func__);
1518
1519		tso_info |= V_LSO_TCPHDR_WORDS(tcp->th_off);
1520		hdr->lso_info = htonl(tso_info);
1521
1522		if (__predict_false(mlen <= PIO_LEN)) {
1523			/*
1524			 * The packet is not undersized, yet it fits in PIO_LEN.
1525			 * This indicates a TSO bug at the higher layers.
1526			 */
1527			txsd->m = NULL;
1528			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]);
1529			flits = (mlen + 7) / 8 + 3;
1530			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1531					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1532					  F_WR_SOP | F_WR_EOP | txqs.compl);
1533			wr_lo = htonl(V_WR_LEN(flits) |
1534			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1535			set_wr_hdr(&hdr->wr, wr_hi, wr_lo);
1536			wmb();
1537			ETHER_BPF_MTAP(pi->ifp, m0);
1538			wr_gen2(txd, txqs.gen);
1539			check_ring_tx_db(sc, txq, 0);
1540			m_freem(m0);
1541			return (0);
1542		}
1543		flits = 3;
1544	} else {
1545		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
1546
1547		GET_VTAG(cntrl, m0);
1548		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1549		if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP)))
1550			cntrl |= F_TXPKT_IPCSUM_DIS;
1551		if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP |
1552		    CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6))))
1553			cntrl |= F_TXPKT_L4CSUM_DIS;
1554		cpl->cntrl = htonl(cntrl);
1555		cpl->len = htonl(mlen | 0x80000000);
1556
1557		if (mlen <= PIO_LEN) {
1558			txsd->m = NULL;
1559			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1560			flits = (mlen + 7) / 8 + 2;
1561
1562			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1563			    V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1564					  F_WR_SOP | F_WR_EOP | txqs.compl);
1565			wr_lo = htonl(V_WR_LEN(flits) |
1566			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1567			set_wr_hdr(&cpl->wr, wr_hi, wr_lo);
1568			wmb();
1569			ETHER_BPF_MTAP(pi->ifp, m0);
1570			wr_gen2(txd, txqs.gen);
1571			check_ring_tx_db(sc, txq, 0);
1572			m_freem(m0);
1573			return (0);
1574		}
1575		flits = 2;
1576	}
1577	wrp = (struct work_request_hdr *)txd;
1578	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1579	make_sgl(sgp, segs, nsegs);
1580
1581	sgl_flits = sgl_len(nsegs);
1582
1583	ETHER_BPF_MTAP(pi->ifp, m0);
1584
1585	KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc));
1586	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1587	wr_lo = htonl(V_WR_TID(txq->token));
1588	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits,
1589	    sgl_flits, wr_hi, wr_lo);
1590	check_ring_tx_db(sc, txq, 0);
1591
1592	return (0);
1593}
1594
1595void
1596cxgb_tx_watchdog(void *arg)
1597{
1598	struct sge_qset *qs = arg;
1599	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1600
1601        if (qs->coalescing != 0 &&
1602	    (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
1603	    TXQ_RING_EMPTY(qs))
1604                qs->coalescing = 0;
1605        else if (qs->coalescing == 0 &&
1606	    (txq->in_use >= cxgb_tx_coalesce_enable_start))
1607                qs->coalescing = 1;
1608	if (TXQ_TRYLOCK(qs)) {
1609		qs->qs_flags |= QS_FLUSHING;
1610		cxgb_start_locked(qs);
1611		qs->qs_flags &= ~QS_FLUSHING;
1612		TXQ_UNLOCK(qs);
1613	}
1614	if (qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING)
1615		callout_reset_on(&txq->txq_watchdog, hz/4, cxgb_tx_watchdog,
1616		    qs, txq->txq_watchdog.c_cpu);
1617}
1618
1619static void
1620cxgb_tx_timeout(void *arg)
1621{
1622	struct sge_qset *qs = arg;
1623	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1624
1625	if (qs->coalescing == 0 && (txq->in_use >= (txq->size>>3)))
1626                qs->coalescing = 1;
1627	if (TXQ_TRYLOCK(qs)) {
1628		qs->qs_flags |= QS_TIMEOUT;
1629		cxgb_start_locked(qs);
1630		qs->qs_flags &= ~QS_TIMEOUT;
1631		TXQ_UNLOCK(qs);
1632	}
1633}
1634
1635static void
1636cxgb_start_locked(struct sge_qset *qs)
1637{
1638	struct mbuf *m_head = NULL;
1639	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1640	struct port_info *pi = qs->port;
1641	struct ifnet *ifp = pi->ifp;
1642
1643	if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT))
1644		reclaim_completed_tx(qs, 0, TXQ_ETH);
1645
1646	if (!pi->link_config.link_ok) {
1647		TXQ_RING_FLUSH(qs);
1648		return;
1649	}
1650	TXQ_LOCK_ASSERT(qs);
1651	while (!TXQ_RING_EMPTY(qs) && (ifp->if_drv_flags & IFF_DRV_RUNNING) &&
1652	    pi->link_config.link_ok) {
1653		reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1654
1655		if (txq->size - txq->in_use <= TX_MAX_DESC)
1656			break;
1657
1658		if ((m_head = cxgb_dequeue(qs)) == NULL)
1659			break;
1660		/*
1661		 *  Encapsulation can modify our pointer, and/or make it
1662		 *  NULL on failure.  In that event, we can't requeue.
1663		 */
1664		if (t3_encap(qs, &m_head) || m_head == NULL)
1665			break;
1666
1667		m_head = NULL;
1668	}
1669
1670	if (txq->db_pending)
1671		check_ring_tx_db(pi->adapter, txq, 1);
1672
1673	if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 &&
1674	    pi->link_config.link_ok)
1675		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1676		    qs, txq->txq_timer.c_cpu);
1677	if (m_head != NULL)
1678		m_freem(m_head);
1679}
1680
1681static int
1682cxgb_transmit_locked(struct ifnet *ifp, struct sge_qset *qs, struct mbuf *m)
1683{
1684	struct port_info *pi = qs->port;
1685	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1686	struct buf_ring *br = txq->txq_mr;
1687	int error, avail;
1688
1689	avail = txq->size - txq->in_use;
1690	TXQ_LOCK_ASSERT(qs);
1691
1692	/*
1693	 * We can only do a direct transmit if the following are true:
1694	 * - we aren't coalescing (ring < 3/4 full)
1695	 * - the link is up -- checked in caller
1696	 * - there are no packets enqueued already
1697	 * - there is space in hardware transmit queue
1698	 */
1699	if (check_pkt_coalesce(qs) == 0 &&
1700	    !TXQ_RING_NEEDS_ENQUEUE(qs) && avail > TX_MAX_DESC) {
1701		if (t3_encap(qs, &m)) {
1702			if (m != NULL &&
1703			    (error = drbr_enqueue(ifp, br, m)) != 0)
1704				return (error);
1705		} else {
1706			if (txq->db_pending)
1707				check_ring_tx_db(pi->adapter, txq, 1);
1708
1709			/*
1710			 * We've bypassed the buf ring so we need to update
1711			 * the stats directly
1712			 */
1713			txq->txq_direct_packets++;
1714			txq->txq_direct_bytes += m->m_pkthdr.len;
1715		}
1716	} else if ((error = drbr_enqueue(ifp, br, m)) != 0)
1717		return (error);
1718
1719	reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1720	if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok &&
1721	    (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7)))
1722		cxgb_start_locked(qs);
1723	else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer))
1724		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1725		    qs, txq->txq_timer.c_cpu);
1726	return (0);
1727}
1728
1729int
1730cxgb_transmit(struct ifnet *ifp, struct mbuf *m)
1731{
1732	struct sge_qset *qs;
1733	struct port_info *pi = ifp->if_softc;
1734	int error, qidx = pi->first_qset;
1735
1736	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0
1737	    ||(!pi->link_config.link_ok)) {
1738		m_freem(m);
1739		return (0);
1740	}
1741
1742	/* check if flowid is set */
1743	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
1744		qidx = (m->m_pkthdr.flowid % pi->nqsets) + pi->first_qset;
1745
1746	qs = &pi->adapter->sge.qs[qidx];
1747
1748	if (TXQ_TRYLOCK(qs)) {
1749		/* XXX running */
1750		error = cxgb_transmit_locked(ifp, qs, m);
1751		TXQ_UNLOCK(qs);
1752	} else
1753		error = drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m);
1754	return (error);
1755}
1756
1757void
1758cxgb_qflush(struct ifnet *ifp)
1759{
1760	/*
1761	 * Should flush any mbufs enqueued in the buf_rings and in the
1762	 * transmit queues.
1763	 * No-op for now.
1764	 */
1765	return;
1766}
1767
1768/**
1769 *	write_imm - write a packet into a Tx descriptor as immediate data
1770 *	@d: the Tx descriptor to write
1771 *	@m: the packet
1772 *	@len: the length of packet data to write as immediate data
1773 *	@gen: the generation bit value to write
1774 *
1775 *	Writes a packet as immediate data into a Tx descriptor.  The packet
1776 *	contains a work request at its beginning.  We must write the packet
1777 *	carefully so the SGE doesn't read accidentally before it's written in
1778 *	its entirety.
1779 */
1780static __inline void
1781write_imm(struct tx_desc *d, caddr_t src,
1782	  unsigned int len, unsigned int gen)
1783{
1784	struct work_request_hdr *from = (struct work_request_hdr *)src;
1785	struct work_request_hdr *to = (struct work_request_hdr *)d;
1786	uint32_t wr_hi, wr_lo;
1787
1788	KASSERT(len <= WR_LEN && len >= sizeof(*from),
1789	    ("%s: invalid len %d", __func__, len));
1790
1791	memcpy(&to[1], &from[1], len - sizeof(*from));
1792	wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP |
1793	    V_WR_BCNTLFLT(len & 7));
1794	wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) | V_WR_LEN((len + 7) / 8));
1795	set_wr_hdr(to, wr_hi, wr_lo);
1796	wmb();
1797	wr_gen2(d, gen);
1798}
1799
1800/**
1801 *	check_desc_avail - check descriptor availability on a send queue
1802 *	@adap: the adapter
1803 *	@q: the TX queue
1804 *	@m: the packet needing the descriptors
1805 *	@ndesc: the number of Tx descriptors needed
1806 *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1807 *
1808 *	Checks if the requested number of Tx descriptors is available on an
1809 *	SGE send queue.  If the queue is already suspended or not enough
1810 *	descriptors are available the packet is queued for later transmission.
1811 *	Must be called with the Tx queue locked.
1812 *
1813 *	Returns 0 if enough descriptors are available, 1 if there aren't
1814 *	enough descriptors and the packet has been queued, and 2 if the caller
1815 *	needs to retry because there weren't enough descriptors at the
1816 *	beginning of the call but some freed up in the mean time.
1817 */
1818static __inline int
1819check_desc_avail(adapter_t *adap, struct sge_txq *q,
1820		 struct mbuf *m, unsigned int ndesc,
1821		 unsigned int qid)
1822{
1823	/*
1824	 * XXX We currently only use this for checking the control queue;
1825	 * the control queue is only used for binding qsets, which happens
1826	 * at init time, so we are guaranteed enough descriptors.
1827	 */
1828	if (__predict_false(!mbufq_empty(&q->sendq))) {
1829addq_exit:	mbufq_tail(&q->sendq, m);
1830		return 1;
1831	}
1832	if (__predict_false(q->size - q->in_use < ndesc)) {
1833
1834		struct sge_qset *qs = txq_to_qset(q, qid);
1835
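		/*
		 * Mark the queue stopped, then re-check: if enough descriptors
		 * were freed in the meantime, clear the stopped bit and ask the
		 * caller to retry instead of queueing the packet.
		 */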
1836		setbit(&qs->txq_stopped, qid);
1837		if (should_restart_tx(q) &&
1838		    test_and_clear_bit(qid, &qs->txq_stopped))
1839			return 2;
1840
1841		q->stops++;
1842		goto addq_exit;
1843	}
1844	return 0;
1845}
1846
1847
1848/**
1849 *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1850 *	@q: the SGE control Tx queue
1851 *
1852 *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1853 *	that send only immediate data (presently just the control queues) and
1854 *	thus do not have any mbufs.
1855 */
1856static __inline void
1857reclaim_completed_tx_imm(struct sge_txq *q)
1858{
1859	unsigned int reclaim = q->processed - q->cleaned;
1860
1861	q->in_use -= reclaim;
1862	q->cleaned += reclaim;
1863}
1864
1865/**
1866 *	ctrl_xmit - send a packet through an SGE control Tx queue
1867 *	@adap: the adapter
1868 *	@q: the control queue
1869 *	@m: the packet
1870 *
1871 *	Send a packet through an SGE control Tx queue.  Packets sent through
1872 *	a control queue must fit entirely as immediate data in a single Tx
1873 *	descriptor and have no page fragments.
1874 */
1875static int
1876ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
1877{
1878	int ret;
1879	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1880	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1881
1882	KASSERT(m->m_len <= WR_LEN, ("%s: bad tx data", __func__));
1883
1884	wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP);
1885	wrp->wrh_lo = htonl(V_WR_TID(q->token));
1886
1887	TXQ_LOCK(qs);
1888again:	reclaim_completed_tx_imm(q);
1889
1890	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1891	if (__predict_false(ret)) {
1892		if (ret == 1) {
1893			TXQ_UNLOCK(qs);
1894			return (ENOSPC);
1895		}
1896		goto again;
1897	}
1898	write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen);
1899
1900	q->in_use++;
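	/* Advance the producer index; flip the generation bit when the ring wraps. */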
1901	if (++q->pidx >= q->size) {
1902		q->pidx = 0;
1903		q->gen ^= 1;
1904	}
1905	TXQ_UNLOCK(qs);
1906	wmb();
1907	t3_write_reg(adap, A_SG_KDOORBELL,
1908	    F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1909
1910	m_free(m);
1911	return (0);
1912}
1913
1914
1915/**
1916 *	restart_ctrlq - restart a suspended control queue
1917 *	@qs: the queue set containing the control queue
1918 *
1919 *	Resumes transmission on a suspended Tx control queue.
1920 */
1921static void
1922restart_ctrlq(void *data, int npending)
1923{
1924	struct mbuf *m;
1925	struct sge_qset *qs = (struct sge_qset *)data;
1926	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1927	adapter_t *adap = qs->port->adapter;
1928
1929	TXQ_LOCK(qs);
1930again:	reclaim_completed_tx_imm(q);
1931
1932	while (q->in_use < q->size &&
1933	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1934
1935		write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen);
1936		m_free(m);
1937
1938		if (++q->pidx >= q->size) {
1939			q->pidx = 0;
1940			q->gen ^= 1;
1941		}
1942		q->in_use++;
1943	}
1944	if (!mbufq_empty(&q->sendq)) {
1945		setbit(&qs->txq_stopped, TXQ_CTRL);
1946
1947		if (should_restart_tx(q) &&
1948		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1949			goto again;
1950		q->stops++;
1951	}
1952	TXQ_UNLOCK(qs);
1953	t3_write_reg(adap, A_SG_KDOORBELL,
1954		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1955}
1956
1957
1958/*
1959 * Send a management message through control queue 0
1960 */
1961int
1962t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
1963{
1964	return ctrl_xmit(adap, &adap->sge.qs[0], m);
1965}
1966
1967/**
1968 *	t3_free_qset - free the resources of an SGE queue set
1969 *	@sc: the controller owning the queue set
1970 *	@q: the queue set
1971 *
1972 *	Release the HW and SW resources associated with an SGE queue set, such
1973 *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
1974 *	queue set must be quiesced prior to calling this.
1975 */
1976static void
1977t3_free_qset(adapter_t *sc, struct sge_qset *q)
1978{
1979	int i;
1980
1981	reclaim_completed_tx(q, 0, TXQ_ETH);
1982	if (q->txq[TXQ_ETH].txq_mr != NULL)
1983		buf_ring_free(q->txq[TXQ_ETH].txq_mr, M_DEVBUF);
1984	if (q->txq[TXQ_ETH].txq_ifq != NULL) {
1985		ifq_delete(q->txq[TXQ_ETH].txq_ifq);
1986		free(q->txq[TXQ_ETH].txq_ifq, M_DEVBUF);
1987	}
1988
1989	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
1990		if (q->fl[i].desc) {
1991			mtx_lock_spin(&sc->sge.reg_lock);
1992			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
1993			mtx_unlock_spin(&sc->sge.reg_lock);
1994			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
1995			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
1996					q->fl[i].desc_map);
1997			bus_dma_tag_destroy(q->fl[i].desc_tag);
1998			bus_dma_tag_destroy(q->fl[i].entry_tag);
1999		}
2000		if (q->fl[i].sdesc) {
2001			free_rx_bufs(sc, &q->fl[i]);
2002			free(q->fl[i].sdesc, M_DEVBUF);
2003		}
2004	}
2005
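	/* The caller holds the queue set lock; drop it and destroy the mutex. */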
2006	mtx_unlock(&q->lock);
2007	MTX_DESTROY(&q->lock);
2008	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2009		if (q->txq[i].desc) {
2010			mtx_lock_spin(&sc->sge.reg_lock);
2011			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
2012			mtx_unlock_spin(&sc->sge.reg_lock);
2013			bus_dmamap_unload(q->txq[i].desc_tag,
2014					q->txq[i].desc_map);
2015			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
2016					q->txq[i].desc_map);
2017			bus_dma_tag_destroy(q->txq[i].desc_tag);
2018			bus_dma_tag_destroy(q->txq[i].entry_tag);
2019		}
2020		if (q->txq[i].sdesc) {
2021			free(q->txq[i].sdesc, M_DEVBUF);
2022		}
2023	}
2024
2025	if (q->rspq.desc) {
2026		mtx_lock_spin(&sc->sge.reg_lock);
2027		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
2028		mtx_unlock_spin(&sc->sge.reg_lock);
2029
2030		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
2031		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
2032			        q->rspq.desc_map);
2033		bus_dma_tag_destroy(q->rspq.desc_tag);
2034		MTX_DESTROY(&q->rspq.lock);
2035	}
2036
2037#if defined(INET6) || defined(INET)
2038	tcp_lro_free(&q->lro.ctrl);
2039#endif
2040
2041	bzero(q, sizeof(*q));
2042}
2043
2044/**
2045 *	t3_free_sge_resources - free SGE resources
2046 *	@sc: the adapter softc
 *	@nqsets: the number of queue sets to free
2047 *
2048 *	Frees resources used by the SGE queue sets.
2049 */
2050void
2051t3_free_sge_resources(adapter_t *sc, int nqsets)
2052{
2053	int i;
2054
2055	for (i = 0; i < nqsets; ++i) {
2056		TXQ_LOCK(&sc->sge.qs[i]);
2057		t3_free_qset(sc, &sc->sge.qs[i]);
2058	}
2059}
2060
2061/**
2062 *	t3_sge_start - enable SGE
2063 *	@sc: the controller softc
2064 *
2065 *	Enables the SGE for DMAs.  This is the last step in starting packet
2066 *	transfers.
2067 */
2068void
2069t3_sge_start(adapter_t *sc)
2070{
2071	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
2072}
2073
2074/**
2075 *	t3_sge_stop - disable SGE operation
2076 *	@sc: the adapter
2077 *
2078 *	Disables the DMA engine.  This can be called in emergencies (e.g.,
2079 *	from error interrupts) or from normal process context.  In the latter
2080 *	case it also disables any pending queue restart tasklets.  Note that
2081 *	if it is called in interrupt context it cannot disable the restart
2082 *	tasklets as it cannot wait; however, the tasklets will have no effect
2083 *	since the doorbells are disabled and the driver will call this again
2084 *	later from process context, at which time the tasklets will be stopped
2085 *	if they are still running.
2086 */
2087void
2088t3_sge_stop(adapter_t *sc)
2089{
2090	int i, nqsets;
2091
2092	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
2093
2094	if (sc->tq == NULL)
2095		return;
2096
2097	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2098		nqsets += sc->port[i].nqsets;
2099#ifdef notyet
2100	/*
2101	 *
2102	 * XXX
2103	 */
2104	for (i = 0; i < nqsets; ++i) {
2105		struct sge_qset *qs = &sc->sge.qs[i];
2106
2107		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2108		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2109	}
2110#endif
2111}
2112
2113/**
2114 *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
2115 *	@qs: the queue set that owns the Tx queue
2116 *	@reclaimable: the number of descriptors to reclaim
2117 *	@queue: the Tx queue index within the set (e.g., TXQ_ETH)
2120 *
2121 *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
2122 *	Tx buffers.  Called with the Tx queue lock held.
2125 */
2126void
2127t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue)
2128{
2129	struct tx_sw_desc *txsd;
2130	unsigned int cidx, mask;
2131	struct sge_txq *q = &qs->txq[queue];
2132
2133#ifdef T3_TRACE
2134	T3_TRACE2(sc->tb[q->cntxt_id & 7],
2135		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx);
2136#endif
2137	cidx = q->cidx;
2138	mask = q->size - 1;
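	/* q->size is a power of two, so the mask wraps the prefetch indices below. */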
2139	txsd = &q->sdesc[cidx];
2140
2141	mtx_assert(&qs->lock, MA_OWNED);
2142	while (reclaimable--) {
2143		prefetch(q->sdesc[(cidx + 1) & mask].m);
2144		prefetch(q->sdesc[(cidx + 2) & mask].m);
2145
2146		if (txsd->m != NULL) {
2147			if (txsd->flags & TX_SW_DESC_MAPPED) {
2148				bus_dmamap_unload(q->entry_tag, txsd->map);
2149				txsd->flags &= ~TX_SW_DESC_MAPPED;
2150			}
2151			m_freem_list(txsd->m);
2152			txsd->m = NULL;
2153		} else
2154			q->txq_skipped++;
2155
2156		++txsd;
2157		if (++cidx == q->size) {
2158			cidx = 0;
2159			txsd = q->sdesc;
2160		}
2161	}
2162	q->cidx = cidx;
2163
2164}
2165
2166/**
2167 *	is_new_response - check if a response is newly written
2168 *	@r: the response descriptor
2169 *	@q: the response queue
2170 *
2171 *	Returns true if a response descriptor contains a yet unprocessed
2172 *	response.
2173 */
2174static __inline int
2175is_new_response(const struct rsp_desc *r,
2176    const struct sge_rspq *q)
2177{
2178	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
2179}
2180
2181#define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
2182#define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
2183			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
2184			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
2185			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
2186
2187/* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
2188#define NOMEM_INTR_DELAY 2500
2189
2190#ifdef TCP_OFFLOAD
2191/**
2192 *	write_ofld_wr - write an offload work request
2193 *	@adap: the adapter
2194 *	@m: the packet to send
2195 *	@q: the Tx queue
2196 *	@pidx: index of the first Tx descriptor to write
2197 *	@gen: the generation value to use
2198 *	@ndesc: number of descriptors the packet will occupy
2199 *
2200 *	Write an offload work request to send the supplied packet.  The packet
2201 *	data already carry the work request with most fields populated.
2202 */
2203static void
2204write_ofld_wr(adapter_t *adap, struct mbuf *m, struct sge_txq *q,
2205    unsigned int pidx, unsigned int gen, unsigned int ndesc)
2206{
2207	unsigned int sgl_flits, flits;
2208	int i, idx, nsegs, wrlen;
2209	struct work_request_hdr *from;
2210	struct sg_ent *sgp, t3sgl[TX_MAX_SEGS / 2 + 1];
2211	struct tx_desc *d = &q->desc[pidx];
2212	struct txq_state txqs;
2213	struct sglist_seg *segs;
2214	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2215	struct sglist *sgl;
2216
2217	from = (void *)(oh + 1);	/* Start of WR within mbuf */
2218	wrlen = m->m_len - sizeof(*oh);
2219
2220	if (!(oh->flags & F_HDR_SGL)) {
2221		write_imm(d, (caddr_t)from, wrlen, gen);
2222
2223		/*
2224		 * mbuf with "real" immediate tx data will be enqueue_wr'd by
2225		 * t3_push_frames and freed in wr_ack.  Others, like those sent
2226		 * down by close_conn, t3_send_reset, etc. should be freed here.
2227		 */
2228		if (!(oh->flags & F_HDR_DF))
2229			m_free(m);
2230		return;
2231	}
2232
2233	memcpy(&d->flit[1], &from[1], wrlen - sizeof(*from));
2234
2235	sgl = oh->sgl;
2236	flits = wrlen / 8;
2237	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : t3sgl;
2238
2239	nsegs = sgl->sg_nseg;
2240	segs = sgl->sg_segs;
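	/*
	 * Pack the scatter/gather list: each struct sg_ent holds two
	 * length/address pairs, so idx toggles between the two slots and
	 * sgp advances every other segment.
	 */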
2241	for (idx = 0, i = 0; i < nsegs; i++) {
2242		KASSERT(segs[i].ss_len, ("%s: 0 len in sgl", __func__));
2243		if (i && idx == 0)
2244			++sgp;
2245		sgp->len[idx] = htobe32(segs[i].ss_len);
2246		sgp->addr[idx] = htobe64(segs[i].ss_paddr);
2247		idx ^= 1;
2248	}
2249	if (idx) {
2250		sgp->len[idx] = 0;
2251		sgp->addr[idx] = 0;
2252	}
2253
2254	sgl_flits = sgl_len(nsegs);
2255	txqs.gen = gen;
2256	txqs.pidx = pidx;
2257	txqs.compl = 0;
2258
2259	write_wr_hdr_sgl(ndesc, d, &txqs, q, t3sgl, flits, sgl_flits,
2260	    from->wrh_hi, from->wrh_lo);
2261}
2262
2263/**
2264 *	ofld_xmit - send a packet through an offload queue
2265 *	@adap: the adapter
2266 *	@q: the Tx offload queue
2267 *	@m: the packet
2268 *
2269 *	Send an offload packet through an SGE offload queue.
2270 */
2271static int
2272ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
2273{
2274	int ret;
2275	unsigned int ndesc;
2276	unsigned int pidx, gen;
2277	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2278	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2279
2280	ndesc = G_HDR_NDESC(oh->flags);
2281
2282	TXQ_LOCK(qs);
2283again:	reclaim_completed_tx(qs, 16, TXQ_OFLD);
2284	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
2285	if (__predict_false(ret)) {
2286		if (ret == 1) {
2287			TXQ_UNLOCK(qs);
2288			return (EINTR);
2289		}
2290		goto again;
2291	}
2292
2293	gen = q->gen;
2294	q->in_use += ndesc;
2295	pidx = q->pidx;
2296	q->pidx += ndesc;
2297	if (q->pidx >= q->size) {
2298		q->pidx -= q->size;
2299		q->gen ^= 1;
2300	}
2301
2302	write_ofld_wr(adap, m, q, pidx, gen, ndesc);
2303	check_ring_tx_db(adap, q, 1);
2304	TXQ_UNLOCK(qs);
2305
2306	return (0);
2307}
2308
2309/**
2310 *	restart_offloadq - restart a suspended offload queue
2311 *	@qs: the queue set containing the offload queue
2312 *
2313 *	Resumes transmission on a suspended Tx offload queue.
2314 */
2315static void
2316restart_offloadq(void *data, int npending)
2317{
2318	struct mbuf *m;
2319	struct sge_qset *qs = data;
2320	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2321	adapter_t *adap = qs->port->adapter;
2322	int cleaned;
2323
2324	TXQ_LOCK(qs);
2325again:	cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD);
2326
2327	while ((m = mbufq_peek(&q->sendq)) != NULL) {
2328		unsigned int gen, pidx;
2329		struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2330		unsigned int ndesc = G_HDR_NDESC(oh->flags);
2331
2332		if (__predict_false(q->size - q->in_use < ndesc)) {
2333			setbit(&qs->txq_stopped, TXQ_OFLD);
2334			if (should_restart_tx(q) &&
2335			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
2336				goto again;
2337			q->stops++;
2338			break;
2339		}
2340
2341		gen = q->gen;
2342		q->in_use += ndesc;
2343		pidx = q->pidx;
2344		q->pidx += ndesc;
2345		if (q->pidx >= q->size) {
2346			q->pidx -= q->size;
2347			q->gen ^= 1;
2348		}
2349
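		/*
		 * The descriptors were reserved above, so the work request can
		 * be written without holding the queue lock.
		 */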
2350		(void)mbufq_dequeue(&q->sendq);
2351		TXQ_UNLOCK(qs);
2352		write_ofld_wr(adap, m, q, pidx, gen, ndesc);
2353		TXQ_LOCK(qs);
2354	}
2355#if USE_GTS
2356	set_bit(TXQ_RUNNING, &q->flags);
2357	set_bit(TXQ_LAST_PKT_DB, &q->flags);
2358#endif
2359	TXQ_UNLOCK(qs);
2360	wmb();
2361	t3_write_reg(adap, A_SG_KDOORBELL,
2362		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2363}
2364
2365/**
2366 *	t3_offload_tx - send an offload packet
2367 *	@m: the packet
2368 *
2369 *	Sends an offload packet.  Flags in the packet's ofld_hdr select the
2370 *	destination queue: F_HDR_CTRL sends the packet through the control
2371 *	queue, otherwise it goes to the offload queue of the selected queue set.
2372 */
2373int
2374t3_offload_tx(struct adapter *sc, struct mbuf *m)
2375{
2376	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2377	struct sge_qset *qs = &sc->sge.qs[G_HDR_QSET(oh->flags)];
2378
2379	if (oh->flags & F_HDR_CTRL) {
2380		m_adj(m, sizeof (*oh));	/* trim ofld_hdr off */
2381		return (ctrl_xmit(sc, qs, m));
2382	} else
2383		return (ofld_xmit(sc, qs, m));
2384}
2385#endif
2386
2387static void
2388restart_tx(struct sge_qset *qs)
2389{
2390	struct adapter *sc = qs->port->adapter;
2391
2392	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2393	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2394	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2395		qs->txq[TXQ_OFLD].restarts++;
2396		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2397	}
2398
2399	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2400	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2401	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2402		qs->txq[TXQ_CTRL].restarts++;
2403		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2404	}
2405}
2406
2407/**
2408 *	t3_sge_alloc_qset - initialize an SGE queue set
2409 *	@sc: the controller softc
2410 *	@id: the queue set id
2411 *	@nports: how many Ethernet ports will be using this queue set
2412 *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2413 *	@p: configuration parameters for this queue set
2414 *	@ntxq: number of Tx queues for the queue set
2415 *	@pi: port info for queue set
2416 *
2417 *	Allocate resources and initialize an SGE queue set.  A queue set
2418 *	comprises a response queue, two Rx free-buffer queues, and up to 3
2419 *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2420 *	queue, offload queue, and control queue.
2421 */
2422int
2423t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2424		  const struct qset_params *p, int ntxq, struct port_info *pi)
2425{
2426	struct sge_qset *q = &sc->sge.qs[id];
2427	int i, ret = 0;
2428
2429	MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF);
2430	q->port = pi;
2431	q->adap = sc;
2432
2433	if ((q->txq[TXQ_ETH].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size,
2434	    M_DEVBUF, M_WAITOK, &q->lock)) == NULL) {
2435		device_printf(sc->dev, "failed to allocate mbuf ring\n");
2436		goto err;
2437	}
2438	if ((q->txq[TXQ_ETH].txq_ifq = malloc(sizeof(struct ifaltq), M_DEVBUF,
2439	    M_NOWAIT | M_ZERO)) == NULL) {
2440		device_printf(sc->dev, "failed to allocate ifq\n");
2441		goto err;
2442	}
2443	ifq_init(q->txq[TXQ_ETH].txq_ifq, pi->ifp);
2444	callout_init(&q->txq[TXQ_ETH].txq_timer, 1);
2445	callout_init(&q->txq[TXQ_ETH].txq_watchdog, 1);
2446	q->txq[TXQ_ETH].txq_timer.c_cpu = id % mp_ncpus;
2447	q->txq[TXQ_ETH].txq_watchdog.c_cpu = id % mp_ncpus;
2448
2449	init_qset_cntxt(q, id);
2450	q->idx = id;
2451	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2452		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2453		    &q->fl[0].desc, &q->fl[0].sdesc,
2454		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2455		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2456		printf("error %d from alloc ring fl0\n", ret);
2457		goto err;
2458	}
2459
2460	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2461		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2462		    &q->fl[1].desc, &q->fl[1].sdesc,
2463		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2464		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2465		printf("error %d from alloc ring fl1\n", ret);
2466		goto err;
2467	}
2468
2469	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2470		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2471		    &q->rspq.desc_tag, &q->rspq.desc_map,
2472		    NULL, NULL)) != 0) {
2473		printf("error %d from alloc ring rspq\n", ret);
2474		goto err;
2475	}
2476
2477	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2478	    device_get_unit(sc->dev), irq_vec_idx);
2479	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2480
2481	for (i = 0; i < ntxq; ++i) {
2482		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2483
2484		if ((ret = alloc_ring(sc, p->txq_size[i],
2485			    sizeof(struct tx_desc), sz,
2486			    &q->txq[i].phys_addr, &q->txq[i].desc,
2487			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2488			    &q->txq[i].desc_map,
2489			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2490			printf("error %d from alloc ring tx %i\n", ret, i);
2491			goto err;
2492		}
2493		mbufq_init(&q->txq[i].sendq);
2494		q->txq[i].gen = 1;
2495		q->txq[i].size = p->txq_size[i];
2496	}
2497
2498#ifdef TCP_OFFLOAD
2499	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2500#endif
2501	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2502	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2503	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2504
2505	q->fl[0].gen = q->fl[1].gen = 1;
2506	q->fl[0].size = p->fl_size;
2507	q->fl[1].size = p->jumbo_size;
2508
2509	q->rspq.gen = 1;
2510	q->rspq.cidx = 0;
2511	q->rspq.size = p->rspq_size;
2512
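	/*
	 * Stop the Ethernet Tx queue once there is no longer room for one
	 * maximally fragmented packet per port.
	 */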
2513	q->txq[TXQ_ETH].stop_thres = nports *
2514	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2515
2516	q->fl[0].buf_size = MCLBYTES;
2517	q->fl[0].zone = zone_pack;
2518	q->fl[0].type = EXT_PACKET;
2519
2520	if (p->jumbo_buf_size ==  MJUM16BYTES) {
2521		q->fl[1].zone = zone_jumbo16;
2522		q->fl[1].type = EXT_JUMBO16;
2523	} else if (p->jumbo_buf_size ==  MJUM9BYTES) {
2524		q->fl[1].zone = zone_jumbo9;
2525		q->fl[1].type = EXT_JUMBO9;
2526	} else if (p->jumbo_buf_size ==  MJUMPAGESIZE) {
2527		q->fl[1].zone = zone_jumbop;
2528		q->fl[1].type = EXT_JUMBOP;
2529	} else {
2530		KASSERT(0, ("can't deal with jumbo_buf_size %d.", p->jumbo_buf_size));
2531		ret = EDOOFUS;
2532		goto err;
2533	}
2534	q->fl[1].buf_size = p->jumbo_buf_size;
2535
2536	/* Allocate and setup the lro_ctrl structure */
2537	q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO);
2538#if defined(INET6) || defined(INET)
2539	ret = tcp_lro_init(&q->lro.ctrl);
2540	if (ret) {
2541		printf("error %d from tcp_lro_init\n", ret);
2542		goto err;
2543	}
2544#endif
2545	q->lro.ctrl.ifp = pi->ifp;
2546
2547	mtx_lock_spin(&sc->sge.reg_lock);
2548	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2549				   q->rspq.phys_addr, q->rspq.size,
2550				   q->fl[0].buf_size, 1, 0);
2551	if (ret) {
2552		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2553		goto err_unlock;
2554	}
2555
2556	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2557		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2558					  q->fl[i].phys_addr, q->fl[i].size,
2559					  q->fl[i].buf_size, p->cong_thres, 1,
2560					  0);
2561		if (ret) {
2562			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2563			goto err_unlock;
2564		}
2565	}
2566
2567	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2568				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2569				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2570				 1, 0);
2571	if (ret) {
2572		printf("error %d from t3_sge_init_ecntxt\n", ret);
2573		goto err_unlock;
2574	}
2575
2576	if (ntxq > 1) {
2577		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2578					 USE_GTS, SGE_CNTXT_OFLD, id,
2579					 q->txq[TXQ_OFLD].phys_addr,
2580					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2581		if (ret) {
2582			printf("error %d from t3_sge_init_ecntxt\n", ret);
2583			goto err_unlock;
2584		}
2585	}
2586
2587	if (ntxq > 2) {
2588		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2589					 SGE_CNTXT_CTRL, id,
2590					 q->txq[TXQ_CTRL].phys_addr,
2591					 q->txq[TXQ_CTRL].size,
2592					 q->txq[TXQ_CTRL].token, 1, 0);
2593		if (ret) {
2594			printf("error %d from t3_sge_init_ecntxt\n", ret);
2595			goto err_unlock;
2596		}
2597	}
2598
2599	mtx_unlock_spin(&sc->sge.reg_lock);
2600	t3_update_qset_coalesce(q, p);
2601
2602	refill_fl(sc, &q->fl[0], q->fl[0].size);
2603	refill_fl(sc, &q->fl[1], q->fl[1].size);
2604	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2605
2606	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2607		     V_NEWTIMER(q->rspq.holdoff_tmr));
2608
2609	return (0);
2610
2611err_unlock:
2612	mtx_unlock_spin(&sc->sge.reg_lock);
2613err:
2614	TXQ_LOCK(q);
2615	t3_free_qset(sc, q);
2616
2617	return (ret);
2618}
2619
2620/*
2621 * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular mbuf with
2622 * ethernet data.  Hardware assistance with various checksums and any vlan tag
2623 * will also be taken into account here.
2624 */
2625void
2626t3_rx_eth(struct adapter *adap, struct mbuf *m, int ethpad)
2627{
2628	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2629	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2630	struct ifnet *ifp = pi->ifp;
2631
2632	if (cpl->vlan_valid) {
2633		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2634		m->m_flags |= M_VLANTAG;
2635	}
2636
2637	m->m_pkthdr.rcvif = ifp;
2638	/*
2639	 * adjust after conversion to mbuf chain
2640	 */
2641	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
2642	m->m_len -= (sizeof(*cpl) + ethpad);
2643	m->m_data += (sizeof(*cpl) + ethpad);
2644
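	/*
	 * A full checksum of 0xffff means the hardware verified the packet's
	 * checksums; translate that into the corresponding mbuf csum flags.
	 */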
2645	if (!cpl->fragment && cpl->csum_valid && cpl->csum == 0xffff) {
2646		struct ether_header *eh = mtod(m, void *);
2647		uint16_t eh_type;
2648
2649		if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2650			struct ether_vlan_header *evh = mtod(m, void *);
2651
2652			eh_type = evh->evl_proto;
2653		} else
2654			eh_type = eh->ether_type;
2655
2656		if (ifp->if_capenable & IFCAP_RXCSUM &&
2657		    eh_type == htons(ETHERTYPE_IP)) {
2658			m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
2659			    CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2660			m->m_pkthdr.csum_data = 0xffff;
2661		} else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
2662		    eh_type == htons(ETHERTYPE_IPV6)) {
2663			m->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
2664			    CSUM_PSEUDO_HDR);
2665			m->m_pkthdr.csum_data = 0xffff;
2666		}
2667	}
2668}
2669
2670/**
2671 *	get_packet - return the next ingress packet buffer from a free list
2672 *	@adap: the adapter that received the packet
2673 *	@drop_thres: # of remaining buffers before we start dropping packets
2674 *	@qs: the qset that the SGE free list holding the packet belongs to
2675 *      @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain
2676 *      @r: response descriptor
2677 *
2678 *	Get the next packet from a free list and complete setup of the
2679 *	mbuf.  If the packet is small we make a copy and recycle the
2680 *	original buffer, otherwise we use the original buffer itself.  If a
2681 *	positive drop threshold is supplied packets are dropped and their
2682 *	buffers recycled if (a) the number of remaining buffers is under the
2683 *	threshold and the packet is too big to copy, or (b) the packet should
2684 *	be copied but there is no memory for the copy.
2685 */
2686static int
2687get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2688    struct t3_mbuf_hdr *mh, struct rsp_desc *r)
2689{
2690
2691	unsigned int len_cq =  ntohl(r->len_cq);
2692	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2693	int mask, cidx = fl->cidx;
2694	struct rx_sw_desc *sd = &fl->sdesc[cidx];
2695	uint32_t len = G_RSPD_LEN(len_cq);
2696	uint32_t flags = M_EXT;
2697	uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags));
2698	caddr_t cl;
2699	struct mbuf *m;
2700	int ret = 0;
2701
2702	mask = fl->size - 1;
2703	prefetch(fl->sdesc[(cidx + 1) & mask].m);
2704	prefetch(fl->sdesc[(cidx + 2) & mask].m);
2705	prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl);
2706	prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl);
2707
2708	fl->credits--;
2709	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2710
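	/*
	 * Small single-descriptor packets are copied into a fresh mbuf so the
	 * receive cluster can be recycled back onto the free list.
	 */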
2711	if (recycle_enable && len <= SGE_RX_COPY_THRES &&
2712	    sopeop == RSPQ_SOP_EOP) {
2713		if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
2714			goto skip_recycle;
2715		cl = mtod(m, void *);
2716		memcpy(cl, sd->rxsd_cl, len);
2717		recycle_rx_buf(adap, fl, fl->cidx);
2718		m->m_pkthdr.len = m->m_len = len;
2719		m->m_flags = 0;
2720		mh->mh_head = mh->mh_tail = m;
2721		ret = 1;
2722		goto done;
2723	} else {
2724	skip_recycle:
2725		bus_dmamap_unload(fl->entry_tag, sd->map);
2726		cl = sd->rxsd_cl;
2727		m = sd->m;
2728
2729		if ((sopeop == RSPQ_SOP_EOP) ||
2730		    (sopeop == RSPQ_SOP))
2731			flags |= M_PKTHDR;
2732		m_init(m, fl->zone, fl->buf_size, M_NOWAIT, MT_DATA, flags);
2733		if (fl->zone == zone_pack) {
2734			/*
2735			 * restore clobbered data pointer
2736			 */
2737			m->m_data = m->m_ext.ext_buf;
2738		} else {
2739			m_cljset(m, cl, fl->type);
2740		}
2741		m->m_len = len;
2742	}
2743	switch(sopeop) {
2744	case RSPQ_SOP_EOP:
2745		ret = 1;
2746		/* FALLTHROUGH */
2747	case RSPQ_SOP:
2748		mh->mh_head = mh->mh_tail = m;
2749		m->m_pkthdr.len = len;
2750		break;
2751	case RSPQ_EOP:
2752		ret = 1;
2753		/* FALLTHROUGH */
2754	case RSPQ_NSOP_NEOP:
2755		if (mh->mh_tail == NULL) {
2756			log(LOG_ERR, "discarding intermediate descriptor entry\n");
2757			m_freem(m);
2758			break;
2759		}
2760		mh->mh_tail->m_next = m;
2761		mh->mh_tail = m;
2762		mh->mh_head->m_pkthdr.len += len;
2763		break;
2764	}
2765	if (cxgb_debug)
2766		printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len);
2767done:
2768	if (++fl->cidx == fl->size)
2769		fl->cidx = 0;
2770
2771	return (ret);
2772}
2773
2774/**
2775 *	handle_rsp_cntrl_info - handles control information in a response
2776 *	@qs: the queue set corresponding to the response
2777 *	@flags: the response control flags
2778 *
2779 *	Handles the control information of an SGE response, such as GTS
2780 *	indications and completion credits for the queue set's Tx queues.
2781 *	HW coalesces credits, we don't do any extra SW coalescing.
2782 */
2783static __inline void
2784handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2785{
2786	unsigned int credits;
2787
2788#if USE_GTS
2789	if (flags & F_RSPD_TXQ0_GTS)
2790		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2791#endif
2792	credits = G_RSPD_TXQ0_CR(flags);
2793	if (credits)
2794		qs->txq[TXQ_ETH].processed += credits;
2795
2796	credits = G_RSPD_TXQ2_CR(flags);
2797	if (credits)
2798		qs->txq[TXQ_CTRL].processed += credits;
2799
2800# if USE_GTS
2801	if (flags & F_RSPD_TXQ1_GTS)
2802		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2803# endif
2804	credits = G_RSPD_TXQ1_CR(flags);
2805	if (credits)
2806		qs->txq[TXQ_OFLD].processed += credits;
2807
2808}
2809
2810static void
2811check_ring_db(adapter_t *adap, struct sge_qset *qs,
2812    unsigned int sleeping)
2813{
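	/* No-op: doorbells are rung directly from the transmit paths. */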
2814	;
2815}
2816
2817/**
2818 *	process_responses - process responses from an SGE response queue
2819 *	@adap: the adapter
2820 *	@qs: the queue set to which the response queue belongs
2821 *	@budget: how many responses can be processed in this round
2822 *
2823 *	Process responses from an SGE response queue up to the supplied budget.
2824 *	Responses include received packets as well as credits and other events
2825 *	for the queues that belong to the response queue's queue set.
2826 *	A negative budget is effectively unlimited.
2827 *
2828 *	Additionally choose the interrupt holdoff time for the next interrupt
2829 *	on this queue.  If the system is under memory shortage use a fairly
2830 *	long delay to help recovery.
2831 */
2832static int
2833process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2834{
2835	struct sge_rspq *rspq = &qs->rspq;
2836	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2837	int budget_left = budget;
2838	unsigned int sleeping = 0;
2839#if defined(INET6) || defined(INET)
2840	int lro_enabled = qs->lro.enabled;
2841	int skip_lro;
2842	struct lro_ctrl *lro_ctrl = &qs->lro.ctrl;
2843#endif
2844	struct t3_mbuf_hdr *mh = &rspq->rspq_mh;
2845#ifdef DEBUG
2846	static int last_holdoff = 0;
2847	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2848		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2849		last_holdoff = rspq->holdoff_tmr;
2850	}
2851#endif
2852	rspq->next_holdoff = rspq->holdoff_tmr;
2853
2854	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2855		int eth, eop = 0, ethpad = 0;
2856		uint32_t flags = ntohl(r->flags);
2857		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
2858		uint8_t opcode = r->rss_hdr.opcode;
2859
2860		eth = (opcode == CPL_RX_PKT);
2861
2862		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
2863			struct mbuf *m;
2864
2865			if (cxgb_debug)
2866				printf("async notification\n");
2867
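			/*
			 * Copy the raw response descriptor into an mbuf and
			 * dispatch it as a CPL_ASYNC_NOTIF message.
			 */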
2868			if (mh->mh_head == NULL) {
2869				mh->mh_head = m_gethdr(M_NOWAIT, MT_DATA);
2870				m = mh->mh_head;
2871			} else {
2872				m = m_gethdr(M_NOWAIT, MT_DATA);
2873			}
2874			if (m == NULL)
2875				goto no_mem;
2876
2877			memcpy(mtod(m, char *), r, AN_PKT_SIZE);
2878			m->m_len = m->m_pkthdr.len = AN_PKT_SIZE;
2879			*mtod(m, uint8_t *) = CPL_ASYNC_NOTIF;
2880			opcode = CPL_ASYNC_NOTIF;
2881			eop = 1;
2882			rspq->async_notif++;
2883			goto skip;
2884		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
2885			struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
2886
2887			if (m == NULL) {
2888		no_mem:
2889				rspq->next_holdoff = NOMEM_INTR_DELAY;
2890				budget_left--;
2891				break;
2892			}
2893			if (mh->mh_head == NULL)
2894				mh->mh_head = m;
2895			else
2896				mh->mh_tail->m_next = m;
2897			mh->mh_tail = m;
2898
2899			get_imm_packet(adap, r, m);
2900			mh->mh_head->m_pkthdr.len += m->m_len;
2901			eop = 1;
2902			rspq->imm_data++;
2903		} else if (r->len_cq) {
2904			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
2905
2906			eop = get_packet(adap, drop_thresh, qs, mh, r);
2907			if (eop) {
2908				if (r->rss_hdr.hash_type && !adap->timestamp) {
2909					M_HASHTYPE_SET(mh->mh_head, M_HASHTYPE_OPAQUE);
2910					mh->mh_head->m_pkthdr.flowid = rss_hash;
2911				}
2912			}
2913
2914			ethpad = 2;
2915		} else {
2916			rspq->pure_rsps++;
2917		}
2918	skip:
2919		if (flags & RSPD_CTRL_MASK) {
2920			sleeping |= flags & RSPD_GTS_MASK;
2921			handle_rsp_cntrl_info(qs, flags);
2922		}
2923
2924		if (!eth && eop) {
2925			rspq->offload_pkts++;
2926#ifdef TCP_OFFLOAD
2927			adap->cpl_handler[opcode](qs, r, mh->mh_head);
2928#else
2929			m_freem(mh->mh_head);
2930#endif
2931			mh->mh_head = NULL;
2932		} else if (eth && eop) {
2933			struct mbuf *m = mh->mh_head;
2934
2935			t3_rx_eth(adap, m, ethpad);
2936
2937			/*
2938			 * The T304 sends incoming packets on any qset.  If LRO
2939			 * is also enabled, we could end up sending the packet up
2940			 * lro_ctrl->ifp's input.  That is incorrect.
2941			 *
2942			 * The mbuf's rcvif was derived from the cpl header and
2943			 * is accurate.  Skip LRO and just use that.
2944			 */
2945#if defined(INET6) || defined(INET)
2946			skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif);
2947
2948			if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro
2949			    && (tcp_lro_rx(lro_ctrl, m, 0) == 0)
2950			    ) {
2951				/* successfully queued for LRO */
2952			} else
2953#endif
2954			{
2955				/*
2956				 * LRO not enabled, packet unsuitable for LRO,
2957				 * or unable to queue.  Pass it up right now in
2958				 * either case.
2959				 */
2960				struct ifnet *ifp = m->m_pkthdr.rcvif;
2961				(*ifp->if_input)(ifp, m);
2962			}
2963			mh->mh_head = NULL;
2964
2965		}
2966
2967		r++;
2968		if (__predict_false(++rspq->cidx == rspq->size)) {
2969			rspq->cidx = 0;
2970			rspq->gen ^= 1;
2971			r = rspq->desc;
2972		}
2973
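		/* Return response queue credits to the hardware in batches of 64. */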
2974		if (++rspq->credits >= 64) {
2975			refill_rspq(adap, rspq, rspq->credits);
2976			rspq->credits = 0;
2977		}
2978		__refill_fl_lt(adap, &qs->fl[0], 32);
2979		__refill_fl_lt(adap, &qs->fl[1], 32);
2980		--budget_left;
2981	}
2982
2983#if defined(INET6) || defined(INET)
2984	/* Flush LRO */
2985	while (!SLIST_EMPTY(&lro_ctrl->lro_active)) {
2986		struct lro_entry *queued = SLIST_FIRST(&lro_ctrl->lro_active);
2987		SLIST_REMOVE_HEAD(&lro_ctrl->lro_active, next);
2988		tcp_lro_flush(lro_ctrl, queued);
2989	}
2990#endif
2991
2992	if (sleeping)
2993		check_ring_db(adap, qs, sleeping);
2994
2995	mb();  /* commit Tx queue processed updates */
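	/*
	 * txq_stopped > 1 means a queue other than TXQ_ETH (bit 0) is stopped;
	 * restart_tx() only resumes the offload and control queues.
	 */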
2996	if (__predict_false(qs->txq_stopped > 1))
2997		restart_tx(qs);
2998
2999	__refill_fl_lt(adap, &qs->fl[0], 512);
3000	__refill_fl_lt(adap, &qs->fl[1], 512);
3001	budget -= budget_left;
3002	return (budget);
3003}
3004
3005/*
3006 * A helper function that processes responses and issues GTS.
3007 */
3008static __inline int
3009process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
3010{
3011	int work;
3012	static int last_holdoff = 0;
3013
3014	work = process_responses(adap, rspq_to_qset(rq), -1);
3015
3016	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
3017		printf("next_holdoff=%d\n", rq->next_holdoff);
3018		last_holdoff = rq->next_holdoff;
3019	}
3020	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
3021	    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
3022
3023	return (work);
3024}
3025
3026
3027/*
3028 * Interrupt handler for legacy INTx interrupts for T3B-based cards.
3029 * Handles data events from SGE response queues as well as error and other
3030 * async events as they all use the same interrupt pin.  We use one SGE
3031 * response queue per port in this mode and protect all response queues with
3032 * queue 0's lock.
3033 */
3034void
3035t3b_intr(void *data)
3036{
3037	uint32_t i, map;
3038	adapter_t *adap = data;
3039	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3040
3041	t3_write_reg(adap, A_PL_CLI, 0);
3042	map = t3_read_reg(adap, A_SG_DATA_INTR);
3043
3044	if (!map)
3045		return;
3046
3047	if (__predict_false(map & F_ERRINTR)) {
3048		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3049		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3050		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3051	}
3052
3053	mtx_lock(&q0->lock);
3054	for_each_port(adap, i)
3055	    if (map & (1 << i))
3056			process_responses_gts(adap, &adap->sge.qs[i].rspq);
3057	mtx_unlock(&q0->lock);
3058}
3059
3060/*
3061 * The MSI interrupt handler.  This needs to handle data events from SGE
3062 * response queues as well as error and other async events as they all use
3063 * the same MSI vector.  We use one SGE response queue per port in this mode
3064 * and protect all response queues with queue 0's lock.
3065 */
3066void
3067t3_intr_msi(void *data)
3068{
3069	adapter_t *adap = data;
3070	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3071	int i, new_packets = 0;
3072
3073	mtx_lock(&q0->lock);
3074
3075	for_each_port(adap, i)
3076	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
3077		    new_packets = 1;
3078	mtx_unlock(&q0->lock);
3079	if (new_packets == 0) {
3080		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3081		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3082		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3083	}
3084}
3085
3086void
3087t3_intr_msix(void *data)
3088{
3089	struct sge_qset *qs = data;
3090	adapter_t *adap = qs->port->adapter;
3091	struct sge_rspq *rspq = &qs->rspq;
3092
3093	if (process_responses_gts(adap, rspq) == 0)
3094		rspq->unhandled_irqs++;
3095}
3096
3097#define QDUMP_SBUF_SIZE		(32 * 400)
3098static int
3099t3_dump_rspq(SYSCTL_HANDLER_ARGS)
3100{
3101	struct sge_rspq *rspq;
3102	struct sge_qset *qs;
3103	int i, err, dump_end, idx;
3104	struct sbuf *sb;
3105	struct rsp_desc *rspd;
3106	uint32_t data[4];
3107
3108	rspq = arg1;
3109	qs = rspq_to_qset(rspq);
3110	if (rspq->rspq_dump_count == 0)
3111		return (0);
3112	if (rspq->rspq_dump_count > RSPQ_Q_SIZE) {
3113		log(LOG_WARNING,
3114		    "dump count is too large %d\n", rspq->rspq_dump_count);
3115		rspq->rspq_dump_count = 0;
3116		return (EINVAL);
3117	}
3118	if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) {
3119		log(LOG_WARNING,
3120		    "dump start of %d is greater than queue size\n",
3121		    rspq->rspq_dump_start);
3122		rspq->rspq_dump_start = 0;
3123		return (EINVAL);
3124	}
3125	err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data);
3126	if (err)
3127		return (err);
3128	err = sysctl_wire_old_buffer(req, 0);
3129	if (err)
3130		return (err);
3131	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3132
3133	sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n",
3134	    (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f),
3135	    ((data[2] >> 26) & 1), ((data[2] >> 27) & 1));
3136	sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n",
3137	    ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]);
3138
3139	sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start,
3140	    (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1));
3141
3142	dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count;
3143	for (i = rspq->rspq_dump_start; i < dump_end; i++) {
3144		idx = i & (RSPQ_Q_SIZE-1);
3145
3146		rspd = &rspq->desc[idx];
3147		sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n",
3148		    idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx,
3149		    rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx));
3150		sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n",
3151		    rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags),
3152		    be32toh(rspd->len_cq), rspd->intr_gen);
3153	}
3154
3155	err = sbuf_finish(sb);
3156	/* Output a trailing NUL. */
3157	if (err == 0)
3158		err = SYSCTL_OUT(req, "", 1);
3159	sbuf_delete(sb);
3160	return (err);
3161}
3162
3163static int
3164t3_dump_txq_eth(SYSCTL_HANDLER_ARGS)
3165{
3166	struct sge_txq *txq;
3167	struct sge_qset *qs;
3168	int i, j, err, dump_end;
3169	struct sbuf *sb;
3170	struct tx_desc *txd;
3171	uint32_t *WR, wr_hi, wr_lo, gen;
3172	uint32_t data[4];
3173
3174	txq = arg1;
3175	qs = txq_to_qset(txq, TXQ_ETH);
3176	if (txq->txq_dump_count == 0) {
3177		return (0);
3178	}
3179	if (txq->txq_dump_count > TX_ETH_Q_SIZE) {
3180		log(LOG_WARNING,
3181		    "dump count is too large %d\n", txq->txq_dump_count);
3182		txq->txq_dump_count = 1;
3183		return (EINVAL);
3184	}
3185	if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) {
3186		log(LOG_WARNING,
3187		    "dump start of %d is greater than queue size\n",
3188		    txq->txq_dump_start);
3189		txq->txq_dump_start = 0;
3190		return (EINVAL);
3191	}
3192	err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data);
3193	if (err)
3194		return (err);
3195	err = sysctl_wire_old_buffer(req, 0);
3196	if (err)
3197		return (err);
3198	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3199
3200	sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n",
3201	    (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16),
3202	    (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1));
3203	sbuf_printf(sb, " TUN=%u TOE=%u generation=%u uP token=%u valid=%u\n",
3204	    ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1),
3205	    ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1));
3206	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3207	    txq->txq_dump_start,
3208	    (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1));
3209
3210	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3211	for (i = txq->txq_dump_start; i < dump_end; i++) {
3212		txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)];
3213		WR = (uint32_t *)txd->flit;
3214		wr_hi = ntohl(WR[0]);
3215		wr_lo = ntohl(WR[1]);
3216		gen = G_WR_GEN(wr_lo);
3217
3218		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3219		    wr_hi, wr_lo, gen);
3220		for (j = 2; j < 30; j += 4)
3221			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3222			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3223
3224	}
3225	err = sbuf_finish(sb);
3226	/* Output a trailing NUL. */
3227	if (err == 0)
3228		err = SYSCTL_OUT(req, "", 1);
3229	sbuf_delete(sb);
3230	return (err);
3231}
3232
3233static int
3234t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS)
3235{
3236	struct sge_txq *txq;
3237	struct sge_qset *qs;
3238	int i, j, err, dump_end;
3239	struct sbuf *sb;
3240	struct tx_desc *txd;
3241	uint32_t *WR, wr_hi, wr_lo, gen;
3242
3243	txq = arg1;
3244	qs = txq_to_qset(txq, TXQ_CTRL);
3245	if (txq->txq_dump_count == 0) {
3246		return (0);
3247	}
3248	if (txq->txq_dump_count > 256) {
3249		log(LOG_WARNING,
3250		    "dump count is too large %d\n", txq->txq_dump_count);
3251		txq->txq_dump_count = 1;
3252		return (EINVAL);
3253	}
3254	if (txq->txq_dump_start > 255) {
3255		log(LOG_WARNING,
3256		    "dump start of %d is greater than queue size\n",
3257		    txq->txq_dump_start);
3258		txq->txq_dump_start = 0;
3259		return (EINVAL);
3260	}
3261
3262	err = sysctl_wire_old_buffer(req, 0);
3263	if (err != 0)
3264		return (err);
3265	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3266	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3267	    txq->txq_dump_start,
3268	    (txq->txq_dump_start + txq->txq_dump_count) & 255);
3269
3270	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3271	for (i = txq->txq_dump_start; i < dump_end; i++) {
3272		txd = &txq->desc[i & (255)];
3273		WR = (uint32_t *)txd->flit;
3274		wr_hi = ntohl(WR[0]);
3275		wr_lo = ntohl(WR[1]);
3276		gen = G_WR_GEN(wr_lo);
3277
3278		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3279		    wr_hi, wr_lo, gen);
3280		for (j = 2; j < 30; j += 4)
3281			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3282			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3283
3284	}
3285	err = sbuf_finish(sb);
3286	/* Output a trailing NUL. */
3287	if (err == 0)
3288		err = SYSCTL_OUT(req, "", 1);
3289	sbuf_delete(sb);
3290	return (err);
3291}
3292
3293static int
3294t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS)
3295{
3296	adapter_t *sc = arg1;
3297	struct qset_params *qsp = &sc->params.sge.qset[0];
3298	int coalesce_usecs;
3299	struct sge_qset *qs;
3300	int i, j, err, nqsets = 0;
3301	struct mtx *lock;
3302
3303	if ((sc->flags & FULL_INIT_DONE) == 0)
3304		return (ENXIO);
3305
3306	coalesce_usecs = qsp->coalesce_usecs;
3307	err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req);
3308
3309	if (err != 0) {
3310		return (err);
3311	}
3312	if (coalesce_usecs == qsp->coalesce_usecs)
3313		return (0);
3314
3315	for (i = 0; i < sc->params.nports; i++)
3316		for (j = 0; j < sc->port[i].nqsets; j++)
3317			nqsets++;
3318
3319	coalesce_usecs = max(1, coalesce_usecs);
3320
3321	for (i = 0; i < nqsets; i++) {
3322		qs = &sc->sge.qs[i];
3323		qsp = &sc->params.sge.qset[i];
3324		qsp->coalesce_usecs = coalesce_usecs;
3325
3326		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
3327			    &sc->sge.qs[0].rspq.lock;
3328
3329		mtx_lock(lock);
3330		t3_update_qset_coalesce(qs, qsp);
3331		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
3332		    V_NEWTIMER(qs->rspq.holdoff_tmr));
3333		mtx_unlock(lock);
3334	}
3335
3336	return (0);
3337}
3338
3339static int
3340t3_pkt_timestamp(SYSCTL_HANDLER_ARGS)
3341{
3342	adapter_t *sc = arg1;
3343	int rc, timestamp;
3344
3345	if ((sc->flags & FULL_INIT_DONE) == 0)
3346		return (ENXIO);
3347
3348	timestamp = sc->timestamp;
3349	rc = sysctl_handle_int(oidp, &timestamp, arg2, req);
3350
3351	if (rc != 0)
3352		return (rc);
3353
3354	if (timestamp != sc->timestamp) {
3355		t3_set_reg_field(sc, A_TP_PC_CONFIG2, F_ENABLERXPKTTMSTPRSS,
3356		    timestamp ? F_ENABLERXPKTTMSTPRSS : 0);
3357		sc->timestamp = timestamp;
3358	}
3359
3360	return (0);
3361}
3362
3363void
3364t3_add_attach_sysctls(adapter_t *sc)
3365{
3366	struct sysctl_ctx_list *ctx;
3367	struct sysctl_oid_list *children;
3368
3369	ctx = device_get_sysctl_ctx(sc->dev);
3370	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3371
3372	/* random information */
3373	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3374	    "firmware_version",
3375	    CTLFLAG_RD, sc->fw_version,
3376	    0, "firmware version");
3377	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3378	    "hw_revision",
3379	    CTLFLAG_RD, &sc->params.rev,
3380	    0, "chip model");
3381	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3382	    "port_types",
3383	    CTLFLAG_RD, sc->port_types,
3384	    0, "type of ports");
3385	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3386	    "enable_debug",
3387	    CTLFLAG_RW, &cxgb_debug,
3388	    0, "enable verbose debugging output");
3389	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tunq_coalesce",
3390	    CTLFLAG_RD, &sc->tunq_coalesce,
3391	    "#tunneled packets freed");
3392	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3393	    "txq_overrun",
3394	    CTLFLAG_RD, &txq_fills,
3395	    0, "#times txq overrun");
3396	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3397	    "core_clock",
3398	    CTLFLAG_RD, &sc->params.vpd.cclk,
3399	    0, "core clock frequency (in KHz)");
3400}
3401
3402
3403static const char *rspq_name = "rspq";
3404static const char *txq_names[] =
3405{
3406	"txq_eth",
3407	"txq_ofld",
3408	"txq_ctrl"
3409};
3410
3411static int
3412sysctl_handle_macstat(SYSCTL_HANDLER_ARGS)
3413{
3414	struct port_info *p = arg1;
3415	uint64_t *parg;
3416
3417	if (!p)
3418		return (EINVAL);
3419
3420	parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2);
3421	PORT_LOCK(p);
3422	t3_mac_update_stats(&p->mac);
3423	PORT_UNLOCK(p);
3424
3425	return (sysctl_handle_64(oidp, parg, 0, req));
3426}
3427
3428void
3429t3_add_configured_sysctls(adapter_t *sc)
3430{
3431	struct sysctl_ctx_list *ctx;
3432	struct sysctl_oid_list *children;
3433	int i, j;
3434
3435	ctx = device_get_sysctl_ctx(sc->dev);
3436	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3437
3438	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3439	    "intr_coal",
3440	    CTLTYPE_INT|CTLFLAG_RW, sc,
3441	    0, t3_set_coalesce_usecs,
3442	    "I", "interrupt coalescing timer (us)");
3443
3444	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3445	    "pkt_timestamp",
3446	    CTLTYPE_INT | CTLFLAG_RW, sc,
3447	    0, t3_pkt_timestamp,
3448	    "I", "provide packet timestamp instead of connection hash");
3449
3450	for (i = 0; i < sc->params.nports; i++) {
3451		struct port_info *pi = &sc->port[i];
3452		struct sysctl_oid *poid;
3453		struct sysctl_oid_list *poidlist;
3454		struct mac_stats *mstats = &pi->mac.stats;
3455
3456		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
3457		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
3458		    pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
3459		poidlist = SYSCTL_CHILDREN(poid);
3460		SYSCTL_ADD_UINT(ctx, poidlist, OID_AUTO,
3461		    "nqsets", CTLFLAG_RD, &pi->nqsets,
3462		    0, "#queue sets");
3463
3464		for (j = 0; j < pi->nqsets; j++) {
3465			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
3466			struct sysctl_oid *qspoid, *rspqpoid, *txqpoid,
3467					  *ctrlqpoid, *lropoid;
3468			struct sysctl_oid_list *qspoidlist, *rspqpoidlist,
3469					       *txqpoidlist, *ctrlqpoidlist,
3470					       *lropoidlist;
3471			struct sge_txq *txq = &qs->txq[TXQ_ETH];
3472
3473			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
3474
3475			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
3476			    qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
3477			qspoidlist = SYSCTL_CHILDREN(qspoid);
3478
3479			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty",
3480					CTLFLAG_RD, &qs->fl[0].empty, 0,
3481					"freelist #0 empty");
3482			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty",
3483					CTLFLAG_RD, &qs->fl[1].empty, 0,
3484					"freelist #1 empty");
3485
3486			rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3487			    rspq_name, CTLFLAG_RD, NULL, "rspq statistics");
3488			rspqpoidlist = SYSCTL_CHILDREN(rspqpoid);
3489
3490			txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3491			    txq_names[0], CTLFLAG_RD, NULL, "txq statistics");
3492			txqpoidlist = SYSCTL_CHILDREN(txqpoid);
3493
3494			ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3495			    txq_names[2], CTLFLAG_RD, NULL, "ctrlq statistics");
3496			ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid);
3497
3498			lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3499			    "lro_stats", CTLFLAG_RD, NULL, "LRO statistics");
3500			lropoidlist = SYSCTL_CHILDREN(lropoid);
3501
3502			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size",
3503			    CTLFLAG_RD, &qs->rspq.size,
3504			    0, "#entries in response queue");
3505			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx",
3506			    CTLFLAG_RD, &qs->rspq.cidx,
3507			    0, "consumer index");
3508			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits",
3509			    CTLFLAG_RD, &qs->rspq.credits,
3510			    0, "#credits");
3511			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "starved",
3512			    CTLFLAG_RD, &qs->rspq.starved,
3513			    0, "#times starved");
3514			SYSCTL_ADD_UAUTO(ctx, rspqpoidlist, OID_AUTO, "phys_addr",
3515			    CTLFLAG_RD, &qs->rspq.phys_addr,
3516			    "physical_address_of the queue");
3517			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start",
3518			    CTLFLAG_RW, &qs->rspq.rspq_dump_start,
3519			    0, "start rspq dump entry");
3520			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count",
3521			    CTLFLAG_RW, &qs->rspq.rspq_dump_count,
3522			    0, "#rspq entries to dump");
3523			SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump",
3524			    CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq,
3525			    0, t3_dump_rspq, "A", "dump of the response queue");
3526
3527			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "dropped",
3528			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_mr->br_drops,
3529			    "#tunneled packets dropped");
3530			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
3531			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen,
3532			    0, "#tunneled packets waiting to be sent");
3533#if 0
3534			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
3535			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
3536			    0, "#tunneled packets queue producer index");
3537			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
3538			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
3539			    0, "#tunneled packets queue consumer index");
3540#endif
3541			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "processed",
3542			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
3543			    0, "#tunneled packets processed by the card");
3544			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned",
3545			    CTLFLAG_RD, &txq->cleaned,
3546			    0, "#tunneled packets cleaned");
3547			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use",
3548			    CTLFLAG_RD, &txq->in_use,
3549			    0, "#tunneled packet slots in use");
3550			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "frees",
3551			    CTLFLAG_RD, &txq->txq_frees,
3552			    "#tunneled packets freed");
3553			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped",
3554			    CTLFLAG_RD, &txq->txq_skipped,
3555			    0, "#tunneled packet descriptors skipped");
3556			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "coalesced",
3557			    CTLFLAG_RD, &txq->txq_coalesced,
3558			    "#tunneled packets coalesced");
3559			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued",
3560			    CTLFLAG_RD, &txq->txq_enqueued,
3561			    0, "#tunneled packets enqueued to hardware");
3562			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags",
3563			    CTLFLAG_RD, &qs->txq_stopped,
3564			    0, "tx queues stopped");
3565			SYSCTL_ADD_UAUTO(ctx, txqpoidlist, OID_AUTO, "phys_addr",
3566			    CTLFLAG_RD, &txq->phys_addr,
3567			    "physical_address_of the queue");
3568			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen",
3569			    CTLFLAG_RW, &qs->txq[TXQ_ETH].gen,
3570			    0, "txq generation");
3571			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx",
3572			    CTLFLAG_RD, &txq->cidx,
3573			    0, "hardware queue cidx");
3574			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx",
3575			    CTLFLAG_RD, &txq->pidx,
3576			    0, "hardware queue pidx");
3577			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start",
3578			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start,
3579			    0, "txq start idx for dump");
3580			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count",
3581			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count,
3582			    0, "txq #entries to dump");
3583			SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump",
3584			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH],
3585			    0, t3_dump_txq_eth, "A", "dump of the transmit queue");
3586
3587			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start",
3588			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start,
3589			    0, "ctrlq start idx for dump");
3590			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count",
3591			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count,
3592			    0, "ctrl #entries to dump");
3593			SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump",
3594			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_CTRL],
3595			    0, t3_dump_txq_ctrl, "A", "dump of the transmit queue");
3596
3597			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_queued",
3598			    CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, NULL);
3599			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_flushed",
3600			    CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, NULL);
3601			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_bad_csum",
3602			    CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, NULL);
3603			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt",
3604			    CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, NULL);
3605		}
3606
3607		/* Now add a node for mac stats. */
3608		poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats",
3609		    CTLFLAG_RD, NULL, "MAC statistics");
3610		poidlist = SYSCTL_CHILDREN(poid);
3611
3612		/*
3613		 * We (ab)use the length argument (arg2) to pass on the offset
3614		 * of the data that we are interested in.  This is only required
3615		 * for the quad counters that are updated from the hardware (we
3616		 * make sure that we return the latest value).
3617		 * sysctl_handle_macstat first updates *all* the counters from
3618		 * the hardware, and then returns the latest value of the
3619		 * requested counter.  Best would be to update only the
3620		 * requested counter from hardware, but t3_mac_update_stats()
3621		 * hides all the register details and we don't want to dive into
3622		 * all that here.
3623		 */
3624#define CXGB_SYSCTL_ADD_QUAD(a)	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \
3625    (CTLTYPE_U64 | CTLFLAG_RD), pi, offsetof(struct mac_stats, a), \
3626    sysctl_handle_macstat, "QU", 0)
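		/*
		 * For reference, a single use of the macro above expands to
		 * roughly the following (shown for tx_octets): the stringized
		 * counter name becomes the sysctl name, and its offset within
		 * struct mac_stats rides in arg2 for the handler.
		 *
		 *	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, "tx_octets",
		 *	    (CTLTYPE_U64 | CTLFLAG_RD), pi,
		 *	    offsetof(struct mac_stats, tx_octets),
		 *	    sysctl_handle_macstat, "QU", 0);
		 */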
3627		CXGB_SYSCTL_ADD_QUAD(tx_octets);
3628		CXGB_SYSCTL_ADD_QUAD(tx_octets_bad);
3629		CXGB_SYSCTL_ADD_QUAD(tx_frames);
3630		CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames);
3631		CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames);
3632		CXGB_SYSCTL_ADD_QUAD(tx_pause);
3633		CXGB_SYSCTL_ADD_QUAD(tx_deferred);
3634		CXGB_SYSCTL_ADD_QUAD(tx_late_collisions);
3635		CXGB_SYSCTL_ADD_QUAD(tx_total_collisions);
3636		CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions);
3637		CXGB_SYSCTL_ADD_QUAD(tx_underrun);
3638		CXGB_SYSCTL_ADD_QUAD(tx_len_errs);
3639		CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs);
3640		CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral);
3641		CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs);
3642		CXGB_SYSCTL_ADD_QUAD(tx_frames_64);
3643		CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127);
3644		CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255);
3645		CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511);
3646		CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023);
3647		CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518);
3648		CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max);
3649		CXGB_SYSCTL_ADD_QUAD(rx_octets);
3650		CXGB_SYSCTL_ADD_QUAD(rx_octets_bad);
3651		CXGB_SYSCTL_ADD_QUAD(rx_frames);
3652		CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames);
3653		CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames);
3654		CXGB_SYSCTL_ADD_QUAD(rx_pause);
3655		CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs);
3656		CXGB_SYSCTL_ADD_QUAD(rx_align_errs);
3657		CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs);
3658		CXGB_SYSCTL_ADD_QUAD(rx_data_errs);
3659		CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs);
3660		CXGB_SYSCTL_ADD_QUAD(rx_runt);
3661		CXGB_SYSCTL_ADD_QUAD(rx_jabber);
3662		CXGB_SYSCTL_ADD_QUAD(rx_short);
3663		CXGB_SYSCTL_ADD_QUAD(rx_too_long);
3664		CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs);
3665		CXGB_SYSCTL_ADD_QUAD(rx_cong_drops);
3666		CXGB_SYSCTL_ADD_QUAD(rx_frames_64);
3667		CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127);
3668		CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255);
3669		CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511);
3670		CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023);
3671		CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518);
3672		CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max);
3673#undef CXGB_SYSCTL_ADD_QUAD
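		/*
		 * Illustrative sketch only: the handler wired up above is
		 * expected to have roughly this shape, i.e. refresh every MAC
		 * counter and then hand back the one whose offsetof() was
		 * stashed in arg2.  The in-tree sysctl_handle_macstat()
		 * defined earlier in this file is authoritative; the function
		 * name below and the omission of port locking are
		 * simplifications for exposition.
		 *
		 *	static int
		 *	example_macstat_handler(SYSCTL_HANDLER_ARGS)
		 *	{
		 *		struct port_info *p = arg1;
		 *		uint64_t *counter;
		 *
		 *		t3_mac_update_stats(&p->mac);
		 *		counter = (uint64_t *)
		 *		    ((uintptr_t)&p->mac.stats + arg2);
		 *		return (sysctl_handle_64(oidp, counter, 0, req));
		 *	}
		 */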
3674
3675#define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \
3676    CTLFLAG_RD, &mstats->a, 0)
3677		CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err);
3678		CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err);
3679		CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun);
3680		CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl);
3681		CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss);
3682		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err);
3683		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change);
3684		CXGB_SYSCTL_ADD_ULONG(num_toggled);
3685		CXGB_SYSCTL_ADD_ULONG(num_resets);
3686		CXGB_SYSCTL_ADD_ULONG(link_faults);
3687#undef CXGB_SYSCTL_ADD_ULONG
3688	}
3689}
3690
3691/**
3692 *	t3_get_desc - dump an SGE descriptor for debugging purposes
3693 *	@qs: the queue set
3694 *	@qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
3695 *	@qnum: identifies the specific queue (0..2: Tx, 3: response, 4..5: Rx)
3696 *	@data: where to dump the descriptor contents
3697 *
3698 *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
3699 *	size of the descriptor.
3700 */
3701int
3702t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
3703		unsigned char *data)
3704{
3705	if (qnum >= 6)
3706		return (EINVAL);
3707
3708	if (qnum < 3) {
3709		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
3710			return (EINVAL);
3711		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
3712		return (sizeof(struct tx_desc));
3713	}
3714
3715	if (qnum == 3) {
3716		if (!qs->rspq.desc || idx >= qs->rspq.size)
3717			return (EINVAL);
3718		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
3719		return (sizeof(struct rsp_desc));
3720	}
3721
3722	qnum -= 4;
3723	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
3724		return (EINVAL);
3725	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
3726	return (sizeof(struct rx_desc));
3727}
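
/*
 * Illustrative sketch only (not part of the driver): one way a debugging
 * caller might use t3_get_desc() to snapshot an Ethernet Tx descriptor.
 * The function and buffer names are hypothetical, and the buffer is sized
 * for the Tx case only (qnum 0..2).
 */
#if 0
static int
example_dump_tx_desc(const struct sge_qset *qs, unsigned int idx)
{
	unsigned char buf[sizeof(struct tx_desc)];
	int len;

	/* qnum 0..2 index qs->txq[]; 0 is the Ethernet Tx queue. */
	len = t3_get_desc(qs, 0, idx, buf);
	if (len == EINVAL)
		return (EINVAL);

	/* buf now holds 'len' bytes of raw descriptor contents. */
	return (0);
}
#endif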
3728