1/**************************************************************************
2
3Copyright (c) 2007-2009, Chelsio Inc.
4All rights reserved.
5
6Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10    this list of conditions and the following disclaimer.
11
12 2. Neither the name of the Chelsio Corporation nor the names of its
13    contributors may be used to endorse or promote products derived from
14    this software without specific prior written permission.
15
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26POSSIBILITY OF SUCH DAMAGE.
27
28***************************************************************************/
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: stable/10/sys/dev/cxgb/cxgb_sge.c 273736 2014-10-27 14:38:00Z hselasky $");
32
33#include "opt_inet6.h"
34#include "opt_inet.h"
35
36#include <sys/param.h>
37#include <sys/systm.h>
38#include <sys/kernel.h>
39#include <sys/module.h>
40#include <sys/bus.h>
41#include <sys/conf.h>
42#include <machine/bus.h>
43#include <machine/resource.h>
44#include <sys/bus_dma.h>
45#include <sys/rman.h>
46#include <sys/queue.h>
47#include <sys/sysctl.h>
48#include <sys/taskqueue.h>
49
50#include <sys/proc.h>
51#include <sys/sbuf.h>
52#include <sys/sched.h>
53#include <sys/smp.h>
55#include <sys/syslog.h>
56#include <sys/socket.h>
57#include <sys/sglist.h>
58
59#include <net/bpf.h>
60#include <net/ethernet.h>
61#include <net/if.h>
62#include <net/if_vlan_var.h>
63
64#include <netinet/in_systm.h>
65#include <netinet/in.h>
66#include <netinet/ip.h>
67#include <netinet/ip6.h>
68#include <netinet/tcp.h>
69
70#include <dev/pci/pcireg.h>
71#include <dev/pci/pcivar.h>
72
73#include <vm/vm.h>
74#include <vm/pmap.h>
75
76#include <cxgb_include.h>
77#include <sys/mvec.h>
78
79int	txq_fills = 0;
80int	multiq_tx_enable = 1;
81
82#ifdef TCP_OFFLOAD
83CTASSERT(NUM_CPL_HANDLERS >= NUM_CPL_CMDS);
84#endif
85
86extern struct sysctl_oid_list sysctl__hw_cxgb_children;
87int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE;
88TUNABLE_INT("hw.cxgb.txq_mr_size", &cxgb_txq_buf_ring_size);
89SYSCTL_INT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0,
90    "size of per-queue mbuf ring");
91
92static int cxgb_tx_coalesce_force = 0;
93TUNABLE_INT("hw.cxgb.tx_coalesce_force", &cxgb_tx_coalesce_force);
94SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RW,
95    &cxgb_tx_coalesce_force, 0,
96    "coalesce small packets into a single work request regardless of ring state");
97
98#define	COALESCE_START_DEFAULT		TX_ETH_Q_SIZE>>1
99#define	COALESCE_START_MAX		(TX_ETH_Q_SIZE-(TX_ETH_Q_SIZE>>3))
100#define	COALESCE_STOP_DEFAULT		TX_ETH_Q_SIZE>>2
101#define	COALESCE_STOP_MIN		TX_ETH_Q_SIZE>>5
102#define	TX_RECLAIM_DEFAULT		TX_ETH_Q_SIZE>>5
103#define	TX_RECLAIM_MAX			TX_ETH_Q_SIZE>>2
104#define	TX_RECLAIM_MIN			TX_ETH_Q_SIZE>>6
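/*
 * The thresholds above are fractions of the Ethernet Tx ring size: start
 * coalescing at 1/2 full (capped at 7/8), stop coalescing below 1/4 full
 * (floored at 1/32), and reclaim in batches between 1/64 and 1/4 of the
 * ring, defaulting to 1/32.
 */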
105
106
107static int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT;
108TUNABLE_INT("hw.cxgb.tx_coalesce_enable_start",
109    &cxgb_tx_coalesce_enable_start);
110SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RW,
111    &cxgb_tx_coalesce_enable_start, 0,
112    "coalesce enable threshold");
113static int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT;
114TUNABLE_INT("hw.cxgb.tx_coalesce_enable_stop", &cxgb_tx_coalesce_enable_stop);
115SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RW,
116    &cxgb_tx_coalesce_enable_stop, 0,
117    "coalesce disable threshold");
118static int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
119TUNABLE_INT("hw.cxgb.tx_reclaim_threshold", &cxgb_tx_reclaim_threshold);
120SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RW,
121    &cxgb_tx_reclaim_threshold, 0,
122    "tx cleaning minimum threshold");
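/*
 * Example (hypothetical values): the knobs above that are marked CTLFLAG_RW
 * can be adjusted at runtime, e.g.
 *
 *   sysctl hw.cxgb.tx_coalesce_force=1
 *   sysctl hw.cxgb.tx_reclaim_threshold=32
 *
 * while hw.cxgb.txq_mr_size is a boot-time tunable only (CTLFLAG_RDTUN).
 */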
123
124/*
125 * XXX don't re-enable this until TOE stops assuming
126 * we have an m_ext
127 */
128static int recycle_enable = 0;
129
130extern int cxgb_use_16k_clusters;
131extern int nmbjumbop;
132extern int nmbjumbo9;
133extern int nmbjumbo16;
134
135#define USE_GTS 0
136
137#define SGE_RX_SM_BUF_SIZE	1536
138#define SGE_RX_DROP_THRES	16
139#define SGE_RX_COPY_THRES	128
140
141/*
142 * Period of the Tx buffer reclaim timer.  This timer does not need to run
143 * frequently as Tx buffers are usually reclaimed by new Tx packets.
144 */
145#define TX_RECLAIM_PERIOD       (hz >> 1)
146
147/*
148 * Values for sge_txq.flags
149 */
150enum {
151	TXQ_RUNNING	= 1 << 0,  /* fetch engine is running */
152	TXQ_LAST_PKT_DB = 1 << 1,  /* last packet rang the doorbell */
153};
154
155struct tx_desc {
156	uint64_t	flit[TX_DESC_FLITS];
157} __packed;
158
159struct rx_desc {
160	uint32_t	addr_lo;
161	uint32_t	len_gen;
162	uint32_t	gen2;
163	uint32_t	addr_hi;
164} __packed;
165
166struct rsp_desc {               /* response queue descriptor */
167	struct rss_header	rss_hdr;
168	uint32_t		flags;
169	uint32_t		len_cq;
170	uint8_t			imm_data[47];
171	uint8_t			intr_gen;
172} __packed;
173
174#define RX_SW_DESC_MAP_CREATED	(1 << 0)
175#define TX_SW_DESC_MAP_CREATED	(1 << 1)
176#define RX_SW_DESC_INUSE        (1 << 3)
177#define TX_SW_DESC_MAPPED       (1 << 4)
178
179#define RSPQ_NSOP_NEOP           G_RSPD_SOP_EOP(0)
180#define RSPQ_EOP                 G_RSPD_SOP_EOP(F_RSPD_EOP)
181#define RSPQ_SOP                 G_RSPD_SOP_EOP(F_RSPD_SOP)
182#define RSPQ_SOP_EOP             G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
183
184struct tx_sw_desc {                /* SW state per Tx descriptor */
185	struct mbuf	*m;
186	bus_dmamap_t	map;
187	int		flags;
188};
189
190struct rx_sw_desc {                /* SW state per Rx descriptor */
191	caddr_t		rxsd_cl;
192	struct mbuf	*m;
193	bus_dmamap_t	map;
194	int		flags;
195};
196
197struct txq_state {
198	unsigned int	compl;
199	unsigned int	gen;
200	unsigned int	pidx;
201};
202
203struct refill_fl_cb_arg {
204	int               error;
205	bus_dma_segment_t seg;
206	int               nseg;
207};
208
209
210/*
211 * Maps a number of flits to the number of Tx descriptors that can hold them.
212 * The formula is
213 *
214 * desc = 1 + (flits - 2) / (WR_FLITS - 1).
215 *
216 * HW allows up to 4 descriptors to be combined into a WR.
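 *
 * For example, with SGE_NUM_GENBITS == 2 the table below maps 16-29 flits
 * to 2 descriptors and 30-43 flits to 3.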
217 */
218static uint8_t flit_desc_map[] = {
219	0,
220#if SGE_NUM_GENBITS == 1
221	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
222	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
223	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
224	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
225#elif SGE_NUM_GENBITS == 2
226	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
227	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
228	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
229	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
230#else
231# error "SGE_NUM_GENBITS must be 1 or 2"
232#endif
233};
234
235#define	TXQ_LOCK_ASSERT(qs)	mtx_assert(&(qs)->lock, MA_OWNED)
236#define	TXQ_TRYLOCK(qs)		mtx_trylock(&(qs)->lock)
237#define	TXQ_LOCK(qs)		mtx_lock(&(qs)->lock)
238#define	TXQ_UNLOCK(qs)		mtx_unlock(&(qs)->lock)
239#define	TXQ_RING_EMPTY(qs)	drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
240#define	TXQ_RING_NEEDS_ENQUEUE(qs)					\
241	drbr_needs_enqueue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
242#define	TXQ_RING_FLUSH(qs)	drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
243#define	TXQ_RING_DEQUEUE_COND(qs, func, arg)				\
244	drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg)
245#define	TXQ_RING_DEQUEUE(qs) \
246	drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr)
247
248int cxgb_debug = 0;
249
250static void sge_timer_cb(void *arg);
251static void sge_timer_reclaim(void *arg, int ncount);
252static void sge_txq_reclaim_handler(void *arg, int ncount);
253static void cxgb_start_locked(struct sge_qset *qs);
254
255/*
256 * XXX need to cope with bursty scheduling by looking at a wider window
257 * than we do now when determining the need for coalescing
258 *
259 */
260static __inline uint64_t
261check_pkt_coalesce(struct sge_qset *qs)
262{
263        struct adapter *sc;
264        struct sge_txq *txq;
265	uint8_t *fill;
266
267	if (__predict_false(cxgb_tx_coalesce_force))
268		return (1);
269	txq = &qs->txq[TXQ_ETH];
270        sc = qs->port->adapter;
271	fill = &sc->tunq_fill[qs->idx];
272
273	if (cxgb_tx_coalesce_enable_start > COALESCE_START_MAX)
274		cxgb_tx_coalesce_enable_start = COALESCE_START_MAX;
275	if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN)
276		cxgb_tx_coalesce_enable_stop = COALESCE_STOP_MIN;
277	/*
278	 * If the hardware transmit queue fills beyond the coalesce-start
279	 * threshold we mark it as coalescing; we drop back from coalescing
280	 * once it drains below the coalesce-stop threshold and there are no
281	 * packets enqueued, which provides some degree of hysteresis.
282	 */
283        if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
284	    TXQ_RING_EMPTY(qs) && (qs->coalescing == 0))
285                *fill = 0;
286        else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start))
287                *fill = 1;
288
289	return (sc->tunq_coalesce);
290}
291
292#ifdef __LP64__
293static void
294set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
295{
296	uint64_t wr_hilo;
297#if _BYTE_ORDER == _LITTLE_ENDIAN
298	wr_hilo = wr_hi;
299	wr_hilo |= (((uint64_t)wr_lo)<<32);
300#else
301	wr_hilo = wr_lo;
302	wr_hilo |= (((uint64_t)wr_hi)<<32);
303#endif
304	wrp->wrh_hilo = wr_hilo;
305}
306#else
307static void
308set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo)
309{
310
311	wrp->wrh_hi = wr_hi;
312	wmb();
313	wrp->wrh_lo = wr_lo;
314}
315#endif
316
317struct coalesce_info {
318	int count;
319	int nbytes;
320};
321
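/*
 * drbr_dequeue_cond() callback used when coalescing: the first packet is
 * always accepted; each subsequent packet must be a single mbuf and must
 * keep the work request within 7 packets and 10500 payload bytes.
 */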
322static int
323coalesce_check(struct mbuf *m, void *arg)
324{
325	struct coalesce_info *ci = arg;
326	int *count = &ci->count;
327	int *nbytes = &ci->nbytes;
328
329	if ((*nbytes == 0) || ((*nbytes + m->m_len <= 10500) &&
330		(*count < 7) && (m->m_next == NULL))) {
331		*count += 1;
332		*nbytes += m->m_len;
333		return (1);
334	}
335	return (0);
336}
337
338static struct mbuf *
339cxgb_dequeue(struct sge_qset *qs)
340{
341	struct mbuf *m, *m_head, *m_tail;
342	struct coalesce_info ci;
343
344
345	if (check_pkt_coalesce(qs) == 0)
346		return TXQ_RING_DEQUEUE(qs);
347
348	m_head = m_tail = NULL;
349	ci.count = ci.nbytes = 0;
350	do {
351		m = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci);
352		if (m_head == NULL) {
353			m_tail = m_head = m;
354		} else if (m != NULL) {
355			m_tail->m_nextpkt = m;
356			m_tail = m;
357		}
358	} while (m != NULL);
359	if (ci.count > 7)
360		panic("trying to coalesce %d packets in to one WR", ci.count);
361	return (m_head);
362}
363
364/**
365 *	reclaim_completed_tx - reclaims completed Tx descriptors
366 *	@adapter: the adapter
367 *	@q: the Tx queue to reclaim completed descriptors from
368 *
369 *	Reclaims Tx descriptors that the SGE has indicated it has processed,
370 *	and frees the associated buffers if possible.  Called with the Tx
371 *	queue's lock held.
372 */
373static __inline int
374reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue)
375{
376	struct sge_txq *q = &qs->txq[queue];
377	int reclaim = desc_reclaimable(q);
378
379	if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) ||
380	    (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN))
381		cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT;
382
383	if (reclaim < reclaim_min)
384		return (0);
385
386	mtx_assert(&qs->lock, MA_OWNED);
387	if (reclaim > 0) {
388		t3_free_tx_desc(qs, reclaim, queue);
389		q->cleaned += reclaim;
390		q->in_use -= reclaim;
391	}
392	if (isset(&qs->txq_stopped, TXQ_ETH))
393                clrbit(&qs->txq_stopped, TXQ_ETH);
394
395	return (reclaim);
396}
397
398/**
399 *	should_restart_tx - are there enough resources to restart a Tx queue?
400 *	@q: the Tx queue
401 *
402 *	Checks if there are enough descriptors to restart a suspended Tx queue.
403 */
404static __inline int
405should_restart_tx(const struct sge_txq *q)
406{
407	unsigned int r = q->processed - q->cleaned;
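	/* r counts descriptors already processed by the SGE but not yet reclaimed. */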
408
409	return q->in_use - r < (q->size >> 1);
410}
411
412/**
413 *	t3_sge_init - initialize SGE
414 *	@adap: the adapter
415 *	@p: the SGE parameters
416 *
417 *	Performs SGE initialization needed every time after a chip reset.
418 *	We do not initialize any of the queue sets here, instead the driver
419 *	top-level must request those individually.  We also do not enable DMA
420 *	here, that should be done after the queues have been set up.
421 */
422void
423t3_sge_init(adapter_t *adap, struct sge_params *p)
424{
425	u_int ctrl, ups;
426
427	ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */
428
429	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
430	       F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
431	       V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
432	       V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
433#if SGE_NUM_GENBITS == 1
434	ctrl |= F_EGRGENCTRL;
435#endif
436	if (adap->params.rev > 0) {
437		if (!(adap->flags & (USING_MSIX | USING_MSI)))
438			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
439	}
440	t3_write_reg(adap, A_SG_CONTROL, ctrl);
441	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
442		     V_LORCQDRBTHRSH(512));
443	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
444	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
445		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
446	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
447		     adap->params.rev < T3_REV_C ? 1000 : 500);
448	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
449	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
450	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
451	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
452	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
453}
454
455
456/**
457 *	sgl_len - calculates the size of an SGL of the given capacity
458 *	@n: the number of SGL entries
459 *
460 *	Calculates the number of flits needed for a scatter/gather list that
461 *	can hold the given number of entries.
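 *	Two entries share one 24-byte sg_ent (two 32-bit lengths plus two
 *	64-bit addresses), i.e. 3 flits per pair, with a trailing odd entry
 *	rounded up to 2 flits.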
462 */
463static __inline unsigned int
464sgl_len(unsigned int n)
465{
466	return ((3 * n) / 2 + (n & 1));
467}
468
469/**
470 *	get_imm_packet - return the next ingress packet buffer from a response
471 *	@resp: the response descriptor containing the packet data
472 *
473 *	Return a packet containing the immediate data of the given response.
474 */
475static int
476get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m)
477{
478
479	if (resp->rss_hdr.opcode == CPL_RX_DATA) {
480		const struct cpl_rx_data *cpl = (const void *)&resp->imm_data[0];
481		m->m_len = sizeof(*cpl) + ntohs(cpl->len);
482	} else if (resp->rss_hdr.opcode == CPL_RX_PKT) {
483		const struct cpl_rx_pkt *cpl = (const void *)&resp->imm_data[0];
484		m->m_len = sizeof(*cpl) + ntohs(cpl->len);
485	} else
486		m->m_len = IMMED_PKT_SIZE;
487	m->m_ext.ext_buf = NULL;
488	m->m_ext.ext_type = 0;
489	memcpy(mtod(m, uint8_t *), resp->imm_data, m->m_len);
490	return (0);
491}
492
493static __inline u_int
494flits_to_desc(u_int n)
495{
496	return (flit_desc_map[n]);
497}
498
499#define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
500		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
501		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
502		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
503		    F_HIRCQPARITYERROR)
504#define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
505#define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
506		      F_RSPQDISABLED)
507
508/**
509 *	t3_sge_err_intr_handler - SGE async event interrupt handler
510 *	@adapter: the adapter
511 *
512 *	Interrupt handler for SGE asynchronous (non-data) events.
513 */
514void
515t3_sge_err_intr_handler(adapter_t *adapter)
516{
517	unsigned int v, status;
518
519	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
520	if (status & SGE_PARERR)
521		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
522			 status & SGE_PARERR);
523	if (status & SGE_FRAMINGERR)
524		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
525			 status & SGE_FRAMINGERR);
526	if (status & F_RSPQCREDITOVERFOW)
527		CH_ALERT(adapter, "SGE response queue credit overflow\n");
528
529	if (status & F_RSPQDISABLED) {
530		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
531
532		CH_ALERT(adapter,
533			 "packet delivered to disabled response queue (0x%x)\n",
534			 (v >> S_RSPQ0DISABLED) & 0xff);
535	}
536
537	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
538	if (status & SGE_FATALERR)
539		t3_fatal_err(adapter);
540}
541
542void
543t3_sge_prep(adapter_t *adap, struct sge_params *p)
544{
545	int i, nqsets, fl_q_size, jumbo_q_size, use_16k, jumbo_buf_size;
546
547	nqsets = min(SGE_QSETS / adap->params.nports, mp_ncpus);
548	nqsets *= adap->params.nports;
549
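	/*
	 * Size each free list so that, across all queue sets, the free lists
	 * consume at most a third of the respective cluster pool; round down
	 * to a power of two below.
	 */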
550	fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE);
551
552	while (!powerof2(fl_q_size))
553		fl_q_size--;
554
555	use_16k = cxgb_use_16k_clusters != -1 ? cxgb_use_16k_clusters :
556	    is_offload(adap);
557
558#if __FreeBSD_version >= 700111
559	if (use_16k) {
560		jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE);
561		jumbo_buf_size = MJUM16BYTES;
562	} else {
563		jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE);
564		jumbo_buf_size = MJUM9BYTES;
565	}
566#else
567	jumbo_q_size = min(nmbjumbop/(3*nqsets), JUMBO_Q_SIZE);
568	jumbo_buf_size = MJUMPAGESIZE;
569#endif
570	while (!powerof2(jumbo_q_size))
571		jumbo_q_size--;
572
573	if (fl_q_size < (FL_Q_SIZE / 4) || jumbo_q_size < (JUMBO_Q_SIZE / 2))
574		device_printf(adap->dev,
575		    "Insufficient clusters and/or jumbo buffers.\n");
576
577	p->max_pkt_size = jumbo_buf_size - sizeof(struct cpl_rx_data);
578
579	for (i = 0; i < SGE_QSETS; ++i) {
580		struct qset_params *q = p->qset + i;
581
582		if (adap->params.nports > 2) {
583			q->coalesce_usecs = 50;
584		} else {
585#ifdef INVARIANTS
586			q->coalesce_usecs = 10;
587#else
588			q->coalesce_usecs = 5;
589#endif
590		}
591		q->polling = 0;
592		q->rspq_size = RSPQ_Q_SIZE;
593		q->fl_size = fl_q_size;
594		q->jumbo_size = jumbo_q_size;
595		q->jumbo_buf_size = jumbo_buf_size;
596		q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE;
597		q->txq_size[TXQ_OFLD] = is_offload(adap) ? TX_OFLD_Q_SIZE : 16;
598		q->txq_size[TXQ_CTRL] = TX_CTRL_Q_SIZE;
599		q->cong_thres = 0;
600	}
601}
602
603int
604t3_sge_alloc(adapter_t *sc)
605{
606
607	/* The parent tag. */
608	if (bus_dma_tag_create( bus_get_dma_tag(sc->dev),/* PCI parent */
609				1, 0,			/* algnmnt, boundary */
610				BUS_SPACE_MAXADDR,	/* lowaddr */
611				BUS_SPACE_MAXADDR,	/* highaddr */
612				NULL, NULL,		/* filter, filterarg */
613				BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
614				BUS_SPACE_UNRESTRICTED, /* nsegments */
615				BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
616				0,			/* flags */
617				NULL, NULL,		/* lock, lockarg */
618				&sc->parent_dmat)) {
619		device_printf(sc->dev, "Cannot allocate parent DMA tag\n");
620		return (ENOMEM);
621	}
622
623	/*
624	 * DMA tag for normal sized RX frames
625	 */
626	if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR,
627		BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1,
628		MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) {
629		device_printf(sc->dev, "Cannot allocate RX DMA tag\n");
630		return (ENOMEM);
631	}
632
633	/*
634	 * DMA tag for jumbo sized RX frames.
635	 */
636	if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR,
637		BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES,
638		BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) {
639		device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n");
640		return (ENOMEM);
641	}
642
643	/*
644	 * DMA tag for TX frames.
645	 */
646	if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR,
647		BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
648		TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
649		NULL, NULL, &sc->tx_dmat)) {
650		device_printf(sc->dev, "Cannot allocate TX DMA tag\n");
651		return (ENOMEM);
652	}
653
654	return (0);
655}
656
657int
658t3_sge_free(struct adapter * sc)
659{
660
661	if (sc->tx_dmat != NULL)
662		bus_dma_tag_destroy(sc->tx_dmat);
663
664	if (sc->rx_jumbo_dmat != NULL)
665		bus_dma_tag_destroy(sc->rx_jumbo_dmat);
666
667	if (sc->rx_dmat != NULL)
668		bus_dma_tag_destroy(sc->rx_dmat);
669
670	if (sc->parent_dmat != NULL)
671		bus_dma_tag_destroy(sc->parent_dmat);
672
673	return (0);
674}
675
676void
677t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
678{
679
680	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);
681	qs->rspq.polling = 0 /* p->polling */;
682}
683
684#if !defined(__i386__) && !defined(__amd64__)
685static void
686refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
687{
688	struct refill_fl_cb_arg *cb_arg = arg;
689
690	cb_arg->error = error;
691	cb_arg->seg = segs[0];
692	cb_arg->nseg = nseg;
693
694}
695#endif
696/**
697 *	refill_fl - refill an SGE free-buffer list
698 *	@sc: the controller softc
699 *	@q: the free-list to refill
700 *	@n: the number of new buffers to allocate
701 *
702 *	(Re)populate an SGE free-buffer list with up to @n new packet buffers.
703 *	The caller must ensure that @n does not exceed the queue's capacity.
704 */
705static void
706refill_fl(adapter_t *sc, struct sge_fl *q, int n)
707{
708	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
709	struct rx_desc *d = &q->desc[q->pidx];
710	struct refill_fl_cb_arg cb_arg;
711	struct mbuf *m;
712	caddr_t cl;
713	int err;
714
715	cb_arg.error = 0;
716	while (n--) {
717		/*
718		 * We allocate an uninitialized mbuf + cluster, mbuf is
719		 * initialized after rx.
720		 */
721		if (q->zone == zone_pack) {
722			if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL)
723				break;
724			cl = m->m_ext.ext_buf;
725		} else {
726			if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL)
727				break;
728			if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
729				uma_zfree(q->zone, cl);
730				break;
731			}
732		}
733		if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
734			if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
735				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
736				uma_zfree(q->zone, cl);
737				goto done;
738			}
739			sd->flags |= RX_SW_DESC_MAP_CREATED;
740		}
741#if !defined(__i386__) && !defined(__amd64__)
742		err = bus_dmamap_load(q->entry_tag, sd->map,
743		    cl, q->buf_size, refill_fl_cb, &cb_arg, 0);
744
745		if (err != 0 || cb_arg.error) {
746			if (q->zone == zone_pack)
747				uma_zfree(q->zone, cl);
748			m_free(m);
749			goto done;
750		}
751#else
752		cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl);
753#endif
754		sd->flags |= RX_SW_DESC_INUSE;
755		sd->rxsd_cl = cl;
756		sd->m = m;
757		d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
758		d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
759		d->len_gen = htobe32(V_FLD_GEN1(q->gen));
760		d->gen2 = htobe32(V_FLD_GEN2(q->gen));
761
762		d++;
763		sd++;
764
765		if (++q->pidx == q->size) {
766			q->pidx = 0;
767			q->gen ^= 1;
768			sd = q->sdesc;
769			d = q->desc;
770		}
771		q->credits++;
772		q->db_pending++;
773	}
774
775done:
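	/*
	 * Post the newly filled descriptors to the chip in batches: the
	 * free-list doorbell is rung only once at least 32 are pending.
	 */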
776	if (q->db_pending >= 32) {
777		q->db_pending = 0;
778		t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
779	}
780}
781
782
783/**
784 *	free_rx_bufs - free the Rx buffers on an SGE free list
785 *	@sc: the controller softc
786 *	@q: the SGE free list to clean up
787 *
788 *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
789 *	this queue should be stopped before calling this function.
790 */
791static void
792free_rx_bufs(adapter_t *sc, struct sge_fl *q)
793{
794	u_int cidx = q->cidx;
795
796	while (q->credits--) {
797		struct rx_sw_desc *d = &q->sdesc[cidx];
798
799		if (d->flags & RX_SW_DESC_INUSE) {
800			bus_dmamap_unload(q->entry_tag, d->map);
801			bus_dmamap_destroy(q->entry_tag, d->map);
802			if (q->zone == zone_pack) {
803				m_init(d->m, zone_pack, MCLBYTES,
804				    M_NOWAIT, MT_DATA, M_EXT);
805				uma_zfree(zone_pack, d->m);
806			} else {
807				m_init(d->m, zone_mbuf, MLEN,
808				    M_NOWAIT, MT_DATA, 0);
809				uma_zfree(zone_mbuf, d->m);
810				uma_zfree(q->zone, d->rxsd_cl);
811			}
812		}
813
814		d->rxsd_cl = NULL;
815		d->m = NULL;
816		if (++cidx == q->size)
817			cidx = 0;
818	}
819}
820
821static __inline void
822__refill_fl(adapter_t *adap, struct sge_fl *fl)
823{
824	refill_fl(adap, fl, min(16U, fl->size - fl->credits));
825}
826
827static __inline void
828__refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
829{
830	uint32_t reclaimable = fl->size - fl->credits;
831
832	if (reclaimable > 0)
833		refill_fl(adap, fl, min(max, reclaimable));
834}
835
836/**
837 *	recycle_rx_buf - recycle a receive buffer
838 *	@adapter: the adapter
839 *	@q: the SGE free list
840 *	@idx: index of buffer to recycle
841 *
842 *	Recycles the specified buffer on the given free list by adding it at
843 *	the next available slot on the list.
844 */
845static void
846recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx)
847{
848	struct rx_desc *from = &q->desc[idx];
849	struct rx_desc *to   = &q->desc[q->pidx];
850
851	q->sdesc[q->pidx] = q->sdesc[idx];
852	to->addr_lo = from->addr_lo;        // already big endian
853	to->addr_hi = from->addr_hi;        // likewise
854	wmb();	/* necessary ? */
855	to->len_gen = htobe32(V_FLD_GEN1(q->gen));
856	to->gen2 = htobe32(V_FLD_GEN2(q->gen));
857	q->credits++;
858
859	if (++q->pidx == q->size) {
860		q->pidx = 0;
861		q->gen ^= 1;
862	}
863	t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
864}
865
866static void
867alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
868{
869	uint32_t *addr;
870
871	addr = arg;
872	*addr = segs[0].ds_addr;
873}
874
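/*
 * Allocate a descriptor ring: DMA-able memory for the HW descriptors, an
 * optional parallel software descriptor array, and (if requested) a DMA tag
 * for mapping individual ring entries.
 */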
875static int
876alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size,
877    bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag,
878    bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag)
879{
880	size_t len = nelem * elem_size;
881	void *s = NULL;
882	void *p = NULL;
883	int err;
884
885	if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0,
886				      BUS_SPACE_MAXADDR_32BIT,
887				      BUS_SPACE_MAXADDR, NULL, NULL, len, 1,
888				      len, 0, NULL, NULL, tag)) != 0) {
889		device_printf(sc->dev, "Cannot allocate descriptor tag\n");
890		return (ENOMEM);
891	}
892
893	if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT,
894				    map)) != 0) {
895		device_printf(sc->dev, "Cannot allocate descriptor memory\n");
896		return (ENOMEM);
897	}
898
899	bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0);
900	bzero(p, len);
901	*(void **)desc = p;
902
903	if (sw_size) {
904		len = nelem * sw_size;
905		s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
906		*(void **)sdesc = s;
907	}
908	if (parent_entry_tag == NULL)
909		return (0);
910
911	if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0,
912				      BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR,
913		                      NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS,
914				      TX_MAX_SIZE, BUS_DMA_ALLOCNOW,
915		                      NULL, NULL, entry_tag)) != 0) {
916		device_printf(sc->dev, "Cannot allocate descriptor entry tag\n");
917		return (ENOMEM);
918	}
919	return (0);
920}
921
922static void
923sge_slow_intr_handler(void *arg, int ncount)
924{
925	adapter_t *sc = arg;
926
927	t3_slow_intr_handler(sc);
928	t3_write_reg(sc, A_PL_INT_ENABLE0, sc->slow_intr_mask);
929	(void) t3_read_reg(sc, A_PL_INT_ENABLE0);
930}
931
932/**
933 *	sge_timer_cb - perform periodic maintenance of an SGE qset
934 *	@data: the SGE queue set to maintain
935 *
936 *	Runs periodically from a timer to perform maintenance of an SGE queue
937 *	set.  It performs the following tasks:
938 *
939 *	a) Cleans up any completed Tx descriptors that may still be pending.
940 *	Normal descriptor cleanup happens when new packets are added to a Tx
941 *	queue so this timer is relatively infrequent and does any cleanup only
942 *	if the Tx queue has not seen any new packets in a while.  We make a
943 *	best effort attempt to reclaim descriptors, in that we don't wait
944 *	around if we cannot get a queue's lock (which most likely is because
945 *	someone else is queueing new packets and so will also handle the clean
946 *	up).  Since control queues use immediate data exclusively we don't
947 *	bother cleaning them up here.
948 *
949 *	b) Replenishes Rx queues that have run out due to memory shortage.
950 *	Normally new Rx buffers are added when existing ones are consumed but
951 *	when out of memory a queue can become empty.  We try to add only a few
952 *	buffers here; the queue will be replenished fully as these new buffers
953 *	are used up, provided the memory shortage has subsided.
954 *
955 *	c) Return coalesced response queue credits in case a response queue is
956 *	starved.
957 *
958 *	d) Ring doorbells for T304 tunnel queues since we have seen doorbell
959 *	fifo overflows and the FW doesn't implement any recovery scheme yet.
960 */
961static void
962sge_timer_cb(void *arg)
963{
964	adapter_t *sc = arg;
965	if ((sc->flags & USING_MSIX) == 0) {
966
967		struct port_info *pi;
968		struct sge_qset *qs;
969		struct sge_txq  *txq;
970		int i, j;
971		int reclaim_ofl, refill_rx;
972
973		if (sc->open_device_map == 0)
974			return;
975
976		for (i = 0; i < sc->params.nports; i++) {
977			pi = &sc->port[i];
978			for (j = 0; j < pi->nqsets; j++) {
979				qs = &sc->sge.qs[pi->first_qset + j];
980				txq = &qs->txq[0];
981				reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned;
982				refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
983				    (qs->fl[1].credits < qs->fl[1].size));
984				if (reclaim_ofl || refill_rx) {
985					taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task);
986					break;
987				}
988			}
989		}
990	}
991
992	if (sc->params.nports > 2) {
993		int i;
994
995		for_each_port(sc, i) {
996			struct port_info *pi = &sc->port[i];
997
998			t3_write_reg(sc, A_SG_KDOORBELL,
999				     F_SELEGRCNTX |
1000				     (FW_TUNNEL_SGEEC_START + pi->first_qset));
1001		}
1002	}
1003	if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) &&
1004	    sc->open_device_map != 0)
1005		callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1006}
1007
1008/*
1009 * This is meant to be a catch-all function to keep sge state private
1010 * to sge.c
1011 *
1012 */
1013int
1014t3_sge_init_adapter(adapter_t *sc)
1015{
1016	callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
1017	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1018	TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
1019	return (0);
1020}
1021
1022int
1023t3_sge_reset_adapter(adapter_t *sc)
1024{
1025	callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
1026	return (0);
1027}
1028
1029int
1030t3_sge_init_port(struct port_info *pi)
1031{
1032	TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
1033	return (0);
1034}
1035
1036/**
1037 *	refill_rspq - replenish an SGE response queue
1038 *	@adapter: the adapter
1039 *	@q: the response queue to replenish
1040 *	@credits: how many new responses to make available
1041 *
1042 *	Replenishes a response queue by making the supplied number of responses
1043 *	available to HW.
1044 */
1045static __inline void
1046refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
1047{
1048
1049	/* mbufs are allocated on demand when a rspq entry is processed. */
1050	t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN,
1051		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
1052}
1053
1054static void
1055sge_txq_reclaim_handler(void *arg, int ncount)
1056{
1057	struct sge_qset *qs = arg;
1058	int i;
1059
1060	for (i = 0; i < 3; i++)
1061		reclaim_completed_tx(qs, 16, i);
1062}
1063
1064static void
1065sge_timer_reclaim(void *arg, int ncount)
1066{
1067	struct port_info *pi = arg;
1068	int i, nqsets = pi->nqsets;
1069	adapter_t *sc = pi->adapter;
1070	struct sge_qset *qs;
1071	struct mtx *lock;
1072
1073	KASSERT((sc->flags & USING_MSIX) == 0,
1074	    ("can't call timer reclaim for msi-x"));
1075
1076	for (i = 0; i < nqsets; i++) {
1077		qs = &sc->sge.qs[pi->first_qset + i];
1078
1079		reclaim_completed_tx(qs, 16, TXQ_OFLD);
1080		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
1081			    &sc->sge.qs[0].rspq.lock;
1082
1083		if (mtx_trylock(lock)) {
1084			/* XXX currently assume that we are *NOT* polling */
1085			uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS);
1086
1087			if (qs->fl[0].credits < qs->fl[0].size - 16)
1088				__refill_fl(sc, &qs->fl[0]);
1089			if (qs->fl[1].credits < qs->fl[1].size - 16)
1090				__refill_fl(sc, &qs->fl[1]);
1091
1092			if (status & (1 << qs->rspq.cntxt_id)) {
1093				if (qs->rspq.credits) {
1094					refill_rspq(sc, &qs->rspq, 1);
1095					qs->rspq.credits--;
1096					t3_write_reg(sc, A_SG_RSPQ_FL_STATUS,
1097					    1 << qs->rspq.cntxt_id);
1098				}
1099			}
1100			mtx_unlock(lock);
1101		}
1102	}
1103}
1104
1105/**
1106 *	init_qset_cntxt - initialize an SGE queue set context info
1107 *	@qs: the queue set
1108 *	@id: the queue set id
1109 *
1110 *	Initializes the TIDs and context ids for the queues of a queue set.
1111 */
1112static void
1113init_qset_cntxt(struct sge_qset *qs, u_int id)
1114{
1115
1116	qs->rspq.cntxt_id = id;
1117	qs->fl[0].cntxt_id = 2 * id;
1118	qs->fl[1].cntxt_id = 2 * id + 1;
1119	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
1120	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
1121	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
1122	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
1123	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
1124
1125	mbufq_init(&qs->txq[TXQ_ETH].sendq);
1126	mbufq_init(&qs->txq[TXQ_OFLD].sendq);
1127	mbufq_init(&qs->txq[TXQ_CTRL].sendq);
1128}
1129
1130
1131static void
1132txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs)
1133{
1134	txq->in_use += ndesc;
1135	/*
1136	 * XXX we don't handle stopping of the queue; presumably start
1137	 * handles this when we bump against the end
1138	 */
1139	txqs->gen = txq->gen;
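	/*
	 * Set the work request completion-request bit roughly once every 32
	 * descriptors so that the SGE periodically notifies us of completed
	 * transmit work.
	 */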
1140	txq->unacked += ndesc;
1141	txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5);
1142	txq->unacked &= 31;
1143	txqs->pidx = txq->pidx;
1144	txq->pidx += ndesc;
1145#ifdef INVARIANTS
1146	if (((txqs->pidx > txq->cidx) &&
1147		(txq->pidx < txqs->pidx) &&
1148		(txq->pidx >= txq->cidx)) ||
1149	    ((txqs->pidx < txq->cidx) &&
1150		(txq->pidx >= txq-> cidx)) ||
1151	    ((txqs->pidx < txq->cidx) &&
1152		(txq->cidx < txqs->pidx)))
1153		panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d",
1154		    txqs->pidx, txq->pidx, txq->cidx);
1155#endif
1156	if (txq->pidx >= txq->size) {
1157		txq->pidx -= txq->size;
1158		txq->gen ^= 1;
1159	}
1160
1161}
1162
1163/**
1164 *	calc_tx_descs - calculate the number of Tx descriptors for a packet
1165 *	@m: the packet mbufs
1166 *      @nsegs: the number of segments
1167 *
1168 * 	Returns the number of Tx descriptors needed for the given Ethernet
1169 * 	packet.  Ethernet packets require addition of WR and CPL headers.
1170 */
1171static __inline unsigned int
1172calc_tx_descs(const struct mbuf *m, int nsegs)
1173{
1174	unsigned int flits;
1175
1176	if (m->m_pkthdr.len <= PIO_LEN)
1177		return 1;
1178
1179	flits = sgl_len(nsegs) + 2;
1180	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1181		flits++;
1182
1183	return flits_to_desc(flits);
1184}
1185
1186/**
1187 *	make_sgl - populate a scatter/gather list for a packet
1188 *	@sgp: the SGL to populate
1189 *	@segs: the packet dma segments
1190 *	@nsegs: the number of segments
1191 *
1192 *	Generates a scatter/gather list for the buffers that make up a packet
1193 *	and returns the SGL size in 8-byte words.  The caller must size the SGL
1194 *	appropriately.
1195 */
1196static __inline void
1197make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
1198{
1199	int i, idx;
1200
1201	for (idx = 0, i = 0; i < nsegs; i++) {
1202		/*
1203		 * firmware doesn't like empty segments
1204		 */
1205		if (segs[i].ds_len == 0)
1206			continue;
1207		if (i && idx == 0)
1208			++sgp;
1209
1210		sgp->len[idx] = htobe32(segs[i].ds_len);
1211		sgp->addr[idx] = htobe64(segs[i].ds_addr);
1212		idx ^= 1;
1213	}
1214
1215	if (idx) {
1216		sgp->len[idx] = 0;
1217		sgp->addr[idx] = 0;
1218	}
1219}
1220
1221/**
1222 *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1223 *	@adap: the adapter
1224 *	@q: the Tx queue
1225 *
1226 *	Ring the doorbell if a Tx queue is asleep.  There is a natural race
1227 *	where the HW may go to sleep just after we checked; in that case
1228 *	the interrupt handler will detect the outstanding TX packet
1229 *	and ring the doorbell for us.
1230 *
1231 *	When GTS is disabled we unconditionally ring the doorbell.
1232 */
1233static __inline void
1234check_ring_tx_db(adapter_t *adap, struct sge_txq *q, int mustring)
1235{
1236#if USE_GTS
1237	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1238	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1239		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1240#ifdef T3_TRACE
1241		T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d",
1242			  q->cntxt_id);
1243#endif
1244		t3_write_reg(adap, A_SG_KDOORBELL,
1245			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1246	}
1247#else
1248	if (mustring || ++q->db_pending >= 32) {
1249		wmb();            /* write descriptors before telling HW */
1250		t3_write_reg(adap, A_SG_KDOORBELL,
1251		    F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1252		q->db_pending = 0;
1253	}
1254#endif
1255}
1256
1257static __inline void
1258wr_gen2(struct tx_desc *d, unsigned int gen)
1259{
1260#if SGE_NUM_GENBITS == 2
1261	d->flit[TX_DESC_FLITS - 1] = htobe64(gen);
1262#endif
1263}
1264
1265/**
1266 *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1267 *	@ndesc: number of Tx descriptors spanned by the SGL
1268 *	@txd: first Tx descriptor to be written
1269 *	@txqs: txq state (generation and producer index)
1270 *	@txq: the SGE Tx queue
1271 *	@sgl: the SGL
1272 *	@flits: number of flits to the start of the SGL in the first descriptor
1273 *	@sgl_flits: the SGL size in flits
1274 *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1275 *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1276 *
1277 *	Write a work request header and an associated SGL.  If the SGL is
1278 *	small enough to fit into one Tx descriptor it has already been written
1279 *	and we just need to write the WR header.  Otherwise we distribute the
1280 *	SGL across the number of descriptors it spans.
1281 */
1282static void
1283write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
1284    const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
1285    unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo)
1286{
1287
1288	struct work_request_hdr *wrp = (struct work_request_hdr *)txd;
1289	struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx];
1290
1291	if (__predict_true(ndesc == 1)) {
1292		set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1293		    V_WR_SGLSFLT(flits)) | wr_hi,
1294		    htonl(V_WR_LEN(flits + sgl_flits) | V_WR_GEN(txqs->gen)) |
1295		    wr_lo);
1296
1297		wr_gen2(txd, txqs->gen);
1298
1299	} else {
1300		unsigned int ogen = txqs->gen;
1301		const uint64_t *fp = (const uint64_t *)sgl;
1302		struct work_request_hdr *wp = wrp;
1303
1304		wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1305		    V_WR_SGLSFLT(flits)) | wr_hi;
1306
1307		while (sgl_flits) {
1308			unsigned int avail = WR_FLITS - flits;
1309
1310			if (avail > sgl_flits)
1311				avail = sgl_flits;
1312			memcpy(&txd->flit[flits], fp, avail * sizeof(*fp));
1313			sgl_flits -= avail;
1314			ndesc--;
1315			if (!sgl_flits)
1316				break;
1317
1318			fp += avail;
1319			txd++;
1320			txsd++;
1321			if (++txqs->pidx == txq->size) {
1322				txqs->pidx = 0;
1323				txqs->gen ^= 1;
1324				txd = txq->desc;
1325				txsd = txq->sdesc;
1326			}
1327
1328			/*
1329			 * when the head of the mbuf chain
1330			 * is freed all clusters will be freed
1331			 * with it
1332			 */
1333			wrp = (struct work_request_hdr *)txd;
1334			wrp->wrh_hi = htonl(V_WR_DATATYPE(1) |
1335			    V_WR_SGLSFLT(1)) | wr_hi;
1336			wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS,
1337				    sgl_flits + 1)) |
1338			    V_WR_GEN(txqs->gen)) | wr_lo;
1339			wr_gen2(txd, txqs->gen);
1340			flits = 1;
1341		}
1342		wrp->wrh_hi |= htonl(F_WR_EOP);
1343		wmb();
1344		wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1345		wr_gen2((struct tx_desc *)wp, ogen);
1346	}
1347}
1348
1349/* sizeof(*eh) + sizeof(*ip) + sizeof(*tcp) */
1350#define TCPPKTHDRSIZE (ETHER_HDR_LEN + 20 + 20)
1351
1352#define GET_VTAG(cntrl, m) \
1353do { \
1354	if ((m)->m_flags & M_VLANTAG)					            \
1355		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
1356} while (0)
1357
1358static int
1359t3_encap(struct sge_qset *qs, struct mbuf **m)
1360{
1361	adapter_t *sc;
1362	struct mbuf *m0;
1363	struct sge_txq *txq;
1364	struct txq_state txqs;
1365	struct port_info *pi;
1366	unsigned int ndesc, flits, cntrl, mlen;
1367	int err, nsegs, tso_info = 0;
1368
1369	struct work_request_hdr *wrp;
1370	struct tx_sw_desc *txsd;
1371	struct sg_ent *sgp, *sgl;
1372	uint32_t wr_hi, wr_lo, sgl_flits;
1373	bus_dma_segment_t segs[TX_MAX_SEGS];
1374
1375	struct tx_desc *txd;
1376
1377	pi = qs->port;
1378	sc = pi->adapter;
1379	txq = &qs->txq[TXQ_ETH];
1380	txd = &txq->desc[txq->pidx];
1381	txsd = &txq->sdesc[txq->pidx];
1382	sgl = txq->txq_sgl;
1383
1384	prefetch(txd);
1385	m0 = *m;
1386
1387	mtx_assert(&qs->lock, MA_OWNED);
1388	cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1389	KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n"));
1390
1391	if  (m0->m_nextpkt == NULL && m0->m_next != NULL &&
1392	    m0->m_pkthdr.csum_flags & (CSUM_TSO))
1393		tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
1394
1395	if (m0->m_nextpkt != NULL) {
1396		busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs);
1397		ndesc = 1;
1398		mlen = 0;
1399	} else {
1400		if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map,
1401		    &m0, segs, &nsegs))) {
1402			if (cxgb_debug)
1403				printf("failed ... err=%d\n", err);
1404			return (err);
1405		}
1406		mlen = m0->m_pkthdr.len;
1407		ndesc = calc_tx_descs(m0, nsegs);
1408	}
1409	txq_prod(txq, ndesc, &txqs);
1410
1411	KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs));
1412	txsd->m = m0;
1413
1414	if (m0->m_nextpkt != NULL) {
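		/*
		 * Coalesced path: packets chained via m_nextpkt by
		 * cxgb_dequeue() are emitted as a single batched work
		 * request with one 2-flit entry per packet.
		 */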
1415		struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
1416		int i, fidx;
1417
1418		if (nsegs > 7)
1419			panic("trying to coalesce %d packets in to one WR", nsegs);
1420		txq->txq_coalesced += nsegs;
1421		wrp = (struct work_request_hdr *)txd;
1422		flits = nsegs*2 + 1;
1423
1424		for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) {
1425			struct cpl_tx_pkt_batch_entry *cbe;
1426			uint64_t flit;
1427			uint32_t *hflit = (uint32_t *)&flit;
1428			int cflags = m0->m_pkthdr.csum_flags;
1429
1430			cntrl = V_TXPKT_INTF(pi->txpkt_intf);
1431			GET_VTAG(cntrl, m0);
1432			cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1433			if (__predict_false(!(cflags & CSUM_IP)))
1434				cntrl |= F_TXPKT_IPCSUM_DIS;
1435			if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP |
1436			    CSUM_UDP_IPV6 | CSUM_TCP_IPV6))))
1437				cntrl |= F_TXPKT_L4CSUM_DIS;
1438
1439			hflit[0] = htonl(cntrl);
1440			hflit[1] = htonl(segs[i].ds_len | 0x80000000);
1441			flit |= htobe64(1 << 24);
1442			cbe = &cpl_batch->pkt_entry[i];
1443			cbe->cntrl = hflit[0];
1444			cbe->len = hflit[1];
1445			cbe->addr = htobe64(segs[i].ds_addr);
1446		}
1447
1448		wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1449		    V_WR_SGLSFLT(flits)) |
1450		    htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1451		wr_lo = htonl(V_WR_LEN(flits) |
1452		    V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
1453		set_wr_hdr(wrp, wr_hi, wr_lo);
1454		wmb();
1455		ETHER_BPF_MTAP(pi->ifp, m0);
1456		wr_gen2(txd, txqs.gen);
1457		check_ring_tx_db(sc, txq, 0);
1458		return (0);
1459	} else if (tso_info) {
1460		uint16_t eth_type;
1461		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
1462		struct ether_header *eh;
1463		void *l3hdr;
1464		struct tcphdr *tcp;
1465
1466		txd->flit[2] = 0;
1467		GET_VTAG(cntrl, m0);
1468		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1469		hdr->cntrl = htonl(cntrl);
1470		hdr->len = htonl(mlen | 0x80000000);
1471
1472		if (__predict_false(mlen < TCPPKTHDRSIZE)) {
1473			printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%b,flags=%#x",
1474			    m0, mlen, m0->m_pkthdr.tso_segsz,
1475			    (int)m0->m_pkthdr.csum_flags, CSUM_BITS, m0->m_flags);
1476			panic("tx tso packet too small");
1477		}
1478
1479		/* Make sure that ether, ip, tcp headers are all in m0 */
1480		if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
1481			m0 = m_pullup(m0, TCPPKTHDRSIZE);
1482			if (__predict_false(m0 == NULL)) {
1483				/* XXX panic probably an overreaction */
1484				panic("couldn't fit header into mbuf");
1485			}
1486		}
1487
1488		eh = mtod(m0, struct ether_header *);
1489		eth_type = eh->ether_type;
1490		if (eth_type == htons(ETHERTYPE_VLAN)) {
1491			struct ether_vlan_header *evh = (void *)eh;
1492
1493			tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II_VLAN);
1494			l3hdr = evh + 1;
1495			eth_type = evh->evl_proto;
1496		} else {
1497			tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II);
1498			l3hdr = eh + 1;
1499		}
1500
1501		if (eth_type == htons(ETHERTYPE_IP)) {
1502			struct ip *ip = l3hdr;
1503
1504			tso_info |= V_LSO_IPHDR_WORDS(ip->ip_hl);
1505			tcp = (struct tcphdr *)(ip + 1);
1506		} else if (eth_type == htons(ETHERTYPE_IPV6)) {
1507			struct ip6_hdr *ip6 = l3hdr;
1508
1509			KASSERT(ip6->ip6_nxt == IPPROTO_TCP,
1510			    ("%s: CSUM_TSO with ip6_nxt %d",
1511			    __func__, ip6->ip6_nxt));
1512
1513			tso_info |= F_LSO_IPV6;
1514			tso_info |= V_LSO_IPHDR_WORDS(sizeof(*ip6) >> 2);
1515			tcp = (struct tcphdr *)(ip6 + 1);
1516		} else
1517			panic("%s: CSUM_TSO but neither ip nor ip6", __func__);
1518
1519		tso_info |= V_LSO_TCPHDR_WORDS(tcp->th_off);
1520		hdr->lso_info = htonl(tso_info);
1521
1522		if (__predict_false(mlen <= PIO_LEN)) {
1523			/*
1524		 * The packet is not undersized, but it still fits in PIO_LEN;
1525		 * this indicates a TSO bug at the higher levels.
1526			 */
1527			txsd->m = NULL;
1528			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]);
1529			flits = (mlen + 7) / 8 + 3;
1530			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1531					  V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1532					  F_WR_SOP | F_WR_EOP | txqs.compl);
1533			wr_lo = htonl(V_WR_LEN(flits) |
1534			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1535			set_wr_hdr(&hdr->wr, wr_hi, wr_lo);
1536			wmb();
1537			ETHER_BPF_MTAP(pi->ifp, m0);
1538			wr_gen2(txd, txqs.gen);
1539			check_ring_tx_db(sc, txq, 0);
1540			m_freem(m0);
1541			return (0);
1542		}
1543		flits = 3;
1544	} else {
1545		struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
1546
1547		GET_VTAG(cntrl, m0);
1548		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1549		if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP)))
1550			cntrl |= F_TXPKT_IPCSUM_DIS;
1551		if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP |
1552		    CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6))))
1553			cntrl |= F_TXPKT_L4CSUM_DIS;
1554		cpl->cntrl = htonl(cntrl);
1555		cpl->len = htonl(mlen | 0x80000000);
1556
1557		if (mlen <= PIO_LEN) {
1558			txsd->m = NULL;
1559			m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
1560			flits = (mlen + 7) / 8 + 2;
1561
1562			wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
1563			    V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
1564					  F_WR_SOP | F_WR_EOP | txqs.compl);
1565			wr_lo = htonl(V_WR_LEN(flits) |
1566			    V_WR_GEN(txqs.gen) | V_WR_TID(txq->token));
1567			set_wr_hdr(&cpl->wr, wr_hi, wr_lo);
1568			wmb();
1569			ETHER_BPF_MTAP(pi->ifp, m0);
1570			wr_gen2(txd, txqs.gen);
1571			check_ring_tx_db(sc, txq, 0);
1572			m_freem(m0);
1573			return (0);
1574		}
1575		flits = 2;
1576	}
1577	wrp = (struct work_request_hdr *)txd;
1578	sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
1579	make_sgl(sgp, segs, nsegs);
1580
1581	sgl_flits = sgl_len(nsegs);
1582
1583	ETHER_BPF_MTAP(pi->ifp, m0);
1584
1585	KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc));
1586	wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
1587	wr_lo = htonl(V_WR_TID(txq->token));
1588	write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits,
1589	    sgl_flits, wr_hi, wr_lo);
1590	check_ring_tx_db(sc, txq, 0);
1591
1592	return (0);
1593}
1594
1595void
1596cxgb_tx_watchdog(void *arg)
1597{
1598	struct sge_qset *qs = arg;
1599	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1600
1601        if (qs->coalescing != 0 &&
1602	    (txq->in_use <= cxgb_tx_coalesce_enable_stop) &&
1603	    TXQ_RING_EMPTY(qs))
1604                qs->coalescing = 0;
1605        else if (qs->coalescing == 0 &&
1606	    (txq->in_use >= cxgb_tx_coalesce_enable_start))
1607                qs->coalescing = 1;
1608	if (TXQ_TRYLOCK(qs)) {
1609		qs->qs_flags |= QS_FLUSHING;
1610		cxgb_start_locked(qs);
1611		qs->qs_flags &= ~QS_FLUSHING;
1612		TXQ_UNLOCK(qs);
1613	}
1614	if (qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING)
1615		callout_reset_on(&txq->txq_watchdog, hz/4, cxgb_tx_watchdog,
1616		    qs, txq->txq_watchdog.c_cpu);
1617}
1618
1619static void
1620cxgb_tx_timeout(void *arg)
1621{
1622	struct sge_qset *qs = arg;
1623	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1624
1625	if (qs->coalescing == 0 && (txq->in_use >= (txq->size>>3)))
1626                qs->coalescing = 1;
1627	if (TXQ_TRYLOCK(qs)) {
1628		qs->qs_flags |= QS_TIMEOUT;
1629		cxgb_start_locked(qs);
1630		qs->qs_flags &= ~QS_TIMEOUT;
1631		TXQ_UNLOCK(qs);
1632	}
1633}
1634
1635static void
1636cxgb_start_locked(struct sge_qset *qs)
1637{
1638	struct mbuf *m_head = NULL;
1639	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1640	struct port_info *pi = qs->port;
1641	struct ifnet *ifp = pi->ifp;
1642
1643	if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT))
1644		reclaim_completed_tx(qs, 0, TXQ_ETH);
1645
1646	if (!pi->link_config.link_ok) {
1647		TXQ_RING_FLUSH(qs);
1648		return;
1649	}
1650	TXQ_LOCK_ASSERT(qs);
1651	while (!TXQ_RING_EMPTY(qs) && (ifp->if_drv_flags & IFF_DRV_RUNNING) &&
1652	    pi->link_config.link_ok) {
1653		reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1654
1655		if (txq->size - txq->in_use <= TX_MAX_DESC)
1656			break;
1657
1658		if ((m_head = cxgb_dequeue(qs)) == NULL)
1659			break;
1660		/*
1661		 *  Encapsulation can modify our pointer, and/or make it
1662		 *  NULL on failure.  In that event, we can't requeue.
1663		 */
1664		if (t3_encap(qs, &m_head) || m_head == NULL)
1665			break;
1666
1667		m_head = NULL;
1668	}
1669
1670	if (txq->db_pending)
1671		check_ring_tx_db(pi->adapter, txq, 1);
1672
1673	if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 &&
1674	    pi->link_config.link_ok)
1675		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1676		    qs, txq->txq_timer.c_cpu);
1677	if (m_head != NULL)
1678		m_freem(m_head);
1679}
1680
1681static int
1682cxgb_transmit_locked(struct ifnet *ifp, struct sge_qset *qs, struct mbuf *m)
1683{
1684	struct port_info *pi = qs->port;
1685	struct sge_txq *txq = &qs->txq[TXQ_ETH];
1686	struct buf_ring *br = txq->txq_mr;
1687	int error, avail;
1688
1689	avail = txq->size - txq->in_use;
1690	TXQ_LOCK_ASSERT(qs);
1691
1692	/*
1693	 * We can only do a direct transmit if the following are true:
1694	 * - we aren't coalescing (hardware queue below the coalesce-start threshold)
1695	 * - the link is up -- checked in caller
1696	 * - there are no packets enqueued already
1697	 * - there is space in hardware transmit queue
1698	 */
1699	if (check_pkt_coalesce(qs) == 0 &&
1700	    !TXQ_RING_NEEDS_ENQUEUE(qs) && avail > TX_MAX_DESC) {
1701		if (t3_encap(qs, &m)) {
1702			if (m != NULL &&
1703			    (error = drbr_enqueue(ifp, br, m)) != 0)
1704				return (error);
1705		} else {
1706			if (txq->db_pending)
1707				check_ring_tx_db(pi->adapter, txq, 1);
1708
1709			/*
1710			 * We've bypassed the buf ring so we need to update
1711			 * the stats directly
1712			 */
1713			txq->txq_direct_packets++;
1714			txq->txq_direct_bytes += m->m_pkthdr.len;
1715		}
1716	} else if ((error = drbr_enqueue(ifp, br, m)) != 0)
1717		return (error);
1718
1719	reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH);
1720	if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok &&
1721	    (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7)))
1722		cxgb_start_locked(qs);
1723	else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer))
1724		callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout,
1725		    qs, txq->txq_timer.c_cpu);
1726	return (0);
1727}
1728
1729int
1730cxgb_transmit(struct ifnet *ifp, struct mbuf *m)
1731{
1732	struct sge_qset *qs;
1733	struct port_info *pi = ifp->if_softc;
1734	int error, qidx = pi->first_qset;
1735
1736	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0
1737	    ||(!pi->link_config.link_ok)) {
1738		m_freem(m);
1739		return (0);
1740	}
1741
1742	if (m->m_flags & M_FLOWID)
1743		qidx = (m->m_pkthdr.flowid % pi->nqsets) + pi->first_qset;
1744
1745	qs = &pi->adapter->sge.qs[qidx];
1746
1747	if (TXQ_TRYLOCK(qs)) {
1748		/* XXX running */
1749		error = cxgb_transmit_locked(ifp, qs, m);
1750		TXQ_UNLOCK(qs);
1751	} else
1752		error = drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m);
1753	return (error);
1754}
1755
1756void
1757cxgb_qflush(struct ifnet *ifp)
1758{
1759	/*
1760	 * Flush any enqueued mbufs in the buf_rings
1761	 * and in the transmit queues.
1762	 * This is a no-op for now.
1763	 */
1764	return;
1765}
1766
1767/**
1768 *	write_imm - write a packet into a Tx descriptor as immediate data
1769 *	@d: the Tx descriptor to write
1770 *	@m: the packet
1771 *	@len: the length of packet data to write as immediate data
1772 *	@gen: the generation bit value to write
1773 *
1774 *	Writes a packet as immediate data into a Tx descriptor.  The packet
1775 *	contains a work request at its beginning.  We must write the packet
1776 *	carefully so the SGE doesn't read accidentally before it's written in
1777 *	its entirety.
1778 */
1779static __inline void
1780write_imm(struct tx_desc *d, caddr_t src,
1781	  unsigned int len, unsigned int gen)
1782{
1783	struct work_request_hdr *from = (struct work_request_hdr *)src;
1784	struct work_request_hdr *to = (struct work_request_hdr *)d;
1785	uint32_t wr_hi, wr_lo;
1786
1787	KASSERT(len <= WR_LEN && len >= sizeof(*from),
1788	    ("%s: invalid len %d", __func__, len));
1789
1790	memcpy(&to[1], &from[1], len - sizeof(*from));
1791	wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP |
1792	    V_WR_BCNTLFLT(len & 7));
1793	wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) | V_WR_LEN((len + 7) / 8));
1794	set_wr_hdr(to, wr_hi, wr_lo);
1795	wmb();
1796	wr_gen2(d, gen);
1797}
1798
1799/**
1800 *	check_desc_avail - check descriptor availability on a send queue
1801 *	@adap: the adapter
1802 *	@q: the TX queue
1803 *	@m: the packet needing the descriptors
1804 *	@ndesc: the number of Tx descriptors needed
1805 *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1806 *
1807 *	Checks if the requested number of Tx descriptors is available on an
1808 *	SGE send queue.  If the queue is already suspended or not enough
1809 *	descriptors are available the packet is queued for later transmission.
1810 *	Must be called with the Tx queue locked.
1811 *
1812 *	Returns 0 if enough descriptors are available, 1 if there aren't
1813 *	enough descriptors and the packet has been queued, and 2 if the caller
1814 *	needs to retry because there weren't enough descriptors at the
1815 *	beginning of the call but some freed up in the mean time.
1816 */
1817static __inline int
1818check_desc_avail(adapter_t *adap, struct sge_txq *q,
1819		 struct mbuf *m, unsigned int ndesc,
1820		 unsigned int qid)
1821{
1822	/*
1823	 * XXX We currently only use this for checking the control queue; the
1824	 * control queue is only used for binding qsets, which happens at init
1825	 * time, so we are guaranteed enough descriptors
1826	 */
1827	if (__predict_false(!mbufq_empty(&q->sendq))) {
1828addq_exit:	mbufq_tail(&q->sendq, m);
1829		return 1;
1830	}
1831	if (__predict_false(q->size - q->in_use < ndesc)) {
1832
1833		struct sge_qset *qs = txq_to_qset(q, qid);
1834
1835		setbit(&qs->txq_stopped, qid);
1836		if (should_restart_tx(q) &&
1837		    test_and_clear_bit(qid, &qs->txq_stopped))
1838			return 2;
1839
1840		q->stops++;
1841		goto addq_exit;
1842	}
1843	return 0;
1844}
1845
1846
1847/**
1848 *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1849 *	@q: the SGE control Tx queue
1850 *
1851 *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1852 *	that send only immediate data (presently just the control queues) and
1853 *	thus do not have any mbufs.
1854 */
1855static __inline void
1856reclaim_completed_tx_imm(struct sge_txq *q)
1857{
1858	unsigned int reclaim = q->processed - q->cleaned;
1859
1860	q->in_use -= reclaim;
1861	q->cleaned += reclaim;
1862}
1863
1864/**
1865 *	ctrl_xmit - send a packet through an SGE control Tx queue
1866 *	@adap: the adapter
1867 *	@qs: the queue set whose control queue will be used
1868 *	@m: the packet
1869 *
1870 *	Send a packet through an SGE control Tx queue.  Packets sent through
1871 *	a control queue must fit entirely as immediate data in a single Tx
1872 *	descriptor and have no page fragments.
1873 */
1874static int
1875ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
1876{
1877	int ret;
1878	struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *);
1879	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1880
1881	KASSERT(m->m_len <= WR_LEN, ("%s: bad tx data", __func__));
1882
1883	wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP);
1884	wrp->wrh_lo = htonl(V_WR_TID(q->token));
1885
1886	TXQ_LOCK(qs);
1887again:	reclaim_completed_tx_imm(q);
1888
1889	ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL);
1890	if (__predict_false(ret)) {
1891		if (ret == 1) {
1892			TXQ_UNLOCK(qs);
1893			return (ENOSPC);
1894		}
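		/* ret == 2: descriptors freed up while we checked; retry. */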
1895		goto again;
1896	}
1897	write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen);
1898
1899	q->in_use++;
1900	if (++q->pidx >= q->size) {
1901		q->pidx = 0;
1902		q->gen ^= 1;
1903	}
1904	TXQ_UNLOCK(qs);
1905	wmb();
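	/* Ring the doorbell for this egress context so the SGE picks up the WR. */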
1906	t3_write_reg(adap, A_SG_KDOORBELL,
1907	    F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1908
1909	m_free(m);
1910	return (0);
1911}
1912
1913
1914/**
1915 *	restart_ctrlq - restart a suspended control queue
1916 *	@qs: the queue set containing the control queue
1917 *
1918 *	Resumes transmission on a suspended Tx control queue.
1919 */
1920static void
1921restart_ctrlq(void *data, int npending)
1922{
1923	struct mbuf *m;
1924	struct sge_qset *qs = (struct sge_qset *)data;
1925	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1926	adapter_t *adap = qs->port->adapter;
1927
1928	TXQ_LOCK(qs);
1929again:	reclaim_completed_tx_imm(q);
1930
1931	while (q->in_use < q->size &&
1932	       (m = mbufq_dequeue(&q->sendq)) != NULL) {
1933
1934		write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen);
1935		m_free(m);
1936
1937		if (++q->pidx >= q->size) {
1938			q->pidx = 0;
1939			q->gen ^= 1;
1940		}
1941		q->in_use++;
1942	}
1943	if (!mbufq_empty(&q->sendq)) {
1944		setbit(&qs->txq_stopped, TXQ_CTRL);
1945
1946		if (should_restart_tx(q) &&
1947		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1948			goto again;
1949		q->stops++;
1950	}
1951	TXQ_UNLOCK(qs);
1952	t3_write_reg(adap, A_SG_KDOORBELL,
1953		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1954}
1955
1956
1957/*
1958 * Send a management message through control queue 0
1959 */
1960int
1961t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
1962{
1963	return ctrl_xmit(adap, &adap->sge.qs[0], m);
1964}
1965
1966/**
1967 *	free_qset - free the resources of an SGE queue set
1968 *	@sc: the controller owning the queue set
1969 *	@q: the queue set
1970 *
1971 *	Release the HW and SW resources associated with an SGE queue set, such
1972 *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
1973 *	queue set must be quiesced prior to calling this.
1974 */
1975static void
1976t3_free_qset(adapter_t *sc, struct sge_qset *q)
1977{
1978	int i;
1979
1980	reclaim_completed_tx(q, 0, TXQ_ETH);
1981	if (q->txq[TXQ_ETH].txq_mr != NULL)
1982		buf_ring_free(q->txq[TXQ_ETH].txq_mr, M_DEVBUF);
1983	if (q->txq[TXQ_ETH].txq_ifq != NULL) {
1984		ifq_delete(q->txq[TXQ_ETH].txq_ifq);
1985		free(q->txq[TXQ_ETH].txq_ifq, M_DEVBUF);
1986	}
1987
1988	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
1989		if (q->fl[i].desc) {
1990			mtx_lock_spin(&sc->sge.reg_lock);
1991			t3_sge_disable_fl(sc, q->fl[i].cntxt_id);
1992			mtx_unlock_spin(&sc->sge.reg_lock);
1993			bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map);
1994			bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc,
1995					q->fl[i].desc_map);
1996			bus_dma_tag_destroy(q->fl[i].desc_tag);
1997			bus_dma_tag_destroy(q->fl[i].entry_tag);
1998		}
1999		if (q->fl[i].sdesc) {
2000			free_rx_bufs(sc, &q->fl[i]);
2001			free(q->fl[i].sdesc, M_DEVBUF);
2002		}
2003	}
2004
2005	mtx_unlock(&q->lock);
2006	MTX_DESTROY(&q->lock);
2007	for (i = 0; i < SGE_TXQ_PER_SET; i++) {
2008		if (q->txq[i].desc) {
2009			mtx_lock_spin(&sc->sge.reg_lock);
2010			t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0);
2011			mtx_unlock_spin(&sc->sge.reg_lock);
2012			bus_dmamap_unload(q->txq[i].desc_tag,
2013					q->txq[i].desc_map);
2014			bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc,
2015					q->txq[i].desc_map);
2016			bus_dma_tag_destroy(q->txq[i].desc_tag);
2017			bus_dma_tag_destroy(q->txq[i].entry_tag);
2018		}
2019		if (q->txq[i].sdesc) {
2020			free(q->txq[i].sdesc, M_DEVBUF);
2021		}
2022	}
2023
2024	if (q->rspq.desc) {
2025		mtx_lock_spin(&sc->sge.reg_lock);
2026		t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id);
2027		mtx_unlock_spin(&sc->sge.reg_lock);
2028
2029		bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map);
2030		bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc,
2031			        q->rspq.desc_map);
2032		bus_dma_tag_destroy(q->rspq.desc_tag);
2033		MTX_DESTROY(&q->rspq.lock);
2034	}
2035
2036#if defined(INET6) || defined(INET)
2037	tcp_lro_free(&q->lro.ctrl);
2038#endif
2039
2040	bzero(q, sizeof(*q));
2041}
2042
2043/**
2044 *	t3_free_sge_resources - free SGE resources
2045 *	@sc: the adapter softc
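 *	@nqsets: the number of queue sets to free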
2046 *
2047 *	Frees resources used by the SGE queue sets.
2048 */
2049void
2050t3_free_sge_resources(adapter_t *sc, int nqsets)
2051{
2052	int i;
2053
2054	for (i = 0; i < nqsets; ++i) {
2055		TXQ_LOCK(&sc->sge.qs[i]);
2056		t3_free_qset(sc, &sc->sge.qs[i]);
2057	}
2058}
2059
2060/**
2061 *	t3_sge_start - enable SGE
2062 *	@sc: the controller softc
2063 *
2064 *	Enables the SGE for DMAs.  This is the last step in starting packet
2065 *	transfers.
2066 */
2067void
2068t3_sge_start(adapter_t *sc)
2069{
2070	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
2071}
2072
2073/**
2074 *	t3_sge_stop - disable SGE operation
2075 *	@sc: the adapter
2076 *
2077 *	Disables the DMA engine.  This can be called in emergencies (e.g.,
2078 *	from error interrupts) or from normal process context.  In the latter
2079 *	case it also disables any pending queue restart tasks.  Note that
2080 *	if it is called in interrupt context it cannot disable the restart
2081 *	tasks as it cannot wait; however, the tasks will have no effect
2082 *	since the doorbells are disabled and the driver will call this again
2083 *	later from process context, at which time the tasks will be stopped
2084 *	if they are still running.
2085 */
2086void
2087t3_sge_stop(adapter_t *sc)
2088{
2089	int i, nqsets;
2090
2091	t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0);
2092
2093	if (sc->tq == NULL)
2094		return;
2095
2096	for (nqsets = i = 0; i < (sc)->params.nports; i++)
2097		nqsets += sc->port[i].nqsets;
2098#ifdef notyet
2099	/*
2100	 *
2101	 * XXX
2102	 */
2103	for (i = 0; i < nqsets; ++i) {
2104		struct sge_qset *qs = &sc->sge.qs[i];
2105
2106		taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2107		taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2108	}
2109#endif
2110}
2111
2112/**
2113 *	t3_free_tx_desc - reclaims Tx descriptors and their buffers
2114 *	@qs: the queue set that owns the Tx queue
2115 *	@reclaimable: the number of descriptors to reclaim
2116 *	@queue: the index of the Tx queue within the queue set
2117 *
2118 *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
2119 *	Tx buffers.  Called with the Tx queue lock held.
2124 */
2125void
2126t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue)
2127{
2128	struct tx_sw_desc *txsd;
2129	unsigned int cidx, mask;
2130	struct sge_txq *q = &qs->txq[queue];
2131
2132#ifdef T3_TRACE
2133	T3_TRACE2(sc->tb[q->cntxt_id & 7],
2134		  "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx);
2135#endif
2136	cidx = q->cidx;
2137	mask = q->size - 1;
2138	txsd = &q->sdesc[cidx];
2139
2140	mtx_assert(&qs->lock, MA_OWNED);
2141	while (reclaimable--) {
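		/* Prefetch the next two sw descriptors to hide memory latency. */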
2142		prefetch(q->sdesc[(cidx + 1) & mask].m);
2143		prefetch(q->sdesc[(cidx + 2) & mask].m);
2144
2145		if (txsd->m != NULL) {
2146			if (txsd->flags & TX_SW_DESC_MAPPED) {
2147				bus_dmamap_unload(q->entry_tag, txsd->map);
2148				txsd->flags &= ~TX_SW_DESC_MAPPED;
2149			}
2150			m_freem_list(txsd->m);
2151			txsd->m = NULL;
2152		} else
2153			q->txq_skipped++;
2154
2155		++txsd;
2156		if (++cidx == q->size) {
2157			cidx = 0;
2158			txsd = q->sdesc;
2159		}
2160	}
2161	q->cidx = cidx;
2162
2163}
2164
2165/**
2166 *	is_new_response - check if a response is newly written
2167 *	@r: the response descriptor
2168 *	@q: the response queue
2169 *
2170 *	Returns true if a response descriptor contains a yet unprocessed
2171 *	response.
2172 */
2173static __inline int
2174is_new_response(const struct rsp_desc *r,
2175    const struct sge_rspq *q)
2176{
2177	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
2178}
2179
2180#define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
2181#define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
2182			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
2183			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
2184			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
2185
2186/* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
2187#define NOMEM_INTR_DELAY 2500
2188
2189#ifdef TCP_OFFLOAD
2190/**
2191 *	write_ofld_wr - write an offload work request
2192 *	@adap: the adapter
2193 *	@m: the packet to send
2194 *	@q: the Tx queue
2195 *	@pidx: index of the first Tx descriptor to write
2196 *	@gen: the generation value to use
2197 *	@ndesc: number of descriptors the packet will occupy
2198 *
2199 *	Write an offload work request to send the supplied packet.  The packet
2200 *	data already carry the work request with most fields populated.
2201 */
2202static void
2203write_ofld_wr(adapter_t *adap, struct mbuf *m, struct sge_txq *q,
2204    unsigned int pidx, unsigned int gen, unsigned int ndesc)
2205{
2206	unsigned int sgl_flits, flits;
2207	int i, idx, nsegs, wrlen;
2208	struct work_request_hdr *from;
2209	struct sg_ent *sgp, t3sgl[TX_MAX_SEGS / 2 + 1];
2210	struct tx_desc *d = &q->desc[pidx];
2211	struct txq_state txqs;
2212	struct sglist_seg *segs;
2213	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2214	struct sglist *sgl;
2215
2216	from = (void *)(oh + 1);	/* Start of WR within mbuf */
2217	wrlen = m->m_len - sizeof(*oh);
2218
2219	if (!(oh->flags & F_HDR_SGL)) {
2220		write_imm(d, (caddr_t)from, wrlen, gen);
2221
2222		/*
2223		 * mbuf with "real" immediate tx data will be enqueue_wr'd by
2224		 * t3_push_frames and freed in wr_ack.  Others, like those sent
2225		 * down by close_conn, t3_send_reset, etc. should be freed here.
2226		 */
2227		if (!(oh->flags & F_HDR_DF))
2228			m_free(m);
2229		return;
2230	}
2231
2232	memcpy(&d->flit[1], &from[1], wrlen - sizeof(*from));
2233
2234	sgl = oh->sgl;
2235	flits = wrlen / 8;
2236	sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : t3sgl;
2237
2238	nsegs = sgl->sg_nseg;
2239	segs = sgl->sg_segs;
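	/* Pack the SGL two segments per sg_ent, as length/address pairs. */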
2240	for (idx = 0, i = 0; i < nsegs; i++) {
2241		KASSERT(segs[i].ss_len, ("%s: 0 len in sgl", __func__));
2242		if (i && idx == 0)
2243			++sgp;
2244		sgp->len[idx] = htobe32(segs[i].ss_len);
2245		sgp->addr[idx] = htobe64(segs[i].ss_paddr);
2246		idx ^= 1;
2247	}
2248	if (idx) {
2249		sgp->len[idx] = 0;
2250		sgp->addr[idx] = 0;
2251	}
2252
2253	sgl_flits = sgl_len(nsegs);
2254	txqs.gen = gen;
2255	txqs.pidx = pidx;
2256	txqs.compl = 0;
2257
2258	write_wr_hdr_sgl(ndesc, d, &txqs, q, t3sgl, flits, sgl_flits,
2259	    from->wrh_hi, from->wrh_lo);
2260}
2261
2262/**
2263 *	ofld_xmit - send a packet through an offload queue
2264 *	@adap: the adapter
2265 *	@qs: the queue set whose offload queue will be used
2266 *	@m: the packet
2267 *
2268 *	Send an offload packet through an SGE offload queue.
2269 */
2270static int
2271ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m)
2272{
2273	int ret;
2274	unsigned int ndesc;
2275	unsigned int pidx, gen;
2276	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2277	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2278
2279	ndesc = G_HDR_NDESC(oh->flags);
2280
2281	TXQ_LOCK(qs);
2282again:	reclaim_completed_tx(qs, 16, TXQ_OFLD);
2283	ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
2284	if (__predict_false(ret)) {
2285		if (ret == 1) {
2286			TXQ_UNLOCK(qs);
2287			return (EINTR);
2288		}
2289		goto again;
2290	}
2291
2292	gen = q->gen;
2293	q->in_use += ndesc;
2294	pidx = q->pidx;
2295	q->pidx += ndesc;
2296	if (q->pidx >= q->size) {
2297		q->pidx -= q->size;
2298		q->gen ^= 1;
2299	}
2300
2301	write_ofld_wr(adap, m, q, pidx, gen, ndesc);
2302	check_ring_tx_db(adap, q, 1);
2303	TXQ_UNLOCK(qs);
2304
2305	return (0);
2306}
2307
2308/**
2309 *	restart_offloadq - restart a suspended offload queue
2310 *	@qs: the queue set containing the offload queue
2311 *
2312 *	Resumes transmission on a suspended Tx offload queue.
2313 */
2314static void
2315restart_offloadq(void *data, int npending)
2316{
2317	struct mbuf *m;
2318	struct sge_qset *qs = data;
2319	struct sge_txq *q = &qs->txq[TXQ_OFLD];
2320	adapter_t *adap = qs->port->adapter;
2321	int cleaned;
2322
2323	TXQ_LOCK(qs);
2324again:	cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD);
2325
2326	while ((m = mbufq_peek(&q->sendq)) != NULL) {
2327		unsigned int gen, pidx;
2328		struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2329		unsigned int ndesc = G_HDR_NDESC(oh->flags);
2330
2331		if (__predict_false(q->size - q->in_use < ndesc)) {
2332			setbit(&qs->txq_stopped, TXQ_OFLD);
2333			if (should_restart_tx(q) &&
2334			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
2335				goto again;
2336			q->stops++;
2337			break;
2338		}
2339
2340		gen = q->gen;
2341		q->in_use += ndesc;
2342		pidx = q->pidx;
2343		q->pidx += ndesc;
2344		if (q->pidx >= q->size) {
2345			q->pidx -= q->size;
2346			q->gen ^= 1;
2347		}
2348
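		/*
		 * The descriptors were reserved above, so the WR can be
		 * written without holding the queue lock.
		 */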
2349		(void)mbufq_dequeue(&q->sendq);
2350		TXQ_UNLOCK(qs);
2351		write_ofld_wr(adap, m, q, pidx, gen, ndesc);
2352		TXQ_LOCK(qs);
2353	}
2354#if USE_GTS
2355	set_bit(TXQ_RUNNING, &q->flags);
2356	set_bit(TXQ_LAST_PKT_DB, &q->flags);
2357#endif
2358	TXQ_UNLOCK(qs);
2359	wmb();
2360	t3_write_reg(adap, A_SG_KDOORBELL,
2361		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
2362}
2363
2364/**
2365 *	t3_offload_tx - send an offload packet
2366 *	@sc: the adapter
2367 *	@m: the packet, headed by a struct ofld_hdr
2368 *
2369 *	Sends an offload packet.  The ofld_hdr flags select the destination
2370 *	queue set and whether the packet goes to the control or offload queue.
2371 */
2372int
2373t3_offload_tx(struct adapter *sc, struct mbuf *m)
2374{
2375	struct ofld_hdr *oh = mtod(m, struct ofld_hdr *);
2376	struct sge_qset *qs = &sc->sge.qs[G_HDR_QSET(oh->flags)];
2377
2378	if (oh->flags & F_HDR_CTRL) {
2379		m_adj(m, sizeof (*oh));	/* trim ofld_hdr off */
2380		return (ctrl_xmit(sc, qs, m));
2381	} else
2382		return (ofld_xmit(sc, qs, m));
2383}
2384#endif
2385
2386static void
2387restart_tx(struct sge_qset *qs)
2388{
2389	struct adapter *sc = qs->port->adapter;
2390
2391	if (isset(&qs->txq_stopped, TXQ_OFLD) &&
2392	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2393	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2394		qs->txq[TXQ_OFLD].restarts++;
2395		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
2396	}
2397
2398	if (isset(&qs->txq_stopped, TXQ_CTRL) &&
2399	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2400	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2401		qs->txq[TXQ_CTRL].restarts++;
2402		taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
2403	}
2404}
2405
2406/**
2407 *	t3_sge_alloc_qset - initialize an SGE queue set
2408 *	@sc: the controller softc
2409 *	@id: the queue set id
2410 *	@nports: how many Ethernet ports will be using this queue set
2411 *	@irq_vec_idx: the IRQ vector index for response queue interrupts
2412 *	@p: configuration parameters for this queue set
2413 *	@ntxq: number of Tx queues for the queue set
2414 *	@pi: port info for queue set
2415 *
2416 *	Allocate resources and initialize an SGE queue set.  A queue set
2417 *	comprises a response queue, two Rx free-buffer queues, and up to 3
2418 *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
2419 *	queue, offload queue, and control queue.
2420 */
2421int
2422t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
2423		  const struct qset_params *p, int ntxq, struct port_info *pi)
2424{
2425	struct sge_qset *q = &sc->sge.qs[id];
2426	int i, ret = 0;
2427
2428	MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF);
2429	q->port = pi;
2430	q->adap = sc;
2431
2432	if ((q->txq[TXQ_ETH].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size,
2433	    M_DEVBUF, M_WAITOK, &q->lock)) == NULL) {
2434		device_printf(sc->dev, "failed to allocate mbuf ring\n");
2435		goto err;
2436	}
2437	if ((q->txq[TXQ_ETH].txq_ifq = malloc(sizeof(struct ifaltq), M_DEVBUF,
2438	    M_NOWAIT | M_ZERO)) == NULL) {
2439		device_printf(sc->dev, "failed to allocate ifq\n");
2440		goto err;
2441	}
2442	ifq_init(q->txq[TXQ_ETH].txq_ifq, pi->ifp);
2443	callout_init(&q->txq[TXQ_ETH].txq_timer, 1);
2444	callout_init(&q->txq[TXQ_ETH].txq_watchdog, 1);
2445	q->txq[TXQ_ETH].txq_timer.c_cpu = id % mp_ncpus;
2446	q->txq[TXQ_ETH].txq_watchdog.c_cpu = id % mp_ncpus;
2447
2448	init_qset_cntxt(q, id);
2449	q->idx = id;
2450	if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
2451		    sizeof(struct rx_sw_desc), &q->fl[0].phys_addr,
2452		    &q->fl[0].desc, &q->fl[0].sdesc,
2453		    &q->fl[0].desc_tag, &q->fl[0].desc_map,
2454		    sc->rx_dmat, &q->fl[0].entry_tag)) != 0) {
2455		printf("error %d from alloc ring fl0\n", ret);
2456		goto err;
2457	}
2458
2459	if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc),
2460		    sizeof(struct rx_sw_desc), &q->fl[1].phys_addr,
2461		    &q->fl[1].desc, &q->fl[1].sdesc,
2462		    &q->fl[1].desc_tag, &q->fl[1].desc_map,
2463		    sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) {
2464		printf("error %d from alloc ring fl1\n", ret);
2465		goto err;
2466	}
2467
2468	if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0,
2469		    &q->rspq.phys_addr, &q->rspq.desc, NULL,
2470		    &q->rspq.desc_tag, &q->rspq.desc_map,
2471		    NULL, NULL)) != 0) {
2472		printf("error %d from alloc ring rspq\n", ret);
2473		goto err;
2474	}
2475
2476	snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d",
2477	    device_get_unit(sc->dev), irq_vec_idx);
2478	MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF);
2479
2480	for (i = 0; i < ntxq; ++i) {
2481		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2482
2483		if ((ret = alloc_ring(sc, p->txq_size[i],
2484			    sizeof(struct tx_desc), sz,
2485			    &q->txq[i].phys_addr, &q->txq[i].desc,
2486			    &q->txq[i].sdesc, &q->txq[i].desc_tag,
2487			    &q->txq[i].desc_map,
2488			    sc->tx_dmat, &q->txq[i].entry_tag)) != 0) {
2489			printf("error %d from alloc ring tx %i\n", ret, i);
2490			goto err;
2491		}
2492		mbufq_init(&q->txq[i].sendq);
2493		q->txq[i].gen = 1;
2494		q->txq[i].size = p->txq_size[i];
2495	}
2496
2497#ifdef TCP_OFFLOAD
2498	TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q);
2499#endif
2500	TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q);
2501	TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2502	TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q);
2503
2504	q->fl[0].gen = q->fl[1].gen = 1;
2505	q->fl[0].size = p->fl_size;
2506	q->fl[1].size = p->jumbo_size;
2507
2508	q->rspq.gen = 1;
2509	q->rspq.cidx = 0;
2510	q->rspq.size = p->rspq_size;
2511
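	/*
	 * Stop the Ethernet txq when there is no longer room for a maximally
	 * fragmented packet from each port.
	 */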
2512	q->txq[TXQ_ETH].stop_thres = nports *
2513	    flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
2514
2515	q->fl[0].buf_size = MCLBYTES;
2516	q->fl[0].zone = zone_pack;
2517	q->fl[0].type = EXT_PACKET;
2518
2519	if (p->jumbo_buf_size ==  MJUM16BYTES) {
2520		q->fl[1].zone = zone_jumbo16;
2521		q->fl[1].type = EXT_JUMBO16;
2522	} else if (p->jumbo_buf_size ==  MJUM9BYTES) {
2523		q->fl[1].zone = zone_jumbo9;
2524		q->fl[1].type = EXT_JUMBO9;
2525	} else if (p->jumbo_buf_size ==  MJUMPAGESIZE) {
2526		q->fl[1].zone = zone_jumbop;
2527		q->fl[1].type = EXT_JUMBOP;
2528	} else {
2529		KASSERT(0, ("can't deal with jumbo_buf_size %d.", p->jumbo_buf_size));
2530		ret = EDOOFUS;
2531		goto err;
2532	}
2533	q->fl[1].buf_size = p->jumbo_buf_size;
2534
2535	/* Allocate and setup the lro_ctrl structure */
2536	q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO);
2537#if defined(INET6) || defined(INET)
2538	ret = tcp_lro_init(&q->lro.ctrl);
2539	if (ret) {
2540		printf("error %d from tcp_lro_init\n", ret);
2541		goto err;
2542	}
2543#endif
2544	q->lro.ctrl.ifp = pi->ifp;
2545
2546	mtx_lock_spin(&sc->sge.reg_lock);
2547	ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx,
2548				   q->rspq.phys_addr, q->rspq.size,
2549				   q->fl[0].buf_size, 1, 0);
2550	if (ret) {
2551		printf("error %d from t3_sge_init_rspcntxt\n", ret);
2552		goto err_unlock;
2553	}
2554
2555	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2556		ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0,
2557					  q->fl[i].phys_addr, q->fl[i].size,
2558					  q->fl[i].buf_size, p->cong_thres, 1,
2559					  0);
2560		if (ret) {
2561			printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i);
2562			goto err_unlock;
2563		}
2564	}
2565
2566	ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2567				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2568				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2569				 1, 0);
2570	if (ret) {
2571		printf("error %d from t3_sge_init_ecntxt\n", ret);
2572		goto err_unlock;
2573	}
2574
2575	if (ntxq > 1) {
2576		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id,
2577					 USE_GTS, SGE_CNTXT_OFLD, id,
2578					 q->txq[TXQ_OFLD].phys_addr,
2579					 q->txq[TXQ_OFLD].size, 0, 1, 0);
2580		if (ret) {
2581			printf("error %d from t3_sge_init_ecntxt\n", ret);
2582			goto err_unlock;
2583		}
2584	}
2585
2586	if (ntxq > 2) {
2587		ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0,
2588					 SGE_CNTXT_CTRL, id,
2589					 q->txq[TXQ_CTRL].phys_addr,
2590					 q->txq[TXQ_CTRL].size,
2591					 q->txq[TXQ_CTRL].token, 1, 0);
2592		if (ret) {
2593			printf("error %d from t3_sge_init_ecntxt\n", ret);
2594			goto err_unlock;
2595		}
2596	}
2597
2598	mtx_unlock_spin(&sc->sge.reg_lock);
2599	t3_update_qset_coalesce(q, p);
2600
2601	refill_fl(sc, &q->fl[0], q->fl[0].size);
2602	refill_fl(sc, &q->fl[1], q->fl[1].size);
2603	refill_rspq(sc, &q->rspq, q->rspq.size - 1);
2604
2605	t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2606		     V_NEWTIMER(q->rspq.holdoff_tmr));
2607
2608	return (0);
2609
2610err_unlock:
2611	mtx_unlock_spin(&sc->sge.reg_lock);
2612err:
2613	TXQ_LOCK(q);
2614	t3_free_qset(sc, q);
2615
2616	return (ret);
2617}
2618
2619/*
2620 * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular mbuf with
2621 * ethernet data.  Hardware assistance with various checksums and any vlan tag
2622 * will also be taken into account here.
2623 */
2624void
2625t3_rx_eth(struct adapter *adap, struct mbuf *m, int ethpad)
2626{
2627	struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad);
2628	struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]];
2629	struct ifnet *ifp = pi->ifp;
2630
2631	if (cpl->vlan_valid) {
2632		m->m_pkthdr.ether_vtag = ntohs(cpl->vlan);
2633		m->m_flags |= M_VLANTAG;
2634	}
2635
2636	m->m_pkthdr.rcvif = ifp;
2637	/*
2638	 * adjust after conversion to mbuf chain
2639	 */
2640	m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
2641	m->m_len -= (sizeof(*cpl) + ethpad);
2642	m->m_data += (sizeof(*cpl) + ethpad);
2643
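	/* csum == 0xffff with csum_valid set means the hardware verified the checksum. */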
2644	if (!cpl->fragment && cpl->csum_valid && cpl->csum == 0xffff) {
2645		struct ether_header *eh = mtod(m, void *);
2646		uint16_t eh_type;
2647
2648		if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2649			struct ether_vlan_header *evh = mtod(m, void *);
2650
2651			eh_type = evh->evl_proto;
2652		} else
2653			eh_type = eh->ether_type;
2654
2655		if (ifp->if_capenable & IFCAP_RXCSUM &&
2656		    eh_type == htons(ETHERTYPE_IP)) {
2657			m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
2658			    CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2659			m->m_pkthdr.csum_data = 0xffff;
2660		} else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
2661		    eh_type == htons(ETHERTYPE_IPV6)) {
2662			m->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
2663			    CSUM_PSEUDO_HDR);
2664			m->m_pkthdr.csum_data = 0xffff;
2665		}
2666	}
2667}
2668
2669/**
2670 *	get_packet - return the next ingress packet buffer from a free list
2671 *	@adap: the adapter that received the packet
2672 *	@drop_thres: # of remaining buffers before we start dropping packets
2673 *	@qs: the qset that the SGE free list holding the packet belongs to
2674 *      @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain
2675 *      @r: response descriptor
2676 *
2677 *	Get the next packet from a free list and complete setup of the
2678 *	mbuf.  If the packet is small we make a copy and recycle the
2679 *	original buffer, otherwise we use the original buffer itself.  If a
2680 *	positive drop threshold is supplied packets are dropped and their
2681 *	buffers recycled if (a) the number of remaining buffers is under the
2682 *	threshold and the packet is too big to copy, or (b) the packet should
2683 *	be copied but there is no memory for the copy.
2684 */
2685static int
2686get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
2687    struct t3_mbuf_hdr *mh, struct rsp_desc *r)
2688{
2689
2690	unsigned int len_cq =  ntohl(r->len_cq);
2691	struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2692	int mask, cidx = fl->cidx;
2693	struct rx_sw_desc *sd = &fl->sdesc[cidx];
2694	uint32_t len = G_RSPD_LEN(len_cq);
2695	uint32_t flags = M_EXT;
2696	uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags));
2697	caddr_t cl;
2698	struct mbuf *m;
2699	int ret = 0;
2700
2701	mask = fl->size - 1;
2702	prefetch(fl->sdesc[(cidx + 1) & mask].m);
2703	prefetch(fl->sdesc[(cidx + 2) & mask].m);
2704	prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl);
2705	prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl);
2706
2707	fl->credits--;
2708	bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
2709
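	/* Copy small single-buffer packets so the cluster can be recycled. */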
2710	if (recycle_enable && len <= SGE_RX_COPY_THRES &&
2711	    sopeop == RSPQ_SOP_EOP) {
2712		if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
2713			goto skip_recycle;
2714		cl = mtod(m, void *);
2715		memcpy(cl, sd->rxsd_cl, len);
2716		recycle_rx_buf(adap, fl, fl->cidx);
2717		m->m_pkthdr.len = m->m_len = len;
2718		m->m_flags = 0;
2719		mh->mh_head = mh->mh_tail = m;
2720		ret = 1;
2721		goto done;
2722	} else {
2723	skip_recycle:
2724		bus_dmamap_unload(fl->entry_tag, sd->map);
2725		cl = sd->rxsd_cl;
2726		m = sd->m;
2727
2728		if ((sopeop == RSPQ_SOP_EOP) ||
2729		    (sopeop == RSPQ_SOP))
2730			flags |= M_PKTHDR;
2731		m_init(m, fl->zone, fl->buf_size, M_NOWAIT, MT_DATA, flags);
2732		if (fl->zone == zone_pack) {
2733			/*
2734			 * restore clobbered data pointer
2735			 */
2736			m->m_data = m->m_ext.ext_buf;
2737		} else {
2738			m_cljset(m, cl, fl->type);
2739		}
2740		m->m_len = len;
2741	}
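	/* SOP starts a new mbuf chain, EOP completes it; middle fragments are appended. */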
2742	switch(sopeop) {
2743	case RSPQ_SOP_EOP:
2744		ret = 1;
2745		/* FALLTHROUGH */
2746	case RSPQ_SOP:
2747		mh->mh_head = mh->mh_tail = m;
2748		m->m_pkthdr.len = len;
2749		break;
2750	case RSPQ_EOP:
2751		ret = 1;
2752		/* FALLTHROUGH */
2753	case RSPQ_NSOP_NEOP:
2754		if (mh->mh_tail == NULL) {
2755			log(LOG_ERR, "discarding intermediate descriptor entry\n");
2756			m_freem(m);
2757			break;
2758		}
2759		mh->mh_tail->m_next = m;
2760		mh->mh_tail = m;
2761		mh->mh_head->m_pkthdr.len += len;
2762		break;
2763	}
2764	if (cxgb_debug)
2765		printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len);
2766done:
2767	if (++fl->cidx == fl->size)
2768		fl->cidx = 0;
2769
2770	return (ret);
2771}
2772
2773/**
2774 *	handle_rsp_cntrl_info - handles control information in a response
2775 *	@qs: the queue set corresponding to the response
2776 *	@flags: the response control flags
2777 *
2778 *	Handles the control information of an SGE response, such as GTS
2779 *	indications and completion credits for the queue set's Tx queues.
2780 *	HW coalesces credits, we don't do any extra SW coalescing.
2781 */
2782static __inline void
2783handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
2784{
2785	unsigned int credits;
2786
2787#if USE_GTS
2788	if (flags & F_RSPD_TXQ0_GTS)
2789		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2790#endif
2791	credits = G_RSPD_TXQ0_CR(flags);
2792	if (credits)
2793		qs->txq[TXQ_ETH].processed += credits;
2794
2795	credits = G_RSPD_TXQ2_CR(flags);
2796	if (credits)
2797		qs->txq[TXQ_CTRL].processed += credits;
2798
2799# if USE_GTS
2800	if (flags & F_RSPD_TXQ1_GTS)
2801		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2802# endif
2803	credits = G_RSPD_TXQ1_CR(flags);
2804	if (credits)
2805		qs->txq[TXQ_OFLD].processed += credits;
2806
2807}
2808
2809static void
2810check_ring_db(adapter_t *adap, struct sge_qset *qs,
2811    unsigned int sleeping)
2812{
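	/* Nothing to do; GTS-based doorbell recovery is not used. */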
2813	;
2814}
2815
2816/**
2817 *	process_responses - process responses from an SGE response queue
2818 *	@adap: the adapter
2819 *	@qs: the queue set to which the response queue belongs
2820 *	@budget: how many responses can be processed in this round
2821 *
2822 *	Process responses from an SGE response queue up to the supplied budget.
2823 *	Responses include received packets as well as credits and other events
2824 *	for the queues that belong to the response queue's queue set.
2825 *	A negative budget is effectively unlimited.
2826 *
2827 *	Additionally choose the interrupt holdoff time for the next interrupt
2828 *	on this queue.  If the system is under memory shortage use a fairly
2829 *	long delay to help recovery.
2830 */
2831static int
2832process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
2833{
2834	struct sge_rspq *rspq = &qs->rspq;
2835	struct rsp_desc *r = &rspq->desc[rspq->cidx];
2836	int budget_left = budget;
2837	unsigned int sleeping = 0;
2838#if defined(INET6) || defined(INET)
2839	int lro_enabled = qs->lro.enabled;
2840	int skip_lro;
2841	struct lro_ctrl *lro_ctrl = &qs->lro.ctrl;
2842#endif
2843	struct t3_mbuf_hdr *mh = &rspq->rspq_mh;
2844#ifdef DEBUG
2845	static int last_holdoff = 0;
2846	if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) {
2847		printf("next_holdoff=%d\n", rspq->holdoff_tmr);
2848		last_holdoff = rspq->holdoff_tmr;
2849	}
2850#endif
2851	rspq->next_holdoff = rspq->holdoff_tmr;
2852
2853	while (__predict_true(budget_left && is_new_response(r, rspq))) {
2854		int eth, eop = 0, ethpad = 0;
2855		uint32_t flags = ntohl(r->flags);
2856		uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
2857		uint8_t opcode = r->rss_hdr.opcode;
2858
2859		eth = (opcode == CPL_RX_PKT);
2860
2861		if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) {
2862			struct mbuf *m;
2863
2864			if (cxgb_debug)
2865				printf("async notification\n");
2866
2867			if (mh->mh_head == NULL) {
2868				mh->mh_head = m_gethdr(M_NOWAIT, MT_DATA);
2869				m = mh->mh_head;
2870			} else {
2871				m = m_gethdr(M_NOWAIT, MT_DATA);
2872			}
2873			if (m == NULL)
2874				goto no_mem;
2875
2876			memcpy(mtod(m, char *), r, AN_PKT_SIZE);
2877			m->m_len = m->m_pkthdr.len = AN_PKT_SIZE;
2878			*mtod(m, char *) = CPL_ASYNC_NOTIF;
2879			opcode = CPL_ASYNC_NOTIF;
2880			eop = 1;
2881			rspq->async_notif++;
2882			goto skip;
2883		} else if  (flags & F_RSPD_IMM_DATA_VALID) {
2884			struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
2885
2886			if (m == NULL) {
2887		no_mem:
2888				rspq->next_holdoff = NOMEM_INTR_DELAY;
2889				budget_left--;
2890				break;
2891			}
2892			if (mh->mh_head == NULL)
2893				mh->mh_head = m;
2894			else
2895				mh->mh_tail->m_next = m;
2896			mh->mh_tail = m;
2897
2898			get_imm_packet(adap, r, m);
2899			mh->mh_head->m_pkthdr.len += m->m_len;
2900			eop = 1;
2901			rspq->imm_data++;
2902		} else if (r->len_cq) {
2903			int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
2904
2905			eop = get_packet(adap, drop_thresh, qs, mh, r);
2906			if (eop) {
2907				if (r->rss_hdr.hash_type && !adap->timestamp)
2908					mh->mh_head->m_flags |= M_FLOWID;
2909				mh->mh_head->m_pkthdr.flowid = rss_hash;
2910			}
2911
2912			ethpad = 2;
2913		} else {
2914			rspq->pure_rsps++;
2915		}
2916	skip:
2917		if (flags & RSPD_CTRL_MASK) {
2918			sleeping |= flags & RSPD_GTS_MASK;
2919			handle_rsp_cntrl_info(qs, flags);
2920		}
2921
2922		if (!eth && eop) {
2923			rspq->offload_pkts++;
2924#ifdef TCP_OFFLOAD
2925			adap->cpl_handler[opcode](qs, r, mh->mh_head);
2926#else
2927			m_freem(mh->mh_head);
2928#endif
2929			mh->mh_head = NULL;
2930		} else if (eth && eop) {
2931			struct mbuf *m = mh->mh_head;
2932
2933			t3_rx_eth(adap, m, ethpad);
2934
2935			/*
2936			 * The T304 sends incoming packets on any qset.  If LRO
2937			 * is also enabled, we could end up sending the packet up
2938			 * lro_ctrl->ifp's input.  That is incorrect.
2939			 *
2940			 * The mbuf's rcvif was derived from the cpl header and
2941			 * is accurate.  Skip LRO and just use that.
2942			 */
2943#if defined(INET6) || defined(INET)
2944			skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif);
2945
2946			if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro
2947			    && (tcp_lro_rx(lro_ctrl, m, 0) == 0)
2948			    ) {
2949				/* successfully queued for LRO */
2950			} else
2951#endif
2952			{
2953				/*
2954				 * LRO not enabled, packet unsuitable for LRO,
2955				 * or unable to queue.  Pass it up right now in
2956				 * either case.
2957				 */
2958				struct ifnet *ifp = m->m_pkthdr.rcvif;
2959				(*ifp->if_input)(ifp, m);
2960			}
2961			mh->mh_head = NULL;
2962
2963		}
2964
2965		r++;
2966		if (__predict_false(++rspq->cidx == rspq->size)) {
2967			rspq->cidx = 0;
2968			rspq->gen ^= 1;
2969			r = rspq->desc;
2970		}
2971
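		/* Return response queue credits to the hardware in batches of 64. */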
2972		if (++rspq->credits >= 64) {
2973			refill_rspq(adap, rspq, rspq->credits);
2974			rspq->credits = 0;
2975		}
2976		__refill_fl_lt(adap, &qs->fl[0], 32);
2977		__refill_fl_lt(adap, &qs->fl[1], 32);
2978		--budget_left;
2979	}
2980
2981#if defined(INET6) || defined(INET)
2982	/* Flush LRO */
2983	while (!SLIST_EMPTY(&lro_ctrl->lro_active)) {
2984		struct lro_entry *queued = SLIST_FIRST(&lro_ctrl->lro_active);
2985		SLIST_REMOVE_HEAD(&lro_ctrl->lro_active, next);
2986		tcp_lro_flush(lro_ctrl, queued);
2987	}
2988#endif
2989
2990	if (sleeping)
2991		check_ring_db(adap, qs, sleeping);
2992
2993	mb();  /* commit Tx queue processed updates */
2994	if (__predict_false(qs->txq_stopped > 1))
2995		restart_tx(qs);
2996
2997	__refill_fl_lt(adap, &qs->fl[0], 512);
2998	__refill_fl_lt(adap, &qs->fl[1], 512);
2999	budget -= budget_left;
3000	return (budget);
3001}
3002
3003/*
3004 * A helper function that processes responses and issues GTS.
3005 */
3006static __inline int
3007process_responses_gts(adapter_t *adap, struct sge_rspq *rq)
3008{
3009	int work;
3010	static int last_holdoff = 0;
3011
3012	work = process_responses(adap, rspq_to_qset(rq), -1);
3013
3014	if (cxgb_debug && (rq->next_holdoff != last_holdoff)) {
3015		printf("next_holdoff=%d\n", rq->next_holdoff);
3016		last_holdoff = rq->next_holdoff;
3017	}
3018	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
3019	    V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
3020
3021	return (work);
3022}
3023
3024
3025/*
3026 * Interrupt handler for legacy INTx interrupts for T3B-based cards.
3027 * Handles data events from SGE response queues as well as error and other
3028 * async events as they all use the same interrupt pin.  We use one SGE
3029 * response queue per port in this mode and protect all response queues with
3030 * queue 0's lock.
3031 */
3032void
3033t3b_intr(void *data)
3034{
3035	uint32_t i, map;
3036	adapter_t *adap = data;
3037	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3038
3039	t3_write_reg(adap, A_PL_CLI, 0);
3040	map = t3_read_reg(adap, A_SG_DATA_INTR);
3041
3042	if (!map)
3043		return;
3044
3045	if (__predict_false(map & F_ERRINTR)) {
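		/*
		 * Disable further interrupts (the read flushes the write) and
		 * defer to the slow interrupt task.
		 */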
3046		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3047		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3048		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3049	}
3050
3051	mtx_lock(&q0->lock);
3052	for_each_port(adap, i)
3053	    if (map & (1 << i))
3054			process_responses_gts(adap, &adap->sge.qs[i].rspq);
3055	mtx_unlock(&q0->lock);
3056}
3057
3058/*
3059 * The MSI interrupt handler.  This needs to handle data events from SGE
3060 * response queues as well as error and other async events as they all use
3061 * the same MSI vector.  We use one SGE response queue per port in this mode
3062 * and protect all response queues with queue 0's lock.
3063 */
3064void
3065t3_intr_msi(void *data)
3066{
3067	adapter_t *adap = data;
3068	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
3069	int i, new_packets = 0;
3070
3071	mtx_lock(&q0->lock);
3072
3073	for_each_port(adap, i)
3074	    if (process_responses_gts(adap, &adap->sge.qs[i].rspq))
3075		    new_packets = 1;
3076	mtx_unlock(&q0->lock);
3077	if (new_packets == 0) {
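		/* No rx work found; assume a slow-path (error/async) event. */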
3078		t3_write_reg(adap, A_PL_INT_ENABLE0, 0);
3079		(void) t3_read_reg(adap, A_PL_INT_ENABLE0);
3080		taskqueue_enqueue(adap->tq, &adap->slow_intr_task);
3081	}
3082}
3083
3084void
3085t3_intr_msix(void *data)
3086{
3087	struct sge_qset *qs = data;
3088	adapter_t *adap = qs->port->adapter;
3089	struct sge_rspq *rspq = &qs->rspq;
3090
3091	if (process_responses_gts(adap, rspq) == 0)
3092		rspq->unhandled_irqs++;
3093}
3094
3095#define QDUMP_SBUF_SIZE		32 * 400
3096static int
3097t3_dump_rspq(SYSCTL_HANDLER_ARGS)
3098{
3099	struct sge_rspq *rspq;
3100	struct sge_qset *qs;
3101	int i, err, dump_end, idx;
3102	struct sbuf *sb;
3103	struct rsp_desc *rspd;
3104	uint32_t data[4];
3105
3106	rspq = arg1;
3107	qs = rspq_to_qset(rspq);
3108	if (rspq->rspq_dump_count == 0)
3109		return (0);
3110	if (rspq->rspq_dump_count > RSPQ_Q_SIZE) {
3111		log(LOG_WARNING,
3112		    "dump count is too large %d\n", rspq->rspq_dump_count);
3113		rspq->rspq_dump_count = 0;
3114		return (EINVAL);
3115	}
3116	if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) {
3117		log(LOG_WARNING,
3118		    "dump start of %d is greater than queue size\n",
3119		    rspq->rspq_dump_start);
3120		rspq->rspq_dump_start = 0;
3121		return (EINVAL);
3122	}
3123	err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data);
3124	if (err)
3125		return (err);
3126	err = sysctl_wire_old_buffer(req, 0);
3127	if (err)
3128		return (err);
3129	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3130
3131	sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n",
3132	    (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f),
3133	    ((data[2] >> 26) & 1), ((data[2] >> 27) & 1));
3134	sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n",
3135	    ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]);
3136
3137	sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start,
3138	    (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1));
3139
3140	dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count;
3141	for (i = rspq->rspq_dump_start; i < dump_end; i++) {
3142		idx = i & (RSPQ_Q_SIZE-1);
3143
3144		rspd = &rspq->desc[idx];
3145		sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n",
3146		    idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx,
3147		    rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx));
3148		sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n",
3149		    rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags),
3150		    be32toh(rspd->len_cq), rspd->intr_gen);
3151	}
3152
3153	err = sbuf_finish(sb);
3154	/* Output a trailing NUL. */
3155	if (err == 0)
3156		err = SYSCTL_OUT(req, "", 1);
3157	sbuf_delete(sb);
3158	return (err);
3159}
3160
3161static int
3162t3_dump_txq_eth(SYSCTL_HANDLER_ARGS)
3163{
3164	struct sge_txq *txq;
3165	struct sge_qset *qs;
3166	int i, j, err, dump_end;
3167	struct sbuf *sb;
3168	struct tx_desc *txd;
3169	uint32_t *WR, wr_hi, wr_lo, gen;
3170	uint32_t data[4];
3171
3172	txq = arg1;
3173	qs = txq_to_qset(txq, TXQ_ETH);
3174	if (txq->txq_dump_count == 0) {
3175		return (0);
3176	}
3177	if (txq->txq_dump_count > TX_ETH_Q_SIZE) {
3178		log(LOG_WARNING,
3179		    "dump count is too large %d\n", txq->txq_dump_count);
3180		txq->txq_dump_count = 1;
3181		return (EINVAL);
3182	}
3183	if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) {
3184		log(LOG_WARNING,
3185		    "dump start of %d is greater than queue size\n",
3186		    txq->txq_dump_start);
3187		txq->txq_dump_start = 0;
3188		return (EINVAL);
3189	}
3190	err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data);
3191	if (err)
3192		return (err);
3193	err = sysctl_wire_old_buffer(req, 0);
3194	if (err)
3195		return (err);
3196	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3197
3198	sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n",
3199	    (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16),
3200	    (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1));
3201	sbuf_printf(sb, " TUN=%u TOE=%u generation=%u uP token=%u valid=%u\n",
3202	    ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1),
3203	    ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1));
3204	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3205	    txq->txq_dump_start,
3206	    (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1));
3207
3208	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3209	for (i = txq->txq_dump_start; i < dump_end; i++) {
3210		txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)];
3211		WR = (uint32_t *)txd->flit;
3212		wr_hi = ntohl(WR[0]);
3213		wr_lo = ntohl(WR[1]);
3214		gen = G_WR_GEN(wr_lo);
3215
3216		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3217		    wr_hi, wr_lo, gen);
3218		for (j = 2; j < 30; j += 4)
3219			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3220			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3221
3222	}
3223	err = sbuf_finish(sb);
3224	/* Output a trailing NUL. */
3225	if (err == 0)
3226		err = SYSCTL_OUT(req, "", 1);
3227	sbuf_delete(sb);
3228	return (err);
3229}
3230
3231static int
3232t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS)
3233{
3234	struct sge_txq *txq;
3235	struct sge_qset *qs;
3236	int i, j, err, dump_end;
3237	struct sbuf *sb;
3238	struct tx_desc *txd;
3239	uint32_t *WR, wr_hi, wr_lo, gen;
3240
3241	txq = arg1;
3242	qs = txq_to_qset(txq, TXQ_CTRL);
3243	if (txq->txq_dump_count == 0) {
3244		return (0);
3245	}
3246	if (txq->txq_dump_count > 256) {
3247		log(LOG_WARNING,
3248		    "dump count is too large %d\n", txq->txq_dump_count);
3249		txq->txq_dump_count = 1;
3250		return (EINVAL);
3251	}
3252	if (txq->txq_dump_start > 255) {
3253		log(LOG_WARNING,
3254		    "dump start of %d is greater than queue size\n",
3255		    txq->txq_dump_start);
3256		txq->txq_dump_start = 0;
3257		return (EINVAL);
3258	}
3259
3260	err = sysctl_wire_old_buffer(req, 0);
3261	if (err != 0)
3262		return (err);
3263	sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req);
3264	sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx,
3265	    txq->txq_dump_start,
3266	    (txq->txq_dump_start + txq->txq_dump_count) & 255);
3267
3268	dump_end = txq->txq_dump_start + txq->txq_dump_count;
3269	for (i = txq->txq_dump_start; i < dump_end; i++) {
3270		txd = &txq->desc[i & (255)];
3271		WR = (uint32_t *)txd->flit;
3272		wr_hi = ntohl(WR[0]);
3273		wr_lo = ntohl(WR[1]);
3274		gen = G_WR_GEN(wr_lo);
3275
3276		sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n",
3277		    wr_hi, wr_lo, gen);
3278		for (j = 2; j < 30; j += 4)
3279			sbuf_printf(sb, "\t%08x %08x %08x %08x \n",
3280			    WR[j], WR[j + 1], WR[j + 2], WR[j + 3]);
3281
3282	}
3283	err = sbuf_finish(sb);
3284	/* Output a trailing NUL. */
3285	if (err == 0)
3286		err = SYSCTL_OUT(req, "", 1);
3287	sbuf_delete(sb);
3288	return (err);
3289}
3290
3291static int
3292t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS)
3293{
3294	adapter_t *sc = arg1;
3295	struct qset_params *qsp = &sc->params.sge.qset[0];
3296	int coalesce_usecs;
3297	struct sge_qset *qs;
3298	int i, j, err, nqsets = 0;
3299	struct mtx *lock;
3300
3301	if ((sc->flags & FULL_INIT_DONE) == 0)
3302		return (ENXIO);
3303
3304	coalesce_usecs = qsp->coalesce_usecs;
3305	err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req);
3306
3307	if (err != 0) {
3308		return (err);
3309	}
3310	if (coalesce_usecs == qsp->coalesce_usecs)
3311		return (0);
3312
3313	for (i = 0; i < sc->params.nports; i++)
3314		for (j = 0; j < sc->port[i].nqsets; j++)
3315			nqsets++;
3316
3317	coalesce_usecs = max(1, coalesce_usecs);
3318
3319	for (i = 0; i < nqsets; i++) {
3320		qs = &sc->sge.qs[i];
3321		qsp = &sc->params.sge.qset[i];
3322		qsp->coalesce_usecs = coalesce_usecs;
3323
3324		lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock :
3325			    &sc->sge.qs[0].rspq.lock;
3326
3327		mtx_lock(lock);
3328		t3_update_qset_coalesce(qs, qsp);
3329		t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
3330		    V_NEWTIMER(qs->rspq.holdoff_tmr));
3331		mtx_unlock(lock);
3332	}
3333
3334	return (0);
3335}
3336
3337static int
3338t3_pkt_timestamp(SYSCTL_HANDLER_ARGS)
3339{
3340	adapter_t *sc = arg1;
3341	int rc, timestamp;
3342
3343	if ((sc->flags & FULL_INIT_DONE) == 0)
3344		return (ENXIO);
3345
3346	timestamp = sc->timestamp;
3347	rc = sysctl_handle_int(oidp, &timestamp, arg2, req);
3348
3349	if (rc != 0)
3350		return (rc);
3351
3352	if (timestamp != sc->timestamp) {
3353		t3_set_reg_field(sc, A_TP_PC_CONFIG2, F_ENABLERXPKTTMSTPRSS,
3354		    timestamp ? F_ENABLERXPKTTMSTPRSS : 0);
3355		sc->timestamp = timestamp;
3356	}
3357
3358	return (0);
3359}
3360
3361void
3362t3_add_attach_sysctls(adapter_t *sc)
3363{
3364	struct sysctl_ctx_list *ctx;
3365	struct sysctl_oid_list *children;
3366
3367	ctx = device_get_sysctl_ctx(sc->dev);
3368	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3369
3370	/* random information */
3371	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3372	    "firmware_version",
3373	    CTLFLAG_RD, sc->fw_version,
3374	    0, "firmware version");
3375	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3376	    "hw_revision",
3377	    CTLFLAG_RD, &sc->params.rev,
3378	    0, "chip model");
3379	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
3380	    "port_types",
3381	    CTLFLAG_RD, sc->port_types,
3382	    0, "type of ports");
3383	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3384	    "enable_debug",
3385	    CTLFLAG_RW, &cxgb_debug,
3386	    0, "enable verbose debugging output");
3387	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tunq_coalesce",
3388	    CTLFLAG_RD, &sc->tunq_coalesce,
3389	    "#tunneled packets freed");
3390	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
3391	    "txq_overrun",
3392	    CTLFLAG_RD, &txq_fills,
3393	    0, "#times txq overrun");
3394	SYSCTL_ADD_UINT(ctx, children, OID_AUTO,
3395	    "core_clock",
3396	    CTLFLAG_RD, &sc->params.vpd.cclk,
3397	    0, "core clock frequency (in KHz)");
3398}
3399
3400
3401static const char *rspq_name = "rspq";
3402static const char *txq_names[] =
3403{
3404	"txq_eth",
3405	"txq_ofld",
3406	"txq_ctrl"
3407};
3408
3409static int
3410sysctl_handle_macstat(SYSCTL_HANDLER_ARGS)
3411{
3412	struct port_info *p = arg1;
3413	uint64_t *parg;
3414
3415	if (!p)
3416		return (EINVAL);
3417
3418	parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2);
3419	PORT_LOCK(p);
3420	t3_mac_update_stats(&p->mac);
3421	PORT_UNLOCK(p);
3422
3423	return (sysctl_handle_64(oidp, parg, 0, req));
3424}
3425
3426void
3427t3_add_configured_sysctls(adapter_t *sc)
3428{
3429	struct sysctl_ctx_list *ctx;
3430	struct sysctl_oid_list *children;
3431	int i, j;
3432
3433	ctx = device_get_sysctl_ctx(sc->dev);
3434	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
3435
3436	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3437	    "intr_coal",
3438	    CTLTYPE_INT|CTLFLAG_RW, sc,
3439	    0, t3_set_coalesce_usecs,
3440	    "I", "interrupt coalescing timer (us)");
3441
3442	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
3443	    "pkt_timestamp",
3444	    CTLTYPE_INT | CTLFLAG_RW, sc,
3445	    0, t3_pkt_timestamp,
3446	    "I", "provide packet timestamp instead of connection hash");
3447
3448	for (i = 0; i < sc->params.nports; i++) {
3449		struct port_info *pi = &sc->port[i];
3450		struct sysctl_oid *poid;
3451		struct sysctl_oid_list *poidlist;
3452		struct mac_stats *mstats = &pi->mac.stats;
3453
3454		snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
3455		poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
3456		    pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
3457		poidlist = SYSCTL_CHILDREN(poid);
3458		SYSCTL_ADD_UINT(ctx, poidlist, OID_AUTO,
3459		    "nqsets", CTLFLAG_RD, &pi->nqsets,
3460		    0, "#queue sets");
3461
3462		for (j = 0; j < pi->nqsets; j++) {
3463			struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
3464			struct sysctl_oid *qspoid, *rspqpoid, *txqpoid,
3465					  *ctrlqpoid, *lropoid;
3466			struct sysctl_oid_list *qspoidlist, *rspqpoidlist,
3467					       *txqpoidlist, *ctrlqpoidlist,
3468					       *lropoidlist;
3469			struct sge_txq *txq = &qs->txq[TXQ_ETH];
3470
3471			snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
3472
3473			qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
3474			    qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
3475			qspoidlist = SYSCTL_CHILDREN(qspoid);
3476
3477			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty",
3478					CTLFLAG_RD, &qs->fl[0].empty, 0,
3479					"freelist #0 empty");
3480			SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty",
3481					CTLFLAG_RD, &qs->fl[1].empty, 0,
3482					"freelist #1 empty");
3483
3484			rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3485			    rspq_name, CTLFLAG_RD, NULL, "rspq statistics");
3486			rspqpoidlist = SYSCTL_CHILDREN(rspqpoid);
3487
3488			txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3489			    txq_names[0], CTLFLAG_RD, NULL, "txq statistics");
3490			txqpoidlist = SYSCTL_CHILDREN(txqpoid);
3491
3492			ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3493			    txq_names[2], CTLFLAG_RD, NULL, "ctrlq statistics");
3494			ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid);
3495
3496			lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO,
3497			    "lro_stats", CTLFLAG_RD, NULL, "LRO statistics");
3498			lropoidlist = SYSCTL_CHILDREN(lropoid);
3499
3500			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size",
3501			    CTLFLAG_RD, &qs->rspq.size,
3502			    0, "#entries in response queue");
3503			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx",
3504			    CTLFLAG_RD, &qs->rspq.cidx,
3505			    0, "consumer index");
3506			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits",
3507			    CTLFLAG_RD, &qs->rspq.credits,
3508			    0, "#credits");
3509			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "starved",
3510			    CTLFLAG_RD, &qs->rspq.starved,
3511			    0, "#times starved");
3512			SYSCTL_ADD_UAUTO(ctx, rspqpoidlist, OID_AUTO, "phys_addr",
3513			    CTLFLAG_RD, &qs->rspq.phys_addr,
3514			    "physical address of the queue");
3515			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start",
3516			    CTLFLAG_RW, &qs->rspq.rspq_dump_start,
3517			    0, "start rspq dump entry");
3518			SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count",
3519			    CTLFLAG_RW, &qs->rspq.rspq_dump_count,
3520			    0, "#rspq entries to dump");
3521			SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump",
3522			    CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq,
3523			    0, t3_dump_rspq, "A", "dump of the response queue");
3524
3525			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "dropped",
3526			    CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_mr->br_drops,
3527			    "#tunneled packets dropped");
3528			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "sendqlen",
3529			    CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen,
3530			    0, "#tunneled packets waiting to be sent");
3531#if 0
3532			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx",
3533			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
3534			    0, "#tunneled packets queue producer index");
3535			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx",
3536			    CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
3537			    0, "#tunneled packets queue consumer index");
3538#endif
3539			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "processed",
3540			    CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
3541			    0, "#tunneled packets processed by the card");
3542			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned",
3543			    CTLFLAG_RD, &txq->cleaned,
3544			    0, "#tunneled packets cleaned");
3545			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use",
3546			    CTLFLAG_RD, &txq->in_use,
3547			    0, "#tunneled packet slots in use");
3548			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "frees",
3549			    CTLFLAG_RD, &txq->txq_frees,
3550			    "#tunneled packets freed");
3551			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped",
3552			    CTLFLAG_RD, &txq->txq_skipped,
3553			    0, "#tunneled packet descriptors skipped");
3554			SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "coalesced",
3555			    CTLFLAG_RD, &txq->txq_coalesced,
3556			    "#tunneled packets coalesced");
3557			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued",
3558			    CTLFLAG_RD, &txq->txq_enqueued,
3559			    0, "#tunneled packets enqueued to hardware");
3560			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags",
3561			    CTLFLAG_RD, &qs->txq_stopped,
3562			    0, "tx queues stopped");
3563			SYSCTL_ADD_UAUTO(ctx, txqpoidlist, OID_AUTO, "phys_addr",
3564			    CTLFLAG_RD, &txq->phys_addr,
3565			    "physical address of the queue");
3566			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen",
3567			    CTLFLAG_RW, &qs->txq[TXQ_ETH].gen,
3568			    0, "txq generation");
3569			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx",
3570			    CTLFLAG_RD, &txq->cidx,
3571			    0, "hardware queue cidx");
3572			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx",
3573			    CTLFLAG_RD, &txq->pidx,
3574			    0, "hardware queue pidx");
3575			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start",
3576			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start,
3577			    0, "txq start idx for dump");
3578			SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count",
3579			    CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count,
3580			    0, "txq #entries to dump");
3581			SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump",
3582			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH],
3583			    0, t3_dump_txq_eth, "A", "dump of the transmit queue");
3584
3585			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start",
3586			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start,
3587			    0, "ctrlq start idx for dump");
3588			SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count",
3589			    CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count,
3590			    0, "ctrl #entries to dump");
3591			SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump",
3592			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_CTRL],
3593			    0, t3_dump_txq_ctrl, "A", "dump of the transmit queue");
3594
3595			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_queued",
3596			    CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, NULL);
3597			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_flushed",
3598			    CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, NULL);
3599			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_bad_csum",
3600			    CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, NULL);
3601			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt",
3602			    CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, NULL);
3603		}
3604
3605		/* Now add a node for mac stats. */
3606		poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats",
3607		    CTLFLAG_RD, NULL, "MAC statistics");
3608		poidlist = SYSCTL_CHILDREN(poid);
3609
3610		/*
3611		 * We (ab)use the length argument (arg2) to pass on the offset
3612		 * of the data that we are interested in.  This is only required
3613		 * for the quad counters that are updated from the hardware (we
3614		 * make sure that we return the latest value).
3615		 * sysctl_handle_macstat first updates *all* the counters from
3616		 * the hardware, and then returns the latest value of the
3617		 * requested counter.  Best would be to update only the
3618		 * requested counter from hardware, but t3_mac_update_stats()
3619		 * hides all the register details and we don't want to dive into
3620		 * all that here.
3621		 */
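
		/*
		 * For illustration only (the real sysctl_handle_macstat is
		 * defined elsewhere in this file, and the lock and field
		 * names here are assumptions): a handler built around the
		 * offset trick described above would look roughly like
		 *
		 *	static int
		 *	sysctl_handle_macstat(SYSCTL_HANDLER_ARGS)
		 *	{
		 *		struct port_info *p = arg1;
		 *		uint64_t val;
		 *
		 *		PORT_LOCK(p);
		 *		t3_mac_update_stats(&p->mac);
		 *		val = *(uint64_t *)((uintptr_t)&p->mac.stats + arg2);
		 *		PORT_UNLOCK(p);
		 *		return (sysctl_handle_64(oidp, &val, 0, req));
		 *	}
		 */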
3622#define CXGB_SYSCTL_ADD_QUAD(a)	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \
3623    (CTLTYPE_U64 | CTLFLAG_RD), pi, offsetof(struct mac_stats, a), \
3624    sysctl_handle_macstat, "QU", 0)
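
		/*
		 * For reference, CXGB_SYSCTL_ADD_QUAD(tx_octets) below expands
		 * to roughly:
		 *
		 *	SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, "tx_octets",
		 *	    (CTLTYPE_U64 | CTLFLAG_RD), pi,
		 *	    offsetof(struct mac_stats, tx_octets),
		 *	    sysctl_handle_macstat, "QU", 0);
		 */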
3625		CXGB_SYSCTL_ADD_QUAD(tx_octets);
3626		CXGB_SYSCTL_ADD_QUAD(tx_octets_bad);
3627		CXGB_SYSCTL_ADD_QUAD(tx_frames);
3628		CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames);
3629		CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames);
3630		CXGB_SYSCTL_ADD_QUAD(tx_pause);
3631		CXGB_SYSCTL_ADD_QUAD(tx_deferred);
3632		CXGB_SYSCTL_ADD_QUAD(tx_late_collisions);
3633		CXGB_SYSCTL_ADD_QUAD(tx_total_collisions);
3634		CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions);
3635		CXGB_SYSCTL_ADD_QUAD(tx_underrun);
3636		CXGB_SYSCTL_ADD_QUAD(tx_len_errs);
3637		CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs);
3638		CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral);
3639		CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs);
3640		CXGB_SYSCTL_ADD_QUAD(tx_frames_64);
3641		CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127);
3642		CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255);
3643		CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511);
3644		CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023);
3645		CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518);
3646		CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max);
3647		CXGB_SYSCTL_ADD_QUAD(rx_octets);
3648		CXGB_SYSCTL_ADD_QUAD(rx_octets_bad);
3649		CXGB_SYSCTL_ADD_QUAD(rx_frames);
3650		CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames);
3651		CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames);
3652		CXGB_SYSCTL_ADD_QUAD(rx_pause);
3653		CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs);
3654		CXGB_SYSCTL_ADD_QUAD(rx_align_errs);
3655		CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs);
3656		CXGB_SYSCTL_ADD_QUAD(rx_data_errs);
3657		CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs);
3658		CXGB_SYSCTL_ADD_QUAD(rx_runt);
3659		CXGB_SYSCTL_ADD_QUAD(rx_jabber);
3660		CXGB_SYSCTL_ADD_QUAD(rx_short);
3661		CXGB_SYSCTL_ADD_QUAD(rx_too_long);
3662		CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs);
3663		CXGB_SYSCTL_ADD_QUAD(rx_cong_drops);
3664		CXGB_SYSCTL_ADD_QUAD(rx_frames_64);
3665		CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127);
3666		CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255);
3667		CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511);
3668		CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023);
3669		CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518);
3670		CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max);
3671#undef CXGB_SYSCTL_ADD_QUAD
3672
3673#define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \
3674    CTLFLAG_RD, &mstats->a, 0)
3675		CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err);
3676		CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err);
3677		CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun);
3678		CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl);
3679		CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss);
3680		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err);
3681		CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change);
3682		CXGB_SYSCTL_ADD_ULONG(num_toggled);
3683		CXGB_SYSCTL_ADD_ULONG(num_resets);
3684		CXGB_SYSCTL_ADD_ULONG(link_faults);
3685#undef CXGB_SYSCTL_ADD_ULONG
3686	}
3687}
3688
3689/**
3690 *	t3_get_desc - dump an SGE descriptor for debugging purposes
3691 *	@qs: the queue set
3692 *	@qnum: identifies the specific queue (0..2: Tx, 3: response, 4..5: Rx)
3693 *	@idx: the descriptor index in the queue
3694 *	@data: where to dump the descriptor contents
3695 *
3696 *	Dumps the contents of a HW descriptor of an SGE queue.  Returns the
3697 *	size of the descriptor.
3698 */
3699int
3700t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
3701		unsigned char *data)
3702{
3703	if (qnum >= 6)
3704		return (EINVAL);
3705
3706	if (qnum < 3) {
3707		if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
3708			return (-EINVAL);
3709		memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
3710		return (sizeof(struct tx_desc));
3711	}
3712
3713	if (qnum == 3) {
3714		if (!qs->rspq.desc || idx >= qs->rspq.size)
3715			return (EINVAL);
3716		memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
3717		return (sizeof(struct rsp_desc));
3718	}
3719
3720	qnum -= 4;
3721	if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
3722		return (EINVAL);
3723	memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
3724	return (sizeof(struct rx_desc));
3725}
3726
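/*
 * Example (illustrative only, not taken from the driver): a debugging
 * caller that uses t3_get_desc() to copy out the first descriptor of a
 * queue set's Ethernet Tx queue.  Per the qnum encoding documented above,
 * 0..2 select the Tx queues (TXQ_ETH is 0), 3 the response queue, and
 * 4..5 the free lists.
 *
 *	unsigned char buf[sizeof(struct tx_desc)];
 *	int len;
 *
 *	len = t3_get_desc(qs, TXQ_ETH, 0, buf);
 *	if (len == sizeof(struct tx_desc))
 *		log(LOG_DEBUG, "copied a %d-byte tx descriptor\n", len);
 */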