1/*-
2 * Copyright (c) 2011 Chelsio Communications, Inc.
3 * All rights reserved.
4 * Written by: Navdeep Parhar <np@FreeBSD.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: stable/10/sys/dev/cxgbe/t4_sge.c 308304 2016-11-04 18:45:06Z jhb $");
30
31#include "opt_inet.h"
32#include "opt_inet6.h"
33
34#include <sys/types.h>
35#include <sys/mbuf.h>
36#include <sys/socket.h>
37#include <sys/kernel.h>
38#include <sys/malloc.h>
39#include <sys/queue.h>
40#include <sys/sbuf.h>
41#include <sys/taskqueue.h>
42#include <sys/time.h>
43#include <sys/sglist.h>
44#include <sys/sysctl.h>
45#include <sys/smp.h>
46#include <sys/counter.h>
47#include <net/bpf.h>
48#include <net/ethernet.h>
49#include <net/if.h>
50#include <net/if_vlan_var.h>
51#include <netinet/in.h>
52#include <netinet/ip.h>
53#include <netinet/ip6.h>
54#include <netinet/tcp.h>
55#include <machine/md_var.h>
56#include <vm/vm.h>
57#include <vm/pmap.h>
58#ifdef DEV_NETMAP
59#include <machine/bus.h>
60#include <sys/selinfo.h>
61#include <net/if_var.h>
62#include <net/netmap.h>
63#include <dev/netmap/netmap_kern.h>
64#endif
65
66#include "common/common.h"
67#include "common/t4_regs.h"
68#include "common/t4_regs_values.h"
69#include "common/t4_msg.h"
70#include "t4_mp_ring.h"
71
72#ifdef T4_PKT_TIMESTAMP
73#define RX_COPY_THRESHOLD (MINCLSIZE - 8)
74#else
75#define RX_COPY_THRESHOLD MINCLSIZE
76#endif
77
78/*
79 * Ethernet frames are DMA'd at this byte offset into the freelist buffer.
80 * 0-7 are valid values.
81 */
82int fl_pktshift = 2;
83TUNABLE_INT("hw.cxgbe.fl_pktshift", &fl_pktshift);
84
85/*
86 * Pad ethernet payload up to this boundary.
87 * -1: driver should figure out a good value.
88 *  0: disable padding.
89 *  Any power of 2 from 32 to 4096 (both inclusive) is also a valid value.
90 */
91int fl_pad = -1;
92TUNABLE_INT("hw.cxgbe.fl_pad", &fl_pad);
93
94/*
95 * Status page length.
96 * -1: driver should figure out a good value.
97 *  64 or 128 are the only other valid values.
98 */
99int spg_len = -1;
100TUNABLE_INT("hw.cxgbe.spg_len", &spg_len);
101
102/*
103 * Congestion drops.
104 * -1: no congestion feedback (not recommended).
105 *  0: backpressure the channel instead of dropping packets right away.
106 *  1: no backpressure, drop packets for the congested queue immediately.
107 */
108static int cong_drop = 0;
109TUNABLE_INT("hw.cxgbe.cong_drop", &cong_drop);
110
111/*
112 * Deliver multiple frames in the same free list buffer if they fit.
113 * -1: let the driver decide whether to enable buffer packing or not.
114 *  0: disable buffer packing.
115 *  1: enable buffer packing.
116 */
117static int buffer_packing = -1;
118TUNABLE_INT("hw.cxgbe.buffer_packing", &buffer_packing);
119
120/*
121 * Start next frame in a packed buffer at this boundary.
122 * -1: driver should figure out a good value.
123 * T4: driver will ignore this and use the same value as fl_pad above.
124 * T5: 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value.
125 */
126static int fl_pack = -1;
127TUNABLE_INT("hw.cxgbe.fl_pack", &fl_pack);
128
129/*
130 * Allow the driver to create mbuf(s) in a cluster allocated for rx.
131 * 0: never; always allocate mbufs from the zone_mbuf UMA zone.
132 * 1: ok to create mbuf(s) within a cluster if there is room.
133 */
134static int allow_mbufs_in_cluster = 1;
135TUNABLE_INT("hw.cxgbe.allow_mbufs_in_cluster", &allow_mbufs_in_cluster);
136
137/*
138 * Largest rx cluster size that the driver is allowed to allocate.
139 */
140static int largest_rx_cluster = MJUM16BYTES;
141TUNABLE_INT("hw.cxgbe.largest_rx_cluster", &largest_rx_cluster);
142
143/*
144 * Size of cluster allocation that's most likely to succeed.  The driver will
145 * fall back to this size if it fails to allocate clusters larger than this.
146 */
147static int safest_rx_cluster = PAGE_SIZE;
148TUNABLE_INT("hw.cxgbe.safest_rx_cluster", &safest_rx_cluster);
149
150struct txpkts {
151	u_int wr_type;		/* type 0 or type 1 */
152	u_int npkt;		/* # of packets in this work request */
153	u_int plen;		/* total payload (sum of all packets) */
154	u_int len16;		/* # of 16B pieces used by this work request */
155};
156
157/* A packet's SGL.  This + m_pkthdr has all info needed for tx */
158struct sgl {
159	struct sglist sg;
160	struct sglist_seg seg[TX_SGL_SEGS];
161};
162
163static int service_iq(struct sge_iq *, int);
164static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t);
165static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *);
166static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int);
167static inline void init_fl(struct adapter *, struct sge_fl *, int, int, char *);
168static inline void init_eq(struct adapter *, struct sge_eq *, int, int, uint8_t,
169    uint16_t, char *);
170static int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *,
171    bus_addr_t *, void **);
172static int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t,
173    void *);
174static int alloc_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *,
175    int, int);
176static int free_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *);
177static void add_fl_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
178    struct sge_fl *);
179static int alloc_fwq(struct adapter *);
180static int free_fwq(struct adapter *);
181static int alloc_mgmtq(struct adapter *);
182static int free_mgmtq(struct adapter *);
183static int alloc_rxq(struct vi_info *, struct sge_rxq *, int, int,
184    struct sysctl_oid *);
185static int free_rxq(struct vi_info *, struct sge_rxq *);
186#ifdef TCP_OFFLOAD
187static int alloc_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *, int, int,
188    struct sysctl_oid *);
189static int free_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *);
190#endif
191#ifdef DEV_NETMAP
192static int alloc_nm_rxq(struct vi_info *, struct sge_nm_rxq *, int, int,
193    struct sysctl_oid *);
194static int free_nm_rxq(struct vi_info *, struct sge_nm_rxq *);
195static int alloc_nm_txq(struct vi_info *, struct sge_nm_txq *, int, int,
196    struct sysctl_oid *);
197static int free_nm_txq(struct vi_info *, struct sge_nm_txq *);
198#endif
199static int ctrl_eq_alloc(struct adapter *, struct sge_eq *);
200static int eth_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
201#ifdef TCP_OFFLOAD
202static int ofld_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
203#endif
204static int alloc_eq(struct adapter *, struct vi_info *, struct sge_eq *);
205static int free_eq(struct adapter *, struct sge_eq *);
206static int alloc_wrq(struct adapter *, struct vi_info *, struct sge_wrq *,
207    struct sysctl_oid *);
208static int free_wrq(struct adapter *, struct sge_wrq *);
209static int alloc_txq(struct vi_info *, struct sge_txq *, int,
210    struct sysctl_oid *);
211static int free_txq(struct vi_info *, struct sge_txq *);
212static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int);
213static inline void ring_fl_db(struct adapter *, struct sge_fl *);
214static int refill_fl(struct adapter *, struct sge_fl *, int);
215static void refill_sfl(void *);
216static int alloc_fl_sdesc(struct sge_fl *);
217static void free_fl_sdesc(struct adapter *, struct sge_fl *);
218static void find_best_refill_source(struct adapter *, struct sge_fl *, int);
219static void find_safe_refill_source(struct adapter *, struct sge_fl *);
220static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
221
222static inline void get_pkt_gl(struct mbuf *, struct sglist *);
223static inline u_int txpkt_len16(u_int, u_int);
224static inline u_int txpkts0_len16(u_int);
225static inline u_int txpkts1_len16(void);
226static u_int write_txpkt_wr(struct sge_txq *, struct fw_eth_tx_pkt_wr *,
227    struct mbuf *, u_int);
228static int try_txpkts(struct mbuf *, struct mbuf *, struct txpkts *, u_int);
229static int add_to_txpkts(struct mbuf *, struct txpkts *, u_int);
230static u_int write_txpkts_wr(struct sge_txq *, struct fw_eth_tx_pkts_wr *,
231    struct mbuf *, const struct txpkts *, u_int);
232static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int);
233static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int);
234static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int);
235static inline uint16_t read_hw_cidx(struct sge_eq *);
236static inline u_int reclaimable_tx_desc(struct sge_eq *);
237static inline u_int total_available_tx_desc(struct sge_eq *);
238static u_int reclaim_tx_descs(struct sge_txq *, u_int);
239static void tx_reclaim(void *, int);
240static __be64 get_flit(struct sglist_seg *, int, int);
241static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *,
242    struct mbuf *);
243static int handle_fw_msg(struct sge_iq *, const struct rss_header *,
244    struct mbuf *);
245static void wrq_tx_drain(void *, int);
246static void drain_wrq_wr_list(struct adapter *, struct sge_wrq *);
247
248static int sysctl_uint16(SYSCTL_HANDLER_ARGS);
249static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS);
250
251static counter_u64_t extfree_refs;
252static counter_u64_t extfree_rels;
253
254/*
255 * Called on MOD_LOAD.  Validates and calculates the SGE tunables.
256 */
257void
258t4_sge_modload(void)
259{
260
261	if (fl_pktshift < 0 || fl_pktshift > 7) {
262		printf("Invalid hw.cxgbe.fl_pktshift value (%d),"
263		    " using 2 instead.\n", fl_pktshift);
264		fl_pktshift = 2;
265	}
266
267	if (spg_len != 64 && spg_len != 128) {
268		int len;
269
270#if defined(__i386__) || defined(__amd64__)
271		len = cpu_clflush_line_size > 64 ? 128 : 64;
272#else
273		len = 64;
274#endif
275		if (spg_len != -1) {
276			printf("Invalid hw.cxgbe.spg_len value (%d),"
277			    " using %d instead.\n", spg_len, len);
278		}
279		spg_len = len;
280	}
281
282	if (cong_drop < -1 || cong_drop > 1) {
283		printf("Invalid hw.cxgbe.cong_drop value (%d),"
284		    " using 0 instead.\n", cong_drop);
285		cong_drop = 0;
286	}
287
288	extfree_refs = counter_u64_alloc(M_WAITOK);
289	extfree_rels = counter_u64_alloc(M_WAITOK);
290	counter_u64_zero(extfree_refs);
291	counter_u64_zero(extfree_rels);
292}
293
294void
295t4_sge_modunload(void)
296{
297
298	counter_u64_free(extfree_refs);
299	counter_u64_free(extfree_rels);
300}
301
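/*
 * Number of rx cluster references handed to the stack that have not yet been
 * released.
 */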
302uint64_t
303t4_sge_extfree_refs(void)
304{
305	uint64_t refs, rels;
306
307	rels = counter_u64_fetch(extfree_rels);
308	refs = counter_u64_fetch(extfree_refs);
309
310	return (refs - rels);
311}
312
313void
314t4_init_sge_cpl_handlers(struct adapter *sc)
315{
316
317	t4_register_cpl_handler(sc, CPL_FW4_MSG, handle_fw_msg);
318	t4_register_cpl_handler(sc, CPL_FW6_MSG, handle_fw_msg);
319	t4_register_cpl_handler(sc, CPL_SGE_EGR_UPDATE, handle_sge_egr_update);
320	t4_register_cpl_handler(sc, CPL_RX_PKT, t4_eth_rx);
321	t4_register_fw_msg_handler(sc, FW6_TYPE_CMD_RPL, t4_handle_fw_rpl);
322}
323
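/*
 * Validate the pad/pack tunables and program the SGE ingress padding boundary
 * (and, for T5 and later, the packing boundary) accordingly.
 */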
324static inline void
325setup_pad_and_pack_boundaries(struct adapter *sc)
326{
327	uint32_t v, m;
328	int pad, pack;
329
330	pad = fl_pad;
331	if (fl_pad < 32 || fl_pad > 4096 || !powerof2(fl_pad)) {
332		/*
333		 * If there is any chance that we might use buffer packing and
334		 * the chip is a T4, then pick 64 as the pad/pack boundary.  Set
335		 * it to 32 in all other cases.
336		 */
337		pad = is_t4(sc) && buffer_packing ? 64 : 32;
338
339		/*
340		 * For fl_pad = 0 we'll still write a reasonable value to the
341		 * register but all the freelists will opt out of padding.
342		 * We'll complain here only if the user tried to set it to a
343		 * value greater than 0 that was invalid.
344		 */
345		if (fl_pad > 0) {
346			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pad value"
347			    " (%d), using %d instead.\n", fl_pad, pad);
348		}
349	}
350	m = V_INGPADBOUNDARY(M_INGPADBOUNDARY);
351	v = V_INGPADBOUNDARY(ilog2(pad) - 5);
352	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
353
354	if (is_t4(sc)) {
355		if (fl_pack != -1 && fl_pack != pad) {
356			/* Complain but carry on. */
357			device_printf(sc->dev, "hw.cxgbe.fl_pack (%d) ignored,"
358			    " using %d instead.\n", fl_pack, pad);
359		}
360		return;
361	}
362
363	pack = fl_pack;
364	if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 ||
365	    !powerof2(fl_pack)) {
366		pack = max(sc->params.pci.mps, CACHE_LINE_SIZE);
367		MPASS(powerof2(pack));
368		if (pack < 16)
369			pack = 16;
370		if (pack == 32)
371			pack = 64;
372		if (pack > 4096)
373			pack = 4096;
374		if (fl_pack != -1) {
375			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pack value"
376			    " (%d), using %d instead.\n", fl_pack, pack);
377		}
378	}
379	m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY);
380	if (pack == 16)
381		v = V_INGPACKBOUNDARY(0);
382	else
383		v = V_INGPACKBOUNDARY(ilog2(pack) - 5);
384
385	MPASS(!is_t4(sc));	/* T4 doesn't have SGE_CONTROL2 */
386	t4_set_reg_field(sc, A_SGE_CONTROL2, m, v);
387}
388
389/*
390 * adap->params.vpd.cclk must be set up before this is called.
391 */
392void
393t4_tweak_chip_settings(struct adapter *sc)
394{
395	int i;
396	uint32_t v, m;
397	int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200};
398	int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk;
399	int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */
400	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
401	static int sge_flbuf_sizes[] = {
402		MCLBYTES,
403#if MJUMPAGESIZE != MCLBYTES
404		MJUMPAGESIZE,
405		MJUMPAGESIZE - CL_METADATA_SIZE,
406		MJUMPAGESIZE - 2 * MSIZE - CL_METADATA_SIZE,
407#endif
408		MJUM9BYTES,
409		MJUM16BYTES,
410		MCLBYTES - MSIZE - CL_METADATA_SIZE,
411		MJUM9BYTES - CL_METADATA_SIZE,
412		MJUM16BYTES - CL_METADATA_SIZE,
413	};
414
415	KASSERT(sc->flags & MASTER_PF,
416	    ("%s: trying to change chip settings when not master.", __func__));
417
418	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
419	v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
420	    V_EGRSTATUSPAGESIZE(spg_len == 128);
421	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
422
423	setup_pad_and_pack_boundaries(sc);
424
425	v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
426	    V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
427	    V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
428	    V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) |
429	    V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) |
430	    V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) |
431	    V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) |
432	    V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
433	t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v);
434
435	KASSERT(nitems(sge_flbuf_sizes) <= SGE_FLBUF_SIZES,
436	    ("%s: hw buffer size table too big", __func__));
437	for (i = 0; i < min(nitems(sge_flbuf_sizes), SGE_FLBUF_SIZES); i++) {
438		t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i),
439		    sge_flbuf_sizes[i]);
440	}
441
442	v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) |
443	    V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]);
444	t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v);
445
446	KASSERT(intr_timer[0] <= timer_max,
447	    ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0],
448	    timer_max));
449	for (i = 1; i < nitems(intr_timer); i++) {
450		KASSERT(intr_timer[i] >= intr_timer[i - 1],
451		    ("%s: timers not listed in increasing order (%d)",
452		    __func__, i));
453
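		/*
		 * This timer exceeds what the chip can express with its core
		 * clock.  Repeatedly average it with the previous (smaller)
		 * timer to pull it under the limit; the very last timer is
		 * simply clamped to the maximum.
		 */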
454		while (intr_timer[i] > timer_max) {
455			if (i == nitems(intr_timer) - 1) {
456				intr_timer[i] = timer_max;
457				break;
458			}
459			intr_timer[i] += intr_timer[i - 1];
460			intr_timer[i] /= 2;
461		}
462	}
463
464	v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) |
465	    V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1]));
466	t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v);
467	v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) |
468	    V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3]));
469	t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v);
470	v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) |
471	    V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5]));
472	t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v);
473
474	/* 4K, 16K, 64K, 256K DDP "page sizes" */
475	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
476	t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v);
477
478	m = v = F_TDDPTAGTCB;
479	t4_set_reg_field(sc, A_ULP_RX_CTL, m, v);
480
481	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
482	    F_RESETDDPOFFSET;
483	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
484	t4_set_reg_field(sc, A_TP_PARA_REG5, m, v);
485}
486
487/*
488 * SGE wants the buffer to be at least 64B and then a multiple of 16.  If
489 * padding is in use, the buffer's start and end need to be aligned to the pad
490 * boundary as well.  We'll just make sure that the size is a multiple of the
491 * boundary here; it is up to the buffer allocation code to make sure the start
492 * of the buffer is aligned as well.
493 */
494static inline int
495hwsz_ok(struct adapter *sc, int hwsz)
496{
497	int mask = fl_pad ? sc->params.sge.pad_boundary - 1 : 16 - 1;
498
499	return (hwsz >= 64 && (hwsz & mask) == 0);
500}
501
502/*
503 * XXX: driver really should be able to deal with unexpected settings.
504 */
505int
506t4_read_chip_settings(struct adapter *sc)
507{
508	struct sge *s = &sc->sge;
509	struct sge_params *sp = &sc->params.sge;
510	int i, j, n, rc = 0;
511	uint32_t m, v, r;
512	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
513	static int sw_buf_sizes[] = {	/* Sorted by size */
514		MCLBYTES,
515#if MJUMPAGESIZE != MCLBYTES
516		MJUMPAGESIZE,
517#endif
518		MJUM9BYTES,
519		MJUM16BYTES
520	};
521	struct sw_zone_info *swz, *safe_swz;
522	struct hw_buf_info *hwb;
523
524	t4_init_sge_params(sc);
525
526	m = F_RXPKTCPLMODE;
527	v = F_RXPKTCPLMODE;
528	r = t4_read_reg(sc, A_SGE_CONTROL);
529	if ((r & m) != v) {
530		device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r);
531		rc = EINVAL;
532	}
533
534	/*
535	 * If this changes then every single use of PAGE_SHIFT in the driver
536	 * needs to be carefully reviewed for PAGE_SHIFT vs sp->page_shift.
537	 */
538	if (sp->page_shift != PAGE_SHIFT) {
539		device_printf(sc->dev, "invalid SGE_HOST_PAGE_SIZE(0x%x)\n",
		    t4_read_reg(sc, A_SGE_HOST_PAGE_SIZE));
540		rc = EINVAL;
541	}
542
543	/* Filter out unusable hw buffer sizes entirely (mark with -2). */
544	hwb = &s->hw_buf_info[0];
545	for (i = 0; i < nitems(s->hw_buf_info); i++, hwb++) {
546		r = t4_read_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i));
547		hwb->size = r;
548		hwb->zidx = hwsz_ok(sc, r) ? -1 : -2;
549		hwb->next = -1;
550	}
551
552	/*
553	 * Create a sorted list in decreasing order of hw buffer sizes (and so
554	 * increasing order of spare area) for each software zone.
555	 *
556	 * If padding is enabled then the start and end of the buffer must align
557	 * to the pad boundary; if packing is enabled then they must align with
558	 * the pack boundary as well.  Allocations from the cluster zones are
559	 * aligned to min(size, 4K), so the buffer starts at that alignment and
560	 * ends at hwb->size alignment.  If mbuf inlining is allowed the
561	 * starting alignment will be reduced to MSIZE and the driver will
562	 * exercise appropriate caution when deciding on the best buffer layout
563	 * to use.
564	 */
565	n = 0;	/* no usable buffer size to begin with */
566	swz = &s->sw_zone_info[0];
567	safe_swz = NULL;
568	for (i = 0; i < SW_ZONE_SIZES; i++, swz++) {
569		int8_t head = -1, tail = -1;
570
571		swz->size = sw_buf_sizes[i];
572		swz->zone = m_getzone(swz->size);
573		swz->type = m_gettype(swz->size);
574
575		if (swz->size < PAGE_SIZE) {
576			MPASS(powerof2(swz->size));
577			if (fl_pad && (swz->size % sp->pad_boundary != 0))
578				continue;
579		}
580
581		if (swz->size == safest_rx_cluster)
582			safe_swz = swz;
583
584		hwb = &s->hw_buf_info[0];
585		for (j = 0; j < SGE_FLBUF_SIZES; j++, hwb++) {
586			if (hwb->zidx != -1 || hwb->size > swz->size)
587				continue;
588#ifdef INVARIANTS
589			if (fl_pad)
590				MPASS(hwb->size % sp->pad_boundary == 0);
591#endif
592			hwb->zidx = i;
593			if (head == -1)
594				head = tail = j;
595			else if (hwb->size < s->hw_buf_info[tail].size) {
596				s->hw_buf_info[tail].next = j;
597				tail = j;
598			} else {
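				/*
				 * Belongs somewhere before the tail: walk the
				 * sorted list to find the spot, dropping
				 * exact-size duplicates (marked with -2).
				 */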
599				int8_t *cur;
600				struct hw_buf_info *t;
601
602				for (cur = &head; *cur != -1; cur = &t->next) {
603					t = &s->hw_buf_info[*cur];
604					if (hwb->size == t->size) {
605						hwb->zidx = -2;
606						break;
607					}
608					if (hwb->size > t->size) {
609						hwb->next = *cur;
610						*cur = j;
611						break;
612					}
613				}
614			}
615		}
616		swz->head_hwidx = head;
617		swz->tail_hwidx = tail;
618
619		if (tail != -1) {
620			n++;
621			if (swz->size - s->hw_buf_info[tail].size >=
622			    CL_METADATA_SIZE)
623				sc->flags |= BUF_PACKING_OK;
624		}
625	}
626	if (n == 0) {
627		device_printf(sc->dev, "no usable SGE FL buffer size.\n");
628		rc = EINVAL;
629	}
630
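	/*
	 * safe_hwidx1 is the largest hw buffer size that fits in the safe
	 * cluster zone; safe_hwidx2 is the largest one that also leaves room
	 * for cluster metadata in the spare area.
	 */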
631	s->safe_hwidx1 = -1;
632	s->safe_hwidx2 = -1;
633	if (safe_swz != NULL) {
634		s->safe_hwidx1 = safe_swz->head_hwidx;
635		for (i = safe_swz->head_hwidx; i != -1; i = hwb->next) {
636			int spare;
637
638			hwb = &s->hw_buf_info[i];
639#ifdef INVARIANTS
640			if (fl_pad)
641				MPASS(hwb->size % sp->pad_boundary == 0);
642#endif
643			spare = safe_swz->size - hwb->size;
644			if (spare >= CL_METADATA_SIZE) {
645				s->safe_hwidx2 = i;
646				break;
647			}
648		}
649	}
650
651	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
652	r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ);
653	if (r != v) {
654		device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r);
655		rc = EINVAL;
656	}
657
658	m = v = F_TDDPTAGTCB;
659	r = t4_read_reg(sc, A_ULP_RX_CTL);
660	if ((r & m) != v) {
661		device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r);
662		rc = EINVAL;
663	}
664
665	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
666	    F_RESETDDPOFFSET;
667	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
668	r = t4_read_reg(sc, A_TP_PARA_REG5);
669	if ((r & m) != v) {
670		device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r);
671		rc = EINVAL;
672	}
673
674	t4_init_tp_params(sc);
675
676	t4_read_mtu_tbl(sc, sc->params.mtus, NULL);
677	t4_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd);
678
679	return (rc);
680}
681
682int
683t4_create_dma_tag(struct adapter *sc)
684{
685	int rc;
686
687	rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0,
688	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE,
689	    BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL,
690	    NULL, &sc->dmat);
691	if (rc != 0) {
692		device_printf(sc->dev,
693		    "failed to create main DMA tag: %d\n", rc);
694	}
695
696	return (rc);
697}
698
699void
700t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
701    struct sysctl_oid_list *children)
702{
703	struct sge_params *sp = &sc->params.sge;
704
705	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes",
706	    CTLTYPE_STRING | CTLFLAG_RD, &sc->sge, 0, sysctl_bufsizes, "A",
707	    "freelist buffer sizes");
708
709	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD,
710	    NULL, sp->fl_pktshift, "payload DMA offset in rx buffer (bytes)");
711
712	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad", CTLFLAG_RD,
713	    NULL, sp->pad_boundary, "payload pad boundary (bytes)");
714
715	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len", CTLFLAG_RD,
716	    NULL, sp->spg_len, "status page size (bytes)");
717
718	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD,
719	    NULL, cong_drop, "congestion drop setting");
720
721	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD,
722	    NULL, sp->pack_boundary, "payload pack boundary (bytes)");
723}
724
725int
726t4_destroy_dma_tag(struct adapter *sc)
727{
728	if (sc->dmat)
729		bus_dma_tag_destroy(sc->dmat);
730
731	return (0);
732}
733
734/*
735 * Allocate and initialize the firmware event queue and the management queue.
736 *
737 * Returns errno on failure.  Resources allocated up to that point may still be
738 * allocated.  Caller is responsible for cleanup in case this function fails.
739 */
740int
741t4_setup_adapter_queues(struct adapter *sc)
742{
743	int rc;
744
745	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
746
747	sysctl_ctx_init(&sc->ctx);
748	sc->flags |= ADAP_SYSCTL_CTX;
749
750	/*
751	 * Firmware event queue
752	 */
753	rc = alloc_fwq(sc);
754	if (rc != 0)
755		return (rc);
756
757	/*
758	 * Management queue.  This is just a control queue that uses the fwq as
759	 * its associated iq.
760	 */
761	rc = alloc_mgmtq(sc);
762
763	return (rc);
764}
765
766/*
767 * Idempotent
768 */
769int
770t4_teardown_adapter_queues(struct adapter *sc)
771{
772
773	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
774
775	/* Do this before freeing the queue */
776	if (sc->flags & ADAP_SYSCTL_CTX) {
777		sysctl_ctx_free(&sc->ctx);
778		sc->flags &= ~ADAP_SYSCTL_CTX;
779	}
780
781	free_mgmtq(sc);
782	free_fwq(sc);
783
784	return (0);
785}
786
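/*
 * First interrupt vector to be used by this VI's queues (0 when the adapter
 * has a single interrupt that everything shares).
 */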
787static inline int
788first_vector(struct vi_info *vi)
789{
790	struct adapter *sc = vi->pi->adapter;
791
792	if (sc->intr_count == 1)
793		return (0);
794
795	return (vi->first_intr);
796}
797
798/*
799 * Given an arbitrary "index," come up with an iq that can be used by other
800 * queues (of this VI) for interrupt forwarding, SGE egress updates, etc.
801 * The iq returned is guaranteed to be something that takes direct interrupts.
802 */
803static struct sge_iq *
804vi_intr_iq(struct vi_info *vi, int idx)
805{
806	struct adapter *sc = vi->pi->adapter;
807	struct sge *s = &sc->sge;
808	struct sge_iq *iq = NULL;
809	int nintr, i;
810
811	if (sc->intr_count == 1)
812		return (&sc->sge.fwq);
813
814	nintr = vi->nintr;
815	KASSERT(nintr != 0,
816	    ("%s: vi %p has no exclusive interrupts, total interrupts = %d",
817	    __func__, vi, sc->intr_count));
818	i = idx % nintr;
819
820	if (vi->flags & INTR_RXQ) {
821		if (i < vi->nrxq) {
822			iq = &s->rxq[vi->first_rxq + i].iq;
823			goto done;
824		}
825		i -= vi->nrxq;
826	}
827#ifdef TCP_OFFLOAD
828	if (vi->flags & INTR_OFLD_RXQ) {
829		if (i < vi->nofldrxq) {
830			iq = &s->ofld_rxq[vi->first_ofld_rxq + i].iq;
831			goto done;
832		}
833		i -= vi->nofldrxq;
834	}
835#endif
836	panic("%s: vi %p, intr_flags 0x%lx, idx %d, total intr %d\n", __func__,
837	    vi, vi->flags & INTR_ALL, idx, nintr);
838done:
839	MPASS(iq != NULL);
840	KASSERT(iq->flags & IQ_INTR,
841	    ("%s: iq %p (vi %p, intr_flags 0x%lx, idx %d)", __func__, iq, vi,
842	    vi->flags & INTR_ALL, idx));
843	return (iq);
844}
845
846/* Maximum payload that can be delivered with a single iq descriptor */
847static inline int
848mtu_to_max_payload(struct adapter *sc, int mtu, const int toe)
849{
850	int payload;
851
852#ifdef TCP_OFFLOAD
853	if (toe) {
854		payload = sc->tt.rx_coalesce ?
855		    G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)) : mtu;
856	} else {
857#endif
858		/* large enough even when hw VLAN extraction is disabled */
859		payload = sc->params.sge.fl_pktshift + ETHER_HDR_LEN +
860		    ETHER_VLAN_ENCAP_LEN + mtu;
861#ifdef TCP_OFFLOAD
862	}
863#endif
864
865	return (payload);
866}
867
868int
869t4_setup_vi_queues(struct vi_info *vi)
870{
871	int rc = 0, i, j, intr_idx, iqid;
872	struct sge_rxq *rxq;
873	struct sge_txq *txq;
874	struct sge_wrq *ctrlq;
875#ifdef TCP_OFFLOAD
876	struct sge_ofld_rxq *ofld_rxq;
877	struct sge_wrq *ofld_txq;
878#endif
879#ifdef DEV_NETMAP
880	int saved_idx;
881	struct sge_nm_rxq *nm_rxq;
882	struct sge_nm_txq *nm_txq;
883#endif
884	char name[16];
885	struct port_info *pi = vi->pi;
886	struct adapter *sc = pi->adapter;
887	struct ifnet *ifp = vi->ifp;
888	struct sysctl_oid *oid = device_get_sysctl_tree(vi->dev);
889	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
890	int maxp, mtu = ifp->if_mtu;
891
892	/* Interrupt vector to start from (when using multiple vectors) */
893	intr_idx = first_vector(vi);
894
895#ifdef DEV_NETMAP
896	saved_idx = intr_idx;
897	if (ifp->if_capabilities & IFCAP_NETMAP) {
898
899		/* netmap is supported with direct interrupts only. */
900		MPASS(vi->flags & INTR_RXQ);
901
902		/*
903		 * We don't have buffers to back the netmap rx queues
904		 * right now so we create the queues in a way that
905		 * doesn't set off any congestion signal in the chip.
906		 */
907		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_rxq",
908		    CTLFLAG_RD, NULL, "rx queues");
909		for_each_nm_rxq(vi, i, nm_rxq) {
910			rc = alloc_nm_rxq(vi, nm_rxq, intr_idx, i, oid);
911			if (rc != 0)
912				goto done;
913			intr_idx++;
914		}
915
916		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_txq",
917		    CTLFLAG_RD, NULL, "tx queues");
918		for_each_nm_txq(vi, i, nm_txq) {
919			iqid = vi->first_nm_rxq + (i % vi->nnmrxq);
920			rc = alloc_nm_txq(vi, nm_txq, iqid, i, oid);
921			if (rc != 0)
922				goto done;
923		}
924	}
925
926	/* Normal rx queues and netmap rx queues share the same interrupts. */
927	intr_idx = saved_idx;
928#endif
929
930	/*
931	 * First pass over all NIC and TOE rx queues:
932	 * a) initialize iq and fl
933	 * b) allocate queue iff it will take direct interrupts.
934	 */
935	maxp = mtu_to_max_payload(sc, mtu, 0);
936	if (vi->flags & INTR_RXQ) {
937		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq",
938		    CTLFLAG_RD, NULL, "rx queues");
939	}
940	for_each_rxq(vi, i, rxq) {
941
942		init_iq(&rxq->iq, sc, vi->tmr_idx, vi->pktc_idx, vi->qsize_rxq);
943
944		snprintf(name, sizeof(name), "%s rxq%d-fl",
945		    device_get_nameunit(vi->dev), i);
946		init_fl(sc, &rxq->fl, vi->qsize_rxq / 8, maxp, name);
947
948		if (vi->flags & INTR_RXQ) {
949			rxq->iq.flags |= IQ_INTR;
950			rc = alloc_rxq(vi, rxq, intr_idx, i, oid);
951			if (rc != 0)
952				goto done;
953			intr_idx++;
954		}
955	}
956#ifdef DEV_NETMAP
957	if (ifp->if_capabilities & IFCAP_NETMAP)
958		intr_idx = saved_idx + max(vi->nrxq, vi->nnmrxq);
959#endif
960#ifdef TCP_OFFLOAD
961	maxp = mtu_to_max_payload(sc, mtu, 1);
962	if (vi->flags & INTR_OFLD_RXQ) {
963		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_rxq",
964		    CTLFLAG_RD, NULL,
965		    "rx queues for offloaded TCP connections");
966	}
967	for_each_ofld_rxq(vi, i, ofld_rxq) {
968
969		init_iq(&ofld_rxq->iq, sc, vi->tmr_idx, vi->pktc_idx,
970		    vi->qsize_rxq);
971
972		snprintf(name, sizeof(name), "%s ofld_rxq%d-fl",
973		    device_get_nameunit(vi->dev), i);
974		init_fl(sc, &ofld_rxq->fl, vi->qsize_rxq / 8, maxp, name);
975
976		if (vi->flags & INTR_OFLD_RXQ) {
977			ofld_rxq->iq.flags |= IQ_INTR;
978			rc = alloc_ofld_rxq(vi, ofld_rxq, intr_idx, i, oid);
979			if (rc != 0)
980				goto done;
981			intr_idx++;
982		}
983	}
984#endif
985
986	/*
987	 * Second pass over all NIC and TOE rx queues.  The queues forwarding
988	 * their interrupts are allocated now.
989	 */
990	j = 0;
991	if (!(vi->flags & INTR_RXQ)) {
992		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq",
993		    CTLFLAG_RD, NULL, "rx queues");
994		for_each_rxq(vi, i, rxq) {
995			MPASS(!(rxq->iq.flags & IQ_INTR));
996
997			intr_idx = vi_intr_iq(vi, j)->abs_id;
998
999			rc = alloc_rxq(vi, rxq, intr_idx, i, oid);
1000			if (rc != 0)
1001				goto done;
1002			j++;
1003		}
1004	}
1005#ifdef TCP_OFFLOAD
1006	if (vi->nofldrxq != 0 && !(vi->flags & INTR_OFLD_RXQ)) {
1007		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_rxq",
1008		    CTLFLAG_RD, NULL,
1009		    "rx queues for offloaded TCP connections");
1010		for_each_ofld_rxq(vi, i, ofld_rxq) {
1011			MPASS(!(ofld_rxq->iq.flags & IQ_INTR));
1012
1013			intr_idx = vi_intr_iq(vi, j)->abs_id;
1014
1015			rc = alloc_ofld_rxq(vi, ofld_rxq, intr_idx, i, oid);
1016			if (rc != 0)
1017				goto done;
1018			j++;
1019		}
1020	}
1021#endif
1022
1023	/*
1024	 * Now the tx queues.  Only one pass needed.
1025	 */
1026	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "txq", CTLFLAG_RD,
1027	    NULL, "tx queues");
1028	j = 0;
1029	for_each_txq(vi, i, txq) {
1030		iqid = vi_intr_iq(vi, j)->cntxt_id;
1031		snprintf(name, sizeof(name), "%s txq%d",
1032		    device_get_nameunit(vi->dev), i);
1033		init_eq(sc, &txq->eq, EQ_ETH, vi->qsize_txq, pi->tx_chan, iqid,
1034		    name);
1035
1036		rc = alloc_txq(vi, txq, i, oid);
1037		if (rc != 0)
1038			goto done;
1039		j++;
1040	}
1041#ifdef TCP_OFFLOAD
1042	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_txq",
1043	    CTLFLAG_RD, NULL, "tx queues for offloaded TCP connections");
1044	for_each_ofld_txq(vi, i, ofld_txq) {
1045		struct sysctl_oid *oid2;
1046
1047		iqid = vi_intr_iq(vi, j)->cntxt_id;
1048		snprintf(name, sizeof(name), "%s ofld_txq%d",
1049		    device_get_nameunit(vi->dev), i);
1050		init_eq(sc, &ofld_txq->eq, EQ_OFLD, vi->qsize_txq, pi->tx_chan,
1051		    iqid, name);
1052
1053		snprintf(name, sizeof(name), "%d", i);
1054		oid2 = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
1055		    name, CTLFLAG_RD, NULL, "offload tx queue");
1056
1057		rc = alloc_wrq(sc, vi, ofld_txq, oid2);
1058		if (rc != 0)
1059			goto done;
1060		j++;
1061	}
1062#endif
1063
1064	/*
1065	 * Finally, the control queue.
1066	 */
1067	if (!IS_MAIN_VI(vi))
1068		goto done;
1069	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ctrlq", CTLFLAG_RD,
1070	    NULL, "ctrl queue");
1071	ctrlq = &sc->sge.ctrlq[pi->port_id];
1072	iqid = vi_intr_iq(vi, 0)->cntxt_id;
1073	snprintf(name, sizeof(name), "%s ctrlq", device_get_nameunit(vi->dev));
1074	init_eq(sc, &ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE, pi->tx_chan, iqid,
1075	    name);
1076	rc = alloc_wrq(sc, vi, ctrlq, oid);
1077
1078done:
1079	if (rc)
1080		t4_teardown_vi_queues(vi);
1081
1082	return (rc);
1083}
1084
1085/*
1086 * Idempotent
1087 */
1088int
1089t4_teardown_vi_queues(struct vi_info *vi)
1090{
1091	int i;
1092	struct port_info *pi = vi->pi;
1093	struct adapter *sc = pi->adapter;
1094	struct sge_rxq *rxq;
1095	struct sge_txq *txq;
1096#ifdef TCP_OFFLOAD
1097	struct sge_ofld_rxq *ofld_rxq;
1098	struct sge_wrq *ofld_txq;
1099#endif
1100#ifdef DEV_NETMAP
1101	struct sge_nm_rxq *nm_rxq;
1102	struct sge_nm_txq *nm_txq;
1103#endif
1104
1105	/* Do this before freeing the queues */
1106	if (vi->flags & VI_SYSCTL_CTX) {
1107		sysctl_ctx_free(&vi->ctx);
1108		vi->flags &= ~VI_SYSCTL_CTX;
1109	}
1110
1111#ifdef DEV_NETMAP
1112	if (vi->ifp->if_capabilities & IFCAP_NETMAP) {
1113		for_each_nm_txq(vi, i, nm_txq) {
1114			free_nm_txq(vi, nm_txq);
1115		}
1116
1117		for_each_nm_rxq(vi, i, nm_rxq) {
1118			free_nm_rxq(vi, nm_rxq);
1119		}
1120	}
1121#endif
1122
1123	/*
1124	 * Take down all the tx queues first, as they reference the rx queues
1125	 * (for egress updates, etc.).
1126	 */
1127
1128	if (IS_MAIN_VI(vi))
1129		free_wrq(sc, &sc->sge.ctrlq[pi->port_id]);
1130
1131	for_each_txq(vi, i, txq) {
1132		free_txq(vi, txq);
1133	}
1134#ifdef TCP_OFFLOAD
1135	for_each_ofld_txq(vi, i, ofld_txq) {
1136		free_wrq(sc, ofld_txq);
1137	}
1138#endif
1139
1140	/*
1141	 * Then take down the rx queues that forward their interrupts, as they
1142	 * reference other rx queues.
1143	 */
1144
1145	for_each_rxq(vi, i, rxq) {
1146		if ((rxq->iq.flags & IQ_INTR) == 0)
1147			free_rxq(vi, rxq);
1148	}
1149#ifdef TCP_OFFLOAD
1150	for_each_ofld_rxq(vi, i, ofld_rxq) {
1151		if ((ofld_rxq->iq.flags & IQ_INTR) == 0)
1152			free_ofld_rxq(vi, ofld_rxq);
1153	}
1154#endif
1155
1156	/*
1157	 * Then take down the rx queues that take direct interrupts.
1158	 */
1159
1160	for_each_rxq(vi, i, rxq) {
1161		if (rxq->iq.flags & IQ_INTR)
1162			free_rxq(vi, rxq);
1163	}
1164#ifdef TCP_OFFLOAD
1165	for_each_ofld_rxq(vi, i, ofld_rxq) {
1166		if (ofld_rxq->iq.flags & IQ_INTR)
1167			free_ofld_rxq(vi, ofld_rxq);
1168	}
1169#endif
1170
1171	return (0);
1172}
1173
1174/*
1175 * Deals with errors and the firmware event queue.  All data rx queues forward
1176 * their interrupt to the firmware event queue.
1177 */
1178void
1179t4_intr_all(void *arg)
1180{
1181	struct adapter *sc = arg;
1182	struct sge_iq *fwq = &sc->sge.fwq;
1183
1184	t4_intr_err(arg);
1185	if (atomic_cmpset_int(&fwq->state, IQS_IDLE, IQS_BUSY)) {
1186		service_iq(fwq, 0);
1187		atomic_cmpset_int(&fwq->state, IQS_BUSY, IQS_IDLE);
1188	}
1189}
1190
1191/* Deals with error interrupts */
1192void
1193t4_intr_err(void *arg)
1194{
1195	struct adapter *sc = arg;
1196
1197	t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0);
1198	t4_slow_intr_handler(sc);
1199}
1200
1201void
1202t4_intr_evt(void *arg)
1203{
1204	struct sge_iq *iq = arg;
1205
1206	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
1207		service_iq(iq, 0);
1208		atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
1209	}
1210}
1211
1212void
1213t4_intr(void *arg)
1214{
1215	struct sge_iq *iq = arg;
1216
1217	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
1218		service_iq(iq, 0);
1219		atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
1220	}
1221}
1222
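/*
 * Handler for an interrupt vector that may be shared by a netmap rx queue and
 * a regular NIC rx queue.
 */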
1223void
1224t4_vi_intr(void *arg)
1225{
1226	struct irq *irq = arg;
1227
1228#ifdef DEV_NETMAP
1229	if (atomic_cmpset_int(&irq->nm_state, NM_ON, NM_BUSY)) {
1230		t4_nm_intr(irq->nm_rxq);
1231		atomic_cmpset_int(&irq->nm_state, NM_BUSY, NM_ON);
1232	}
1233#endif
1234	if (irq->rxq != NULL)
1235		t4_intr(irq->rxq);
1236}
1237
1238/*
1239 * Deals with anything and everything on the given ingress queue.
1240 */
1241static int
1242service_iq(struct sge_iq *iq, int budget)
1243{
1244	struct sge_iq *q;
1245	struct sge_rxq *rxq = iq_to_rxq(iq);	/* Use iff iq is part of rxq */
1246	struct sge_fl *fl;			/* Use iff IQ_HAS_FL */
1247	struct adapter *sc = iq->adapter;
1248	struct iq_desc *d = &iq->desc[iq->cidx];
1249	int ndescs = 0, limit;
1250	int rsp_type, refill;
1251	uint32_t lq;
1252	uint16_t fl_hw_cidx;
1253	struct mbuf *m0;
1254	STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql);
1255#if defined(INET) || defined(INET6)
1256	const struct timeval lro_timeout = {0, sc->lro_timeout};
1257#endif
1258
1259	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
1260
1261	limit = budget ? budget : iq->qsize / 16;
1262
1263	if (iq->flags & IQ_HAS_FL) {
1264		fl = &rxq->fl;
1265		fl_hw_cidx = fl->hw_cidx;	/* stable snapshot */
1266	} else {
1267		fl = NULL;
1268		fl_hw_cidx = 0;			/* to silence gcc warning */
1269	}
1270
1271	/*
1272	 * We always come back and check the descriptor ring for new indirect
1273	 * interrupts and other responses after running a single handler.
1274	 */
1275	for (;;) {
1276		while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {
1277
1278			rmb();
1279
1280			refill = 0;
1281			m0 = NULL;
1282			rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
1283			lq = be32toh(d->rsp.pldbuflen_qid);
1284
1285			switch (rsp_type) {
1286			case X_RSPD_TYPE_FLBUF:
1287
1288				KASSERT(iq->flags & IQ_HAS_FL,
1289				    ("%s: data for an iq (%p) with no freelist",
1290				    __func__, iq));
1291
1292				m0 = get_fl_payload(sc, fl, lq);
1293				if (__predict_false(m0 == NULL))
1294					goto process_iql;
1295				refill = IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 2;
1296#ifdef T4_PKT_TIMESTAMP
1297				/*
1298				 * 60 bit timestamp for the payload is
1299				 * *(uint64_t *)m0->m_pktdat.  Note that it is
1300				 * in the leading free-space in the mbuf.  The
1301				 * kernel can clobber it during a pullup,
1302				 * m_copymdata, etc.  You need to make sure that
1303				 * the mbuf reaches you unmolested if you care
1304				 * about the timestamp.
1305				 */
1306				*(uint64_t *)m0->m_pktdat =
1307				    be64toh(d->rsp.u.last_flit) &
1308				    0xfffffffffffffff;
1309#endif
1310
1311				/* fall through */
1312
1313			case X_RSPD_TYPE_CPL:
1314				KASSERT(d->rss.opcode < NUM_CPL_CMDS,
1315				    ("%s: bad opcode %02x.", __func__,
1316				    d->rss.opcode));
1317				sc->cpl_handler[d->rss.opcode](iq, &d->rss, m0);
1318				break;
1319
1320			case X_RSPD_TYPE_INTR:
1321
1322				/*
1323				 * Interrupts should be forwarded only to queues
1324				 * that are not forwarding their interrupts.
1325				 * This means service_iq can recurse but only 1
1326				 * level deep.
1327				 */
1328				KASSERT(budget == 0,
1329				    ("%s: budget %u, rsp_type %u", __func__,
1330				    budget, rsp_type));
1331
1332				/*
1333				 * There are 1K interrupt-capable queues (qids 0
1334				 * through 1023).  A response type indicating a
1335				 * forwarded interrupt with a qid >= 1K is an
1336				 * iWARP async notification.
1337				 */
1338				if (lq >= 1024) {
1339					sc->an_handler(iq, &d->rsp);
1340					break;
1341				}
1342
1343				q = sc->sge.iqmap[lq - sc->sge.iq_start];
1344				if (atomic_cmpset_int(&q->state, IQS_IDLE,
1345				    IQS_BUSY)) {
1346					if (service_iq(q, q->qsize / 16) == 0) {
1347						atomic_cmpset_int(&q->state,
1348						    IQS_BUSY, IQS_IDLE);
1349					} else {
1350						STAILQ_INSERT_TAIL(&iql, q,
1351						    link);
1352					}
1353				}
1354				break;
1355
1356			default:
1357				KASSERT(0,
1358				    ("%s: illegal response type %d on iq %p",
1359				    __func__, rsp_type, iq));
1360				log(LOG_ERR,
1361				    "%s: illegal response type %d on iq %p",
1362				    device_get_nameunit(sc->dev), rsp_type, iq);
1363				break;
1364			}
1365
1366			d++;
1367			if (__predict_false(++iq->cidx == iq->sidx)) {
1368				iq->cidx = 0;
1369				iq->gen ^= F_RSPD_GEN;
1370				d = &iq->desc[0];
1371			}
1372			if (__predict_false(++ndescs == limit)) {
1373				t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS),
1374				    V_CIDXINC(ndescs) |
1375				    V_INGRESSQID(iq->cntxt_id) |
1376				    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
1377				ndescs = 0;
1378
1379#if defined(INET) || defined(INET6)
1380				if (iq->flags & IQ_LRO_ENABLED &&
1381				    sc->lro_timeout != 0) {
1382					tcp_lro_flush_inactive(&rxq->lro,
1383					    &lro_timeout);
1384				}
1385#endif
1386
1387				if (budget) {
1388					if (iq->flags & IQ_HAS_FL) {
1389						FL_LOCK(fl);
1390						refill_fl(sc, fl, 32);
1391						FL_UNLOCK(fl);
1392					}
1393					return (EINPROGRESS);
1394				}
1395			}
1396			if (refill) {
1397				FL_LOCK(fl);
1398				refill_fl(sc, fl, 32);
1399				FL_UNLOCK(fl);
1400				fl_hw_cidx = fl->hw_cidx;
1401			}
1402		}
1403
1404process_iql:
1405		if (STAILQ_EMPTY(&iql))
1406			break;
1407
1408		/*
1409		 * Process the head only, and send it to the back of the list if
1410		 * it's still not done.
1411		 */
1412		q = STAILQ_FIRST(&iql);
1413		STAILQ_REMOVE_HEAD(&iql, link);
1414		if (service_iq(q, q->qsize / 8) == 0)
1415			atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE);
1416		else
1417			STAILQ_INSERT_TAIL(&iql, q, link);
1418	}
1419
1420#if defined(INET) || defined(INET6)
1421	if (iq->flags & IQ_LRO_ENABLED) {
1422		struct lro_ctrl *lro = &rxq->lro;
1423		struct lro_entry *l;
1424
1425		while (!SLIST_EMPTY(&lro->lro_active)) {
1426			l = SLIST_FIRST(&lro->lro_active);
1427			SLIST_REMOVE_HEAD(&lro->lro_active, next);
1428			tcp_lro_flush(lro, l);
1429		}
1430	}
1431#endif
1432
1433	t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), V_CIDXINC(ndescs) |
1434	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));
1435
1436	if (iq->flags & IQ_HAS_FL) {
1437		int starved;
1438
1439		FL_LOCK(fl);
1440		starved = refill_fl(sc, fl, 64);
1441		FL_UNLOCK(fl);
1442		if (__predict_false(starved != 0))
1443			add_fl_to_sfl(sc, fl);
1444	}
1445
1446	return (0);
1447}
1448
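/*
 * A cluster carries metadata (refcount, etc.) in its spare space when buffer
 * packing is in use or when mbufs may be laid out inside the cluster
 * (region1 > 0).
 */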
1449static inline int
1450cl_has_metadata(struct sge_fl *fl, struct cluster_layout *cll)
1451{
1452	int rc = fl->flags & FL_BUF_PACKING || cll->region1 > 0;
1453
1454	if (rc)
1455		MPASS(cll->region3 >= CL_METADATA_SIZE);
1456
1457	return (rc);
1458}
1459
1460static inline struct cluster_metadata *
1461cl_metadata(struct adapter *sc, struct sge_fl *fl, struct cluster_layout *cll,
1462    caddr_t cl)
1463{
1464
1465	if (cl_has_metadata(fl, cll)) {
1466		struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
1467
1468		return ((struct cluster_metadata *)(cl + swz->size) - 1);
1469	}
1470	return (NULL);
1471}
1472
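/*
 * External-storage free routine for rx clusters loaned to the stack; returns
 * the cluster to its UMA zone once the last reference is released.
 */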
1473static int
1474rxb_free(struct mbuf *m, void *arg1, void *arg2)
1475{
1476	uma_zone_t zone = arg1;
1477	caddr_t cl = arg2;
1478
1479	uma_zfree(zone, cl);
1480	counter_u64_add(extfree_rels, 1);
1481
1482	return (EXT_FREE_OK);
1483}
1484
1485/*
1486 * The mbuf returned by this function could be allocated from zone_mbuf or
1487 * constructed in spare room in the cluster.
1488 *
1489 * The mbuf carries the payload in one of these ways
1490 * a) frame inside the mbuf (mbuf from zone_mbuf)
1491 * b) m_cljset (for clusters without metadata) zone_mbuf
1492 * c) m_extaddref (cluster with metadata) inline mbuf
1493 * d) m_extaddref (cluster with metadata) zone_mbuf
1494 */
1495static struct mbuf *
1496get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset,
1497    int remaining)
1498{
1499	struct mbuf *m;
1500	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
1501	struct cluster_layout *cll = &sd->cll;
1502	struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
1503	struct hw_buf_info *hwb = &sc->sge.hw_buf_info[cll->hwidx];
1504	struct cluster_metadata *clm = cl_metadata(sc, fl, cll, sd->cl);
1505	int len, blen;
1506	caddr_t payload;
1507
1508	blen = hwb->size - fl->rx_offset;	/* max possible in this buf */
1509	len = min(remaining, blen);
1510	payload = sd->cl + cll->region1 + fl->rx_offset;
1511	if (fl->flags & FL_BUF_PACKING) {
1512		const u_int l = fr_offset + len;
1513		const u_int pad = roundup2(l, fl->buf_boundary) - l;
1514
1515		if (fl->rx_offset + len + pad < hwb->size)
1516			blen = len + pad;
1517		MPASS(fl->rx_offset + blen <= hwb->size);
1518	} else {
1519		MPASS(fl->rx_offset == 0);	/* not packing */
1520	}
1521
1522
1523	if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) {
1524
1525		/*
1526		 * Copy payload into a freshly allocated mbuf.
1527		 */
1528
1529		m = fr_offset == 0 ?
1530		    m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
1531		if (m == NULL)
1532			return (NULL);
1533		fl->mbuf_allocated++;
1534#ifdef T4_PKT_TIMESTAMP
1535		/* Leave room for a timestamp */
1536		m->m_data += 8;
1537#endif
1538		/* copy data to mbuf */
1539		bcopy(payload, mtod(m, caddr_t), len);
1540
1541	} else if (sd->nmbuf * MSIZE < cll->region1) {
1542
1543		/*
1544		 * There's spare room in the cluster for an mbuf.  Create one
1545		 * and associate it with the payload that's in the cluster.
1546		 */
1547
1548		MPASS(clm != NULL);
1549		m = (struct mbuf *)(sd->cl + sd->nmbuf * MSIZE);
1550		/* No bzero required */
1551		if (m_init(m, NULL, 0, M_NOWAIT, MT_DATA,
1552		    fr_offset == 0 ? M_PKTHDR | M_NOFREE : M_NOFREE))
1553			return (NULL);
1554		fl->mbuf_inlined++;
1555		m_extaddref(m, payload, blen, &clm->refcount, rxb_free,
1556		    swz->zone, sd->cl);
1557		if (sd->nmbuf++ == 0)
1558			counter_u64_add(extfree_refs, 1);
1559
1560	} else {
1561
1562		/*
1563		 * Grab an mbuf from zone_mbuf and associate it with the
1564		 * payload in the cluster.
1565		 */
1566
1567		m = fr_offset == 0 ?
1568		    m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
1569		if (m == NULL)
1570			return (NULL);
1571		fl->mbuf_allocated++;
1572		if (clm != NULL) {
1573			m_extaddref(m, payload, blen, &clm->refcount,
1574			    rxb_free, swz->zone, sd->cl);
1575			if (sd->nmbuf++ == 0)
1576				counter_u64_add(extfree_refs, 1);
1577		} else {
1578			m_cljset(m, sd->cl, swz->type);
1579			sd->cl = NULL;	/* consumed, not a recycle candidate */
1580		}
1581	}
1582	if (fr_offset == 0)
1583		m->m_pkthdr.len = remaining;
1584	m->m_len = len;
1585
1586	if (fl->flags & FL_BUF_PACKING) {
1587		fl->rx_offset += blen;
1588		MPASS(fl->rx_offset <= hwb->size);
1589		if (fl->rx_offset < hwb->size)
1590			return (m);	/* without advancing the cidx */
1591	}
1592
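	/*
	 * Done with this buffer.  fl->cidx counts individual buffers while
	 * fl->hw_cidx is kept in units of 8 buffers, so the hardware-visible
	 * index advances only after a full group of 8 has been consumed.
	 */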
1593	if (__predict_false(++fl->cidx % 8 == 0)) {
1594		uint16_t cidx = fl->cidx / 8;
1595
1596		if (__predict_false(cidx == fl->sidx))
1597			fl->cidx = cidx = 0;
1598		fl->hw_cidx = cidx;
1599	}
1600	fl->rx_offset = 0;
1601
1602	return (m);
1603}
1604
1605static struct mbuf *
1606get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf)
1607{
1608	struct mbuf *m0, *m, **pnext;
1609	u_int remaining;
1610	const u_int total = G_RSPD_LEN(len_newbuf);
1611
1612	if (__predict_false(fl->flags & FL_BUF_RESUME)) {
1613		M_ASSERTPKTHDR(fl->m0);
1614		MPASS(fl->m0->m_pkthdr.len == total);
1615		MPASS(fl->remaining < total);
1616
1617		m0 = fl->m0;
1618		pnext = fl->pnext;
1619		remaining = fl->remaining;
1620		fl->flags &= ~FL_BUF_RESUME;
1621		goto get_segment;
1622	}
1623
1624	if (fl->rx_offset > 0 && len_newbuf & F_RSPD_NEWBUF) {
1625		fl->rx_offset = 0;
1626		if (__predict_false(++fl->cidx % 8 == 0)) {
1627			uint16_t cidx = fl->cidx / 8;
1628
1629			if (__predict_false(cidx == fl->sidx))
1630				fl->cidx = cidx = 0;
1631			fl->hw_cidx = cidx;
1632		}
1633	}
1634
1635	/*
1636	 * Payload starts at rx_offset in the current hw buffer.  Its length is
1637	 * 'len' and it may span multiple hw buffers.
1638	 */
1639
1640	m0 = get_scatter_segment(sc, fl, 0, total);
1641	if (m0 == NULL)
1642		return (NULL);
1643	remaining = total - m0->m_len;
1644	pnext = &m0->m_next;
1645	while (remaining > 0) {
1646get_segment:
1647		MPASS(fl->rx_offset == 0);
1648		m = get_scatter_segment(sc, fl, total - remaining, remaining);
1649		if (__predict_false(m == NULL)) {
1650			fl->m0 = m0;
1651			fl->pnext = pnext;
1652			fl->remaining = remaining;
1653			fl->flags |= FL_BUF_RESUME;
1654			return (NULL);
1655		}
1656		*pnext = m;
1657		pnext = &m->m_next;
1658		remaining -= m->m_len;
1659	}
1660	*pnext = NULL;
1661
1662	M_ASSERTPKTHDR(m0);
1663	return (m0);
1664}
1665
1666static int
1667t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
1668{
1669	struct sge_rxq *rxq = iq_to_rxq(iq);
1670	struct ifnet *ifp = rxq->ifp;
1671	struct adapter *sc = iq->adapter;
1672	const struct cpl_rx_pkt *cpl = (const void *)(rss + 1);
1673#if defined(INET) || defined(INET6)
1674	struct lro_ctrl *lro = &rxq->lro;
1675#endif
1676
1677	KASSERT(m0 != NULL, ("%s: no payload with opcode %02x", __func__,
1678	    rss->opcode));
1679
1680	m0->m_pkthdr.len -= sc->params.sge.fl_pktshift;
1681	m0->m_len -= sc->params.sge.fl_pktshift;
1682	m0->m_data += sc->params.sge.fl_pktshift;
1683
1684	m0->m_pkthdr.rcvif = ifp;
1685	M_HASHTYPE_SET(m0, M_HASHTYPE_OPAQUE);
1686	m0->m_pkthdr.flowid = be32toh(rss->hash_val);
1687
1688	if (cpl->csum_calc && !cpl->err_vec) {
1689		if (ifp->if_capenable & IFCAP_RXCSUM &&
1690		    cpl->l2info & htobe32(F_RXF_IP)) {
1691			m0->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
1692			    CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1693			rxq->rxcsum++;
1694		} else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
1695		    cpl->l2info & htobe32(F_RXF_IP6)) {
1696			m0->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
1697			    CSUM_PSEUDO_HDR);
1698			rxq->rxcsum++;
1699		}
1700
1701		if (__predict_false(cpl->ip_frag))
1702			m0->m_pkthdr.csum_data = be16toh(cpl->csum);
1703		else
1704			m0->m_pkthdr.csum_data = 0xffff;
1705	}
1706
1707	if (cpl->vlan_ex) {
1708		m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan);
1709		m0->m_flags |= M_VLANTAG;
1710		rxq->vlan_extraction++;
1711	}
1712
1713#if defined(INET) || defined(INET6)
1714	if (cpl->l2info & htobe32(F_RXF_LRO) &&
1715	    iq->flags & IQ_LRO_ENABLED &&
1716	    tcp_lro_rx(lro, m0, 0) == 0) {
1717		/* queued for LRO */
1718	} else
1719#endif
1720	ifp->if_input(ifp, m0);
1721
1722	return (0);
1723}
1724
1725/*
1726 * Must drain the wrq or make sure that someone else will.
1727 */
1728static void
1729wrq_tx_drain(void *arg, int n)
1730{
1731	struct sge_wrq *wrq = arg;
1732	struct sge_eq *eq = &wrq->eq;
1733
1734	EQ_LOCK(eq);
1735	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
1736		drain_wrq_wr_list(wrq->adapter, wrq);
1737	EQ_UNLOCK(eq);
1738}
1739
1740static void
1741drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq)
1742{
1743	struct sge_eq *eq = &wrq->eq;
1744	u_int available, dbdiff;	/* # of hardware descriptors */
1745	u_int n;
1746	struct wrqe *wr;
1747	struct fw_eth_tx_pkt_wr *dst;	/* any fw WR struct will do */
1748
1749	EQ_LOCK_ASSERT_OWNED(eq);
1750	MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs));
1751	wr = STAILQ_FIRST(&wrq->wr_list);
1752	MPASS(wr != NULL);	/* Must be called with something useful to do */
1753	dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx);
1754
1755	do {
1756		eq->cidx = read_hw_cidx(eq);
1757		if (eq->pidx == eq->cidx)
1758			available = eq->sidx - 1;
1759		else
1760			available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
1761
1762		MPASS(wr->wrq == wrq);
1763		n = howmany(wr->wr_len, EQ_ESIZE);
1764		if (available < n)
1765			return;
1766
1767		dst = (void *)&eq->desc[eq->pidx];
1768		if (__predict_true(eq->sidx - eq->pidx > n)) {
1769			/* Won't wrap, won't end exactly at the status page. */
1770			bcopy(&wr->wr[0], dst, wr->wr_len);
1771			eq->pidx += n;
1772		} else {
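			/*
			 * The copy reaches the end of the ring and may wrap
			 * around to the start.
			 */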
1773			int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE;
1774
1775			bcopy(&wr->wr[0], dst, first_portion);
1776			if (wr->wr_len > first_portion) {
1777				bcopy(&wr->wr[first_portion], &eq->desc[0],
1778				    wr->wr_len - first_portion);
1779			}
1780			eq->pidx = n - (eq->sidx - eq->pidx);
1781		}
1782
1783		if (available < eq->sidx / 4 &&
1784		    atomic_cmpset_int(&eq->equiq, 0, 1)) {
1785			dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
1786			    F_FW_WR_EQUEQ);
1787			eq->equeqidx = eq->pidx;
1788		} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) {
1789			dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
1790			eq->equeqidx = eq->pidx;
1791		}
1792
1793		dbdiff += n;
1794		if (dbdiff >= 16) {
1795			ring_eq_db(sc, eq, dbdiff);
1796			dbdiff = 0;
1797		}
1798
1799		STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
1800		free_wrqe(wr);
1801		MPASS(wrq->nwr_pending > 0);
1802		wrq->nwr_pending--;
1803		MPASS(wrq->ndesc_needed >= n);
1804		wrq->ndesc_needed -= n;
1805	} while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL);
1806
1807	if (dbdiff)
1808		ring_eq_db(sc, eq, dbdiff);
1809}
1810
1811/*
1812 * Doesn't fail.  Holds on to work requests it can't send right away.
1813 */
1814void
1815t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr)
1816{
1817#ifdef INVARIANTS
1818	struct sge_eq *eq = &wrq->eq;
1819#endif
1820
1821	EQ_LOCK_ASSERT_OWNED(eq);
1822	MPASS(wr != NULL);
1823	MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN);
1824	MPASS((wr->wr_len & 0x7) == 0);
1825
1826	STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link);
1827	wrq->nwr_pending++;
1828	wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE);
1829
1830	if (!TAILQ_EMPTY(&wrq->incomplete_wrs))
1831		return;	/* commit_wrq_wr will drain wr_list as well. */
1832
1833	drain_wrq_wr_list(sc, wrq);
1834
1835	/* Doorbell must have caught up to the pidx. */
1836	MPASS(eq->pidx == eq->dbidx);
1837}
1838
1839void
1840t4_update_fl_bufsize(struct ifnet *ifp)
1841{
1842	struct vi_info *vi = ifp->if_softc;
1843	struct adapter *sc = vi->pi->adapter;
1844	struct sge_rxq *rxq;
1845#ifdef TCP_OFFLOAD
1846	struct sge_ofld_rxq *ofld_rxq;
1847#endif
1848	struct sge_fl *fl;
1849	int i, maxp, mtu = ifp->if_mtu;
1850
1851	maxp = mtu_to_max_payload(sc, mtu, 0);
1852	for_each_rxq(vi, i, rxq) {
1853		fl = &rxq->fl;
1854
1855		FL_LOCK(fl);
1856		find_best_refill_source(sc, fl, maxp);
1857		FL_UNLOCK(fl);
1858	}
1859#ifdef TCP_OFFLOAD
1860	maxp = mtu_to_max_payload(sc, mtu, 1);
1861	for_each_ofld_rxq(vi, i, ofld_rxq) {
1862		fl = &ofld_rxq->fl;
1863
1864		FL_LOCK(fl);
1865		find_best_refill_source(sc, fl, maxp);
1866		FL_UNLOCK(fl);
1867	}
1868#endif
1869}
1870
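/*
 * Tx bookkeeping for an outbound mbuf: the number of SGL segments and the
 * length of its work request (in 16 byte units) are stashed in spare pkthdr
 * fields (l5hlen and PH_loc) so they only have to be computed once.
 */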
1871static inline int
1872mbuf_nsegs(struct mbuf *m)
1873{
1874
1875	M_ASSERTPKTHDR(m);
1876	KASSERT(m->m_pkthdr.l5hlen > 0,
1877	    ("%s: mbuf %p missing information on # of segments.", __func__, m));
1878
1879	return (m->m_pkthdr.l5hlen);
1880}
1881
1882static inline void
1883set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs)
1884{
1885
1886	M_ASSERTPKTHDR(m);
1887	m->m_pkthdr.l5hlen = nsegs;
1888}
1889
1890static inline int
1891mbuf_len16(struct mbuf *m)
1892{
1893	int n;
1894
1895	M_ASSERTPKTHDR(m);
1896	n = m->m_pkthdr.PH_loc.eight[0];
1897	MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16);
1898
1899	return (n);
1900}
1901
1902static inline void
1903set_mbuf_len16(struct mbuf *m, uint8_t len16)
1904{
1905
1906	M_ASSERTPKTHDR(m);
1907	m->m_pkthdr.PH_loc.eight[0] = len16;
1908}
1909
1910static inline int
1911needs_tso(struct mbuf *m)
1912{
1913
1914	M_ASSERTPKTHDR(m);
1915
1916	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
1917		KASSERT(m->m_pkthdr.tso_segsz > 0,
1918		    ("%s: TSO requested in mbuf %p but MSS not provided",
1919		    __func__, m));
1920		return (1);
1921	}
1922
1923	return (0);
1924}
1925
1926static inline int
1927needs_l3_csum(struct mbuf *m)
1928{
1929
1930	M_ASSERTPKTHDR(m);
1931
1932	if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO))
1933		return (1);
1934	return (0);
1935}
1936
1937static inline int
1938needs_l4_csum(struct mbuf *m)
1939{
1940
1941	M_ASSERTPKTHDR(m);
1942
1943	if (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 |
1944	    CSUM_TCP_IPV6 | CSUM_TSO))
1945		return (1);
1946	return (0);
1947}
1948
1949static inline int
1950needs_vlan_insertion(struct mbuf *m)
1951{
1952
1953	M_ASSERTPKTHDR(m);
1954
1955	if (m->m_flags & M_VLANTAG) {
1956		KASSERT(m->m_pkthdr.ether_vtag != 0,
1957		    ("%s: HWVLAN requested in mbuf %p but tag not provided",
1958		    __func__, m));
1959		return (1);
1960	}
1961	return (0);
1962}
1963
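/*
 * m_advance() walks the chain 'len' bytes forward from (*pm, *poffset) and
 * returns a pointer to the new position.  parse_pkt() below uses it to step
 * over the Ethernet and IP headers on the way to the TCP header, without
 * assuming that those headers are contiguous in one mbuf.
 */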
1964static void *
1965m_advance(struct mbuf **pm, int *poffset, int len)
1966{
1967	struct mbuf *m = *pm;
1968	int offset = *poffset;
1969	uintptr_t p = 0;
1970
1971	MPASS(len > 0);
1972
1973	while (len) {
1974		if (offset + len < m->m_len) {
1975			offset += len;
1976			p = mtod(m, uintptr_t) + offset;
1977			break;
1978		}
1979		len -= m->m_len - offset;
1980		m = m->m_next;
1981		offset = 0;
1982		MPASS(m != NULL);
1983	}
1984	*poffset = offset;
1985	*pm = m;
1986	return ((void *)p);
1987}
1988
1989static inline int
1990same_paddr(char *a, char *b)
1991{
1992
1993	if (a == b)
1994		return (1);
1995	else if (a != NULL && b != NULL) {
1996		vm_offset_t x = (vm_offset_t)a;
1997		vm_offset_t y = (vm_offset_t)b;
1998
1999		if ((x & PAGE_MASK) == (y & PAGE_MASK) &&
2000		    pmap_kextract(x) == pmap_kextract(y))
2001			return (1);
2002	}
2003
2004	return (0);
2005}
2006
2007/*
2008 * Can deal with empty mbufs in the chain that have m_len = 0, but the chain
2009 * must have at least one mbuf that's not empty.
2010 */
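/*
 * For example, an mbuf whose data crosses into a physically discontiguous
 * page counts as two segments via sglist_count(), while two consecutive
 * mbufs whose buffers happen to abut in physical memory collapse back into
 * one segment thanks to the same_paddr() check on prev_end/start.
 */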
2011static inline int
2012count_mbuf_nsegs(struct mbuf *m)
2013{
2014	char *prev_end, *start;
2015	int len, nsegs;
2016
2017	MPASS(m != NULL);
2018
2019	nsegs = 0;
2020	prev_end = NULL;
2021	for (; m; m = m->m_next) {
2022
2023		len = m->m_len;
2024		if (__predict_false(len == 0))
2025			continue;
2026		start = mtod(m, char *);
2027
2028		nsegs += sglist_count(start, len);
2029		if (same_paddr(prev_end, start))
2030			nsegs--;
2031		prev_end = start + len;
2032	}
2033
2034	MPASS(nsegs > 0);
2035	return (nsegs);
2036}
2037
2038/*
2039 * Analyze the mbuf to determine its tx needs.  The mbuf passed in may change:
2040 * a) caller can assume it's been freed if this function returns with an error.
2041 * b) it may get defragged up if the gather list is too long for the hardware.
2042 */
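/*
 * On success the mbuf carries everything the fast tx path needs: the segment
 * count and len16 (see the accessors above) and, for TSO packets, l2hlen,
 * l3hlen and l4hlen taken from the actual headers.
 */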
2043int
2044parse_pkt(struct mbuf **mp)
2045{
2046	struct mbuf *m0 = *mp, *m;
2047	int rc, nsegs, defragged = 0, offset;
2048	struct ether_header *eh;
2049	void *l3hdr;
2050#if defined(INET) || defined(INET6)
2051	struct tcphdr *tcp;
2052#endif
2053	uint16_t eh_type;
2054
2055	M_ASSERTPKTHDR(m0);
2056	if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) {
2057		rc = EINVAL;
2058fail:
2059		m_freem(m0);
2060		*mp = NULL;
2061		return (rc);
2062	}
2063restart:
2064	/*
2065	 * First count the number of gather list segments in the payload.
2066	 * Defrag the mbuf if nsegs exceeds the hardware limit.
2067	 */
2068	M_ASSERTPKTHDR(m0);
2069	MPASS(m0->m_pkthdr.len > 0);
2070	nsegs = count_mbuf_nsegs(m0);
2071	if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) {
2072		if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) {
2073			rc = EFBIG;
2074			goto fail;
2075		}
2076		*mp = m0 = m;	/* update caller's copy after defrag */
2077		goto restart;
2078	}
2079
2080	if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN)) {
2081		m0 = m_pullup(m0, m0->m_pkthdr.len);
2082		if (m0 == NULL) {
2083			/* Should have left well enough alone. */
2084			rc = EFBIG;
2085			goto fail;
2086		}
2087		*mp = m0;	/* update caller's copy after pullup */
2088		goto restart;
2089	}
2090	set_mbuf_nsegs(m0, nsegs);
2091	set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0)));
2092
2093	if (!needs_tso(m0))
2094		return (0);
2095
2096	m = m0;
2097	eh = mtod(m, struct ether_header *);
2098	eh_type = ntohs(eh->ether_type);
2099	if (eh_type == ETHERTYPE_VLAN) {
2100		struct ether_vlan_header *evh = (void *)eh;
2101
2102		eh_type = ntohs(evh->evl_proto);
2103		m0->m_pkthdr.l2hlen = sizeof(*evh);
2104	} else
2105		m0->m_pkthdr.l2hlen = sizeof(*eh);
2106
2107	offset = 0;
2108	l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen);
2109
2110	switch (eh_type) {
2111#ifdef INET6
2112	case ETHERTYPE_IPV6:
2113	{
2114		struct ip6_hdr *ip6 = l3hdr;
2115
2116		MPASS(ip6->ip6_nxt == IPPROTO_TCP);
2117
2118		m0->m_pkthdr.l3hlen = sizeof(*ip6);
2119		break;
2120	}
2121#endif
2122#ifdef INET
2123	case ETHERTYPE_IP:
2124	{
2125		struct ip *ip = l3hdr;
2126
2127		m0->m_pkthdr.l3hlen = ip->ip_hl * 4;
2128		break;
2129	}
2130#endif
2131	default:
2132		panic("%s: ethertype 0x%04x unknown.  if_cxgbe must be compiled"
2133		    " with the same INET/INET6 options as the kernel.",
2134		    __func__, eh_type);
2135	}
2136
2137#if defined(INET) || defined(INET6)
2138	tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen);
2139	m0->m_pkthdr.l4hlen = tcp->th_off * 4;
2140#endif
2141	MPASS(m0 == *mp);
2142	return (0);
2143}
2144
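/*
 * Reserves room for a work request.  The fast path hands out descriptors
 * straight from the ring; a WR that would wrap past the end of the ring is
 * staged in the wrq->ss scratch space and copied out in two pieces by
 * commit_wrq_wr().  If the ring is backed up (wr_list still has entries or
 * too few descriptors are free), a heap-allocated wrqe is returned instead
 * and cookie->pidx is set to -1 so commit_wrq_wr() knows to queue it.
 */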
2145void *
2146start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie)
2147{
2148	struct sge_eq *eq = &wrq->eq;
2149	struct adapter *sc = wrq->adapter;
2150	int ndesc, available;
2151	struct wrqe *wr;
2152	void *w;
2153
2154	MPASS(len16 > 0);
2155	ndesc = howmany(len16, EQ_ESIZE / 16);
2156	MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC);
2157
2158	EQ_LOCK(eq);
2159
2160	if (!STAILQ_EMPTY(&wrq->wr_list))
2161		drain_wrq_wr_list(sc, wrq);
2162
2163	if (!STAILQ_EMPTY(&wrq->wr_list)) {
2164slowpath:
2165		EQ_UNLOCK(eq);
2166		wr = alloc_wrqe(len16 * 16, wrq);
2167		if (__predict_false(wr == NULL))
2168			return (NULL);
2169		cookie->pidx = -1;
2170		cookie->ndesc = ndesc;
2171		return (&wr->wr);
2172	}
2173
2174	eq->cidx = read_hw_cidx(eq);
2175	if (eq->pidx == eq->cidx)
2176		available = eq->sidx - 1;
2177	else
2178		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
2179	if (available < ndesc)
2180		goto slowpath;
2181
2182	cookie->pidx = eq->pidx;
2183	cookie->ndesc = ndesc;
2184	TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link);
2185
2186	w = &eq->desc[eq->pidx];
2187	IDXINCR(eq->pidx, ndesc, eq->sidx);
2188	if (__predict_false(eq->pidx < ndesc - 1)) {
2189		w = &wrq->ss[0];
2190		wrq->ss_pidx = cookie->pidx;
2191		wrq->ss_len = len16 * 16;
2192	}
2193
2194	EQ_UNLOCK(eq);
2195
2196	return (w);
2197}
2198
2199void
2200commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie)
2201{
2202	struct sge_eq *eq = &wrq->eq;
2203	struct adapter *sc = wrq->adapter;
2204	int ndesc, pidx;
2205	struct wrq_cookie *prev, *next;
2206
2207	if (cookie->pidx == -1) {
2208		struct wrqe *wr = __containerof(w, struct wrqe, wr);
2209
2210		t4_wrq_tx(sc, wr);
2211		return;
2212	}
2213
2214	ndesc = cookie->ndesc;	/* Can be more than SGE_MAX_WR_NDESC here. */
2215	pidx = cookie->pidx;
2216	MPASS(pidx >= 0 && pidx < eq->sidx);
2217	if (__predict_false(w == &wrq->ss[0])) {
2218		int n = (eq->sidx - wrq->ss_pidx) * EQ_ESIZE;
2219
2220		MPASS(wrq->ss_len > n);	/* WR had better wrap around. */
2221		bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n);
2222		bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n);
2223		wrq->tx_wrs_ss++;
2224	} else
2225		wrq->tx_wrs_direct++;
2226
2227	EQ_LOCK(eq);
2228	prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link);
2229	next = TAILQ_NEXT(cookie, link);
2230	if (prev == NULL) {
2231		MPASS(pidx == eq->dbidx);
2232		if (next == NULL || ndesc >= 16)
2233			ring_eq_db(wrq->adapter, eq, ndesc);
2234		else {
2235			MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc);
2236			next->pidx = pidx;
2237			next->ndesc += ndesc;
2238		}
2239	} else {
2240		MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc);
2241		prev->ndesc += ndesc;
2242	}
2243	TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link);
2244
2245	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
2246		drain_wrq_wr_list(sc, wrq);
2247
2248#ifdef INVARIANTS
2249	if (TAILQ_EMPTY(&wrq->incomplete_wrs)) {
2250		/* Doorbell must have caught up to the pidx. */
2251		MPASS(wrq->eq.pidx == wrq->eq.dbidx);
2252	}
2253#endif
2254	EQ_UNLOCK(eq);
2255}
2256
2257static u_int
2258can_resume_eth_tx(struct mp_ring *r)
2259{
2260	struct sge_eq *eq = r->cookie;
2261
2262	return (total_available_tx_desc(eq) > eq->sidx / 8);
2263}
2264
2265static inline int
2266cannot_use_txpkts(struct mbuf *m)
2267{
2268	/* maybe put a GL limit too, to avoid silliness? */
2269
2270	return (needs_tso(m));
2271}
2272
2273/*
2274 * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to
2275 * be consumed.  Return the actual number consumed.  0 indicates a stall.
2276 */
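/*
 * A return value of 0 parks the ring; mp_ring is expected to consult
 * can_resume_eth_tx() above (at least 1/8 of the EQ free again) before
 * restarting the consumer.
 */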
2277static u_int
2278eth_tx(struct mp_ring *r, u_int cidx, u_int pidx)
2279{
2280	struct sge_txq *txq = r->cookie;
2281	struct sge_eq *eq = &txq->eq;
2282	struct ifnet *ifp = txq->ifp;
2283	struct vi_info *vi = ifp->if_softc;
2284	struct port_info *pi = vi->pi;
2285	struct adapter *sc = pi->adapter;
2286	u_int total, remaining;		/* # of packets */
2287	u_int available, dbdiff;	/* # of hardware descriptors */
2288	u_int n, next_cidx;
2289	struct mbuf *m0, *tail;
2290	struct txpkts txp;
2291	struct fw_eth_tx_pkts_wr *wr;	/* any fw WR struct will do */
2292
2293	remaining = IDXDIFF(pidx, cidx, r->size);
2294	MPASS(remaining > 0);	/* Must not be called without work to do. */
2295	total = 0;
2296
2297	TXQ_LOCK(txq);
2298	if (__predict_false((eq->flags & EQ_ENABLED) == 0)) {
2299		while (cidx != pidx) {
2300			m0 = r->items[cidx];
2301			m_freem(m0);
2302			if (++cidx == r->size)
2303				cidx = 0;
2304		}
2305		reclaim_tx_descs(txq, 2048);
2306		total = remaining;
2307		goto done;
2308	}
2309
2310	/* How many hardware descriptors do we have readily available? */
2311	if (eq->pidx == eq->cidx)
2312		available = eq->sidx - 1;
2313	else
2314		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
2315	dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx);
2316
2317	while (remaining > 0) {
2318
2319		m0 = r->items[cidx];
2320		M_ASSERTPKTHDR(m0);
2321		MPASS(m0->m_nextpkt == NULL);
2322
2323		if (available < SGE_MAX_WR_NDESC) {
2324			available += reclaim_tx_descs(txq, 64);
2325			if (available < howmany(mbuf_len16(m0), EQ_ESIZE / 16))
2326				break;	/* out of descriptors */
2327		}
2328
2329		next_cidx = cidx + 1;
2330		if (__predict_false(next_cidx == r->size))
2331			next_cidx = 0;
2332
2333		wr = (void *)&eq->desc[eq->pidx];
2334		if (remaining > 1 &&
2335		    try_txpkts(m0, r->items[next_cidx], &txp, available) == 0) {
2336
2337			/* pkts at cidx, next_cidx should both be in txp. */
2338			MPASS(txp.npkt == 2);
2339			tail = r->items[next_cidx];
2340			MPASS(tail->m_nextpkt == NULL);
2341			ETHER_BPF_MTAP(ifp, m0);
2342			ETHER_BPF_MTAP(ifp, tail);
2343			m0->m_nextpkt = tail;
2344
2345			if (__predict_false(++next_cidx == r->size))
2346				next_cidx = 0;
2347
2348			while (next_cidx != pidx) {
2349				if (add_to_txpkts(r->items[next_cidx], &txp,
2350				    available) != 0)
2351					break;
2352				tail->m_nextpkt = r->items[next_cidx];
2353				tail = tail->m_nextpkt;
2354				ETHER_BPF_MTAP(ifp, tail);
2355				if (__predict_false(++next_cidx == r->size))
2356					next_cidx = 0;
2357			}
2358
2359			n = write_txpkts_wr(txq, wr, m0, &txp, available);
2360			total += txp.npkt;
2361			remaining -= txp.npkt;
2362		} else {
2363			total++;
2364			remaining--;
2365			n = write_txpkt_wr(txq, (void *)wr, m0, available);
2366			ETHER_BPF_MTAP(ifp, m0);
2367		}
2368		MPASS(n >= 1 && n <= available && n <= SGE_MAX_WR_NDESC);
2369
2370		available -= n;
2371		dbdiff += n;
2372		IDXINCR(eq->pidx, n, eq->sidx);
2373
2374		if (total_available_tx_desc(eq) < eq->sidx / 4 &&
2375		    atomic_cmpset_int(&eq->equiq, 0, 1)) {
2376			wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
2377			    F_FW_WR_EQUEQ);
2378			eq->equeqidx = eq->pidx;
2379		} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) {
2380			wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
2381			eq->equeqidx = eq->pidx;
2382		}
2383
2384		if (dbdiff >= 16 && remaining >= 4) {
2385			ring_eq_db(sc, eq, dbdiff);
2386			available += reclaim_tx_descs(txq, 4 * dbdiff);
2387			dbdiff = 0;
2388		}
2389
2390		cidx = next_cidx;
2391	}
2392	if (dbdiff != 0) {
2393		ring_eq_db(sc, eq, dbdiff);
2394		reclaim_tx_descs(txq, 32);
2395	}
2396done:
2397	TXQ_UNLOCK(txq);
2398
2399	return (total);
2400}
2401
2402static inline void
2403init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx,
2404    int qsize)
2405{
2406
2407	KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS,
2408	    ("%s: bad tmr_idx %d", __func__, tmr_idx));
2409	KASSERT(pktc_idx < SGE_NCOUNTERS,	/* -ve is ok, means don't use */
2410	    ("%s: bad pktc_idx %d", __func__, pktc_idx));
2411
2412	iq->flags = 0;
2413	iq->adapter = sc;
2414	iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx);
2415	iq->intr_pktc_idx = SGE_NCOUNTERS - 1;
2416	if (pktc_idx >= 0) {
2417		iq->intr_params |= F_QINTR_CNT_EN;
2418		iq->intr_pktc_idx = pktc_idx;
2419	}
2420	iq->qsize = roundup2(qsize, 16);	/* See FW_IQ_CMD/iqsize */
2421	iq->sidx = iq->qsize - sc->params.sge.spg_len / IQ_ESIZE;
2422}
2423
2424static inline void
2425init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, char *name)
2426{
2427
2428	fl->qsize = qsize;
2429	fl->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
2430	strlcpy(fl->lockname, name, sizeof(fl->lockname));
2431	if (sc->flags & BUF_PACKING_OK &&
2432	    ((!is_t4(sc) && buffer_packing) ||	/* T5+: enabled unless 0 */
2433	    (is_t4(sc) && buffer_packing == 1)))/* T4: disabled unless 1 */
2434		fl->flags |= FL_BUF_PACKING;
2435	find_best_refill_source(sc, fl, maxp);
2436	find_safe_refill_source(sc, fl);
2437}
2438
2439static inline void
2440init_eq(struct adapter *sc, struct sge_eq *eq, int eqtype, int qsize,
2441    uint8_t tx_chan, uint16_t iqid, char *name)
2442{
2443	KASSERT(eqtype <= EQ_TYPEMASK, ("%s: bad qtype %d", __func__, eqtype));
2444
2445	eq->flags = eqtype & EQ_TYPEMASK;
2446	eq->tx_chan = tx_chan;
2447	eq->iqid = iqid;
2448	eq->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
2449	strlcpy(eq->lockname, name, sizeof(eq->lockname));
2450}
2451
2452static int
2453alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag,
2454    bus_dmamap_t *map, bus_addr_t *pa, void **va)
2455{
2456	int rc;
2457
2458	rc = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR,
2459	    BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag);
2460	if (rc != 0) {
2461		device_printf(sc->dev, "cannot allocate DMA tag: %d\n", rc);
2462		goto done;
2463	}
2464
2465	rc = bus_dmamem_alloc(*tag, va,
2466	    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map);
2467	if (rc != 0) {
2468		device_printf(sc->dev, "cannot allocate DMA memory: %d\n", rc);
2469		goto done;
2470	}
2471
2472	rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0);
2473	if (rc != 0) {
2474		device_printf(sc->dev, "cannot load DMA map: %d\n", rc);
2475		goto done;
2476	}
2477done:
2478	if (rc)
2479		free_ring(sc, *tag, *map, *pa, *va);
2480
2481	return (rc);
2482}
2483
2484static int
2485free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map,
2486    bus_addr_t pa, void *va)
2487{
2488	if (pa)
2489		bus_dmamap_unload(tag, map);
2490	if (va)
2491		bus_dmamem_free(tag, va, map);
2492	if (tag)
2493		bus_dma_tag_destroy(tag);
2494
2495	return (0);
2496}
2497
2498/*
2499 * Allocates the ring for an ingress queue and an optional freelist.  If the
2500 * freelist is specified it will be allocated and then associated with the
2501 * ingress queue.
2502 *
2503 * Returns errno on failure.  Resources allocated up to that point may still be
2504 * allocated.  Caller is responsible for cleanup in case this function fails.
2505 *
2506 * If the ingress queue will take interrupts directly (iq->flags & IQ_INTR) then
2507 * the intr_idx specifies the vector, starting from 0.  Otherwise it specifies
2508 * the abs_id of the ingress queue to which its interrupts should be forwarded.
2509 */
2510static int
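/*
 * alloc_fwq() below is an example of the first case: the firmware event
 * queue always takes interrupts directly and uses vector 1 when more than
 * one vector is available, vector 0 otherwise.
 */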
2511alloc_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl,
2512    int intr_idx, int cong)
2513{
2514	int rc, i, cntxt_id;
2515	size_t len;
2516	struct fw_iq_cmd c;
2517	struct port_info *pi = vi->pi;
2518	struct adapter *sc = iq->adapter;
2519	struct sge_params *sp = &sc->params.sge;
2520	__be32 v = 0;
2521
2522	len = iq->qsize * IQ_ESIZE;
2523	rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba,
2524	    (void **)&iq->desc);
2525	if (rc != 0)
2526		return (rc);
2527
2528	bzero(&c, sizeof(c));
2529	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST |
2530	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) |
2531	    V_FW_IQ_CMD_VFN(0));
2532
2533	c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART |
2534	    FW_LEN16(c));
2535
2536	/* Special handling for firmware event queue */
2537	if (iq == &sc->sge.fwq)
2538		v |= F_FW_IQ_CMD_IQASYNCH;
2539
2540	if (iq->flags & IQ_INTR) {
2541		KASSERT(intr_idx < sc->intr_count,
2542		    ("%s: invalid direct intr_idx %d", __func__, intr_idx));
2543	} else
2544		v |= F_FW_IQ_CMD_IQANDST;
2545	v |= V_FW_IQ_CMD_IQANDSTINDEX(intr_idx);
2546
2547	c.type_to_iqandstindex = htobe32(v |
2548	    V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) |
2549	    V_FW_IQ_CMD_VIID(vi->viid) |
2550	    V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT));
2551	c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) |
2552	    F_FW_IQ_CMD_IQGTSMODE |
2553	    V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) |
2554	    V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4));
2555	c.iqsize = htobe16(iq->qsize);
2556	c.iqaddr = htobe64(iq->ba);
2557	if (cong >= 0)
2558		c.iqns_to_fl0congen = htobe32(F_FW_IQ_CMD_IQFLINTCONGEN);
2559
2560	if (fl) {
2561		mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF);
2562
2563		len = fl->qsize * EQ_ESIZE;
2564		rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map,
2565		    &fl->ba, (void **)&fl->desc);
2566		if (rc)
2567			return (rc);
2568
2569		/* Allocate space for one software descriptor per buffer. */
2570		rc = alloc_fl_sdesc(fl);
2571		if (rc != 0) {
2572			device_printf(sc->dev,
2573			    "failed to setup fl software descriptors: %d\n",
2574			    rc);
2575			return (rc);
2576		}
2577
2578		if (fl->flags & FL_BUF_PACKING) {
2579			fl->lowat = roundup2(sp->fl_starve_threshold2, 8);
2580			fl->buf_boundary = sp->pack_boundary;
2581		} else {
2582			fl->lowat = roundup2(sp->fl_starve_threshold, 8);
2583			fl->buf_boundary = 16;
2584		}
2585		if (fl_pad && fl->buf_boundary < sp->pad_boundary)
2586			fl->buf_boundary = sp->pad_boundary;
2587
2588		c.iqns_to_fl0congen |=
2589		    htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) |
2590			F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO |
2591			(fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) |
2592			(fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN :
2593			    0));
2594		if (cong >= 0) {
2595			c.iqns_to_fl0congen |=
2596				htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong) |
2597				    F_FW_IQ_CMD_FL0CONGCIF |
2598				    F_FW_IQ_CMD_FL0CONGEN);
2599		}
2600		c.fl0dcaen_to_fl0cidxfthresh =
2601		    htobe16(V_FW_IQ_CMD_FL0FBMIN(X_FETCHBURSTMIN_128B) |
2602			V_FW_IQ_CMD_FL0FBMAX(X_FETCHBURSTMAX_512B));
2603		c.fl0size = htobe16(fl->qsize);
2604		c.fl0addr = htobe64(fl->ba);
2605	}
2606
2607	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
2608	if (rc != 0) {
2609		device_printf(sc->dev,
2610		    "failed to create ingress queue: %d\n", rc);
2611		return (rc);
2612	}
2613
2614	iq->cidx = 0;
2615	iq->gen = F_RSPD_GEN;
2616	iq->intr_next = iq->intr_params;
2617	iq->cntxt_id = be16toh(c.iqid);
2618	iq->abs_id = be16toh(c.physiqid);
2619	iq->flags |= IQ_ALLOCATED;
2620
2621	cntxt_id = iq->cntxt_id - sc->sge.iq_start;
2622	if (cntxt_id >= sc->sge.niq) {
2623		panic ("%s: iq->cntxt_id (%d) more than the max (%d)", __func__,
2624		    cntxt_id, sc->sge.niq - 1);
2625	}
2626	sc->sge.iqmap[cntxt_id] = iq;
2627
2628	if (fl) {
2629		u_int qid;
2630
2631		iq->flags |= IQ_HAS_FL;
2632		fl->cntxt_id = be16toh(c.fl0id);
2633		fl->pidx = fl->cidx = 0;
2634
2635		cntxt_id = fl->cntxt_id - sc->sge.eq_start;
2636		if (cntxt_id >= sc->sge.neq) {
2637			panic("%s: fl->cntxt_id (%d) more than the max (%d)",
2638			    __func__, cntxt_id, sc->sge.neq - 1);
2639		}
2640		sc->sge.eqmap[cntxt_id] = (void *)fl;
2641
2642		qid = fl->cntxt_id;
2643		if (isset(&sc->doorbells, DOORBELL_UDB)) {
2644			uint32_t s_qpp = sc->params.sge.eq_s_qpp;
2645			uint32_t mask = (1 << s_qpp) - 1;
2646			volatile uint8_t *udb;
2647
2648			udb = sc->udbs_base + UDBS_DB_OFFSET;
2649			udb += (qid >> s_qpp) << PAGE_SHIFT;
2650			qid &= mask;
2651			if (qid < PAGE_SIZE / UDBS_SEG_SIZE) {
2652				udb += qid << UDBS_SEG_SHIFT;
2653				qid = 0;
2654			}
2655			fl->udb = (volatile void *)udb;
2656		}
2657		fl->dbval = V_QID(qid) | sc->chip_params->sge_fl_db;
2658
2659		FL_LOCK(fl);
2660		/* Enough to make sure the SGE doesn't think it's starved */
2661		refill_fl(sc, fl, fl->lowat);
2662		FL_UNLOCK(fl);
2663	}
2664
2665	if (is_t5(sc) && cong >= 0) {
2666		uint32_t param, val;
2667
2668		param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
2669		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) |
2670		    V_FW_PARAMS_PARAM_YZ(iq->cntxt_id);
2671		if (cong == 0)
2672			val = 1 << 19;
2673		else {
2674			val = 2 << 19;
2675			for (i = 0; i < 4; i++) {
2676				if (cong & (1 << i))
2677					val |= 1 << (i << 2);
2678			}
2679		}
2680
2681		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
2682		if (rc != 0) {
2683			/* report error but carry on */
2684			device_printf(sc->dev,
2685			    "failed to set congestion manager context for "
2686			    "ingress queue %d: %d\n", iq->cntxt_id, rc);
2687		}
2688	}
2689
2690	/* Enable IQ interrupts */
2691	atomic_store_rel_int(&iq->state, IQS_IDLE);
2692	t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), V_SEINTARM(iq->intr_params) |
2693	    V_INGRESSQID(iq->cntxt_id));
2694
2695	return (0);
2696}
2697
2698static int
2699free_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl)
2700{
2701	int rc;
2702	struct adapter *sc = iq->adapter;
2703	device_t dev;
2704
2705	if (sc == NULL)
2706		return (0);	/* nothing to do */
2707
2708	dev = vi ? vi->dev : sc->dev;
2709
2710	if (iq->flags & IQ_ALLOCATED) {
2711		rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0,
2712		    FW_IQ_TYPE_FL_INT_CAP, iq->cntxt_id,
2713		    fl ? fl->cntxt_id : 0xffff, 0xffff);
2714		if (rc != 0) {
2715			device_printf(dev,
2716			    "failed to free queue %p: %d\n", iq, rc);
2717			return (rc);
2718		}
2719		iq->flags &= ~IQ_ALLOCATED;
2720	}
2721
2722	free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc);
2723
2724	bzero(iq, sizeof(*iq));
2725
2726	if (fl) {
2727		free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba,
2728		    fl->desc);
2729
2730		if (fl->sdesc)
2731			free_fl_sdesc(sc, fl);
2732
2733		if (mtx_initialized(&fl->fl_lock))
2734			mtx_destroy(&fl->fl_lock);
2735
2736		bzero(fl, sizeof(*fl));
2737	}
2738
2739	return (0);
2740}
2741
2742static void
2743add_fl_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
2744    struct sge_fl *fl)
2745{
2746	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
2747
2748	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL,
2749	    "freelist");
2750	children = SYSCTL_CHILDREN(oid);
2751
2752	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
2753	    CTLTYPE_INT | CTLFLAG_RD, &fl->cntxt_id, 0, sysctl_uint16, "I",
2754	    "SGE context id of the freelist");
2755	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "padding", CTLFLAG_RD, NULL,
2756	    fl_pad ? 1 : 0, "padding enabled");
2757	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "packing", CTLFLAG_RD, NULL,
2758	    fl->flags & FL_BUF_PACKING ? 1 : 0, "packing enabled");
2759	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx,
2760	    0, "consumer index");
2761	if (fl->flags & FL_BUF_PACKING) {
2762		SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset",
2763		    CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset");
2764	}
2765	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx,
2766	    0, "producer index");
2767	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_allocated",
2768	    CTLFLAG_RD, &fl->mbuf_allocated, "# of mbuf allocated");
2769	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_inlined",
2770	    CTLFLAG_RD, &fl->mbuf_inlined, "# of mbuf inlined in clusters");
2771	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated",
2772	    CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated");
2773	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled",
2774	    CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled");
2775	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled",
2776	    CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)");
2777}
2778
2779static int
2780alloc_fwq(struct adapter *sc)
2781{
2782	int rc, intr_idx;
2783	struct sge_iq *fwq = &sc->sge.fwq;
2784	struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev);
2785	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
2786
2787	init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE);
2788	fwq->flags |= IQ_INTR;	/* always */
2789	intr_idx = sc->intr_count > 1 ? 1 : 0;
2790	rc = alloc_iq_fl(&sc->port[0]->vi[0], fwq, NULL, intr_idx, -1);
2791	if (rc != 0) {
2792		device_printf(sc->dev,
2793		    "failed to create firmware event queue: %d\n", rc);
2794		return (rc);
2795	}
2796
2797	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "fwq", CTLFLAG_RD,
2798	    NULL, "firmware event queue");
2799	children = SYSCTL_CHILDREN(oid);
2800
2801	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "abs_id",
2802	    CTLTYPE_INT | CTLFLAG_RD, &fwq->abs_id, 0, sysctl_uint16, "I",
2803	    "absolute id of the queue");
2804	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cntxt_id",
2805	    CTLTYPE_INT | CTLFLAG_RD, &fwq->cntxt_id, 0, sysctl_uint16, "I",
2806	    "SGE context id of the queue");
2807	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cidx",
2808	    CTLTYPE_INT | CTLFLAG_RD, &fwq->cidx, 0, sysctl_uint16, "I",
2809	    "consumer index");
2810
2811	return (0);
2812}
2813
2814static int
2815free_fwq(struct adapter *sc)
2816{
2817	return free_iq_fl(NULL, &sc->sge.fwq, NULL);
2818}
2819
2820static int
2821alloc_mgmtq(struct adapter *sc)
2822{
2823	int rc;
2824	struct sge_wrq *mgmtq = &sc->sge.mgmtq;
2825	char name[16];
2826	struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev);
2827	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
2828
2829	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "mgmtq", CTLFLAG_RD,
2830	    NULL, "management queue");
2831
2832	snprintf(name, sizeof(name), "%s mgmtq", device_get_nameunit(sc->dev));
2833	init_eq(sc, &mgmtq->eq, EQ_CTRL, CTRL_EQ_QSIZE, sc->port[0]->tx_chan,
2834	    sc->sge.fwq.cntxt_id, name);
2835	rc = alloc_wrq(sc, NULL, mgmtq, oid);
2836	if (rc != 0) {
2837		device_printf(sc->dev,
2838		    "failed to create management queue: %d\n", rc);
2839		return (rc);
2840	}
2841
2842	return (0);
2843}
2844
2845static int
2846free_mgmtq(struct adapter *sc)
2847{
2848
2849	return free_wrq(sc, &sc->sge.mgmtq);
2850}
2851
2852int
2853tnl_cong(struct port_info *pi, int drop)
2854{
2855
2856	if (drop == -1)
2857		return (-1);
2858	else if (drop == 1)
2859		return (0);
2860	else
2861		return (pi->rx_chan_map);
2862}
2863
2864static int
2865alloc_rxq(struct vi_info *vi, struct sge_rxq *rxq, int intr_idx, int idx,
2866    struct sysctl_oid *oid)
2867{
2868	int rc;
2869	struct sysctl_oid_list *children;
2870	char name[16];
2871
2872	rc = alloc_iq_fl(vi, &rxq->iq, &rxq->fl, intr_idx,
2873	    tnl_cong(vi->pi, cong_drop));
2874	if (rc != 0)
2875		return (rc);
2876
2877	/*
2878	 * The freelist is just barely above the starvation threshold right now;
2879	 * fill it up a bit more.
2880	 */
2881	FL_LOCK(&rxq->fl);
2882	refill_fl(vi->pi->adapter, &rxq->fl, 128);
2883	FL_UNLOCK(&rxq->fl);
2884
2885#if defined(INET) || defined(INET6)
2886	rc = tcp_lro_init(&rxq->lro);
2887	if (rc != 0)
2888		return (rc);
2889	rxq->lro.ifp = vi->ifp; /* also indicates LRO init'ed */
2890
2891	if (vi->ifp->if_capenable & IFCAP_LRO)
2892		rxq->iq.flags |= IQ_LRO_ENABLED;
2893#endif
2894	rxq->ifp = vi->ifp;
2895
2896	children = SYSCTL_CHILDREN(oid);
2897
2898	snprintf(name, sizeof(name), "%d", idx);
2899	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
2900	    NULL, "rx queue");
2901	children = SYSCTL_CHILDREN(oid);
2902
2903	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "abs_id",
2904	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.abs_id, 0, sysctl_uint16, "I",
2905	    "absolute id of the queue");
2906	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cntxt_id",
2907	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cntxt_id, 0, sysctl_uint16, "I",
2908	    "SGE context id of the queue");
2909	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
2910	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cidx, 0, sysctl_uint16, "I",
2911	    "consumer index");
2912#if defined(INET) || defined(INET6)
2913	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD,
2914	    &rxq->lro.lro_queued, 0, NULL);
2915	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD,
2916	    &rxq->lro.lro_flushed, 0, NULL);
2917#endif
2918	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD,
2919	    &rxq->rxcsum, "# of times hardware assisted with checksum");
2920	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_extraction",
2921	    CTLFLAG_RD, &rxq->vlan_extraction,
2922	    "# of times hardware extracted 802.1Q tag");
2923
2924	add_fl_sysctls(&vi->ctx, oid, &rxq->fl);
2925
2926	return (rc);
2927}
2928
2929static int
2930free_rxq(struct vi_info *vi, struct sge_rxq *rxq)
2931{
2932	int rc;
2933
2934#if defined(INET) || defined(INET6)
2935	if (rxq->lro.ifp) {
2936		tcp_lro_free(&rxq->lro);
2937		rxq->lro.ifp = NULL;
2938	}
2939#endif
2940
2941	rc = free_iq_fl(vi, &rxq->iq, &rxq->fl);
2942	if (rc == 0)
2943		bzero(rxq, sizeof(*rxq));
2944
2945	return (rc);
2946}
2947
2948#ifdef TCP_OFFLOAD
2949static int
2950alloc_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq,
2951    int intr_idx, int idx, struct sysctl_oid *oid)
2952{
2953	int rc;
2954	struct sysctl_oid_list *children;
2955	char name[16];
2956
2957	rc = alloc_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl, intr_idx,
2958	    vi->pi->rx_chan_map);
2959	if (rc != 0)
2960		return (rc);
2961
2962	children = SYSCTL_CHILDREN(oid);
2963
2964	snprintf(name, sizeof(name), "%d", idx);
2965	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
2966	    NULL, "rx queue");
2967	children = SYSCTL_CHILDREN(oid);
2968
2969	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "abs_id",
2970	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.abs_id, 0, sysctl_uint16,
2971	    "I", "absolute id of the queue");
2972	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cntxt_id",
2973	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cntxt_id, 0, sysctl_uint16,
2974	    "I", "SGE context id of the queue");
2975	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
2976	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cidx, 0, sysctl_uint16, "I",
2977	    "consumer index");
2978
2979	add_fl_sysctls(&vi->ctx, oid, &ofld_rxq->fl);
2980
2981	return (rc);
2982}
2983
2984static int
2985free_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq)
2986{
2987	int rc;
2988
2989	rc = free_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl);
2990	if (rc == 0)
2991		bzero(ofld_rxq, sizeof(*ofld_rxq));
2992
2993	return (rc);
2994}
2995#endif
2996
2997#ifdef DEV_NETMAP
2998static int
2999alloc_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq, int intr_idx,
3000    int idx, struct sysctl_oid *oid)
3001{
3002	int rc;
3003	struct sysctl_oid_list *children;
3004	struct sysctl_ctx_list *ctx;
3005	char name[16];
3006	size_t len;
3007	struct adapter *sc = vi->pi->adapter;
3008	struct netmap_adapter *na = NA(vi->ifp);
3009
3010	MPASS(na != NULL);
3011
3012	len = vi->qsize_rxq * IQ_ESIZE;
3013	rc = alloc_ring(sc, len, &nm_rxq->iq_desc_tag, &nm_rxq->iq_desc_map,
3014	    &nm_rxq->iq_ba, (void **)&nm_rxq->iq_desc);
3015	if (rc != 0)
3016		return (rc);
3017
3018	len = na->num_rx_desc * EQ_ESIZE + sc->params.sge.spg_len;
3019	rc = alloc_ring(sc, len, &nm_rxq->fl_desc_tag, &nm_rxq->fl_desc_map,
3020	    &nm_rxq->fl_ba, (void **)&nm_rxq->fl_desc);
3021	if (rc != 0)
3022		return (rc);
3023
3024	nm_rxq->vi = vi;
3025	nm_rxq->nid = idx;
3026	nm_rxq->iq_cidx = 0;
3027	nm_rxq->iq_sidx = vi->qsize_rxq - sc->params.sge.spg_len / IQ_ESIZE;
3028	nm_rxq->iq_gen = F_RSPD_GEN;
3029	nm_rxq->fl_pidx = nm_rxq->fl_cidx = 0;
3030	nm_rxq->fl_sidx = na->num_rx_desc;
3031	nm_rxq->intr_idx = intr_idx;
3032
3033	ctx = &vi->ctx;
3034	children = SYSCTL_CHILDREN(oid);
3035
3036	snprintf(name, sizeof(name), "%d", idx);
3037	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL,
3038	    "rx queue");
3039	children = SYSCTL_CHILDREN(oid);
3040
3041	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "abs_id",
3042	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_abs_id, 0, sysctl_uint16,
3043	    "I", "absolute id of the queue");
3044	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
3045	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cntxt_id, 0, sysctl_uint16,
3046	    "I", "SGE context id of the queue");
3047	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
3048	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cidx, 0, sysctl_uint16, "I",
3049	    "consumer index");
3050
3051	children = SYSCTL_CHILDREN(oid);
3052	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL,
3053	    "freelist");
3054	children = SYSCTL_CHILDREN(oid);
3055
3056	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
3057	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->fl_cntxt_id, 0, sysctl_uint16,
3058	    "I", "SGE context id of the freelist");
3059	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD,
3060	    &nm_rxq->fl_cidx, 0, "consumer index");
3061	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD,
3062	    &nm_rxq->fl_pidx, 0, "producer index");
3063
3064	return (rc);
3065}
3066
3067
3068static int
3069free_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq)
3070{
3071	struct adapter *sc = vi->pi->adapter;
3072
3073	free_ring(sc, nm_rxq->iq_desc_tag, nm_rxq->iq_desc_map, nm_rxq->iq_ba,
3074	    nm_rxq->iq_desc);
3075	free_ring(sc, nm_rxq->fl_desc_tag, nm_rxq->fl_desc_map, nm_rxq->fl_ba,
3076	    nm_rxq->fl_desc);
3077
3078	return (0);
3079}
3080
3081static int
3082alloc_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq, int iqidx, int idx,
3083    struct sysctl_oid *oid)
3084{
3085	int rc;
3086	size_t len;
3087	struct port_info *pi = vi->pi;
3088	struct adapter *sc = pi->adapter;
3089	struct netmap_adapter *na = NA(vi->ifp);
3090	char name[16];
3091	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3092
3093	len = na->num_tx_desc * EQ_ESIZE + sc->params.sge.spg_len;
3094	rc = alloc_ring(sc, len, &nm_txq->desc_tag, &nm_txq->desc_map,
3095	    &nm_txq->ba, (void **)&nm_txq->desc);
3096	if (rc)
3097		return (rc);
3098
3099	nm_txq->pidx = nm_txq->cidx = 0;
3100	nm_txq->sidx = na->num_tx_desc;
3101	nm_txq->nid = idx;
3102	nm_txq->iqidx = iqidx;
3103	nm_txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
3104	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_VF_VLD(1) |
3105	    V_TXPKT_VF(vi->viid));
3106
3107	snprintf(name, sizeof(name), "%d", idx);
3108	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
3109	    NULL, "netmap tx queue");
3110	children = SYSCTL_CHILDREN(oid);
3111
3112	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
3113	    &nm_txq->cntxt_id, 0, "SGE context id of the queue");
3114	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
3115	    CTLTYPE_INT | CTLFLAG_RD, &nm_txq->cidx, 0, sysctl_uint16, "I",
3116	    "consumer index");
3117	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx",
3118	    CTLTYPE_INT | CTLFLAG_RD, &nm_txq->pidx, 0, sysctl_uint16, "I",
3119	    "producer index");
3120
3121	return (rc);
3122}
3123
3124static int
3125free_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq)
3126{
3127	struct adapter *sc = vi->pi->adapter;
3128
3129	free_ring(sc, nm_txq->desc_tag, nm_txq->desc_map, nm_txq->ba,
3130	    nm_txq->desc);
3131
3132	return (0);
3133}
3134#endif
3135
3136static int
3137ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq)
3138{
3139	int rc, cntxt_id;
3140	struct fw_eq_ctrl_cmd c;
3141	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
3142
3143	bzero(&c, sizeof(c));
3144
3145	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST |
3146	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) |
3147	    V_FW_EQ_CTRL_CMD_VFN(0));
3148	c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC |
3149	    F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c));
3150	c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid));
3151	c.physeqid_pkd = htobe32(0);
3152	c.fetchszm_to_iqid =
3153	    htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
3154		V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) |
3155		F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid));
3156	c.dcaen_to_eqsize =
3157	    htobe32(V_FW_EQ_CTRL_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
3158		V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
3159		V_FW_EQ_CTRL_CMD_EQSIZE(qsize));
3160	c.eqaddr = htobe64(eq->ba);
3161
3162	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
3163	if (rc != 0) {
3164		device_printf(sc->dev,
3165		    "failed to create control queue %d: %d\n", eq->tx_chan, rc);
3166		return (rc);
3167	}
3168	eq->flags |= EQ_ALLOCATED;
3169
3170	eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid));
3171	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
3172	if (cntxt_id >= sc->sge.neq)
3173	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
3174		cntxt_id, sc->sge.neq - 1);
3175	sc->sge.eqmap[cntxt_id] = eq;
3176
3177	return (rc);
3178}
3179
3180static int
3181eth_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
3182{
3183	int rc, cntxt_id;
3184	struct fw_eq_eth_cmd c;
3185	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
3186
3187	bzero(&c, sizeof(c));
3188
3189	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST |
3190	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) |
3191	    V_FW_EQ_ETH_CMD_VFN(0));
3192	c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC |
3193	    F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c));
3194	c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE |
3195	    F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(vi->viid));
3196	c.fetchszm_to_iqid =
3197	    htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
3198		V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO |
3199		V_FW_EQ_ETH_CMD_IQID(eq->iqid));
3200	c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
3201	    V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
3202	    V_FW_EQ_ETH_CMD_EQSIZE(qsize));
3203	c.eqaddr = htobe64(eq->ba);
3204
3205	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
3206	if (rc != 0) {
3207		device_printf(vi->dev,
3208		    "failed to create Ethernet egress queue: %d\n", rc);
3209		return (rc);
3210	}
3211	eq->flags |= EQ_ALLOCATED;
3212
3213	eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd));
3214	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
3215	if (cntxt_id >= sc->sge.neq)
3216	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
3217		cntxt_id, sc->sge.neq - 1);
3218	sc->sge.eqmap[cntxt_id] = eq;
3219
3220	return (rc);
3221}
3222
3223#ifdef TCP_OFFLOAD
3224static int
3225ofld_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
3226{
3227	int rc, cntxt_id;
3228	struct fw_eq_ofld_cmd c;
3229	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
3230
3231	bzero(&c, sizeof(c));
3232
3233	c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_OFLD_CMD) | F_FW_CMD_REQUEST |
3234	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_OFLD_CMD_PFN(sc->pf) |
3235	    V_FW_EQ_OFLD_CMD_VFN(0));
3236	c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC |
3237	    F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c));
3238	c.fetchszm_to_iqid =
3239		htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
3240		    V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) |
3241		    F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid));
3242	c.dcaen_to_eqsize =
3243	    htobe32(V_FW_EQ_OFLD_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
3244		V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
3245		V_FW_EQ_OFLD_CMD_EQSIZE(qsize));
3246	c.eqaddr = htobe64(eq->ba);
3247
3248	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
3249	if (rc != 0) {
3250		device_printf(vi->dev,
3251		    "failed to create egress queue for TCP offload: %d\n", rc);
3252		return (rc);
3253	}
3254	eq->flags |= EQ_ALLOCATED;
3255
3256	eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd));
3257	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
3258	if (cntxt_id >= sc->sge.neq)
3259	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
3260		cntxt_id, sc->sge.neq - 1);
3261	sc->sge.eqmap[cntxt_id] = eq;
3262
3263	return (rc);
3264}
3265#endif
3266
3267static int
3268alloc_eq(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
3269{
3270	int rc, qsize;
3271	size_t len;
3272
3273	mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF);
3274
3275	qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
3276	len = qsize * EQ_ESIZE;
3277	rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map,
3278	    &eq->ba, (void **)&eq->desc);
3279	if (rc)
3280		return (rc);
3281
3282	eq->pidx = eq->cidx = 0;
3283	eq->equeqidx = eq->dbidx = 0;
3284	eq->doorbells = sc->doorbells;
3285
3286	switch (eq->flags & EQ_TYPEMASK) {
3287	case EQ_CTRL:
3288		rc = ctrl_eq_alloc(sc, eq);
3289		break;
3290
3291	case EQ_ETH:
3292		rc = eth_eq_alloc(sc, vi, eq);
3293		break;
3294
3295#ifdef TCP_OFFLOAD
3296	case EQ_OFLD:
3297		rc = ofld_eq_alloc(sc, vi, eq);
3298		break;
3299#endif
3300
3301	default:
3302		panic("%s: invalid eq type %d.", __func__,
3303		    eq->flags & EQ_TYPEMASK);
3304	}
3305	if (rc != 0) {
3306		device_printf(sc->dev,
3307		    "failed to allocate egress queue(%d): %d\n",
3308		    eq->flags & EQ_TYPEMASK, rc);
3309	}
3310
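	/*
	 * Locate the user doorbell region for this EQ.  eq_s_qpp is the log2
	 * of the number of egress queues that share one doorbell page, so
	 * cntxt_id >> s_qpp selects the page and the masked low bits select
	 * the queue within it.  Write-combined doorbells (DOORBELL_WCWR) are
	 * only kept for queues that also fit within a private UDBS_SEG_SIZE
	 * segment of that page.
	 */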
3311	if (isset(&eq->doorbells, DOORBELL_UDB) ||
3312	    isset(&eq->doorbells, DOORBELL_UDBWC) ||
3313	    isset(&eq->doorbells, DOORBELL_WCWR)) {
3314		uint32_t s_qpp = sc->params.sge.eq_s_qpp;
3315		uint32_t mask = (1 << s_qpp) - 1;
3316		volatile uint8_t *udb;
3317
3318		udb = sc->udbs_base + UDBS_DB_OFFSET;
3319		udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT;	/* pg offset */
3320		eq->udb_qid = eq->cntxt_id & mask;		/* id in page */
3321		if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE)
3322	    		clrbit(&eq->doorbells, DOORBELL_WCWR);
3323		else {
3324			udb += eq->udb_qid << UDBS_SEG_SHIFT;	/* seg offset */
3325			eq->udb_qid = 0;
3326		}
3327		eq->udb = (volatile void *)udb;
3328	}
3329
3330	return (rc);
3331}
3332
3333static int
3334free_eq(struct adapter *sc, struct sge_eq *eq)
3335{
3336	int rc;
3337
3338	if (eq->flags & EQ_ALLOCATED) {
3339		switch (eq->flags & EQ_TYPEMASK) {
3340		case EQ_CTRL:
3341			rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0,
3342			    eq->cntxt_id);
3343			break;
3344
3345		case EQ_ETH:
3346			rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0,
3347			    eq->cntxt_id);
3348			break;
3349
3350#ifdef TCP_OFFLOAD
3351		case EQ_OFLD:
3352			rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0,
3353			    eq->cntxt_id);
3354			break;
3355#endif
3356
3357		default:
3358			panic("%s: invalid eq type %d.", __func__,
3359			    eq->flags & EQ_TYPEMASK);
3360		}
3361		if (rc != 0) {
3362			device_printf(sc->dev,
3363			    "failed to free egress queue (%d): %d\n",
3364			    eq->flags & EQ_TYPEMASK, rc);
3365			return (rc);
3366		}
3367		eq->flags &= ~EQ_ALLOCATED;
3368	}
3369
3370	free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc);
3371
3372	if (mtx_initialized(&eq->eq_lock))
3373		mtx_destroy(&eq->eq_lock);
3374
3375	bzero(eq, sizeof(*eq));
3376	return (0);
3377}
3378
3379static int
3380alloc_wrq(struct adapter *sc, struct vi_info *vi, struct sge_wrq *wrq,
3381    struct sysctl_oid *oid)
3382{
3383	int rc;
3384	struct sysctl_ctx_list *ctx = vi ? &vi->ctx : &sc->ctx;
3385	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3386
3387	rc = alloc_eq(sc, vi, &wrq->eq);
3388	if (rc)
3389		return (rc);
3390
3391	wrq->adapter = sc;
3392	TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq);
3393	TAILQ_INIT(&wrq->incomplete_wrs);
3394	STAILQ_INIT(&wrq->wr_list);
3395	wrq->nwr_pending = 0;
3396	wrq->ndesc_needed = 0;
3397
3398	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
3399	    &wrq->eq.cntxt_id, 0, "SGE context id of the queue");
3400	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
3401	    CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.cidx, 0, sysctl_uint16, "I",
3402	    "consumer index");
3403	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pidx",
3404	    CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.pidx, 0, sysctl_uint16, "I",
3405	    "producer index");
3406	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_direct", CTLFLAG_RD,
3407	    &wrq->tx_wrs_direct, "# of work requests (direct)");
3408	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_copied", CTLFLAG_RD,
3409	    &wrq->tx_wrs_copied, "# of work requests (copied)");
3410
3411	return (rc);
3412}
3413
3414static int
3415free_wrq(struct adapter *sc, struct sge_wrq *wrq)
3416{
3417	int rc;
3418
3419	rc = free_eq(sc, &wrq->eq);
3420	if (rc)
3421		return (rc);
3422
3423	bzero(wrq, sizeof(*wrq));
3424	return (0);
3425}
3426
3427static int
3428alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx,
3429    struct sysctl_oid *oid)
3430{
3431	int rc;
3432	struct port_info *pi = vi->pi;
3433	struct adapter *sc = pi->adapter;
3434	struct sge_eq *eq = &txq->eq;
3435	char name[16];
3436	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3437
3438	rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx,
3439	    M_CXGBE, M_WAITOK);
3440	if (rc != 0) {
3441		device_printf(sc->dev, "failed to allocate mp_ring: %d\n", rc);
3442		return (rc);
3443	}
3444
3445	rc = alloc_eq(sc, vi, eq);
3446	if (rc != 0) {
3447		mp_ring_free(txq->r);
3448		txq->r = NULL;
3449		return (rc);
3450	}
3451
3452	/* Can't fail after this point. */
3453
3454	TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq);
3455	txq->ifp = vi->ifp;
3456	txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK);
3457	txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
3458	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_VF_VLD(1) |
3459	    V_TXPKT_VF(vi->viid));
3460	txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE,
3461	    M_ZERO | M_WAITOK);
3462
3463	snprintf(name, sizeof(name), "%d", idx);
3464	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
3465	    NULL, "tx queue");
3466	children = SYSCTL_CHILDREN(oid);
3467
3468	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
3469	    &eq->cntxt_id, 0, "SGE context id of the queue");
3470	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
3471	    CTLTYPE_INT | CTLFLAG_RD, &eq->cidx, 0, sysctl_uint16, "I",
3472	    "consumer index");
3473	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx",
3474	    CTLTYPE_INT | CTLFLAG_RD, &eq->pidx, 0, sysctl_uint16, "I",
3475	    "producer index");
3476
3477	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD,
3478	    &txq->txcsum, "# of times hardware assisted with checksum");
3479	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_insertion",
3480	    CTLFLAG_RD, &txq->vlan_insertion,
3481	    "# of times hardware inserted 802.1Q tag");
3482	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD,
3483	    &txq->tso_wrs, "# of TSO work requests");
3484	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD,
3485	    &txq->imm_wrs, "# of work requests with immediate data");
3486	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD,
3487	    &txq->sgl_wrs, "# of work requests with direct SGL");
3488	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD,
3489	    &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)");
3490	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_wrs",
3491	    CTLFLAG_RD, &txq->txpkts0_wrs,
3492	    "# of txpkts (type 0) work requests");
3493	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_wrs",
3494	    CTLFLAG_RD, &txq->txpkts1_wrs,
3495	    "# of txpkts (type 1) work requests");
3496	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_pkts",
3497	    CTLFLAG_RD, &txq->txpkts0_pkts,
3498	    "# of frames tx'd using type0 txpkts work requests");
3499	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_pkts",
3500	    CTLFLAG_RD, &txq->txpkts1_pkts,
3501	    "# of frames tx'd using type1 txpkts work requests");
3502
3503	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_enqueues",
3504	    CTLFLAG_RD, &txq->r->enqueues,
3505	    "# of enqueues to the mp_ring for this queue");
3506	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_drops",
3507	    CTLFLAG_RD, &txq->r->drops,
3508	    "# of drops in the mp_ring for this queue");
3509	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_starts",
3510	    CTLFLAG_RD, &txq->r->starts,
3511	    "# of normal consumer starts in the mp_ring for this queue");
3512	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_stalls",
3513	    CTLFLAG_RD, &txq->r->stalls,
3514	    "# of consumer stalls in the mp_ring for this queue");
3515	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_restarts",
3516	    CTLFLAG_RD, &txq->r->restarts,
3517	    "# of consumer restarts in the mp_ring for this queue");
3518	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_abdications",
3519	    CTLFLAG_RD, &txq->r->abdications,
3520	    "# of consumer abdications in the mp_ring for this queue");
3521
3522	return (0);
3523}
3524
3525static int
3526free_txq(struct vi_info *vi, struct sge_txq *txq)
3527{
3528	int rc;
3529	struct adapter *sc = vi->pi->adapter;
3530	struct sge_eq *eq = &txq->eq;
3531
3532	rc = free_eq(sc, eq);
3533	if (rc)
3534		return (rc);
3535
3536	sglist_free(txq->gl);
3537	free(txq->sdesc, M_CXGBE);
3538	mp_ring_free(txq->r);
3539
3540	bzero(txq, sizeof(*txq));
3541	return (0);
3542}
3543
3544static void
3545oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error)
3546{
3547	bus_addr_t *ba = arg;
3548
3549	KASSERT(nseg == 1,
3550	    ("%s meant for single segment mappings only.", __func__));
3551
3552	*ba = error ? 0 : segs->ds_addr;
3553}
3554
3555static inline void
3556ring_fl_db(struct adapter *sc, struct sge_fl *fl)
3557{
3558	uint32_t n, v;
3559
3560	n = IDXDIFF(fl->pidx / 8, fl->dbidx, fl->sidx);
3561	MPASS(n > 0);
3562
3563	wmb();
3564	v = fl->dbval | V_PIDX(n);
3565	if (fl->udb)
3566		*fl->udb = htole32(v);
3567	else
3568		t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), v);
3569	IDXINCR(fl->dbidx, n, fl->sidx);
3570}
3571
3572/*
3573 * Fills up the freelist by allocating up to 'n' buffers.  Buffers that are
3574 * recycled do not count towards this allocation budget.
3575 *
3576 * Returns non-zero to indicate that this freelist should be added to the list
3577 * of starving freelists.
3578 */
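/*
 * Note on indexing: fl->pidx counts individual buffer slots (8 per 64 byte
 * hardware descriptor) while fl->sidx, fl->dbidx and the hardware cidx are
 * in whole descriptors, hence the pidx / 8 and ++pidx % 8 arithmetic here
 * and in ring_fl_db() above.
 */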
3579static int
3580refill_fl(struct adapter *sc, struct sge_fl *fl, int n)
3581{
3582	__be64 *d;
3583	struct fl_sdesc *sd;
3584	uintptr_t pa;
3585	caddr_t cl;
3586	struct cluster_layout *cll;
3587	struct sw_zone_info *swz;
3588	struct cluster_metadata *clm;
3589	uint16_t max_pidx;
3590	uint16_t hw_cidx = fl->hw_cidx;		/* stable snapshot */
3591
3592	FL_LOCK_ASSERT_OWNED(fl);
3593
3594	/*
3595	 * We always stop at the beginning of the hardware descriptor that's just
3596	 * before the one with the hw cidx.  This is to avoid hw pidx = hw cidx,
3597	 * which would mean an empty freelist to the chip.
3598	 */
3599	max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1;
3600	if (fl->pidx == max_pidx * 8)
3601		return (0);
3602
3603	d = &fl->desc[fl->pidx];
3604	sd = &fl->sdesc[fl->pidx];
3605	cll = &fl->cll_def;	/* default layout */
3606	swz = &sc->sge.sw_zone_info[cll->zidx];
3607
3608	while (n > 0) {
3609
3610		if (sd->cl != NULL) {
3611
3612			if (sd->nmbuf == 0) {
3613				/*
3614				 * Fast recycle without involving any atomics on
3615				 * the cluster's metadata (if the cluster has
3616				 * metadata).  This happens when all frames
3617				 * received in the cluster were small enough to
3618				 * fit within a single mbuf each.
3619				 */
3620				fl->cl_fast_recycled++;
3621#ifdef INVARIANTS
3622				clm = cl_metadata(sc, fl, &sd->cll, sd->cl);
3623				if (clm != NULL)
3624					MPASS(clm->refcount == 1);
3625#endif
3626				goto recycled_fast;
3627			}
3628
3629			/*
3630			 * Cluster is guaranteed to have metadata.  Clusters
3631			 * without metadata always take the fast recycle path
3632			 * when they're recycled.
3633			 */
3634			clm = cl_metadata(sc, fl, &sd->cll, sd->cl);
3635			MPASS(clm != NULL);
3636
3637			if (atomic_fetchadd_int(&clm->refcount, -1) == 1) {
3638				fl->cl_recycled++;
3639				counter_u64_add(extfree_rels, 1);
3640				goto recycled;
3641			}
3642			sd->cl = NULL;	/* gave up my reference */
3643		}
3644		MPASS(sd->cl == NULL);
3645alloc:
3646		cl = uma_zalloc(swz->zone, M_NOWAIT);
3647		if (__predict_false(cl == NULL)) {
3648			if (cll == &fl->cll_alt || fl->cll_alt.zidx == -1 ||
3649			    fl->cll_def.zidx == fl->cll_alt.zidx)
3650				break;
3651
3652			/* fall back to the safe zone */
3653			cll = &fl->cll_alt;
3654			swz = &sc->sge.sw_zone_info[cll->zidx];
3655			goto alloc;
3656		}
3657		fl->cl_allocated++;
3658		n--;
3659
3660		pa = pmap_kextract((vm_offset_t)cl);
3661		pa += cll->region1;
3662		sd->cl = cl;
3663		sd->cll = *cll;
3664		*d = htobe64(pa | cll->hwidx);
3665		clm = cl_metadata(sc, fl, cll, cl);
3666		if (clm != NULL) {
3667recycled:
3668#ifdef INVARIANTS
3669			clm->sd = sd;
3670#endif
3671			clm->refcount = 1;
3672		}
3673		sd->nmbuf = 0;
3674recycled_fast:
3675		d++;
3676		sd++;
3677		if (__predict_false(++fl->pidx % 8 == 0)) {
3678			uint16_t pidx = fl->pidx / 8;
3679
3680			if (__predict_false(pidx == fl->sidx)) {
3681				fl->pidx = 0;
3682				pidx = 0;
3683				sd = fl->sdesc;
3684				d = fl->desc;
3685			}
3686			if (pidx == max_pidx)
3687				break;
3688
3689			if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4)
3690				ring_fl_db(sc, fl);
3691		}
3692	}
3693
3694	if (fl->pidx / 8 != fl->dbidx)
3695		ring_fl_db(sc, fl);
3696
3697	return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING));
3698}
3699
3700/*
3701 * Attempt to refill all starving freelists.
3702 */
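/*
 * Intended to run from sc->sfl_callout.  Freelists that are still running
 * low (and not doomed) stay on the starving list and are retried every
 * hz / 5 ticks.
 */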
3703static void
3704refill_sfl(void *arg)
3705{
3706	struct adapter *sc = arg;
3707	struct sge_fl *fl, *fl_temp;
3708
3709	mtx_assert(&sc->sfl_lock, MA_OWNED);
3710	TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) {
3711		FL_LOCK(fl);
3712		refill_fl(sc, fl, 64);
3713		if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) {
3714			TAILQ_REMOVE(&sc->sfl, fl, link);
3715			fl->flags &= ~FL_STARVING;
3716		}
3717		FL_UNLOCK(fl);
3718	}
3719
3720	if (!TAILQ_EMPTY(&sc->sfl))
3721		callout_schedule(&sc->sfl_callout, hz / 5);
3722}
3723
3724static int
3725alloc_fl_sdesc(struct sge_fl *fl)
3726{
3727
3728	fl->sdesc = malloc(fl->sidx * 8 * sizeof(struct fl_sdesc), M_CXGBE,
3729	    M_ZERO | M_WAITOK);
3730
3731	return (0);
3732}
3733
3734static void
3735free_fl_sdesc(struct adapter *sc, struct sge_fl *fl)
3736{
3737	struct fl_sdesc *sd;
3738	struct cluster_metadata *clm;
3739	struct cluster_layout *cll;
3740	int i;
3741
3742	sd = fl->sdesc;
3743	for (i = 0; i < fl->sidx * 8; i++, sd++) {
3744		if (sd->cl == NULL)
3745			continue;
3746
3747		cll = &sd->cll;
3748		clm = cl_metadata(sc, fl, cll, sd->cl);
3749		if (sd->nmbuf == 0)
3750			uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl);
3751		else if (clm && atomic_fetchadd_int(&clm->refcount, -1) == 1) {
3752			uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl);
3753			counter_u64_add(extfree_rels, 1);
3754		}
3755		sd->cl = NULL;
3756	}
3757
3758	free(fl->sdesc, M_CXGBE);
3759	fl->sdesc = NULL;
3760}
3761
3762static inline void
3763get_pkt_gl(struct mbuf *m, struct sglist *gl)
3764{
3765	int rc;
3766
3767	M_ASSERTPKTHDR(m);
3768
3769	sglist_reset(gl);
3770	rc = sglist_append_mbuf(gl, m);
3771	if (__predict_false(rc != 0)) {
3772		panic("%s: mbuf %p (%d segs) was vetted earlier but now fails "
3773		    "with %d.", __func__, m, mbuf_nsegs(m), rc);
3774	}
3775
3776	KASSERT(gl->sg_nseg == mbuf_nsegs(m),
3777	    ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m,
3778	    mbuf_nsegs(m), gl->sg_nseg));
3779	KASSERT(gl->sg_nseg > 0 &&
3780	    gl->sg_nseg <= (needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS),
3781	    ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__,
3782		gl->sg_nseg, needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS));
3783}
3784
3785/*
3786 * len16 for a txpkt WR with a GL.  Includes the firmware work request header.
3787 */
3788static inline u_int
3789txpkt_len16(u_int nsegs, u_int tso)
3790{
3791	u_int n;
3792
3793	MPASS(nsegs > 0);
3794
3795	nsegs--; /* first segment is part of ulptx_sgl */
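	/*
	 * Each remaining pair of segments takes 3 flits (24 bytes) in the
	 * ulptx_sgl; an odd leftover segment still takes a full 2 flits.
	 */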
3796	n = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) +
3797	    sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
3798	if (tso)
3799		n += sizeof(struct cpl_tx_pkt_lso_core);
3800
3801	return (howmany(n, 16));
3802}
3803
3804/*
3805 * len16 for a txpkts type 0 WR with a GL.  Does not include the firmware work
3806 * request header.
3807 */
3808static inline u_int
3809txpkts0_len16(u_int nsegs)
3810{
3811	u_int n;
3812
3813	MPASS(nsegs > 0);
3814
3815	nsegs--; /* first segment is part of ulptx_sgl */
3816	n = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) +
3817	    sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) +
3818	    8 * ((3 * nsegs) / 2 + (nsegs & 1));
3819
3820	return (howmany(n, 16));
3821}
3822
3823/*
3824 * len16 for a txpkts type 1 WR with a GL.  Does not include the firmware work
3825 * request header.
3826 */
3827static inline u_int
3828txpkts1_len16(void)
3829{
3830	u_int n;
3831
3832	n = sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl);
3833
3834	return (howmany(n, 16));
3835}
3836
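/*
 * Max # of immediate payload bytes that fit in a txpkt work request that spans
 * 'ndesc' hardware descriptors, once the WR and CPL headers are accounted for.
 */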
3837static inline u_int
3838imm_payload(u_int ndesc)
3839{
3840	u_int n;
3841
3842	n = ndesc * EQ_ESIZE - sizeof(struct fw_eth_tx_pkt_wr) -
3843	    sizeof(struct cpl_tx_pkt_core);
3844
3845	return (n);
3846}
3847
3848/*
3849 * Write a txpkt WR for this packet to the hardware descriptors, update the
3850 * software descriptor, and advance the pidx.  It is guaranteed that enough
3851 * descriptors are available.
3852 *
3853 * The return value is the # of hardware descriptors used.
3854 */
3855static u_int
3856write_txpkt_wr(struct sge_txq *txq, struct fw_eth_tx_pkt_wr *wr,
3857    struct mbuf *m0, u_int available)
3858{
3859	struct sge_eq *eq = &txq->eq;
3860	struct tx_sdesc *txsd;
3861	struct cpl_tx_pkt_core *cpl;
3862	uint32_t ctrl;	/* used in many unrelated places */
3863	uint64_t ctrl1;
3864	int len16, ndesc, pktlen, nsegs;
3865	caddr_t dst;
3866
3867	TXQ_LOCK_ASSERT_OWNED(txq);
3868	M_ASSERTPKTHDR(m0);
3869	MPASS(available > 0 && available < eq->sidx);
3870
3871	len16 = mbuf_len16(m0);
3872	nsegs = mbuf_nsegs(m0);
3873	pktlen = m0->m_pkthdr.len;
3874	ctrl = sizeof(struct cpl_tx_pkt_core);
3875	if (needs_tso(m0))
3876		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
3877	else if (pktlen <= imm_payload(2) && available >= 2) {
3878		/* Immediate data.  Recalculate len16 and set nsegs to 0. */
3879		ctrl += pktlen;
3880		len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) +
3881		    sizeof(struct cpl_tx_pkt_core) + pktlen, 16);
3882		nsegs = 0;
3883	}
3884	ndesc = howmany(len16, EQ_ESIZE / 16);
3885	MPASS(ndesc <= available);
3886
3887	/* Firmware work request header */
3888	MPASS(wr == (void *)&eq->desc[eq->pidx]);
3889	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
3890	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
3891
3892	ctrl = V_FW_WR_LEN16(len16);
3893	wr->equiq_to_len16 = htobe32(ctrl);
3894	wr->r3 = 0;
3895
3896	if (needs_tso(m0)) {
3897		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
3898
3899		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
3900		    m0->m_pkthdr.l4hlen > 0,
3901		    ("%s: mbuf %p needs TSO but missing header lengths",
3902			__func__, m0));
3903
3904		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
3905		    F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2)
3906		    | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
3907		if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header))
3908			ctrl |= V_LSO_ETHHDR_LEN(1);
3909		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
3910			ctrl |= F_LSO_IPV6;
3911
3912		lso->lso_ctrl = htobe32(ctrl);
3913		lso->ipid_ofst = htobe16(0);
3914		lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
3915		lso->seqno_offset = htobe32(0);
3916		lso->len = htobe32(pktlen);
3917
3918		cpl = (void *)(lso + 1);
3919
3920		txq->tso_wrs++;
3921	} else
3922		cpl = (void *)(wr + 1);
3923
3924	/* Checksum offload */
3925	ctrl1 = 0;
3926	if (needs_l3_csum(m0) == 0)
3927		ctrl1 |= F_TXPKT_IPCSUM_DIS;
3928	if (needs_l4_csum(m0) == 0)
3929		ctrl1 |= F_TXPKT_L4CSUM_DIS;
3930	if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
3931	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
3932		txq->txcsum++;	/* some hardware assistance provided */
3933
3934	/* VLAN tag insertion */
3935	if (needs_vlan_insertion(m0)) {
3936		ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
3937		txq->vlan_insertion++;
3938	}
3939
3940	/* CPL header */
3941	cpl->ctrl0 = txq->cpl_ctrl0;
3942	cpl->pack = 0;
3943	cpl->len = htobe16(pktlen);
3944	cpl->ctrl1 = htobe64(ctrl1);
3945
3946	/* SGL */
3947	dst = (void *)(cpl + 1);
3948	if (nsegs > 0) {
3950		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
3951		txq->sgl_wrs++;
3952	} else {
3953		struct mbuf *m;
3954
3955		for (m = m0; m != NULL; m = m->m_next) {
3956			copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
3957#ifdef INVARIANTS
3958			pktlen -= m->m_len;
3959#endif
3960		}
3961#ifdef INVARIANTS
3962		KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen));
3963#endif
3964		txq->imm_wrs++;
3965	}
3966
3967	txq->txpkt_wrs++;
3968
3969	txsd = &txq->sdesc[eq->pidx];
3970	txsd->m = m0;
3971	txsd->desc_used = ndesc;
3972
3973	return (ndesc);
3974}
3975
3976static int
3977try_txpkts(struct mbuf *m, struct mbuf *n, struct txpkts *txp, u_int available)
3978{
3979	u_int needed, nsegs1, nsegs2, l1, l2;
3980
3981	if (cannot_use_txpkts(m) || cannot_use_txpkts(n))
3982		return (1);
3983
3984	nsegs1 = mbuf_nsegs(m);
3985	nsegs2 = mbuf_nsegs(n);
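	/*
	 * Type 1 coalescing is possible only when every packet has a single
	 * SGL segment; anything else must use type 0, which carries a ULP
	 * header per packet and therefore takes more space.
	 */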
3986	if (nsegs1 + nsegs2 == 2) {
3987		txp->wr_type = 1;
3988		l1 = l2 = txpkts1_len16();
3989	} else {
3990		txp->wr_type = 0;
3991		l1 = txpkts0_len16(nsegs1);
3992		l2 = txpkts0_len16(nsegs2);
3993	}
3994	txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + l1 + l2;
3995	needed = howmany(txp->len16, EQ_ESIZE / 16);
3996	if (needed > SGE_MAX_WR_NDESC || needed > available)
3997		return (1);
3998
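	/* plen is a 16-bit field in the txpkts work request. */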
3999	txp->plen = m->m_pkthdr.len + n->m_pkthdr.len;
4000	if (txp->plen > 65535)
4001		return (1);
4002
4003	txp->npkt = 2;
4004	set_mbuf_len16(m, l1);
4005	set_mbuf_len16(n, l2);
4006
4007	return (0);
4008}
4009
4010static int
4011add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available)
4012{
4013	u_int plen, len16, needed, nsegs;
4014
4015	MPASS(txp->wr_type == 0 || txp->wr_type == 1);
4016
4017	nsegs = mbuf_nsegs(m);
4018	if (needs_tso(m) || (txp->wr_type == 1 && nsegs != 1))
4019		return (1);
4020
4021	plen = txp->plen + m->m_pkthdr.len;
4022	if (plen > 65535)
4023		return (1);
4024
4025	if (txp->wr_type == 0)
4026		len16 = txpkts0_len16(nsegs);
4027	else
4028		len16 = txpkts1_len16();
4029	needed = howmany(txp->len16 + len16, EQ_ESIZE / 16);
4030	if (needed > SGE_MAX_WR_NDESC || needed > available)
4031		return (1);
4032
4033	txp->npkt++;
4034	txp->plen = plen;
4035	txp->len16 += len16;
4036	set_mbuf_len16(m, len16);
4037
4038	return (0);
4039}
4040
4041/*
4042 * Write a txpkts WR for the packets in txp to the hardware descriptors, update
4043 * the software descriptor, and advance the pidx.  It is guaranteed that enough
4044 * descriptors are available.
4045 *
4046 * The return value is the # of hardware descriptors used.
4047 */
4048static u_int
4049write_txpkts_wr(struct sge_txq *txq, struct fw_eth_tx_pkts_wr *wr,
4050    struct mbuf *m0, const struct txpkts *txp, u_int available)
4051{
4052	struct sge_eq *eq = &txq->eq;
4053	struct tx_sdesc *txsd;
4054	struct cpl_tx_pkt_core *cpl;
4055	uint32_t ctrl;
4056	uint64_t ctrl1;
4057	int ndesc, checkwrap;
4058	struct mbuf *m;
4059	void *flitp;
4060
4061	TXQ_LOCK_ASSERT_OWNED(txq);
4062	MPASS(txp->npkt > 0);
4063	MPASS(txp->plen < 65536);
4064	MPASS(m0 != NULL);
4065	MPASS(m0->m_nextpkt != NULL);
4066	MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16));
4067	MPASS(available > 0 && available < eq->sidx);
4068
4069	ndesc = howmany(txp->len16, EQ_ESIZE / 16);
4070	MPASS(ndesc <= available);
4071
4072	MPASS(wr == (void *)&eq->desc[eq->pidx]);
4073	wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR));
4074	ctrl = V_FW_WR_LEN16(txp->len16);
4075	wr->equiq_to_len16 = htobe32(ctrl);
4076	wr->plen = htobe16(txp->plen);
4077	wr->npkt = txp->npkt;
4078	wr->r3 = 0;
4079	wr->type = txp->wr_type;
4080	flitp = wr + 1;
4081
4082	/*
4083	 * At this point we are 16B into a hardware descriptor.  If checkwrap is
4084	 * set then we know the WR is going to wrap around somewhere.  We'll
4085	 * check for that at appropriate points.
4086	 */
4087	checkwrap = eq->sidx - ndesc < eq->pidx;
4088	for (m = m0; m != NULL; m = m->m_nextpkt) {
4089		if (txp->wr_type == 0) {
4090			struct ulp_txpkt *ulpmc;
4091			struct ulptx_idata *ulpsc;
4092
4093			/* ULP master command */
4094			ulpmc = flitp;
4095			ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
4096			    V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid));
4097			ulpmc->len = htobe32(mbuf_len16(m));
4098
4099			/* ULP subcommand */
4100			ulpsc = (void *)(ulpmc + 1);
4101			ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) |
4102			    F_ULP_TX_SC_MORE);
4103			ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core));
4104
4105			cpl = (void *)(ulpsc + 1);
4106			if (checkwrap &&
4107			    (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx])
4108				cpl = (void *)&eq->desc[0];
4109			txq->txpkts0_pkts += txp->npkt;
4110			txq->txpkts0_wrs++;
4111		} else {
4112			cpl = flitp;
4113			txq->txpkts1_pkts += txp->npkt;
4114			txq->txpkts1_wrs++;
4115		}
4116
4117		/* Checksum offload */
4118		ctrl1 = 0;
4119		if (needs_l3_csum(m) == 0)
4120			ctrl1 |= F_TXPKT_IPCSUM_DIS;
4121		if (needs_l4_csum(m) == 0)
4122			ctrl1 |= F_TXPKT_L4CSUM_DIS;
4123		if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
4124		    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
4125			txq->txcsum++;	/* some hardware assistance provided */
4126
4127		/* VLAN tag insertion */
4128		if (needs_vlan_insertion(m)) {
4129			ctrl1 |= F_TXPKT_VLAN_VLD |
4130			    V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
4131			txq->vlan_insertion++;
4132		}
4133
4134		/* CPL header */
4135		cpl->ctrl0 = txq->cpl_ctrl0;
4136		cpl->pack = 0;
4137		cpl->len = htobe16(m->m_pkthdr.len);
4138		cpl->ctrl1 = htobe64(ctrl1);
4139
4140		flitp = cpl + 1;
4141		if (checkwrap &&
4142		    (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx])
4143			flitp = (void *)&eq->desc[0];
4144
4145		write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap);
4147	}
4148
4149	txsd = &txq->sdesc[eq->pidx];
4150	txsd->m = m0;
4151	txsd->desc_used = ndesc;
4152
4153	return (ndesc);
4154}
4155
4156/*
4157 * If the SGL ends on an address that is not 16-byte aligned, this function
4158 * adds a zero-filled pad flit so that the next write starts 16-byte aligned.
4159 */
4160static void
4161write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap)
4162{
4163	struct sge_eq *eq = &txq->eq;
4164	struct sglist *gl = txq->gl;
4165	struct sglist_seg *seg;
4166	__be64 *flitp, *wrap;
4167	struct ulptx_sgl *usgl;
4168	int i, nflits, nsegs;
4169
4170	KASSERT(((uintptr_t)(*to) & 0xf) == 0,
4171	    ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to));
4172	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
4173	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
4174
4175	get_pkt_gl(m, gl);
4176	nsegs = gl->sg_nseg;
4177	MPASS(nsegs > 0);
4178
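	/*
	 * 2 flits for the ulptx_sgl header and first segment, 3 flits for
	 * every remaining pair of segments; an odd leftover segment is
	 * rounded up to a full pair (its unused length slot is zeroed below).
	 */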
4179	nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2;
4180	flitp = (__be64 *)(*to);
4181	wrap = (__be64 *)(&eq->desc[eq->sidx]);
4182	seg = &gl->sg_segs[0];
4183	usgl = (void *)flitp;
4184
4185	/*
4186	 * We start at a 16 byte boundary somewhere inside the tx descriptor
4187	 * ring, so we're at least 16 bytes away from the status page.  There is
4188	 * no chance of a wrap around in the middle of usgl (which is 16 bytes).
4189	 */
4190
4191	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
4192	    V_ULPTX_NSGE(nsegs));
4193	usgl->len0 = htobe32(seg->ss_len);
4194	usgl->addr0 = htobe64(seg->ss_paddr);
4195	seg++;
4196
4197	if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) {
4198
4199		/* Won't wrap around at all */
4200
4201		for (i = 0; i < nsegs - 1; i++, seg++) {
4202			usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len);
4203			usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr);
4204		}
4205		if (i & 1)
4206			usgl->sge[i / 2].len[1] = htobe32(0);
4207		flitp += nflits;
4208	} else {
4209
4210		/* Will wrap somewhere in the rest of the SGL */
4211
4212		/* 2 flits already written, write the rest flit by flit */
4213		flitp = (void *)(usgl + 1);
4214		for (i = 0; i < nflits - 2; i++) {
4215			if (flitp == wrap)
4216				flitp = (void *)eq->desc;
4217			*flitp++ = get_flit(seg, nsegs - 1, i);
4218		}
4219	}
4220
4221	if (nflits & 1) {
4222		MPASS(((uintptr_t)flitp) & 0xf);
4223		*flitp++ = 0;
4224	}
4225
4226	MPASS((((uintptr_t)flitp) & 0xf) == 0);
4227	if (__predict_false(flitp == wrap))
4228		*to = (void *)eq->desc;
4229	else
4230		*to = (void *)flitp;
4231}
4232
4233static inline void
4234copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
4235{
4236
4237	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
4238	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
4239
4240	if (__predict_true((uintptr_t)(*to) + len <=
4241	    (uintptr_t)&eq->desc[eq->sidx])) {
4242		bcopy(from, *to, len);
4243		(*to) += len;
4244	} else {
4245		int portion = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to);
4246
4247		bcopy(from, *to, portion);
4248		from += portion;
4249		portion = len - portion;	/* remaining */
4250		bcopy(from, (void *)eq->desc, portion);
4251		(*to) = (caddr_t)eq->desc + portion;
4252	}
4253}
4254
4255static inline void
4256ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n)
4257{
4258	u_int db;
4259
4260	MPASS(n > 0);
4261
4262	db = eq->doorbells;
4263	if (n > 1)
4264		clrbit(&db, DOORBELL_WCWR);
4265	wmb();
4266
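	/*
	 * ffs() picks the lowest-numbered doorbell mechanism still set in db.
	 * WCWR was knocked out above for n > 1 because it can push only a
	 * single descriptor at a time.
	 */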
4267	switch (ffs(db) - 1) {
4268	case DOORBELL_UDB:
4269		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
4270		break;
4271
4272	case DOORBELL_WCWR: {
4273		volatile uint64_t *dst, *src;
4274		int i;
4275
4276		/*
4277		 * Queues whose 128B doorbell segment fits in the page do not
4278		 * use a relative qid (udb_qid is always 0), and only such
4279		 * queues can do WCWR.
4280		 */
4281		KASSERT(eq->udb_qid == 0 && n == 1,
4282		    ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p",
4283		    __func__, eq->doorbells, n, eq->dbidx, eq));
4284
4285		dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET -
4286		    UDBS_DB_OFFSET);
4287		i = eq->dbidx;
4288		src = (void *)&eq->desc[i];
4289		while (src != (void *)&eq->desc[i + 1])
4290			*dst++ = *src++;
4291		wmb();
4292		break;
4293	}
4294
4295	case DOORBELL_UDBWC:
4296		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
4297		wmb();
4298		break;
4299
4300	case DOORBELL_KDB:
4301		t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL),
4302		    V_QID(eq->cntxt_id) | V_PIDX(n));
4303		break;
4304	}
4305
4306	IDXINCR(eq->dbidx, n, eq->sidx);
4307}
4308
4309static inline u_int
4310reclaimable_tx_desc(struct sge_eq *eq)
4311{
4312	uint16_t hw_cidx;
4313
4314	hw_cidx = read_hw_cidx(eq);
4315	return (IDXDIFF(hw_cidx, eq->cidx, eq->sidx));
4316}
4317
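/*
 * Total # of tx descriptors that are unused as far as the hardware is
 * concerned.  One descriptor is always left unused so that a completely full
 * ring can be distinguished from an empty one.
 */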
4318static inline u_int
4319total_available_tx_desc(struct sge_eq *eq)
4320{
4321	uint16_t hw_cidx, pidx;
4322
4323	hw_cidx = read_hw_cidx(eq);
4324	pidx = eq->pidx;
4325
4326	if (pidx == hw_cidx)
4327		return (eq->sidx - 1);
4328	else
4329		return (IDXDIFF(hw_cidx, pidx, eq->sidx) - 1);
4330}
4331
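/*
 * The hardware posts its current consumer index to the status page that sits
 * just past the last descriptor of the ring.
 */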
4332static inline uint16_t
4333read_hw_cidx(struct sge_eq *eq)
4334{
4335	struct sge_qstat *spg = (void *)&eq->desc[eq->sidx];
4336	uint16_t cidx = spg->cidx;	/* stable snapshot */
4337
4338	return (be16toh(cidx));
4339}
4340
4341/*
4342 * Reclaim approximately 'n' tx descriptors; whole work requests are
 * reclaimed at a time, so slightly more than 'n' may be freed.
4343 */
4344static u_int
4345reclaim_tx_descs(struct sge_txq *txq, u_int n)
4346{
4347	struct tx_sdesc *txsd;
4348	struct sge_eq *eq = &txq->eq;
4349	u_int can_reclaim, reclaimed;
4350
4351	TXQ_LOCK_ASSERT_OWNED(txq);
4352	MPASS(n > 0);
4353
4354	reclaimed = 0;
4355	can_reclaim = reclaimable_tx_desc(eq);
4356	while (can_reclaim && reclaimed < n) {
4357		int ndesc;
4358		struct mbuf *m, *nextpkt;
4359
4360		txsd = &txq->sdesc[eq->cidx];
4361		ndesc = txsd->desc_used;
4362
4363		/* Firmware doesn't return "partial" credits. */
4364		KASSERT(can_reclaim >= ndesc,
4365		    ("%s: unexpected number of credits: %d, %d",
4366		    __func__, can_reclaim, ndesc));
4367
4368		for (m = txsd->m; m != NULL; m = nextpkt) {
4369			nextpkt = m->m_nextpkt;
4370			m->m_nextpkt = NULL;
4371			m_freem(m);
4372		}
4373		reclaimed += ndesc;
4374		can_reclaim -= ndesc;
4375		IDXINCR(eq->cidx, ndesc, eq->sidx);
4376	}
4377
4378	return (reclaimed);
4379}
4380
4381static void
4382tx_reclaim(void *arg, int n)
4383{
4384	struct sge_txq *txq = arg;
4385	struct sge_eq *eq = &txq->eq;
4386
4387	do {
4388		if (TXQ_TRYLOCK(txq) == 0)
4389			break;
4390		n = reclaim_tx_descs(txq, 32);
4391		if (eq->cidx == eq->pidx)
4392			eq->equeqidx = eq->pidx;
4393		TXQ_UNLOCK(txq);
4394	} while (n > 0);
4395}
4396
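/*
 * The flits of an ulptx_sgl past the first two repeat in groups of three: one
 * flit carrying a pair of segment lengths followed by the two corresponding
 * segment addresses.  Returns flit 'idx' of that tail.
 */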
4397static __be64
4398get_flit(struct sglist_seg *segs, int nsegs, int idx)
4399{
4400	int i = (idx / 3) * 2;
4401
4402	switch (idx % 3) {
4403	case 0: {
4404		__be64 rc;
4405
4406		rc = htobe32(segs[i].ss_len);
4407		if (i + 1 < nsegs)
4408			rc |= (uint64_t)htobe32(segs[i + 1].ss_len) << 32;
4409
4410		return (rc);
4411	}
4412	case 1:
4413		return (htobe64(segs[i].ss_paddr));
4414	case 2:
4415		return (htobe64(segs[i + 1].ss_paddr));
4416	}
4417
4418	return (0);
4419}
4420
4421static void
4422find_best_refill_source(struct adapter *sc, struct sge_fl *fl, int maxp)
4423{
4424	int8_t zidx, hwidx, idx;
4425	uint16_t region1, region3;
4426	int spare, spare_needed, n;
4427	struct sw_zone_info *swz;
4428	struct hw_buf_info *hwb, *hwb_list = &sc->sge.hw_buf_info[0];
4429
4430	/*
4431	 * Buffer Packing: Look for PAGE_SIZE or larger zone which has a bufsize
4432	 * large enough for the max payload and cluster metadata.  Otherwise
4433	 * settle for the largest bufsize that leaves enough room in the cluster
4434	 * for metadata.
4435	 *
4436	 * Without buffer packing: Look for the smallest zone which has a
4437	 * bufsize large enough for the max payload.  Settle for the largest
4438	 * bufsize available if there's nothing big enough for max payload.
4439	 */
4440	spare_needed = fl->flags & FL_BUF_PACKING ? CL_METADATA_SIZE : 0;
4441	swz = &sc->sge.sw_zone_info[0];
4442	hwidx = -1;
4443	for (zidx = 0; zidx < SW_ZONE_SIZES; zidx++, swz++) {
4444		if (swz->size > largest_rx_cluster) {
4445			if (__predict_true(hwidx != -1))
4446				break;
4447
4448			/*
4449			 * This is a misconfiguration.  largest_rx_cluster is
4450			 * preventing us from finding a refill source.  See
4451			 * dev.t5nex.<n>.buffer_sizes to figure out why.
4452			 */
4453			device_printf(sc->dev, "largest_rx_cluster=%u leaves no"
4454			    " refill source for fl %p (dma %u).  Ignored.\n",
4455			    largest_rx_cluster, fl, maxp);
4456		}
4457		for (idx = swz->head_hwidx; idx != -1; idx = hwb->next) {
4458			hwb = &hwb_list[idx];
4459			spare = swz->size - hwb->size;
4460			if (spare < spare_needed)
4461				continue;
4462
4463			hwidx = idx;		/* best option so far */
4464			if (hwb->size >= maxp) {
4466				if ((fl->flags & FL_BUF_PACKING) == 0)
4467					goto done; /* stop looking (not packing) */
4468
4469				if (swz->size >= safest_rx_cluster)
4470					goto done; /* stop looking (packing) */
4471			}
4472			break;		/* keep looking, next zone */
4473		}
4474	}
4475done:
4476	/* A usable hwidx has been located. */
4477	MPASS(hwidx != -1);
4478	hwb = &hwb_list[hwidx];
4479	zidx = hwb->zidx;
4480	swz = &sc->sge.sw_zone_info[zidx];
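	/*
	 * Cluster layout: region1 (space for inlined mbufs, possibly empty)
	 * at the start, the hardware buffer (payload) in the middle, and
	 * region3 (spare space, holds the cluster metadata when needed) at
	 * the end.
	 */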
4481	region1 = 0;
4482	region3 = swz->size - hwb->size;
4483
4484	/*
4485	 * Stay within this zone and see if there is a better match when mbuf
4486	 * inlining is allowed.  Remember that the hwidx's are sorted in
4487	 * decreasing order of size (so in increasing order of spare area).
4488	 */
4489	for (idx = hwidx; idx != -1; idx = hwb->next) {
4490		hwb = &hwb_list[idx];
4491		spare = swz->size - hwb->size;
4492
4493		if (allow_mbufs_in_cluster == 0 || hwb->size < maxp)
4494			break;
4495
4496		/*
4497		 * Do not inline mbufs if doing so would violate the pad/pack
4498		 * boundary alignment requirement.
4499		 */
4500		if (fl_pad && (MSIZE % sc->params.sge.pad_boundary) != 0)
4501			continue;
4502		if (fl->flags & FL_BUF_PACKING &&
4503		    (MSIZE % sc->params.sge.pack_boundary) != 0)
4504			continue;
4505
4506		if (spare < CL_METADATA_SIZE + MSIZE)
4507			continue;
4508		n = (spare - CL_METADATA_SIZE) / MSIZE;
4509		if (n > howmany(hwb->size, maxp))
4510			break;
4511
4512		hwidx = idx;
4513		if (fl->flags & FL_BUF_PACKING) {
4514			region1 = n * MSIZE;
4515			region3 = spare - region1;
4516		} else {
4517			region1 = MSIZE;
4518			region3 = spare - region1;
4519			break;
4520		}
4521	}
4522
4523	KASSERT(zidx >= 0 && zidx < SW_ZONE_SIZES,
4524	    ("%s: bad zone %d for fl %p, maxp %d", __func__, zidx, fl, maxp));
4525	KASSERT(hwidx >= 0 && hwidx < SGE_FLBUF_SIZES,
4526	    ("%s: bad hwidx %d for fl %p, maxp %d", __func__, hwidx, fl, maxp));
4527	KASSERT(region1 + sc->sge.hw_buf_info[hwidx].size + region3 ==
4528	    sc->sge.sw_zone_info[zidx].size,
4529	    ("%s: bad buffer layout for fl %p, maxp %d. "
4530		"cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
4531		sc->sge.sw_zone_info[zidx].size, region1,
4532		sc->sge.hw_buf_info[hwidx].size, region3));
4533	if (fl->flags & FL_BUF_PACKING || region1 > 0) {
4534		KASSERT(region3 >= CL_METADATA_SIZE,
4535		    ("%s: no room for metadata.  fl %p, maxp %d; "
4536		    "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
4537		    sc->sge.sw_zone_info[zidx].size, region1,
4538		    sc->sge.hw_buf_info[hwidx].size, region3));
4539		KASSERT(region1 % MSIZE == 0,
4540		    ("%s: bad mbuf region for fl %p, maxp %d. "
4541		    "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
4542		    sc->sge.sw_zone_info[zidx].size, region1,
4543		    sc->sge.hw_buf_info[hwidx].size, region3));
4544	}
4545
4546	fl->cll_def.zidx = zidx;
4547	fl->cll_def.hwidx = hwidx;
4548	fl->cll_def.region1 = region1;
4549	fl->cll_def.region3 = region3;
4550}
4551
4552static void
4553find_safe_refill_source(struct adapter *sc, struct sge_fl *fl)
4554{
4555	struct sge *s = &sc->sge;
4556	struct hw_buf_info *hwb;
4557	struct sw_zone_info *swz;
4558	int spare;
4559	int8_t hwidx;
4560
4561	if (fl->flags & FL_BUF_PACKING)
4562		hwidx = s->safe_hwidx2;	/* with room for metadata */
4563	else if (allow_mbufs_in_cluster && s->safe_hwidx2 != -1) {
4564		hwidx = s->safe_hwidx2;
4565		hwb = &s->hw_buf_info[hwidx];
4566		swz = &s->sw_zone_info[hwb->zidx];
4567		spare = swz->size - hwb->size;
4568
4569		/* no good if there isn't room for an mbuf as well */
4570		if (spare < CL_METADATA_SIZE + MSIZE)
4571			hwidx = s->safe_hwidx1;
4572	} else
4573		hwidx = s->safe_hwidx1;
4574
4575	if (hwidx == -1) {
4576		/* No fallback source */
4577		fl->cll_alt.hwidx = -1;
4578		fl->cll_alt.zidx = -1;
4579
4580		return;
4581	}
4582
4583	hwb = &s->hw_buf_info[hwidx];
4584	swz = &s->sw_zone_info[hwb->zidx];
4585	spare = swz->size - hwb->size;
4586	fl->cll_alt.hwidx = hwidx;
4587	fl->cll_alt.zidx = hwb->zidx;
4588	if (allow_mbufs_in_cluster &&
4589	    (fl_pad == 0 || (MSIZE % sc->params.sge.pad_boundary) == 0))
4590		fl->cll_alt.region1 = ((spare - CL_METADATA_SIZE) / MSIZE) * MSIZE;
4591	else
4592		fl->cll_alt.region1 = 0;
4593	fl->cll_alt.region3 = spare - fl->cll_alt.region1;
4594}
4595
4596static void
4597add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl)
4598{
4599	mtx_lock(&sc->sfl_lock);
4600	FL_LOCK(fl);
4601	if ((fl->flags & FL_DOOMED) == 0) {
4602		fl->flags |= FL_STARVING;
4603		TAILQ_INSERT_TAIL(&sc->sfl, fl, link);
4604		callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc);
4605	}
4606	FL_UNLOCK(fl);
4607	mtx_unlock(&sc->sfl_lock);
4608}
4609
4610static void
4611handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq)
4612{
4613	struct sge_wrq *wrq = (void *)eq;
4614
4615	atomic_readandclear_int(&eq->equiq);
4616	taskqueue_enqueue(sc->tq[eq->tx_chan], &wrq->wrq_tx_task);
4617}
4618
4619static void
4620handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq)
4621{
4622	struct sge_txq *txq = (void *)eq;
4623
4624	MPASS((eq->flags & EQ_TYPEMASK) == EQ_ETH);
4625
4626	atomic_readandclear_int(&eq->equiq);
4627	mp_ring_check_drainage(txq->r, 0);
4628	taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task);
4629}
4630
4631static int
4632handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss,
4633    struct mbuf *m)
4634{
4635	const struct cpl_sge_egr_update *cpl = (const void *)(rss + 1);
4636	unsigned int qid = G_EGR_QID(ntohl(cpl->opcode_qid));
4637	struct adapter *sc = iq->adapter;
4638	struct sge *s = &sc->sge;
4639	struct sge_eq *eq;
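	/*
	 * Handler table indexed by eq->flags & EQ_TYPEMASK: control and
	 * offload work request queues share handle_wrq_egr_update, Ethernet
	 * tx queues get handle_eth_egr_update.
	 */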
4640	static void (*h[])(struct adapter *, struct sge_eq *) = {NULL,
4641		&handle_wrq_egr_update, &handle_eth_egr_update,
4642		&handle_wrq_egr_update};
4643
4644	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
4645	    rss->opcode));
4646
4647	eq = s->eqmap[qid - s->eq_start];
4648	(*h[eq->flags & EQ_TYPEMASK])(sc, eq);
4649
4650	return (0);
4651}
4652
4653/* handle_fw_msg works for both fw4_msg and fw6_msg because the data field
 * is at the same offset in both messages (asserted below). */
4654CTASSERT(offsetof(struct cpl_fw4_msg, data) == \
4655    offsetof(struct cpl_fw6_msg, data));
4656
4657static int
4658handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
4659{
4660	struct adapter *sc = iq->adapter;
4661	const struct cpl_fw6_msg *cpl = (const void *)(rss + 1);
4662
4663	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
4664	    rss->opcode));
4665
4666	if (cpl->type == FW_TYPE_RSSCPL || cpl->type == FW6_TYPE_RSSCPL) {
4667		const struct rss_header *rss2;
4668
4669		rss2 = (const struct rss_header *)&cpl->data[0];
4670		return (sc->cpl_handler[rss2->opcode](iq, rss2, m));
4671	}
4672
4673	return (sc->fw_msg_handler[cpl->type](sc, &cpl->data[0]));
4674}
4675
4676static int
4677sysctl_uint16(SYSCTL_HANDLER_ARGS)
4678{
4679	uint16_t *id = arg1;
4680	int i = *id;
4681
4682	return sysctl_handle_int(oidp, &i, 0, req);
4683}
4684
4685static int
4686sysctl_bufsizes(SYSCTL_HANDLER_ARGS)
4687{
4688	struct sge *s = arg1;
4689	struct hw_buf_info *hwb = &s->hw_buf_info[0];
4690	struct sw_zone_info *swz = &s->sw_zone_info[0];
4691	int i, rc;
4692	struct sbuf sb;
4693	char c;
4694
4695	sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND);
4696	for (i = 0; i < SGE_FLBUF_SIZES; i++, hwb++) {
4697		if (hwb->zidx >= 0 && swz[hwb->zidx].size <= largest_rx_cluster)
4698			c = '*';
4699		else
4700			c = '\0';
4701
4702		sbuf_printf(&sb, "%u%c ", hwb->size, c);
4703	}
4704	sbuf_trim(&sb);
4705	sbuf_finish(&sb);
4706	rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
4707	sbuf_delete(&sb);
4708	return (rc);
4709}
4710