t4_sge.c revision 281955
1/*-
2 * Copyright (c) 2011 Chelsio Communications, Inc.
3 * All rights reserved.
4 * Written by: Navdeep Parhar <np@FreeBSD.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: stable/10/sys/dev/cxgbe/t4_sge.c 281955 2015-04-24 23:26:44Z hiren $");
30
31#include "opt_inet.h"
32#include "opt_inet6.h"
33
34#include <sys/types.h>
35#include <sys/mbuf.h>
36#include <sys/socket.h>
37#include <sys/kernel.h>
38#include <sys/kdb.h>
39#include <sys/malloc.h>
40#include <sys/queue.h>
41#include <sys/sbuf.h>
42#include <sys/taskqueue.h>
43#include <sys/time.h>
44#include <sys/sysctl.h>
45#include <sys/smp.h>
46#include <sys/counter.h>
47#include <net/bpf.h>
48#include <net/ethernet.h>
49#include <net/if.h>
50#include <net/if_vlan_var.h>
51#include <netinet/in.h>
52#include <netinet/ip.h>
53#include <netinet/ip6.h>
54#include <netinet/tcp.h>
55#include <machine/md_var.h>
56#include <vm/vm.h>
57#include <vm/pmap.h>
58#ifdef DEV_NETMAP
59#include <machine/bus.h>
60#include <sys/selinfo.h>
61#include <net/if_var.h>
62#include <net/netmap.h>
63#include <dev/netmap/netmap_kern.h>
64#endif
65
66#include "common/common.h"
67#include "common/t4_regs.h"
68#include "common/t4_regs_values.h"
69#include "common/t4_msg.h"
70
71#ifdef T4_PKT_TIMESTAMP
72#define RX_COPY_THRESHOLD (MINCLSIZE - 8)
73#else
74#define RX_COPY_THRESHOLD MINCLSIZE
75#endif
76
77/*
78 * Ethernet frames are DMA'd at this byte offset into the freelist buffer.
79 * 0-7 are valid values.
80 */
81int fl_pktshift = 2;
82TUNABLE_INT("hw.cxgbe.fl_pktshift", &fl_pktshift);
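
/*
 * Illustration: with the default fl_pktshift of 2, a frame DMA'd 2 bytes into
 * the buffer puts its 14-byte Ethernet header at offsets 2-15, so the IP
 * header that follows starts at offset 16 and is 4-byte aligned.
 */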
83
84/*
85 * Pad ethernet payload up to this boundary.
86 * -1: driver should figure out a good value.
87 *  0: disable padding.
88 *  Any power of 2 from 32 to 4096 (both inclusive) is also a valid value.
89 */
90int fl_pad = -1;
91TUNABLE_INT("hw.cxgbe.fl_pad", &fl_pad);
92
93/*
94 * Status page length.
95 * -1: driver should figure out a good value.
96 *  64 or 128 are the only other valid values.
97 */
98int spg_len = -1;
99TUNABLE_INT("hw.cxgbe.spg_len", &spg_len);
100
101/*
102 * Congestion drops.
103 * -1: no congestion feedback (not recommended).
104 *  0: backpressure the channel instead of dropping packets right away.
105 *  1: no backpressure, drop packets for the congested queue immediately.
106 */
107static int cong_drop = 0;
108TUNABLE_INT("hw.cxgbe.cong_drop", &cong_drop);
109
110/*
111 * Deliver multiple frames in the same free list buffer if they fit.
112 * -1: let the driver decide whether to enable buffer packing or not.
113 *  0: disable buffer packing.
114 *  1: enable buffer packing.
115 */
116static int buffer_packing = -1;
117TUNABLE_INT("hw.cxgbe.buffer_packing", &buffer_packing);
118
119/*
120 * Start next frame in a packed buffer at this boundary.
121 * -1: driver should figure out a good value.
122 * T4: driver will ignore this and use the same value as fl_pad above.
123 * T5: 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value.
124 */
125static int fl_pack = -1;
126TUNABLE_INT("hw.cxgbe.fl_pack", &fl_pack);
127
128/*
129 * Allow the driver to create mbuf(s) in a cluster allocated for rx.
130 * 0: never; always allocate mbufs from the zone_mbuf UMA zone.
131 * 1: ok to create mbuf(s) within a cluster if there is room.
132 */
133static int allow_mbufs_in_cluster = 1;
134TUNABLE_INT("hw.cxgbe.allow_mbufs_in_cluster", &allow_mbufs_in_cluster);
135
136/*
137 * Largest rx cluster size that the driver is allowed to allocate.
138 */
139static int largest_rx_cluster = MJUM16BYTES;
140TUNABLE_INT("hw.cxgbe.largest_rx_cluster", &largest_rx_cluster);
141
142/*
143 * Size of cluster allocation that's most likely to succeed.  The driver will
144 * fall back to this size if it fails to allocate clusters larger than this.
145 */
146static int safest_rx_cluster = PAGE_SIZE;
147TUNABLE_INT("hw.cxgbe.safest_rx_cluster", &safest_rx_cluster);
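
/*
 * Note: the hw.cxgbe.* knobs above are TUNABLE_INTs, so they are normally set
 * from the kernel environment before the module is loaded, e.g. in
 * /boot/loader.conf (the values shown are examples only):
 *
 *	hw.cxgbe.buffer_packing="1"
 *	hw.cxgbe.largest_rx_cluster="9216"
 *
 * largest_rx_cluster caps the cluster sizes the driver will consider for rx
 * buffers, and safest_rx_cluster is the size it falls back to when larger
 * allocations fail.
 */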
148
149/* Used to track coalesced tx work request */
150struct txpkts {
151	uint64_t *flitp;	/* ptr to flit where next pkt should start */
152	uint8_t npkt;		/* # of packets in this work request */
153	uint8_t nflits;		/* # of flits used by this work request */
154	uint16_t plen;		/* total payload (sum of all packets) */
155};
156
157/* A packet's SGL.  This + m_pkthdr has all info needed for tx */
158struct sgl {
159	int nsegs;		/* # of segments in the SGL, 0 means imm. tx */
160	int nflits;		/* # of flits needed for the SGL */
161	bus_dma_segment_t seg[TX_SGL_SEGS];
162};
163
164static int service_iq(struct sge_iq *, int);
165static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t);
166static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *);
167static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int);
168static inline void init_fl(struct adapter *, struct sge_fl *, int, int, char *);
169static inline void init_eq(struct sge_eq *, int, int, uint8_t, uint16_t,
170    char *);
171static int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *,
172    bus_addr_t *, void **);
173static int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t,
174    void *);
175static int alloc_iq_fl(struct port_info *, struct sge_iq *, struct sge_fl *,
176    int, int);
177static int free_iq_fl(struct port_info *, struct sge_iq *, struct sge_fl *);
178static void add_fl_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
179    struct sge_fl *);
180static int alloc_fwq(struct adapter *);
181static int free_fwq(struct adapter *);
182static int alloc_mgmtq(struct adapter *);
183static int free_mgmtq(struct adapter *);
184static int alloc_rxq(struct port_info *, struct sge_rxq *, int, int,
185    struct sysctl_oid *);
186static int free_rxq(struct port_info *, struct sge_rxq *);
187#ifdef TCP_OFFLOAD
188static int alloc_ofld_rxq(struct port_info *, struct sge_ofld_rxq *, int, int,
189    struct sysctl_oid *);
190static int free_ofld_rxq(struct port_info *, struct sge_ofld_rxq *);
191#endif
192#ifdef DEV_NETMAP
193static int alloc_nm_rxq(struct port_info *, struct sge_nm_rxq *, int, int,
194    struct sysctl_oid *);
195static int free_nm_rxq(struct port_info *, struct sge_nm_rxq *);
196static int alloc_nm_txq(struct port_info *, struct sge_nm_txq *, int, int,
197    struct sysctl_oid *);
198static int free_nm_txq(struct port_info *, struct sge_nm_txq *);
199#endif
200static int ctrl_eq_alloc(struct adapter *, struct sge_eq *);
201static int eth_eq_alloc(struct adapter *, struct port_info *, struct sge_eq *);
202#ifdef TCP_OFFLOAD
203static int ofld_eq_alloc(struct adapter *, struct port_info *, struct sge_eq *);
204#endif
205static int alloc_eq(struct adapter *, struct port_info *, struct sge_eq *);
206static int free_eq(struct adapter *, struct sge_eq *);
207static int alloc_wrq(struct adapter *, struct port_info *, struct sge_wrq *,
208    struct sysctl_oid *);
209static int free_wrq(struct adapter *, struct sge_wrq *);
210static int alloc_txq(struct port_info *, struct sge_txq *, int,
211    struct sysctl_oid *);
212static int free_txq(struct port_info *, struct sge_txq *);
213static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int);
214static inline void ring_fl_db(struct adapter *, struct sge_fl *);
215static int refill_fl(struct adapter *, struct sge_fl *, int);
216static void refill_sfl(void *);
217static int alloc_fl_sdesc(struct sge_fl *);
218static void free_fl_sdesc(struct adapter *, struct sge_fl *);
219static void find_best_refill_source(struct adapter *, struct sge_fl *, int);
220static void find_safe_refill_source(struct adapter *, struct sge_fl *);
221static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
222
223static int get_pkt_sgl(struct sge_txq *, struct mbuf **, struct sgl *, int);
224static int free_pkt_sgl(struct sge_txq *, struct sgl *);
225static int write_txpkt_wr(struct port_info *, struct sge_txq *, struct mbuf *,
226    struct sgl *);
227static int add_to_txpkts(struct port_info *, struct sge_txq *, struct txpkts *,
228    struct mbuf *, struct sgl *);
229static void write_txpkts_wr(struct sge_txq *, struct txpkts *);
230static inline void write_ulp_cpl_sgl(struct port_info *, struct sge_txq *,
231    struct txpkts *, struct mbuf *, struct sgl *);
232static int write_sgl_to_txd(struct sge_eq *, struct sgl *, caddr_t *);
233static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int);
234static inline void ring_eq_db(struct adapter *, struct sge_eq *);
235static inline int reclaimable(struct sge_eq *);
236static int reclaim_tx_descs(struct sge_txq *, int, int);
237static void write_eqflush_wr(struct sge_eq *);
238static __be64 get_flit(bus_dma_segment_t *, int, int);
239static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *,
240    struct mbuf *);
241static int handle_fw_msg(struct sge_iq *, const struct rss_header *,
242    struct mbuf *);
243
244static int sysctl_uint16(SYSCTL_HANDLER_ARGS);
245static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS);
246
247static counter_u64_t extfree_refs;
248static counter_u64_t extfree_rels;
249
250/*
251 * Called on MOD_LOAD.  Validates and calculates the SGE tunables.
252 */
253void
254t4_sge_modload(void)
255{
256
257	if (fl_pktshift < 0 || fl_pktshift > 7) {
258		printf("Invalid hw.cxgbe.fl_pktshift value (%d),"
259		    " using 2 instead.\n", fl_pktshift);
260		fl_pktshift = 2;
261	}
262
263	if (spg_len != 64 && spg_len != 128) {
264		int len;
265
266#if defined(__i386__) || defined(__amd64__)
267		len = cpu_clflush_line_size > 64 ? 128 : 64;
268#else
269		len = 64;
270#endif
271		if (spg_len != -1) {
272			printf("Invalid hw.cxgbe.spg_len value (%d),"
273			    " using %d instead.\n", spg_len, len);
274		}
275		spg_len = len;
276	}
277
278	if (cong_drop < -1 || cong_drop > 1) {
279		printf("Invalid hw.cxgbe.cong_drop value (%d),"
280		    " using 0 instead.\n", cong_drop);
281		cong_drop = 0;
282	}
283
284	extfree_refs = counter_u64_alloc(M_WAITOK);
285	extfree_rels = counter_u64_alloc(M_WAITOK);
286	counter_u64_zero(extfree_refs);
287	counter_u64_zero(extfree_rels);
288}
289
290void
291t4_sge_modunload(void)
292{
293
294	counter_u64_free(extfree_refs);
295	counter_u64_free(extfree_rels);
296}
297
298uint64_t
299t4_sge_extfree_refs(void)
300{
301	uint64_t refs, rels;
302
303	rels = counter_u64_fetch(extfree_rels);
304	refs = counter_u64_fetch(extfree_refs);
305
306	return (refs - rels);
307}
308
309void
310t4_init_sge_cpl_handlers(struct adapter *sc)
311{
312
313	t4_register_cpl_handler(sc, CPL_FW4_MSG, handle_fw_msg);
314	t4_register_cpl_handler(sc, CPL_FW6_MSG, handle_fw_msg);
315	t4_register_cpl_handler(sc, CPL_SGE_EGR_UPDATE, handle_sge_egr_update);
316	t4_register_cpl_handler(sc, CPL_RX_PKT, t4_eth_rx);
317	t4_register_fw_msg_handler(sc, FW6_TYPE_CMD_RPL, t4_handle_fw_rpl);
318}
319
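/*
 * The SGE encodes the pad boundary as log2(boundary) - 5, so 32 is written as
 * 0 and 64 as 1; that is what the ilog2(pad) - 5 below computes.  The T5 pack
 * boundary uses the same encoding except that the value 0 means 16 bytes.
 */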
320static inline void
321setup_pad_and_pack_boundaries(struct adapter *sc)
322{
323	uint32_t v, m;
324	int pad, pack;
325
326	pad = fl_pad;
327	if (fl_pad < 32 || fl_pad > 4096 || !powerof2(fl_pad)) {
328		/*
329		 * If there is any chance that we might use buffer packing and
330		 * the chip is a T4, then pick 64 as the pad/pack boundary.  Set
331		 * it to 32 in all other cases.
332		 */
333		pad = is_t4(sc) && buffer_packing ? 64 : 32;
334
335		/*
336		 * For fl_pad = 0 we'll still write a reasonable value to the
337		 * register but all the freelists will opt out of padding.
338		 * We'll complain here only if the user tried to set it to a
339		 * value greater than 0 that was invalid.
340		 */
341		if (fl_pad > 0) {
342			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pad value"
343			    " (%d), using %d instead.\n", fl_pad, pad);
344		}
345	}
346	m = V_INGPADBOUNDARY(M_INGPADBOUNDARY);
347	v = V_INGPADBOUNDARY(ilog2(pad) - 5);
348	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
349
350	if (is_t4(sc)) {
351		if (fl_pack != -1 && fl_pack != pad) {
352			/* Complain but carry on. */
353			device_printf(sc->dev, "hw.cxgbe.fl_pack (%d) ignored,"
354			    " using %d instead.\n", fl_pack, pad);
355		}
356		return;
357	}
358
359	pack = fl_pack;
360	if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 ||
361	    !powerof2(fl_pack)) {
362		pack = max(sc->params.pci.mps, CACHE_LINE_SIZE);
363		MPASS(powerof2(pack));
364		if (pack < 16)
365			pack = 16;
366		if (pack == 32)
367			pack = 64;
368		if (pack > 4096)
369			pack = 4096;
370		if (fl_pack != -1) {
371			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pack value"
372			    " (%d), using %d instead.\n", fl_pack, pack);
373		}
374	}
375	m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY);
376	if (pack == 16)
377		v = V_INGPACKBOUNDARY(0);
378	else
379		v = V_INGPACKBOUNDARY(ilog2(pack) - 5);
380
381	MPASS(!is_t4(sc));	/* T4 doesn't have SGE_CONTROL2 */
382	t4_set_reg_field(sc, A_SGE_CONTROL2, m, v);
383}
384
385/*
386 * adap->params.vpd.cclk must be set up before this is called.
387 */
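/*
 * The intr_timer[] entries below are in microseconds.  timer_max is the
 * largest value that still fits in a TIMERVALUE register field once converted
 * to core-clock ticks, and the loop that follows nudges any oversized entry
 * down (by averaging it with its smaller neighbor, or clamping the last one)
 * until it fits.
 */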
388void
389t4_tweak_chip_settings(struct adapter *sc)
390{
391	int i;
392	uint32_t v, m;
393	int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200};
394	int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk;
395	int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */
396	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
397	static int sge_flbuf_sizes[] = {
398		MCLBYTES,
399#if MJUMPAGESIZE != MCLBYTES
400		MJUMPAGESIZE,
401		MJUMPAGESIZE - CL_METADATA_SIZE,
402		MJUMPAGESIZE - 2 * MSIZE - CL_METADATA_SIZE,
403#endif
404		MJUM9BYTES,
405		MJUM16BYTES,
406		MCLBYTES - MSIZE - CL_METADATA_SIZE,
407		MJUM9BYTES - CL_METADATA_SIZE,
408		MJUM16BYTES - CL_METADATA_SIZE,
409	};
410
411	KASSERT(sc->flags & MASTER_PF,
412	    ("%s: trying to change chip settings when not master.", __func__));
413
414	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
415	v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
416	    V_EGRSTATUSPAGESIZE(spg_len == 128);
417	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
418
419	setup_pad_and_pack_boundaries(sc);
420
421	v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
422	    V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
423	    V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
424	    V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) |
425	    V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) |
426	    V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) |
427	    V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) |
428	    V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
429	t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v);
430
431	KASSERT(nitems(sge_flbuf_sizes) <= SGE_FLBUF_SIZES,
432	    ("%s: hw buffer size table too big", __func__));
433	for (i = 0; i < min(nitems(sge_flbuf_sizes), SGE_FLBUF_SIZES); i++) {
434		t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i),
435		    sge_flbuf_sizes[i]);
436	}
437
438	v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) |
439	    V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]);
440	t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v);
441
442	KASSERT(intr_timer[0] <= timer_max,
443	    ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0],
444	    timer_max));
445	for (i = 1; i < nitems(intr_timer); i++) {
446		KASSERT(intr_timer[i] >= intr_timer[i - 1],
447		    ("%s: timers not listed in increasing order (%d)",
448		    __func__, i));
449
450		while (intr_timer[i] > timer_max) {
451			if (i == nitems(intr_timer) - 1) {
452				intr_timer[i] = timer_max;
453				break;
454			}
455			intr_timer[i] += intr_timer[i - 1];
456			intr_timer[i] /= 2;
457		}
458	}
459
460	v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) |
461	    V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1]));
462	t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v);
463	v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) |
464	    V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3]));
465	t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v);
466	v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) |
467	    V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5]));
468	t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v);
469
470	if (cong_drop == 0) {
471		m = F_TUNNELCNGDROP0 | F_TUNNELCNGDROP1 | F_TUNNELCNGDROP2 |
472		    F_TUNNELCNGDROP3;
473		t4_set_reg_field(sc, A_TP_PARA_REG3, m, 0);
474	}
475
476	/* 4K, 16K, 64K, 256K DDP "page sizes" */
477	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
478	t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v);
479
480	m = v = F_TDDPTAGTCB;
481	t4_set_reg_field(sc, A_ULP_RX_CTL, m, v);
482
483	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
484	    F_RESETDDPOFFSET;
485	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
486	t4_set_reg_field(sc, A_TP_PARA_REG5, m, v);
487}
488
489/*
490 * SGE wants the buffer to be at least 64B and then a multiple of 16.  If
491 * padding is in use, the buffer's start and end need to be aligned to the pad
492 * boundary as well.  We'll just make sure that the size is a multiple of the
493 * boundary here; it is up to the buffer allocation code to make sure the start
494 * of the buffer is aligned as well.
495 */
496static inline int
497hwsz_ok(struct adapter *sc, int hwsz)
498{
499	int mask = fl_pad ? sc->sge.pad_boundary - 1 : 16 - 1;
500
501	return (hwsz >= 64 && (hwsz & mask) == 0);
502}
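
/*
 * For example, with a 64-byte pad boundary a 2048-byte buffer (MCLBYTES)
 * passes the check above, while 2050 does not (not a multiple of 64).  With
 * padding disabled only the ">= 64 and a multiple of 16" requirement remains.
 */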
503
504/*
505 * XXX: driver really should be able to deal with unexpected settings.
506 */
507int
508t4_read_chip_settings(struct adapter *sc)
509{
510	struct sge *s = &sc->sge;
511	int i, j, n, rc = 0;
512	uint32_t m, v, r;
513	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
514	static int sw_buf_sizes[] = {	/* Sorted by size */
515		MCLBYTES,
516#if MJUMPAGESIZE != MCLBYTES
517		MJUMPAGESIZE,
518#endif
519		MJUM9BYTES,
520		MJUM16BYTES
521	};
522	struct sw_zone_info *swz, *safe_swz;
523	struct hw_buf_info *hwb;
524
525	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
526	v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
527	    V_EGRSTATUSPAGESIZE(spg_len == 128);
528	r = t4_read_reg(sc, A_SGE_CONTROL);
529	if ((r & m) != v) {
530		device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r);
531		rc = EINVAL;
532	}
533	s->pad_boundary = 1 << (G_INGPADBOUNDARY(r) + 5);
534
535	if (is_t4(sc))
536		s->pack_boundary = s->pad_boundary;
537	else {
538		r = t4_read_reg(sc, A_SGE_CONTROL2);
539		if (G_INGPACKBOUNDARY(r) == 0)
540			s->pack_boundary = 16;
541		else
542			s->pack_boundary = 1 << (G_INGPACKBOUNDARY(r) + 5);
543	}
544
545	v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
546	    V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
547	    V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
548	    V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) |
549	    V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) |
550	    V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) |
551	    V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) |
552	    V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
553	r = t4_read_reg(sc, A_SGE_HOST_PAGE_SIZE);
554	if (r != v) {
555		device_printf(sc->dev, "invalid SGE_HOST_PAGE_SIZE(0x%x)\n", r);
556		rc = EINVAL;
557	}
558
559	/* Filter out unusable hw buffer sizes entirely (mark with -2). */
560	hwb = &s->hw_buf_info[0];
561	for (i = 0; i < nitems(s->hw_buf_info); i++, hwb++) {
562		r = t4_read_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i));
563		hwb->size = r;
564		hwb->zidx = hwsz_ok(sc, r) ? -1 : -2;
565		hwb->next = -1;
566	}
567
568	/*
569	 * Create a sorted list in decreasing order of hw buffer sizes (and so
570	 * increasing order of spare area) for each software zone.
571	 *
572	 * If padding is enabled then the start and end of the buffer must align
573	 * to the pad boundary; if packing is enabled then they must align with
574	 * the pack boundary as well.  Allocations from the cluster zones are
575	 * aligned to min(size, 4K), so the buffer starts at that alignment and
576	 * ends at hwb->size alignment.  If mbuf inlining is allowed the
577	 * starting alignment will be reduced to MSIZE and the driver will
578	 * exercise appropriate caution when deciding on the best buffer layout
579	 * to use.
580	 */
581	n = 0;	/* no usable buffer size to begin with */
582	swz = &s->sw_zone_info[0];
583	safe_swz = NULL;
584	for (i = 0; i < SW_ZONE_SIZES; i++, swz++) {
585		int8_t head = -1, tail = -1;
586
587		swz->size = sw_buf_sizes[i];
588		swz->zone = m_getzone(swz->size);
589		swz->type = m_gettype(swz->size);
590
591		if (swz->size < PAGE_SIZE) {
592			MPASS(powerof2(swz->size));
593			if (fl_pad && (swz->size % sc->sge.pad_boundary != 0))
594				continue;
595		}
596
597		if (swz->size == safest_rx_cluster)
598			safe_swz = swz;
599
600		hwb = &s->hw_buf_info[0];
601		for (j = 0; j < SGE_FLBUF_SIZES; j++, hwb++) {
602			if (hwb->zidx != -1 || hwb->size > swz->size)
603				continue;
604#ifdef INVARIANTS
605			if (fl_pad)
606				MPASS(hwb->size % sc->sge.pad_boundary == 0);
607#endif
608			hwb->zidx = i;
609			if (head == -1)
610				head = tail = j;
611			else if (hwb->size < s->hw_buf_info[tail].size) {
612				s->hw_buf_info[tail].next = j;
613				tail = j;
614			} else {
615				int8_t *cur;
616				struct hw_buf_info *t;
617
618				for (cur = &head; *cur != -1; cur = &t->next) {
619					t = &s->hw_buf_info[*cur];
620					if (hwb->size == t->size) {
621						hwb->zidx = -2;
622						break;
623					}
624					if (hwb->size > t->size) {
625						hwb->next = *cur;
626						*cur = j;
627						break;
628					}
629				}
630			}
631		}
632		swz->head_hwidx = head;
633		swz->tail_hwidx = tail;
634
635		if (tail != -1) {
636			n++;
637			if (swz->size - s->hw_buf_info[tail].size >=
638			    CL_METADATA_SIZE)
639				sc->flags |= BUF_PACKING_OK;
640		}
641	}
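
	/*
	 * At this point each usable sw zone has a list of hw buffer sizes
	 * hanging off head_hwidx, sorted from the largest size that fits in
	 * the cluster (least spare room) down to the smallest usable size
	 * (most spare room) at tail_hwidx.
	 */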
642	if (n == 0) {
643		device_printf(sc->dev, "no usable SGE FL buffer size.\n");
644		rc = EINVAL;
645	}
646
647	s->safe_hwidx1 = -1;
648	s->safe_hwidx2 = -1;
649	if (safe_swz != NULL) {
650		s->safe_hwidx1 = safe_swz->head_hwidx;
651		for (i = safe_swz->head_hwidx; i != -1; i = hwb->next) {
652			int spare;
653
654			hwb = &s->hw_buf_info[i];
655#ifdef INVARIANTS
656			if (fl_pad)
657				MPASS(hwb->size % sc->sge.pad_boundary == 0);
658#endif
659			spare = safe_swz->size - hwb->size;
660			if (spare >= CL_METADATA_SIZE) {
661				s->safe_hwidx2 = i;
662				break;
663			}
664		}
665	}
666
667	r = t4_read_reg(sc, A_SGE_INGRESS_RX_THRESHOLD);
668	s->counter_val[0] = G_THRESHOLD_0(r);
669	s->counter_val[1] = G_THRESHOLD_1(r);
670	s->counter_val[2] = G_THRESHOLD_2(r);
671	s->counter_val[3] = G_THRESHOLD_3(r);
672
673	r = t4_read_reg(sc, A_SGE_TIMER_VALUE_0_AND_1);
674	s->timer_val[0] = G_TIMERVALUE0(r) / core_ticks_per_usec(sc);
675	s->timer_val[1] = G_TIMERVALUE1(r) / core_ticks_per_usec(sc);
676	r = t4_read_reg(sc, A_SGE_TIMER_VALUE_2_AND_3);
677	s->timer_val[2] = G_TIMERVALUE2(r) / core_ticks_per_usec(sc);
678	s->timer_val[3] = G_TIMERVALUE3(r) / core_ticks_per_usec(sc);
679	r = t4_read_reg(sc, A_SGE_TIMER_VALUE_4_AND_5);
680	s->timer_val[4] = G_TIMERVALUE4(r) / core_ticks_per_usec(sc);
681	s->timer_val[5] = G_TIMERVALUE5(r) / core_ticks_per_usec(sc);
682
683	if (cong_drop == 0) {
684		m = F_TUNNELCNGDROP0 | F_TUNNELCNGDROP1 | F_TUNNELCNGDROP2 |
685		    F_TUNNELCNGDROP3;
686		r = t4_read_reg(sc, A_TP_PARA_REG3);
687		if (r & m) {
688			device_printf(sc->dev,
689			    "invalid TP_PARA_REG3(0x%x)\n", r);
690			rc = EINVAL;
691		}
692	}
693
694	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
695	r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ);
696	if (r != v) {
697		device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r);
698		rc = EINVAL;
699	}
700
701	m = v = F_TDDPTAGTCB;
702	r = t4_read_reg(sc, A_ULP_RX_CTL);
703	if ((r & m) != v) {
704		device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r);
705		rc = EINVAL;
706	}
707
708	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
709	    F_RESETDDPOFFSET;
710	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
711	r = t4_read_reg(sc, A_TP_PARA_REG5);
712	if ((r & m) != v) {
713		device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r);
714		rc = EINVAL;
715	}
716
717	r = t4_read_reg(sc, A_SGE_CONM_CTRL);
718	s->fl_starve_threshold = G_EGRTHRESHOLD(r) * 2 + 1;
719	if (is_t4(sc))
720		s->fl_starve_threshold2 = s->fl_starve_threshold;
721	else
722		s->fl_starve_threshold2 = G_EGRTHRESHOLDPACKING(r) * 2 + 1;
723
724	/* egress queues: log2 of # of doorbells per BAR2 page */
725	r = t4_read_reg(sc, A_SGE_EGRESS_QUEUES_PER_PAGE_PF);
726	r >>= S_QUEUESPERPAGEPF0 +
727	    (S_QUEUESPERPAGEPF1 - S_QUEUESPERPAGEPF0) * sc->pf;
728	s->eq_s_qpp = r & M_QUEUESPERPAGEPF0;
729
730	/* ingress queues: log2 of # of doorbells per BAR2 page */
731	r = t4_read_reg(sc, A_SGE_INGRESS_QUEUES_PER_PAGE_PF);
732	r >>= S_QUEUESPERPAGEPF0 +
733	    (S_QUEUESPERPAGEPF1 - S_QUEUESPERPAGEPF0) * sc->pf;
734	s->iq_s_qpp = r & M_QUEUESPERPAGEPF0;
735
736	t4_init_tp_params(sc);
737
738	t4_read_mtu_tbl(sc, sc->params.mtus, NULL);
739	t4_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd);
740
741	return (rc);
742}
743
744int
745t4_create_dma_tag(struct adapter *sc)
746{
747	int rc;
748
749	rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0,
750	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE,
751	    BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL,
752	    NULL, &sc->dmat);
753	if (rc != 0) {
754		device_printf(sc->dev,
755		    "failed to create main DMA tag: %d\n", rc);
756	}
757
758	return (rc);
759}
760
761void
762t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
763    struct sysctl_oid_list *children)
764{
765
766	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes",
767	    CTLTYPE_STRING | CTLFLAG_RD, &sc->sge, 0, sysctl_bufsizes, "A",
768	    "freelist buffer sizes");
769
770	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD,
771	    NULL, fl_pktshift, "payload DMA offset in rx buffer (bytes)");
772
773	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad", CTLFLAG_RD,
774	    NULL, sc->sge.pad_boundary, "payload pad boundary (bytes)");
775
776	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len", CTLFLAG_RD,
777	    NULL, spg_len, "status page size (bytes)");
778
779	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD,
780	    NULL, cong_drop, "congestion drop setting");
781
782	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD,
783	    NULL, sc->sge.pack_boundary, "payload pack boundary (bytes)");
784}
785
786int
787t4_destroy_dma_tag(struct adapter *sc)
788{
789	if (sc->dmat)
790		bus_dma_tag_destroy(sc->dmat);
791
792	return (0);
793}
794
795/*
796 * Allocate and initialize the firmware event queue and the management queue.
797 *
798 * Returns errno on failure.  Resources allocated up to that point may still be
799 * allocated.  Caller is responsible for cleanup in case this function fails.
800 */
801int
802t4_setup_adapter_queues(struct adapter *sc)
803{
804	int rc;
805
806	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
807
808	sysctl_ctx_init(&sc->ctx);
809	sc->flags |= ADAP_SYSCTL_CTX;
810
811	/*
812	 * Firmware event queue
813	 */
814	rc = alloc_fwq(sc);
815	if (rc != 0)
816		return (rc);
817
818	/*
819	 * Management queue.  This is just a control queue that uses the fwq as
820	 * its associated iq.
821	 */
822	rc = alloc_mgmtq(sc);
823
824	return (rc);
825}
826
827/*
828 * Idempotent
829 */
830int
831t4_teardown_adapter_queues(struct adapter *sc)
832{
833
834	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
835
836	/* Do this before freeing the queue */
837	if (sc->flags & ADAP_SYSCTL_CTX) {
838		sysctl_ctx_free(&sc->ctx);
839		sc->flags &= ~ADAP_SYSCTL_CTX;
840	}
841
842	free_mgmtq(sc);
843	free_fwq(sc);
844
845	return (0);
846}
847
848static inline int
849port_intr_count(struct port_info *pi)
850{
851	int rc = 0;
852
853	if (pi->flags & INTR_RXQ)
854		rc += pi->nrxq;
855#ifdef TCP_OFFLOAD
856	if (pi->flags & INTR_OFLD_RXQ)
857		rc += pi->nofldrxq;
858#endif
859#ifdef DEV_NETMAP
860	if (pi->flags & INTR_NM_RXQ)
861		rc += pi->nnmrxq;
862#endif
863	return (rc);
864}
865
866static inline int
867first_vector(struct port_info *pi)
868{
869	struct adapter *sc = pi->adapter;
870	int rc = T4_EXTRA_INTR, i;
871
872	if (sc->intr_count == 1)
873		return (0);
874
875	for_each_port(sc, i) {
876		if (i == pi->port_id)
877			break;
878
879		rc += port_intr_count(sc->port[i]);
880	}
881
882	return (rc);
883}
884
885/*
886 * Given an arbitrary "index," come up with an iq that can be used by other
887 * queues (of this port) for interrupt forwarding, SGE egress updates, etc.
888 * The iq returned is guaranteed to be something that takes direct interrupts.
889 */
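/*
 * Example: a port whose NIC and TOE rx queues both take direct interrupts,
 * with nrxq = 4 and nofldrxq = 2, has nintr = 6 here; idx is taken modulo 6
 * and maps onto rxq[0..3] first and then ofld_rxq[0..1].
 */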
890static struct sge_iq *
891port_intr_iq(struct port_info *pi, int idx)
892{
893	struct adapter *sc = pi->adapter;
894	struct sge *s = &sc->sge;
895	struct sge_iq *iq = NULL;
896	int nintr, i;
897
898	if (sc->intr_count == 1)
899		return (&sc->sge.fwq);
900
901	nintr = port_intr_count(pi);
902	KASSERT(nintr != 0,
903	    ("%s: pi %p has no exclusive interrupts, total interrupts = %d",
904	    __func__, pi, sc->intr_count));
905#ifdef DEV_NETMAP
906	/* Exclude netmap queues as they can't take anyone else's interrupts */
907	if (pi->flags & INTR_NM_RXQ)
908		nintr -= pi->nnmrxq;
909	KASSERT(nintr > 0,
910	    ("%s: pi %p has nintr %d after netmap adjustment of %d", __func__,
911	    pi, nintr, pi->nnmrxq));
912#endif
913	i = idx % nintr;
914
915	if (pi->flags & INTR_RXQ) {
916		if (i < pi->nrxq) {
917			iq = &s->rxq[pi->first_rxq + i].iq;
918			goto done;
919		}
920		i -= pi->nrxq;
921	}
922#ifdef TCP_OFFLOAD
923	if (pi->flags & INTR_OFLD_RXQ) {
924		if (i < pi->nofldrxq) {
925			iq = &s->ofld_rxq[pi->first_ofld_rxq + i].iq;
926			goto done;
927		}
928		i -= pi->nofldrxq;
929	}
930#endif
931	panic("%s: pi %p, intr_flags 0x%lx, idx %d, total intr %d\n", __func__,
932	    pi, pi->flags & INTR_ALL, idx, nintr);
933done:
934	MPASS(iq != NULL);
935	KASSERT(iq->flags & IQ_INTR,
936	    ("%s: iq %p (port %p, intr_flags 0x%lx, idx %d)", __func__, iq, pi,
937	    pi->flags & INTR_ALL, idx));
938	return (iq);
939}
940
941/* Maximum payload that can be delivered with a single iq descriptor */
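/*
 * For example, with the default fl_pktshift of 2 an MTU of 1500 gives a
 * non-TOE max payload of 2 + 14 (Ethernet header) + 4 (VLAN tag) + 1500 =
 * 1520 bytes.
 */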
942static inline int
943mtu_to_max_payload(struct adapter *sc, int mtu, const int toe)
944{
945	int payload;
946
947#ifdef TCP_OFFLOAD
948	if (toe) {
949		payload = sc->tt.rx_coalesce ?
950		    G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)) : mtu;
951	} else {
952#endif
953		/* large enough even when hw VLAN extraction is disabled */
954		payload = fl_pktshift + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN +
955		    mtu;
956#ifdef TCP_OFFLOAD
957	}
958#endif
959
960	return (payload);
961}
962
963int
964t4_setup_port_queues(struct port_info *pi)
965{
966	int rc = 0, i, j, intr_idx, iqid;
967	struct sge_rxq *rxq;
968	struct sge_txq *txq;
969	struct sge_wrq *ctrlq;
970#ifdef TCP_OFFLOAD
971	struct sge_ofld_rxq *ofld_rxq;
972	struct sge_wrq *ofld_txq;
973#endif
974#ifdef DEV_NETMAP
975	struct sge_nm_rxq *nm_rxq;
976	struct sge_nm_txq *nm_txq;
977#endif
978	char name[16];
979	struct adapter *sc = pi->adapter;
980	struct ifnet *ifp = pi->ifp;
981	struct sysctl_oid *oid = device_get_sysctl_tree(pi->dev);
982	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
983	int maxp, mtu = ifp->if_mtu;
984
985	/* Interrupt vector to start from (when using multiple vectors) */
986	intr_idx = first_vector(pi);
987
988	/*
989	 * First pass over all NIC and TOE rx queues:
990	 * a) initialize iq and fl
991	 * b) allocate queue iff it will take direct interrupts.
992	 */
993	maxp = mtu_to_max_payload(sc, mtu, 0);
994	if (pi->flags & INTR_RXQ) {
995		oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq",
996		    CTLFLAG_RD, NULL, "rx queues");
997	}
998	for_each_rxq(pi, i, rxq) {
999
1000		init_iq(&rxq->iq, sc, pi->tmr_idx, pi->pktc_idx, pi->qsize_rxq);
1001
1002		snprintf(name, sizeof(name), "%s rxq%d-fl",
1003		    device_get_nameunit(pi->dev), i);
1004		init_fl(sc, &rxq->fl, pi->qsize_rxq / 8, maxp, name);
1005
1006		if (pi->flags & INTR_RXQ) {
1007			rxq->iq.flags |= IQ_INTR;
1008			rc = alloc_rxq(pi, rxq, intr_idx, i, oid);
1009			if (rc != 0)
1010				goto done;
1011			intr_idx++;
1012		}
1013	}
1014#ifdef TCP_OFFLOAD
1015	maxp = mtu_to_max_payload(sc, mtu, 1);
1016	if (is_offload(sc) && pi->flags & INTR_OFLD_RXQ) {
1017		oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_rxq",
1018		    CTLFLAG_RD, NULL,
1019		    "rx queues for offloaded TCP connections");
1020	}
1021	for_each_ofld_rxq(pi, i, ofld_rxq) {
1022
1023		init_iq(&ofld_rxq->iq, sc, pi->tmr_idx, pi->pktc_idx,
1024		    pi->qsize_rxq);
1025
1026		snprintf(name, sizeof(name), "%s ofld_rxq%d-fl",
1027		    device_get_nameunit(pi->dev), i);
1028		init_fl(sc, &ofld_rxq->fl, pi->qsize_rxq / 8, maxp, name);
1029
1030		if (pi->flags & INTR_OFLD_RXQ) {
1031			ofld_rxq->iq.flags |= IQ_INTR;
1032			rc = alloc_ofld_rxq(pi, ofld_rxq, intr_idx, i, oid);
1033			if (rc != 0)
1034				goto done;
1035			intr_idx++;
1036		}
1037	}
1038#endif
1039#ifdef DEV_NETMAP
1040	/*
1041	 * We don't have buffers to back the netmap rx queues right now so we
1042	 * create the queues in a way that doesn't set off any congestion signal
1043	 * in the chip.
1044	 */
1045	if (pi->flags & INTR_NM_RXQ) {
1046		oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "nm_rxq",
1047		    CTLFLAG_RD, NULL, "rx queues for netmap");
1048		for_each_nm_rxq(pi, i, nm_rxq) {
1049			rc = alloc_nm_rxq(pi, nm_rxq, intr_idx, i, oid);
1050			if (rc != 0)
1051				goto done;
1052			intr_idx++;
1053		}
1054	}
1055#endif
1056
1057	/*
1058	 * Second pass over all NIC and TOE rx queues.  The queues forwarding
1059	 * their interrupts are allocated now.
1060	 */
1061	j = 0;
1062	if (!(pi->flags & INTR_RXQ)) {
1063		oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq",
1064		    CTLFLAG_RD, NULL, "rx queues");
1065		for_each_rxq(pi, i, rxq) {
1066			MPASS(!(rxq->iq.flags & IQ_INTR));
1067
1068			intr_idx = port_intr_iq(pi, j)->abs_id;
1069
1070			rc = alloc_rxq(pi, rxq, intr_idx, i, oid);
1071			if (rc != 0)
1072				goto done;
1073			j++;
1074		}
1075	}
1076#ifdef TCP_OFFLOAD
1077	if (is_offload(sc) && !(pi->flags & INTR_OFLD_RXQ)) {
1078		oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_rxq",
1079		    CTLFLAG_RD, NULL,
1080		    "rx queues for offloaded TCP connections");
1081		for_each_ofld_rxq(pi, i, ofld_rxq) {
1082			MPASS(!(ofld_rxq->iq.flags & IQ_INTR));
1083
1084			intr_idx = port_intr_iq(pi, j)->abs_id;
1085
1086			rc = alloc_ofld_rxq(pi, ofld_rxq, intr_idx, i, oid);
1087			if (rc != 0)
1088				goto done;
1089			j++;
1090		}
1091	}
1092#endif
1093#ifdef DEV_NETMAP
1094	if (!(pi->flags & INTR_NM_RXQ))
1095		CXGBE_UNIMPLEMENTED(__func__);
1096#endif
1097
1098	/*
1099	 * Now the tx queues.  Only one pass needed.
1100	 */
1101	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "txq", CTLFLAG_RD,
1102	    NULL, "tx queues");
1103	j = 0;
1104	for_each_txq(pi, i, txq) {
1105		iqid = port_intr_iq(pi, j)->cntxt_id;
1106		snprintf(name, sizeof(name), "%s txq%d",
1107		    device_get_nameunit(pi->dev), i);
1108		init_eq(&txq->eq, EQ_ETH, pi->qsize_txq, pi->tx_chan, iqid,
1109		    name);
1110
1111		rc = alloc_txq(pi, txq, i, oid);
1112		if (rc != 0)
1113			goto done;
1114		j++;
1115	}
1116#ifdef TCP_OFFLOAD
1117	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_txq",
1118	    CTLFLAG_RD, NULL, "tx queues for offloaded TCP connections");
1119	for_each_ofld_txq(pi, i, ofld_txq) {
1120		struct sysctl_oid *oid2;
1121
1122		iqid = port_intr_iq(pi, j)->cntxt_id;
1123		snprintf(name, sizeof(name), "%s ofld_txq%d",
1124		    device_get_nameunit(pi->dev), i);
1125		init_eq(&ofld_txq->eq, EQ_OFLD, pi->qsize_txq, pi->tx_chan,
1126		    iqid, name);
1127
1128		snprintf(name, sizeof(name), "%d", i);
1129		oid2 = SYSCTL_ADD_NODE(&pi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
1130		    name, CTLFLAG_RD, NULL, "offload tx queue");
1131
1132		rc = alloc_wrq(sc, pi, ofld_txq, oid2);
1133		if (rc != 0)
1134			goto done;
1135		j++;
1136	}
1137#endif
1138#ifdef DEV_NETMAP
1139	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "nm_txq",
1140	    CTLFLAG_RD, NULL, "tx queues for netmap use");
1141	for_each_nm_txq(pi, i, nm_txq) {
1142		iqid = pi->first_nm_rxq + (j % pi->nnmrxq);
1143		rc = alloc_nm_txq(pi, nm_txq, iqid, i, oid);
1144		if (rc != 0)
1145			goto done;
1146		j++;
1147	}
1148#endif
1149
1150	/*
1151	 * Finally, the control queue.
1152	 */
1153	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ctrlq", CTLFLAG_RD,
1154	    NULL, "ctrl queue");
1155	ctrlq = &sc->sge.ctrlq[pi->port_id];
1156	iqid = port_intr_iq(pi, 0)->cntxt_id;
1157	snprintf(name, sizeof(name), "%s ctrlq", device_get_nameunit(pi->dev));
1158	init_eq(&ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE, pi->tx_chan, iqid, name);
1159	rc = alloc_wrq(sc, pi, ctrlq, oid);
1160
1161done:
1162	if (rc)
1163		t4_teardown_port_queues(pi);
1164
1165	return (rc);
1166}
1167
1168/*
1169 * Idempotent
1170 */
1171int
1172t4_teardown_port_queues(struct port_info *pi)
1173{
1174	int i;
1175	struct adapter *sc = pi->adapter;
1176	struct sge_rxq *rxq;
1177	struct sge_txq *txq;
1178#ifdef TCP_OFFLOAD
1179	struct sge_ofld_rxq *ofld_rxq;
1180	struct sge_wrq *ofld_txq;
1181#endif
1182#ifdef DEV_NETMAP
1183	struct sge_nm_rxq *nm_rxq;
1184	struct sge_nm_txq *nm_txq;
1185#endif
1186
1187	/* Do this before freeing the queues */
1188	if (pi->flags & PORT_SYSCTL_CTX) {
1189		sysctl_ctx_free(&pi->ctx);
1190		pi->flags &= ~PORT_SYSCTL_CTX;
1191	}
1192
1193	/*
1194	 * Take down all the tx queues first, as they reference the rx queues
1195	 * (for egress updates, etc.).
1196	 */
1197
1198	free_wrq(sc, &sc->sge.ctrlq[pi->port_id]);
1199
1200	for_each_txq(pi, i, txq) {
1201		free_txq(pi, txq);
1202	}
1203#ifdef TCP_OFFLOAD
1204	for_each_ofld_txq(pi, i, ofld_txq) {
1205		free_wrq(sc, ofld_txq);
1206	}
1207#endif
1208#ifdef DEV_NETMAP
1209	for_each_nm_txq(pi, i, nm_txq)
1210	    free_nm_txq(pi, nm_txq);
1211#endif
1212
1213	/*
1214	 * Then take down the rx queues that forward their interrupts, as they
1215	 * reference other rx queues.
1216	 */
1217
1218	for_each_rxq(pi, i, rxq) {
1219		if ((rxq->iq.flags & IQ_INTR) == 0)
1220			free_rxq(pi, rxq);
1221	}
1222#ifdef TCP_OFFLOAD
1223	for_each_ofld_rxq(pi, i, ofld_rxq) {
1224		if ((ofld_rxq->iq.flags & IQ_INTR) == 0)
1225			free_ofld_rxq(pi, ofld_rxq);
1226	}
1227#endif
1228#ifdef DEV_NETMAP
1229	for_each_nm_rxq(pi, i, nm_rxq)
1230	    free_nm_rxq(pi, nm_rxq);
1231#endif
1232
1233	/*
1234	 * Then take down the rx queues that take direct interrupts.
1235	 */
1236
1237	for_each_rxq(pi, i, rxq) {
1238		if (rxq->iq.flags & IQ_INTR)
1239			free_rxq(pi, rxq);
1240	}
1241#ifdef TCP_OFFLOAD
1242	for_each_ofld_rxq(pi, i, ofld_rxq) {
1243		if (ofld_rxq->iq.flags & IQ_INTR)
1244			free_ofld_rxq(pi, ofld_rxq);
1245	}
1246#endif
1247
1248	return (0);
1249}
1250
1251/*
1252 * Deals with errors and the firmware event queue.  All data rx queues forward
1253 * their interrupt to the firmware event queue.
1254 */
1255void
1256t4_intr_all(void *arg)
1257{
1258	struct adapter *sc = arg;
1259	struct sge_iq *fwq = &sc->sge.fwq;
1260
1261	t4_intr_err(arg);
1262	if (atomic_cmpset_int(&fwq->state, IQS_IDLE, IQS_BUSY)) {
1263		service_iq(fwq, 0);
1264		atomic_cmpset_int(&fwq->state, IQS_BUSY, IQS_IDLE);
1265	}
1266}
1267
1268/* Deals with error interrupts */
1269void
1270t4_intr_err(void *arg)
1271{
1272	struct adapter *sc = arg;
1273
1274	t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0);
1275	t4_slow_intr_handler(sc);
1276}
1277
1278void
1279t4_intr_evt(void *arg)
1280{
1281	struct sge_iq *iq = arg;
1282
1283	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
1284		service_iq(iq, 0);
1285		atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
1286	}
1287}
1288
1289void
1290t4_intr(void *arg)
1291{
1292	struct sge_iq *iq = arg;
1293
1294	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
1295		service_iq(iq, 0);
1296		atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
1297	}
1298}
1299
1300/*
1301 * Deals with anything and everything on the given ingress queue.
1302 */
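/*
 * Runs response handlers until the queue is drained or, if a budget was
 * given, until that many descriptors have been processed.  Returns 0 once the
 * queue is drained and EINPROGRESS if it stopped because the budget ran out.
 */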
1303static int
1304service_iq(struct sge_iq *iq, int budget)
1305{
1306	struct sge_iq *q;
1307	struct sge_rxq *rxq = iq_to_rxq(iq);	/* Use iff iq is part of rxq */
1308	struct sge_fl *fl;			/* Use iff IQ_HAS_FL */
1309	struct adapter *sc = iq->adapter;
1310	struct iq_desc *d = &iq->desc[iq->cidx];
1311	int ndescs = 0, limit;
1312	int rsp_type, refill;
1313	uint32_t lq;
1314	uint16_t fl_hw_cidx;
1315	struct mbuf *m0;
1316	STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql);
1317#if defined(INET) || defined(INET6)
1318	const struct timeval lro_timeout = {0, sc->lro_timeout};
1319#endif
1320
1321	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
1322
1323	limit = budget ? budget : iq->qsize / 16;
1324
1325	if (iq->flags & IQ_HAS_FL) {
1326		fl = &rxq->fl;
1327		fl_hw_cidx = fl->hw_cidx;	/* stable snapshot */
1328	} else {
1329		fl = NULL;
1330		fl_hw_cidx = 0;			/* to silence gcc warning */
1331	}
1332
1333	/*
1334	 * We always come back and check the descriptor ring for new indirect
1335	 * interrupts and other responses after running a single handler.
1336	 */
1337	for (;;) {
1338		while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {
1339
1340			rmb();
1341
1342			refill = 0;
1343			m0 = NULL;
1344			rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
1345			lq = be32toh(d->rsp.pldbuflen_qid);
1346
1347			switch (rsp_type) {
1348			case X_RSPD_TYPE_FLBUF:
1349
1350				KASSERT(iq->flags & IQ_HAS_FL,
1351				    ("%s: data for an iq (%p) with no freelist",
1352				    __func__, iq));
1353
1354				m0 = get_fl_payload(sc, fl, lq);
1355				if (__predict_false(m0 == NULL))
1356					goto process_iql;
1357				refill = IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 2;
1358#ifdef T4_PKT_TIMESTAMP
1359				/*
1360				 * 60 bit timestamp for the payload is
1361				 * *(uint64_t *)m0->m_pktdat.  Note that it is
1362				 * in the leading free-space in the mbuf.  The
1363				 * kernel can clobber it during a pullup,
1364				 * m_copymdata, etc.  You need to make sure that
1365				 * the mbuf reaches you unmolested if you care
1366				 * about the timestamp.
1367				 */
1368				*(uint64_t *)m0->m_pktdat =
1369				    be64toh(ctrl->u.last_flit) &
1370				    0xfffffffffffffff;
1371#endif
1372
1373				/* fall through */
1374
1375			case X_RSPD_TYPE_CPL:
1376				KASSERT(d->rss.opcode < NUM_CPL_CMDS,
1377				    ("%s: bad opcode %02x.", __func__,
1378				    d->rss.opcode));
1379				sc->cpl_handler[d->rss.opcode](iq, &d->rss, m0);
1380				break;
1381
1382			case X_RSPD_TYPE_INTR:
1383
1384				/*
1385				 * Interrupts should be forwarded only to queues
1386				 * that are not forwarding their interrupts.
1387				 * This means service_iq can recurse but only 1
1388				 * level deep.
1389				 */
1390				KASSERT(budget == 0,
1391				    ("%s: budget %u, rsp_type %u", __func__,
1392				    budget, rsp_type));
1393
1394				/*
1395				 * There are 1K interrupt-capable queues (qids 0
1396				 * through 1023).  A response type indicating a
1397				 * forwarded interrupt with a qid >= 1K is an
1398				 * iWARP async notification.
1399				 */
1400				if (lq >= 1024) {
1401					sc->an_handler(iq, &d->rsp);
1402					break;
1403				}
1404
1405				q = sc->sge.iqmap[lq - sc->sge.iq_start];
1406				if (atomic_cmpset_int(&q->state, IQS_IDLE,
1407				    IQS_BUSY)) {
1408					if (service_iq(q, q->qsize / 16) == 0) {
1409						atomic_cmpset_int(&q->state,
1410						    IQS_BUSY, IQS_IDLE);
1411					} else {
1412						STAILQ_INSERT_TAIL(&iql, q,
1413						    link);
1414					}
1415				}
1416				break;
1417
1418			default:
1419				KASSERT(0,
1420				    ("%s: illegal response type %d on iq %p",
1421				    __func__, rsp_type, iq));
1422				log(LOG_ERR,
1423				    "%s: illegal response type %d on iq %p",
1424				    device_get_nameunit(sc->dev), rsp_type, iq);
1425				break;
1426			}
1427
1428			d++;
1429			if (__predict_false(++iq->cidx == iq->sidx)) {
1430				iq->cidx = 0;
1431				iq->gen ^= F_RSPD_GEN;
1432				d = &iq->desc[0];
1433			}
1434			if (__predict_false(++ndescs == limit)) {
1435				t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS),
1436				    V_CIDXINC(ndescs) |
1437				    V_INGRESSQID(iq->cntxt_id) |
1438				    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
1439				ndescs = 0;
1440
1441#if defined(INET) || defined(INET6)
1442				if (iq->flags & IQ_LRO_ENABLED &&
1443				    sc->lro_timeout != 0) {
1444					tcp_lro_flush_inactive(&rxq->lro,
1445					    &lro_timeout);
1446				}
1447#endif
1448
1449				if (budget) {
1450					if (iq->flags & IQ_HAS_FL) {
1451						FL_LOCK(fl);
1452						refill_fl(sc, fl, 32);
1453						FL_UNLOCK(fl);
1454					}
1455					return (EINPROGRESS);
1456				}
1457			}
1458			if (refill) {
1459				FL_LOCK(fl);
1460				refill_fl(sc, fl, 32);
1461				FL_UNLOCK(fl);
1462				fl_hw_cidx = fl->hw_cidx;
1463			}
1464		}
1465
1466process_iql:
1467		if (STAILQ_EMPTY(&iql))
1468			break;
1469
1470		/*
1471		 * Process the head only, and send it to the back of the list if
1472		 * it's still not done.
1473		 */
1474		q = STAILQ_FIRST(&iql);
1475		STAILQ_REMOVE_HEAD(&iql, link);
1476		if (service_iq(q, q->qsize / 8) == 0)
1477			atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE);
1478		else
1479			STAILQ_INSERT_TAIL(&iql, q, link);
1480	}
1481
1482#if defined(INET) || defined(INET6)
1483	if (iq->flags & IQ_LRO_ENABLED) {
1484		struct lro_ctrl *lro = &rxq->lro;
1485		struct lro_entry *l;
1486
1487		while (!SLIST_EMPTY(&lro->lro_active)) {
1488			l = SLIST_FIRST(&lro->lro_active);
1489			SLIST_REMOVE_HEAD(&lro->lro_active, next);
1490			tcp_lro_flush(lro, l);
1491		}
1492	}
1493#endif
1494
1495	t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), V_CIDXINC(ndescs) |
1496	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));
1497
1498	if (iq->flags & IQ_HAS_FL) {
1499		int starved;
1500
1501		FL_LOCK(fl);
1502		starved = refill_fl(sc, fl, 64);
1503		FL_UNLOCK(fl);
1504		if (__predict_false(starved != 0))
1505			add_fl_to_sfl(sc, fl);
1506	}
1507
1508	return (0);
1509}
1510
1511static inline int
1512cl_has_metadata(struct sge_fl *fl, struct cluster_layout *cll)
1513{
1514	int rc = fl->flags & FL_BUF_PACKING || cll->region1 > 0;
1515
1516	if (rc)
1517		MPASS(cll->region3 >= CL_METADATA_SIZE);
1518
1519	return (rc);
1520}
1521
1522static inline struct cluster_metadata *
1523cl_metadata(struct adapter *sc, struct sge_fl *fl, struct cluster_layout *cll,
1524    caddr_t cl)
1525{
1526
1527	if (cl_has_metadata(fl, cll)) {
1528		struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
1529
1530		return ((struct cluster_metadata *)(cl + swz->size) - 1);
1531	}
1532	return (NULL);
1533}
1534
1535static int
1536rxb_free(struct mbuf *m, void *arg1, void *arg2)
1537{
1538	uma_zone_t zone = arg1;
1539	caddr_t cl = arg2;
1540
1541	uma_zfree(zone, cl);
1542	counter_u64_add(extfree_rels, 1);
1543
1544	return (EXT_FREE_OK);
1545}
1546
1547/*
1548 * The mbuf returned by this function could be allocated from zone_mbuf or
1549 * constructed in spare room in the cluster.
1550 *
1551 * The mbuf carries the payload in one of these ways
1552 * a) frame inside the mbuf (mbuf from zone_mbuf)
1553 * b) m_cljset (for clusters without metadata) zone_mbuf
1554 * c) m_extaddref (cluster with metadata) inline mbuf
1555 * d) m_extaddref (cluster with metadata) zone_mbuf
1556 */
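/*
 * Note on the packing arithmetic below: pad rounds the frame length consumed
 * so far (fr_offset + len) up to fl->buf_boundary.  For example, with a
 * 64-byte boundary and fr_offset + len = 100, roundup2(100, 64) is 128 and
 * pad is 28; blen grows by that much, which keeps the start of the next frame
 * on the packing boundary.
 */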
1557static struct mbuf *
1558get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset,
1559    int remaining)
1560{
1561	struct mbuf *m;
1562	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
1563	struct cluster_layout *cll = &sd->cll;
1564	struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
1565	struct hw_buf_info *hwb = &sc->sge.hw_buf_info[cll->hwidx];
1566	struct cluster_metadata *clm = cl_metadata(sc, fl, cll, sd->cl);
1567	int len, blen;
1568	caddr_t payload;
1569
1570	blen = hwb->size - fl->rx_offset;	/* max possible in this buf */
1571	len = min(remaining, blen);
1572	payload = sd->cl + cll->region1 + fl->rx_offset;
1573	if (fl->flags & FL_BUF_PACKING) {
1574		const u_int l = fr_offset + len;
1575		const u_int pad = roundup2(l, fl->buf_boundary) - l;
1576
1577		if (fl->rx_offset + len + pad < hwb->size)
1578			blen = len + pad;
1579		MPASS(fl->rx_offset + blen <= hwb->size);
1580	} else {
1581		MPASS(fl->rx_offset == 0);	/* not packing */
1582	}
1583
1585	if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) {
1586
1587		/*
1588		 * Copy payload into a freshly allocated mbuf.
1589		 */
1590
1591		m = fr_offset == 0 ?
1592		    m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
1593		if (m == NULL)
1594			return (NULL);
1595		fl->mbuf_allocated++;
1596#ifdef T4_PKT_TIMESTAMP
1597		/* Leave room for a timestamp */
1598		m->m_data += 8;
1599#endif
1600		/* copy data to mbuf */
1601		bcopy(payload, mtod(m, caddr_t), len);
1602
1603	} else if (sd->nmbuf * MSIZE < cll->region1) {
1604
1605		/*
1606		 * There's spare room in the cluster for an mbuf.  Create one
1607		 * and associate it with the payload that's in the cluster.
1608		 */
1609
1610		MPASS(clm != NULL);
1611		m = (struct mbuf *)(sd->cl + sd->nmbuf * MSIZE);
1612		/* No bzero required */
1613		if (m_init(m, NULL, 0, M_NOWAIT, MT_DATA,
1614		    fr_offset == 0 ? M_PKTHDR | M_NOFREE : M_NOFREE))
1615			return (NULL);
1616		fl->mbuf_inlined++;
1617		m_extaddref(m, payload, blen, &clm->refcount, rxb_free,
1618		    swz->zone, sd->cl);
1619		if (sd->nmbuf++ == 0)
1620			counter_u64_add(extfree_refs, 1);
1621
1622	} else {
1623
1624		/*
1625		 * Grab an mbuf from zone_mbuf and associate it with the
1626		 * payload in the cluster.
1627		 */
1628
1629		m = fr_offset == 0 ?
1630		    m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
1631		if (m == NULL)
1632			return (NULL);
1633		fl->mbuf_allocated++;
1634		if (clm != NULL) {
1635			m_extaddref(m, payload, blen, &clm->refcount,
1636			    rxb_free, swz->zone, sd->cl);
1637			if (sd->nmbuf++ == 0)
1638				counter_u64_add(extfree_refs, 1);
1639		} else {
1640			m_cljset(m, sd->cl, swz->type);
1641			sd->cl = NULL;	/* consumed, not a recycle candidate */
1642		}
1643	}
1644	if (fr_offset == 0)
1645		m->m_pkthdr.len = remaining;
1646	m->m_len = len;
1647
1648	if (fl->flags & FL_BUF_PACKING) {
1649		fl->rx_offset += blen;
1650		MPASS(fl->rx_offset <= hwb->size);
1651		if (fl->rx_offset < hwb->size)
1652			return (m);	/* without advancing the cidx */
1653	}
1654
1655	if (__predict_false(++fl->cidx % 8 == 0)) {
1656		uint16_t cidx = fl->cidx / 8;
1657
1658		if (__predict_false(cidx == fl->sidx))
1659			fl->cidx = cidx = 0;
1660		fl->hw_cidx = cidx;
1661	}
1662	fl->rx_offset = 0;
1663
1664	return (m);
1665}
1666
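/*
 * If an mbuf allocation fails partway through a multi-buffer frame, the
 * partially assembled chain is parked in fl->m0/pnext/remaining and
 * FL_BUF_RESUME is set so that the next call picks up exactly where this one
 * left off.
 */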
1667static struct mbuf *
1668get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf)
1669{
1670	struct mbuf *m0, *m, **pnext;
1671	u_int remaining;
1672	const u_int total = G_RSPD_LEN(len_newbuf);
1673
1674	if (__predict_false(fl->flags & FL_BUF_RESUME)) {
1675		M_ASSERTPKTHDR(fl->m0);
1676		MPASS(fl->m0->m_pkthdr.len == total);
1677		MPASS(fl->remaining < total);
1678
1679		m0 = fl->m0;
1680		pnext = fl->pnext;
1681		remaining = fl->remaining;
1682		fl->flags &= ~FL_BUF_RESUME;
1683		goto get_segment;
1684	}
1685
1686	if (fl->rx_offset > 0 && len_newbuf & F_RSPD_NEWBUF) {
1687		fl->rx_offset = 0;
1688		if (__predict_false(++fl->cidx % 8 == 0)) {
1689			uint16_t cidx = fl->cidx / 8;
1690
1691			if (__predict_false(cidx == fl->sidx))
1692				fl->cidx = cidx = 0;
1693			fl->hw_cidx = cidx;
1694		}
1695	}
1696
1697	/*
1698	 * Payload starts at rx_offset in the current hw buffer.  Its total
1699	 * length is 'total' and it may span multiple hw buffers.
1700	 */
1701
1702	m0 = get_scatter_segment(sc, fl, 0, total);
1703	if (m0 == NULL)
1704		return (NULL);
1705	remaining = total - m0->m_len;
1706	pnext = &m0->m_next;
1707	while (remaining > 0) {
1708get_segment:
1709		MPASS(fl->rx_offset == 0);
1710		m = get_scatter_segment(sc, fl, total - remaining, remaining);
1711		if (__predict_false(m == NULL)) {
1712			fl->m0 = m0;
1713			fl->pnext = pnext;
1714			fl->remaining = remaining;
1715			fl->flags |= FL_BUF_RESUME;
1716			return (NULL);
1717		}
1718		*pnext = m;
1719		pnext = &m->m_next;
1720		remaining -= m->m_len;
1721	}
1722	*pnext = NULL;
1723
1724	return (m0);
1725}
1726
1727static int
1728t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
1729{
1730	struct sge_rxq *rxq = iq_to_rxq(iq);
1731	struct ifnet *ifp = rxq->ifp;
1732	const struct cpl_rx_pkt *cpl = (const void *)(rss + 1);
1733#if defined(INET) || defined(INET6)
1734	struct lro_ctrl *lro = &rxq->lro;
1735#endif
1736
1737	KASSERT(m0 != NULL, ("%s: no payload with opcode %02x", __func__,
1738	    rss->opcode));
1739
1740	m0->m_pkthdr.len -= fl_pktshift;
1741	m0->m_len -= fl_pktshift;
1742	m0->m_data += fl_pktshift;
1743
1744	m0->m_pkthdr.rcvif = ifp;
1745	M_HASHTYPE_SET(m0, M_HASHTYPE_OPAQUE);
1746	m0->m_pkthdr.flowid = be32toh(rss->hash_val);
1747
1748	if (cpl->csum_calc && !cpl->err_vec) {
1749		if (ifp->if_capenable & IFCAP_RXCSUM &&
1750		    cpl->l2info & htobe32(F_RXF_IP)) {
1751			m0->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
1752			    CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1753			rxq->rxcsum++;
1754		} else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
1755		    cpl->l2info & htobe32(F_RXF_IP6)) {
1756			m0->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
1757			    CSUM_PSEUDO_HDR);
1758			rxq->rxcsum++;
1759		}
1760
1761		if (__predict_false(cpl->ip_frag))
1762			m0->m_pkthdr.csum_data = be16toh(cpl->csum);
1763		else
1764			m0->m_pkthdr.csum_data = 0xffff;
1765	}
1766
1767	if (cpl->vlan_ex) {
1768		m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan);
1769		m0->m_flags |= M_VLANTAG;
1770		rxq->vlan_extraction++;
1771	}
1772
1773#if defined(INET) || defined(INET6)
1774	if (cpl->l2info & htobe32(F_RXF_LRO) &&
1775	    iq->flags & IQ_LRO_ENABLED &&
1776	    tcp_lro_rx(lro, m0, 0) == 0) {
1777		/* queued for LRO */
1778	} else
1779#endif
1780	ifp->if_input(ifp, m0);
1781
1782	return (0);
1783}
1784
1785/*
1786 * Doesn't fail.  Holds on to work requests it can't send right away.
1787 */
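/*
 * If the eq runs out of descriptors, the remaining work requests stay queued
 * on wrq->wr_list, EQ_STALLED is set, and a callout (t4_tx_callout) is
 * scheduled to retry the transmission later.
 */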
1788void
1789t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr)
1790{
1791	struct sge_eq *eq = &wrq->eq;
1792	int can_reclaim;
1793	caddr_t dst;
1794
1795	TXQ_LOCK_ASSERT_OWNED(wrq);
1796#ifdef TCP_OFFLOAD
1797	KASSERT((eq->flags & EQ_TYPEMASK) == EQ_OFLD ||
1798	    (eq->flags & EQ_TYPEMASK) == EQ_CTRL,
1799	    ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK));
1800#else
1801	KASSERT((eq->flags & EQ_TYPEMASK) == EQ_CTRL,
1802	    ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK));
1803#endif
1804
1805	if (__predict_true(wr != NULL))
1806		STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link);
1807
1808	can_reclaim = reclaimable(eq);
1809	if (__predict_false(eq->flags & EQ_STALLED)) {
1810		if (eq->avail + can_reclaim < tx_resume_threshold(eq))
1811			return;
1812		eq->flags &= ~EQ_STALLED;
1813		eq->unstalled++;
1814	}
1815	eq->cidx += can_reclaim;
1816	eq->avail += can_reclaim;
1817	if (__predict_false(eq->cidx >= eq->cap))
1818		eq->cidx -= eq->cap;
1819
1820	while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL) {
1821		int ndesc;
1822
1823		if (__predict_false(wr->wr_len < 0 ||
1824		    wr->wr_len > SGE_MAX_WR_LEN || (wr->wr_len & 0x7))) {
1825
1826#ifdef INVARIANTS
1827			panic("%s: work request with length %d", __func__,
1828			    wr->wr_len);
1829#endif
1830#ifdef KDB
1831			kdb_backtrace();
1832#endif
1833			log(LOG_ERR, "%s: %s work request with length %d",
1834			    device_get_nameunit(sc->dev), __func__, wr->wr_len);
1835			STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
1836			free_wrqe(wr);
1837			continue;
1838		}
1839
1840		ndesc = howmany(wr->wr_len, EQ_ESIZE);
1841		if (eq->avail < ndesc) {
1842			wrq->no_desc++;
1843			break;
1844		}
1845
1846		dst = (void *)&eq->desc[eq->pidx];
1847		copy_to_txd(eq, wrtod(wr), &dst, wr->wr_len);
1848
1849		eq->pidx += ndesc;
1850		eq->avail -= ndesc;
1851		if (__predict_false(eq->pidx >= eq->cap))
1852			eq->pidx -= eq->cap;
1853
1854		eq->pending += ndesc;
1855		if (eq->pending >= 8)
1856			ring_eq_db(sc, eq);
1857
1858		wrq->tx_wrs++;
1859		STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
1860		free_wrqe(wr);
1861
1862		if (eq->avail < 8) {
1863			can_reclaim = reclaimable(eq);
1864			eq->cidx += can_reclaim;
1865			eq->avail += can_reclaim;
1866			if (__predict_false(eq->cidx >= eq->cap))
1867				eq->cidx -= eq->cap;
1868		}
1869	}
1870
1871	if (eq->pending)
1872		ring_eq_db(sc, eq);
1873
1874	if (wr != NULL) {
1875		eq->flags |= EQ_STALLED;
1876		if (callout_pending(&eq->tx_callout) == 0)
1877			callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq);
1878	}
1879}
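
/*
 * Typical caller pattern (a sketch, not taken from this file; it assumes the
 * alloc_wrqe()/wrtod() helpers used elsewhere in the driver):
 *
 *	wr = alloc_wrqe(len, wrq);	// len: multiple of 8, <= SGE_MAX_WR_LEN
 *	if (wr == NULL)
 *		return (ENOMEM);
 *	... build the work request at wrtod(wr) ...
 *	t4_wrq_tx(sc, wr);		// takes the wrq lock and calls
 *					// t4_wrq_tx_locked()
 *
 * Work requests that can't be written out immediately stay on wr_list and are
 * retried by the tx callout.
 */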
1880
1881/* Per-packet header in a coalesced tx WR, before the SGL starts (in flits) */
1882#define TXPKTS_PKT_HDR ((\
1883    sizeof(struct ulp_txpkt) + \
1884    sizeof(struct ulptx_idata) + \
1885    sizeof(struct cpl_tx_pkt_core) \
1886    ) / 8)
1887
1888/* Header of a coalesced tx WR, before SGL of first packet (in flits) */
1889#define TXPKTS_WR_HDR (\
1890    sizeof(struct fw_eth_tx_pkts_wr) / 8 + \
1891    TXPKTS_PKT_HDR)
1892
1893/* Header of a tx WR, before SGL of first packet (in flits) */
1894#define TXPKT_WR_HDR ((\
1895    sizeof(struct fw_eth_tx_pkt_wr) + \
1896    sizeof(struct cpl_tx_pkt_core) \
1897    ) / 8 )
1898
1899/* Header of a tx LSO WR, before SGL of first packet (in flits) */
1900#define TXPKT_LSO_WR_HDR ((\
1901    sizeof(struct fw_eth_tx_pkt_wr) + \
1902    sizeof(struct cpl_tx_pkt_lso_core) + \
1903    sizeof(struct cpl_tx_pkt_core) \
1904    ) / 8 )
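
/*
 * For reference (structure sizes assumed from the T4/T5 firmware headers, so
 * treat the numbers as illustrative): fw_eth_tx_pkt_wr, fw_eth_tx_pkts_wr,
 * cpl_tx_pkt_core and cpl_tx_pkt_lso_core are 16 bytes each, and ulp_txpkt
 * and ulptx_idata are 8 bytes each.  That works out to TXPKTS_PKT_HDR = 4,
 * TXPKTS_WR_HDR = 6, TXPKT_WR_HDR = 4 and TXPKT_LSO_WR_HDR = 6 flits (a flit
 * is 8 bytes, 8 flits per hardware descriptor).
 */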
1905
1906int
1907t4_eth_tx(struct ifnet *ifp, struct sge_txq *txq, struct mbuf *m)
1908{
1909	struct port_info *pi = (void *)ifp->if_softc;
1910	struct adapter *sc = pi->adapter;
1911	struct sge_eq *eq = &txq->eq;
1912	struct buf_ring *br = txq->br;
1913	struct mbuf *next;
1914	int rc, coalescing, can_reclaim;
1915	struct txpkts txpkts;
1916	struct sgl sgl;
1917
1918	TXQ_LOCK_ASSERT_OWNED(txq);
1919	KASSERT(m, ("%s: called with nothing to do.", __func__));
1920	KASSERT((eq->flags & EQ_TYPEMASK) == EQ_ETH,
1921	    ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK));
1922
1923	prefetch(&eq->desc[eq->pidx]);
1924	prefetch(&txq->sdesc[eq->pidx]);
1925
1926	txpkts.npkt = 0;	/* indicates there's nothing in txpkts */
1927	coalescing = 0;
1928
1929	can_reclaim = reclaimable(eq);
1930	if (__predict_false(eq->flags & EQ_STALLED)) {
1931		if (eq->avail + can_reclaim < tx_resume_threshold(eq)) {
1932			txq->m = m;
1933			return (0);
1934		}
1935		eq->flags &= ~EQ_STALLED;
1936		eq->unstalled++;
1937	}
1938
1939	if (__predict_false(eq->flags & EQ_DOOMED)) {
1940		m_freem(m);
1941		while ((m = buf_ring_dequeue_sc(txq->br)) != NULL)
1942			m_freem(m);
1943		return (ENETDOWN);
1944	}
1945
1946	if (eq->avail < 8 && can_reclaim)
1947		reclaim_tx_descs(txq, can_reclaim, 32);
1948
1949	for (; m; m = next ? next : drbr_dequeue(ifp, br)) {
1950
1951		if (eq->avail < 8)
1952			break;
1953
1954		next = m->m_nextpkt;
1955		m->m_nextpkt = NULL;
1956
1957		if (next || buf_ring_peek(br))
1958			coalescing = 1;
1959
1960		rc = get_pkt_sgl(txq, &m, &sgl, coalescing);
1961		if (rc != 0) {
1962			if (rc == ENOMEM) {
1963
1964				/* Short of resources, suspend tx */
1965
1966				m->m_nextpkt = next;
1967				break;
1968			}
1969
1970			/*
1971			 * Unrecoverable error for this packet, throw it away
1972			 * and move on to the next.  get_pkt_sgl may already
1973			 * have freed m (it will be NULL in that case and the
1974			 * m_freem here is still safe).
1975			 */
1976
1977			m_freem(m);
1978			continue;
1979		}
1980
1981		if (coalescing &&
1982		    add_to_txpkts(pi, txq, &txpkts, m, &sgl) == 0) {
1983
1984			/* Successfully absorbed into txpkts */
1985
1986			write_ulp_cpl_sgl(pi, txq, &txpkts, m, &sgl);
1987			goto doorbell;
1988		}
1989
1990		/*
1991		 * We weren't coalescing to begin with, or the current frame could
1992		 * not be coalesced (add_to_txpkts flushes txpkts if a frame
1993		 * given to it can't be coalesced).  Either way there should be
1994		 * nothing in txpkts.
1995		 */
1996		KASSERT(txpkts.npkt == 0,
1997		    ("%s: txpkts not empty: %d", __func__, txpkts.npkt));
1998
1999		/* We're sending out individual packets now */
2000		coalescing = 0;
2001
2002		if (eq->avail < 8)
2003			reclaim_tx_descs(txq, 0, 8);
2004		rc = write_txpkt_wr(pi, txq, m, &sgl);
2005		if (rc != 0) {
2006
2007			/* Short of hardware descriptors, suspend tx */
2008
2009			/*
2010			 * This is an unlikely but expensive failure.  We've
2011			 * done all the hard work (DMA mappings etc.) and now we
2012			 * can't send out the packet.  What's worse, we have to
2013			 * spend even more time freeing up everything in sgl.
2014			 */
2015			txq->no_desc++;
2016			free_pkt_sgl(txq, &sgl);
2017
2018			m->m_nextpkt = next;
2019			break;
2020		}
2021
2022		ETHER_BPF_MTAP(ifp, m);
2023		if (sgl.nsegs == 0)
2024			m_freem(m);
2025doorbell:
2026		if (eq->pending >= 8)
2027			ring_eq_db(sc, eq);
2028
2029		can_reclaim = reclaimable(eq);
2030		if (can_reclaim >= 32)
2031			reclaim_tx_descs(txq, can_reclaim, 64);
2032	}
2033
2034	if (txpkts.npkt > 0)
2035		write_txpkts_wr(txq, &txpkts);
2036
2037	/*
2038	 * m not NULL means there was an error but we haven't thrown it away.
2039	 * This can happen when we're short of tx descriptors (no_desc) or maybe
2040	 * even DMA maps (no_dmamap).  Either way, a credit flush and reclaim
2041	 * will get things going again.
2042	 */
2043	if (m && !(eq->flags & EQ_CRFLUSHED)) {
2044		struct tx_sdesc *txsd = &txq->sdesc[eq->pidx];
2045
2046		/*
2047		 * If EQ_CRFLUSHED is not set then we know we have at least one
2048		 * available descriptor because any WR that reduces eq->avail to
2049		 * 0 also sets EQ_CRFLUSHED.
2050		 */
2051		KASSERT(eq->avail > 0, ("%s: no space for eqflush.", __func__));
2052
2053		txsd->desc_used = 1;
2054		txsd->credits = 0;
2055		write_eqflush_wr(eq);
2056	}
2057	txq->m = m;
2058
2059	if (eq->pending)
2060		ring_eq_db(sc, eq);
2061
2062	reclaim_tx_descs(txq, 0, 128);
2063
2064	if (eq->flags & EQ_STALLED && callout_pending(&eq->tx_callout) == 0)
2065		callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq);
2066
2067	return (0);
2068}
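
/*
 * Note on the suspend path above: a frame that couldn't be sent is parked in
 * txq->m, EQ_STALLED is set by the write_*_wr routines when they use up the
 * last available descriptors, and the tx callout is armed one tick out so the
 * transmit can be retried once descriptors have been reclaimed.
 */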
2069
2070void
2071t4_update_fl_bufsize(struct ifnet *ifp)
2072{
2073	struct port_info *pi = ifp->if_softc;
2074	struct adapter *sc = pi->adapter;
2075	struct sge_rxq *rxq;
2076#ifdef TCP_OFFLOAD
2077	struct sge_ofld_rxq *ofld_rxq;
2078#endif
2079	struct sge_fl *fl;
2080	int i, maxp, mtu = ifp->if_mtu;
2081
2082	maxp = mtu_to_max_payload(sc, mtu, 0);
2083	for_each_rxq(pi, i, rxq) {
2084		fl = &rxq->fl;
2085
2086		FL_LOCK(fl);
2087		find_best_refill_source(sc, fl, maxp);
2088		FL_UNLOCK(fl);
2089	}
2090#ifdef TCP_OFFLOAD
2091	maxp = mtu_to_max_payload(sc, mtu, 1);
2092	for_each_ofld_rxq(pi, i, ofld_rxq) {
2093		fl = &ofld_rxq->fl;
2094
2095		FL_LOCK(fl);
2096		find_best_refill_source(sc, fl, maxp);
2097		FL_UNLOCK(fl);
2098	}
2099#endif
2100}
2101
2102int
2103can_resume_tx(struct sge_eq *eq)
2104{
2105
2106	return (eq->avail + reclaimable(eq) >= tx_resume_threshold(eq));
2107}
2108
2109static inline void
2110init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx,
2111    int qsize)
2112{
2113
2114	KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS,
2115	    ("%s: bad tmr_idx %d", __func__, tmr_idx));
2116	KASSERT(pktc_idx < SGE_NCOUNTERS,	/* -ve is ok, means don't use */
2117	    ("%s: bad pktc_idx %d", __func__, pktc_idx));
2118
2119	iq->flags = 0;
2120	iq->adapter = sc;
2121	iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx);
2122	iq->intr_pktc_idx = SGE_NCOUNTERS - 1;
2123	if (pktc_idx >= 0) {
2124		iq->intr_params |= F_QINTR_CNT_EN;
2125		iq->intr_pktc_idx = pktc_idx;
2126	}
2127	iq->qsize = roundup2(qsize, 16);	/* See FW_IQ_CMD/iqsize */
2128	iq->sidx = iq->qsize - spg_len / IQ_ESIZE;
2129}
2130
2131static inline void
2132init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, char *name)
2133{
2134
2135	fl->qsize = qsize;
2136	fl->sidx = qsize - spg_len / EQ_ESIZE;
2137	strlcpy(fl->lockname, name, sizeof(fl->lockname));
2138	if (sc->flags & BUF_PACKING_OK &&
2139	    ((!is_t4(sc) && buffer_packing) ||	/* T5+: enabled unless 0 */
2140	    (is_t4(sc) && buffer_packing == 1)))/* T4: disabled unless 1 */
2141		fl->flags |= FL_BUF_PACKING;
2142	find_best_refill_source(sc, fl, maxp);
2143	find_safe_refill_source(sc, fl);
2144}
2145
2146static inline void
2147init_eq(struct sge_eq *eq, int eqtype, int qsize, uint8_t tx_chan,
2148    uint16_t iqid, char *name)
2149{
2150	KASSERT(tx_chan < NCHAN, ("%s: bad tx channel %d", __func__, tx_chan));
2151	KASSERT(eqtype <= EQ_TYPEMASK, ("%s: bad qtype %d", __func__, eqtype));
2152
2153	eq->flags = eqtype & EQ_TYPEMASK;
2154	eq->tx_chan = tx_chan;
2155	eq->iqid = iqid;
2156	eq->qsize = qsize;
2157	strlcpy(eq->lockname, name, sizeof(eq->lockname));
2158
2159	TASK_INIT(&eq->tx_task, 0, t4_tx_task, eq);
2160	callout_init(&eq->tx_callout, CALLOUT_MPSAFE);
2161}
2162
2163static int
2164alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag,
2165    bus_dmamap_t *map, bus_addr_t *pa, void **va)
2166{
2167	int rc;
2168
2169	rc = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR,
2170	    BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag);
2171	if (rc != 0) {
2172		device_printf(sc->dev, "cannot allocate DMA tag: %d\n", rc);
2173		goto done;
2174	}
2175
2176	rc = bus_dmamem_alloc(*tag, va,
2177	    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map);
2178	if (rc != 0) {
2179		device_printf(sc->dev, "cannot allocate DMA memory: %d\n", rc);
2180		goto done;
2181	}
2182
2183	rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0);
2184	if (rc != 0) {
2185		device_printf(sc->dev, "cannot load DMA map: %d\n", rc);
2186		goto done;
2187	}
2188done:
2189	if (rc)
2190		free_ring(sc, *tag, *map, *pa, *va);
2191
2192	return (rc);
2193}
2194
2195static int
2196free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map,
2197    bus_addr_t pa, void *va)
2198{
2199	if (pa)
2200		bus_dmamap_unload(tag, map);
2201	if (va)
2202		bus_dmamem_free(tag, va, map);
2203	if (tag)
2204		bus_dma_tag_destroy(tag);
2205
2206	return (0);
2207}
2208
2209/*
2210 * Allocates the ring for an ingress queue and an optional freelist.  If the
2211 * freelist is specified it will be allocated and then associated with the
2212 * ingress queue.
2213 *
2214 * Returns errno on failure.  Resources allocated up to that point may still be
2215 * allocated.  Caller is responsible for cleanup in case this function fails.
2216 *
2217 * If the ingress queue will take interrupts directly (iq->flags & IQ_INTR) then
2218 * the intr_idx specifies the vector, starting from 0.  Otherwise it specifies
2219 * the abs_id of the ingress queue to which its interrupts should be forwarded.
2220 */
2221static int
2222alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl,
2223    int intr_idx, int cong)
2224{
2225	int rc, i, cntxt_id;
2226	size_t len;
2227	struct fw_iq_cmd c;
2228	struct adapter *sc = iq->adapter;
2229	__be32 v = 0;
2230
2231	len = iq->qsize * IQ_ESIZE;
2232	rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba,
2233	    (void **)&iq->desc);
2234	if (rc != 0)
2235		return (rc);
2236
2237	bzero(&c, sizeof(c));
2238	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST |
2239	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) |
2240	    V_FW_IQ_CMD_VFN(0));
2241
2242	c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART |
2243	    FW_LEN16(c));
2244
2245	/* Special handling for firmware event queue */
2246	if (iq == &sc->sge.fwq)
2247		v |= F_FW_IQ_CMD_IQASYNCH;
2248
2249	if (iq->flags & IQ_INTR) {
2250		KASSERT(intr_idx < sc->intr_count,
2251		    ("%s: invalid direct intr_idx %d", __func__, intr_idx));
2252	} else
2253		v |= F_FW_IQ_CMD_IQANDST;
2254	v |= V_FW_IQ_CMD_IQANDSTINDEX(intr_idx);
2255
2256	c.type_to_iqandstindex = htobe32(v |
2257	    V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) |
2258	    V_FW_IQ_CMD_VIID(pi->viid) |
2259	    V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT));
2260	c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) |
2261	    F_FW_IQ_CMD_IQGTSMODE |
2262	    V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) |
2263	    V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4));
2264	c.iqsize = htobe16(iq->qsize);
2265	c.iqaddr = htobe64(iq->ba);
2266	if (cong >= 0)
2267		c.iqns_to_fl0congen = htobe32(F_FW_IQ_CMD_IQFLINTCONGEN);
2268
2269	if (fl) {
2270		mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF);
2271
2272		len = fl->qsize * EQ_ESIZE;
2273		rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map,
2274		    &fl->ba, (void **)&fl->desc);
2275		if (rc)
2276			return (rc);
2277
2278		/* Allocate space for one software descriptor per buffer. */
2279		rc = alloc_fl_sdesc(fl);
2280		if (rc != 0) {
2281			device_printf(sc->dev,
2282			    "failed to setup fl software descriptors: %d\n",
2283			    rc);
2284			return (rc);
2285		}
2286
2287		if (fl->flags & FL_BUF_PACKING) {
2288			fl->lowat = roundup2(sc->sge.fl_starve_threshold2, 8);
2289			fl->buf_boundary = sc->sge.pack_boundary;
2290		} else {
2291			fl->lowat = roundup2(sc->sge.fl_starve_threshold, 8);
2292			fl->buf_boundary = 16;
2293		}
2294		if (fl_pad && fl->buf_boundary < sc->sge.pad_boundary)
2295			fl->buf_boundary = sc->sge.pad_boundary;
2296
2297		c.iqns_to_fl0congen |=
2298		    htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) |
2299			F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO |
2300			(fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) |
2301			(fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN :
2302			    0));
2303		if (cong >= 0) {
2304			c.iqns_to_fl0congen |=
2305				htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong) |
2306				    F_FW_IQ_CMD_FL0CONGCIF |
2307				    F_FW_IQ_CMD_FL0CONGEN);
2308		}
2309		c.fl0dcaen_to_fl0cidxfthresh =
2310		    htobe16(V_FW_IQ_CMD_FL0FBMIN(X_FETCHBURSTMIN_64B) |
2311			V_FW_IQ_CMD_FL0FBMAX(X_FETCHBURSTMAX_512B));
2312		c.fl0size = htobe16(fl->qsize);
2313		c.fl0addr = htobe64(fl->ba);
2314	}
2315
2316	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
2317	if (rc != 0) {
2318		device_printf(sc->dev,
2319		    "failed to create ingress queue: %d\n", rc);
2320		return (rc);
2321	}
2322
2323	iq->cidx = 0;
2324	iq->gen = F_RSPD_GEN;
2325	iq->intr_next = iq->intr_params;
2326	iq->cntxt_id = be16toh(c.iqid);
2327	iq->abs_id = be16toh(c.physiqid);
2328	iq->flags |= IQ_ALLOCATED;
2329
2330	cntxt_id = iq->cntxt_id - sc->sge.iq_start;
2331	if (cntxt_id >= sc->sge.niq) {
2332		panic("%s: iq->cntxt_id (%d) more than the max (%d)", __func__,
2333		    cntxt_id, sc->sge.niq - 1);
2334	}
2335	sc->sge.iqmap[cntxt_id] = iq;
2336
2337	if (fl) {
2338		u_int qid;
2339
2340		iq->flags |= IQ_HAS_FL;
2341		fl->cntxt_id = be16toh(c.fl0id);
2342		fl->pidx = fl->cidx = 0;
2343
2344		cntxt_id = fl->cntxt_id - sc->sge.eq_start;
2345		if (cntxt_id >= sc->sge.neq) {
2346			panic("%s: fl->cntxt_id (%d) more than the max (%d)",
2347			    __func__, cntxt_id, sc->sge.neq - 1);
2348		}
2349		sc->sge.eqmap[cntxt_id] = (void *)fl;
2350
2351		qid = fl->cntxt_id;
2352		if (isset(&sc->doorbells, DOORBELL_UDB)) {
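			/*
			 * Locate this freelist's slot in the BAR2 user
			 * doorbell region: each doorbell page covers
			 * 2^eq_s_qpp queues, so the high bits of the qid
			 * select the page.  If the queue's doorbell segment
			 * falls within that page the qid is folded into the
			 * address and omitted from each doorbell write.
			 */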
2353			uint32_t s_qpp = sc->sge.eq_s_qpp;
2354			uint32_t mask = (1 << s_qpp) - 1;
2355			volatile uint8_t *udb;
2356
2357			udb = sc->udbs_base + UDBS_DB_OFFSET;
2358			udb += (qid >> s_qpp) << PAGE_SHIFT;
2359			qid &= mask;
2360			if (qid < PAGE_SIZE / UDBS_SEG_SIZE) {
2361				udb += qid << UDBS_SEG_SHIFT;
2362				qid = 0;
2363			}
2364			fl->udb = (volatile void *)udb;
2365		}
2366		fl->dbval = F_DBPRIO | V_QID(qid);
2367		if (is_t5(sc))
2368			fl->dbval |= F_DBTYPE;
2369
2370		FL_LOCK(fl);
2371		/* Enough to make sure the SGE doesn't think it's starved */
2372		refill_fl(sc, fl, fl->lowat);
2373		FL_UNLOCK(fl);
2374	}
2375
2376	if (is_t5(sc) && cong >= 0) {
2377		uint32_t param, val;
2378
2379		param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
2380		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) |
2381		    V_FW_PARAMS_PARAM_YZ(iq->cntxt_id);
2382		if (cong == 0)
2383			val = 1 << 19;
2384		else {
2385			val = 2 << 19;
2386			for (i = 0; i < 4; i++) {
2387				if (cong & (1 << i))
2388					val |= 1 << (i << 2);
2389			}
2390		}
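		/*
		 * Example: cong = 0x5 (channels 0 and 2 congested) gives
		 * val = (2 << 19) | (1 << 0) | (1 << 8).
		 */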
2391
2392		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
2393		if (rc != 0) {
2394			/* report error but carry on */
2395			device_printf(sc->dev,
2396			    "failed to set congestion manager context for "
2397			    "ingress queue %d: %d\n", iq->cntxt_id, rc);
2398		}
2399	}
2400
2401	/* Enable IQ interrupts */
2402	atomic_store_rel_int(&iq->state, IQS_IDLE);
2403	t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), V_SEINTARM(iq->intr_params) |
2404	    V_INGRESSQID(iq->cntxt_id));
2405
2406	return (0);
2407}
2408
2409static int
2410free_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl)
2411{
2412	int rc;
2413	struct adapter *sc = iq->adapter;
2414	device_t dev;
2415
2416	if (sc == NULL)
2417		return (0);	/* nothing to do */
2418
2419	dev = pi ? pi->dev : sc->dev;
2420
2421	if (iq->flags & IQ_ALLOCATED) {
2422		rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0,
2423		    FW_IQ_TYPE_FL_INT_CAP, iq->cntxt_id,
2424		    fl ? fl->cntxt_id : 0xffff, 0xffff);
2425		if (rc != 0) {
2426			device_printf(dev,
2427			    "failed to free queue %p: %d\n", iq, rc);
2428			return (rc);
2429		}
2430		iq->flags &= ~IQ_ALLOCATED;
2431	}
2432
2433	free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc);
2434
2435	bzero(iq, sizeof(*iq));
2436
2437	if (fl) {
2438		free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba,
2439		    fl->desc);
2440
2441		if (fl->sdesc)
2442			free_fl_sdesc(sc, fl);
2443
2444		if (mtx_initialized(&fl->fl_lock))
2445			mtx_destroy(&fl->fl_lock);
2446
2447		bzero(fl, sizeof(*fl));
2448	}
2449
2450	return (0);
2451}
2452
2453static void
2454add_fl_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
2455    struct sge_fl *fl)
2456{
2457	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
2458
2459	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL,
2460	    "freelist");
2461	children = SYSCTL_CHILDREN(oid);
2462
2463	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
2464	    CTLTYPE_INT | CTLFLAG_RD, &fl->cntxt_id, 0, sysctl_uint16, "I",
2465	    "SGE context id of the freelist");
2466	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "padding", CTLFLAG_RD, NULL,
2467	    fl_pad ? 1 : 0, "padding enabled");
2468	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "packing", CTLFLAG_RD, NULL,
2469	    fl->flags & FL_BUF_PACKING ? 1 : 0, "packing enabled");
2470	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx,
2471	    0, "consumer index");
2472	if (fl->flags & FL_BUF_PACKING) {
2473		SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset",
2474		    CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset");
2475	}
2476	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx,
2477	    0, "producer index");
2478	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_allocated",
2479	    CTLFLAG_RD, &fl->mbuf_allocated, "# of mbufs allocated");
2480	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_inlined",
2481	    CTLFLAG_RD, &fl->mbuf_inlined, "# of mbufs inlined in clusters");
2482	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated",
2483	    CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated");
2484	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled",
2485	    CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled");
2486	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled",
2487	    CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)");
2488}
2489
2490static int
2491alloc_fwq(struct adapter *sc)
2492{
2493	int rc, intr_idx;
2494	struct sge_iq *fwq = &sc->sge.fwq;
2495	struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev);
2496	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
2497
2498	init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE);
2499	fwq->flags |= IQ_INTR;	/* always */
2500	intr_idx = sc->intr_count > 1 ? 1 : 0;
2501	rc = alloc_iq_fl(sc->port[0], fwq, NULL, intr_idx, -1);
2502	if (rc != 0) {
2503		device_printf(sc->dev,
2504		    "failed to create firmware event queue: %d\n", rc);
2505		return (rc);
2506	}
2507
2508	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "fwq", CTLFLAG_RD,
2509	    NULL, "firmware event queue");
2510	children = SYSCTL_CHILDREN(oid);
2511
2512	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "abs_id",
2513	    CTLTYPE_INT | CTLFLAG_RD, &fwq->abs_id, 0, sysctl_uint16, "I",
2514	    "absolute id of the queue");
2515	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cntxt_id",
2516	    CTLTYPE_INT | CTLFLAG_RD, &fwq->cntxt_id, 0, sysctl_uint16, "I",
2517	    "SGE context id of the queue");
2518	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cidx",
2519	    CTLTYPE_INT | CTLFLAG_RD, &fwq->cidx, 0, sysctl_uint16, "I",
2520	    "consumer index");
2521
2522	return (0);
2523}
2524
2525static int
2526free_fwq(struct adapter *sc)
2527{
2528	return free_iq_fl(NULL, &sc->sge.fwq, NULL);
2529}
2530
2531static int
2532alloc_mgmtq(struct adapter *sc)
2533{
2534	int rc;
2535	struct sge_wrq *mgmtq = &sc->sge.mgmtq;
2536	char name[16];
2537	struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev);
2538	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
2539
2540	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "mgmtq", CTLFLAG_RD,
2541	    NULL, "management queue");
2542
2543	snprintf(name, sizeof(name), "%s mgmtq", device_get_nameunit(sc->dev));
2544	init_eq(&mgmtq->eq, EQ_CTRL, CTRL_EQ_QSIZE, sc->port[0]->tx_chan,
2545	    sc->sge.fwq.cntxt_id, name);
2546	rc = alloc_wrq(sc, NULL, mgmtq, oid);
2547	if (rc != 0) {
2548		device_printf(sc->dev,
2549		    "failed to create management queue: %d\n", rc);
2550		return (rc);
2551	}
2552
2553	return (0);
2554}
2555
2556static int
2557free_mgmtq(struct adapter *sc)
2558{
2559
2560	return free_wrq(sc, &sc->sge.mgmtq);
2561}
2562
2563int
2564tnl_cong(struct port_info *pi)
2565{
2566
2567	if (cong_drop == -1)
2568		return (-1);
2569	else if (cong_drop == 1)
2570		return (0);
2571	else
2572		return (pi->rx_chan_map);
2573}
2574
2575static int
2576alloc_rxq(struct port_info *pi, struct sge_rxq *rxq, int intr_idx, int idx,
2577    struct sysctl_oid *oid)
2578{
2579	int rc;
2580	struct sysctl_oid_list *children;
2581	char name[16];
2582
2583	rc = alloc_iq_fl(pi, &rxq->iq, &rxq->fl, intr_idx, tnl_cong(pi));
2584	if (rc != 0)
2585		return (rc);
2586
2587	/*
2588	 * The freelist is just barely above the starvation threshold right now,
2589	 * so fill it up a bit more.
2590	 */
2591	FL_LOCK(&rxq->fl);
2592	refill_fl(pi->adapter, &rxq->fl, 128);
2593	FL_UNLOCK(&rxq->fl);
2594
2595#if defined(INET) || defined(INET6)
2596	rc = tcp_lro_init(&rxq->lro);
2597	if (rc != 0)
2598		return (rc);
2599	rxq->lro.ifp = pi->ifp; /* also indicates LRO init'ed */
2600
2601	if (pi->ifp->if_capenable & IFCAP_LRO)
2602		rxq->iq.flags |= IQ_LRO_ENABLED;
2603#endif
2604	rxq->ifp = pi->ifp;
2605
2606	children = SYSCTL_CHILDREN(oid);
2607
2608	snprintf(name, sizeof(name), "%d", idx);
2609	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
2610	    NULL, "rx queue");
2611	children = SYSCTL_CHILDREN(oid);
2612
2613	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "abs_id",
2614	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.abs_id, 0, sysctl_uint16, "I",
2615	    "absolute id of the queue");
2616	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cntxt_id",
2617	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cntxt_id, 0, sysctl_uint16, "I",
2618	    "SGE context id of the queue");
2619	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cidx",
2620	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cidx, 0, sysctl_uint16, "I",
2621	    "consumer index");
2622#if defined(INET) || defined(INET6)
2623	SYSCTL_ADD_INT(&pi->ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD,
2624	    &rxq->lro.lro_queued, 0, NULL);
2625	SYSCTL_ADD_INT(&pi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD,
2626	    &rxq->lro.lro_flushed, 0, NULL);
2627#endif
2628	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD,
2629	    &rxq->rxcsum, "# of times hardware assisted with checksum");
2630	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "vlan_extraction",
2631	    CTLFLAG_RD, &rxq->vlan_extraction,
2632	    "# of times hardware extracted 802.1Q tag");
2633
2634	add_fl_sysctls(&pi->ctx, oid, &rxq->fl);
2635
2636	return (rc);
2637}
2638
2639static int
2640free_rxq(struct port_info *pi, struct sge_rxq *rxq)
2641{
2642	int rc;
2643
2644#if defined(INET) || defined(INET6)
2645	if (rxq->lro.ifp) {
2646		tcp_lro_free(&rxq->lro);
2647		rxq->lro.ifp = NULL;
2648	}
2649#endif
2650
2651	rc = free_iq_fl(pi, &rxq->iq, &rxq->fl);
2652	if (rc == 0)
2653		bzero(rxq, sizeof(*rxq));
2654
2655	return (rc);
2656}
2657
2658#ifdef TCP_OFFLOAD
2659static int
2660alloc_ofld_rxq(struct port_info *pi, struct sge_ofld_rxq *ofld_rxq,
2661    int intr_idx, int idx, struct sysctl_oid *oid)
2662{
2663	int rc;
2664	struct sysctl_oid_list *children;
2665	char name[16];
2666
2667	rc = alloc_iq_fl(pi, &ofld_rxq->iq, &ofld_rxq->fl, intr_idx,
2668	    pi->rx_chan_map);
2669	if (rc != 0)
2670		return (rc);
2671
2672	children = SYSCTL_CHILDREN(oid);
2673
2674	snprintf(name, sizeof(name), "%d", idx);
2675	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
2676	    NULL, "rx queue");
2677	children = SYSCTL_CHILDREN(oid);
2678
2679	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "abs_id",
2680	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.abs_id, 0, sysctl_uint16,
2681	    "I", "absolute id of the queue");
2682	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cntxt_id",
2683	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cntxt_id, 0, sysctl_uint16,
2684	    "I", "SGE context id of the queue");
2685	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cidx",
2686	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cidx, 0, sysctl_uint16, "I",
2687	    "consumer index");
2688
2689	add_fl_sysctls(&pi->ctx, oid, &ofld_rxq->fl);
2690
2691	return (rc);
2692}
2693
2694static int
2695free_ofld_rxq(struct port_info *pi, struct sge_ofld_rxq *ofld_rxq)
2696{
2697	int rc;
2698
2699	rc = free_iq_fl(pi, &ofld_rxq->iq, &ofld_rxq->fl);
2700	if (rc == 0)
2701		bzero(ofld_rxq, sizeof(*ofld_rxq));
2702
2703	return (rc);
2704}
2705#endif
2706
2707#ifdef DEV_NETMAP
2708static int
2709alloc_nm_rxq(struct port_info *pi, struct sge_nm_rxq *nm_rxq, int intr_idx,
2710    int idx, struct sysctl_oid *oid)
2711{
2712	int rc;
2713	struct sysctl_oid_list *children;
2714	struct sysctl_ctx_list *ctx;
2715	char name[16];
2716	size_t len;
2717	struct adapter *sc = pi->adapter;
2718	struct netmap_adapter *na = NA(pi->nm_ifp);
2719
2720	MPASS(na != NULL);
2721
2722	len = pi->qsize_rxq * IQ_ESIZE;
2723	rc = alloc_ring(sc, len, &nm_rxq->iq_desc_tag, &nm_rxq->iq_desc_map,
2724	    &nm_rxq->iq_ba, (void **)&nm_rxq->iq_desc);
2725	if (rc != 0)
2726		return (rc);
2727
2728	len = na->num_rx_desc * EQ_ESIZE + spg_len;
2729	rc = alloc_ring(sc, len, &nm_rxq->fl_desc_tag, &nm_rxq->fl_desc_map,
2730	    &nm_rxq->fl_ba, (void **)&nm_rxq->fl_desc);
2731	if (rc != 0)
2732		return (rc);
2733
2734	nm_rxq->pi = pi;
2735	nm_rxq->nid = idx;
2736	nm_rxq->iq_cidx = 0;
2737	nm_rxq->iq_sidx = pi->qsize_rxq - spg_len / IQ_ESIZE;
2738	nm_rxq->iq_gen = F_RSPD_GEN;
2739	nm_rxq->fl_pidx = nm_rxq->fl_cidx = 0;
2740	nm_rxq->fl_sidx = na->num_rx_desc;
2741	nm_rxq->intr_idx = intr_idx;
2742
2743	ctx = &pi->ctx;
2744	children = SYSCTL_CHILDREN(oid);
2745
2746	snprintf(name, sizeof(name), "%d", idx);
2747	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL,
2748	    "rx queue");
2749	children = SYSCTL_CHILDREN(oid);
2750
2751	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "abs_id",
2752	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_abs_id, 0, sysctl_uint16,
2753	    "I", "absolute id of the queue");
2754	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
2755	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cntxt_id, 0, sysctl_uint16,
2756	    "I", "SGE context id of the queue");
2757	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
2758	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cidx, 0, sysctl_uint16, "I",
2759	    "consumer index");
2760
2761	children = SYSCTL_CHILDREN(oid);
2762	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL,
2763	    "freelist");
2764	children = SYSCTL_CHILDREN(oid);
2765
2766	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
2767	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->fl_cntxt_id, 0, sysctl_uint16,
2768	    "I", "SGE context id of the freelist");
2769	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD,
2770	    &nm_rxq->fl_cidx, 0, "consumer index");
2771	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD,
2772	    &nm_rxq->fl_pidx, 0, "producer index");
2773
2774	return (rc);
2775}
2776
2778static int
2779free_nm_rxq(struct port_info *pi, struct sge_nm_rxq *nm_rxq)
2780{
2781	struct adapter *sc = pi->adapter;
2782
2783	free_ring(sc, nm_rxq->iq_desc_tag, nm_rxq->iq_desc_map, nm_rxq->iq_ba,
2784	    nm_rxq->iq_desc);
2785	free_ring(sc, nm_rxq->fl_desc_tag, nm_rxq->fl_desc_map, nm_rxq->fl_ba,
2786	    nm_rxq->fl_desc);
2787
2788	return (0);
2789}
2790
2791static int
2792alloc_nm_txq(struct port_info *pi, struct sge_nm_txq *nm_txq, int iqidx, int idx,
2793    struct sysctl_oid *oid)
2794{
2795	int rc;
2796	size_t len;
2797	struct adapter *sc = pi->adapter;
2798	struct netmap_adapter *na = NA(pi->nm_ifp);
2799	char name[16];
2800	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
2801
2802	len = na->num_tx_desc * EQ_ESIZE + spg_len;
2803	rc = alloc_ring(sc, len, &nm_txq->desc_tag, &nm_txq->desc_map,
2804	    &nm_txq->ba, (void **)&nm_txq->desc);
2805	if (rc)
2806		return (rc);
2807
2808	nm_txq->pidx = nm_txq->cidx = 0;
2809	nm_txq->sidx = na->num_tx_desc;
2810	nm_txq->nid = idx;
2811	nm_txq->iqidx = iqidx;
2812	nm_txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
2813	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf));
2814
2815	snprintf(name, sizeof(name), "%d", idx);
2816	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
2817	    NULL, "netmap tx queue");
2818	children = SYSCTL_CHILDREN(oid);
2819
2820	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
2821	    &nm_txq->cntxt_id, 0, "SGE context id of the queue");
2822	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cidx",
2823	    CTLTYPE_INT | CTLFLAG_RD, &nm_txq->cidx, 0, sysctl_uint16, "I",
2824	    "consumer index");
2825	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "pidx",
2826	    CTLTYPE_INT | CTLFLAG_RD, &nm_txq->pidx, 0, sysctl_uint16, "I",
2827	    "producer index");
2828
2829	return (rc);
2830}
2831
2832static int
2833free_nm_txq(struct port_info *pi, struct sge_nm_txq *nm_txq)
2834{
2835	struct adapter *sc = pi->adapter;
2836
2837	free_ring(sc, nm_txq->desc_tag, nm_txq->desc_map, nm_txq->ba,
2838	    nm_txq->desc);
2839
2840	return (0);
2841}
2842#endif
2843
2844static int
2845ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq)
2846{
2847	int rc, cntxt_id;
2848	struct fw_eq_ctrl_cmd c;
2849
2850	bzero(&c, sizeof(c));
2851
2852	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST |
2853	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) |
2854	    V_FW_EQ_CTRL_CMD_VFN(0));
2855	c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC |
2856	    F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c));
2857	c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid)); /* XXX */
2858	c.physeqid_pkd = htobe32(0);
2859	c.fetchszm_to_iqid =
2860	    htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
2861		V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) |
2862		F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid));
2863	c.dcaen_to_eqsize =
2864	    htobe32(V_FW_EQ_CTRL_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
2865		V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
2866		V_FW_EQ_CTRL_CMD_CIDXFTHRESH(X_CIDXFLUSHTHRESH_32) |
2867		V_FW_EQ_CTRL_CMD_EQSIZE(eq->qsize));
2868	c.eqaddr = htobe64(eq->ba);
2869
2870	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
2871	if (rc != 0) {
2872		device_printf(sc->dev,
2873		    "failed to create control queue %d: %d\n", eq->tx_chan, rc);
2874		return (rc);
2875	}
2876	eq->flags |= EQ_ALLOCATED;
2877
2878	eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid));
2879	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
2880	if (cntxt_id >= sc->sge.neq)
2881	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
2882		cntxt_id, sc->sge.neq - 1);
2883	sc->sge.eqmap[cntxt_id] = eq;
2884
2885	return (rc);
2886}
2887
2888static int
2889eth_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
2890{
2891	int rc, cntxt_id;
2892	struct fw_eq_eth_cmd c;
2893
2894	bzero(&c, sizeof(c));
2895
2896	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST |
2897	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) |
2898	    V_FW_EQ_ETH_CMD_VFN(0));
2899	c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC |
2900	    F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c));
2901	c.autoequiqe_to_viid = htobe32(V_FW_EQ_ETH_CMD_VIID(pi->viid));
2902	c.fetchszm_to_iqid =
2903	    htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
2904		V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO |
2905		V_FW_EQ_ETH_CMD_IQID(eq->iqid));
2906	c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
2907		      V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
2908		      V_FW_EQ_ETH_CMD_CIDXFTHRESH(X_CIDXFLUSHTHRESH_32) |
2909		      V_FW_EQ_ETH_CMD_EQSIZE(eq->qsize));
2910	c.eqaddr = htobe64(eq->ba);
2911
2912	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
2913	if (rc != 0) {
2914		device_printf(pi->dev,
2915		    "failed to create Ethernet egress queue: %d\n", rc);
2916		return (rc);
2917	}
2918	eq->flags |= EQ_ALLOCATED;
2919
2920	eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd));
2921	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
2922	if (cntxt_id >= sc->sge.neq)
2923	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
2924		cntxt_id, sc->sge.neq - 1);
2925	sc->sge.eqmap[cntxt_id] = eq;
2926
2927	return (rc);
2928}
2929
2930#ifdef TCP_OFFLOAD
2931static int
2932ofld_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
2933{
2934	int rc, cntxt_id;
2935	struct fw_eq_ofld_cmd c;
2936
2937	bzero(&c, sizeof(c));
2938
2939	c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_OFLD_CMD) | F_FW_CMD_REQUEST |
2940	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_OFLD_CMD_PFN(sc->pf) |
2941	    V_FW_EQ_OFLD_CMD_VFN(0));
2942	c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC |
2943	    F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c));
2944	c.fetchszm_to_iqid =
2945		htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
2946		    V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) |
2947		    F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid));
2948	c.dcaen_to_eqsize =
2949	    htobe32(V_FW_EQ_OFLD_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
2950		V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
2951		V_FW_EQ_OFLD_CMD_CIDXFTHRESH(X_CIDXFLUSHTHRESH_32) |
2952		V_FW_EQ_OFLD_CMD_EQSIZE(eq->qsize));
2953	c.eqaddr = htobe64(eq->ba);
2954
2955	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
2956	if (rc != 0) {
2957		device_printf(pi->dev,
2958		    "failed to create egress queue for TCP offload: %d\n", rc);
2959		return (rc);
2960	}
2961	eq->flags |= EQ_ALLOCATED;
2962
2963	eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd));
2964	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
2965	if (cntxt_id >= sc->sge.neq)
2966	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
2967		cntxt_id, sc->sge.neq - 1);
2968	sc->sge.eqmap[cntxt_id] = eq;
2969
2970	return (rc);
2971}
2972#endif
2973
2974static int
2975alloc_eq(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
2976{
2977	int rc;
2978	size_t len;
2979
2980	mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF);
2981
2982	len = eq->qsize * EQ_ESIZE;
2983	rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map,
2984	    &eq->ba, (void **)&eq->desc);
2985	if (rc)
2986		return (rc);
2987
2988	eq->cap = eq->qsize - spg_len / EQ_ESIZE;
2989	eq->spg = (void *)&eq->desc[eq->cap];
2990	eq->avail = eq->cap - 1;	/* one less to avoid cidx = pidx */
2991	eq->pidx = eq->cidx = 0;
2992	eq->doorbells = sc->doorbells;
2993
2994	switch (eq->flags & EQ_TYPEMASK) {
2995	case EQ_CTRL:
2996		rc = ctrl_eq_alloc(sc, eq);
2997		break;
2998
2999	case EQ_ETH:
3000		rc = eth_eq_alloc(sc, pi, eq);
3001		break;
3002
3003#ifdef TCP_OFFLOAD
3004	case EQ_OFLD:
3005		rc = ofld_eq_alloc(sc, pi, eq);
3006		break;
3007#endif
3008
3009	default:
3010		panic("%s: invalid eq type %d.", __func__,
3011		    eq->flags & EQ_TYPEMASK);
3012	}
3013	if (rc != 0) {
3014		device_printf(sc->dev,
3015		    "failed to allocate egress queue(%d): %d\n",
3016		    eq->flags & EQ_TYPEMASK, rc);
3017	}
3018
3019	eq->tx_callout.c_cpu = eq->cntxt_id % mp_ncpus;
3020
3021	if (isset(&eq->doorbells, DOORBELL_UDB) ||
3022	    isset(&eq->doorbells, DOORBELL_UDBWC) ||
3023	    isset(&eq->doorbells, DOORBELL_WCWR)) {
3024		uint32_t s_qpp = sc->sge.eq_s_qpp;
3025		uint32_t mask = (1 << s_qpp) - 1;
3026		volatile uint8_t *udb;
3027
3028		udb = sc->udbs_base + UDBS_DB_OFFSET;
3029		udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT;	/* pg offset */
3030		eq->udb_qid = eq->cntxt_id & mask;		/* id in page */
3031		if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE)
3032			clrbit(&eq->doorbells, DOORBELL_WCWR);
3033		else {
3034			udb += eq->udb_qid << UDBS_SEG_SHIFT;	/* seg offset */
3035			eq->udb_qid = 0;
3036		}
3037		eq->udb = (volatile void *)udb;
3038	}
3039
3040	return (rc);
3041}
3042
3043static int
3044free_eq(struct adapter *sc, struct sge_eq *eq)
3045{
3046	int rc;
3047
3048	if (eq->flags & EQ_ALLOCATED) {
3049		switch (eq->flags & EQ_TYPEMASK) {
3050		case EQ_CTRL:
3051			rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0,
3052			    eq->cntxt_id);
3053			break;
3054
3055		case EQ_ETH:
3056			rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0,
3057			    eq->cntxt_id);
3058			break;
3059
3060#ifdef TCP_OFFLOAD
3061		case EQ_OFLD:
3062			rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0,
3063			    eq->cntxt_id);
3064			break;
3065#endif
3066
3067		default:
3068			panic("%s: invalid eq type %d.", __func__,
3069			    eq->flags & EQ_TYPEMASK);
3070		}
3071		if (rc != 0) {
3072			device_printf(sc->dev,
3073			    "failed to free egress queue (%d): %d\n",
3074			    eq->flags & EQ_TYPEMASK, rc);
3075			return (rc);
3076		}
3077		eq->flags &= ~EQ_ALLOCATED;
3078	}
3079
3080	free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc);
3081
3082	if (mtx_initialized(&eq->eq_lock))
3083		mtx_destroy(&eq->eq_lock);
3084
3085	bzero(eq, sizeof(*eq));
3086	return (0);
3087}
3088
3089static int
3090alloc_wrq(struct adapter *sc, struct port_info *pi, struct sge_wrq *wrq,
3091    struct sysctl_oid *oid)
3092{
3093	int rc;
3094	struct sysctl_ctx_list *ctx = pi ? &pi->ctx : &sc->ctx;
3095	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3096
3097	rc = alloc_eq(sc, pi, &wrq->eq);
3098	if (rc)
3099		return (rc);
3100
3101	wrq->adapter = sc;
3102	STAILQ_INIT(&wrq->wr_list);
3103
3104	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
3105	    &wrq->eq.cntxt_id, 0, "SGE context id of the queue");
3106	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
3107	    CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.cidx, 0, sysctl_uint16, "I",
3108	    "consumer index");
3109	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pidx",
3110	    CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.pidx, 0, sysctl_uint16, "I",
3111	    "producer index");
3112	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs", CTLFLAG_RD,
3113	    &wrq->tx_wrs, "# of work requests");
3114	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "no_desc", CTLFLAG_RD,
3115	    &wrq->no_desc, 0,
3116	    "# of times queue ran out of hardware descriptors");
3117	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "unstalled", CTLFLAG_RD,
3118	    &wrq->eq.unstalled, 0, "# of times queue recovered after stall");
3119
3120	return (rc);
3121}
3122
3123static int
3124free_wrq(struct adapter *sc, struct sge_wrq *wrq)
3125{
3126	int rc;
3127
3128	rc = free_eq(sc, &wrq->eq);
3129	if (rc)
3130		return (rc);
3131
3132	bzero(wrq, sizeof(*wrq));
3133	return (0);
3134}
3135
3136static int
3137alloc_txq(struct port_info *pi, struct sge_txq *txq, int idx,
3138    struct sysctl_oid *oid)
3139{
3140	int rc;
3141	struct adapter *sc = pi->adapter;
3142	struct sge_eq *eq = &txq->eq;
3143	char name[16];
3144	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
3145
3146	rc = alloc_eq(sc, pi, eq);
3147	if (rc)
3148		return (rc);
3149
3150	txq->ifp = pi->ifp;
3151
3152	txq->sdesc = malloc(eq->cap * sizeof(struct tx_sdesc), M_CXGBE,
3153	    M_ZERO | M_WAITOK);
3154	txq->br = buf_ring_alloc(eq->qsize, M_CXGBE, M_WAITOK, &eq->eq_lock);
3155
3156	rc = bus_dma_tag_create(sc->dmat, 1, 0, BUS_SPACE_MAXADDR,
3157	    BUS_SPACE_MAXADDR, NULL, NULL, 64 * 1024, TX_SGL_SEGS,
3158	    BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL, NULL, &txq->tx_tag);
3159	if (rc != 0) {
3160		device_printf(sc->dev,
3161		    "failed to create tx DMA tag: %d\n", rc);
3162		return (rc);
3163	}
3164
3165	/*
3166	 * We can stuff ~10 frames in an 8-descriptor txpkts WR (8 is the SGE
3167	 * limit for any WR).  txq->no_dmamap events shouldn't occur if the map
3168	 * pool is sized for the worst case.
3169	 */
3170	rc = t4_alloc_tx_maps(&txq->txmaps, txq->tx_tag, eq->qsize * 10 / 8,
3171	    M_WAITOK);
3172	if (rc != 0) {
3173		device_printf(sc->dev, "failed to setup tx DMA maps: %d\n", rc);
3174		return (rc);
3175	}
3176
3177	snprintf(name, sizeof(name), "%d", idx);
3178	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
3179	    NULL, "tx queue");
3180	children = SYSCTL_CHILDREN(oid);
3181
3182	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
3183	    &eq->cntxt_id, 0, "SGE context id of the queue");
3184	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cidx",
3185	    CTLTYPE_INT | CTLFLAG_RD, &eq->cidx, 0, sysctl_uint16, "I",
3186	    "consumer index");
3187	SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "pidx",
3188	    CTLTYPE_INT | CTLFLAG_RD, &eq->pidx, 0, sysctl_uint16, "I",
3189	    "producer index");
3190
3191	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD,
3192	    &txq->txcsum, "# of times hardware assisted with checksum");
3193	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "vlan_insertion",
3194	    CTLFLAG_RD, &txq->vlan_insertion,
3195	    "# of times hardware inserted 802.1Q tag");
3196	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD,
3197	    &txq->tso_wrs, "# of TSO work requests");
3198	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD,
3199	    &txq->imm_wrs, "# of work requests with immediate data");
3200	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD,
3201	    &txq->sgl_wrs, "# of work requests with direct SGL");
3202	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD,
3203	    &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)");
3204	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts_wrs", CTLFLAG_RD,
3205	    &txq->txpkts_wrs, "# of txpkts work requests (multiple pkts/WR)");
3206	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts_pkts", CTLFLAG_RD,
3207	    &txq->txpkts_pkts, "# of frames tx'd using txpkts work requests");
3208
3209	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "br_drops", CTLFLAG_RD,
3210	    &txq->br->br_drops, "# of drops in the buf_ring for this queue");
3211	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "no_dmamap", CTLFLAG_RD,
3212	    &txq->no_dmamap, 0, "# of times txq ran out of DMA maps");
3213	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "no_desc", CTLFLAG_RD,
3214	    &txq->no_desc, 0, "# of times txq ran out of hardware descriptors");
3215	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "egr_update", CTLFLAG_RD,
3216	    &eq->egr_update, 0, "egress update notifications from the SGE");
3217	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "unstalled", CTLFLAG_RD,
3218	    &eq->unstalled, 0, "# of times txq recovered after stall");
3219
3220	return (rc);
3221}
3222
3223static int
3224free_txq(struct port_info *pi, struct sge_txq *txq)
3225{
3226	int rc;
3227	struct adapter *sc = pi->adapter;
3228	struct sge_eq *eq = &txq->eq;
3229
3230	rc = free_eq(sc, eq);
3231	if (rc)
3232		return (rc);
3233
3234	free(txq->sdesc, M_CXGBE);
3235
3236	if (txq->txmaps.maps)
3237		t4_free_tx_maps(&txq->txmaps, txq->tx_tag);
3238
3239	buf_ring_free(txq->br, M_CXGBE);
3240
3241	if (txq->tx_tag)
3242		bus_dma_tag_destroy(txq->tx_tag);
3243
3244	bzero(txq, sizeof(*txq));
3245	return (0);
3246}
3247
3248static void
3249oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error)
3250{
3251	bus_addr_t *ba = arg;
3252
3253	KASSERT(nseg == 1,
3254	    ("%s meant for single segment mappings only.", __func__));
3255
3256	*ba = error ? 0 : segs->ds_addr;
3257}
3258
3259static inline void
3260ring_fl_db(struct adapter *sc, struct sge_fl *fl)
3261{
3262	uint32_t n, v;
3263
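	/*
	 * Number of complete hardware descriptors filled since the last
	 * doorbell (fl->pidx counts buffers, 8 to a descriptor).
	 */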
3264	n = IDXDIFF(fl->pidx / 8, fl->dbidx, fl->sidx);
3265	MPASS(n > 0);
3266
3267	wmb();
3268	v = fl->dbval | V_PIDX(n);
3269	if (fl->udb)
3270		*fl->udb = htole32(v);
3271	else
3272		t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), v);
3273	IDXINCR(fl->dbidx, n, fl->sidx);
3274}
3275
3276/*
3277 * Fills up the freelist by allocating up to 'n' buffers.  Buffers that are
3278 * recycled do not count towards this allocation budget.
3279 *
3280 * Returns non-zero to indicate that this freelist should be added to the list
3281 * of starving freelists.
3282 */
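/*
 * Index bookkeeping: fl->pidx counts individual buffers (8 per 64-byte
 * hardware descriptor), while fl->dbidx, fl->sidx and fl->hw_cidx are in
 * units of hardware descriptors, hence the "/ 8" and "* 8" conversions below.
 */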
3283static int
3284refill_fl(struct adapter *sc, struct sge_fl *fl, int n)
3285{
3286	__be64 *d;
3287	struct fl_sdesc *sd;
3288	uintptr_t pa;
3289	caddr_t cl;
3290	struct cluster_layout *cll;
3291	struct sw_zone_info *swz;
3292	struct cluster_metadata *clm;
3293	uint16_t max_pidx;
3294	uint16_t hw_cidx = fl->hw_cidx;		/* stable snapshot */
3295
3296	FL_LOCK_ASSERT_OWNED(fl);
3297
3298	/*
3299	 * We always stop at the beginning of the hardware descriptor that's just
3300	 * before the one with the hw cidx.  This is to avoid hw pidx = hw cidx,
3301	 * which would mean an empty freelist to the chip.
3302	 */
3303	max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1;
3304	if (fl->pidx == max_pidx * 8)
3305		return (0);
3306
3307	d = &fl->desc[fl->pidx];
3308	sd = &fl->sdesc[fl->pidx];
3309	cll = &fl->cll_def;	/* default layout */
3310	swz = &sc->sge.sw_zone_info[cll->zidx];
3311
3312	while (n > 0) {
3313
3314		if (sd->cl != NULL) {
3315
3316			if (sd->nmbuf == 0) {
3317				/*
3318				 * Fast recycle without involving any atomics on
3319				 * the cluster's metadata (if the cluster has
3320				 * metadata).  This happens when all frames
3321				 * received in the cluster were small enough to
3322				 * fit within a single mbuf each.
3323				 */
3324				fl->cl_fast_recycled++;
3325#ifdef INVARIANTS
3326				clm = cl_metadata(sc, fl, &sd->cll, sd->cl);
3327				if (clm != NULL)
3328					MPASS(clm->refcount == 1);
3329#endif
3330				goto recycled_fast;
3331			}
3332
3333			/*
3334			 * Cluster is guaranteed to have metadata.  Clusters
3335			 * without metadata always take the fast recycle path
3336			 * when they're recycled.
3337			 */
3338			clm = cl_metadata(sc, fl, &sd->cll, sd->cl);
3339			MPASS(clm != NULL);
3340
3341			if (atomic_fetchadd_int(&clm->refcount, -1) == 1) {
3342				fl->cl_recycled++;
3343				counter_u64_add(extfree_rels, 1);
3344				goto recycled;
3345			}
3346			sd->cl = NULL;	/* gave up my reference */
3347		}
3348		MPASS(sd->cl == NULL);
3349alloc:
3350		cl = uma_zalloc(swz->zone, M_NOWAIT);
3351		if (__predict_false(cl == NULL)) {
3352			if (cll == &fl->cll_alt || fl->cll_alt.zidx == -1 ||
3353			    fl->cll_def.zidx == fl->cll_alt.zidx)
3354				break;
3355
3356			/* fall back to the safe zone */
3357			cll = &fl->cll_alt;
3358			swz = &sc->sge.sw_zone_info[cll->zidx];
3359			goto alloc;
3360		}
3361		fl->cl_allocated++;
3362		n--;
3363
3364		pa = pmap_kextract((vm_offset_t)cl);
3365		pa += cll->region1;
3366		sd->cl = cl;
3367		sd->cll = *cll;
3368		*d = htobe64(pa | cll->hwidx);
3369		clm = cl_metadata(sc, fl, cll, cl);
3370		if (clm != NULL) {
3371recycled:
3372#ifdef INVARIANTS
3373			clm->sd = sd;
3374#endif
3375			clm->refcount = 1;
3376		}
3377		sd->nmbuf = 0;
3378recycled_fast:
3379		d++;
3380		sd++;
3381		if (__predict_false(++fl->pidx % 8 == 0)) {
3382			uint16_t pidx = fl->pidx / 8;
3383
3384			if (__predict_false(pidx == fl->sidx)) {
3385				fl->pidx = 0;
3386				pidx = 0;
3387				sd = fl->sdesc;
3388				d = fl->desc;
3389			}
3390			if (pidx == max_pidx)
3391				break;
3392
3393			if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4)
3394				ring_fl_db(sc, fl);
3395		}
3396	}
3397
3398	if (fl->pidx / 8 != fl->dbidx)
3399		ring_fl_db(sc, fl);
3400
3401	return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING));
3402}
3403
3404/*
3405 * Attempt to refill all starving freelists.
3406 */
3407static void
3408refill_sfl(void *arg)
3409{
3410	struct adapter *sc = arg;
3411	struct sge_fl *fl, *fl_temp;
3412
3413	mtx_lock(&sc->sfl_lock);
3414	TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) {
3415		FL_LOCK(fl);
3416		refill_fl(sc, fl, 64);
3417		if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) {
3418			TAILQ_REMOVE(&sc->sfl, fl, link);
3419			fl->flags &= ~FL_STARVING;
3420		}
3421		FL_UNLOCK(fl);
3422	}
3423
3424	if (!TAILQ_EMPTY(&sc->sfl))
3425		callout_schedule(&sc->sfl_callout, hz / 5);
3426	mtx_unlock(&sc->sfl_lock);
3427}
3428
3429static int
3430alloc_fl_sdesc(struct sge_fl *fl)
3431{
3432
3433	fl->sdesc = malloc(fl->sidx * 8 * sizeof(struct fl_sdesc), M_CXGBE,
3434	    M_ZERO | M_WAITOK);
3435
3436	return (0);
3437}
3438
3439static void
3440free_fl_sdesc(struct adapter *sc, struct sge_fl *fl)
3441{
3442	struct fl_sdesc *sd;
3443	struct cluster_metadata *clm;
3444	struct cluster_layout *cll;
3445	int i;
3446
3447	sd = fl->sdesc;
3448	for (i = 0; i < fl->sidx * 8; i++, sd++) {
3449		if (sd->cl == NULL)
3450			continue;
3451
3452		cll = &sd->cll;
3453		clm = cl_metadata(sc, fl, cll, sd->cl);
3454		if (sd->nmbuf == 0)
3455			uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl);
3456		else if (clm && atomic_fetchadd_int(&clm->refcount, -1) == 1) {
3457			uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl);
3458			counter_u64_add(extfree_rels, 1);
3459		}
3460		sd->cl = NULL;
3461	}
3462
3463	free(fl->sdesc, M_CXGBE);
3464	fl->sdesc = NULL;
3465}
3466
3467int
3468t4_alloc_tx_maps(struct tx_maps *txmaps, bus_dma_tag_t tx_tag, int count,
3469    int flags)
3470{
3471	struct tx_map *txm;
3472	int i, rc;
3473
3474	txmaps->map_total = txmaps->map_avail = count;
3475	txmaps->map_cidx = txmaps->map_pidx = 0;
3476
3477	txmaps->maps = malloc(count * sizeof(struct tx_map), M_CXGBE,
3478	    M_ZERO | flags);
3479
3480	txm = txmaps->maps;
3481	for (i = 0; i < count; i++, txm++) {
3482		rc = bus_dmamap_create(tx_tag, 0, &txm->map);
3483		if (rc != 0)
3484			goto failed;
3485	}
3486
3487	return (0);
3488failed:
3489	while (--i >= 0) {
3490		txm--;
3491		bus_dmamap_destroy(tx_tag, txm->map);
3492	}
3493	KASSERT(txm == txmaps->maps, ("%s: EDOOFUS", __func__));
3494
3495	free(txmaps->maps, M_CXGBE);
3496	txmaps->maps = NULL;
3497
3498	return (rc);
3499}
3500
3501void
3502t4_free_tx_maps(struct tx_maps *txmaps, bus_dma_tag_t tx_tag)
3503{
3504	struct tx_map *txm;
3505	int i;
3506
3507	txm = txmaps->maps;
3508	for (i = 0; i < txmaps->map_total; i++, txm++) {
3509
3510		if (txm->m) {
3511			bus_dmamap_unload(tx_tag, txm->map);
3512			m_freem(txm->m);
3513			txm->m = NULL;
3514		}
3515
3516		bus_dmamap_destroy(tx_tag, txm->map);
3517	}
3518
3519	free(txmaps->maps, M_CXGBE);
3520	txmaps->maps = NULL;
3521}
3522
3523/*
3524 * We'll do immediate data tx for non-TSO, but only when not coalescing.  We're
3525 * willing to use up to 2 hardware descriptors, which means a maximum of 96 bytes
3526 * of immediate data.
3527 */
3528#define IMM_LEN ( \
3529      2 * EQ_ESIZE \
3530    - sizeof(struct fw_eth_tx_pkt_wr) \
3531    - sizeof(struct cpl_tx_pkt_core))
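
/*
 * For reference: with 64-byte hardware descriptors and 16 bytes each for
 * fw_eth_tx_pkt_wr and cpl_tx_pkt_core (sizes assumed from the firmware
 * headers), IMM_LEN works out to 2 * 64 - 16 - 16 = 96 bytes, matching the
 * comment above.
 */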
3532
3533/*
3534 * Returns non-zero on failure, no need to cleanup anything in that case.
3535 *
3536 * Note 1: We always try to defrag the mbuf if required and return EFBIG only
3537 * if the resulting chain still won't fit in a tx descriptor.
3538 *
3539 * Note 2: We'll pullup the mbuf chain if TSO is requested and the first mbuf
3540 * does not have the TCP header in it.
3541 */
3542static int
3543get_pkt_sgl(struct sge_txq *txq, struct mbuf **fp, struct sgl *sgl,
3544    int sgl_only)
3545{
3546	struct mbuf *m = *fp;
3547	struct tx_maps *txmaps;
3548	struct tx_map *txm;
3549	int rc, defragged = 0, n;
3550
3551	TXQ_LOCK_ASSERT_OWNED(txq);
3552
3553	if (m->m_pkthdr.tso_segsz)
3554		sgl_only = 1;	/* Do not allow immediate data with LSO */
3555
3556start:	sgl->nsegs = 0;
3557
3558	if (m->m_pkthdr.len <= IMM_LEN && !sgl_only)
3559		return (0);	/* nsegs = 0 tells caller to use imm. tx */
3560
3561	txmaps = &txq->txmaps;
3562	if (txmaps->map_avail == 0) {
3563		txq->no_dmamap++;
3564		return (ENOMEM);
3565	}
3566	txm = &txmaps->maps[txmaps->map_pidx];
3567
3568	if (m->m_pkthdr.tso_segsz && m->m_len < 50) {
3569		*fp = m_pullup(m, 50);
3570		m = *fp;
3571		if (m == NULL)
3572			return (ENOBUFS);
3573	}
3574
3575	rc = bus_dmamap_load_mbuf_sg(txq->tx_tag, txm->map, m, sgl->seg,
3576	    &sgl->nsegs, BUS_DMA_NOWAIT);
3577	if (rc == EFBIG && defragged == 0) {
3578		m = m_defrag(m, M_NOWAIT);
3579		if (m == NULL)
3580			return (EFBIG);
3581
3582		defragged = 1;
3583		*fp = m;
3584		goto start;
3585	}
3586	if (rc != 0)
3587		return (rc);
3588
3589	txm->m = m;
3590	txmaps->map_avail--;
3591	if (++txmaps->map_pidx == txmaps->map_total)
3592		txmaps->map_pidx = 0;
3593
3594	KASSERT(sgl->nsegs > 0 && sgl->nsegs <= TX_SGL_SEGS,
3595	    ("%s: bad DMA mapping (%d segments)", __func__, sgl->nsegs));
3596
3597	/*
3598	 * Store the # of flits required to hold this frame's SGL in nflits.  An
3599	 * SGL has a (ULPTX header + len0, addr0) tuple optionally followed by
3600	 * multiple (len0 + len1, addr0, addr1) tuples.  If addr1 is not used
3601	 * then len1 must be set to 0.
3602	 */
3603	n = sgl->nsegs - 1;
3604	sgl->nflits = (3 * n) / 2 + (n & 1) + 2;
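	/*
	 * Worked example: nsegs = 4 gives n = 3 and nflits = 4 + 1 + 2 = 7;
	 * one flit for the ULPTX header + len0, one for addr0, three for
	 * (len1|len2, addr1, addr2), and two for (len3|0, addr3).
	 */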
3605
3606	return (0);
3607}
3608
3609
3610/*
3611 * Releases all the txq resources used up in the specified sgl.
3612 */
3613static int
3614free_pkt_sgl(struct sge_txq *txq, struct sgl *sgl)
3615{
3616	struct tx_maps *txmaps;
3617	struct tx_map *txm;
3618
3619	TXQ_LOCK_ASSERT_OWNED(txq);
3620
3621	if (sgl->nsegs == 0)
3622		return (0);	/* didn't use any map */
3623
3624	txmaps = &txq->txmaps;
3625
3626	/* 1 pkt uses exactly 1 map, back it out */
3627
3628	txmaps->map_avail++;
3629	if (txmaps->map_pidx > 0)
3630		txmaps->map_pidx--;
3631	else
3632		txmaps->map_pidx = txmaps->map_total - 1;
3633
3634	txm = &txmaps->maps[txmaps->map_pidx];
3635	bus_dmamap_unload(txq->tx_tag, txm->map);
3636	txm->m = NULL;
3637
3638	return (0);
3639}
3640
3641static int
3642write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, struct mbuf *m,
3643    struct sgl *sgl)
3644{
3645	struct sge_eq *eq = &txq->eq;
3646	struct fw_eth_tx_pkt_wr *wr;
3647	struct cpl_tx_pkt_core *cpl;
3648	uint32_t ctrl;	/* used in many unrelated places */
3649	uint64_t ctrl1;
3650	int nflits, ndesc, pktlen;
3651	struct tx_sdesc *txsd;
3652	caddr_t dst;
3653
3654	TXQ_LOCK_ASSERT_OWNED(txq);
3655
3656	pktlen = m->m_pkthdr.len;
3657
3658	/*
3659	 * Do we have enough flits to send this frame out?
3660	 */
3661	ctrl = sizeof(struct cpl_tx_pkt_core);
3662	if (m->m_pkthdr.tso_segsz) {
3663		nflits = TXPKT_LSO_WR_HDR;
3664		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
3665	} else
3666		nflits = TXPKT_WR_HDR;
3667	if (sgl->nsegs > 0)
3668		nflits += sgl->nflits;
3669	else {
3670		nflits += howmany(pktlen, 8);
3671		ctrl += pktlen;
3672	}
3673	ndesc = howmany(nflits, 8);
3674	if (ndesc > eq->avail)
3675		return (ENOMEM);
3676
3677	/* Firmware work request header */
3678	wr = (void *)&eq->desc[eq->pidx];
3679	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
3680	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
3681	ctrl = V_FW_WR_LEN16(howmany(nflits, 2));
3682	if (eq->avail == ndesc) {
3683		if (!(eq->flags & EQ_CRFLUSHED)) {
3684			ctrl |= F_FW_WR_EQUEQ | F_FW_WR_EQUIQ;
3685			eq->flags |= EQ_CRFLUSHED;
3686		}
3687		eq->flags |= EQ_STALLED;
3688	}
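	/*
	 * F_FW_WR_EQUEQ/EQUIQ ask the SGE for an egress update once this WR
	 * has been processed, which is how the driver eventually learns that
	 * credits are available again for a stalled queue.
	 */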
3689
3690	wr->equiq_to_len16 = htobe32(ctrl);
3691	wr->r3 = 0;
3692
3693	if (m->m_pkthdr.tso_segsz) {
3694		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
3695		struct ether_header *eh;
3696		void *l3hdr;
3697#if defined(INET) || defined(INET6)
3698		struct tcphdr *tcp;
3699#endif
3700		uint16_t eh_type;
3701
3702		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
3703		    F_LSO_LAST_SLICE;
3704
3705		eh = mtod(m, struct ether_header *);
3706		eh_type = ntohs(eh->ether_type);
3707		if (eh_type == ETHERTYPE_VLAN) {
3708			struct ether_vlan_header *evh = (void *)eh;
3709
3710			ctrl |= V_LSO_ETHHDR_LEN(1);
3711			l3hdr = evh + 1;
3712			eh_type = ntohs(evh->evl_proto);
3713		} else
3714			l3hdr = eh + 1;
3715
3716		switch (eh_type) {
3717#ifdef INET6
3718		case ETHERTYPE_IPV6:
3719		{
3720			struct ip6_hdr *ip6 = l3hdr;
3721
3722			/*
3723			 * XXX-BZ For now we do not pretend to support
3724			 * IPv6 extension headers.
3725			 */
3726			KASSERT(ip6->ip6_nxt == IPPROTO_TCP, ("%s: CSUM_TSO "
3727			    "with ip6_nxt != TCP: %u", __func__, ip6->ip6_nxt));
3728			tcp = (struct tcphdr *)(ip6 + 1);
3729			ctrl |= F_LSO_IPV6;
3730			ctrl |= V_LSO_IPHDR_LEN(sizeof(*ip6) >> 2) |
3731			    V_LSO_TCPHDR_LEN(tcp->th_off);
3732			break;
3733		}
3734#endif
3735#ifdef INET
3736		case ETHERTYPE_IP:
3737		{
3738			struct ip *ip = l3hdr;
3739
3740			tcp = (void *)((uintptr_t)ip + ip->ip_hl * 4);
3741			ctrl |= V_LSO_IPHDR_LEN(ip->ip_hl) |
3742			    V_LSO_TCPHDR_LEN(tcp->th_off);
3743			break;
3744		}
3745#endif
3746		default:
3747			panic("%s: CSUM_TSO but no supported IP version "
3748			    "(0x%04x)", __func__, eh_type);
3749		}
3750
3751		lso->lso_ctrl = htobe32(ctrl);
3752		lso->ipid_ofst = htobe16(0);
3753		lso->mss = htobe16(m->m_pkthdr.tso_segsz);
3754		lso->seqno_offset = htobe32(0);
3755		lso->len = htobe32(pktlen);
3756
3757		cpl = (void *)(lso + 1);
3758
3759		txq->tso_wrs++;
3760	} else
3761		cpl = (void *)(wr + 1);
3762
3763	/* Checksum offload */
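	/*
	 * The sense is inverted: checksum insertion is on unless the DIS bits
	 * are set, so they are set whenever the stack did not request the
	 * corresponding offload for this frame.
	 */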
3764	ctrl1 = 0;
3765	if (!(m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)))
3766		ctrl1 |= F_TXPKT_IPCSUM_DIS;
3767	if (!(m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 |
3768	    CSUM_TCP_IPV6 | CSUM_TSO)))
3769		ctrl1 |= F_TXPKT_L4CSUM_DIS;
3770	if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
3771	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
3772		txq->txcsum++;	/* some hardware assistance provided */
3773
3774	/* VLAN tag insertion */
3775	if (m->m_flags & M_VLANTAG) {
3776		ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
3777		txq->vlan_insertion++;
3778	}
3779
3780	/* CPL header */
3781	cpl->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
3782	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(pi->adapter->pf));
3783	cpl->pack = 0;
3784	cpl->len = htobe16(pktlen);
3785	cpl->ctrl1 = htobe64(ctrl1);
3786
3787	/* Software descriptor */
3788	txsd = &txq->sdesc[eq->pidx];
3789	txsd->desc_used = ndesc;
3790
3791	eq->pending += ndesc;
3792	eq->avail -= ndesc;
3793	eq->pidx += ndesc;
3794	if (eq->pidx >= eq->cap)
3795		eq->pidx -= eq->cap;
3796
3797	/* SGL */
3798	dst = (void *)(cpl + 1);
3799	if (sgl->nsegs > 0) {
3800		txsd->credits = 1;
3801		txq->sgl_wrs++;
3802		write_sgl_to_txd(eq, sgl, &dst);
3803	} else {
3804		txsd->credits = 0;
3805		txq->imm_wrs++;
3806		for (; m; m = m->m_next) {
3807			copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
3808#ifdef INVARIANTS
3809			pktlen -= m->m_len;
3810#endif
3811		}
3812#ifdef INVARIANTS
3813		KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen));
3814#endif
3815
3816	}
3817
3818	txq->txpkt_wrs++;
3819	return (0);
3820}
3821
3822/*
3823 * Returns 0 to indicate that m has been accepted into a coalesced tx work
3824 * request.  It has either been folded into txpkts or txpkts was flushed and m
3825 * has started a new coalesced work request (as the first frame in a fresh
3826 * txpkts).
3827 *
3828 * Returns non-zero to indicate a failure; the caller is responsible for
3829 * transmitting m.  If there was anything in txpkts, it has been flushed.
3830 */
3831static int
3832add_to_txpkts(struct port_info *pi, struct sge_txq *txq, struct txpkts *txpkts,
3833    struct mbuf *m, struct sgl *sgl)
3834{
3835	struct sge_eq *eq = &txq->eq;
3836	int can_coalesce;
3837	struct tx_sdesc *txsd;
3838	int flits;
3839
3840	TXQ_LOCK_ASSERT_OWNED(txq);
3841
3842	KASSERT(sgl->nsegs, ("%s: can't coalesce imm data", __func__));
3843
3844	if (txpkts->npkt > 0) {
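		/*
		 * m can be folded into the existing work request only if it
		 * isn't a TSO frame, the combined WR still fits in
		 * TX_WR_FLITS and in the descriptors currently available
		 * (8 flits each), and the total payload stays under 64K
		 * (plen is a 16-bit field in the WR).
		 */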
3845		flits = TXPKTS_PKT_HDR + sgl->nflits;
3846		can_coalesce = m->m_pkthdr.tso_segsz == 0 &&
3847		    txpkts->nflits + flits <= TX_WR_FLITS &&
3848		    txpkts->nflits + flits <= eq->avail * 8 &&
3849		    txpkts->plen + m->m_pkthdr.len < 65536;
3850
3851		if (can_coalesce) {
3852			txpkts->npkt++;
3853			txpkts->nflits += flits;
3854			txpkts->plen += m->m_pkthdr.len;
3855
3856			txsd = &txq->sdesc[eq->pidx];
3857			txsd->credits++;
3858
3859			return (0);
3860		}
3861
3862		/*
3863		 * Couldn't coalesce m into txpkts.  The first order of business
3864		 * is to send txpkts on its way.  Then we'll revisit m.
3865		 */
3866		write_txpkts_wr(txq, txpkts);
3867	}
3868
3869	/*
3870	 * Check if we can start a new coalesced tx work request with m as
3871	 * the first packet in it.
3872	 */
3873
3874	KASSERT(txpkts->npkt == 0, ("%s: txpkts not empty", __func__));
3875
3876	flits = TXPKTS_WR_HDR + sgl->nflits;
3877	can_coalesce = m->m_pkthdr.tso_segsz == 0 &&
3878	    flits <= eq->avail * 8 && flits <= TX_WR_FLITS;
3879
3880	if (can_coalesce == 0)
3881		return (EINVAL);
3882
3883	/*
3884	 * Start a fresh coalesced tx WR with m as the first frame in it.
3885	 */
3886	txpkts->npkt = 1;
3887	txpkts->nflits = flits;
3888	txpkts->flitp = &eq->desc[eq->pidx].flit[2];
3889	txpkts->plen = m->m_pkthdr.len;
3890
3891	txsd = &txq->sdesc[eq->pidx];
3892	txsd->credits = 1;
3893
3894	return (0);
3895}
3896
3897/*
3898 * Note that write_txpkts_wr can never run out of hardware descriptors (but
3899 * write_txpkt_wr can).  add_to_txpkts ensures that a frame is accepted for
3900 * coalescing only if sufficient hardware descriptors are available.
3901 */
3902static void
3903write_txpkts_wr(struct sge_txq *txq, struct txpkts *txpkts)
3904{
3905	struct sge_eq *eq = &txq->eq;
3906	struct fw_eth_tx_pkts_wr *wr;
3907	struct tx_sdesc *txsd;
3908	uint32_t ctrl;
3909	int ndesc;
3910
3911	TXQ_LOCK_ASSERT_OWNED(txq);
3912
3913	ndesc = howmany(txpkts->nflits, 8);
3914
3915	wr = (void *)&eq->desc[eq->pidx];
3916	wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR));
3917	ctrl = V_FW_WR_LEN16(howmany(txpkts->nflits, 2));
3918	if (eq->avail == ndesc) {
3919		if (!(eq->flags & EQ_CRFLUSHED)) {
3920			ctrl |= F_FW_WR_EQUEQ | F_FW_WR_EQUIQ;
3921			eq->flags |= EQ_CRFLUSHED;
3922		}
3923		eq->flags |= EQ_STALLED;
3924	}
3925	wr->equiq_to_len16 = htobe32(ctrl);
3926	wr->plen = htobe16(txpkts->plen);
3927	wr->npkt = txpkts->npkt;
3928	wr->r3 = wr->type = 0;
3929
3930	/* Everything else already written */
3931
3932	txsd = &txq->sdesc[eq->pidx];
3933	txsd->desc_used = ndesc;
3934
3935	KASSERT(eq->avail >= ndesc, ("%s: out of descriptors", __func__));
3936
3937	eq->pending += ndesc;
3938	eq->avail -= ndesc;
3939	eq->pidx += ndesc;
3940	if (eq->pidx >= eq->cap)
3941		eq->pidx -= eq->cap;
3942
3943	txq->txpkts_pkts += txpkts->npkt;
3944	txq->txpkts_wrs++;
3945	txpkts->npkt = 0;	/* emptied */
3946}
3947
3948static inline void
3949write_ulp_cpl_sgl(struct port_info *pi, struct sge_txq *txq,
3950    struct txpkts *txpkts, struct mbuf *m, struct sgl *sgl)
3951{
3952	struct ulp_txpkt *ulpmc;
3953	struct ulptx_idata *ulpsc;
3954	struct cpl_tx_pkt_core *cpl;
3955	struct sge_eq *eq = &txq->eq;
3956	uintptr_t flitp, start, end;
3957	uint64_t ctrl;
3958	caddr_t dst;
3959
3960	KASSERT(txpkts->npkt > 0, ("%s: txpkts is empty", __func__));
3961
3962	start = (uintptr_t)eq->desc;
3963	end = (uintptr_t)eq->spg;
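	/*
	 * Each frame in a coalesced WR contributes a ULPTX master command, a
	 * ULPTX immediate-data subcommand, a cpl_tx_pkt_core, and finally its
	 * SGL, appended at txpkts->flitp.
	 */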
3964
3965	/* Checksum offload */
3966	ctrl = 0;
3967	if (!(m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)))
3968		ctrl |= F_TXPKT_IPCSUM_DIS;
3969	if (!(m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 |
3970	    CSUM_TCP_IPV6 | CSUM_TSO)))
3971		ctrl |= F_TXPKT_L4CSUM_DIS;
3972	if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
3973	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
3974		txq->txcsum++;	/* some hardware assistance provided */
3975
3976	/* VLAN tag insertion */
3977	if (m->m_flags & M_VLANTAG) {
3978		ctrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
3979		txq->vlan_insertion++;
3980	}
3981
3982	/*
3983	 * The previous packet's SGL must have ended at a 16 byte boundary (this
3984	 * is required by the firmware/hardware).  It follows that flitp cannot
3985	 * wrap around between the ULPTX master command and ULPTX subcommand (8
3986	 * bytes each), and that it can not wrap around in the middle of the
3987	 * cpl_tx_pkt_core either.
3988	 */
3989	flitp = (uintptr_t)txpkts->flitp;
3990	KASSERT((flitp & 0xf) == 0,
3991	    ("%s: last SGL did not end at 16 byte boundary: %p",
3992	    __func__, txpkts->flitp));
3993
3994	/* ULP master command */
3995	ulpmc = (void *)flitp;
3996	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0) |
3997	    V_ULP_TXPKT_FID(eq->iqid));
3998	ulpmc->len = htonl(howmany(sizeof(*ulpmc) + sizeof(*ulpsc) +
3999	    sizeof(*cpl) + 8 * sgl->nflits, 16));
4000
4001	/* ULP subcommand */
4002	ulpsc = (void *)(ulpmc + 1);
4003	ulpsc->cmd_more = htobe32(V_ULPTX_CMD((u32)ULP_TX_SC_IMM) |
4004	    F_ULP_TX_SC_MORE);
4005	ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core));
4006
4007	flitp += sizeof(*ulpmc) + sizeof(*ulpsc);
4008	if (flitp == end)
4009		flitp = start;
4010
4011	/* CPL_TX_PKT */
4012	cpl = (void *)flitp;
4013	cpl->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
4014	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(pi->adapter->pf));
4015	cpl->pack = 0;
4016	cpl->len = htobe16(m->m_pkthdr.len);
4017	cpl->ctrl1 = htobe64(ctrl);
4018
4019	flitp += sizeof(*cpl);
4020	if (flitp == end)
4021		flitp = start;
4022
4023	/* SGL for this frame */
4024	dst = (caddr_t)flitp;
4025	txpkts->nflits += write_sgl_to_txd(eq, sgl, &dst);
4026	txpkts->flitp = (void *)dst;
4027
4028	KASSERT(((uintptr_t)dst & 0xf) == 0,
4029	    ("%s: SGL ends at %p (not a 16 byte boundary)", __func__, dst));
4030}
4031
4032/*
4033 * If the SGL ends on an address that is not 16 byte aligned, this function will
4034 * add a 0 filled flit at the end.  It returns 1 in that case.
4035 */
4036static int
4037write_sgl_to_txd(struct sge_eq *eq, struct sgl *sgl, caddr_t *to)
4038{
4039	__be64 *flitp, *end;
4040	struct ulptx_sgl *usgl;
4041	bus_dma_segment_t *seg;
4042	int i, padded;
4043
4044	KASSERT(sgl->nsegs > 0 && sgl->nflits > 0,
4045	    ("%s: bad SGL - nsegs=%d, nflits=%d",
4046	    __func__, sgl->nsegs, sgl->nflits));
4047
4048	KASSERT(((uintptr_t)(*to) & 0xf) == 0,
4049	    ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to));
4050
4051	flitp = (__be64 *)(*to);
4052	end = flitp + sgl->nflits;
4053	seg = &sgl->seg[0];
4054	usgl = (void *)flitp;
4055
4056	/*
4057	 * We start at a 16 byte boundary somewhere inside the tx descriptor
4058	 * ring, so we're at least 16 bytes away from the status page.  There is
4059	 * no chance of a wrap around in the middle of usgl (which is 16 bytes).
4060	 */
4061
4062	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
4063	    V_ULPTX_NSGE(sgl->nsegs));
4064	usgl->len0 = htobe32(seg->ds_len);
4065	usgl->addr0 = htobe64(seg->ds_addr);
4066	seg++;
4067
4068	if ((uintptr_t)end <= (uintptr_t)eq->spg) {
4069
4070		/* Won't wrap around at all */
4071
4072		for (i = 0; i < sgl->nsegs - 1; i++, seg++) {
4073			usgl->sge[i / 2].len[i & 1] = htobe32(seg->ds_len);
4074			usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ds_addr);
4075		}
4076		if (i & 1)
4077			usgl->sge[i / 2].len[1] = htobe32(0);
4078	} else {
4079
4080		/* Will wrap somewhere in the rest of the SGL */
4081
4082		/* 2 flits already written, write the rest flit by flit */
4083		flitp = (void *)(usgl + 1);
4084		for (i = 0; i < sgl->nflits - 2; i++) {
4085			if ((uintptr_t)flitp == (uintptr_t)eq->spg)
4086				flitp = (void *)eq->desc;
4087			*flitp++ = get_flit(seg, sgl->nsegs - 1, i);
4088		}
4089		end = flitp;
4090	}
4091
4092	if ((uintptr_t)end & 0xf) {
4093		*(uint64_t *)end = 0;
4094		end++;
4095		padded = 1;
4096	} else
4097		padded = 0;
4098
4099	if ((uintptr_t)end == (uintptr_t)eq->spg)
4100		*to = (void *)eq->desc;
4101	else
4102		*to = (void *)end;
4103
4104	return (padded);
4105}
4106
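/*
 * Copy immediate data into the descriptor ring, wrapping around from the
 * status page back to the start of the ring when the data does not fit
 * contiguously.
 */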
4107static inline void
4108copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
4109{
4110	if (__predict_true((uintptr_t)(*to) + len <= (uintptr_t)eq->spg)) {
4111		bcopy(from, *to, len);
4112		(*to) += len;
4113	} else {
4114		int portion = (uintptr_t)eq->spg - (uintptr_t)(*to);
4115
4116		bcopy(from, *to, portion);
4117		from += portion;
4118		portion = len - portion;	/* remaining */
4119		bcopy(from, (void *)eq->desc, portion);
4120		(*to) = (caddr_t)eq->desc + portion;
4121	}
4122}
4123
4124static inline void
4125ring_eq_db(struct adapter *sc, struct sge_eq *eq)
4126{
4127	u_int db, pending;
4128
4129	db = eq->doorbells;
4130	pending = eq->pending;
4131	if (pending > 1)
4132		clrbit(&db, DOORBELL_WCWR);
4133	eq->pending = 0;
4134	wmb();
4135
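	/*
	 * ffs() picks the lowest-numbered doorbell mechanism enabled for this
	 * queue.  WCWR pushes the entire last descriptor through a
	 * write-combined mapping and only works for a single pending
	 * descriptor, which is why it was dropped from the candidates above
	 * when pending > 1.
	 */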
4136	switch (ffs(db) - 1) {
4137	case DOORBELL_UDB:
4138		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(pending));
4139		return;
4140
4141	case DOORBELL_WCWR: {
4142		volatile uint64_t *dst, *src;
4143		int i;
4144
4145		/*
4146		 * Queues whose 128B doorbell segment fits in the page do not
4147		 * use relative qid (udb_qid is always 0).  Only queues with
4148		 * doorbell segments can do WCWR.
4149		 */
4150		KASSERT(eq->udb_qid == 0 && pending == 1,
4151		    ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p",
4152		    __func__, eq->doorbells, pending, eq->pidx, eq));
4153
4154		dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET -
4155		    UDBS_DB_OFFSET);
4156		i = eq->pidx ? eq->pidx - 1 : eq->cap - 1;
4157		src = (void *)&eq->desc[i];
4158		while (src != (void *)&eq->desc[i + 1])
4159			*dst++ = *src++;
4160		wmb();
4161		return;
4162	}
4163
4164	case DOORBELL_UDBWC:
4165		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(pending));
4166		wmb();
4167		return;
4168
4169	case DOORBELL_KDB:
4170		t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL),
4171		    V_QID(eq->cntxt_id) | V_PIDX(pending));
4172		return;
4173	}
4174}
4175
4176static inline int
4177reclaimable(struct sge_eq *eq)
4178{
4179	unsigned int cidx;
4180
4181	cidx = eq->spg->cidx;	/* stable snapshot */
4182	cidx = be16toh(cidx);
4183
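	/*
	 * The hardware's cidx may have wrapped past the end of the ring while
	 * the driver's has not, hence the second case below.
	 */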
4184	if (cidx >= eq->cidx)
4185		return (cidx - eq->cidx);
4186	else
4187		return (cidx + eq->cap - eq->cidx);
4188}
4189
4190/*
4191 * There are "can_reclaim" tx descriptors ready to be reclaimed.  Reclaim as
4192 * many as possible but stop when there are around "n" mbufs to free.
4193 *
4194 * The actual number reclaimed is provided as the return value.
4195 */
4196static int
4197reclaim_tx_descs(struct sge_txq *txq, int can_reclaim, int n)
4198{
4199	struct tx_sdesc *txsd;
4200	struct tx_maps *txmaps;
4201	struct tx_map *txm;
4202	unsigned int reclaimed, maps;
4203	struct sge_eq *eq = &txq->eq;
4204
4205	TXQ_LOCK_ASSERT_OWNED(txq);
4206
4207	if (can_reclaim == 0)
4208		can_reclaim = reclaimable(eq);
4209
4210	maps = reclaimed = 0;
4211	while (can_reclaim && maps < n) {
4212		int ndesc;
4213
4214		txsd = &txq->sdesc[eq->cidx];
4215		ndesc = txsd->desc_used;
4216
4217		/* Firmware doesn't return "partial" credits. */
4218		KASSERT(can_reclaim >= ndesc,
4219		    ("%s: unexpected number of credits: %d, %d",
4220		    __func__, can_reclaim, ndesc));
4221
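		/*
		 * txsd->credits is the number of DMA maps charged to this
		 * group of descriptors (one per frame sent with an SGL;
		 * frames sent as immediate data hold none), which is also
		 * the number of mbuf chains freed below.
		 */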
4222		maps += txsd->credits;
4223
4224		reclaimed += ndesc;
4225		can_reclaim -= ndesc;
4226
4227		eq->cidx += ndesc;
4228		if (__predict_false(eq->cidx >= eq->cap))
4229			eq->cidx -= eq->cap;
4230	}
4231
4232	txmaps = &txq->txmaps;
4233	txm = &txmaps->maps[txmaps->map_cidx];
4234	if (maps)
4235		prefetch(txm->m);
4236
4237	eq->avail += reclaimed;
4238	KASSERT(eq->avail < eq->cap,	/* avail tops out at (cap - 1) */
4239	    ("%s: too many descriptors available", __func__));
4240
4241	txmaps->map_avail += maps;
4242	KASSERT(txmaps->map_avail <= txmaps->map_total,
4243	    ("%s: too many maps available", __func__));
4244
4245	while (maps--) {
4246		struct tx_map *next;
4247
4248		next = txm + 1;
4249		if (__predict_false(txmaps->map_cidx + 1 == txmaps->map_total))
4250			next = txmaps->maps;
4251		prefetch(next->m);
4252
4253		bus_dmamap_unload(txq->tx_tag, txm->map);
4254		m_freem(txm->m);
4255		txm->m = NULL;
4256
4257		txm = next;
4258		if (__predict_false(++txmaps->map_cidx == txmaps->map_total))
4259			txmaps->map_cidx = 0;
4260	}
4261
4262	return (reclaimed);
4263}
4264
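/*
 * Post a work request whose only effect is to solicit an egress update
 * (EQUEQ/EQUIQ), so the driver still hears back from the hardware even though
 * no regular work request is carrying those flags.
 */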
4265static void
4266write_eqflush_wr(struct sge_eq *eq)
4267{
4268	struct fw_eq_flush_wr *wr;
4269
4270	EQ_LOCK_ASSERT_OWNED(eq);
4271	KASSERT(eq->avail > 0, ("%s: no descriptors left.", __func__));
4272	KASSERT(!(eq->flags & EQ_CRFLUSHED), ("%s: flushed already", __func__));
4273
4274	wr = (void *)&eq->desc[eq->pidx];
4275	bzero(wr, sizeof(*wr));
4276	wr->opcode = FW_EQ_FLUSH_WR;
4277	wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(sizeof(*wr) / 16) |
4278	    F_FW_WR_EQUEQ | F_FW_WR_EQUIQ);
4279
4280	eq->flags |= (EQ_CRFLUSHED | EQ_STALLED);
4281	eq->pending++;
4282	eq->avail--;
4283	if (++eq->pidx == eq->cap)
4284		eq->pidx = 0;
4285}
4286
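/*
 * Return the idx'th flit of the SGL tail (everything after len0/addr0).  The
 * remaining segments are packed 3 flits per pair: the first flit of each
 * group holds two lengths, the next two hold the corresponding addresses.
 */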
4287static __be64
4288get_flit(bus_dma_segment_t *sgl, int nsegs, int idx)
4289{
4290	int i = (idx / 3) * 2;
4291
4292	switch (idx % 3) {
4293	case 0: {
4294		__be64 rc;
4295
4296		rc = htobe32(sgl[i].ds_len);
4297		if (i + 1 < nsegs)
4298			rc |= (uint64_t)htobe32(sgl[i + 1].ds_len) << 32;
4299
4300		return (rc);
4301	}
4302	case 1:
4303		return htobe64(sgl[i].ds_addr);
4304	case 2:
4305		return htobe64(sgl[i + 1].ds_addr);
4306	}
4307
4308	return (0);
4309}
4310
4311static void
4312find_best_refill_source(struct adapter *sc, struct sge_fl *fl, int maxp)
4313{
4314	int8_t zidx, hwidx, idx;
4315	uint16_t region1, region3;
4316	int spare, spare_needed, n;
4317	struct sw_zone_info *swz;
4318	struct hw_buf_info *hwb, *hwb_list = &sc->sge.hw_buf_info[0];
4319
4320	/*
4321	 * Buffer Packing: Look for a PAGE_SIZE or larger zone which has a bufsize
4322	 * large enough for the max payload and cluster metadata.  Otherwise
4323	 * settle for the largest bufsize that leaves enough room in the cluster
4324	 * for metadata.
4325	 *
4326	 * Without buffer packing: Look for the smallest zone which has a
4327	 * bufsize large enough for the max payload.  Settle for the largest
4328	 * bufsize available if there's nothing big enough for max payload.
4329	 */
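	/*
	 * Zones are walked from smallest to largest.  Within each zone the
	 * first hardware bufsize that leaves enough spare room becomes the
	 * best candidate so far, and the search ends as soon as a candidate
	 * can hold the whole payload (for packing, the zone must also be at
	 * least safest_rx_cluster).
	 */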
4330	spare_needed = fl->flags & FL_BUF_PACKING ? CL_METADATA_SIZE : 0;
4331	swz = &sc->sge.sw_zone_info[0];
4332	hwidx = -1;
4333	for (zidx = 0; zidx < SW_ZONE_SIZES; zidx++, swz++) {
4334		if (swz->size > largest_rx_cluster) {
4335			if (__predict_true(hwidx != -1))
4336				break;
4337
4338			/*
4339			 * This is a misconfiguration.  largest_rx_cluster is
4340			 * preventing us from finding a refill source.  See
4341			 * dev.t5nex.<n>.buffer_sizes to figure out why.
4342			 */
4343			device_printf(sc->dev, "largest_rx_cluster=%u leaves no"
4344			    " refill source for fl %p (dma %u).  Ignored.\n",
4345			    largest_rx_cluster, fl, maxp);
4346		}
4347		for (idx = swz->head_hwidx; idx != -1; idx = hwb->next) {
4348			hwb = &hwb_list[idx];
4349			spare = swz->size - hwb->size;
4350			if (spare < spare_needed)
4351				continue;
4352
4353			hwidx = idx;		/* best option so far */
4354			if (hwb->size >= maxp) {
4355
4356				if ((fl->flags & FL_BUF_PACKING) == 0)
4357					goto done; /* stop looking (not packing) */
4358
4359				if (swz->size >= safest_rx_cluster)
4360					goto done; /* stop looking (packing) */
4361			}
4362			break;		/* keep looking, next zone */
4363		}
4364	}
4365done:
4366	/* A usable hwidx has been located. */
4367	MPASS(hwidx != -1);
4368	hwb = &hwb_list[hwidx];
4369	zidx = hwb->zidx;
4370	swz = &sc->sge.sw_zone_info[zidx];
4371	region1 = 0;
4372	region3 = swz->size - hwb->size;
4373
4374	/*
4375	 * Stay within this zone and see if there is a better match when mbuf
4376	 * inlining is allowed.  Remember that the hwidx's are sorted in
4377	 * decreasing order of size (so in increasing order of spare area).
4378	 */
4379	for (idx = hwidx; idx != -1; idx = hwb->next) {
4380		hwb = &hwb_list[idx];
4381		spare = swz->size - hwb->size;
4382
4383		if (allow_mbufs_in_cluster == 0 || hwb->size < maxp)
4384			break;
4385
4386		/*
4387		 * Do not inline mbufs if doing so would violate the pad/pack
4388		 * boundary alignment requirement.
4389		 */
4390		if (fl_pad && (MSIZE % sc->sge.pad_boundary) != 0)
4391			continue;
4392		if (fl->flags & FL_BUF_PACKING &&
4393		    (MSIZE % sc->sge.pack_boundary) != 0)
4394			continue;
4395
4396		if (spare < CL_METADATA_SIZE + MSIZE)
4397			continue;
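		/*
		 * n is how many mbufs fit in the spare area alongside the
		 * cluster metadata; inlining more mbufs than the number of
		 * maxp-sized payloads the buffer itself can hold gains
		 * nothing, so stop considering smaller bufsizes at that
		 * point.
		 */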
4398		n = (spare - CL_METADATA_SIZE) / MSIZE;
4399		if (n > howmany(hwb->size, maxp))
4400			break;
4401
4402		hwidx = idx;
4403		if (fl->flags & FL_BUF_PACKING) {
4404			region1 = n * MSIZE;
4405			region3 = spare - region1;
4406		} else {
4407			region1 = MSIZE;
4408			region3 = spare - region1;
4409			break;
4410		}
4411	}
4412
4413	KASSERT(zidx >= 0 && zidx < SW_ZONE_SIZES,
4414	    ("%s: bad zone %d for fl %p, maxp %d", __func__, zidx, fl, maxp));
4415	KASSERT(hwidx >= 0 && hwidx < SGE_FLBUF_SIZES,
4416	    ("%s: bad hwidx %d for fl %p, maxp %d", __func__, hwidx, fl, maxp));
4417	KASSERT(region1 + sc->sge.hw_buf_info[hwidx].size + region3 ==
4418	    sc->sge.sw_zone_info[zidx].size,
4419	    ("%s: bad buffer layout for fl %p, maxp %d. "
4420		"cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
4421		sc->sge.sw_zone_info[zidx].size, region1,
4422		sc->sge.hw_buf_info[hwidx].size, region3));
4423	if (fl->flags & FL_BUF_PACKING || region1 > 0) {
4424		KASSERT(region3 >= CL_METADATA_SIZE,
4425		    ("%s: no room for metadata.  fl %p, maxp %d; "
4426		    "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
4427		    sc->sge.sw_zone_info[zidx].size, region1,
4428		    sc->sge.hw_buf_info[hwidx].size, region3));
4429		KASSERT(region1 % MSIZE == 0,
4430		    ("%s: bad mbuf region for fl %p, maxp %d. "
4431		    "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
4432		    sc->sge.sw_zone_info[zidx].size, region1,
4433		    sc->sge.hw_buf_info[hwidx].size, region3));
4434	}
4435
4436	fl->cll_def.zidx = zidx;
4437	fl->cll_def.hwidx = hwidx;
4438	fl->cll_def.region1 = region1;
4439	fl->cll_def.region3 = region3;
4440}
4441
4442static void
4443find_safe_refill_source(struct adapter *sc, struct sge_fl *fl)
4444{
4445	struct sge *s = &sc->sge;
4446	struct hw_buf_info *hwb;
4447	struct sw_zone_info *swz;
4448	int spare;
4449	int8_t hwidx;
4450
4451	if (fl->flags & FL_BUF_PACKING)
4452		hwidx = s->safe_hwidx2;	/* with room for metadata */
4453	else if (allow_mbufs_in_cluster && s->safe_hwidx2 != -1) {
4454		hwidx = s->safe_hwidx2;
4455		hwb = &s->hw_buf_info[hwidx];
4456		swz = &s->sw_zone_info[hwb->zidx];
4457		spare = swz->size - hwb->size;
4458
4459		/* no good if there isn't room for an mbuf as well */
4460		if (spare < CL_METADATA_SIZE + MSIZE)
4461			hwidx = s->safe_hwidx1;
4462	} else
4463		hwidx = s->safe_hwidx1;
4464
4465	if (hwidx == -1) {
4466		/* No fallback source */
4467		fl->cll_alt.hwidx = -1;
4468		fl->cll_alt.zidx = -1;
4469
4470		return;
4471	}
4472
4473	hwb = &s->hw_buf_info[hwidx];
4474	swz = &s->sw_zone_info[hwb->zidx];
4475	spare = swz->size - hwb->size;
4476	fl->cll_alt.hwidx = hwidx;
4477	fl->cll_alt.zidx = hwb->zidx;
4478	if (allow_mbufs_in_cluster &&
4479	    (fl_pad == 0 || (MSIZE % sc->sge.pad_boundary) == 0))
4480		fl->cll_alt.region1 = ((spare - CL_METADATA_SIZE) / MSIZE) * MSIZE;
4481	else
4482		fl->cll_alt.region1 = 0;
4483	fl->cll_alt.region3 = spare - fl->cll_alt.region1;
4484}
4485
4486static void
4487add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl)
4488{
4489	mtx_lock(&sc->sfl_lock);
4490	FL_LOCK(fl);
4491	if ((fl->flags & FL_DOOMED) == 0) {
4492		fl->flags |= FL_STARVING;
4493		TAILQ_INSERT_TAIL(&sc->sfl, fl, link);
4494		callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc);
4495	}
4496	FL_UNLOCK(fl);
4497	mtx_unlock(&sc->sfl_lock);
4498}
4499
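/*
 * A CPL_SGE_EGR_UPDATE is the firmware's answer to the EQUEQ/EQUIQ flags set
 * when an eq was about to run dry: credits have been returned, so a stalled
 * tx queue can be restarted via its taskqueue.
 */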
4500static int
4501handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss,
4502    struct mbuf *m)
4503{
4504	const struct cpl_sge_egr_update *cpl = (const void *)(rss + 1);
4505	unsigned int qid = G_EGR_QID(ntohl(cpl->opcode_qid));
4506	struct adapter *sc = iq->adapter;
4507	struct sge *s = &sc->sge;
4508	struct sge_eq *eq;
4509
4510	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
4511	    rss->opcode));
4512
4513	eq = s->eqmap[qid - s->eq_start];
4514	EQ_LOCK(eq);
4515	KASSERT(eq->flags & EQ_CRFLUSHED,
4516	    ("%s: unsolicited egress update", __func__));
4517	eq->flags &= ~EQ_CRFLUSHED;
4518	eq->egr_update++;
4519
4520	if (__predict_false(eq->flags & EQ_DOOMED))
4521		wakeup_one(eq);
4522	else if (eq->flags & EQ_STALLED && can_resume_tx(eq))
4523		taskqueue_enqueue(sc->tq[eq->tx_chan], &eq->tx_task);
4524	EQ_UNLOCK(eq);
4525
4526	return (0);
4527}
4528
4529/* handle_fw_msg works for both fw4_msg and fw6_msg (same data offset). */
4530CTASSERT(offsetof(struct cpl_fw4_msg, data) == \
4531    offsetof(struct cpl_fw6_msg, data));
4532
4533static int
4534handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
4535{
4536	struct adapter *sc = iq->adapter;
4537	const struct cpl_fw6_msg *cpl = (const void *)(rss + 1);
4538
4539	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
4540	    rss->opcode));
4541
4542	if (cpl->type == FW_TYPE_RSSCPL || cpl->type == FW6_TYPE_RSSCPL) {
4543		const struct rss_header *rss2;
4544
4545		rss2 = (const struct rss_header *)&cpl->data[0];
4546		return (sc->cpl_handler[rss2->opcode](iq, rss2, m));
4547	}
4548
4549	return (sc->fw_msg_handler[cpl->type](sc, &cpl->data[0]));
4550}
4551
4552static int
4553sysctl_uint16(SYSCTL_HANDLER_ARGS)
4554{
4555	uint16_t *id = arg1;
4556	int i = *id;
4557
4558	return sysctl_handle_int(oidp, &i, 0, req);
4559}
4560
4561static int
4562sysctl_bufsizes(SYSCTL_HANDLER_ARGS)
4563{
4564	struct sge *s = arg1;
4565	struct hw_buf_info *hwb = &s->hw_buf_info[0];
4566	struct sw_zone_info *swz = &s->sw_zone_info[0];
4567	int i, rc;
4568	struct sbuf sb;
4569	char c;
4570
4571	sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND);
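	/*
	 * List every hardware SGE buffer size; a '*' marks the sizes that can
	 * actually be used, i.e. those backed by a software zone no larger
	 * than largest_rx_cluster.
	 */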
4572	for (i = 0; i < SGE_FLBUF_SIZES; i++, hwb++) {
4573		if (hwb->zidx >= 0 && swz[hwb->zidx].size <= largest_rx_cluster)
4574			c = '*';
4575		else
4576			c = '\0';
4577
4578		sbuf_printf(&sb, "%u%c ", hwb->size, c);
4579	}
4580	sbuf_trim(&sb);
4581	sbuf_finish(&sb);
4582	rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
4583	sbuf_delete(&sb);
4584	return (rc);
4585}
4586