1/*-
2 * Copyright (c) 2014-2018, Matthew Macy <mmacy@mattmacy.io>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 *  1. Redistributions of source code must retain the above copyright notice,
9 *     this list of conditions and the following disclaimer.
10 *
11 *  2. Neither the name of Matthew Macy nor the names of its
12 *     contributors may be used to endorse or promote products derived from
13 *     this software without specific prior written permission.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25 * POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29#include "opt_inet.h"
30#include "opt_inet6.h"
31#include "opt_acpi.h"
32#include "opt_sched.h"
33
34#include <sys/param.h>
35#include <sys/types.h>
36#include <sys/bus.h>
37#include <sys/eventhandler.h>
38#include <sys/kernel.h>
39#include <sys/lock.h>
40#include <sys/mutex.h>
41#include <sys/module.h>
42#include <sys/kobj.h>
43#include <sys/rman.h>
44#include <sys/sbuf.h>
45#include <sys/smp.h>
46#include <sys/socket.h>
47#include <sys/sockio.h>
48#include <sys/sysctl.h>
49#include <sys/syslog.h>
50#include <sys/taskqueue.h>
51#include <sys/limits.h>
52
53#include <net/if.h>
54#include <net/if_var.h>
55#include <net/if_private.h>
56#include <net/if_types.h>
57#include <net/if_media.h>
58#include <net/bpf.h>
59#include <net/ethernet.h>
60#include <net/mp_ring.h>
61#include <net/debugnet.h>
62#include <net/pfil.h>
63#include <net/vnet.h>
64
65#include <netinet/in.h>
66#include <netinet/in_pcb.h>
67#include <netinet/tcp_lro.h>
68#include <netinet/in_systm.h>
69#include <netinet/if_ether.h>
70#include <netinet/ip.h>
71#include <netinet/ip6.h>
72#include <netinet/tcp.h>
73#include <netinet/ip_var.h>
74#include <netinet6/ip6_var.h>
75
76#include <machine/bus.h>
77#include <machine/in_cksum.h>
78
79#include <vm/vm.h>
80#include <vm/pmap.h>
81
82#include <dev/led/led.h>
83#include <dev/pci/pcireg.h>
84#include <dev/pci/pcivar.h>
85#include <dev/pci/pci_private.h>
86
87#include <net/iflib.h>
88
89#include "ifdi_if.h"
90
91#ifdef PCI_IOV
92#include <dev/pci/pci_iov.h>
93#endif
94
95#include <sys/bitstring.h>
96/*
97 * Enable accounting of every mbuf as it comes into and goes out of
98 * iflib's software descriptor references.
99 */
100#define MEMORY_LOGGING 0
101/*
102 * Enable mbuf vectors for compressing long mbuf chains
103 */
104
105/*
106 * NB:
107 * - Prefetching in tx cleaning should perhaps be a tunable. The distance ahead
108 *   we prefetch needs to be determined by the time spent in m_free versus
109 *   the cost of a prefetch. This will of course vary based on the workload:
110 *      - NFLX's m_free path is dominated by vm-based M_EXT manipulation, which
111 *        is quite expensive, thus suggesting very little prefetch.
112 *      - small packet forwarding, which just returns a single mbuf to
113 *        UMA, will typically be very fast relative to the cost of a memory
114 *        access.
115 */
116
117/*
118 * File organization:
119 *  - private structures
120 *  - iflib private utility functions
121 *  - ifnet functions
122 *  - vlan registry and other exported functions
123 *  - iflib public core functions
124 *
125 *
126 */
127static MALLOC_DEFINE(M_IFLIB, "iflib", "ifnet library");
128
129#define	IFLIB_RXEOF_MORE (1U << 0)
130#define	IFLIB_RXEOF_EMPTY (2U << 0)
131
132struct iflib_txq;
133typedef struct iflib_txq *iflib_txq_t;
134struct iflib_rxq;
135typedef struct iflib_rxq *iflib_rxq_t;
136struct iflib_fl;
137typedef struct iflib_fl *iflib_fl_t;
138
139struct iflib_ctx;
140
141static void iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid);
142static void iflib_timer(void *arg);
143static void iflib_tqg_detach(if_ctx_t ctx);
144
145typedef struct iflib_filter_info {
146	driver_filter_t *ifi_filter;
147	void *ifi_filter_arg;
148	struct grouptask *ifi_task;
149	void *ifi_ctx;
150} *iflib_filter_info_t;
151
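/*
 * Per-instance iflib context.  One is allocated for each device attached
 * through iflib; it ties together the driver's softc, the ifnet, the TX/RX
 * queue state, interrupt and taskqueue resources, and the per-device
 * sysctl tunables.
 */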
152struct iflib_ctx {
153	KOBJ_FIELDS;
154	/*
155	 * Pointer to hardware driver's softc
156	 */
157	void *ifc_softc;
158	device_t ifc_dev;
159	if_t ifc_ifp;
160
161	cpuset_t ifc_cpus;
162	if_shared_ctx_t ifc_sctx;
163	struct if_softc_ctx ifc_softc_ctx;
164
165	struct sx ifc_ctx_sx;
166	struct mtx ifc_state_mtx;
167
168	iflib_txq_t ifc_txqs;
169	iflib_rxq_t ifc_rxqs;
170	uint32_t ifc_if_flags;
171	uint32_t ifc_flags;
172	uint32_t ifc_max_fl_buf_size;
173	uint32_t ifc_rx_mbuf_sz;
174
175	int ifc_link_state;
176	int ifc_watchdog_events;
177	struct cdev *ifc_led_dev;
178	struct resource *ifc_msix_mem;
179
180	struct if_irq ifc_legacy_irq;
181	struct grouptask ifc_admin_task;
182	struct grouptask ifc_vflr_task;
183	struct iflib_filter_info ifc_filter_info;
184	struct ifmedia	ifc_media;
185	struct ifmedia	*ifc_mediap;
186
187	struct sysctl_oid *ifc_sysctl_node;
188	uint16_t ifc_sysctl_ntxqs;
189	uint16_t ifc_sysctl_nrxqs;
190	uint16_t ifc_sysctl_qs_eq_override;
191	uint16_t ifc_sysctl_rx_budget;
192	uint16_t ifc_sysctl_tx_abdicate;
193	uint16_t ifc_sysctl_core_offset;
194#define	CORE_OFFSET_UNSPECIFIED	0xffff
195	uint8_t  ifc_sysctl_separate_txrx;
196	uint8_t  ifc_sysctl_use_logical_cores;
197	uint16_t ifc_sysctl_extra_msix_vectors;
198	bool	 ifc_cpus_are_physical_cores;
199
200	qidx_t ifc_sysctl_ntxds[8];
201	qidx_t ifc_sysctl_nrxds[8];
202	struct if_txrx ifc_txrx;
203#define isc_txd_encap  ifc_txrx.ift_txd_encap
204#define isc_txd_flush  ifc_txrx.ift_txd_flush
205#define isc_txd_credits_update  ifc_txrx.ift_txd_credits_update
206#define isc_rxd_available ifc_txrx.ift_rxd_available
207#define isc_rxd_pkt_get ifc_txrx.ift_rxd_pkt_get
208#define isc_rxd_refill ifc_txrx.ift_rxd_refill
209#define isc_rxd_flush ifc_txrx.ift_rxd_flush
210#define isc_legacy_intr ifc_txrx.ift_legacy_intr
211#define isc_txq_select ifc_txrx.ift_txq_select
212#define isc_txq_select_v2 ifc_txrx.ift_txq_select_v2
213
214	eventhandler_tag ifc_vlan_attach_event;
215	eventhandler_tag ifc_vlan_detach_event;
216	struct ether_addr ifc_mac;
217};
218
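/*
 * Simple accessors for driver use, e.g. (illustrative):
 *
 *	struct foo_softc *sc = iflib_get_softc(ctx);
 *	device_t dev = iflib_get_dev(ctx);
 */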
219void *
220iflib_get_softc(if_ctx_t ctx)
221{
222
223	return (ctx->ifc_softc);
224}
225
226device_t
227iflib_get_dev(if_ctx_t ctx)
228{
229
230	return (ctx->ifc_dev);
231}
232
233if_t
234iflib_get_ifp(if_ctx_t ctx)
235{
236
237	return (ctx->ifc_ifp);
238}
239
240struct ifmedia *
241iflib_get_media(if_ctx_t ctx)
242{
243
244	return (ctx->ifc_mediap);
245}
246
247void
248iflib_set_mac(if_ctx_t ctx, uint8_t mac[ETHER_ADDR_LEN])
249{
250
251	bcopy(mac, ctx->ifc_mac.octet, ETHER_ADDR_LEN);
252}
253
254if_softc_ctx_t
255iflib_get_softc_ctx(if_ctx_t ctx)
256{
257
258	return (&ctx->ifc_softc_ctx);
259}
260
261if_shared_ctx_t
262iflib_get_sctx(if_ctx_t ctx)
263{
264
265	return (ctx->ifc_sctx);
266}
267
268uint16_t
269iflib_get_extra_msix_vectors_sysctl(if_ctx_t ctx)
270{
271
272	return (ctx->ifc_sysctl_extra_msix_vectors);
273}
274
275#define IP_ALIGNED(m) ((((uintptr_t)(m)->m_data) & 0x3) == 0x2)
276#define CACHE_PTR_INCREMENT (CACHE_LINE_SIZE/sizeof(void*))
277#define CACHE_PTR_NEXT(ptr) ((void *)(((uintptr_t)(ptr)+CACHE_LINE_SIZE-1) & (CACHE_LINE_SIZE-1)))
278
279#define LINK_ACTIVE(ctx) ((ctx)->ifc_link_state == LINK_STATE_UP)
280#define CTX_IS_VF(ctx) ((ctx)->ifc_sctx->isc_flags & IFLIB_IS_VF)
281
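/*
 * Software state kept parallel to the hardware descriptor rings.  Each array
 * below is indexed by hardware descriptor index, so entry i describes the
 * mbuf, cluster, and DMA map backing descriptor i.
 */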
282typedef struct iflib_sw_rx_desc_array {
283	bus_dmamap_t	*ifsd_map;         /* bus_dma maps for packet */
284	struct mbuf	**ifsd_m;           /* pkthdr mbufs */
285	caddr_t		*ifsd_cl;          /* direct cluster pointer for rx */
286	bus_addr_t	*ifsd_ba;          /* bus addr of cluster for rx */
287} iflib_rxsd_array_t;
288
289typedef struct iflib_sw_tx_desc_array {
290	bus_dmamap_t    *ifsd_map;         /* bus_dma maps for packet */
291	bus_dmamap_t	*ifsd_tso_map;     /* bus_dma maps for TSO packet */
292	struct mbuf    **ifsd_m;           /* pkthdr mbufs */
293} if_txsd_vec_t;
294
295/* magic number that should be high enough for any hardware */
296#define IFLIB_MAX_TX_SEGS		128
297#define IFLIB_RX_COPY_THRESH		128
298#define IFLIB_MAX_RX_REFRESH		32
299/* The minimum descriptors per second before we start coalescing */
300#define IFLIB_MIN_DESC_SEC		16384
301#define IFLIB_DEFAULT_TX_UPDATE_FREQ	16
302#define IFLIB_QUEUE_IDLE		0
303#define IFLIB_QUEUE_HUNG		1
304#define IFLIB_QUEUE_WORKING		2
305/* maximum number of txqs that can share an rx interrupt */
306#define IFLIB_MAX_TX_SHARED_INTR	4
307
308/* this should really scale with ring size - this is a fairly arbitrary value */
309#define TX_BATCH_SIZE			32
310
311#define IFLIB_RESTART_BUDGET		8
312
313#define	IFC_LEGACY		0x001
314#define	IFC_QFLUSH		0x002
315#define	IFC_MULTISEG		0x004
316#define	IFC_SPARE1		0x008
317#define	IFC_SC_ALLOCATED	0x010
318#define	IFC_INIT_DONE		0x020
319#define	IFC_PREFETCH		0x040
320#define	IFC_DO_RESET		0x080
321#define	IFC_DO_WATCHDOG		0x100
322#define	IFC_SPARE0		0x200
323#define	IFC_SPARE2		0x400
324#define	IFC_IN_DETACH		0x800
325
326#define	IFC_NETMAP_TX_IRQ	0x80000000
327
328#define CSUM_OFFLOAD		(CSUM_IP_TSO|CSUM_IP6_TSO|CSUM_IP| \
329				 CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP| \
330				 CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP)
331
332struct iflib_txq {
333	qidx_t		ift_in_use;
334	qidx_t		ift_cidx;
335	qidx_t		ift_cidx_processed;
336	qidx_t		ift_pidx;
337	uint8_t		ift_gen;
338	uint8_t		ift_br_offset;
339	uint16_t	ift_npending;
340	uint16_t	ift_db_pending;
341	uint16_t	ift_rs_pending;
342	/* implicit pad */
343	uint8_t		ift_txd_size[8];
344	uint64_t	ift_processed;
345	uint64_t	ift_cleaned;
346	uint64_t	ift_cleaned_prev;
347#if MEMORY_LOGGING
348	uint64_t	ift_enqueued;
349	uint64_t	ift_dequeued;
350#endif
351	uint64_t	ift_no_tx_dma_setup;
352	uint64_t	ift_no_desc_avail;
353	uint64_t	ift_mbuf_defrag_failed;
354	uint64_t	ift_mbuf_defrag;
355	uint64_t	ift_map_failed;
356	uint64_t	ift_txd_encap_efbig;
357	uint64_t	ift_pullups;
358	uint64_t	ift_last_timer_tick;
359
360	struct mtx	ift_mtx;
361	struct mtx	ift_db_mtx;
362
363	/* constant values */
364	if_ctx_t	ift_ctx;
365	struct ifmp_ring        *ift_br;
366	struct grouptask	ift_task;
367	qidx_t		ift_size;
368	uint16_t	ift_id;
369	struct callout	ift_timer;
370#ifdef DEV_NETMAP
371	struct callout	ift_netmap_timer;
372#endif /* DEV_NETMAP */
373
374	if_txsd_vec_t	ift_sds;
375	uint8_t		ift_qstatus;
376	uint8_t		ift_closed;
377	uint8_t		ift_update_freq;
378	struct iflib_filter_info ift_filter_info;
379	bus_dma_tag_t	ift_buf_tag;
380	bus_dma_tag_t	ift_tso_buf_tag;
381	iflib_dma_info_t	ift_ifdi;
382#define	MTX_NAME_LEN	32
383	char                    ift_mtx_name[MTX_NAME_LEN];
384	bus_dma_segment_t	ift_segs[IFLIB_MAX_TX_SEGS]  __aligned(CACHE_LINE_SIZE);
385#ifdef IFLIB_DIAGNOSTICS
386	uint64_t ift_cpu_exec_count[256];
387#endif
388} __aligned(CACHE_LINE_SIZE);
389
390struct iflib_fl {
391	qidx_t		ifl_cidx;
392	qidx_t		ifl_pidx;
393	qidx_t		ifl_credits;
394	uint8_t		ifl_gen;
395	uint8_t		ifl_rxd_size;
396#if MEMORY_LOGGING
397	uint64_t	ifl_m_enqueued;
398	uint64_t	ifl_m_dequeued;
399	uint64_t	ifl_cl_enqueued;
400	uint64_t	ifl_cl_dequeued;
401#endif
402	/* implicit pad */
403	bitstr_t 	*ifl_rx_bitmap;
404	qidx_t		ifl_fragidx;
405	/* constant */
406	qidx_t		ifl_size;
407	uint16_t	ifl_buf_size;
408	uint16_t	ifl_cltype;
409	uma_zone_t	ifl_zone;
410	iflib_rxsd_array_t	ifl_sds;
411	iflib_rxq_t	ifl_rxq;
412	uint8_t		ifl_id;
413	bus_dma_tag_t	ifl_buf_tag;
414	iflib_dma_info_t	ifl_ifdi;
415	uint64_t	ifl_bus_addrs[IFLIB_MAX_RX_REFRESH] __aligned(CACHE_LINE_SIZE);
416	qidx_t		ifl_rxd_idxs[IFLIB_MAX_RX_REFRESH];
417}  __aligned(CACHE_LINE_SIZE);
418
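/*
 * Number of descriptors currently in use in a ring of the given size.  A
 * worked example: size = 1024, cidx = 1000, pidx = 8 gives
 * used = 1024 - 1000 + 8 = 32.  When pidx == cidx the ring is either empty
 * or completely full; the generation flag is nonzero when the producer has
 * wrapped around to the consumer, so the two cases can be told apart.
 */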
419static inline qidx_t
420get_inuse(int size, qidx_t cidx, qidx_t pidx, uint8_t gen)
421{
422	qidx_t used;
423
424	if (pidx > cidx)
425		used = pidx - cidx;
426	else if (pidx < cidx)
427		used = size - cidx + pidx;
428	else if (gen == 0 && pidx == cidx)
429		used = 0;
430	else if (gen == 1 && pidx == cidx)
431		used = size;
432	else
433		panic("bad state");
434
435	return (used);
436}
437
438#define TXQ_AVAIL(txq) (txq->ift_size - get_inuse(txq->ift_size, txq->ift_cidx, txq->ift_pidx, txq->ift_gen))
439
440#define IDXDIFF(head, tail, wrap) \
441	((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head))
442
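/*
 * Per-queue receive state.  Each RX queue owns one or more free lists
 * (ifr_nfl), an optional completion queue consumer index, an LRO context,
 * and the grouptask that performs the RX processing for the queue.
 */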
443struct iflib_rxq {
444	if_ctx_t	ifr_ctx;
445	iflib_fl_t	ifr_fl;
446	uint64_t	ifr_rx_irq;
447	struct pfil_head	*pfil;
448	/*
449	 * If there is a separate completion queue (IFLIB_HAS_RXCQ), this is
450	 * the completion queue consumer index.  Otherwise it's unused.
451	 */
452	qidx_t		ifr_cq_cidx;
453	uint16_t	ifr_id;
454	uint8_t		ifr_nfl;
455	uint8_t		ifr_ntxqirq;
456	uint8_t		ifr_txqid[IFLIB_MAX_TX_SHARED_INTR];
457	uint8_t		ifr_fl_offset;
458	struct lro_ctrl			ifr_lc;
459	struct grouptask        ifr_task;
460	struct callout		ifr_watchdog;
461	struct iflib_filter_info ifr_filter_info;
462	iflib_dma_info_t		ifr_ifdi;
463
464	/* dynamically allocate if any drivers need a value substantially larger than this */
465	struct if_rxd_frag	ifr_frags[IFLIB_MAX_RX_SEGS] __aligned(CACHE_LINE_SIZE);
466#ifdef IFLIB_DIAGNOSTICS
467	uint64_t ifr_cpu_exec_count[256];
468#endif
469}  __aligned(CACHE_LINE_SIZE);
470
471typedef struct if_rxsd {
472	caddr_t *ifsd_cl;
473	iflib_fl_t ifsd_fl;
474} *if_rxsd_t;
475
476/* multiple of word size */
477#ifdef __LP64__
478#define PKT_INFO_SIZE	6
479#define RXD_INFO_SIZE	5
480#define PKT_TYPE uint64_t
481#else
482#define PKT_INFO_SIZE	11
483#define RXD_INFO_SIZE	8
484#define PKT_TYPE uint32_t
485#endif
486#define PKT_LOOP_BOUND  ((PKT_INFO_SIZE/3)*3)
487#define RXD_LOOP_BOUND  ((RXD_INFO_SIZE/4)*4)
488
489typedef struct if_pkt_info_pad {
490	PKT_TYPE pkt_val[PKT_INFO_SIZE];
491} *if_pkt_info_pad_t;
492typedef struct if_rxd_info_pad {
493	PKT_TYPE rxd_val[RXD_INFO_SIZE];
494} *if_rxd_info_pad_t;
495
496CTASSERT(sizeof(struct if_pkt_info_pad) == sizeof(struct if_pkt_info));
497CTASSERT(sizeof(struct if_rxd_info_pad) == sizeof(struct if_rxd_info));
498
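/*
 * The *_pad structures overlay if_pkt_info and if_rxd_info as plain arrays
 * of word-sized values so they can be cleared with a few stores instead of
 * a bzero() call; the CTASSERTs above keep the sizes in sync.
 */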
499static inline void
500pkt_info_zero(if_pkt_info_t pi)
501{
502	if_pkt_info_pad_t pi_pad;
503
504	pi_pad = (if_pkt_info_pad_t)pi;
505	pi_pad->pkt_val[0] = 0; pi_pad->pkt_val[1] = 0; pi_pad->pkt_val[2] = 0;
506	pi_pad->pkt_val[3] = 0; pi_pad->pkt_val[4] = 0; pi_pad->pkt_val[5] = 0;
507#ifndef __LP64__
508	pi_pad->pkt_val[6] = 0; pi_pad->pkt_val[7] = 0; pi_pad->pkt_val[8] = 0;
509	pi_pad->pkt_val[9] = 0; pi_pad->pkt_val[10] = 0;
510#endif
511}
512
513static inline void
514rxd_info_zero(if_rxd_info_t ri)
515{
516	if_rxd_info_pad_t ri_pad;
517	int i;
518
519	ri_pad = (if_rxd_info_pad_t)ri;
520	for (i = 0; i < RXD_LOOP_BOUND; i += 4) {
521		ri_pad->rxd_val[i] = 0;
522		ri_pad->rxd_val[i+1] = 0;
523		ri_pad->rxd_val[i+2] = 0;
524		ri_pad->rxd_val[i+3] = 0;
525	}
526#ifdef __LP64__
527	ri_pad->rxd_val[RXD_INFO_SIZE-1] = 0;
528#endif
529}
530
531/*
532 * Only allow a single packet to take up at most 1/nth of the tx ring
533 */
534#define MAX_SINGLE_PACKET_FRACTION 12
535#define IF_BAD_DMA (bus_addr_t)-1
536
537#define CTX_ACTIVE(ctx) ((if_getdrvflags((ctx)->ifc_ifp) & IFF_DRV_RUNNING))
538
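/*
 * Two locks protect a context: ifc_ctx_sx is a sleepable sx lock that
 * serializes configuration paths (init, stop, ioctl), while ifc_state_mtx
 * is a mutex guarding asynchronously updated state such as link state and
 * the IFC_* flags.  Typical usage is simply CTX_LOCK(ctx); ... CTX_UNLOCK(ctx);
 */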
539#define CTX_LOCK_INIT(_sc)  sx_init(&(_sc)->ifc_ctx_sx, "iflib ctx lock")
540#define CTX_LOCK(ctx) sx_xlock(&(ctx)->ifc_ctx_sx)
541#define CTX_UNLOCK(ctx) sx_xunlock(&(ctx)->ifc_ctx_sx)
542#define CTX_LOCK_DESTROY(ctx) sx_destroy(&(ctx)->ifc_ctx_sx)
543
544#define STATE_LOCK_INIT(_sc, _name)  mtx_init(&(_sc)->ifc_state_mtx, _name, "iflib state lock", MTX_DEF)
545#define STATE_LOCK(ctx) mtx_lock(&(ctx)->ifc_state_mtx)
546#define STATE_UNLOCK(ctx) mtx_unlock(&(ctx)->ifc_state_mtx)
547#define STATE_LOCK_DESTROY(ctx) mtx_destroy(&(ctx)->ifc_state_mtx)
548
549#define CALLOUT_LOCK(txq)	mtx_lock(&txq->ift_mtx)
550#define CALLOUT_UNLOCK(txq) 	mtx_unlock(&txq->ift_mtx)
551
552/* Our boot-time initialization hook */
553static int	iflib_module_event_handler(module_t, int, void *);
554
555static moduledata_t iflib_moduledata = {
556	"iflib",
557	iflib_module_event_handler,
558	NULL
559};
560
561DECLARE_MODULE(iflib, iflib_moduledata, SI_SUB_INIT_IF, SI_ORDER_ANY);
562MODULE_VERSION(iflib, 1);
563
564MODULE_DEPEND(iflib, pci, 1, 1, 1);
565MODULE_DEPEND(iflib, ether, 1, 1, 1);
566
567TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1);
568TASKQGROUP_DEFINE(if_config_tqg, 1, 1);
569
570#ifndef IFLIB_DEBUG_COUNTERS
571#ifdef INVARIANTS
572#define IFLIB_DEBUG_COUNTERS 1
573#else
574#define IFLIB_DEBUG_COUNTERS 0
575#endif /* !INVARIANTS */
576#endif
577
578static SYSCTL_NODE(_net, OID_AUTO, iflib, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
579    "iflib driver parameters");
580
581/*
582 * XXX need to ensure that this can't accidentally cause the head to be moved backwards
583 */
584static int iflib_min_tx_latency = 0;
585SYSCTL_INT(_net_iflib, OID_AUTO, min_tx_latency, CTLFLAG_RW,
586    &iflib_min_tx_latency, 0,
587    "minimize transmit latency at the possible expense of throughput");
588static int iflib_no_tx_batch = 0;
589SYSCTL_INT(_net_iflib, OID_AUTO, no_tx_batch, CTLFLAG_RW,
590    &iflib_no_tx_batch, 0,
591    "avoid transmit batching at the possible expense of throughput");
592static int iflib_timer_default = 1000;
593SYSCTL_INT(_net_iflib, OID_AUTO, timer_default, CTLFLAG_RW,
594    &iflib_timer_default, 0, "number of ticks between iflib_timer calls");
595
596
597#if IFLIB_DEBUG_COUNTERS
598
599static int iflib_tx_seen;
600static int iflib_tx_sent;
601static int iflib_tx_encap;
602static int iflib_rx_allocs;
603static int iflib_fl_refills;
604static int iflib_fl_refills_large;
605static int iflib_tx_frees;
606
607SYSCTL_INT(_net_iflib, OID_AUTO, tx_seen, CTLFLAG_RD, &iflib_tx_seen, 0,
608    "# TX mbufs seen");
609SYSCTL_INT(_net_iflib, OID_AUTO, tx_sent, CTLFLAG_RD, &iflib_tx_sent, 0,
610    "# TX mbufs sent");
611SYSCTL_INT(_net_iflib, OID_AUTO, tx_encap, CTLFLAG_RD, &iflib_tx_encap, 0,
612    "# TX mbufs encapped");
613SYSCTL_INT(_net_iflib, OID_AUTO, tx_frees, CTLFLAG_RD, &iflib_tx_frees, 0,
614    "# TX frees");
615SYSCTL_INT(_net_iflib, OID_AUTO, rx_allocs, CTLFLAG_RD, &iflib_rx_allocs, 0,
616    "# RX allocations");
617SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills, CTLFLAG_RD, &iflib_fl_refills, 0,
618    "# refills");
619SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills_large, CTLFLAG_RD,
620    &iflib_fl_refills_large, 0, "# large refills");
621
622static int iflib_txq_drain_flushing;
623static int iflib_txq_drain_oactive;
624static int iflib_txq_drain_notready;
625
626SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_flushing, CTLFLAG_RD,
627    &iflib_txq_drain_flushing, 0, "# drain flushes");
628SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_oactive, CTLFLAG_RD,
629    &iflib_txq_drain_oactive, 0, "# drain oactives");
630SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_notready, CTLFLAG_RD,
631    &iflib_txq_drain_notready, 0, "# drain notready");
632
633static int iflib_encap_load_mbuf_fail;
634static int iflib_encap_pad_mbuf_fail;
635static int iflib_encap_txq_avail_fail;
636static int iflib_encap_txd_encap_fail;
637
638SYSCTL_INT(_net_iflib, OID_AUTO, encap_load_mbuf_fail, CTLFLAG_RD,
639    &iflib_encap_load_mbuf_fail, 0, "# busdma load failures");
640SYSCTL_INT(_net_iflib, OID_AUTO, encap_pad_mbuf_fail, CTLFLAG_RD,
641    &iflib_encap_pad_mbuf_fail, 0, "# runt frame pad failures");
642SYSCTL_INT(_net_iflib, OID_AUTO, encap_txq_avail_fail, CTLFLAG_RD,
643    &iflib_encap_txq_avail_fail, 0, "# txq avail failures");
644SYSCTL_INT(_net_iflib, OID_AUTO, encap_txd_encap_fail, CTLFLAG_RD,
645    &iflib_encap_txd_encap_fail, 0, "# driver encap failures");
646
647static int iflib_task_fn_rxs;
648static int iflib_rx_intr_enables;
649static int iflib_fast_intrs;
650static int iflib_rx_unavail;
651static int iflib_rx_ctx_inactive;
652static int iflib_rx_if_input;
653static int iflib_rxd_flush;
654
655static int iflib_verbose_debug;
656
657SYSCTL_INT(_net_iflib, OID_AUTO, task_fn_rx, CTLFLAG_RD, &iflib_task_fn_rxs, 0,
658    "# task_fn_rx calls");
659SYSCTL_INT(_net_iflib, OID_AUTO, rx_intr_enables, CTLFLAG_RD,
660    &iflib_rx_intr_enables, 0, "# RX intr enables");
661SYSCTL_INT(_net_iflib, OID_AUTO, fast_intrs, CTLFLAG_RD, &iflib_fast_intrs, 0,
662    "# fast_intr calls");
663SYSCTL_INT(_net_iflib, OID_AUTO, rx_unavail, CTLFLAG_RD, &iflib_rx_unavail, 0,
664    "# times rxeof called with no available data");
665SYSCTL_INT(_net_iflib, OID_AUTO, rx_ctx_inactive, CTLFLAG_RD,
666    &iflib_rx_ctx_inactive, 0, "# times rxeof called with inactive context");
667SYSCTL_INT(_net_iflib, OID_AUTO, rx_if_input, CTLFLAG_RD, &iflib_rx_if_input,
668    0, "# times rxeof called if_input");
669SYSCTL_INT(_net_iflib, OID_AUTO, rxd_flush, CTLFLAG_RD, &iflib_rxd_flush, 0,
670    "# times rxd_flush called");
671SYSCTL_INT(_net_iflib, OID_AUTO, verbose_debug, CTLFLAG_RW,
672    &iflib_verbose_debug, 0, "enable verbose debugging");
673
674#define DBG_COUNTER_INC(name) atomic_add_int(&(iflib_ ## name), 1)
675static void
676iflib_debug_reset(void)
677{
678	iflib_tx_seen = iflib_tx_sent = iflib_tx_encap = iflib_rx_allocs =
679		iflib_fl_refills = iflib_fl_refills_large = iflib_tx_frees =
680		iflib_txq_drain_flushing = iflib_txq_drain_oactive =
681		iflib_txq_drain_notready =
682		iflib_encap_load_mbuf_fail = iflib_encap_pad_mbuf_fail =
683		iflib_encap_txq_avail_fail = iflib_encap_txd_encap_fail =
684		iflib_task_fn_rxs = iflib_rx_intr_enables = iflib_fast_intrs =
685		iflib_rx_unavail =
686		iflib_rx_ctx_inactive = iflib_rx_if_input =
687		iflib_rxd_flush = 0;
688}
689
690#else
691#define DBG_COUNTER_INC(name)
692static void iflib_debug_reset(void) {}
693#endif
694
695#define IFLIB_DEBUG 0
696
697static void iflib_tx_structures_free(if_ctx_t ctx);
698static void iflib_rx_structures_free(if_ctx_t ctx);
699static int iflib_queues_alloc(if_ctx_t ctx);
700static int iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq);
701static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget);
702static int iflib_qset_structures_setup(if_ctx_t ctx);
703static int iflib_msix_init(if_ctx_t ctx);
704static int iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filterarg, int *rid, const char *str);
705static void iflib_txq_check_drain(iflib_txq_t txq, int budget);
706static uint32_t iflib_txq_can_drain(struct ifmp_ring *);
707#ifdef ALTQ
708static void iflib_altq_if_start(if_t ifp);
709static int iflib_altq_if_transmit(if_t ifp, struct mbuf *m);
710#endif
711static int iflib_register(if_ctx_t);
712static void iflib_deregister(if_ctx_t);
713static void iflib_unregister_vlan_handlers(if_ctx_t ctx);
714static uint16_t iflib_get_mbuf_size_for(unsigned int size);
715static void iflib_init_locked(if_ctx_t ctx);
716static void iflib_add_device_sysctl_pre(if_ctx_t ctx);
717static void iflib_add_device_sysctl_post(if_ctx_t ctx);
718static void iflib_ifmp_purge(iflib_txq_t txq);
719static void _iflib_pre_assert(if_softc_ctx_t scctx);
720static void iflib_stop(if_ctx_t ctx);
721static void iflib_if_init_locked(if_ctx_t ctx);
722static void iflib_free_intr_mem(if_ctx_t ctx);
723#ifndef __NO_STRICT_ALIGNMENT
724static struct mbuf * iflib_fixup_rx(struct mbuf *m);
725#endif
726
727static SLIST_HEAD(cpu_offset_list, cpu_offset) cpu_offsets =
728    SLIST_HEAD_INITIALIZER(cpu_offsets);
729struct cpu_offset {
730	SLIST_ENTRY(cpu_offset) entries;
731	cpuset_t	set;
732	unsigned int	refcount;
733	uint16_t	next_cpuid;
734};
735static struct mtx cpu_offset_mtx;
736MTX_SYSINIT(iflib_cpu_offset, &cpu_offset_mtx, "iflib_cpu_offset lock",
737    MTX_DEF);
738
739DEBUGNET_DEFINE(iflib);
740
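/*
 * When a driver sets IFLIB_HAS_RXCQ/IFLIB_HAS_TXCQ, entry 0 of the
 * corresponding isc_nrxd/isc_ntxd array describes the completion queue, so
 * the descriptor count for the actual free list or TX ring lives at index 1;
 * otherwise index 0 is used.
 */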
741static int
742iflib_num_rx_descs(if_ctx_t ctx)
743{
744	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
745	if_shared_ctx_t sctx = ctx->ifc_sctx;
746	uint16_t first_rxq = (sctx->isc_flags & IFLIB_HAS_RXCQ) ? 1 : 0;
747
748	return (scctx->isc_nrxd[first_rxq]);
749}
750
751static int
752iflib_num_tx_descs(if_ctx_t ctx)
753{
754	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
755	if_shared_ctx_t sctx = ctx->ifc_sctx;
756	uint16_t first_txq = (sctx->isc_flags & IFLIB_HAS_TXCQ) ? 1 : 0;
757
758	return (scctx->isc_ntxd[first_txq]);
759}
760
761#ifdef DEV_NETMAP
762#include <sys/selinfo.h>
763#include <net/netmap.h>
764#include <dev/netmap/netmap_kern.h>
765
766MODULE_DEPEND(iflib, netmap, 1, 1, 1);
767
768static int netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, bool init);
769static void iflib_netmap_timer(void *arg);
770
771/*
772 * device-specific sysctl variables:
773 *
774 * iflib_crcstrip: 1: strip CRC on RX frames (default), 0: keep it.
775 *	During regular operations the CRC is stripped, but on some
776 *	hardware reception of frames not multiple of 64 is slower,
777 *	so using crcstrip=0 helps in benchmarks.
778 *
779 * iflib_rx_miss, iflib_rx_miss_bufs:
780 *	count packets that might be missed due to lost interrupts.
781 */
782SYSCTL_DECL(_dev_netmap);
783/*
784 * The xl driver by default strips CRCs and we do not override it.
785 */
786
787int iflib_crcstrip = 1;
788SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_crcstrip,
789    CTLFLAG_RW, &iflib_crcstrip, 1, "strip CRC on RX frames");
790
791int iflib_rx_miss, iflib_rx_miss_bufs;
792SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss,
793    CTLFLAG_RW, &iflib_rx_miss, 0, "potentially missed RX intr");
794SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss_bufs,
795    CTLFLAG_RW, &iflib_rx_miss_bufs, 0, "potentially missed RX intr bufs");
796
797/*
798 * Register/unregister. We are already under netmap lock.
799 * Only called on the first register or the last unregister.
800 */
801static int
802iflib_netmap_register(struct netmap_adapter *na, int onoff)
803{
804	if_t ifp = na->ifp;
805	if_ctx_t ctx = if_getsoftc(ifp);
806	int status;
807
808	CTX_LOCK(ctx);
809	if (!CTX_IS_VF(ctx))
810		IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip);
811
812	iflib_stop(ctx);
813
814	/*
815	 * Enable (or disable) netmap flags, and intercept (or restore)
816	 * ifp->if_transmit. This is done once the device has been stopped
817	 * to prevent race conditions. Also, this must be done after
818	 * calling netmap_disable_all_rings() and before calling
819	 * netmap_enable_all_rings(), so that these two functions see the
820	 * updated state of the NAF_NETMAP_ON bit.
821	 */
822	if (onoff) {
823		nm_set_native_flags(na);
824	} else {
825		nm_clear_native_flags(na);
826	}
827
828	iflib_init_locked(ctx);
829	IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip); // XXX why twice ?
830	status = if_getdrvflags(ifp) & IFF_DRV_RUNNING ? 0 : 1;
831	if (status)
832		nm_clear_native_flags(na);
833	CTX_UNLOCK(ctx);
834	return (status);
835}
836
837static int
838iflib_netmap_config(struct netmap_adapter *na, struct nm_config_info *info)
839{
840	if_t ifp = na->ifp;
841	if_ctx_t ctx = if_getsoftc(ifp);
842	iflib_rxq_t rxq = &ctx->ifc_rxqs[0];
843	iflib_fl_t fl = &rxq->ifr_fl[0];
844
845	info->num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets;
846	info->num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets;
847	info->num_tx_descs = iflib_num_tx_descs(ctx);
848	info->num_rx_descs = iflib_num_rx_descs(ctx);
849	info->rx_buf_maxsize = fl->ifl_buf_size;
850	nm_prinf("txr %u rxr %u txd %u rxd %u rbufsz %u",
851		info->num_tx_rings, info->num_rx_rings, info->num_tx_descs,
852		info->num_rx_descs, info->rx_buf_maxsize);
853
854	return (0);
855}
856
857static int
858netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, bool init)
859{
860	struct netmap_adapter *na = kring->na;
861	u_int const lim = kring->nkr_num_slots - 1;
862	struct netmap_ring *ring = kring->ring;
863	bus_dmamap_t *map;
864	struct if_rxd_update iru;
865	if_ctx_t ctx = rxq->ifr_ctx;
866	iflib_fl_t fl = &rxq->ifr_fl[0];
867	u_int nic_i_first, nic_i;
868	u_int nm_i;
869	int i, n;
870#if IFLIB_DEBUG_COUNTERS
871	int rf_count = 0;
872#endif
873
874	/*
875	 * This function is used both at initialization and in rxsync.
876	 * At initialization we need to prepare (with isc_rxd_refill())
877	 * all the netmap buffers currently owned by the kernel, in
878	 * such a way to keep fl->ifl_pidx and kring->nr_hwcur in sync
879	 * (except for kring->nkr_hwofs). These may be less than
880	 * kring->nkr_num_slots if netmap_reset() was called while
881	 * an application using the kring still owned some
882	 * buffers.
883	 * At rxsync time, both indexes point to the next buffer to be
884	 * refilled.
885	 * In any case we publish (with isc_rxd_flush()) up to
886	 * (fl->ifl_pidx - 1) % N (included), to prevent the NIC tail/prod
887	 * pointer from overrunning the head/cons pointer, although this is
888	 * not necessary for some NICs (e.g. vmx).
889	 */
890	if (__predict_false(init)) {
891		n = kring->nkr_num_slots - nm_kr_rxspace(kring);
892	} else {
893		n = kring->rhead - kring->nr_hwcur;
894		if (n == 0)
895			return (0); /* Nothing to do. */
896		if (n < 0)
897			n += kring->nkr_num_slots;
898	}
899
900	iru_init(&iru, rxq, 0 /* flid */);
901	map = fl->ifl_sds.ifsd_map;
902	nic_i = fl->ifl_pidx;
903	nm_i = netmap_idx_n2k(kring, nic_i);
904	if (__predict_false(init)) {
905		/*
906		 * On init/reset, nic_i must be 0, and we must
907		 * start to refill from hwtail (see netmap_reset()).
908		 */
909		MPASS(nic_i == 0);
910		MPASS(nm_i == kring->nr_hwtail);
911	} else
912		MPASS(nm_i == kring->nr_hwcur);
913	DBG_COUNTER_INC(fl_refills);
914	while (n > 0) {
915#if IFLIB_DEBUG_COUNTERS
916		if (++rf_count == 9)
917			DBG_COUNTER_INC(fl_refills_large);
918#endif
919		nic_i_first = nic_i;
920		for (i = 0; n > 0 && i < IFLIB_MAX_RX_REFRESH; n--, i++) {
921			struct netmap_slot *slot = &ring->slot[nm_i];
922			uint64_t paddr;
923			void *addr = PNMB(na, slot, &paddr);
924
925			MPASS(i < IFLIB_MAX_RX_REFRESH);
926
927			if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
928				return (netmap_ring_reinit(kring));
929
930			fl->ifl_bus_addrs[i] = paddr +
931			    nm_get_offset(kring, slot);
932			fl->ifl_rxd_idxs[i] = nic_i;
933
934			if (__predict_false(init)) {
935				netmap_load_map(na, fl->ifl_buf_tag,
936				    map[nic_i], addr);
937			} else if (slot->flags & NS_BUF_CHANGED) {
938				/* buffer has changed, reload map */
939				netmap_reload_map(na, fl->ifl_buf_tag,
940				    map[nic_i], addr);
941			}
942			bus_dmamap_sync(fl->ifl_buf_tag, map[nic_i],
943			    BUS_DMASYNC_PREREAD);
944			slot->flags &= ~NS_BUF_CHANGED;
945
946			nm_i = nm_next(nm_i, lim);
947			nic_i = nm_next(nic_i, lim);
948		}
949
950		iru.iru_pidx = nic_i_first;
951		iru.iru_count = i;
952		ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
953	}
954	fl->ifl_pidx = nic_i;
955	/*
956	 * At the end of the loop we must have refilled everything
957	 * we could possibly refill.
958	 */
959	MPASS(nm_i == kring->rhead);
960	kring->nr_hwcur = nm_i;
961
962	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
963	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
964	ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id,
965	    nm_prev(nic_i, lim));
966	DBG_COUNTER_INC(rxd_flush);
967
968	return (0);
969}
970
971#define NETMAP_TX_TIMER_US	90
972
973/*
974 * Reconcile kernel and user view of the transmit ring.
975 *
976 * All information is in the kring.
977 * Userspace wants to send packets up to the one before kring->rhead,
978 * kernel knows kring->nr_hwcur is the first unsent packet.
979 *
980 * Here we push packets out (as many as possible), and possibly
981 * reclaim buffers from previously completed transmission.
982 *
983 * The caller (netmap) guarantees that there is only one instance
984 * running at any time. Any interference with other driver
985 * methods should be handled by the individual drivers.
986 */
987static int
988iflib_netmap_txsync(struct netmap_kring *kring, int flags)
989{
990	struct netmap_adapter *na = kring->na;
991	if_t ifp = na->ifp;
992	struct netmap_ring *ring = kring->ring;
993	u_int nm_i;	/* index into the netmap kring */
994	u_int nic_i;	/* index into the NIC ring */
995	u_int const lim = kring->nkr_num_slots - 1;
996	u_int const head = kring->rhead;
997	struct if_pkt_info pi;
998	int tx_pkts = 0, tx_bytes = 0;
999
1000	/*
1001	 * interrupts on every tx packet are expensive so request
1002	 * them every half ring, or where NS_REPORT is set
1003	 */
1004	u_int report_frequency = kring->nkr_num_slots >> 1;
1005	/* device-specific */
1006	if_ctx_t ctx = if_getsoftc(ifp);
1007	iflib_txq_t txq = &ctx->ifc_txqs[kring->ring_id];
1008
1009	bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
1010	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1011
1012	/*
1013	 * First part: process new packets to send.
1014	 * nm_i is the current index in the netmap kring,
1015	 * nic_i is the corresponding index in the NIC ring.
1016	 *
1017	 * If we have packets to send (nm_i != head)
1018	 * iterate over the netmap ring, fetch length and update
1019	 * the corresponding slot in the NIC ring. Some drivers also
1020	 * need to update the buffer's physical address in the NIC slot
1021	 * even if NS_BUF_CHANGED is not set (PNMB computes the addresses).
1022	 *
1023	 * The netmap_reload_map() call is especially expensive,
1024	 * even when (as in this case) the tag is 0, so only do it
1025	 * when the buffer has actually changed.
1026	 *
1027	 * If possible do not set the report/intr bit on all slots,
1028	 * but only a few times per ring or when NS_REPORT is set.
1029	 *
1030	 * Finally, on 10G and faster drivers, it might be useful
1031	 * to prefetch the next slot and txr entry.
1032	 */
1033
1034	nm_i = kring->nr_hwcur;
1035	if (nm_i != head) {	/* we have new packets to send */
1036		uint32_t pkt_len = 0, seg_idx = 0;
1037		int nic_i_start = -1, flags = 0;
1038		pkt_info_zero(&pi);
1039		pi.ipi_segs = txq->ift_segs;
1040		pi.ipi_qsidx = kring->ring_id;
1041		nic_i = netmap_idx_k2n(kring, nm_i);
1042
1043		__builtin_prefetch(&ring->slot[nm_i]);
1044		__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i]);
1045		__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i]);
1046
1047		while (nm_i != head) {
1048			struct netmap_slot *slot = &ring->slot[nm_i];
1049			uint64_t offset = nm_get_offset(kring, slot);
1050			u_int len = slot->len;
1051			uint64_t paddr;
1052			void *addr = PNMB(na, slot, &paddr);
1053
1054			flags |= (slot->flags & NS_REPORT ||
1055				nic_i == 0 || nic_i == report_frequency) ?
1056				IPI_TX_INTR : 0;
1057
1058			/*
1059			 * If this is the first packet fragment, save the
1060			 * index of the first NIC slot for later.
1061			 */
1062			if (nic_i_start < 0)
1063				nic_i_start = nic_i;
1064
1065			pi.ipi_segs[seg_idx].ds_addr = paddr + offset;
1066			pi.ipi_segs[seg_idx].ds_len = len;
1067			if (len) {
1068				pkt_len += len;
1069				seg_idx++;
1070			}
1071
1072			if (!(slot->flags & NS_MOREFRAG)) {
1073				pi.ipi_len = pkt_len;
1074				pi.ipi_nsegs = seg_idx;
1075				pi.ipi_pidx = nic_i_start;
1076				pi.ipi_ndescs = 0;
1077				pi.ipi_flags = flags;
1078
1079				/* Prepare the NIC TX ring. */
1080				ctx->isc_txd_encap(ctx->ifc_softc, &pi);
1081				DBG_COUNTER_INC(tx_encap);
1082
1083				/* Update transmit counters */
1084				tx_bytes += pi.ipi_len;
1085				tx_pkts++;
1086
1087				/* Reinit per-packet info for the next one. */
1088				flags = seg_idx = pkt_len = 0;
1089				nic_i_start = -1;
1090			}
1091
1092			/* prefetch for next round */
1093			__builtin_prefetch(&ring->slot[nm_i + 1]);
1094			__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i + 1]);
1095			__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i + 1]);
1096
1097			NM_CHECK_ADDR_LEN_OFF(na, len, offset);
1098
1099			if (slot->flags & NS_BUF_CHANGED) {
1100				/* buffer has changed, reload map */
1101				netmap_reload_map(na, txq->ift_buf_tag,
1102				    txq->ift_sds.ifsd_map[nic_i], addr);
1103			}
1104			/* make sure changes to the buffer are synced */
1105			bus_dmamap_sync(txq->ift_buf_tag,
1106			    txq->ift_sds.ifsd_map[nic_i],
1107			    BUS_DMASYNC_PREWRITE);
1108
1109			slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED | NS_MOREFRAG);
1110			nm_i = nm_next(nm_i, lim);
1111			nic_i = nm_next(nic_i, lim);
1112		}
1113		kring->nr_hwcur = nm_i;
1114
1115		/* synchronize the NIC ring */
1116		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
1117		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1118
1119		/* (re)start the tx unit up to slot nic_i (excluded) */
1120		ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, nic_i);
1121	}
1122
1123	/*
1124	 * Second part: reclaim buffers for completed transmissions.
1125	 *
1126	 * If there are unclaimed buffers, attempt to reclaim them.
1127	 * If we don't manage to reclaim them all, and TX IRQs are not in use,
1128	 * trigger a per-tx-queue timer to try again later.
1129	 */
1130	if (kring->nr_hwtail != nm_prev(kring->nr_hwcur, lim)) {
1131		if (iflib_tx_credits_update(ctx, txq)) {
1132			/* some tx completed, increment avail */
1133			nic_i = txq->ift_cidx_processed;
1134			kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
1135		}
1136	}
1137
1138	if (!(ctx->ifc_flags & IFC_NETMAP_TX_IRQ))
1139		if (kring->nr_hwtail != nm_prev(kring->nr_hwcur, lim)) {
1140			callout_reset_sbt_on(&txq->ift_netmap_timer,
1141			    NETMAP_TX_TIMER_US * SBT_1US, SBT_1US,
1142			    iflib_netmap_timer, txq,
1143			    txq->ift_netmap_timer.c_cpu, 0);
1144		}
1145
1146	if_inc_counter(ifp, IFCOUNTER_OBYTES, tx_bytes);
1147	if_inc_counter(ifp, IFCOUNTER_OPACKETS, tx_pkts);
1148
1149	return (0);
1150}
1151
1152/*
1153 * Reconcile kernel and user view of the receive ring.
1154 * Same as for the txsync, this routine must be efficient.
1155 * The caller guarantees a single invocation, but races against
1156 * the rest of the driver should be handled here.
1157 *
1158 * On call, kring->rhead is the first packet that userspace wants
1159 * to keep, and kring->rcur is the wakeup point.
1160 * The kernel has previously reported packets up to kring->rtail.
1161 *
1162 * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective
1163 * of whether or not we received an interrupt.
1164 */
1165static int
1166iflib_netmap_rxsync(struct netmap_kring *kring, int flags)
1167{
1168	struct netmap_adapter *na = kring->na;
1169	struct netmap_ring *ring = kring->ring;
1170	if_t ifp = na->ifp;
1171	uint32_t nm_i;	/* index into the netmap ring */
1172	uint32_t nic_i;	/* index into the NIC ring */
1173	u_int n;
1174	u_int const lim = kring->nkr_num_slots - 1;
1175	int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
1176	int i = 0, rx_bytes = 0, rx_pkts = 0;
1177
1178	if_ctx_t ctx = if_getsoftc(ifp);
1179	if_shared_ctx_t sctx = ctx->ifc_sctx;
1180	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1181	iflib_rxq_t rxq = &ctx->ifc_rxqs[kring->ring_id];
1182	iflib_fl_t fl = &rxq->ifr_fl[0];
1183	struct if_rxd_info ri;
1184	qidx_t *cidxp;
1185
1186	/*
1187	 * netmap only uses free list 0, to avoid out of order consumption
1188	 * of receive buffers
1189	 */
1190
1191	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
1192	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1193
1194	/*
1195	 * First part: import newly received packets.
1196	 *
1197	 * nm_i is the index of the next free slot in the netmap ring,
1198	 * nic_i is the index of the next received packet in the NIC ring
1199	 * (or in the free list 0 if IFLIB_HAS_RXCQ is set), and they may
1200	 * differ in case if_init() has been called while
1201	 * in netmap mode. For the receive ring we have
1202	 *
1203	 *	nic_i = fl->ifl_cidx;
1204	 *	nm_i = kring->nr_hwtail (previous)
1205	 * and
1206	 *	nm_i == (nic_i + kring->nkr_hwofs) % ring_size
1207	 *
1208	 * fl->ifl_cidx is set to 0 on a ring reinit
1209	 */
1210	if (netmap_no_pendintr || force_update) {
1211		uint32_t hwtail_lim = nm_prev(kring->nr_hwcur, lim);
1212		bool have_rxcq = sctx->isc_flags & IFLIB_HAS_RXCQ;
1213		int crclen = iflib_crcstrip ? 0 : 4;
1214		int error, avail;
1215
1216		/*
1217		 * For the free list consumer index, we use the same
1218		 * logic as in iflib_rxeof().
1219		 */
1220		if (have_rxcq)
1221			cidxp = &rxq->ifr_cq_cidx;
1222		else
1223			cidxp = &fl->ifl_cidx;
1224		avail = ctx->isc_rxd_available(ctx->ifc_softc,
1225		    rxq->ifr_id, *cidxp, USHRT_MAX);
1226
1227		nic_i = fl->ifl_cidx;
1228		nm_i = netmap_idx_n2k(kring, nic_i);
1229		MPASS(nm_i == kring->nr_hwtail);
1230		for (n = 0; avail > 0 && nm_i != hwtail_lim; n++, avail--) {
1231			rxd_info_zero(&ri);
1232			ri.iri_frags = rxq->ifr_frags;
1233			ri.iri_qsidx = kring->ring_id;
1234			ri.iri_ifp = ctx->ifc_ifp;
1235			ri.iri_cidx = *cidxp;
1236
1237			error = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
1238			for (i = 0; i < ri.iri_nfrags; i++) {
1239				if (error) {
1240					ring->slot[nm_i].len = 0;
1241					ring->slot[nm_i].flags = 0;
1242				} else {
1243					ring->slot[nm_i].len = ri.iri_frags[i].irf_len;
1244					if (i == (ri.iri_nfrags - 1)) {
1245						ring->slot[nm_i].len -= crclen;
1246						ring->slot[nm_i].flags = 0;
1247
1248						/* Update receive counters */
1249						rx_bytes += ri.iri_len;
1250						rx_pkts++;
1251					} else
1252						ring->slot[nm_i].flags = NS_MOREFRAG;
1253				}
1254
1255				bus_dmamap_sync(fl->ifl_buf_tag,
1256				    fl->ifl_sds.ifsd_map[nic_i], BUS_DMASYNC_POSTREAD);
1257				nm_i = nm_next(nm_i, lim);
1258				fl->ifl_cidx = nic_i = nm_next(nic_i, lim);
1259			}
1260
1261			if (have_rxcq) {
1262				*cidxp = ri.iri_cidx;
1263				while (*cidxp >= scctx->isc_nrxd[0])
1264					*cidxp -= scctx->isc_nrxd[0];
1265			}
1266
1267		}
1268		if (n) { /* update the state variables */
1269			if (netmap_no_pendintr && !force_update) {
1270				/* diagnostics */
1271				iflib_rx_miss++;
1272				iflib_rx_miss_bufs += n;
1273			}
1274			kring->nr_hwtail = nm_i;
1275		}
1276		kring->nr_kflags &= ~NKR_PENDINTR;
1277	}
1278	/*
1279	 * Second part: skip past packets that userspace has released.
1280	 * (kring->nr_hwcur to head excluded),
1281	 * and make the buffers available for reception.
1282	 * As usual nm_i is the index in the netmap ring,
1283	 * nic_i is the index in the NIC ring, and
1284	 * nm_i == (nic_i + kring->nkr_hwofs) % ring_size
1285	 */
1286	netmap_fl_refill(rxq, kring, false);
1287
1288	if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes);
1289	if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts);
1290
1291	return (0);
1292}
1293
1294static void
1295iflib_netmap_intr(struct netmap_adapter *na, int onoff)
1296{
1297	if_ctx_t ctx = if_getsoftc(na->ifp);
1298
1299	CTX_LOCK(ctx);
1300	if (onoff) {
1301		IFDI_INTR_ENABLE(ctx);
1302	} else {
1303		IFDI_INTR_DISABLE(ctx);
1304	}
1305	CTX_UNLOCK(ctx);
1306}
1307
1308static int
1309iflib_netmap_attach(if_ctx_t ctx)
1310{
1311	struct netmap_adapter na;
1312
1313	bzero(&na, sizeof(na));
1314
1315	na.ifp = ctx->ifc_ifp;
1316	na.na_flags = NAF_BDG_MAYSLEEP | NAF_MOREFRAG | NAF_OFFSETS;
1317	MPASS(ctx->ifc_softc_ctx.isc_ntxqsets);
1318	MPASS(ctx->ifc_softc_ctx.isc_nrxqsets);
1319
1320	na.num_tx_desc = iflib_num_tx_descs(ctx);
1321	na.num_rx_desc = iflib_num_rx_descs(ctx);
1322	na.nm_txsync = iflib_netmap_txsync;
1323	na.nm_rxsync = iflib_netmap_rxsync;
1324	na.nm_register = iflib_netmap_register;
1325	na.nm_intr = iflib_netmap_intr;
1326	na.nm_config = iflib_netmap_config;
1327	na.num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets;
1328	na.num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets;
1329	return (netmap_attach(&na));
1330}
1331
1332static int
1333iflib_netmap_txq_init(if_ctx_t ctx, iflib_txq_t txq)
1334{
1335	struct netmap_adapter *na = NA(ctx->ifc_ifp);
1336	struct netmap_slot *slot;
1337
1338	slot = netmap_reset(na, NR_TX, txq->ift_id, 0);
1339	if (slot == NULL)
1340		return (0);
1341	for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxd[0]; i++) {
1342		/*
1343		 * In netmap mode, set the map for the packet buffer.
1344		 * NOTE: Some drivers (not this one) also need to set
1345		 * the physical buffer address in the NIC ring.
1346		 * netmap_idx_n2k() maps a nic index, i, into the corresponding
1347		 * netmap slot index, si
1348		 */
1349		int si = netmap_idx_n2k(na->tx_rings[txq->ift_id], i);
1350		netmap_load_map(na, txq->ift_buf_tag, txq->ift_sds.ifsd_map[i],
1351		    NMB(na, slot + si));
1352	}
1353	return (1);
1354}
1355
1356static int
1357iflib_netmap_rxq_init(if_ctx_t ctx, iflib_rxq_t rxq)
1358{
1359	struct netmap_adapter *na = NA(ctx->ifc_ifp);
1360	struct netmap_kring *kring;
1361	struct netmap_slot *slot;
1362
1363	slot = netmap_reset(na, NR_RX, rxq->ifr_id, 0);
1364	if (slot == NULL)
1365		return (0);
1366	kring = na->rx_rings[rxq->ifr_id];
1367	netmap_fl_refill(rxq, kring, true);
1368	return (1);
1369}
1370
1371static void
1372iflib_netmap_timer(void *arg)
1373{
1374	iflib_txq_t txq = arg;
1375	if_ctx_t ctx = txq->ift_ctx;
1376
1377	/*
1378	 * Wake up the netmap application, to give it a chance to
1379	 * call txsync and reclaim more completed TX buffers.
1380	 */
1381	netmap_tx_irq(ctx->ifc_ifp, txq->ift_id);
1382}
1383
1384#define iflib_netmap_detach(ifp) netmap_detach(ifp)
1385
1386#else
1387#define iflib_netmap_txq_init(ctx, txq) (0)
1388#define iflib_netmap_rxq_init(ctx, rxq) (0)
1389#define iflib_netmap_detach(ifp)
1390#define netmap_enable_all_rings(ifp)
1391#define netmap_disable_all_rings(ifp)
1392
1393#define iflib_netmap_attach(ctx) (0)
1394#define netmap_rx_irq(ifp, qid, budget) (0)
1395#endif
1396
1397#if defined(__i386__) || defined(__amd64__)
1398static __inline void
1399prefetch(void *x)
1400{
1401	__asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
1402}
1403
1404static __inline void
1405prefetch2cachelines(void *x)
1406{
1407	__asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
1408#if (CACHE_LINE_SIZE < 128)
1409	__asm volatile("prefetcht0 %0" :: "m" (*(((unsigned long *)x)+CACHE_LINE_SIZE/(sizeof(unsigned long)))));
1410#endif
1411}
1412#else
1413static __inline void
1414prefetch(void *x)
1415{
1416}
1417
1418static __inline void
1419prefetch2cachelines(void *x)
1420{
1421}
1422#endif
1423
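/*
 * Point an if_rxd_update at the refill arrays of free list 'flid' so that
 * the driver's isc_rxd_refill() callback can post the buffers described by
 * ifl_bus_addrs[] and ifl_rxd_idxs[] to the hardware.
 */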
1424static void
1425iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid)
1426{
1427	iflib_fl_t fl;
1428
1429	fl = &rxq->ifr_fl[flid];
1430	iru->iru_paddrs = fl->ifl_bus_addrs;
1431	iru->iru_idxs = fl->ifl_rxd_idxs;
1432	iru->iru_qsidx = rxq->ifr_id;
1433	iru->iru_buf_size = fl->ifl_buf_size;
1434	iru->iru_flidx = fl->ifl_id;
1435}
1436
1437static void
1438_iflib_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err)
1439{
1440	if (err)
1441		return;
1442	*(bus_addr_t *) arg = segs[0].ds_addr;
1443}
1444
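/*
 * Convert a driver-reported DMA width (in bits) into a busdma lowaddr bound:
 * e.g. a 32-bit width yields 0xffffffff, while a width of 0 (unspecified) or
 * one covering the full bus address space yields BUS_SPACE_MAXADDR.
 */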
1445#define	DMA_WIDTH_TO_BUS_LOWADDR(width)				\
1446	(((width) == 0) || (width) == flsll(BUS_SPACE_MAXADDR) ?	\
1447	    BUS_SPACE_MAXADDR : (1ULL << (width)) - 1ULL)
1448
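/*
 * Allocate a single physically contiguous, 'align'-aligned DMA-able buffer
 * of 'size' bytes and record its tag, map, kernel virtual address, and bus
 * address in 'dma'.  On failure any partially created resources are released.
 */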
1449int
1450iflib_dma_alloc_align(if_ctx_t ctx, int size, int align, iflib_dma_info_t dma, int mapflags)
1451{
1452	int err;
1453	device_t dev = ctx->ifc_dev;
1454	bus_addr_t lowaddr;
1455
1456	lowaddr = DMA_WIDTH_TO_BUS_LOWADDR(ctx->ifc_softc_ctx.isc_dma_width);
1457
1458	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
1459				align, 0,		/* alignment, bounds */
1460				lowaddr,		/* lowaddr */
1461				BUS_SPACE_MAXADDR,	/* highaddr */
1462				NULL, NULL,		/* filter, filterarg */
1463				size,			/* maxsize */
1464				1,			/* nsegments */
1465				size,			/* maxsegsize */
1466				BUS_DMA_ALLOCNOW,	/* flags */
1467				NULL,			/* lockfunc */
1468				NULL,			/* lockarg */
1469				&dma->idi_tag);
1470	if (err) {
1471		device_printf(dev,
1472		    "%s: bus_dma_tag_create failed: %d (size=%d, align=%d)\n",
1473		    __func__, err, size, align);
1474		goto fail_0;
1475	}
1476
1477	err = bus_dmamem_alloc(dma->idi_tag, (void**) &dma->idi_vaddr,
1478	    BUS_DMA_NOWAIT | BUS_DMA_COHERENT | BUS_DMA_ZERO, &dma->idi_map);
1479	if (err) {
1480		device_printf(dev,
1481		    "%s: bus_dmamem_alloc(%ju) failed: %d\n",
1482		    __func__, (uintmax_t)size, err);
1483		goto fail_1;
1484	}
1485
1486	dma->idi_paddr = IF_BAD_DMA;
1487	err = bus_dmamap_load(dma->idi_tag, dma->idi_map, dma->idi_vaddr,
1488	    size, _iflib_dmamap_cb, &dma->idi_paddr, mapflags | BUS_DMA_NOWAIT);
1489	if (err || dma->idi_paddr == IF_BAD_DMA) {
1490		device_printf(dev,
1491		    "%s: bus_dmamap_load failed: %d\n",
1492		    __func__, err);
1493		goto fail_2;
1494	}
1495
1496	dma->idi_size = size;
1497	return (0);
1498
1499fail_2:
1500	bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
1501fail_1:
1502	bus_dma_tag_destroy(dma->idi_tag);
1503fail_0:
1504	dma->idi_tag = NULL;
1505
1506	return (err);
1507}
1508
1509int
1510iflib_dma_alloc(if_ctx_t ctx, int size, iflib_dma_info_t dma, int mapflags)
1511{
1512	if_shared_ctx_t sctx = ctx->ifc_sctx;
1513
1514	KASSERT(sctx->isc_q_align != 0, ("alignment value not initialized"));
1515
1516	return (iflib_dma_alloc_align(ctx, size, sctx->isc_q_align, dma, mapflags));
1517}
1518
1519int
1520iflib_dma_alloc_multi(if_ctx_t ctx, int *sizes, iflib_dma_info_t *dmalist, int mapflags, int count)
1521{
1522	int i, err = 0;
1523	iflib_dma_info_t *dmaiter;
1524
1525	dmaiter = dmalist;
1526	for (i = 0; i < count; i++, dmaiter++) {
1527		if ((err = iflib_dma_alloc(ctx, sizes[i], *dmaiter, mapflags)) != 0)
1528			break;
1529	}
1530	if (err)
1531		iflib_dma_free_multi(dmalist, i);
1532	return (err);
1533}
1534
1535void
1536iflib_dma_free(iflib_dma_info_t dma)
1537{
1538	if (dma->idi_tag == NULL)
1539		return;
1540	if (dma->idi_paddr != IF_BAD_DMA) {
1541		bus_dmamap_sync(dma->idi_tag, dma->idi_map,
1542		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1543		bus_dmamap_unload(dma->idi_tag, dma->idi_map);
1544		dma->idi_paddr = IF_BAD_DMA;
1545	}
1546	if (dma->idi_vaddr != NULL) {
1547		bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
1548		dma->idi_vaddr = NULL;
1549	}
1550	bus_dma_tag_destroy(dma->idi_tag);
1551	dma->idi_tag = NULL;
1552}
1553
1554void
1555iflib_dma_free_multi(iflib_dma_info_t *dmalist, int count)
1556{
1557	int i;
1558	iflib_dma_info_t *dmaiter = dmalist;
1559
1560	for (i = 0; i < count; i++, dmaiter++)
1561		iflib_dma_free(*dmaiter);
1562}
1563
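/*
 * Interrupt filters.  These run in interrupt (filter) context: they first
 * give the driver's own filter a chance to claim the interrupt and then hand
 * the real work off to a grouptask so that it runs in taskqueue context.
 */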
1564static int
1565iflib_fast_intr(void *arg)
1566{
1567	iflib_filter_info_t info = arg;
1568	struct grouptask *gtask = info->ifi_task;
1569	int result;
1570
1571	DBG_COUNTER_INC(fast_intrs);
1572	if (info->ifi_filter != NULL) {
1573		result = info->ifi_filter(info->ifi_filter_arg);
1574		if ((result & FILTER_SCHEDULE_THREAD) == 0)
1575			return (result);
1576	}
1577
1578	GROUPTASK_ENQUEUE(gtask);
1579	return (FILTER_HANDLED);
1580}
1581
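/*
 * Filter for an RX interrupt shared with one or more TX queues: update TX
 * credits and schedule the TX tasks that have work, then schedule the RX
 * task if descriptors are available, re-enabling the queue (or legacy)
 * interrupt when there is nothing to do.
 */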
1582static int
1583iflib_fast_intr_rxtx(void *arg)
1584{
1585	iflib_filter_info_t info = arg;
1586	struct grouptask *gtask = info->ifi_task;
1587	if_ctx_t ctx;
1588	iflib_rxq_t rxq = (iflib_rxq_t)info->ifi_ctx;
1589	iflib_txq_t txq;
1590	void *sc;
1591	int i, cidx, result;
1592	qidx_t txqid;
1593	bool intr_enable, intr_legacy;
1594
1595	DBG_COUNTER_INC(fast_intrs);
1596	if (info->ifi_filter != NULL) {
1597		result = info->ifi_filter(info->ifi_filter_arg);
1598		if ((result & FILTER_SCHEDULE_THREAD) == 0)
1599			return (result);
1600	}
1601
1602	ctx = rxq->ifr_ctx;
1603	sc = ctx->ifc_softc;
1604	intr_enable = false;
1605	intr_legacy = !!(ctx->ifc_flags & IFC_LEGACY);
1606	MPASS(rxq->ifr_ntxqirq);
1607	for (i = 0; i < rxq->ifr_ntxqirq; i++) {
1608		txqid = rxq->ifr_txqid[i];
1609		txq = &ctx->ifc_txqs[txqid];
1610		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
1611		    BUS_DMASYNC_POSTREAD);
1612		if (!ctx->isc_txd_credits_update(sc, txqid, false)) {
1613			if (intr_legacy)
1614				intr_enable = true;
1615			else
1616				IFDI_TX_QUEUE_INTR_ENABLE(ctx, txqid);
1617			continue;
1618		}
1619		GROUPTASK_ENQUEUE(&txq->ift_task);
1620	}
1621	if (ctx->ifc_sctx->isc_flags & IFLIB_HAS_RXCQ)
1622		cidx = rxq->ifr_cq_cidx;
1623	else
1624		cidx = rxq->ifr_fl[0].ifl_cidx;
1625	if (iflib_rxd_avail(ctx, rxq, cidx, 1))
1626		GROUPTASK_ENQUEUE(gtask);
1627	else {
1628		if (intr_legacy)
1629			intr_enable = true;
1630		else
1631			IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
1632		DBG_COUNTER_INC(rx_intr_enables);
1633	}
1634	if (intr_enable)
1635		IFDI_INTR_ENABLE(ctx);
1636	return (FILTER_HANDLED);
1637}
1638
1639static int
1640iflib_fast_intr_ctx(void *arg)
1641{
1642	iflib_filter_info_t info = arg;
1643	struct grouptask *gtask = info->ifi_task;
1644	int result;
1645
1646	DBG_COUNTER_INC(fast_intrs);
1647	if (info->ifi_filter != NULL) {
1648		result = info->ifi_filter(info->ifi_filter_arg);
1649		if ((result & FILTER_SCHEDULE_THREAD) == 0)
1650			return (result);
1651	}
1652
1653	if (gtask->gt_taskqueue != NULL)
1654		GROUPTASK_ENQUEUE(gtask);
1655	return (FILTER_HANDLED);
1656}
1657
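/*
 * Allocate and set up a bus interrupt for the given rid.  At most one of
 * 'filter' and 'handler' may be non-NULL; legacy (INTx) interrupts are
 * allocated shareable.
 */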
1658static int
1659_iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
1660		 driver_filter_t filter, driver_intr_t handler, void *arg,
1661		 const char *name)
1662{
1663	struct resource *res;
1664	void *tag = NULL;
1665	device_t dev = ctx->ifc_dev;
1666	int flags, i, rc;
1667
1668	flags = RF_ACTIVE;
1669	if (ctx->ifc_flags & IFC_LEGACY)
1670		flags |= RF_SHAREABLE;
1671	MPASS(rid < 512);
1672	i = rid;
1673	res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &i, flags);
1674	if (res == NULL) {
1675		device_printf(dev,
1676		    "failed to allocate IRQ for rid %d, name %s.\n", rid, name);
1677		return (ENOMEM);
1678	}
1679	irq->ii_res = res;
1680	KASSERT(filter == NULL || handler == NULL, ("filter and handler can't both be non-NULL"));
1681	rc = bus_setup_intr(dev, res, INTR_MPSAFE | INTR_TYPE_NET,
1682						filter, handler, arg, &tag);
1683	if (rc != 0) {
1684		device_printf(dev,
1685		    "failed to setup interrupt for rid %d, name %s: %d\n",
1686					  rid, name ? name : "unknown", rc);
1687		return (rc);
1688	} else if (name)
1689		bus_describe_intr(dev, res, tag, "%s", name);
1690
1691	irq->ii_tag = tag;
1692	return (0);
1693}
1694
1695/*********************************************************************
1696 *
1697 *  Allocate DMA resources for TX buffers as well as memory for the TX
1698 *  mbuf map.  TX DMA maps (non-TSO/TSO) and the TX mbuf map are kept in an
1699 *  iflib_sw_tx_desc_array structure, storing all the information that
1700 *  is needed to transmit a packet on the wire.  This is called only
1701 *  once at attach; setup is done on every reset.
1702 *
1703 **********************************************************************/
1704static int
1705iflib_txsd_alloc(iflib_txq_t txq)
1706{
1707	if_ctx_t ctx = txq->ift_ctx;
1708	if_shared_ctx_t sctx = ctx->ifc_sctx;
1709	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1710	device_t dev = ctx->ifc_dev;
1711	bus_size_t tsomaxsize;
1712	bus_addr_t lowaddr;
1713	int err, nsegments, ntsosegments;
1714	bool tso;
1715
1716	nsegments = scctx->isc_tx_nsegments;
1717	ntsosegments = scctx->isc_tx_tso_segments_max;
1718	tsomaxsize = scctx->isc_tx_tso_size_max;
1719	if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_VLAN_MTU)
1720		tsomaxsize += sizeof(struct ether_vlan_header);
1721	MPASS(scctx->isc_ntxd[0] > 0);
1722	MPASS(scctx->isc_ntxd[txq->ift_br_offset] > 0);
1723	MPASS(nsegments > 0);
1724	if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_TSO) {
1725		MPASS(ntsosegments > 0);
1726		MPASS(sctx->isc_tso_maxsize >= tsomaxsize);
1727	}
1728
1729	lowaddr = DMA_WIDTH_TO_BUS_LOWADDR(scctx->isc_dma_width);
1730
1731	/*
1732	 * Set up DMA tags for TX buffers.
1733	 */
1734	if ((err = bus_dma_tag_create(bus_get_dma_tag(dev),
1735			       1, 0,			/* alignment, bounds */
1736			       lowaddr,			/* lowaddr */
1737			       BUS_SPACE_MAXADDR,	/* highaddr */
1738			       NULL, NULL,		/* filter, filterarg */
1739			       sctx->isc_tx_maxsize,		/* maxsize */
1740			       nsegments,	/* nsegments */
1741			       sctx->isc_tx_maxsegsize,	/* maxsegsize */
1742			       0,			/* flags */
1743			       NULL,			/* lockfunc */
1744			       NULL,			/* lockfuncarg */
1745			       &txq->ift_buf_tag))) {
1746		device_printf(dev, "Unable to allocate TX DMA tag: %d\n", err);
1747		device_printf(dev, "maxsize: %ju nsegments: %d maxsegsize: %ju\n",
1748		    (uintmax_t)sctx->isc_tx_maxsize, nsegments, (uintmax_t)sctx->isc_tx_maxsegsize);
1749		goto fail;
1750	}
1751	tso = (if_getcapabilities(ctx->ifc_ifp) & IFCAP_TSO) != 0;
1752	if (tso && (err = bus_dma_tag_create(bus_get_dma_tag(dev),
1753			       1, 0,			/* alignment, bounds */
1754			       lowaddr,			/* lowaddr */
1755			       BUS_SPACE_MAXADDR,	/* highaddr */
1756			       NULL, NULL,		/* filter, filterarg */
1757			       tsomaxsize,		/* maxsize */
1758			       ntsosegments,	/* nsegments */
1759			       sctx->isc_tso_maxsegsize,/* maxsegsize */
1760			       0,			/* flags */
1761			       NULL,			/* lockfunc */
1762			       NULL,			/* lockfuncarg */
1763			       &txq->ift_tso_buf_tag))) {
1764		device_printf(dev, "Unable to allocate TSO TX DMA tag: %d\n",
1765		    err);
1766		goto fail;
1767	}
1768
1769	/* Allocate memory for the TX mbuf map. */
1770	if (!(txq->ift_sds.ifsd_m =
1771	    (struct mbuf **) malloc(sizeof(struct mbuf *) *
1772	    scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
1773		device_printf(dev, "Unable to allocate TX mbuf map memory\n");
1774		err = ENOMEM;
1775		goto fail;
1776	}
1777
1778	/*
1779	 * Create the DMA maps for TX buffers.
1780	 */
1781	if ((txq->ift_sds.ifsd_map = (bus_dmamap_t *)malloc(
1782	    sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset],
1783	    M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
1784		device_printf(dev,
1785		    "Unable to allocate TX buffer DMA map memory\n");
1786		err = ENOMEM;
1787		goto fail;
1788	}
1789	if (tso && (txq->ift_sds.ifsd_tso_map = (bus_dmamap_t *)malloc(
1790	    sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset],
1791	    M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
1792		device_printf(dev,
1793		    "Unable to allocate TSO TX buffer map memory\n");
1794		err = ENOMEM;
1795		goto fail;
1796	}
1797	for (int i = 0; i < scctx->isc_ntxd[txq->ift_br_offset]; i++) {
1798		err = bus_dmamap_create(txq->ift_buf_tag, 0,
1799		    &txq->ift_sds.ifsd_map[i]);
1800		if (err != 0) {
1801			device_printf(dev, "Unable to create TX DMA map\n");
1802			goto fail;
1803		}
1804		if (!tso)
1805			continue;
1806		err = bus_dmamap_create(txq->ift_tso_buf_tag, 0,
1807		    &txq->ift_sds.ifsd_tso_map[i]);
1808		if (err != 0) {
1809			device_printf(dev, "Unable to create TSO TX DMA map\n");
1810			goto fail;
1811		}
1812	}
1813	return (0);
1814fail:
	/* Free everything; this also handles the case where we failed partway through. */
1816	iflib_tx_structures_free(ctx);
1817	return (err);
1818}
1819
1820static void
1821iflib_txsd_destroy(if_ctx_t ctx, iflib_txq_t txq, int i)
1822{
1823	bus_dmamap_t map;
1824
1825	if (txq->ift_sds.ifsd_map != NULL) {
1826		map = txq->ift_sds.ifsd_map[i];
1827		bus_dmamap_sync(txq->ift_buf_tag, map, BUS_DMASYNC_POSTWRITE);
1828		bus_dmamap_unload(txq->ift_buf_tag, map);
1829		bus_dmamap_destroy(txq->ift_buf_tag, map);
1830		txq->ift_sds.ifsd_map[i] = NULL;
1831	}
1832
1833	if (txq->ift_sds.ifsd_tso_map != NULL) {
1834		map = txq->ift_sds.ifsd_tso_map[i];
1835		bus_dmamap_sync(txq->ift_tso_buf_tag, map,
1836		    BUS_DMASYNC_POSTWRITE);
1837		bus_dmamap_unload(txq->ift_tso_buf_tag, map);
1838		bus_dmamap_destroy(txq->ift_tso_buf_tag, map);
1839		txq->ift_sds.ifsd_tso_map[i] = NULL;
1840	}
1841}
1842
1843static void
1844iflib_txq_destroy(iflib_txq_t txq)
1845{
1846	if_ctx_t ctx = txq->ift_ctx;
1847
1848	for (int i = 0; i < txq->ift_size; i++)
1849		iflib_txsd_destroy(ctx, txq, i);
1850
1851	if (txq->ift_br != NULL) {
1852		ifmp_ring_free(txq->ift_br);
1853		txq->ift_br = NULL;
1854	}
1855
1856	mtx_destroy(&txq->ift_mtx);
1857
1858	if (txq->ift_sds.ifsd_map != NULL) {
1859		free(txq->ift_sds.ifsd_map, M_IFLIB);
1860		txq->ift_sds.ifsd_map = NULL;
1861	}
1862	if (txq->ift_sds.ifsd_tso_map != NULL) {
1863		free(txq->ift_sds.ifsd_tso_map, M_IFLIB);
1864		txq->ift_sds.ifsd_tso_map = NULL;
1865	}
1866	if (txq->ift_sds.ifsd_m != NULL) {
1867		free(txq->ift_sds.ifsd_m, M_IFLIB);
1868		txq->ift_sds.ifsd_m = NULL;
1869	}
1870	if (txq->ift_buf_tag != NULL) {
1871		bus_dma_tag_destroy(txq->ift_buf_tag);
1872		txq->ift_buf_tag = NULL;
1873	}
1874	if (txq->ift_tso_buf_tag != NULL) {
1875		bus_dma_tag_destroy(txq->ift_tso_buf_tag);
1876		txq->ift_tso_buf_tag = NULL;
1877	}
1878	if (txq->ift_ifdi != NULL) {
1879		free(txq->ift_ifdi, M_IFLIB);
1880	}
1881}
1882
1883static void
1884iflib_txsd_free(if_ctx_t ctx, iflib_txq_t txq, int i)
1885{
1886	struct mbuf **mp;
1887
1888	mp = &txq->ift_sds.ifsd_m[i];
1889	if (*mp == NULL)
1890		return;
1891
1892	if (txq->ift_sds.ifsd_map != NULL) {
1893		bus_dmamap_sync(txq->ift_buf_tag,
1894		    txq->ift_sds.ifsd_map[i], BUS_DMASYNC_POSTWRITE);
1895		bus_dmamap_unload(txq->ift_buf_tag, txq->ift_sds.ifsd_map[i]);
1896	}
1897	if (txq->ift_sds.ifsd_tso_map != NULL) {
1898		bus_dmamap_sync(txq->ift_tso_buf_tag,
1899		    txq->ift_sds.ifsd_tso_map[i], BUS_DMASYNC_POSTWRITE);
1900		bus_dmamap_unload(txq->ift_tso_buf_tag,
1901		    txq->ift_sds.ifsd_tso_map[i]);
1902	}
1903	m_freem(*mp);
1904	DBG_COUNTER_INC(tx_frees);
1905	*mp = NULL;
1906}
1907
1908static int
1909iflib_txq_setup(iflib_txq_t txq)
1910{
1911	if_ctx_t ctx = txq->ift_ctx;
1912	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1913	if_shared_ctx_t sctx = ctx->ifc_sctx;
1914	iflib_dma_info_t di;
1915	int i;
1916
1917	/* Set number of descriptors available */
1918	txq->ift_qstatus = IFLIB_QUEUE_IDLE;
1919	/* XXX make configurable */
1920	txq->ift_update_freq = IFLIB_DEFAULT_TX_UPDATE_FREQ;
1921
1922	/* Reset indices */
1923	txq->ift_cidx_processed = 0;
1924	txq->ift_pidx = txq->ift_cidx = txq->ift_npending = 0;
1925	txq->ift_size = scctx->isc_ntxd[txq->ift_br_offset];
1926
1927	for (i = 0, di = txq->ift_ifdi; i < sctx->isc_ntxqs; i++, di++)
1928		bzero((void *)di->idi_vaddr, di->idi_size);
1929
1930	IFDI_TXQ_SETUP(ctx, txq->ift_id);
1931	for (i = 0, di = txq->ift_ifdi; i < sctx->isc_ntxqs; i++, di++)
1932		bus_dmamap_sync(di->idi_tag, di->idi_map,
1933		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1934	return (0);
1935}
1936
1937/*********************************************************************
1938 *
1939 *  Allocate DMA resources for RX buffers as well as memory for the RX
1940 *  mbuf map, direct RX cluster pointer map and RX cluster bus address
1941 *  map.  RX DMA map, RX mbuf map, direct RX cluster pointer map and
 *  RX cluster map are kept in an iflib_sw_rx_desc_array structure.
 *  Since we use one entry in iflib_sw_rx_desc_array per received
1944 *  packet, the maximum number of entries we'll need is equal to the
1945 *  number of hardware receive descriptors that we've allocated.
1946 *
1947 **********************************************************************/
1948static int
1949iflib_rxsd_alloc(iflib_rxq_t rxq)
1950{
1951	if_ctx_t ctx = rxq->ifr_ctx;
1952	if_shared_ctx_t sctx = ctx->ifc_sctx;
1953	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1954	device_t dev = ctx->ifc_dev;
1955	iflib_fl_t fl;
1956	bus_addr_t lowaddr;
1957	int			err;
1958
1959	MPASS(scctx->isc_nrxd[0] > 0);
1960	MPASS(scctx->isc_nrxd[rxq->ifr_fl_offset] > 0);
1961
1962	lowaddr = DMA_WIDTH_TO_BUS_LOWADDR(scctx->isc_dma_width);
1963
1964	fl = rxq->ifr_fl;
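	/* Allocate per-descriptor software state for each free list in this rxq. */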
1965	for (int i = 0; i < rxq->ifr_nfl; i++, fl++) {
1966		fl->ifl_size = scctx->isc_nrxd[rxq->ifr_fl_offset]; /* this isn't necessarily the same */
1967		/* Set up DMA tag for RX buffers. */
1968		err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */
1969					 1, 0,			/* alignment, bounds */
1970					 lowaddr,		/* lowaddr */
1971					 BUS_SPACE_MAXADDR,	/* highaddr */
1972					 NULL, NULL,		/* filter, filterarg */
1973					 sctx->isc_rx_maxsize,	/* maxsize */
1974					 sctx->isc_rx_nsegments,	/* nsegments */
1975					 sctx->isc_rx_maxsegsize,	/* maxsegsize */
1976					 0,			/* flags */
1977					 NULL,			/* lockfunc */
1978					 NULL,			/* lockarg */
1979					 &fl->ifl_buf_tag);
1980		if (err) {
1981			device_printf(dev,
1982			    "Unable to allocate RX DMA tag: %d\n", err);
1983			goto fail;
1984		}
1985
1986		/* Allocate memory for the RX mbuf map. */
1987		if (!(fl->ifl_sds.ifsd_m =
1988		      (struct mbuf **) malloc(sizeof(struct mbuf *) *
1989					      scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
1990			device_printf(dev,
1991			    "Unable to allocate RX mbuf map memory\n");
1992			err = ENOMEM;
1993			goto fail;
1994		}
1995
1996		/* Allocate memory for the direct RX cluster pointer map. */
1997		if (!(fl->ifl_sds.ifsd_cl =
1998		      (caddr_t *) malloc(sizeof(caddr_t) *
1999					      scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
2000			device_printf(dev,
2001			    "Unable to allocate RX cluster map memory\n");
2002			err = ENOMEM;
2003			goto fail;
2004		}
2005
2006		/* Allocate memory for the RX cluster bus address map. */
2007		if (!(fl->ifl_sds.ifsd_ba =
2008		      (bus_addr_t *) malloc(sizeof(bus_addr_t) *
2009					      scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
2010			device_printf(dev,
2011			    "Unable to allocate RX bus address map memory\n");
2012			err = ENOMEM;
2013			goto fail;
2014		}
2015
2016		/*
2017		 * Create the DMA maps for RX buffers.
2018		 */
2019		if (!(fl->ifl_sds.ifsd_map =
2020		      (bus_dmamap_t *) malloc(sizeof(bus_dmamap_t) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
2021			device_printf(dev,
2022			    "Unable to allocate RX buffer DMA map memory\n");
2023			err = ENOMEM;
2024			goto fail;
2025		}
2026		for (int i = 0; i < scctx->isc_nrxd[rxq->ifr_fl_offset]; i++) {
2027			err = bus_dmamap_create(fl->ifl_buf_tag, 0,
2028			    &fl->ifl_sds.ifsd_map[i]);
2029			if (err != 0) {
2030				device_printf(dev, "Unable to create RX buffer DMA map\n");
2031				goto fail;
2032			}
2033		}
2034	}
2035	return (0);
2036
2037fail:
2038	iflib_rx_structures_free(ctx);
2039	return (err);
2040}
2041
2042/*
2043 * Internal service routines
2044 */
2045
2046struct rxq_refill_cb_arg {
2047	int               error;
2048	bus_dma_segment_t seg;
2049	int               nseg;
2050};
2051
2052static void
2053_rxq_refill_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
2054{
2055	struct rxq_refill_cb_arg *cb_arg = arg;
2056
2057	cb_arg->error = error;
2058	cb_arg->seg = segs[0];
2059	cb_arg->nseg = nseg;
2060}
2061
2062/**
2063 * iflib_fl_refill - refill an rxq free-buffer list
2064 * @ctx: the iflib context
2065 * @fl: the free list to refill
2066 * @count: the number of new buffers to allocate
2067 *
2068 * (Re)populate an rxq free-buffer list with up to @count new packet buffers.
 * The caller must ensure that @count does not exceed the queue's capacity
2070 * minus one (since we always leave a descriptor unavailable).
2071 */
2072static uint8_t
2073iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int count)
2074{
2075	struct if_rxd_update iru;
2076	struct rxq_refill_cb_arg cb_arg;
2077	struct mbuf *m;
2078	caddr_t cl, *sd_cl;
2079	struct mbuf **sd_m;
2080	bus_dmamap_t *sd_map;
2081	bus_addr_t bus_addr, *sd_ba;
2082	int err, frag_idx, i, idx, n, pidx;
2083	qidx_t credits;
2084
2085	MPASS(count <= fl->ifl_size - fl->ifl_credits - 1);
2086
2087	sd_m = fl->ifl_sds.ifsd_m;
2088	sd_map = fl->ifl_sds.ifsd_map;
2089	sd_cl = fl->ifl_sds.ifsd_cl;
2090	sd_ba = fl->ifl_sds.ifsd_ba;
2091	pidx = fl->ifl_pidx;
2092	idx = pidx;
2093	frag_idx = fl->ifl_fragidx;
2094	credits = fl->ifl_credits;
2095
2096	i = 0;
2097	n = count;
2098	MPASS(n > 0);
2099	MPASS(credits + n <= fl->ifl_size);
2100
2101	if (pidx < fl->ifl_cidx)
2102		MPASS(pidx + n <= fl->ifl_cidx);
2103	if (pidx == fl->ifl_cidx && (credits < fl->ifl_size))
2104		MPASS(fl->ifl_gen == 0);
2105	if (pidx > fl->ifl_cidx)
2106		MPASS(n <= fl->ifl_size - pidx + fl->ifl_cidx);
2107
2108	DBG_COUNTER_INC(fl_refills);
2109	if (n > 8)
2110		DBG_COUNTER_INC(fl_refills_large);
2111	iru_init(&iru, fl->ifl_rxq, fl->ifl_id);
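	/*
	 * Arm up to 'count' descriptors.  Buffers are handed to the driver
	 * in batches of at most IFLIB_MAX_RX_REFRESH via the isc_rxd_refill
	 * callback; the flush at the end publishes the final producer index
	 * to the hardware.
	 */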
2112	while (n-- > 0) {
2113		/*
		 * We allocate an uninitialized mbuf + cluster; the mbuf is
		 * initialized after rx.
		 *
		 * If the cluster is still set, then we know a minimum-sized
		 * packet was received.
2119		 */
2120		bit_ffc_at(fl->ifl_rx_bitmap, frag_idx, fl->ifl_size,
2121		    &frag_idx);
2122		if (frag_idx < 0)
2123			bit_ffc(fl->ifl_rx_bitmap, fl->ifl_size, &frag_idx);
2124		MPASS(frag_idx >= 0);
2125		if ((cl = sd_cl[frag_idx]) == NULL) {
2126			cl = uma_zalloc(fl->ifl_zone, M_NOWAIT);
2127			if (__predict_false(cl == NULL))
2128				break;
2129
2130			cb_arg.error = 0;
2131			MPASS(sd_map != NULL);
2132			err = bus_dmamap_load(fl->ifl_buf_tag, sd_map[frag_idx],
2133			    cl, fl->ifl_buf_size, _rxq_refill_cb, &cb_arg,
2134			    BUS_DMA_NOWAIT);
2135			if (__predict_false(err != 0 || cb_arg.error)) {
2136				uma_zfree(fl->ifl_zone, cl);
2137				break;
2138			}
2139
2140			sd_ba[frag_idx] = bus_addr = cb_arg.seg.ds_addr;
2141			sd_cl[frag_idx] = cl;
2142#if MEMORY_LOGGING
2143			fl->ifl_cl_enqueued++;
2144#endif
2145		} else {
2146			bus_addr = sd_ba[frag_idx];
2147		}
2148		bus_dmamap_sync(fl->ifl_buf_tag, sd_map[frag_idx],
2149		    BUS_DMASYNC_PREREAD);
2150
2151		if (sd_m[frag_idx] == NULL) {
2152			m = m_gethdr_raw(M_NOWAIT, 0);
2153			if (__predict_false(m == NULL))
2154				break;
2155			sd_m[frag_idx] = m;
2156		}
2157		bit_set(fl->ifl_rx_bitmap, frag_idx);
2158#if MEMORY_LOGGING
2159		fl->ifl_m_enqueued++;
2160#endif
2161
2162		DBG_COUNTER_INC(rx_allocs);
2163		fl->ifl_rxd_idxs[i] = frag_idx;
2164		fl->ifl_bus_addrs[i] = bus_addr;
2165		credits++;
2166		i++;
2167		MPASS(credits <= fl->ifl_size);
2168		if (++idx == fl->ifl_size) {
2169#ifdef INVARIANTS
2170			fl->ifl_gen = 1;
2171#endif
2172			idx = 0;
2173		}
2174		if (n == 0 || i == IFLIB_MAX_RX_REFRESH) {
2175			iru.iru_pidx = pidx;
2176			iru.iru_count = i;
2177			ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
2178			fl->ifl_pidx = idx;
2179			fl->ifl_credits = credits;
2180			pidx = idx;
2181			i = 0;
2182		}
2183	}
2184
2185	if (n < count - 1) {
2186		if (i != 0) {
2187			iru.iru_pidx = pidx;
2188			iru.iru_count = i;
2189			ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
2190			fl->ifl_pidx = idx;
2191			fl->ifl_credits = credits;
2192		}
2193		DBG_COUNTER_INC(rxd_flush);
2194		bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
2195		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
2196		ctx->isc_rxd_flush(ctx->ifc_softc, fl->ifl_rxq->ifr_id,
2197		    fl->ifl_id, fl->ifl_pidx);
2198		if (__predict_true(bit_test(fl->ifl_rx_bitmap, frag_idx))) {
2199			fl->ifl_fragidx = frag_idx + 1;
2200			if (fl->ifl_fragidx == fl->ifl_size)
2201				fl->ifl_fragidx = 0;
2202		} else {
2203			fl->ifl_fragidx = frag_idx;
2204		}
2205	}
2206
2207	return (n == -1 ? 0 : IFLIB_RXEOF_EMPTY);
2208}
2209
2210static inline uint8_t
2211iflib_fl_refill_all(if_ctx_t ctx, iflib_fl_t fl)
2212{
2213	/*
	 * We leave an unused descriptor to prevent pidx from catching up with
	 * cidx; otherwise a full ring would look the same as an empty one,
	 * which confuses most NICs.  For instance,
2216	 * Intel NICs have (per receive ring) RDH and RDT registers, where
2217	 * RDH points to the next receive descriptor to be used by the NIC,
2218	 * and RDT for the next receive descriptor to be published by the
2219	 * driver to the NIC (RDT - 1 is thus the last valid one).
2220	 * The condition RDH == RDT means no descriptors are available to
2221	 * the NIC, and thus it would be ambiguous if it also meant that
2222	 * all the descriptors are available to the NIC.
2223	 */
2224	int32_t reclaimable = fl->ifl_size - fl->ifl_credits - 1;
2225#ifdef INVARIANTS
2226	int32_t delta = fl->ifl_size - get_inuse(fl->ifl_size, fl->ifl_cidx, fl->ifl_pidx, fl->ifl_gen) - 1;
2227#endif
2228
2229	MPASS(fl->ifl_credits <= fl->ifl_size);
2230	MPASS(reclaimable == delta);
2231
2232	if (reclaimable > 0)
2233		return (iflib_fl_refill(ctx, fl, reclaimable));
2234	return (0);
2235}
2236
2237uint8_t
2238iflib_in_detach(if_ctx_t ctx)
2239{
2240	bool in_detach;
2241
2242	STATE_LOCK(ctx);
2243	in_detach = !!(ctx->ifc_flags & IFC_IN_DETACH);
2244	STATE_UNLOCK(ctx);
2245	return (in_detach);
2246}
2247
2248static void
2249iflib_fl_bufs_free(iflib_fl_t fl)
2250{
2251	iflib_dma_info_t idi = fl->ifl_ifdi;
2252	bus_dmamap_t sd_map;
2253	uint32_t i;
2254
2255	for (i = 0; i < fl->ifl_size; i++) {
2256		struct mbuf **sd_m = &fl->ifl_sds.ifsd_m[i];
2257		caddr_t *sd_cl = &fl->ifl_sds.ifsd_cl[i];
2258
2259		if (*sd_cl != NULL) {
2260			sd_map = fl->ifl_sds.ifsd_map[i];
2261			bus_dmamap_sync(fl->ifl_buf_tag, sd_map,
2262			    BUS_DMASYNC_POSTREAD);
2263			bus_dmamap_unload(fl->ifl_buf_tag, sd_map);
2264			uma_zfree(fl->ifl_zone, *sd_cl);
2265			*sd_cl = NULL;
2266			if (*sd_m != NULL) {
2267				m_init(*sd_m, M_NOWAIT, MT_DATA, 0);
2268				m_free_raw(*sd_m);
2269				*sd_m = NULL;
2270			}
2271		} else {
2272			MPASS(*sd_m == NULL);
2273		}
2274#if MEMORY_LOGGING
2275		fl->ifl_m_dequeued++;
2276		fl->ifl_cl_dequeued++;
2277#endif
2278	}
2279#ifdef INVARIANTS
2280	for (i = 0; i < fl->ifl_size; i++) {
2281		MPASS(fl->ifl_sds.ifsd_cl[i] == NULL);
2282		MPASS(fl->ifl_sds.ifsd_m[i] == NULL);
2283	}
2284#endif
2285	/*
2286	 * Reset free list values
2287	 */
2288	fl->ifl_credits = fl->ifl_cidx = fl->ifl_pidx = fl->ifl_gen = fl->ifl_fragidx = 0;
2289	bzero(idi->idi_vaddr, idi->idi_size);
2290}
2291
2292/*********************************************************************
2293 *
2294 *  Initialize a free list and its buffers.
2295 *
2296 **********************************************************************/
2297static int
2298iflib_fl_setup(iflib_fl_t fl)
2299{
2300	iflib_rxq_t rxq = fl->ifl_rxq;
2301	if_ctx_t ctx = rxq->ifr_ctx;
2302	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
2303	int qidx;
2304
2305	bit_nclear(fl->ifl_rx_bitmap, 0, fl->ifl_size - 1);
2306	/*
2307	** Free current RX buffer structs and their mbufs
2308	*/
2309	iflib_fl_bufs_free(fl);
2310	/* Now replenish the mbufs */
2311	MPASS(fl->ifl_credits == 0);
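	/* Use the driver-requested buffer size for this free list, if one was set. */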
2312	qidx = rxq->ifr_fl_offset + fl->ifl_id;
2313	if (scctx->isc_rxd_buf_size[qidx] != 0)
2314		fl->ifl_buf_size = scctx->isc_rxd_buf_size[qidx];
2315	else
2316		fl->ifl_buf_size = ctx->ifc_rx_mbuf_sz;
2317	/*
2318	 * ifl_buf_size may be a driver-supplied value, so pull it up
2319	 * to the selected mbuf size.
2320	 */
2321	fl->ifl_buf_size = iflib_get_mbuf_size_for(fl->ifl_buf_size);
2322	if (fl->ifl_buf_size > ctx->ifc_max_fl_buf_size)
2323		ctx->ifc_max_fl_buf_size = fl->ifl_buf_size;
2324	fl->ifl_cltype = m_gettype(fl->ifl_buf_size);
2325	fl->ifl_zone = m_getzone(fl->ifl_buf_size);
2326
2327	/*
	 * Avoid pre-allocating zillions of clusters to an idle card,
	 * which potentially speeds up attach.  In any case, make sure
	 * to leave a descriptor unavailable.  See the comment in
2331	 * iflib_fl_refill_all().
2332	 */
2333	MPASS(fl->ifl_size > 0);
2334	(void)iflib_fl_refill(ctx, fl, min(128, fl->ifl_size - 1));
	/* Bail if the refill gave us fewer buffers than requested. */
	if (min(128, fl->ifl_size - 1) != fl->ifl_credits)
		return (ENOBUFS);
2340	MPASS(rxq != NULL);
2341	MPASS(fl->ifl_ifdi != NULL);
2342	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
2343	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
2344	return (0);
2345}
2346
2347/*********************************************************************
2348 *
2349 *  Free receive ring data structures
2350 *
2351 **********************************************************************/
2352static void
2353iflib_rx_sds_free(iflib_rxq_t rxq)
2354{
2355	iflib_fl_t fl;
2356	int i, j;
2357
2358	if (rxq->ifr_fl != NULL) {
2359		for (i = 0; i < rxq->ifr_nfl; i++) {
2360			fl = &rxq->ifr_fl[i];
2361			if (fl->ifl_buf_tag != NULL) {
2362				if (fl->ifl_sds.ifsd_map != NULL) {
2363					for (j = 0; j < fl->ifl_size; j++) {
2364						bus_dmamap_sync(
2365						    fl->ifl_buf_tag,
2366						    fl->ifl_sds.ifsd_map[j],
2367						    BUS_DMASYNC_POSTREAD);
2368						bus_dmamap_unload(
2369						    fl->ifl_buf_tag,
2370						    fl->ifl_sds.ifsd_map[j]);
2371						bus_dmamap_destroy(
2372						    fl->ifl_buf_tag,
2373						    fl->ifl_sds.ifsd_map[j]);
2374					}
2375				}
2376				bus_dma_tag_destroy(fl->ifl_buf_tag);
2377				fl->ifl_buf_tag = NULL;
2378			}
2379			free(fl->ifl_sds.ifsd_m, M_IFLIB);
2380			free(fl->ifl_sds.ifsd_cl, M_IFLIB);
2381			free(fl->ifl_sds.ifsd_ba, M_IFLIB);
2382			free(fl->ifl_sds.ifsd_map, M_IFLIB);
2383			free(fl->ifl_rx_bitmap, M_IFLIB);
2384			fl->ifl_sds.ifsd_m = NULL;
2385			fl->ifl_sds.ifsd_cl = NULL;
2386			fl->ifl_sds.ifsd_ba = NULL;
2387			fl->ifl_sds.ifsd_map = NULL;
2388			fl->ifl_rx_bitmap = NULL;
2389		}
2390		free(rxq->ifr_fl, M_IFLIB);
2391		rxq->ifr_fl = NULL;
2392		free(rxq->ifr_ifdi, M_IFLIB);
2393		rxq->ifr_ifdi = NULL;
2394		rxq->ifr_cq_cidx = 0;
2395	}
2396}
2397
2398/*
2399 * Timer routine
2400 */
2401static void
2402iflib_timer(void *arg)
2403{
2404	iflib_txq_t txq = arg;
2405	if_ctx_t ctx = txq->ift_ctx;
2406	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
2407	uint64_t this_tick = ticks;
2408
2409	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
2410		return;
2411
2412	/*
	** Check on the state of the TX queue(s); this
	** can be done without the lock because it's RO
2415	** and the HUNG state will be static if set.
2416	*/
2417	if (this_tick - txq->ift_last_timer_tick >= iflib_timer_default) {
2418		txq->ift_last_timer_tick = this_tick;
2419		IFDI_TIMER(ctx, txq->ift_id);
2420		if ((txq->ift_qstatus == IFLIB_QUEUE_HUNG) &&
2421		    ((txq->ift_cleaned_prev == txq->ift_cleaned) ||
2422		     (sctx->isc_pause_frames == 0)))
2423			goto hung;
2424
2425		if (txq->ift_qstatus != IFLIB_QUEUE_IDLE &&
2426		    ifmp_ring_is_stalled(txq->ift_br)) {
2427			KASSERT(ctx->ifc_link_state == LINK_STATE_UP,
2428			    ("queue can't be marked as hung if interface is down"));
2429			txq->ift_qstatus = IFLIB_QUEUE_HUNG;
2430		}
2431		txq->ift_cleaned_prev = txq->ift_cleaned;
2432	}
2433	/* handle any laggards */
2434	if (txq->ift_db_pending)
2435		GROUPTASK_ENQUEUE(&txq->ift_task);
2436
2437	sctx->isc_pause_frames = 0;
2438	if (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)
2439		callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer,
2440		    txq, txq->ift_timer.c_cpu);
2441	return;
2442
2443 hung:
2444	device_printf(ctx->ifc_dev,
2445	    "Watchdog timeout (TX: %d desc avail: %d pidx: %d) -- resetting\n",
2446	    txq->ift_id, TXQ_AVAIL(txq), txq->ift_pidx);
2447	STATE_LOCK(ctx);
2448	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
2449	ctx->ifc_flags |= (IFC_DO_WATCHDOG|IFC_DO_RESET);
2450	iflib_admin_intr_deferred(ctx);
2451	STATE_UNLOCK(ctx);
2452}
2453
2454static uint16_t
2455iflib_get_mbuf_size_for(unsigned int size)
2456{
2457
2458	if (size <= MCLBYTES)
2459		return (MCLBYTES);
2460	else
2461		return (MJUMPAGESIZE);
2462}
2463
2464static void
2465iflib_calc_rx_mbuf_sz(if_ctx_t ctx)
2466{
2467	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
2468
2469	/*
2470	 * XXX don't set the max_frame_size to larger
2471	 * than the hardware can handle
2472	 */
2473	ctx->ifc_rx_mbuf_sz =
2474	    iflib_get_mbuf_size_for(sctx->isc_max_frame_size);
2475}
2476
2477uint32_t
2478iflib_get_rx_mbuf_sz(if_ctx_t ctx)
2479{
2480
2481	return (ctx->ifc_rx_mbuf_sz);
2482}
2483
2484static void
2485iflib_init_locked(if_ctx_t ctx)
2486{
2487	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
2488	if_t ifp = ctx->ifc_ifp;
2489	iflib_fl_t fl;
2490	iflib_txq_t txq;
2491	iflib_rxq_t rxq;
2492	int i, j, tx_ip_csum_flags, tx_ip6_csum_flags;
2493
2494	if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
2495	IFDI_INTR_DISABLE(ctx);
2496
2497	/*
2498	 * See iflib_stop(). Useful in case iflib_init_locked() is
2499	 * called without first calling iflib_stop().
2500	 */
2501	netmap_disable_all_rings(ifp);
2502
2503	tx_ip_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP);
2504	tx_ip6_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_IP6_SCTP);
2505	/* Set hardware offload abilities */
2506	if_clearhwassist(ifp);
2507	if (if_getcapenable(ifp) & IFCAP_TXCSUM)
2508		if_sethwassistbits(ifp, tx_ip_csum_flags, 0);
2509	if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
2510		if_sethwassistbits(ifp,  tx_ip6_csum_flags, 0);
2511	if (if_getcapenable(ifp) & IFCAP_TSO4)
2512		if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
2513	if (if_getcapenable(ifp) & IFCAP_TSO6)
2514		if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
2515
2516	for (i = 0, txq = ctx->ifc_txqs; i < scctx->isc_ntxqsets; i++, txq++) {
2517		CALLOUT_LOCK(txq);
2518		callout_stop(&txq->ift_timer);
2519#ifdef DEV_NETMAP
2520		callout_stop(&txq->ift_netmap_timer);
2521#endif /* DEV_NETMAP */
2522		CALLOUT_UNLOCK(txq);
2523		(void)iflib_netmap_txq_init(ctx, txq);
2524	}
2525
2526	/*
2527	 * Calculate a suitable Rx mbuf size prior to calling IFDI_INIT, so
2528	 * that drivers can use the value when setting up the hardware receive
2529	 * buffers.
2530	 */
2531	iflib_calc_rx_mbuf_sz(ctx);
2532
2533#ifdef INVARIANTS
2534	i = if_getdrvflags(ifp);
2535#endif
2536	IFDI_INIT(ctx);
2537	MPASS(if_getdrvflags(ifp) == i);
2538	for (i = 0, rxq = ctx->ifc_rxqs; i < scctx->isc_nrxqsets; i++, rxq++) {
2539		if (iflib_netmap_rxq_init(ctx, rxq) > 0) {
2540			/* This rxq is in netmap mode. Skip normal init. */
2541			continue;
2542		}
2543		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
2544			if (iflib_fl_setup(fl)) {
2545				device_printf(ctx->ifc_dev,
2546				    "setting up free list %d failed - "
2547				    "check cluster settings\n", j);
2548				goto done;
2549			}
2550		}
2551	}
2552done:
2553	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
2554	IFDI_INTR_ENABLE(ctx);
2555	txq = ctx->ifc_txqs;
2556	for (i = 0; i < scctx->isc_ntxqsets; i++, txq++)
2557		callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer, txq,
2558			txq->ift_timer.c_cpu);
2559
	/* Re-enable txsync/rxsync. */
2561	netmap_enable_all_rings(ifp);
2562}
2563
2564static int
2565iflib_media_change(if_t ifp)
2566{
2567	if_ctx_t ctx = if_getsoftc(ifp);
2568	int err;
2569
2570	CTX_LOCK(ctx);
2571	if ((err = IFDI_MEDIA_CHANGE(ctx)) == 0)
2572		iflib_if_init_locked(ctx);
2573	CTX_UNLOCK(ctx);
2574	return (err);
2575}
2576
2577static void
2578iflib_media_status(if_t ifp, struct ifmediareq *ifmr)
2579{
2580	if_ctx_t ctx = if_getsoftc(ifp);
2581
2582	CTX_LOCK(ctx);
2583	IFDI_UPDATE_ADMIN_STATUS(ctx);
2584	IFDI_MEDIA_STATUS(ctx, ifmr);
2585	CTX_UNLOCK(ctx);
2586}
2587
2588void
2589iflib_stop(if_ctx_t ctx)
2590{
2591	iflib_txq_t txq = ctx->ifc_txqs;
2592	iflib_rxq_t rxq = ctx->ifc_rxqs;
2593	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
2594	if_shared_ctx_t sctx = ctx->ifc_sctx;
2595	iflib_dma_info_t di;
2596	iflib_fl_t fl;
2597	int i, j;
2598
2599	/* Tell the stack that the interface is no longer active */
2600	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
2601
2602	IFDI_INTR_DISABLE(ctx);
2603	DELAY(1000);
2604	IFDI_STOP(ctx);
2605	DELAY(1000);
2606
2607	/*
2608	 * Stop any pending txsync/rxsync and prevent new ones
	 * from starting. Processes blocked in poll() will get
2610	 * POLLERR.
2611	 */
2612	netmap_disable_all_rings(ctx->ifc_ifp);
2613
2614	iflib_debug_reset();
2615	/* Wait for current tx queue users to exit to disarm watchdog timer. */
2616	for (i = 0; i < scctx->isc_ntxqsets; i++, txq++) {
2617		/* make sure all transmitters have completed before proceeding XXX */
2618
2619		CALLOUT_LOCK(txq);
2620		callout_stop(&txq->ift_timer);
2621#ifdef DEV_NETMAP
2622		callout_stop(&txq->ift_netmap_timer);
2623#endif /* DEV_NETMAP */
2624		CALLOUT_UNLOCK(txq);
2625
2626		/* clean any enqueued buffers */
2627		iflib_ifmp_purge(txq);
2628		/* Free any existing tx buffers. */
2629		for (j = 0; j < txq->ift_size; j++) {
2630			iflib_txsd_free(ctx, txq, j);
2631		}
2632		txq->ift_processed = txq->ift_cleaned = txq->ift_cidx_processed = 0;
2633		txq->ift_in_use = txq->ift_gen = txq->ift_no_desc_avail = 0;
2634		if (sctx->isc_flags & IFLIB_PRESERVE_TX_INDICES)
2635			txq->ift_cidx = txq->ift_pidx;
2636		else
2637			txq->ift_cidx = txq->ift_pidx = 0;
2638
2639		txq->ift_closed = txq->ift_mbuf_defrag = txq->ift_mbuf_defrag_failed = 0;
2640		txq->ift_no_tx_dma_setup = txq->ift_txd_encap_efbig = txq->ift_map_failed = 0;
2641		txq->ift_pullups = 0;
2642		ifmp_ring_reset_stats(txq->ift_br);
2643		for (j = 0, di = txq->ift_ifdi; j < sctx->isc_ntxqs; j++, di++)
2644			bzero((void *)di->idi_vaddr, di->idi_size);
2645	}
2646	for (i = 0; i < scctx->isc_nrxqsets; i++, rxq++) {
2647		if (rxq->ifr_task.gt_taskqueue != NULL)
2648			gtaskqueue_drain(rxq->ifr_task.gt_taskqueue,
2649				 &rxq->ifr_task.gt_task);
2650
2651		rxq->ifr_cq_cidx = 0;
2652		for (j = 0, di = rxq->ifr_ifdi; j < sctx->isc_nrxqs; j++, di++)
2653			bzero((void *)di->idi_vaddr, di->idi_size);
2654		/* also resets the free lists pidx/cidx */
2655		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
2656			iflib_fl_bufs_free(fl);
2657	}
2658}
2659
2660static inline caddr_t
2661calc_next_rxd(iflib_fl_t fl, int cidx)
2662{
2663	qidx_t size;
2664	int nrxd;
2665	caddr_t start, end, cur, next;
2666
2667	nrxd = fl->ifl_size;
2668	size = fl->ifl_rxd_size;
2669	start = fl->ifl_ifdi->idi_vaddr;
2670
2671	if (__predict_false(size == 0))
2672		return (start);
2673	cur = start + size*cidx;
2674	end = start + size*nrxd;
2675	next = CACHE_PTR_NEXT(cur);
2676	return (next < end ? next : start);
2677}
2678
2679static inline void
2680prefetch_pkts(iflib_fl_t fl, int cidx)
2681{
2682	int nextptr;
2683	int nrxd = fl->ifl_size;
2684	caddr_t next_rxd;
2685
2686	nextptr = (cidx + CACHE_PTR_INCREMENT) & (nrxd-1);
2687	prefetch(&fl->ifl_sds.ifsd_m[nextptr]);
2688	prefetch(&fl->ifl_sds.ifsd_cl[nextptr]);
2689	next_rxd = calc_next_rxd(fl, cidx);
2690	prefetch(next_rxd);
2691	prefetch(fl->ifl_sds.ifsd_m[(cidx + 1) & (nrxd-1)]);
2692	prefetch(fl->ifl_sds.ifsd_m[(cidx + 2) & (nrxd-1)]);
2693	prefetch(fl->ifl_sds.ifsd_m[(cidx + 3) & (nrxd-1)]);
2694	prefetch(fl->ifl_sds.ifsd_m[(cidx + 4) & (nrxd-1)]);
2695	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 1) & (nrxd-1)]);
2696	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 2) & (nrxd-1)]);
2697	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 3) & (nrxd-1)]);
2698	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 4) & (nrxd-1)]);
2699}
2700
2701static struct mbuf *
2702rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, bool unload, if_rxsd_t sd,
2703    int *pf_rv, if_rxd_info_t ri)
2704{
2705	bus_dmamap_t map;
2706	iflib_fl_t fl;
2707	caddr_t payload;
2708	struct mbuf *m;
2709	int flid, cidx, len, next;
2710
2711	map = NULL;
2712	flid = irf->irf_flid;
2713	cidx = irf->irf_idx;
2714	fl = &rxq->ifr_fl[flid];
2715	sd->ifsd_fl = fl;
2716	sd->ifsd_cl = &fl->ifl_sds.ifsd_cl[cidx];
2717	fl->ifl_credits--;
2718#if MEMORY_LOGGING
2719	fl->ifl_m_dequeued++;
2720#endif
2721	if (rxq->ifr_ctx->ifc_flags & IFC_PREFETCH)
2722		prefetch_pkts(fl, cidx);
2723	next = (cidx + CACHE_PTR_INCREMENT) & (fl->ifl_size-1);
2724	prefetch(&fl->ifl_sds.ifsd_map[next]);
2725	map = fl->ifl_sds.ifsd_map[cidx];
2726
2727	bus_dmamap_sync(fl->ifl_buf_tag, map, BUS_DMASYNC_POSTREAD);
2728
2729	if (rxq->pfil != NULL && PFIL_HOOKED_IN(rxq->pfil) && pf_rv != NULL &&
2730	    irf->irf_len != 0) {
2731		payload  = *sd->ifsd_cl;
2732		payload +=  ri->iri_pad;
2733		len = ri->iri_len - ri->iri_pad;
2734		*pf_rv = pfil_mem_in(rxq->pfil, payload, len, ri->iri_ifp, &m);
2735		switch (*pf_rv) {
2736		case PFIL_DROPPED:
2737		case PFIL_CONSUMED:
2738			/*
2739			 * The filter ate it.  Everything is recycled.
2740			 */
2741			m = NULL;
2742			unload = 0;
2743			break;
2744		case PFIL_REALLOCED:
2745			/*
2746			 * The filter copied it.  Everything is recycled.
2747			 * 'm' points at new mbuf.
2748			 */
2749			unload = 0;
2750			break;
2751		case PFIL_PASS:
2752			/*
2753			 * Filter said it was OK, so receive like
2754			 * normal
2755			 */
2756			m = fl->ifl_sds.ifsd_m[cidx];
2757			fl->ifl_sds.ifsd_m[cidx] = NULL;
2758			break;
2759		default:
2760			MPASS(0);
2761		}
2762	} else {
2763		m = fl->ifl_sds.ifsd_m[cidx];
2764		fl->ifl_sds.ifsd_m[cidx] = NULL;
2765		if (pf_rv != NULL)
2766			*pf_rv = PFIL_PASS;
2767	}
2768
2769	if (unload && irf->irf_len != 0)
2770		bus_dmamap_unload(fl->ifl_buf_tag, map);
2771	fl->ifl_cidx = (fl->ifl_cidx + 1) & (fl->ifl_size-1);
2772	if (__predict_false(fl->ifl_cidx == 0))
2773		fl->ifl_gen = 0;
2774	bit_clear(fl->ifl_rx_bitmap, cidx);
2775	return (m);
2776}
2777
2778static struct mbuf *
2779assemble_segments(iflib_rxq_t rxq, if_rxd_info_t ri, if_rxsd_t sd, int *pf_rv)
2780{
2781	struct mbuf *m, *mh, *mt;
2782	caddr_t cl;
2783	int  *pf_rv_ptr, flags, i, padlen;
2784	bool consumed;
2785
2786	i = 0;
2787	mh = NULL;
2788	consumed = false;
2789	*pf_rv = PFIL_PASS;
2790	pf_rv_ptr = pf_rv;
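	/*
	 * Walk the hardware-reported fragments, turning each cluster into an
	 * mbuf and chaining them into a single packet.
	 */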
2791	do {
2792		m = rxd_frag_to_sd(rxq, &ri->iri_frags[i], !consumed, sd,
2793		    pf_rv_ptr, ri);
2794
2795		MPASS(*sd->ifsd_cl != NULL);
2796
2797		/*
2798		 * Exclude zero-length frags & frags from
2799		 * packets the filter has consumed or dropped
2800		 */
2801		if (ri->iri_frags[i].irf_len == 0 || consumed ||
2802		    *pf_rv == PFIL_CONSUMED || *pf_rv == PFIL_DROPPED) {
2803			if (mh == NULL) {
2804				/* everything saved here */
2805				consumed = true;
2806				pf_rv_ptr = NULL;
2807				continue;
2808			}
2809			/* XXX we can save the cluster here, but not the mbuf */
2810			m_init(m, M_NOWAIT, MT_DATA, 0);
2811			m_free(m);
2812			continue;
2813		}
2814		if (mh == NULL) {
2815			flags = M_PKTHDR|M_EXT;
2816			mh = mt = m;
2817			padlen = ri->iri_pad;
2818		} else {
2819			flags = M_EXT;
2820			mt->m_next = m;
2821			mt = m;
2822			/* assuming padding is only on the first fragment */
2823			padlen = 0;
2824		}
2825		cl = *sd->ifsd_cl;
2826		*sd->ifsd_cl = NULL;
2827
2828		/* Can these two be made one ? */
2829		m_init(m, M_NOWAIT, MT_DATA, flags);
2830		m_cljset(m, cl, sd->ifsd_fl->ifl_cltype);
2831		/*
2832		 * These must follow m_init and m_cljset
2833		 */
2834		m->m_data += padlen;
2835		ri->iri_len -= padlen;
2836		m->m_len = ri->iri_frags[i].irf_len;
2837	} while (++i < ri->iri_nfrags);
2838
2839	return (mh);
2840}
2841
2842/*
2843 * Process one software descriptor
2844 */
2845static struct mbuf *
2846iflib_rxd_pkt_get(iflib_rxq_t rxq, if_rxd_info_t ri)
2847{
2848	struct if_rxsd sd;
2849	struct mbuf *m;
2850	int pf_rv;
2851
2852	/* should I merge this back in now that the two paths are basically duplicated? */
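	/*
	 * Small single-fragment packets are copied into the mbuf's own
	 * storage so that the receive cluster can stay in place and be
	 * reused on the next refill.
	 */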
2853	if (ri->iri_nfrags == 1 &&
2854	    ri->iri_frags[0].irf_len != 0 &&
2855	    ri->iri_frags[0].irf_len <= MIN(IFLIB_RX_COPY_THRESH, MHLEN)) {
2856		m = rxd_frag_to_sd(rxq, &ri->iri_frags[0], false, &sd,
2857		    &pf_rv, ri);
2858		if (pf_rv != PFIL_PASS && pf_rv != PFIL_REALLOCED)
2859			return (m);
2860		if (pf_rv == PFIL_PASS) {
2861			m_init(m, M_NOWAIT, MT_DATA, M_PKTHDR);
2862#ifndef __NO_STRICT_ALIGNMENT
2863			if (!IP_ALIGNED(m) && ri->iri_pad == 0)
2864				m->m_data += 2;
2865#endif
2866			memcpy(m->m_data, *sd.ifsd_cl, ri->iri_len);
2867			m->m_len = ri->iri_frags[0].irf_len;
2868			m->m_data += ri->iri_pad;
2869			ri->iri_len -= ri->iri_pad;
2870		}
2871	} else {
2872		m = assemble_segments(rxq, ri, &sd, &pf_rv);
2873		if (m == NULL)
2874			return (NULL);
2875		if (pf_rv != PFIL_PASS && pf_rv != PFIL_REALLOCED)
2876			return (m);
2877	}
2878	m->m_pkthdr.len = ri->iri_len;
2879	m->m_pkthdr.rcvif = ri->iri_ifp;
2880	m->m_flags |= ri->iri_flags;
2881	m->m_pkthdr.ether_vtag = ri->iri_vtag;
2882	m->m_pkthdr.flowid = ri->iri_flowid;
2883	M_HASHTYPE_SET(m, ri->iri_rsstype);
2884	m->m_pkthdr.csum_flags = ri->iri_csum_flags;
2885	m->m_pkthdr.csum_data = ri->iri_csum_data;
2886	return (m);
2887}
2888
2889#if defined(INET6) || defined(INET)
2890static void
2891iflib_get_ip_forwarding(struct lro_ctrl *lc, bool *v4, bool *v6)
2892{
2893	CURVNET_SET(if_getvnet(lc->ifp));
2894#if defined(INET6)
2895	*v6 = V_ip6_forwarding;
2896#endif
2897#if defined(INET)
2898	*v4 = V_ipforwarding;
2899#endif
2900	CURVNET_RESTORE();
2901}
2902
2903/*
2904 * Returns true if it's possible this packet could be LROed.
 * If it returns false, it is guaranteed that tcp_lro_rx()
2906 * would not return zero.
2907 */
2908static bool
2909iflib_check_lro_possible(struct mbuf *m, bool v4_forwarding, bool v6_forwarding)
2910{
2911	struct ether_header *eh;
2912
2913	eh = mtod(m, struct ether_header *);
2914	switch (eh->ether_type) {
2915#if defined(INET6)
2916		case htons(ETHERTYPE_IPV6):
2917			return (!v6_forwarding);
2918#endif
2919#if defined (INET)
2920		case htons(ETHERTYPE_IP):
2921			return (!v4_forwarding);
2922#endif
2923	}
2924
	return (false);
2926}
2927#else
2928static void
2929iflib_get_ip_forwarding(struct lro_ctrl *lc __unused, bool *v4 __unused, bool *v6 __unused)
2930{
2931}
2932#endif
2933
2934static void
2935_task_fn_rx_watchdog(void *context)
2936{
2937	iflib_rxq_t rxq = context;
2938
2939	GROUPTASK_ENQUEUE(&rxq->ifr_task);
2940}
2941
2942static uint8_t
2943iflib_rxeof(iflib_rxq_t rxq, qidx_t budget)
2944{
2945	if_t ifp;
2946	if_ctx_t ctx = rxq->ifr_ctx;
2947	if_shared_ctx_t sctx = ctx->ifc_sctx;
2948	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
2949	int avail, i;
2950	qidx_t *cidxp;
2951	struct if_rxd_info ri;
2952	int err, budget_left, rx_bytes, rx_pkts;
2953	iflib_fl_t fl;
2954	int lro_enabled;
2955	bool v4_forwarding, v6_forwarding, lro_possible;
2956	uint8_t retval = 0;
2957
2958	/*
2959	 * XXX early demux data packets so that if_input processing only handles
2960	 * acks in interrupt context
2961	 */
2962	struct mbuf *m, *mh, *mt, *mf;
2963
2964	NET_EPOCH_ASSERT();
2965
2966	lro_possible = v4_forwarding = v6_forwarding = false;
2967	ifp = ctx->ifc_ifp;
2968	mh = mt = NULL;
2969	MPASS(budget > 0);
2970	rx_pkts	= rx_bytes = 0;
2971	if (sctx->isc_flags & IFLIB_HAS_RXCQ)
2972		cidxp = &rxq->ifr_cq_cidx;
2973	else
2974		cidxp = &rxq->ifr_fl[0].ifl_cidx;
2975	if ((avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget)) == 0) {
2976		for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
2977			retval |= iflib_fl_refill_all(ctx, fl);
2978		DBG_COUNTER_INC(rx_unavail);
2979		return (retval);
2980	}
2981
2982	/* pfil needs the vnet to be set */
2983	CURVNET_SET_QUIET(if_getvnet(ifp));
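	/* Harvest completed descriptors from the hardware, up to the budget. */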
2984	for (budget_left = budget; budget_left > 0 && avail > 0;) {
2985		if (__predict_false(!CTX_ACTIVE(ctx))) {
2986			DBG_COUNTER_INC(rx_ctx_inactive);
2987			break;
2988		}
2989		/*
2990		 * Reset client set fields to their default values
2991		 */
2992		rxd_info_zero(&ri);
2993		ri.iri_qsidx = rxq->ifr_id;
2994		ri.iri_cidx = *cidxp;
2995		ri.iri_ifp = ifp;
2996		ri.iri_frags = rxq->ifr_frags;
2997		err = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
2998
2999		if (err)
3000			goto err;
3001		rx_pkts += 1;
3002		rx_bytes += ri.iri_len;
3003		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
3004			*cidxp = ri.iri_cidx;
3005			/* Update our consumer index */
3006			/* XXX NB: shurd - check if this is still safe */
3007			while (rxq->ifr_cq_cidx >= scctx->isc_nrxd[0])
3008				rxq->ifr_cq_cidx -= scctx->isc_nrxd[0];
3009			/* was this only a completion queue message? */
3010			if (__predict_false(ri.iri_nfrags == 0))
3011				continue;
3012		}
3013		MPASS(ri.iri_nfrags != 0);
3014		MPASS(ri.iri_len != 0);
3015
3016		/* will advance the cidx on the corresponding free lists */
3017		m = iflib_rxd_pkt_get(rxq, &ri);
3018		avail--;
3019		budget_left--;
3020		if (avail == 0 && budget_left)
3021			avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget_left);
3022
3023		if (__predict_false(m == NULL))
3024			continue;
3025
3026		/* imm_pkt: -- cxgb */
3027		if (mh == NULL)
3028			mh = mt = m;
3029		else {
3030			mt->m_nextpkt = m;
3031			mt = m;
3032		}
3033	}
3034	CURVNET_RESTORE();
3035	/* make sure that we can refill faster than drain */
3036	for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
3037		retval |= iflib_fl_refill_all(ctx, fl);
3038
3039	lro_enabled = (if_getcapenable(ifp) & IFCAP_LRO);
3040	if (lro_enabled)
3041		iflib_get_ip_forwarding(&rxq->ifr_lc, &v4_forwarding, &v6_forwarding);
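	/*
	 * Pass the harvested packets up the stack: LRO candidates go through
	 * tcp_lro_rx() when LRO is enabled, everything else is batched up for
	 * if_input().
	 */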
3042	mt = mf = NULL;
3043	while (mh != NULL) {
3044		m = mh;
3045		mh = mh->m_nextpkt;
3046		m->m_nextpkt = NULL;
3047#ifndef __NO_STRICT_ALIGNMENT
3048		if (!IP_ALIGNED(m) && (m = iflib_fixup_rx(m)) == NULL)
3049			continue;
3050#endif
3051#if defined(INET6) || defined(INET)
3052		if (lro_enabled) {
3053			if (!lro_possible) {
3054				lro_possible = iflib_check_lro_possible(m, v4_forwarding, v6_forwarding);
3055				if (lro_possible && mf != NULL) {
3056					if_input(ifp, mf);
3057					DBG_COUNTER_INC(rx_if_input);
3058					mt = mf = NULL;
3059				}
3060			}
3061			if ((m->m_pkthdr.csum_flags & (CSUM_L4_CALC|CSUM_L4_VALID)) ==
3062			    (CSUM_L4_CALC|CSUM_L4_VALID)) {
3063				if (lro_possible && tcp_lro_rx(&rxq->ifr_lc, m, 0) == 0)
3064					continue;
3065			}
3066		}
3067#endif
3068		if (lro_possible) {
3069			if_input(ifp, m);
3070			DBG_COUNTER_INC(rx_if_input);
3071			continue;
3072		}
3073
3074		if (mf == NULL)
3075			mf = m;
3076		if (mt != NULL)
3077			mt->m_nextpkt = m;
3078		mt = m;
3079	}
3080	if (mf != NULL) {
3081		if_input(ifp, mf);
3082		DBG_COUNTER_INC(rx_if_input);
3083	}
3084
3085	if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes);
3086	if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts);
3087
3088	/*
3089	 * Flush any outstanding LRO work
3090	 */
3091#if defined(INET6) || defined(INET)
3092	tcp_lro_flush_all(&rxq->ifr_lc);
3093#endif
3094	if (avail != 0 || iflib_rxd_avail(ctx, rxq, *cidxp, 1) != 0)
3095		retval |= IFLIB_RXEOF_MORE;
3096	return (retval);
3097err:
3098	STATE_LOCK(ctx);
3099	ctx->ifc_flags |= IFC_DO_RESET;
3100	iflib_admin_intr_deferred(ctx);
3101	STATE_UNLOCK(ctx);
3102	return (0);
3103}
3104
3105#define TXD_NOTIFY_COUNT(txq) (((txq)->ift_size / (txq)->ift_update_freq)-1)
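/*
 * Scale doorbell and report-status deferral with ring occupancy: the busier
 * the ring, the more descriptor updates are batched before notifying the
 * hardware.
 */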
3106static inline qidx_t
3107txq_max_db_deferred(iflib_txq_t txq, qidx_t in_use)
3108{
3109	qidx_t notify_count = TXD_NOTIFY_COUNT(txq);
3110	qidx_t minthresh = txq->ift_size / 8;
3111	if (in_use > 4*minthresh)
3112		return (notify_count);
3113	if (in_use > 2*minthresh)
3114		return (notify_count >> 1);
3115	if (in_use > minthresh)
3116		return (notify_count >> 3);
3117	return (0);
3118}
3119
3120static inline qidx_t
3121txq_max_rs_deferred(iflib_txq_t txq)
3122{
3123	qidx_t notify_count = TXD_NOTIFY_COUNT(txq);
3124	qidx_t minthresh = txq->ift_size / 8;
3125	if (txq->ift_in_use > 4*minthresh)
3126		return (notify_count);
3127	if (txq->ift_in_use > 2*minthresh)
3128		return (notify_count >> 1);
3129	if (txq->ift_in_use > minthresh)
3130		return (notify_count >> 2);
3131	return (2);
3132}
3133
3134#define M_CSUM_FLAGS(m) ((m)->m_pkthdr.csum_flags)
3135#define M_HAS_VLANTAG(m) (m->m_flags & M_VLANTAG)
3136
3137#define TXQ_MAX_DB_DEFERRED(txq, in_use) txq_max_db_deferred((txq), (in_use))
3138#define TXQ_MAX_RS_DEFERRED(txq) txq_max_rs_deferred(txq)
3139#define TXQ_MAX_DB_CONSUMED(size) (size >> 4)
3140
3141/* forward compatibility for cxgb */
3142#define FIRST_QSET(ctx) 0
3143#define NTXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_ntxqsets)
3144#define NRXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_nrxqsets)
3145#define QIDX(ctx, m) ((((m)->m_pkthdr.flowid & ctx->ifc_softc_ctx.isc_rss_table_mask) % NTXQSETS(ctx)) + FIRST_QSET(ctx))
3146#define DESC_RECLAIMABLE(q) ((int)((q)->ift_processed - (q)->ift_cleaned - (q)->ift_ctx->ifc_softc_ctx.isc_tx_nsegments))
3147
3148/* XXX we should be setting this to something other than zero */
3149#define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh)
3150#define	MAX_TX_DESC(ctx) MAX((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max, \
3151    (ctx)->ifc_softc_ctx.isc_tx_nsegments)
3152
3153static inline bool
3154iflib_txd_db_check(iflib_txq_t txq, int ring)
3155{
3156	if_ctx_t ctx = txq->ift_ctx;
3157	qidx_t dbval, max;
3158
3159	max = TXQ_MAX_DB_DEFERRED(txq, txq->ift_in_use);
3160
3161	/* force || threshold exceeded || at the edge of the ring */
3162	if (ring || (txq->ift_db_pending >= max) || (TXQ_AVAIL(txq) <= MAX_TX_DESC(ctx) + 2)) {
3163
3164		/*
3165		 * 'npending' is used if the card's doorbell is in terms of the number of descriptors
		 * pending flush (BRCM). 'pidx' is used in cases where the card's doorbell uses the
3167		 * producer index explicitly (INTC).
3168		 */
3169		dbval = txq->ift_npending ? txq->ift_npending : txq->ift_pidx;
3170		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
3171		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
3172		ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, dbval);
3173
3174		/*
		 * Absent bugs, there are zero packets pending, so reset the pending counts to zero.
3176		 */
3177		txq->ift_db_pending = txq->ift_npending = 0;
3178		return (true);
3179	}
3180	return (false);
3181}
3182
3183#ifdef PKT_DEBUG
3184static void
3185print_pkt(if_pkt_info_t pi)
3186{
3187	printf("pi len:  %d qsidx: %d nsegs: %d ndescs: %d flags: %x pidx: %d\n",
3188	       pi->ipi_len, pi->ipi_qsidx, pi->ipi_nsegs, pi->ipi_ndescs, pi->ipi_flags, pi->ipi_pidx);
3189	printf("pi new_pidx: %d csum_flags: %lx tso_segsz: %d mflags: %x vtag: %d\n",
3190	       pi->ipi_new_pidx, pi->ipi_csum_flags, pi->ipi_tso_segsz, pi->ipi_mflags, pi->ipi_vtag);
3191	printf("pi etype: %d ehdrlen: %d ip_hlen: %d ipproto: %d\n",
3192	       pi->ipi_etype, pi->ipi_ehdrlen, pi->ipi_ip_hlen, pi->ipi_ipproto);
3193}
3194#endif
3195
3196#define IS_TSO4(pi) ((pi)->ipi_csum_flags & CSUM_IP_TSO)
3197#define IS_TX_OFFLOAD4(pi) ((pi)->ipi_csum_flags & (CSUM_IP_TCP | CSUM_IP_TSO))
3198#define IS_TSO6(pi) ((pi)->ipi_csum_flags & CSUM_IP6_TSO)
3199#define IS_TX_OFFLOAD6(pi) ((pi)->ipi_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_TSO))
3200
3201/**
3202 * Parses out ethernet header information in the given mbuf.
3203 * Returns in pi: ipi_etype (EtherType) and ipi_ehdrlen (Ethernet header length)
3204 *
3205 * This will account for the VLAN header if present.
3206 *
3207 * XXX: This doesn't handle QinQ, which could prevent TX offloads for those
3208 * types of packets.
3209 */
3210static int
3211iflib_parse_ether_header(if_pkt_info_t pi, struct mbuf **mp, uint64_t *pullups)
3212{
3213	struct ether_vlan_header *eh;
3214	struct mbuf *m;
3215
3216	m = *mp;
3217	if (__predict_false(m->m_len < sizeof(*eh))) {
3218		(*pullups)++;
3219		if (__predict_false((m = m_pullup(m, sizeof(*eh))) == NULL))
3220			return (ENOMEM);
3221	}
3222	eh = mtod(m, struct ether_vlan_header *);
3223	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
3224		pi->ipi_etype = ntohs(eh->evl_proto);
3225		pi->ipi_ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3226	} else {
3227		pi->ipi_etype = ntohs(eh->evl_encap_proto);
3228		pi->ipi_ehdrlen = ETHER_HDR_LEN;
3229	}
3230	*mp = m;
3231
3232	return (0);
3233}
3234
3235/**
3236 * Parse up to the L3 header and extract IPv4/IPv6 header information into pi.
3237 * Currently this information includes: IP ToS value, IP header version/presence
3238 *
3239 * This is missing some checks and doesn't edit the packet content as it goes,
3240 * unlike iflib_parse_header(), in order to keep the amount of code here minimal.
3241 */
3242static int
3243iflib_parse_header_partial(if_pkt_info_t pi, struct mbuf **mp, uint64_t *pullups)
3244{
3245	struct mbuf *m;
3246	int err;
3247
3248	*pullups = 0;
3249	m = *mp;
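	/* Work on a writable mbuf chain, since the parsing below may pull data up. */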
3250	if (!M_WRITABLE(m)) {
3251		if ((m = m_dup(m, M_NOWAIT)) == NULL) {
3252			return (ENOMEM);
3253		} else {
3254			m_freem(*mp);
3255			DBG_COUNTER_INC(tx_frees);
3256			*mp = m;
3257		}
3258	}
3259
3260	/* Fills out pi->ipi_etype */
3261	err = iflib_parse_ether_header(pi, mp, pullups);
3262	if (err)
3263		return (err);
3264	m = *mp;
3265
3266	switch (pi->ipi_etype) {
3267#ifdef INET
3268	case ETHERTYPE_IP:
3269	{
3270		struct mbuf *n;
3271		struct ip *ip = NULL;
3272		int miniplen;
3273
3274		miniplen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip));
3275		if (__predict_false(m->m_len < miniplen)) {
3276			/*
3277			 * Check for common case where the first mbuf only contains
3278			 * the Ethernet header
3279			 */
3280			if (m->m_len == pi->ipi_ehdrlen) {
3281				n = m->m_next;
3282				MPASS(n);
3283				/* If next mbuf contains at least the minimal IP header, then stop */
3284				if (n->m_len >= sizeof(*ip)) {
3285					ip = (struct ip *)n->m_data;
3286				} else {
3287					(*pullups)++;
3288					if (__predict_false((m = m_pullup(m, miniplen)) == NULL))
3289						return (ENOMEM);
3290					ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
3291				}
3292			} else {
3293				(*pullups)++;
3294				if (__predict_false((m = m_pullup(m, miniplen)) == NULL))
3295					return (ENOMEM);
3296				ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
3297			}
3298		} else {
3299			ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
3300		}
3301
3302		/* Have the IPv4 header w/ no options here */
3303		pi->ipi_ip_hlen = ip->ip_hl << 2;
3304		pi->ipi_ipproto = ip->ip_p;
3305		pi->ipi_ip_tos = ip->ip_tos;
3306		pi->ipi_flags |= IPI_TX_IPV4;
3307
3308		break;
3309	}
3310#endif
3311#ifdef INET6
3312	case ETHERTYPE_IPV6:
3313	{
3314		struct ip6_hdr *ip6;
3315
3316		if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) {
3317			(*pullups)++;
3318			if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL))
3319				return (ENOMEM);
3320		}
3321		ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen);
3322
3323		/* Have the IPv6 fixed header here */
3324		pi->ipi_ip_hlen = sizeof(struct ip6_hdr);
3325		pi->ipi_ipproto = ip6->ip6_nxt;
3326		pi->ipi_ip_tos = IPV6_TRAFFIC_CLASS(ip6);
3327		pi->ipi_flags |= IPI_TX_IPV6;
3328
3329		break;
3330	}
3331#endif
3332	default:
3333		pi->ipi_csum_flags &= ~CSUM_OFFLOAD;
3334		pi->ipi_ip_hlen = 0;
3335		break;
3336	}
3337	*mp = m;
3338
	return (0);
}
3342
3343static int
3344iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp)
3345{
3346	if_shared_ctx_t sctx = txq->ift_ctx->ifc_sctx;
3347	struct mbuf *m;
3348	int err;
3349
3350	m = *mp;
3351	if ((sctx->isc_flags & IFLIB_NEED_SCRATCH) &&
3352	    M_WRITABLE(m) == 0) {
3353		if ((m = m_dup(m, M_NOWAIT)) == NULL) {
3354			return (ENOMEM);
3355		} else {
3356			m_freem(*mp);
3357			DBG_COUNTER_INC(tx_frees);
3358			*mp = m;
3359		}
3360	}
3361
3362	/* Fills out pi->ipi_etype */
3363	err = iflib_parse_ether_header(pi, mp, &txq->ift_pullups);
3364	if (__predict_false(err))
3365		return (err);
3366	m = *mp;
3367
3368	switch (pi->ipi_etype) {
3369#ifdef INET
3370	case ETHERTYPE_IP:
3371	{
3372		struct mbuf *n;
3373		struct ip *ip = NULL;
3374		struct tcphdr *th = NULL;
3375		int minthlen;
3376
3377		minthlen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip) + sizeof(*th));
3378		if (__predict_false(m->m_len < minthlen)) {
3379			/*
			 * if this code bloat is causing too much of a hit,
3381			 * move it to a separate function and mark it noinline
3382			 */
3383			if (m->m_len == pi->ipi_ehdrlen) {
3384				n = m->m_next;
3385				MPASS(n);
3386				if (n->m_len >= sizeof(*ip))  {
3387					ip = (struct ip *)n->m_data;
3388					if (n->m_len >= (ip->ip_hl << 2) + sizeof(*th))
3389						th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
3390				} else {
3391					txq->ift_pullups++;
3392					if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
3393						return (ENOMEM);
3394					ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
3395				}
3396			} else {
3397				txq->ift_pullups++;
3398				if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
3399					return (ENOMEM);
3400				ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
3401				if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th))
3402					th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
3403			}
3404		} else {
3405			ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
3406			if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th))
3407				th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
3408		}
3409		pi->ipi_ip_hlen = ip->ip_hl << 2;
3410		pi->ipi_ipproto = ip->ip_p;
3411		pi->ipi_ip_tos = ip->ip_tos;
3412		pi->ipi_flags |= IPI_TX_IPV4;
3413
3414		/* TCP checksum offload may require TCP header length */
3415		if (IS_TX_OFFLOAD4(pi)) {
3416			if (__predict_true(pi->ipi_ipproto == IPPROTO_TCP)) {
3417				if (__predict_false(th == NULL)) {
3418					txq->ift_pullups++;
3419					if (__predict_false((m = m_pullup(m, (ip->ip_hl << 2) + sizeof(*th))) == NULL))
3420						return (ENOMEM);
3421					th = (struct tcphdr *)((caddr_t)ip + pi->ipi_ip_hlen);
3422				}
3423				pi->ipi_tcp_hflags = th->th_flags;
3424				pi->ipi_tcp_hlen = th->th_off << 2;
3425				pi->ipi_tcp_seq = th->th_seq;
3426			}
3427			if (IS_TSO4(pi)) {
3428				if (__predict_false(ip->ip_p != IPPROTO_TCP))
3429					return (ENXIO);
3430				/*
3431				 * TSO always requires hardware checksum offload.
3432				 */
3433				pi->ipi_csum_flags |= (CSUM_IP_TCP | CSUM_IP);
3434				th->th_sum = in_pseudo(ip->ip_src.s_addr,
3435						       ip->ip_dst.s_addr, htons(IPPROTO_TCP));
3436				pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
3437				if (sctx->isc_flags & IFLIB_TSO_INIT_IP) {
3438					ip->ip_sum = 0;
3439					ip->ip_len = htons(pi->ipi_ip_hlen + pi->ipi_tcp_hlen + pi->ipi_tso_segsz);
3440				}
3441			}
3442		}
3443		if ((sctx->isc_flags & IFLIB_NEED_ZERO_CSUM) && (pi->ipi_csum_flags & CSUM_IP))
			ip->ip_sum = 0;
3445
3446		break;
3447	}
3448#endif
3449#ifdef INET6
3450	case ETHERTYPE_IPV6:
3451	{
3452		struct ip6_hdr *ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen);
3453		struct tcphdr *th;
3454		pi->ipi_ip_hlen = sizeof(struct ip6_hdr);
3455
3456		if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) {
3457			txq->ift_pullups++;
3458			if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL))
3459				return (ENOMEM);
3460		}
3461		th = (struct tcphdr *)((caddr_t)ip6 + pi->ipi_ip_hlen);
3462
3463		/* XXX-BZ this will go badly in case of ext hdrs. */
3464		pi->ipi_ipproto = ip6->ip6_nxt;
3465		pi->ipi_ip_tos = IPV6_TRAFFIC_CLASS(ip6);
3466		pi->ipi_flags |= IPI_TX_IPV6;
3467
3468		/* TCP checksum offload may require TCP header length */
3469		if (IS_TX_OFFLOAD6(pi)) {
3470			if (pi->ipi_ipproto == IPPROTO_TCP) {
3471				if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) {
3472					txq->ift_pullups++;
3473					if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) == NULL))
3474						return (ENOMEM);
3475				}
3476				pi->ipi_tcp_hflags = th->th_flags;
3477				pi->ipi_tcp_hlen = th->th_off << 2;
3478				pi->ipi_tcp_seq = th->th_seq;
3479			}
3480			if (IS_TSO6(pi)) {
3481				if (__predict_false(ip6->ip6_nxt != IPPROTO_TCP))
3482					return (ENXIO);
3483				/*
3484				 * TSO always requires hardware checksum offload.
3485				 */
3486				pi->ipi_csum_flags |= CSUM_IP6_TCP;
3487				th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
3488				pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
3489			}
3490		}
3491		break;
3492	}
3493#endif
3494	default:
3495		pi->ipi_csum_flags &= ~CSUM_OFFLOAD;
3496		pi->ipi_ip_hlen = 0;
3497		break;
3498	}
3499	*mp = m;
3500
3501	return (0);
3502}
3503
3504/*
3505 * If dodgy hardware rejects the scatter gather chain we've handed it
3506 * we'll need to remove the mbuf chain from ifsg_m[] before we can add the
3507 * m_defrag'd mbufs
3508 */
3509static __noinline struct mbuf *
3510iflib_remove_mbuf(iflib_txq_t txq)
3511{
3512	int ntxd, pidx;
3513	struct mbuf *m, **ifsd_m;
3514
	ifsd_m = txq->ift_sds.ifsd_m;
	ntxd = txq->ift_size;
	pidx = txq->ift_pidx & (ntxd - 1);
3519	m = ifsd_m[pidx];
3520	ifsd_m[pidx] = NULL;
3521	bus_dmamap_unload(txq->ift_buf_tag, txq->ift_sds.ifsd_map[pidx]);
3522	if (txq->ift_sds.ifsd_tso_map != NULL)
3523		bus_dmamap_unload(txq->ift_tso_buf_tag,
3524		    txq->ift_sds.ifsd_tso_map[pidx]);
3525#if MEMORY_LOGGING
3526	txq->ift_dequeued++;
3527#endif
3528	return (m);
3529}
3530
3531static inline caddr_t
3532calc_next_txd(iflib_txq_t txq, int cidx, uint8_t qid)
3533{
3534	qidx_t size;
3535	int ntxd;
3536	caddr_t start, end, cur, next;
3537
3538	ntxd = txq->ift_size;
3539	size = txq->ift_txd_size[qid];
3540	start = txq->ift_ifdi[qid].idi_vaddr;
3541
3542	if (__predict_false(size == 0))
3543		return (start);
3544	cur = start + size*cidx;
3545	end = start + size*ntxd;
3546	next = CACHE_PTR_NEXT(cur);
3547	return (next < end ? next : start);
3548}
3549
3550/*
3551 * Pad an mbuf to ensure a minimum ethernet frame size.
3552 * min_frame_size is the frame size (less CRC) to pad the mbuf to
3553 */
3554static __noinline int
3555iflib_ether_pad(device_t dev, struct mbuf **m_head, uint16_t min_frame_size)
3556{
3557	/*
	 * 18 is enough bytes to pad an ARP packet to 46 bytes, and
	 * an ARP message is the smallest common payload I can think of.
3560	 */
3561	static char pad[18];	/* just zeros */
3562	int n;
3563	struct mbuf *new_head;
3564
3565	if (!M_WRITABLE(*m_head)) {
3566		new_head = m_dup(*m_head, M_NOWAIT);
3567		if (new_head == NULL) {
3568			m_freem(*m_head);
			device_printf(dev, "cannot pad short frame, m_dup() failed\n");
3570			DBG_COUNTER_INC(encap_pad_mbuf_fail);
3571			DBG_COUNTER_INC(tx_frees);
			return (ENOMEM);
3573		}
3574		m_freem(*m_head);
3575		*m_head = new_head;
3576	}
3577
3578	for (n = min_frame_size - (*m_head)->m_pkthdr.len;
3579	     n > 0; n -= sizeof(pad))
3580		if (!m_append(*m_head, min(n, sizeof(pad)), pad))
3581			break;
3582
3583	if (n > 0) {
3584		m_freem(*m_head);
3585		device_printf(dev, "cannot pad short frame\n");
3586		DBG_COUNTER_INC(encap_pad_mbuf_fail);
3587		DBG_COUNTER_INC(tx_frees);
3588		return (ENOBUFS);
3589	}
3590
	return (0);
3592}
3593
3594static int
3595iflib_encap(iflib_txq_t txq, struct mbuf **m_headp)
3596{
3597	if_ctx_t		ctx;
3598	if_shared_ctx_t		sctx;
3599	if_softc_ctx_t		scctx;
3600	bus_dma_tag_t		buf_tag;
3601	bus_dma_segment_t	*segs;
3602	struct mbuf		*m_head, **ifsd_m;
3603	void			*next_txd;
3604	bus_dmamap_t		map;
3605	struct if_pkt_info	pi;
3606	int remap = 0;
3607	int err, nsegs, ndesc, max_segs, pidx, cidx, next, ntxd;
3608
3609	ctx = txq->ift_ctx;
3610	sctx = ctx->ifc_sctx;
3611	scctx = &ctx->ifc_softc_ctx;
3612	segs = txq->ift_segs;
3613	ntxd = txq->ift_size;
3614	m_head = *m_headp;
3615	map = NULL;
3616
3617	/*
3618	 * If we're doing TSO, the next descriptor to clean may be quite far ahead
3619	 */
3620	cidx = txq->ift_cidx;
3621	pidx = txq->ift_pidx;
3622	if (ctx->ifc_flags & IFC_PREFETCH) {
3623		next = (cidx + CACHE_PTR_INCREMENT) & (ntxd-1);
3624		if (!(ctx->ifc_flags & IFLIB_HAS_TXCQ)) {
3625			next_txd = calc_next_txd(txq, cidx, 0);
3626			prefetch(next_txd);
3627		}
3628
3629		/* prefetch the next cache line of mbuf pointers and flags */
3630		prefetch(&txq->ift_sds.ifsd_m[next]);
3631		prefetch(&txq->ift_sds.ifsd_map[next]);
3632		next = (cidx + CACHE_LINE_SIZE) & (ntxd-1);
3633	}
3634	map = txq->ift_sds.ifsd_map[pidx];
3635	ifsd_m = txq->ift_sds.ifsd_m;
3636
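	/*
	 * TSO packets are loaded with the dedicated TSO DMA tag and its
	 * per-slot map, which are sized for the larger TSO limits.
	 */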
3637	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3638		buf_tag = txq->ift_tso_buf_tag;
3639		max_segs = scctx->isc_tx_tso_segments_max;
3640		map = txq->ift_sds.ifsd_tso_map[pidx];
3641		MPASS(buf_tag != NULL);
3642		MPASS(max_segs > 0);
3643	} else {
3644		buf_tag = txq->ift_buf_tag;
3645		max_segs = scctx->isc_tx_nsegments;
3646		map = txq->ift_sds.ifsd_map[pidx];
3647	}
3648	if ((sctx->isc_flags & IFLIB_NEED_ETHER_PAD) &&
3649	    __predict_false(m_head->m_pkthdr.len < scctx->isc_min_frame_size)) {
3650		err = iflib_ether_pad(ctx->ifc_dev, m_headp, scctx->isc_min_frame_size);
3651		if (err) {
3652			DBG_COUNTER_INC(encap_txd_encap_fail);
3653			return (err);
3654		}
3655	}
3656	m_head = *m_headp;
3657
3658	pkt_info_zero(&pi);
3659	pi.ipi_mflags = (m_head->m_flags & (M_VLANTAG|M_BCAST|M_MCAST));
3660	pi.ipi_pidx = pidx;
3661	pi.ipi_qsidx = txq->ift_id;
3662	pi.ipi_len = m_head->m_pkthdr.len;
3663	pi.ipi_csum_flags = m_head->m_pkthdr.csum_flags;
3664	pi.ipi_vtag = M_HAS_VLANTAG(m_head) ? m_head->m_pkthdr.ether_vtag : 0;
3665
3666	/* deliberate bitwise OR to make one condition */
3667	if (__predict_true((pi.ipi_csum_flags | pi.ipi_vtag))) {
3668		if (__predict_false((err = iflib_parse_header(txq, &pi, m_headp)) != 0)) {
3669			DBG_COUNTER_INC(encap_txd_encap_fail);
3670			return (err);
3671		}
3672		m_head = *m_headp;
3673	}
3674
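	/*
	 * Map the mbuf chain for DMA.  On EFBIG (too many segments) we try
	 * m_collapse() first and then m_defrag() before giving up; 'remap'
	 * tracks how many of these recovery attempts have been made.
	 */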
3675retry:
3676	err = bus_dmamap_load_mbuf_sg(buf_tag, map, m_head, segs, &nsegs,
3677	    BUS_DMA_NOWAIT);
3678defrag:
3679	if (__predict_false(err)) {
3680		switch (err) {
3681		case EFBIG:
3682			/* try collapse once and defrag once */
3683			if (remap == 0) {
3684				m_head = m_collapse(*m_headp, M_NOWAIT, max_segs);
3685				/* try defrag if collapsing fails */
3686				if (m_head == NULL)
3687					remap++;
3688			}
3689			if (remap == 1) {
3690				txq->ift_mbuf_defrag++;
3691				m_head = m_defrag(*m_headp, M_NOWAIT);
3692			}
3693			/*
3694			 * remap should never be >1 unless bus_dmamap_load_mbuf_sg
3695			 * failed to map an mbuf that was run through m_defrag
3696			 */
3697			MPASS(remap <= 1);
3698			if (__predict_false(m_head == NULL || remap > 1))
3699				goto defrag_failed;
3700			remap++;
3701			*m_headp = m_head;
3702			goto retry;
3704		case ENOMEM:
3705			txq->ift_no_tx_dma_setup++;
3706			break;
3707		default:
3708			txq->ift_no_tx_dma_setup++;
3709			m_freem(*m_headp);
3710			DBG_COUNTER_INC(tx_frees);
3711			*m_headp = NULL;
3712			break;
3713		}
3714		txq->ift_map_failed++;
3715		DBG_COUNTER_INC(encap_load_mbuf_fail);
3716		DBG_COUNTER_INC(encap_txd_encap_fail);
3717		return (err);
3718	}
3719	ifsd_m[pidx] = m_head;
3720	/*
3721	 * XXX assumes a 1 to 1 relationship between segments and
3722	 *        descriptors - this does not hold true on all drivers, e.g.
3723	 *        cxgb
3724	 */
3725	if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) {
3726		txq->ift_no_desc_avail++;
3727		bus_dmamap_unload(buf_tag, map);
3728		DBG_COUNTER_INC(encap_txq_avail_fail);
3729		DBG_COUNTER_INC(encap_txd_encap_fail);
3730		if ((txq->ift_task.gt_task.ta_flags & TASK_ENQUEUED) == 0)
3731			GROUPTASK_ENQUEUE(&txq->ift_task);
3732		return (ENOBUFS);
3733	}
3734	/*
3735	 * On Intel cards we can greatly reduce the number of TX interrupts
3736	 * we see by only setting report status on every Nth descriptor.
3737	 * However, this also means that the driver will need to keep track
3738	 * of the descriptors that RS was set on to check them for the DD bit.
3739	 */
3740	txq->ift_rs_pending += nsegs + 1;
3741	if (txq->ift_rs_pending > TXQ_MAX_RS_DEFERRED(txq) ||
3742	     iflib_no_tx_batch || (TXQ_AVAIL(txq) - nsegs) <= MAX_TX_DESC(ctx) + 2) {
3743		pi.ipi_flags |= IPI_TX_INTR;
3744		txq->ift_rs_pending = 0;
3745	}
3746
3747	pi.ipi_segs = segs;
3748	pi.ipi_nsegs = nsegs;
3749
3750	MPASS(pidx >= 0 && pidx < txq->ift_size);
3751#ifdef PKT_DEBUG
3752	print_pkt(&pi);
3753#endif
3754	if ((err = ctx->isc_txd_encap(ctx->ifc_softc, &pi)) == 0) {
3755		bus_dmamap_sync(buf_tag, map, BUS_DMASYNC_PREWRITE);
3756		DBG_COUNTER_INC(tx_encap);
3757		MPASS(pi.ipi_new_pidx < txq->ift_size);
3758
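		/*
		 * Account for the descriptors the driver consumed.  If the
		 * new producer index wrapped past the end of the ring,
		 * adjust ndesc and record the wrap in ift_gen.
		 */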
3759		ndesc = pi.ipi_new_pidx - pi.ipi_pidx;
3760		if (pi.ipi_new_pidx < pi.ipi_pidx) {
3761			ndesc += txq->ift_size;
3762			txq->ift_gen = 1;
3763		}
3764		/*
3765		 * drivers may need as many as two sentinel descriptors
3767		 */
3768		MPASS(ndesc <= pi.ipi_nsegs + 2);
3769		MPASS(pi.ipi_new_pidx != pidx);
3770		MPASS(ndesc > 0);
3771		txq->ift_in_use += ndesc;
3772		txq->ift_db_pending += ndesc;
3773
3774		/*
3775		 * We update the last software descriptor again here because there may
3776		 * be a sentinel and/or there may be more mbufs than segments
3777		 */
3778		txq->ift_pidx = pi.ipi_new_pidx;
3779		txq->ift_npending += pi.ipi_ndescs;
3780	} else {
3781		*m_headp = m_head = iflib_remove_mbuf(txq);
3782		if (err == EFBIG) {
3783			txq->ift_txd_encap_efbig++;
3784			if (remap < 2) {
3785				remap = 1;
3786				goto defrag;
3787			}
3788		}
3789		goto defrag_failed;
3790	}
3791	/*
3792	 * err can't possibly be non-zero here, so we don't need to test it
3793	 * to see if we need to DBG_COUNTER_INC(encap_txd_encap_fail).
3794	 */
3795	return (err);
3796
3797defrag_failed:
3798	txq->ift_mbuf_defrag_failed++;
3799	txq->ift_map_failed++;
3800	m_freem(*m_headp);
3801	DBG_COUNTER_INC(tx_frees);
3802	*m_headp = NULL;
3803	DBG_COUNTER_INC(encap_txd_encap_fail);
3804	return (ENOMEM);
3805}
3806
3807static void
3808iflib_tx_desc_free(iflib_txq_t txq, int n)
3809{
3810	uint32_t qsize, cidx, mask, gen;
3811	struct mbuf *m, **ifsd_m;
3812	bool do_prefetch;
3813
3814	cidx = txq->ift_cidx;
3815	gen = txq->ift_gen;
3816	qsize = txq->ift_size;
3817	mask = qsize-1;
3818	ifsd_m = txq->ift_sds.ifsd_m;
3819	do_prefetch = (txq->ift_ctx->ifc_flags & IFC_PREFETCH);
3820
3821	while (n-- > 0) {
3822		if (do_prefetch) {
3823			prefetch(ifsd_m[(cidx + 3) & mask]);
3824			prefetch(ifsd_m[(cidx + 4) & mask]);
3825		}
3826		if ((m = ifsd_m[cidx]) != NULL) {
3827			prefetch(&ifsd_m[(cidx + CACHE_PTR_INCREMENT) & mask]);
3828			if (m->m_pkthdr.csum_flags & CSUM_TSO) {
3829				bus_dmamap_sync(txq->ift_tso_buf_tag,
3830				    txq->ift_sds.ifsd_tso_map[cidx],
3831				    BUS_DMASYNC_POSTWRITE);
3832				bus_dmamap_unload(txq->ift_tso_buf_tag,
3833				    txq->ift_sds.ifsd_tso_map[cidx]);
3834			} else {
3835				bus_dmamap_sync(txq->ift_buf_tag,
3836				    txq->ift_sds.ifsd_map[cidx],
3837				    BUS_DMASYNC_POSTWRITE);
3838				bus_dmamap_unload(txq->ift_buf_tag,
3839				    txq->ift_sds.ifsd_map[cidx]);
3840			}
3841			/* XXX we don't support any drivers that batch packets yet */
3842			MPASS(m->m_nextpkt == NULL);
3843			m_freem(m);
3844			ifsd_m[cidx] = NULL;
3845#if MEMORY_LOGGING
3846			txq->ift_dequeued++;
3847#endif
3848			DBG_COUNTER_INC(tx_frees);
3849		}
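		/* Wrap the consumer index and clear the generation flag. */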
3850		if (__predict_false(++cidx == qsize)) {
3851			cidx = 0;
3852			gen = 0;
3853		}
3854	}
3855	txq->ift_cidx = cidx;
3856	txq->ift_gen = gen;
3857}
3858
3859static __inline int
3860iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh)
3861{
3862	int reclaim;
3863	if_ctx_t ctx = txq->ift_ctx;
3864
3865	KASSERT(thresh >= 0, ("invalid threshold to reclaim"));
3866	MPASS(thresh /*+ MAX_TX_DESC(txq->ift_ctx) */ < txq->ift_size);
3867
3868	/*
3869	 * Need a rate-limiting check so that this isn't called every time
3870	 */
3871	iflib_tx_credits_update(ctx, txq);
3872	reclaim = DESC_RECLAIMABLE(txq);
3873
3874	if (reclaim <= thresh /* + MAX_TX_DESC(txq->ift_ctx) */) {
3875#ifdef INVARIANTS
3876		if (iflib_verbose_debug) {
3877			printf("%s processed=%ju cleaned=%ju tx_nsegments=%d reclaim=%d thresh=%d\n", __FUNCTION__,
3878			       txq->ift_processed, txq->ift_cleaned, txq->ift_ctx->ifc_softc_ctx.isc_tx_nsegments,
3879			       reclaim, thresh);
3880		}
3881#endif
3882		return (0);
3883	}
3884	iflib_tx_desc_free(txq, reclaim);
3885	txq->ift_cleaned += reclaim;
3886	txq->ift_in_use -= reclaim;
3887
3888	return (reclaim);
3889}
3890
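/*
 * Return a pointer to the mp_ring slot at (cidx + offset), prefetching that
 * mbuf and, when more items remain, a few of the following slots and mbufs
 * to hide memory latency in the drain path.
 */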
3891static struct mbuf **
3892_ring_peek_one(struct ifmp_ring *r, int cidx, int offset, int remaining)
3893{
3894	int next, size;
3895	struct mbuf **items;
3896
3897	size = r->size;
3898	next = (cidx + CACHE_PTR_INCREMENT) & (size-1);
3899	items = __DEVOLATILE(struct mbuf **, &r->items[0]);
3900
3901	prefetch(items[(cidx + offset) & (size-1)]);
3902	if (remaining > 1) {
3903		prefetch2cachelines(&items[next]);
3904		prefetch2cachelines(items[(cidx + offset + 1) & (size-1)]);
3905		prefetch2cachelines(items[(cidx + offset + 2) & (size-1)]);
3906		prefetch2cachelines(items[(cidx + offset + 3) & (size-1)]);
3907	}
3908	return (__DEVOLATILE(struct mbuf **, &r->items[(cidx + offset) & (size-1)]));
3909}
3910
3911static void
3912iflib_txq_check_drain(iflib_txq_t txq, int budget)
3913{
3914
3915	ifmp_ring_check_drainage(txq->ift_br, budget);
3916}
3917
3918static uint32_t
3919iflib_txq_can_drain(struct ifmp_ring *r)
3920{
3921	iflib_txq_t txq = r->cookie;
3922	if_ctx_t ctx = txq->ift_ctx;
3923
3924	if (TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2)
3925		return (1);
3926	bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
3927	    BUS_DMASYNC_POSTREAD);
3928	return (ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id,
3929	    false));
3930}
3931
3932static uint32_t
3933iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
3934{
3935	iflib_txq_t txq = r->cookie;
3936	if_ctx_t ctx = txq->ift_ctx;
3937	if_t ifp = ctx->ifc_ifp;
3938	struct mbuf *m, **mp;
3939	int avail, bytes_sent, skipped, count, err, i;
3940	int mcast_sent, pkt_sent, reclaimed;
3941	bool do_prefetch, rang, ring;
3942
3943	if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING) ||
3944			    !LINK_ACTIVE(ctx))) {
3945		DBG_COUNTER_INC(txq_drain_notready);
3946		return (0);
3947	}
3948	reclaimed = iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
3949	rang = iflib_txd_db_check(txq, reclaimed && txq->ift_db_pending);
3950	avail = IDXDIFF(pidx, cidx, r->size);
3951
3952	if (__predict_false(ctx->ifc_flags & IFC_QFLUSH)) {
3953		/*
3954		 * The driver is unloading so we need to free all pending packets.
3955		 */
3956		DBG_COUNTER_INC(txq_drain_flushing);
3957		for (i = 0; i < avail; i++) {
3958			if (__predict_true(r->items[(cidx + i) & (r->size-1)] != (void *)txq))
3959				m_freem(r->items[(cidx + i) & (r->size-1)]);
3960			r->items[(cidx + i) & (r->size-1)] = NULL;
3961		}
3962		return (avail);
3963	}
3964
3965	if (__predict_false(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE)) {
3966		txq->ift_qstatus = IFLIB_QUEUE_IDLE;
3967		CALLOUT_LOCK(txq);
3968		callout_stop(&txq->ift_timer);
3969		CALLOUT_UNLOCK(txq);
3970		DBG_COUNTER_INC(txq_drain_oactive);
3971		return (0);
3972	}
3973
3974	/*
3975	 * If we've reclaimed any packets, this queue cannot be hung.
3976	 */
3977	if (reclaimed)
3978		txq->ift_qstatus = IFLIB_QUEUE_IDLE;
3979	skipped = mcast_sent = bytes_sent = pkt_sent = 0;
3980	count = MIN(avail, TX_BATCH_SIZE);
3981#ifdef INVARIANTS
3982	if (iflib_verbose_debug)
3983		printf("%s avail=%d ifc_flags=%x txq_avail=%d ", __FUNCTION__,
3984		       avail, ctx->ifc_flags, TXQ_AVAIL(txq));
3985#endif
3986	do_prefetch = (ctx->ifc_flags & IFC_PREFETCH);
3987	err = 0;
3988	for (i = 0; i < count && TXQ_AVAIL(txq) >= MAX_TX_DESC(ctx) + 2; i++) {
3989		int rem = do_prefetch ? count - i : 0;
3990
3991		mp = _ring_peek_one(r, cidx, i, rem);
3992		MPASS(mp != NULL && *mp != NULL);
3993
3994		/*
3995		 * Completion interrupts will use the address of the txq
3996		 * as a sentinel to enqueue _something_ in order to acquire
3997		 * the lock on the mp_ring (there's no direct lock call).
3998		 * We obviously have to check for these sentinel cases
3999		 * and skip them.
4000		 */
4001		if (__predict_false(*mp == (struct mbuf *)txq)) {
4002			skipped++;
4003			continue;
4004		}
4005		err = iflib_encap(txq, mp);
4006		if (__predict_false(err)) {
4007			/* no room - bail out */
4008			if (err == ENOBUFS)
4009				break;
4010			skipped++;
4011			/* we can't send this packet - skip it */
4012			continue;
4013		}
4014		pkt_sent++;
4015		m = *mp;
4016		DBG_COUNTER_INC(tx_sent);
4017		bytes_sent += m->m_pkthdr.len;
4018		mcast_sent += !!(m->m_flags & M_MCAST);
4019
4020		if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING)))
4021			break;
4022		ETHER_BPF_MTAP(ifp, m);
4023		rang = iflib_txd_db_check(txq, false);
4024	}
4025
4026	/* deliberate use of bitwise OR to avoid a gratuitous short-circuit */
4027	ring = rang ? false : (iflib_min_tx_latency | err);
4028	iflib_txd_db_check(txq, ring);
4029	if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent);
4030	if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent);
4031	if (mcast_sent)
4032		if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast_sent);
4033#ifdef INVARIANTS
4034	if (iflib_verbose_debug)
4035		printf("consumed=%d\n", skipped + pkt_sent);
4036#endif
4037	return (skipped + pkt_sent);
4038}
4039
4040static uint32_t
4041iflib_txq_drain_always(struct ifmp_ring *r)
4042{
4043	return (1);
4044}
4045
4046static uint32_t
4047iflib_txq_drain_free(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
4048{
4049	int i, avail;
4050	struct mbuf **mp;
4051	iflib_txq_t txq;
4052
4053	txq = r->cookie;
4054
4055	txq->ift_qstatus = IFLIB_QUEUE_IDLE;
4056	CALLOUT_LOCK(txq);
4057	callout_stop(&txq->ift_timer);
4058	CALLOUT_UNLOCK(txq);
4059
4060	avail = IDXDIFF(pidx, cidx, r->size);
4061	for (i = 0; i < avail; i++) {
4062		mp = _ring_peek_one(r, cidx, i, avail - i);
4063		if (__predict_false(*mp == (struct mbuf *)txq))
4064			continue;
4065		m_freem(*mp);
4066		DBG_COUNTER_INC(tx_frees);
4067	}
4068	MPASS(ifmp_ring_is_stalled(r) == 0);
4069	return (avail);
4070}
4071
4072static void
4073iflib_ifmp_purge(iflib_txq_t txq)
4074{
4075	struct ifmp_ring *r;
4076
4077	r = txq->ift_br;
4078	r->drain = iflib_txq_drain_free;
4079	r->can_drain = iflib_txq_drain_always;
4080
4081	ifmp_ring_check_drainage(r, r->size);
4082
4083	r->drain = iflib_txq_drain;
4084	r->can_drain = iflib_txq_can_drain;
4085}
4086
4087static void
4088_task_fn_tx(void *context)
4089{
4090	iflib_txq_t txq = context;
4091	if_ctx_t ctx = txq->ift_ctx;
4092	if_t ifp = ctx->ifc_ifp;
4093	int abdicate = ctx->ifc_sysctl_tx_abdicate;
4094
4095#ifdef IFLIB_DIAGNOSTICS
4096	txq->ift_cpu_exec_count[curcpu]++;
4097#endif
4098	if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING))
4099		return;
4100#ifdef DEV_NETMAP
4101	if ((if_getcapenable(ifp) & IFCAP_NETMAP) &&
4102	    netmap_tx_irq(ifp, txq->ift_id))
4103		goto skip_ifmp;
4104#endif
4105#ifdef ALTQ
4106	if (if_altq_is_enabled(ifp))
4107		iflib_altq_if_start(ifp);
4108#endif
4109	if (txq->ift_db_pending)
4110		ifmp_ring_enqueue(txq->ift_br, (void **)&txq, 1, TX_BATCH_SIZE, abdicate);
4111	else if (!abdicate)
4112		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
4113	/*
4114	 * When abdicating, we always need to check drainage, not just when we don't enqueue
4115	 */
4116	if (abdicate)
4117		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
4118#ifdef DEV_NETMAP
4119skip_ifmp:
4120#endif
4121	if (ctx->ifc_flags & IFC_LEGACY)
4122		IFDI_INTR_ENABLE(ctx);
4123	else
4124		IFDI_TX_QUEUE_INTR_ENABLE(ctx, txq->ift_id);
4125}
4126
4127static void
4128_task_fn_rx(void *context)
4129{
4130	iflib_rxq_t rxq = context;
4131	if_ctx_t ctx = rxq->ifr_ctx;
4132	uint8_t more;
4133	uint16_t budget;
4134#ifdef DEV_NETMAP
4135	u_int work = 0;
4136	int nmirq;
4137#endif
4138
4139#ifdef IFLIB_DIAGNOSTICS
4140	rxq->ifr_cpu_exec_count[curcpu]++;
4141#endif
4142	DBG_COUNTER_INC(task_fn_rxs);
4143	if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
4144		return;
4145#ifdef DEV_NETMAP
4146	nmirq = netmap_rx_irq(ctx->ifc_ifp, rxq->ifr_id, &work);
4147	if (nmirq != NM_IRQ_PASS) {
4148		more = (nmirq == NM_IRQ_RESCHED) ? IFLIB_RXEOF_MORE : 0;
4149		goto skip_rxeof;
4150	}
4151#endif
4152	budget = ctx->ifc_sysctl_rx_budget;
4153	if (budget == 0)
4154		budget = 16;	/* XXX */
4155	more = iflib_rxeof(rxq, budget);
4156#ifdef DEV_NETMAP
4157skip_rxeof:
4158#endif
4159	if ((more & IFLIB_RXEOF_MORE) == 0) {
4160		if (ctx->ifc_flags & IFC_LEGACY)
4161			IFDI_INTR_ENABLE(ctx);
4162		else
4163			IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
4164		DBG_COUNTER_INC(rx_intr_enables);
4165	}
4166	if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
4167		return;
4168
4169	if (more & IFLIB_RXEOF_MORE)
4170		GROUPTASK_ENQUEUE(&rxq->ifr_task);
4171	else if (more & IFLIB_RXEOF_EMPTY)
4172		callout_reset_curcpu(&rxq->ifr_watchdog, 1, &_task_fn_rx_watchdog, rxq);
4173}
4174
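/*
 * Admin taskqueue handler: performs deferred administrative work such as
 * watchdog resets, link/admin status updates, and driver-requested
 * reinitialization, then restarts the per-queue timers.
 */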
4175static void
4176_task_fn_admin(void *context)
4177{
4178	if_ctx_t ctx = context;
4179	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
4180	iflib_txq_t txq;
4181	int i;
4182	bool oactive, running, do_reset, do_watchdog, in_detach;
4183
4184	STATE_LOCK(ctx);
4185	running = (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING);
4186	oactive = (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE);
4187	do_reset = (ctx->ifc_flags & IFC_DO_RESET);
4188	do_watchdog = (ctx->ifc_flags & IFC_DO_WATCHDOG);
4189	in_detach = (ctx->ifc_flags & IFC_IN_DETACH);
4190	ctx->ifc_flags &= ~(IFC_DO_RESET|IFC_DO_WATCHDOG);
4191	STATE_UNLOCK(ctx);
4192
4193	if ((!running && !oactive) && !(ctx->ifc_sctx->isc_flags & IFLIB_ADMIN_ALWAYS_RUN))
4194		return;
4195	if (in_detach)
4196		return;
4197
4198	CTX_LOCK(ctx);
4199	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) {
4200		CALLOUT_LOCK(txq);
4201		callout_stop(&txq->ift_timer);
4202		CALLOUT_UNLOCK(txq);
4203	}
4204	if (ctx->ifc_sctx->isc_flags & IFLIB_HAS_ADMINCQ)
4205		IFDI_ADMIN_COMPLETION_HANDLE(ctx);
4206	if (do_watchdog) {
4207		ctx->ifc_watchdog_events++;
4208		IFDI_WATCHDOG_RESET(ctx);
4209	}
4210	IFDI_UPDATE_ADMIN_STATUS(ctx);
4211	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) {
4212		callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer, txq,
4213		    txq->ift_timer.c_cpu);
4214	}
4215	IFDI_LINK_INTR_ENABLE(ctx);
4216	if (do_reset)
4217		iflib_if_init_locked(ctx);
4218	CTX_UNLOCK(ctx);
4219
4220	if (LINK_ACTIVE(ctx) == 0)
4221		return;
4222	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++)
4223		iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
4224}
4225
4226static void
4227_task_fn_iov(void *context)
4228{
4229	if_ctx_t ctx = context;
4230
4231	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING) &&
4232	    !(ctx->ifc_sctx->isc_flags & IFLIB_ADMIN_ALWAYS_RUN))
4233		return;
4234
4235	CTX_LOCK(ctx);
4236	IFDI_VFLR_HANDLE(ctx);
4237	CTX_UNLOCK(ctx);
4238}
4239
4240static int
4241iflib_sysctl_int_delay(SYSCTL_HANDLER_ARGS)
4242{
4243	int err;
4244	if_int_delay_info_t info;
4245	if_ctx_t ctx;
4246
4247	info = (if_int_delay_info_t)arg1;
4248	ctx = info->iidi_ctx;
4249	info->iidi_req = req;
4250	info->iidi_oidp = oidp;
4251	CTX_LOCK(ctx);
4252	err = IFDI_SYSCTL_INT_DELAY(ctx, info);
4253	CTX_UNLOCK(ctx);
4254	return (err);
4255}
4256
4257/*********************************************************************
4258 *
4259 *  IFNET FUNCTIONS
4260 *
4261 **********************************************************************/
4262
4263static void
4264iflib_if_init_locked(if_ctx_t ctx)
4265{
4266	iflib_stop(ctx);
4267	iflib_init_locked(ctx);
4268}
4269
4270static void
4271iflib_if_init(void *arg)
4272{
4273	if_ctx_t ctx = arg;
4274
4275	CTX_LOCK(ctx);
4276	iflib_if_init_locked(ctx);
4277	CTX_UNLOCK(ctx);
4278}
4279
4280static int
4281iflib_if_transmit(if_t ifp, struct mbuf *m)
4282{
4283	if_ctx_t ctx = if_getsoftc(ifp);
4284	iflib_txq_t txq;
4285	int err, qidx;
4286	int abdicate;
4287
4288	if (__predict_false((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 || !LINK_ACTIVE(ctx))) {
4289		DBG_COUNTER_INC(tx_frees);
4290		m_freem(m);
4291		return (ENETDOWN);
4292	}
4293
4294	MPASS(m->m_nextpkt == NULL);
4295	/* ALTQ-enabled interfaces always use queue 0. */
4296	qidx = 0;
4297	/* Use driver-supplied queue selection method if it exists */
4298	if (ctx->isc_txq_select_v2) {
4299		struct if_pkt_info pi;
4300		uint64_t early_pullups = 0;
4301		pkt_info_zero(&pi);
4302
4303		err = iflib_parse_header_partial(&pi, &m, &early_pullups);
4304		if (__predict_false(err != 0)) {
4305			/* Assign pullups for bad pkts to default queue */
4306			ctx->ifc_txqs[0].ift_pullups += early_pullups;
4307			DBG_COUNTER_INC(encap_txd_encap_fail);
4308			return (err);
4309		}
4310		/* Let driver make queueing decision */
4311		qidx = ctx->isc_txq_select_v2(ctx->ifc_softc, m, &pi);
4312		ctx->ifc_txqs[qidx].ift_pullups += early_pullups;
4313	}
4314	/* Backwards compatibility w/ simpler queue select */
4315	else if (ctx->isc_txq_select)
4316		qidx = ctx->isc_txq_select(ctx->ifc_softc, m);
4317	/* If not, use iflib's standard method */
4318	else if ((NTXQSETS(ctx) > 1) && M_HASHTYPE_GET(m) && !if_altq_is_enabled(ifp))
4319		qidx = QIDX(ctx, m);
4320
4321	/* Set TX queue */
4322	txq = &ctx->ifc_txqs[qidx];
4323
4324#ifdef DRIVER_BACKPRESSURE
4325	if (txq->ift_closed) {
4326		while (m != NULL) {
4327			next = m->m_nextpkt;
4328			m->m_nextpkt = NULL;
4329			m_freem(m);
4330			DBG_COUNTER_INC(tx_frees);
4331			m = next;
4332		}
4333		return (ENOBUFS);
4334	}
4335#endif
4336#ifdef notyet
4337	qidx = count = 0;
4338	mp = marr;
4339	next = m;
4340	do {
4341		count++;
4342		next = next->m_nextpkt;
4343	} while (next != NULL);
4344
4345	if (count > nitems(marr))
4346		if ((mp = malloc(count*sizeof(struct mbuf *), M_IFLIB, M_NOWAIT)) == NULL) {
4347			/* XXX check nextpkt */
4348			m_freem(m);
4349			/* XXX simplify for now */
4350			DBG_COUNTER_INC(tx_frees);
4351			return (ENOBUFS);
4352		}
4353	for (next = m, i = 0; next != NULL; i++) {
4354		mp[i] = next;
4355		next = next->m_nextpkt;
4356		mp[i]->m_nextpkt = NULL;
4357	}
4358#endif
4359	DBG_COUNTER_INC(tx_seen);
4360	abdicate = ctx->ifc_sysctl_tx_abdicate;
4361
4362	err = ifmp_ring_enqueue(txq->ift_br, (void **)&m, 1, TX_BATCH_SIZE, abdicate);
4363
4364	if (abdicate)
4365		GROUPTASK_ENQUEUE(&txq->ift_task);
4366	if (err) {
4367		if (!abdicate)
4368			GROUPTASK_ENQUEUE(&txq->ift_task);
4369		/* driver backpressure support is forthcoming */
4370#ifdef DRIVER_BACKPRESSURE
4371		txq->ift_closed = TRUE;
4372#endif
4373		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
4374		m_freem(m);
4375		DBG_COUNTER_INC(tx_frees);
4376	}
4377
4378	return (err);
4379}
4380
4381#ifdef ALTQ
4382/*
4383 * The overall approach to integrating iflib with ALTQ is to continue to use
4384 * the iflib mp_ring machinery between the ALTQ queue(s) and the hardware
4385 * ring.  Technically, when using ALTQ, queueing to an intermediate mp_ring
4386 * is redundant/unnecessary, but doing so minimizes the amount of
4387 * ALTQ-specific code required in iflib.  It is assumed that the overhead of
4388 * redundantly queueing to an intermediate mp_ring is swamped by the
4389 * performance limitations inherent in using ALTQ.
4390 *
4391 * When ALTQ support is compiled in, all iflib drivers will use a transmit
4392 * routine, iflib_altq_if_transmit(), that checks if ALTQ is enabled for the
4393 * given interface.  If ALTQ is enabled for an interface, then all
4394 * transmitted packets for that interface will be submitted to the ALTQ
4395 * subsystem via IFQ_ENQUEUE().  We don't use the legacy if_transmit()
4396 * implementation because it uses IFQ_HANDOFF(), which will duplicatively
4397 * update stats that the iflib machinery handles, and which is sensitive to
4398 * the disused IFF_DRV_OACTIVE flag.  Additionally, iflib_altq_if_start()
4399 * will be installed as the start routine for use by ALTQ facilities that
4400 * need to trigger queue drains on a scheduled basis.
4401 *
4402 */
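/*
 * Drain the interface's ALTQ send queue, handing each dequeued packet to
 * iflib_if_transmit() while the IFQ lock is held.
 */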
4403static void
4404iflib_altq_if_start(if_t ifp)
4405{
4406	struct ifaltq *ifq = &ifp->if_snd; /* XXX - DRVAPI */
4407	struct mbuf *m;
4408
4409	IFQ_LOCK(ifq);
4410	IFQ_DEQUEUE_NOLOCK(ifq, m);
4411	while (m != NULL) {
4412		iflib_if_transmit(ifp, m);
4413		IFQ_DEQUEUE_NOLOCK(ifq, m);
4414	}
4415	IFQ_UNLOCK(ifq);
4416}
4417
4418static int
4419iflib_altq_if_transmit(if_t ifp, struct mbuf *m)
4420{
4421	int err;
4422
4423	if (if_altq_is_enabled(ifp)) {
4424		IFQ_ENQUEUE(&ifp->if_snd, m, err); /* XXX - DRVAPI */
4425		if (err == 0)
4426			iflib_altq_if_start(ifp);
4427	} else
4428		err = iflib_if_transmit(ifp, m);
4429
4430	return (err);
4431}
4432#endif /* ALTQ */
4433
4434static void
4435iflib_if_qflush(if_t ifp)
4436{
4437	if_ctx_t ctx = if_getsoftc(ifp);
4438	iflib_txq_t txq = ctx->ifc_txqs;
4439	int i;
4440
4441	STATE_LOCK(ctx);
4442	ctx->ifc_flags |= IFC_QFLUSH;
4443	STATE_UNLOCK(ctx);
4444	for (i = 0; i < NTXQSETS(ctx); i++, txq++)
4445		while (!(ifmp_ring_is_idle(txq->ift_br) || ifmp_ring_is_stalled(txq->ift_br)))
4446			iflib_txq_check_drain(txq, 0);
4447	STATE_LOCK(ctx);
4448	ctx->ifc_flags &= ~IFC_QFLUSH;
4449	STATE_UNLOCK(ctx);
4450
4451	/*
4452	 * When ALTQ is enabled, this will also take care of purging the
4453	 * ALTQ queue(s).
4454	 */
4455	if_qflush(ifp);
4456}
4457
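/* Interface capabilities handled by the SIOCSIFCAP case below. */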
4458#define IFCAP_FLAGS (IFCAP_HWCSUM_IPV6 | IFCAP_HWCSUM | IFCAP_LRO | \
4459		     IFCAP_TSO | IFCAP_VLAN_HWTAGGING | IFCAP_HWSTATS | \
4460		     IFCAP_VLAN_MTU | IFCAP_VLAN_HWFILTER | \
4461		     IFCAP_VLAN_HWTSO | IFCAP_VLAN_HWCSUM | IFCAP_MEXTPG)
4462
4463static int
4464iflib_if_ioctl(if_t ifp, u_long command, caddr_t data)
4465{
4466	if_ctx_t ctx = if_getsoftc(ifp);
4467	struct ifreq	*ifr = (struct ifreq *)data;
4468#if defined(INET) || defined(INET6)
4469	struct ifaddr	*ifa = (struct ifaddr *)data;
4470#endif
4471	bool		avoid_reset = false;
4472	int		err = 0, reinit = 0, bits;
4473
4474	switch (command) {
4475	case SIOCSIFADDR:
4476#ifdef INET
4477		if (ifa->ifa_addr->sa_family == AF_INET)
4478			avoid_reset = true;
4479#endif
4480#ifdef INET6
4481		if (ifa->ifa_addr->sa_family == AF_INET6)
4482			avoid_reset = true;
4483#endif
4484		/*
4485		** Calling init results in link renegotiation,
4486		** so we avoid doing it when possible.
4487		*/
4488		if (avoid_reset) {
4489			if_setflagbits(ifp, IFF_UP, 0);
4490			if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING))
4491				reinit = 1;
4492#ifdef INET
4493			if (!(if_getflags(ifp) & IFF_NOARP))
4494				arp_ifinit(ifp, ifa);
4495#endif
4496		} else
4497			err = ether_ioctl(ifp, command, data);
4498		break;
4499	case SIOCSIFMTU:
4500		CTX_LOCK(ctx);
4501		if (ifr->ifr_mtu == if_getmtu(ifp)) {
4502			CTX_UNLOCK(ctx);
4503			break;
4504		}
4505		bits = if_getdrvflags(ifp);
4506		/* stop the driver and free any clusters before proceeding */
4507		iflib_stop(ctx);
4508
4509		if ((err = IFDI_MTU_SET(ctx, ifr->ifr_mtu)) == 0) {
4510			STATE_LOCK(ctx);
4511			if (ifr->ifr_mtu > ctx->ifc_max_fl_buf_size)
4512				ctx->ifc_flags |= IFC_MULTISEG;
4513			else
4514				ctx->ifc_flags &= ~IFC_MULTISEG;
4515			STATE_UNLOCK(ctx);
4516			err = if_setmtu(ifp, ifr->ifr_mtu);
4517		}
4518		iflib_init_locked(ctx);
4519		STATE_LOCK(ctx);
4520		if_setdrvflags(ifp, bits);
4521		STATE_UNLOCK(ctx);
4522		CTX_UNLOCK(ctx);
4523		break;
4524	case SIOCSIFFLAGS:
4525		CTX_LOCK(ctx);
4526		if (if_getflags(ifp) & IFF_UP) {
4527			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
4528				if ((if_getflags(ifp) ^ ctx->ifc_if_flags) &
4529				    (IFF_PROMISC | IFF_ALLMULTI)) {
4530					CTX_UNLOCK(ctx);
4531					err = IFDI_PROMISC_SET(ctx, if_getflags(ifp));
4532					CTX_LOCK(ctx);
4533				}
4534			} else
4535				reinit = 1;
4536		} else if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
4537			iflib_stop(ctx);
4538		}
4539		ctx->ifc_if_flags = if_getflags(ifp);
4540		CTX_UNLOCK(ctx);
4541		break;
4542	case SIOCADDMULTI:
4543	case SIOCDELMULTI:
4544		if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
4545			CTX_LOCK(ctx);
4546			IFDI_INTR_DISABLE(ctx);
4547			IFDI_MULTI_SET(ctx);
4548			IFDI_INTR_ENABLE(ctx);
4549			CTX_UNLOCK(ctx);
4550		}
4551		break;
4552	case SIOCSIFMEDIA:
4553		CTX_LOCK(ctx);
4554		IFDI_MEDIA_SET(ctx);
4555		CTX_UNLOCK(ctx);
4556		/* FALLTHROUGH */
4557	case SIOCGIFMEDIA:
4558	case SIOCGIFXMEDIA:
4559		err = ifmedia_ioctl(ifp, ifr, ctx->ifc_mediap, command);
4560		break;
4561	case SIOCGI2C:
4562	{
4563		struct ifi2creq i2c;
4564
4565		err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
4566		if (err != 0)
4567			break;
4568		if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) {
4569			err = EINVAL;
4570			break;
4571		}
4572		if (i2c.len > sizeof(i2c.data)) {
4573			err = EINVAL;
4574			break;
4575		}
4576
4577		if ((err = IFDI_I2C_REQ(ctx, &i2c)) == 0)
4578			err = copyout(&i2c, ifr_data_get_ptr(ifr),
4579			    sizeof(i2c));
4580		break;
4581	}
4582	case SIOCSIFCAP:
4583	{
4584		int mask, setmask, oldmask;
4585
4586		oldmask = if_getcapenable(ifp);
4587		mask = ifr->ifr_reqcap ^ oldmask;
4588		mask &= ctx->ifc_softc_ctx.isc_capabilities | IFCAP_MEXTPG;
4589		setmask = 0;
4590#ifdef TCP_OFFLOAD
4591		setmask |= mask & (IFCAP_TOE4|IFCAP_TOE6);
4592#endif
4593		setmask |= (mask & IFCAP_FLAGS);
4594		setmask |= (mask & IFCAP_WOL);
4595
4596		/*
4597		 * If any RX csum has changed, change all the ones that
4598		 * are supported by the driver.
4599		 */
4600		if (setmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) {
4601			setmask |= ctx->ifc_softc_ctx.isc_capabilities &
4602			    (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6);
4603		}
4604
4605		/*
4606		 * We want to ensure that traffic has stopped before we change any of the flags
4607		 */
4608		if (setmask) {
4609			CTX_LOCK(ctx);
4610			bits = if_getdrvflags(ifp);
4611			if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL)
4612				iflib_stop(ctx);
4613			STATE_LOCK(ctx);
4614			if_togglecapenable(ifp, setmask);
4615			ctx->ifc_softc_ctx.isc_capenable ^= setmask;
4616			STATE_UNLOCK(ctx);
4617			if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL)
4618				iflib_init_locked(ctx);
4619			STATE_LOCK(ctx);
4620			if_setdrvflags(ifp, bits);
4621			STATE_UNLOCK(ctx);
4622			CTX_UNLOCK(ctx);
4623		}
4624		if_vlancap(ifp);
4625		break;
4626	}
4627	case SIOCGPRIVATE_0:
4628	case SIOCSDRVSPEC:
4629	case SIOCGDRVSPEC:
4630		CTX_LOCK(ctx);
4631		err = IFDI_PRIV_IOCTL(ctx, command, data);
4632		CTX_UNLOCK(ctx);
4633		break;
4634	default:
4635		err = ether_ioctl(ifp, command, data);
4636		break;
4637	}
4638	if (reinit)
4639		iflib_if_init(ctx);
4640	return (err);
4641}
4642
4643static uint64_t
4644iflib_if_get_counter(if_t ifp, ift_counter cnt)
4645{
4646	if_ctx_t ctx = if_getsoftc(ifp);
4647
4648	return (IFDI_GET_COUNTER(ctx, cnt));
4649}
4650
4651/*********************************************************************
4652 *
4653 *  OTHER FUNCTIONS EXPORTED TO THE STACK
4654 *
4655 **********************************************************************/
4656
4657static void
4658iflib_vlan_register(void *arg, if_t ifp, uint16_t vtag)
4659{
4660	if_ctx_t ctx = if_getsoftc(ifp);
4661
4662	if ((void *)ctx != arg)
4663		return;
4664
4665	if ((vtag == 0) || (vtag > 4095))
4666		return;
4667
4668	if (iflib_in_detach(ctx))
4669		return;
4670
4671	CTX_LOCK(ctx);
4672	/* Driver may need all untagged packets to be flushed */
4673	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG))
4674		iflib_stop(ctx);
4675	IFDI_VLAN_REGISTER(ctx, vtag);
4676	/* Re-init to load the changes, if required */
4677	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG))
4678		iflib_init_locked(ctx);
4679	CTX_UNLOCK(ctx);
4680}
4681
4682static void
4683iflib_vlan_unregister(void *arg, if_t ifp, uint16_t vtag)
4684{
4685	if_ctx_t ctx = if_getsoftc(ifp);
4686
4687	if ((void *)ctx != arg)
4688		return;
4689
4690	if ((vtag == 0) || (vtag > 4095))
4691		return;
4692
4693	CTX_LOCK(ctx);
4694	/* Driver may need all tagged packets to be flushed */
4695	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG))
4696		iflib_stop(ctx);
4697	IFDI_VLAN_UNREGISTER(ctx, vtag);
4698	/* Re-init to load the changes, if required */
4699	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG))
4700		iflib_init_locked(ctx);
4701	CTX_UNLOCK(ctx);
4702}
4703
4704static void
4705iflib_led_func(void *arg, int onoff)
4706{
4707	if_ctx_t ctx = arg;
4708
4709	CTX_LOCK(ctx);
4710	IFDI_LED_FUNC(ctx, onoff);
4711	CTX_UNLOCK(ctx);
4712}
4713
4714/*********************************************************************
4715 *
4716 *  BUS FUNCTION DEFINITIONS
4717 *
4718 **********************************************************************/
4719
4720int
4721iflib_device_probe(device_t dev)
4722{
4723	const pci_vendor_info_t *ent;
4724	if_shared_ctx_t sctx;
4725	uint16_t pci_device_id, pci_rev_id, pci_subdevice_id, pci_subvendor_id;
4726	uint16_t pci_vendor_id;
4727
4728	if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC)
4729		return (ENOTSUP);
4730
4731	pci_vendor_id = pci_get_vendor(dev);
4732	pci_device_id = pci_get_device(dev);
4733	pci_subvendor_id = pci_get_subvendor(dev);
4734	pci_subdevice_id = pci_get_subdevice(dev);
4735	pci_rev_id = pci_get_revid(dev);
4736	if (sctx->isc_parse_devinfo != NULL)
4737		sctx->isc_parse_devinfo(&pci_device_id, &pci_subvendor_id, &pci_subdevice_id, &pci_rev_id);
4738
4739	ent = sctx->isc_vendor_info;
4740	while (ent->pvi_vendor_id != 0) {
4741		if (pci_vendor_id != ent->pvi_vendor_id) {
4742			ent++;
4743			continue;
4744		}
4745		if ((pci_device_id == ent->pvi_device_id) &&
4746		    ((pci_subvendor_id == ent->pvi_subvendor_id) ||
4747		     (ent->pvi_subvendor_id == 0)) &&
4748		    ((pci_subdevice_id == ent->pvi_subdevice_id) ||
4749		     (ent->pvi_subdevice_id == 0)) &&
4750		    ((pci_rev_id == ent->pvi_rev_id) ||
4751		     (ent->pvi_rev_id == 0))) {
4752			device_set_desc_copy(dev, ent->pvi_name);
4753			/* This needs to be changed to zero if the bus probing code
4754			 * ever stops re-probing on best match, because the sctx
4755			 * may have its values overwritten by register calls
4756			 * in subsequent probes.
4757			 */
4758			return (BUS_PROBE_DEFAULT);
4759		}
4760		ent++;
4761	}
4762	return (ENXIO);
4763}
4764
4765int
4766iflib_device_probe_vendor(device_t dev)
4767{
4768	int probe;
4769
4770	probe = iflib_device_probe(dev);
4771	if (probe == BUS_PROBE_DEFAULT)
4772		return (BUS_PROBE_VENDOR);
4773	else
4774		return (probe);
4775}
4776
4777static void
4778iflib_reset_qvalues(if_ctx_t ctx)
4779{
4780	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
4781	if_shared_ctx_t sctx = ctx->ifc_sctx;
4782	device_t dev = ctx->ifc_dev;
4783	int i;
4784
4785	if (ctx->ifc_sysctl_ntxqs != 0)
4786		scctx->isc_ntxqsets = ctx->ifc_sysctl_ntxqs;
4787	if (ctx->ifc_sysctl_nrxqs != 0)
4788		scctx->isc_nrxqsets = ctx->ifc_sysctl_nrxqs;
4789
4790	for (i = 0; i < sctx->isc_ntxqs; i++) {
4791		if (ctx->ifc_sysctl_ntxds[i] != 0)
4792			scctx->isc_ntxd[i] = ctx->ifc_sysctl_ntxds[i];
4793		else
4794			scctx->isc_ntxd[i] = sctx->isc_ntxd_default[i];
4795	}
4796
4797	for (i = 0; i < sctx->isc_nrxqs; i++) {
4798		if (ctx->ifc_sysctl_nrxds[i] != 0)
4799			scctx->isc_nrxd[i] = ctx->ifc_sysctl_nrxds[i];
4800		else
4801			scctx->isc_nrxd[i] = sctx->isc_nrxd_default[i];
4802	}
4803
4804	for (i = 0; i < sctx->isc_nrxqs; i++) {
4805		if (scctx->isc_nrxd[i] < sctx->isc_nrxd_min[i]) {
4806			device_printf(dev, "nrxd%d: %d less than nrxd_min %d - resetting to min\n",
4807				      i, scctx->isc_nrxd[i], sctx->isc_nrxd_min[i]);
4808			scctx->isc_nrxd[i] = sctx->isc_nrxd_min[i];
4809		}
4810		if (scctx->isc_nrxd[i] > sctx->isc_nrxd_max[i]) {
4811			device_printf(dev, "nrxd%d: %d greater than nrxd_max %d - resetting to max\n",
4812				      i, scctx->isc_nrxd[i], sctx->isc_nrxd_max[i]);
4813			scctx->isc_nrxd[i] = sctx->isc_nrxd_max[i];
4814		}
4815		if (!powerof2(scctx->isc_nrxd[i])) {
4816			device_printf(dev, "nrxd%d: %d is not a power of 2 - using default value of %d\n",
4817				      i, scctx->isc_nrxd[i], sctx->isc_nrxd_default[i]);
4818			scctx->isc_nrxd[i] = sctx->isc_nrxd_default[i];
4819		}
4820	}
4821
4822	for (i = 0; i < sctx->isc_ntxqs; i++) {
4823		if (scctx->isc_ntxd[i] < sctx->isc_ntxd_min[i]) {
4824			device_printf(dev, "ntxd%d: %d less than ntxd_min %d - resetting to min\n",
4825				      i, scctx->isc_ntxd[i], sctx->isc_ntxd_min[i]);
4826			scctx->isc_ntxd[i] = sctx->isc_ntxd_min[i];
4827		}
4828		if (scctx->isc_ntxd[i] > sctx->isc_ntxd_max[i]) {
4829			device_printf(dev, "ntxd%d: %d greater than ntxd_max %d - resetting to max\n",
4830				      i, scctx->isc_ntxd[i], sctx->isc_ntxd_max[i]);
4831			scctx->isc_ntxd[i] = sctx->isc_ntxd_max[i];
4832		}
4833		if (!powerof2(scctx->isc_ntxd[i])) {
4834			device_printf(dev, "ntxd%d: %d is not a power of 2 - using default value of %d\n",
4835				      i, scctx->isc_ntxd[i], sctx->isc_ntxd_default[i]);
4836			scctx->isc_ntxd[i] = sctx->isc_ntxd_default[i];
4837		}
4838	}
4839}
4840
4841static void
4842iflib_add_pfil(if_ctx_t ctx)
4843{
4844	struct pfil_head *pfil;
4845	struct pfil_head_args pa;
4846	iflib_rxq_t rxq;
4847	int i;
4848
4849	pa.pa_version = PFIL_VERSION;
4850	pa.pa_flags = PFIL_IN;
4851	pa.pa_type = PFIL_TYPE_ETHERNET;
4852	pa.pa_headname = if_name(ctx->ifc_ifp);
4853	pfil = pfil_head_register(&pa);
4854
4855	for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) {
4856		rxq->pfil = pfil;
4857	}
4858}
4859
4860static void
4861iflib_rem_pfil(if_ctx_t ctx)
4862{
4863	struct pfil_head *pfil;
4864	iflib_rxq_t rxq;
4865	int i;
4866
4867	rxq = ctx->ifc_rxqs;
4868	pfil = rxq->pfil;
4869	for (i = 0; i < NRXQSETS(ctx); i++, rxq++) {
4870		rxq->pfil = NULL;
4871	}
4872	pfil_head_unregister(pfil);
4873}
4874
4876/*
4877 * Advance forward by n members of the cpuset ctx->ifc_cpus starting from
4878 * cpuid and wrapping as necessary.
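 *
 * For example, advancing by 2 from CPU 5 within the set {1, 3, 5, 7} first
 * lands on CPU 7 and then wraps around to CPU 1.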
4879 */
4880static unsigned int
4881cpuid_advance(if_ctx_t ctx, unsigned int cpuid, unsigned int n)
4882{
4883	unsigned int first_valid;
4884	unsigned int last_valid;
4885
4886	/* cpuid should always be in the valid set */
4887	MPASS(CPU_ISSET(cpuid, &ctx->ifc_cpus));
4888
4889	/* valid set should never be empty */
4890	MPASS(!CPU_EMPTY(&ctx->ifc_cpus));
4891
4892	first_valid = CPU_FFS(&ctx->ifc_cpus) - 1;
4893	last_valid = CPU_FLS(&ctx->ifc_cpus) - 1;
4894	n = n % CPU_COUNT(&ctx->ifc_cpus);
4895	while (n > 0) {
4896		do {
4897			cpuid++;
4898			if (cpuid > last_valid)
4899				cpuid = first_valid;
4900		} while (!CPU_ISSET(cpuid, &ctx->ifc_cpus));
4901		n--;
4902	}
4903
4904	return (cpuid);
4905}
4906
4907#if defined(SMP) && defined(SCHED_ULE)
4908extern struct cpu_group *cpu_top;              /* CPU topology */
4909
4910static int
4911find_child_with_core(int cpu, struct cpu_group *grp)
4912{
4913	int i;
4914
4915	if (grp->cg_children == 0)
4916		return (-1);
4917
4918	MPASS(grp->cg_child);
4919	for (i = 0; i < grp->cg_children; i++) {
4920		if (CPU_ISSET(cpu, &grp->cg_child[i].cg_mask))
4921			return (i);
4922	}
4923
4924	return (-1);
4925}
4926
4928/*
4929 * Find an L2 neighbor of the given CPU or return -1 if none found.  This
4930 * does not distinguish among multiple L2 neighbors if the given CPU has
4931 * more than one (it will always return the same result in that case).
4932 */
4933static int
4934find_l2_neighbor(int cpu)
4935{
4936	struct cpu_group *grp;
4937	int i;
4938
4939	grp = cpu_top;
4940	if (grp == NULL)
4941		return (-1);
4942
4943	/*
4944	 * Find the smallest CPU group that contains the given core.
4945	 */
4946	i = 0;
4947	while ((i = find_child_with_core(cpu, grp)) != -1) {
4948		/*
4949		 * If the smallest group containing the given CPU has fewer
4950		 * than two members, we conclude the given CPU has no
4951		 * L2 neighbor.
4952		 */
4953		if (grp->cg_child[i].cg_count <= 1)
4954			return (-1);
4955		grp = &grp->cg_child[i];
4956	}
4957
4958	/* Must share L2. */
4959	if (grp->cg_level > CG_SHARE_L2 || grp->cg_level == CG_SHARE_NONE)
4960		return (-1);
4961
4962	/*
4963	 * Select the first member of the set that isn't the reference
4964	 * CPU, which at this point is guaranteed to exist.
4965	 */
4966	for (i = 0; i < CPU_SETSIZE; i++) {
4967		if (CPU_ISSET(i, &grp->cg_mask) && i != cpu)
4968			return (i);
4969	}
4970
4971	/* Should never be reached */
4972	return (-1);
4973}
4974
4975#else
4976static int
4977find_l2_neighbor(int cpu)
4978{
4979
4980	return (-1);
4981}
4982#endif
4983
4984/*
4985 * CPU mapping behaviors
4986 * ---------------------
4987 * 'separate txrx' refers to the separate_txrx sysctl
4988 * 'use logical' refers to the use_logical_cores sysctl
4989 * 'INTR CPUS' indicates whether bus_get_cpus(INTR_CPUS) succeeded
4990 *
4991 *  separate     use     INTR
4992 *    txrx     logical   CPUS   result
4993 * ---------- --------- ------ ------------------------------------------------
4994 *     -          -       X     RX and TX queues mapped to consecutive physical
4995 *                              cores with RX/TX pairs on same core and excess
4996 *                              of either following
4997 *     -          X       X     RX and TX queues mapped to consecutive cores
4998 *                              of any type with RX/TX pairs on same core and
4999 *                              excess of either following
5000 *     X          -       X     RX and TX queues mapped to consecutive physical
5001 *                              cores; all RX then all TX
5002 *     X          X       X     RX queues mapped to consecutive physical cores
5003 *                              first, then TX queues mapped to L2 neighbor of
5004 *                              the corresponding RX queue if one exists,
5005 *                              otherwise to consecutive physical cores
5006 *     -         n/a      -     RX and TX queues mapped to consecutive cores of
5007 *                              any type with RX/TX pairs on same core and excess
5008 *                              of either following
5009 *     X         n/a      -     RX and TX queues mapped to consecutive cores of
5010 *                              any type; all RX then all TX
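 *
 * For example (hypothetical queue counts): with INTR CPUS available,
 * 'separate txrx' set, and 'use logical' clear, 2 RX and 2 TX queues are
 * placed on four consecutive physical cores: RX0, RX1, TX0, TX1.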
5011 */
5012static unsigned int
5013get_cpuid_for_queue(if_ctx_t ctx, unsigned int base_cpuid, unsigned int qid,
5014    bool is_tx)
5015{
5016	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
5017	unsigned int core_index;
5018
5019	if (ctx->ifc_sysctl_separate_txrx) {
5020		/*
5021		 * When using separate CPUs for TX and RX, the assignment
5022		 * will always be of a consecutive CPU out of the set of
5023		 * context CPUs, except for the specific case where the
5024		 * context CPUs are physical cores, the use of logical cores
5025		 * has been enabled, the assignment is for TX, the TX qid
5026		 * corresponds to an RX qid, and the CPU assigned to the
5027		 * corresponding RX queue has an L2 neighbor.
5028		 */
5029		if (ctx->ifc_sysctl_use_logical_cores &&
5030		    ctx->ifc_cpus_are_physical_cores &&
5031		    is_tx && qid < scctx->isc_nrxqsets) {
5032			int l2_neighbor;
5033			unsigned int rx_cpuid;
5034
5035			rx_cpuid = cpuid_advance(ctx, base_cpuid, qid);
5036			l2_neighbor = find_l2_neighbor(rx_cpuid);
5037			if (l2_neighbor != -1) {
5038				return (l2_neighbor);
5039			}
5040			/*
5041			 * ... else fall through to the normal
5042			 * consecutive-after-RX assignment scheme.
5043			 *
5044			 * Note that we are assuming that all RX queue CPUs
5045			 * have an L2 neighbor, or all do not.  If a mixed
5046			 * scenario is possible, we will have to keep track
5047			 * separately of how many queues prior to this one
5048			 * were not able to be assigned to an L2 neighbor.
5049			 */
5050		}
5051		if (is_tx)
5052			core_index = scctx->isc_nrxqsets + qid;
5053		else
5054			core_index = qid;
5055	} else {
5056		core_index = qid;
5057	}
5058
5059	return (cpuid_advance(ctx, base_cpuid, core_index));
5060}
5061
5062static uint16_t
5063get_ctx_core_offset(if_ctx_t ctx)
5064{
5065	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
5066	struct cpu_offset *op;
5067	cpuset_t assigned_cpus;
5068	unsigned int cores_consumed;
5069	unsigned int base_cpuid = ctx->ifc_sysctl_core_offset;
5070	unsigned int first_valid;
5071	unsigned int last_valid;
5072	unsigned int i;
5073
5074	first_valid = CPU_FFS(&ctx->ifc_cpus) - 1;
5075	last_valid = CPU_FLS(&ctx->ifc_cpus) - 1;
5076
5077	if (base_cpuid != CORE_OFFSET_UNSPECIFIED) {
5078		/*
5079		 * Align the user-chosen base CPU ID to the next valid CPU
5080		 * for this device.  If the chosen base CPU ID is smaller
5081		 * than the first valid CPU or larger than the last valid
5082		 * CPU, we assume the user does not know what the valid
5083		 * range is for this device and is thinking in terms of a
5084		 * zero-based reference frame, and so we shift the given
5085		 * value into the valid range (and wrap accordingly) so the
5086		 * intent is translated to the proper frame of reference.
5087		 * If the base CPU ID is within the valid first/last, but
5088		 * does not correspond to a valid CPU, it is advanced to the
5089		 * next valid CPU (wrapping if necessary).
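		 *
		 * For example, if the valid CPUs are 0..7 and the user set a
		 * core offset of 10, the base CPU becomes 2.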
5090		 */
5091		if (base_cpuid < first_valid || base_cpuid > last_valid) {
5092			/* shift from zero-based to first_valid-based */
5093			base_cpuid += first_valid;
5094			/* wrap to range [first_valid, last_valid] */
5095			base_cpuid = (base_cpuid - first_valid) %
5096			    (last_valid - first_valid + 1);
5097		}
5098		if (!CPU_ISSET(base_cpuid, &ctx->ifc_cpus)) {
5099			/*
5100			 * base_cpuid is in [first_valid, last_valid], but
5101			 * not a member of the valid set.  In this case,
5102			 * there will always be a member of the valid set
5103			 * with a CPU ID that is greater than base_cpuid,
5104			 * and we simply advance to it.
5105			 */
5106			while (!CPU_ISSET(base_cpuid, &ctx->ifc_cpus))
5107				base_cpuid++;
5108		}
5109		return (base_cpuid);
5110	}
5111
5112	/*
5113	 * Determine how many cores will be consumed by performing the CPU
5114	 * assignments and counting how many of the assigned CPUs correspond
5115	 * to CPUs in the set of context CPUs.  This is done using the CPU
5116	 * ID first_valid as the base CPU ID, as the base CPU must be within
5117	 * the set of context CPUs.
5118	 *
5119	 * Note not all assigned CPUs will be in the set of context CPUs
5120	 * when separate CPUs are being allocated to TX and RX queues,
5121	 * assignment to logical cores has been enabled, the set of context
5122	 * CPUs contains only physical CPUs, and TX queues are mapped to L2
5123	 * neighbors of CPUs that RX queues have been mapped to - in this
5124	 * case we only want to count how many CPUs in the set of context
5125	 * CPUs have been consumed, as that determines the next CPU in that
5126	 * set to start allocating at for the next device for which
5127	 * core_offset is not set.
5128	 */
5129	CPU_ZERO(&assigned_cpus);
5130	for (i = 0; i < scctx->isc_ntxqsets; i++)
5131		CPU_SET(get_cpuid_for_queue(ctx, first_valid, i, true),
5132		    &assigned_cpus);
5133	for (i = 0; i < scctx->isc_nrxqsets; i++)
5134		CPU_SET(get_cpuid_for_queue(ctx, first_valid, i, false),
5135		    &assigned_cpus);
5136	CPU_AND(&assigned_cpus, &assigned_cpus, &ctx->ifc_cpus);
5137	cores_consumed = CPU_COUNT(&assigned_cpus);
5138
5139	mtx_lock(&cpu_offset_mtx);
5140	SLIST_FOREACH(op, &cpu_offsets, entries) {
5141		if (CPU_CMP(&ctx->ifc_cpus, &op->set) == 0) {
5142			base_cpuid = op->next_cpuid;
5143			op->next_cpuid = cpuid_advance(ctx, op->next_cpuid,
5144			    cores_consumed);
5145			MPASS(op->refcount < UINT_MAX);
5146			op->refcount++;
5147			break;
5148		}
5149	}
5150	if (base_cpuid == CORE_OFFSET_UNSPECIFIED) {
5151		base_cpuid = first_valid;
5152		op = malloc(sizeof(struct cpu_offset), M_IFLIB,
5153		    M_NOWAIT | M_ZERO);
5154		if (op == NULL) {
5155			device_printf(ctx->ifc_dev,
5156			    "allocation for cpu offset failed.\n");
5157		} else {
5158			op->next_cpuid = cpuid_advance(ctx, base_cpuid,
5159			    cores_consumed);
5160			op->refcount = 1;
5161			CPU_COPY(&ctx->ifc_cpus, &op->set);
5162			SLIST_INSERT_HEAD(&cpu_offsets, op, entries);
5163		}
5164	}
5165	mtx_unlock(&cpu_offset_mtx);
5166
5167	return (base_cpuid);
5168}
5169
5170static void
5171unref_ctx_core_offset(if_ctx_t ctx)
5172{
5173	struct cpu_offset *op, *top;
5174
5175	mtx_lock(&cpu_offset_mtx);
5176	SLIST_FOREACH_SAFE(op, &cpu_offsets, entries, top) {
5177		if (CPU_CMP(&ctx->ifc_cpus, &op->set) == 0) {
5178			MPASS(op->refcount > 0);
5179			op->refcount--;
5180			if (op->refcount == 0) {
5181				SLIST_REMOVE(&cpu_offsets, op, cpu_offset, entries);
5182				free(op, M_IFLIB);
5183			}
5184			break;
5185		}
5186	}
5187	mtx_unlock(&cpu_offset_mtx);
5188}
5189
5190int
5191iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ctxp)
5192{
5193	if_ctx_t ctx;
5194	if_t ifp;
5195	if_softc_ctx_t scctx;
5196	kobjop_desc_t kobj_desc;
5197	kobj_method_t *kobj_method;
5198	int err, msix, rid;
5199	int num_txd, num_rxd;
5200
5201	ctx = malloc(sizeof(* ctx), M_IFLIB, M_WAITOK|M_ZERO);
5202
5203	if (sc == NULL) {
5204		sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO);
5205		device_set_softc(dev, ctx);
5206		ctx->ifc_flags |= IFC_SC_ALLOCATED;
5207	}
5208
5209	ctx->ifc_sctx = sctx;
5210	ctx->ifc_dev = dev;
5211	ctx->ifc_softc = sc;
5212
5213	if ((err = iflib_register(ctx)) != 0) {
5214		device_printf(dev, "iflib_register failed %d\n", err);
5215		goto fail_ctx_free;
5216	}
5217	iflib_add_device_sysctl_pre(ctx);
5218
5219	scctx = &ctx->ifc_softc_ctx;
5220	ifp = ctx->ifc_ifp;
5221
5222	iflib_reset_qvalues(ctx);
5223	IFNET_WLOCK();
5224	CTX_LOCK(ctx);
5225	if ((err = IFDI_ATTACH_PRE(ctx)) != 0) {
5226		device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err);
5227		goto fail_unlock;
5228	}
5229	_iflib_pre_assert(scctx);
5230	ctx->ifc_txrx = *scctx->isc_txrx;
5231
5232	MPASS(scctx->isc_dma_width <= flsll(BUS_SPACE_MAXADDR));
5233
5234	if (sctx->isc_flags & IFLIB_DRIVER_MEDIA)
5235		ctx->ifc_mediap = scctx->isc_media;
5236
5237#ifdef INVARIANTS
5238	if (scctx->isc_capabilities & IFCAP_TXCSUM)
5239		MPASS(scctx->isc_tx_csum_flags);
5240#endif
5241
5242	if_setcapabilities(ifp,
5243	    scctx->isc_capabilities | IFCAP_HWSTATS | IFCAP_MEXTPG);
5244	if_setcapenable(ifp,
5245	    scctx->isc_capenable | IFCAP_HWSTATS | IFCAP_MEXTPG);
5246
5247	if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets))
5248		scctx->isc_ntxqsets = scctx->isc_ntxqsets_max;
5249	if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets))
5250		scctx->isc_nrxqsets = scctx->isc_nrxqsets_max;
5251
5252	num_txd = iflib_num_tx_descs(ctx);
5253	num_rxd = iflib_num_rx_descs(ctx);
5254
5255	/* XXX change for per-queue sizes */
5256	device_printf(dev, "Using %d TX descriptors and %d RX descriptors\n",
5257	    num_txd, num_rxd);
5258
5259	if (scctx->isc_tx_nsegments > num_txd / MAX_SINGLE_PACKET_FRACTION)
5260		scctx->isc_tx_nsegments = max(1, num_txd /
5261		    MAX_SINGLE_PACKET_FRACTION);
5262	if (scctx->isc_tx_tso_segments_max > num_txd /
5263	    MAX_SINGLE_PACKET_FRACTION)
5264		scctx->isc_tx_tso_segments_max = max(1,
5265		    num_txd / MAX_SINGLE_PACKET_FRACTION);
5266
5267	/* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */
5268	if (if_getcapabilities(ifp) & IFCAP_TSO) {
5269		/*
5270		 * The stack can't handle a TSO size larger than IP_MAXPACKET,
5271		 * but some MACs do.
5272		 */
5273		if_sethwtsomax(ifp, min(scctx->isc_tx_tso_size_max,
5274		    IP_MAXPACKET));
5275		/*
5276		 * Take maximum number of m_pullup(9)'s in iflib_parse_header()
5277		 * into account.  In the worst case, each of these calls will
5278		 * add another mbuf and, thus, the requirement for another DMA
5279		 * segment.  So for best performance, it doesn't make sense to
5280		 * advertise a maximum of TSO segments that typically will
5281		 * require defragmentation in iflib_encap().
5282		 */
5283		if_sethwtsomaxsegcount(ifp, scctx->isc_tx_tso_segments_max - 3);
5284		if_sethwtsomaxsegsize(ifp, scctx->isc_tx_tso_segsize_max);
5285	}
5286	if (scctx->isc_rss_table_size == 0)
5287		scctx->isc_rss_table_size = 64;
5288	scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1;
5289
5290	GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx);
5291	/* XXX format name */
5292	taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx,
5293	    NULL, NULL, "admin");
5294
5295	/* Set up cpu set.  If it fails, use the set of all CPUs. */
5296	if (bus_get_cpus(dev, INTR_CPUS, sizeof(ctx->ifc_cpus), &ctx->ifc_cpus) != 0) {
5297		device_printf(dev, "Unable to fetch CPU list\n");
5298		CPU_COPY(&all_cpus, &ctx->ifc_cpus);
5299		ctx->ifc_cpus_are_physical_cores = false;
5300	} else
5301		ctx->ifc_cpus_are_physical_cores = true;
5302	MPASS(CPU_COUNT(&ctx->ifc_cpus) > 0);
5303
5304	/*
5305	** Now set up MSI or MSI-X, should return us the number of supported
5306	** vectors (will be 1 for a legacy interrupt and MSI).
5307	*/
5308	if (sctx->isc_flags & IFLIB_SKIP_MSIX) {
5309		msix = scctx->isc_vectors;
5310	} else if (scctx->isc_msix_bar != 0)
5311	       /*
5312		* The simple fact that isc_msix_bar is not 0 does not mean
5313		* we have a good value there that is known to work.
5314		*/
5315		msix = iflib_msix_init(ctx);
5316	else {
5317		scctx->isc_vectors = 1;
5318		scctx->isc_ntxqsets = 1;
5319		scctx->isc_nrxqsets = 1;
5320		scctx->isc_intr = IFLIB_INTR_LEGACY;
5321		msix = 0;
5322	}
5323	/* Get memory for the station queues */
5324	if ((err = iflib_queues_alloc(ctx))) {
5325		device_printf(dev, "Unable to allocate queue memory\n");
5326		goto fail_intr_free;
5327	}
5328
5329	if ((err = iflib_qset_structures_setup(ctx)))
5330		goto fail_queues;
5331
5332	/*
5333	 * Now that we know how many queues there are, get the core offset.
5334	 */
5335	ctx->ifc_sysctl_core_offset = get_ctx_core_offset(ctx);
5336
5337	if (msix > 1) {
5338		/*
5339		 * When using MSI-X, ensure that ifdi_{r,t}x_queue_intr_enable
5340		 * aren't the default NULL implementation.
5341		 */
5342		kobj_desc = &ifdi_rx_queue_intr_enable_desc;
5343		kobj_method = kobj_lookup_method(((kobj_t)ctx)->ops->cls, NULL,
5344		    kobj_desc);
5345		if (kobj_method == &kobj_desc->deflt) {
5346			device_printf(dev,
5347			    "MSI-X requires ifdi_rx_queue_intr_enable method\n");
5348			err = EOPNOTSUPP;
5349			goto fail_queues;
5350		}
5351		kobj_desc = &ifdi_tx_queue_intr_enable_desc;
5352		kobj_method = kobj_lookup_method(((kobj_t)ctx)->ops->cls, NULL,
5353		    kobj_desc);
5354		if (kobj_method == &kobj_desc->deflt) {
5355			device_printf(dev,
5356			    "MSI-X requires ifdi_tx_queue_intr_enable method\n");
5357			err = EOPNOTSUPP;
5358			goto fail_queues;
5359		}
5360
5361		/*
5362		 * Assign the MSI-X vectors.
5363		 * Note that the default NULL ifdi_msix_intr_assign method will
5364		 * fail here, too.
5365		 */
5366		err = IFDI_MSIX_INTR_ASSIGN(ctx, msix);
5367		if (err != 0) {
5368			device_printf(dev, "IFDI_MSIX_INTR_ASSIGN failed %d\n",
5369			    err);
5370			goto fail_queues;
5371		}
5372	} else if (scctx->isc_intr != IFLIB_INTR_MSIX) {
5373		rid = 0;
5374		if (scctx->isc_intr == IFLIB_INTR_MSI) {
5375			MPASS(msix == 1);
5376			rid = 1;
5377		}
5378		if ((err = iflib_legacy_setup(ctx, ctx->isc_legacy_intr, ctx->ifc_softc, &rid, "irq0")) != 0) {
5379			device_printf(dev, "iflib_legacy_setup failed %d\n", err);
5380			goto fail_queues;
5381		}
5382	} else {
5383		device_printf(dev,
5384		    "Cannot use iflib with only 1 MSI-X interrupt!\n");
5385		err = ENODEV;
5386		goto fail_queues;
5387	}
5388
5389	/*
5390	 * Unlocking here prevents a double-locking panic with
5391	 * iflib_media_status when the driver loads.
5392	 */
5393	CTX_UNLOCK(ctx);
5394	ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac.octet);
5395	CTX_LOCK(ctx);
5396
5397	if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
5398		device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
5399		goto fail_detach;
5400	}
5401
5402	/*
5403	 * Tell the upper layer(s) if IFCAP_VLAN_MTU is supported.
5404	 * This must appear after the call to ether_ifattach() because
5405	 * ether_ifattach() sets if_hdrlen to the default value.
5406	 */
5407	if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU)
5408		if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
5409
5410	if ((err = iflib_netmap_attach(ctx))) {
5411		device_printf(ctx->ifc_dev, "netmap attach failed: %d\n", err);
5412		goto fail_detach;
5413	}
5414	*ctxp = ctx;
5415
5416	DEBUGNET_SET(ctx->ifc_ifp, iflib);
5417
5418	if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
5419	iflib_add_device_sysctl_post(ctx);
5420	iflib_add_pfil(ctx);
5421	ctx->ifc_flags |= IFC_INIT_DONE;
5422	CTX_UNLOCK(ctx);
5423	IFNET_WUNLOCK();
5424
5425	return (0);
5426
5427fail_detach:
5428	ether_ifdetach(ctx->ifc_ifp);
5429fail_queues:
5430	iflib_tqg_detach(ctx);
5431	iflib_tx_structures_free(ctx);
5432	iflib_rx_structures_free(ctx);
5433	IFDI_DETACH(ctx);
5434	IFDI_QUEUES_FREE(ctx);
5435fail_intr_free:
5436	iflib_free_intr_mem(ctx);
5437fail_unlock:
5438	CTX_UNLOCK(ctx);
5439	IFNET_WUNLOCK();
5440	iflib_deregister(ctx);
5441fail_ctx_free:
5442	device_set_softc(ctx->ifc_dev, NULL);
5443	if (ctx->ifc_flags & IFC_SC_ALLOCATED)
5444		free(ctx->ifc_softc, M_IFLIB);
5445	free(ctx, M_IFLIB);
5446	return (err);
5447}
5448
5449int
5450iflib_device_attach(device_t dev)
5451{
5452	if_ctx_t ctx;
5453	if_shared_ctx_t sctx;
5454
5455	if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC)
5456		return (ENOTSUP);
5457
5458	pci_enable_busmaster(dev);
5459
5460	return (iflib_device_register(dev, NULL, sctx, &ctx));
5461}
5462
5463int
5464iflib_device_deregister(if_ctx_t ctx)
5465{
5466	if_t ifp = ctx->ifc_ifp;
5467	device_t dev = ctx->ifc_dev;
5468
5469	/* Make sure VLANS are not using driver */
5470	if (if_vlantrunkinuse(ifp)) {
5471		device_printf(dev, "VLAN in use, detach first\n");
5472		return (EBUSY);
5473	}
5474#ifdef PCI_IOV
5475	if (!CTX_IS_VF(ctx) && pci_iov_detach(dev) != 0) {
5476		device_printf(dev, "SR-IOV in use; detach first.\n");
5477		return (EBUSY);
5478	}
5479#endif
5480
5481	STATE_LOCK(ctx);
5482	ctx->ifc_flags |= IFC_IN_DETACH;
5483	STATE_UNLOCK(ctx);
5484
5485	/* Unregister VLAN handlers before calling iflib_stop() */
5486	iflib_unregister_vlan_handlers(ctx);
5487
5488	iflib_netmap_detach(ifp);
5489	ether_ifdetach(ifp);
5490
5491	CTX_LOCK(ctx);
5492	iflib_stop(ctx);
5493	CTX_UNLOCK(ctx);
5494
5495	iflib_rem_pfil(ctx);
5496	if (ctx->ifc_led_dev != NULL)
5497		led_destroy(ctx->ifc_led_dev);
5498
5499	iflib_tqg_detach(ctx);
5500	iflib_tx_structures_free(ctx);
5501	iflib_rx_structures_free(ctx);
5502
5503	CTX_LOCK(ctx);
5504	IFDI_DETACH(ctx);
5505	IFDI_QUEUES_FREE(ctx);
5506	CTX_UNLOCK(ctx);
5507
5508	/* ether_ifdetach calls if_qflush - lock must be destroyed afterwards */
5509	iflib_free_intr_mem(ctx);
5510
5511	bus_generic_detach(dev);
5512
5513	iflib_deregister(ctx);
5514
5515	device_set_softc(ctx->ifc_dev, NULL);
5516	if (ctx->ifc_flags & IFC_SC_ALLOCATED)
5517		free(ctx->ifc_softc, M_IFLIB);
5518	unref_ctx_core_offset(ctx);
5519	free(ctx, M_IFLIB);
5520	return (0);
5521}
5522
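/*
 * Detach all per-queue TX/RX grouptasks from the I/O taskqgroup and the
 * admin/VFLR tasks from the config taskqgroup, draining the TX (and
 * netmap) timer callouts first.
 */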
5523static void
5524iflib_tqg_detach(if_ctx_t ctx)
5525{
5526	iflib_txq_t txq;
5527	iflib_rxq_t rxq;
5528	int i;
5529	struct taskqgroup *tqg;
5530
5531	/* XXX drain any dependent tasks */
5532	tqg = qgroup_if_io_tqg;
5533	for (txq = ctx->ifc_txqs, i = 0; i < NTXQSETS(ctx); i++, txq++) {
5534		callout_drain(&txq->ift_timer);
5535#ifdef DEV_NETMAP
5536		callout_drain(&txq->ift_netmap_timer);
5537#endif /* DEV_NETMAP */
5538		if (txq->ift_task.gt_uniq != NULL)
5539			taskqgroup_detach(tqg, &txq->ift_task);
5540	}
5541	for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) {
5542		if (rxq->ifr_task.gt_uniq != NULL)
5543			taskqgroup_detach(tqg, &rxq->ifr_task);
5544	}
5545	tqg = qgroup_if_config_tqg;
5546	if (ctx->ifc_admin_task.gt_uniq != NULL)
5547		taskqgroup_detach(tqg, &ctx->ifc_admin_task);
5548	if (ctx->ifc_vflr_task.gt_uniq != NULL)
5549		taskqgroup_detach(tqg, &ctx->ifc_vflr_task);
5550}
5551
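/*
 * Release interrupt resources: the legacy/MSI IRQ when not using MSI-X,
 * any allocated MSI/MSI-X messages, and the MSI-X table BAR mapping.
 */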
5552static void
5553iflib_free_intr_mem(if_ctx_t ctx)
5554{
5555
5556	if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_MSIX) {
5557		iflib_irq_free(ctx, &ctx->ifc_legacy_irq);
5558	}
5559	if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_LEGACY) {
5560		pci_release_msi(ctx->ifc_dev);
5561	}
5562	if (ctx->ifc_msix_mem != NULL) {
5563		bus_release_resource(ctx->ifc_dev, SYS_RES_MEMORY,
5564		    rman_get_rid(ctx->ifc_msix_mem), ctx->ifc_msix_mem);
5565		ctx->ifc_msix_mem = NULL;
5566	}
5567}
5568
5569int
5570iflib_device_detach(device_t dev)
5571{
5572	if_ctx_t ctx = device_get_softc(dev);
5573
5574	return (iflib_device_deregister(ctx));
5575}
5576
5577int
5578iflib_device_suspend(device_t dev)
5579{
5580	if_ctx_t ctx = device_get_softc(dev);
5581
5582	CTX_LOCK(ctx);
5583	IFDI_SUSPEND(ctx);
5584	CTX_UNLOCK(ctx);
5585
5586	return (bus_generic_suspend(dev));
5587}
5588int
5589iflib_device_shutdown(device_t dev)
5590{
5591	if_ctx_t ctx = device_get_softc(dev);
5592
5593	CTX_LOCK(ctx);
5594	IFDI_SHUTDOWN(ctx);
5595	CTX_UNLOCK(ctx);
5596
5597	return (bus_generic_suspend(dev));
5598}
5599
5600int
5601iflib_device_resume(device_t dev)
5602{
5603	if_ctx_t ctx = device_get_softc(dev);
5604	iflib_txq_t txq = ctx->ifc_txqs;
5605
5606	CTX_LOCK(ctx);
5607	IFDI_RESUME(ctx);
5608	iflib_if_init_locked(ctx);
5609	CTX_UNLOCK(ctx);
5610	for (int i = 0; i < NTXQSETS(ctx); i++, txq++)
5611		iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
5612
5613	return (bus_generic_resume(dev));
5614}
5615
5616int
5617iflib_device_iov_init(device_t dev, uint16_t num_vfs, const nvlist_t *params)
5618{
5619	int error;
5620	if_ctx_t ctx = device_get_softc(dev);
5621
5622	CTX_LOCK(ctx);
5623	error = IFDI_IOV_INIT(ctx, num_vfs, params);
5624	CTX_UNLOCK(ctx);
5625
5626	return (error);
5627}
5628
5629void
5630iflib_device_iov_uninit(device_t dev)
5631{
5632	if_ctx_t ctx = device_get_softc(dev);
5633
5634	CTX_LOCK(ctx);
5635	IFDI_IOV_UNINIT(ctx);
5636	CTX_UNLOCK(ctx);
5637}
5638
5639int
5640iflib_device_iov_add_vf(device_t dev, uint16_t vfnum, const nvlist_t *params)
5641{
5642	int error;
5643	if_ctx_t ctx = device_get_softc(dev);
5644
5645	CTX_LOCK(ctx);
5646	error = IFDI_IOV_VF_ADD(ctx, vfnum, params);
5647	CTX_UNLOCK(ctx);
5648
5649	return (error);
5650}
5651
5652/*********************************************************************
5653 *
5654 *  MODULE FUNCTION DEFINITIONS
5655 *
5656 **********************************************************************/
5657
5658/*
5659 * - Start a fast taskqueue thread for each core
5660 * - Start a taskqueue for control operations
5661 */
5662static int
5663iflib_module_init(void)
5664{
5665	iflib_timer_default = hz / 2;
5666	return (0);
5667}
5668
5669static int
5670iflib_module_event_handler(module_t mod, int what, void *arg)
5671{
5672	int err;
5673
5674	switch (what) {
5675	case MOD_LOAD:
5676		if ((err = iflib_module_init()) != 0)
5677			return (err);
5678		break;
5679	case MOD_UNLOAD:
5680		return (EBUSY);
5681	default:
5682		return (EOPNOTSUPP);
5683	}
5684
5685	return (0);
5686}
5687
5688/*********************************************************************
5689 *
5690 *  PUBLIC FUNCTION DEFINITIONS
5691 *     ordered as in iflib.h
5692 *
5693 **********************************************************************/
5694
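/*
 * Sanity-check the shared context supplied by the driver: TX/RX DMA limits
 * must be set, each qset may have 1-8 queues, and every descriptor
 * min/max/default count must be a non-zero power of two.
 */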
5695static void
5696_iflib_assert(if_shared_ctx_t sctx)
5697{
5698	int i;
5699
5700	MPASS(sctx->isc_tx_maxsize);
5701	MPASS(sctx->isc_tx_maxsegsize);
5702
5703	MPASS(sctx->isc_rx_maxsize);
5704	MPASS(sctx->isc_rx_nsegments);
5705	MPASS(sctx->isc_rx_maxsegsize);
5706
5707	MPASS(sctx->isc_nrxqs >= 1 && sctx->isc_nrxqs <= 8);
5708	for (i = 0; i < sctx->isc_nrxqs; i++) {
5709		MPASS(sctx->isc_nrxd_min[i]);
5710		MPASS(powerof2(sctx->isc_nrxd_min[i]));
5711		MPASS(sctx->isc_nrxd_max[i]);
5712		MPASS(powerof2(sctx->isc_nrxd_max[i]));
5713		MPASS(sctx->isc_nrxd_default[i]);
5714		MPASS(powerof2(sctx->isc_nrxd_default[i]));
5715	}
5716
5717	MPASS(sctx->isc_ntxqs >= 1 && sctx->isc_ntxqs <= 8);
5718	for (i = 0; i < sctx->isc_ntxqs; i++) {
5719		MPASS(sctx->isc_ntxd_min[i]);
5720		MPASS(powerof2(sctx->isc_ntxd_min[i]));
5721		MPASS(sctx->isc_ntxd_max[i]);
5722		MPASS(powerof2(sctx->isc_ntxd_max[i]));
5723		MPASS(sctx->isc_ntxd_default[i]);
5724		MPASS(powerof2(sctx->isc_ntxd_default[i]));
5725	}
5726}
5727
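/*
 * Verify that the driver registered all of the mandatory TX/RX methods
 * before iflib starts calling into them.
 */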
5728static void
5729_iflib_pre_assert(if_softc_ctx_t scctx)
5730{
5731
5732	MPASS(scctx->isc_txrx->ift_txd_encap);
5733	MPASS(scctx->isc_txrx->ift_txd_flush);
5734	MPASS(scctx->isc_txrx->ift_txd_credits_update);
5735	MPASS(scctx->isc_txrx->ift_rxd_available);
5736	MPASS(scctx->isc_txrx->ift_rxd_pkt_get);
5737	MPASS(scctx->isc_txrx->ift_rxd_refill);
5738	MPASS(scctx->isc_txrx->ift_rxd_flush);
5739}
5740
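/*
 * Allocate the ifnet and wire up iflib's if_* methods, compile the driver's
 * kobj class for IFDI dispatch, register the VLAN config/unconfig event
 * handlers, and set up the default ifmedia unless the driver manages its
 * own media.
 */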
5741static int
5742iflib_register(if_ctx_t ctx)
5743{
5744	if_shared_ctx_t sctx = ctx->ifc_sctx;
5745	driver_t *driver = sctx->isc_driver;
5746	device_t dev = ctx->ifc_dev;
5747	if_t ifp;
5748
5749	_iflib_assert(sctx);
5750
5751	CTX_LOCK_INIT(ctx);
5752	STATE_LOCK_INIT(ctx, device_get_nameunit(ctx->ifc_dev));
5753	ifp = ctx->ifc_ifp = if_alloc(IFT_ETHER);
5754	if (ifp == NULL) {
5755		device_printf(dev, "can not allocate ifnet structure\n");
5756		return (ENOMEM);
5757	}
5758
5759	/*
5760	 * Initialize our context's device specific methods
5761	 */
5762	kobj_init((kobj_t) ctx, (kobj_class_t) driver);
5763	kobj_class_compile((kobj_class_t) driver);
5764
5765	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
5766	if_setsoftc(ifp, ctx);
5767	if_setdev(ifp, dev);
5768	if_setinitfn(ifp, iflib_if_init);
5769	if_setioctlfn(ifp, iflib_if_ioctl);
5770#ifdef ALTQ
5771	if_setstartfn(ifp, iflib_altq_if_start);
5772	if_settransmitfn(ifp, iflib_altq_if_transmit);
5773	if_setsendqready(ifp);
5774#else
5775	if_settransmitfn(ifp, iflib_if_transmit);
5776#endif
5777	if_setqflushfn(ifp, iflib_if_qflush);
5778	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
5779	ctx->ifc_vlan_attach_event =
5780		EVENTHANDLER_REGISTER(vlan_config, iflib_vlan_register, ctx,
5781							  EVENTHANDLER_PRI_FIRST);
5782	ctx->ifc_vlan_detach_event =
5783		EVENTHANDLER_REGISTER(vlan_unconfig, iflib_vlan_unregister, ctx,
5784							  EVENTHANDLER_PRI_FIRST);
5785
5786	if ((sctx->isc_flags & IFLIB_DRIVER_MEDIA) == 0) {
5787		ctx->ifc_mediap = &ctx->ifc_media;
5788		ifmedia_init(ctx->ifc_mediap, IFM_IMASK,
5789		    iflib_media_change, iflib_media_status);
5790	}
5791	return (0);
5792}
5793
5794static void
5795iflib_unregister_vlan_handlers(if_ctx_t ctx)
5796{
5797	/* Unregister VLAN events */
5798	if (ctx->ifc_vlan_attach_event != NULL) {
5799		EVENTHANDLER_DEREGISTER(vlan_config, ctx->ifc_vlan_attach_event);
5800		ctx->ifc_vlan_attach_event = NULL;
5801	}
5802	if (ctx->ifc_vlan_detach_event != NULL) {
5803		EVENTHANDLER_DEREGISTER(vlan_unconfig, ctx->ifc_vlan_detach_event);
5804		ctx->ifc_vlan_detach_event = NULL;
5805	}
5806
5807}
5808
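/*
 * Undo iflib_register(): remove media, unhook the VLAN event handlers,
 * delete the kobj instance, free the ifnet, and destroy the context locks.
 */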
5809static void
5810iflib_deregister(if_ctx_t ctx)
5811{
5812	if_t ifp = ctx->ifc_ifp;
5813
5814	/* Remove all media */
5815	ifmedia_removeall(&ctx->ifc_media);
5816
5817	/* Ensure that VLAN event handlers are unregistered */
5818	iflib_unregister_vlan_handlers(ctx);
5819
5820	/* Release kobject reference */
5821	kobj_delete((kobj_t) ctx, NULL);
5822
5823	/* Free the ifnet structure */
5824	if_free(ifp);
5825
5826	STATE_LOCK_DESTROY(ctx);
5827
5828	/* ether_ifdetach calls if_qflush - lock must be destroyed afterwards */
5829	CTX_LOCK_DESTROY(ctx);
5830}
5831
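/*
 * Allocate the software TX/RX queue state along with the DMA memory backing
 * every descriptor ring and free list, then pass the ring addresses to the
 * driver via IFDI_TX_QUEUES_ALLOC() and IFDI_RX_QUEUES_ALLOC().
 */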
5832static int
5833iflib_queues_alloc(if_ctx_t ctx)
5834{
5835	if_shared_ctx_t sctx = ctx->ifc_sctx;
5836	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
5837	device_t dev = ctx->ifc_dev;
5838	int nrxqsets = scctx->isc_nrxqsets;
5839	int ntxqsets = scctx->isc_ntxqsets;
5840	iflib_txq_t txq;
5841	iflib_rxq_t rxq;
5842	iflib_fl_t fl = NULL;
5843	int i, j, cpu, err, txconf, rxconf;
5844	iflib_dma_info_t ifdip;
5845	uint32_t *rxqsizes = scctx->isc_rxqsizes;
5846	uint32_t *txqsizes = scctx->isc_txqsizes;
5847	uint8_t nrxqs = sctx->isc_nrxqs;
5848	uint8_t ntxqs = sctx->isc_ntxqs;
5849	int nfree_lists = sctx->isc_nfl ? sctx->isc_nfl : 1;
5850	int fl_offset = (sctx->isc_flags & IFLIB_HAS_RXCQ ? 1 : 0);
5851	caddr_t *vaddrs;
5852	uint64_t *paddrs;
5853
5854	KASSERT(ntxqs > 0, ("number of queues per qset must be at least 1"));
5855	KASSERT(nrxqs > 0, ("number of queues per qset must be at least 1"));
5856	KASSERT(nrxqs >= fl_offset + nfree_lists,
5857	    ("there must be at least a rxq for each free list"));
5858
5859	/* Allocate the TX ring struct memory */
5860	if (!(ctx->ifc_txqs =
5861	    (iflib_txq_t) malloc(sizeof(struct iflib_txq) *
5862	    ntxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
5863		device_printf(dev, "Unable to allocate TX ring memory\n");
5864		err = ENOMEM;
5865		goto fail;
5866	}
5867
5868	/* Now allocate the RX */
5869	if (!(ctx->ifc_rxqs =
5870	    (iflib_rxq_t) malloc(sizeof(struct iflib_rxq) *
5871	    nrxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
5872		device_printf(dev, "Unable to allocate RX ring memory\n");
5873		err = ENOMEM;
5874		goto rx_fail;
5875	}
5876
5877	txq = ctx->ifc_txqs;
5878	rxq = ctx->ifc_rxqs;
5879
5880	/*
5881	 * XXX handle allocation failure
5882	 */
5883	for (txconf = i = 0, cpu = CPU_FIRST(); i < ntxqsets; i++, txconf++, txq++, cpu = CPU_NEXT(cpu)) {
5884		/* Set up some basics */
5885
5886		if ((ifdip = malloc(sizeof(struct iflib_dma_info) * ntxqs,
5887		    M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
5888			device_printf(dev,
5889			    "Unable to allocate TX DMA info memory\n");
5890			err = ENOMEM;
5891			goto err_tx_desc;
5892		}
5893		txq->ift_ifdi = ifdip;
5894		for (j = 0; j < ntxqs; j++, ifdip++) {
5895			if (iflib_dma_alloc(ctx, txqsizes[j], ifdip, 0)) {
5896				device_printf(dev,
5897				    "Unable to allocate TX descriptors\n");
5898				err = ENOMEM;
5899				goto err_tx_desc;
5900			}
5901			txq->ift_txd_size[j] = scctx->isc_txd_size[j];
5902			bzero((void *)ifdip->idi_vaddr, txqsizes[j]);
5903		}
5904		txq->ift_ctx = ctx;
5905		txq->ift_id = i;
5906		if (sctx->isc_flags & IFLIB_HAS_TXCQ) {
5907			txq->ift_br_offset = 1;
5908		} else {
5909			txq->ift_br_offset = 0;
5910		}
5911
5912		if (iflib_txsd_alloc(txq)) {
5913			device_printf(dev, "Critical Failure setting up TX buffers\n");
5914			err = ENOMEM;
5915			goto err_tx_desc;
5916		}
5917
5918		/* Initialize the TX lock */
5919		snprintf(txq->ift_mtx_name, MTX_NAME_LEN, "%s:TX(%d):callout",
5920		    device_get_nameunit(dev), txq->ift_id);
5921		mtx_init(&txq->ift_mtx, txq->ift_mtx_name, NULL, MTX_DEF);
5922		callout_init_mtx(&txq->ift_timer, &txq->ift_mtx, 0);
5923		txq->ift_timer.c_cpu = cpu;
5924#ifdef DEV_NETMAP
5925		callout_init_mtx(&txq->ift_netmap_timer, &txq->ift_mtx, 0);
5926		txq->ift_netmap_timer.c_cpu = cpu;
5927#endif /* DEV_NETMAP */
5928
5929		err = ifmp_ring_alloc(&txq->ift_br, 2048, txq, iflib_txq_drain,
5930				      iflib_txq_can_drain, M_IFLIB, M_WAITOK);
5931		if (err) {
5932			/* XXX free any allocated rings */
5933			device_printf(dev, "Unable to allocate buf_ring\n");
5934			goto err_tx_desc;
5935		}
5936	}
5937
5938	for (rxconf = i = 0; i < nrxqsets; i++, rxconf++, rxq++) {
5939		/* Set up some basics */
5940		callout_init(&rxq->ifr_watchdog, 1);
5941
5942		if ((ifdip = malloc(sizeof(struct iflib_dma_info) * nrxqs,
5943		   M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
5944			device_printf(dev,
5945			    "Unable to allocate RX DMA info memory\n");
5946			err = ENOMEM;
5947			goto err_tx_desc;
5948		}
5949
5950		rxq->ifr_ifdi = ifdip;
5951		/* XXX this needs to be changed if #rx queues != #tx queues */
5952		rxq->ifr_ntxqirq = 1;
5953		rxq->ifr_txqid[0] = i;
5954		for (j = 0; j < nrxqs; j++, ifdip++) {
5955			if (iflib_dma_alloc(ctx, rxqsizes[j], ifdip, 0)) {
5956				device_printf(dev,
5957				    "Unable to allocate RX descriptors\n");
5958				err = ENOMEM;
5959				goto err_tx_desc;
5960			}
5961			bzero((void *)ifdip->idi_vaddr, rxqsizes[j]);
5962		}
5963		rxq->ifr_ctx = ctx;
5964		rxq->ifr_id = i;
5965		rxq->ifr_fl_offset = fl_offset;
5966		rxq->ifr_nfl = nfree_lists;
5967		if (!(fl = (iflib_fl_t) malloc(sizeof(struct iflib_fl) *
5968		    nfree_lists, M_IFLIB, M_NOWAIT | M_ZERO))) {
5969			device_printf(dev, "Unable to allocate free list memory\n");
5970			err = ENOMEM;
5971			goto err_tx_desc;
5972		}
5973		rxq->ifr_fl = fl;
5974		for (j = 0; j < nfree_lists; j++) {
5975			fl[j].ifl_rxq = rxq;
5976			fl[j].ifl_id = j;
5977			fl[j].ifl_ifdi = &rxq->ifr_ifdi[j + rxq->ifr_fl_offset];
5978			fl[j].ifl_rxd_size = scctx->isc_rxd_size[j];
5979		}
5980		/* Allocate receive buffers for the ring */
5981		if (iflib_rxsd_alloc(rxq)) {
5982			device_printf(dev,
5983			    "Critical Failure setting up receive buffers\n");
5984			err = ENOMEM;
5985			goto err_rx_desc;
5986		}
5987
5988		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
5989			fl->ifl_rx_bitmap = bit_alloc(fl->ifl_size, M_IFLIB,
5990			    M_WAITOK);
5991	}
5992
5993	/* TXQs */
5994	vaddrs = malloc(sizeof(caddr_t) * ntxqsets * ntxqs, M_IFLIB, M_WAITOK);
5995	paddrs = malloc(sizeof(uint64_t) * ntxqsets * ntxqs, M_IFLIB, M_WAITOK);
5996	for (i = 0; i < ntxqsets; i++) {
5997		iflib_dma_info_t di = ctx->ifc_txqs[i].ift_ifdi;
5998
5999		for (j = 0; j < ntxqs; j++, di++) {
6000			vaddrs[i*ntxqs + j] = di->idi_vaddr;
6001			paddrs[i*ntxqs + j] = di->idi_paddr;
6002		}
6003	}
6004	if ((err = IFDI_TX_QUEUES_ALLOC(ctx, vaddrs, paddrs, ntxqs, ntxqsets)) != 0) {
6005		device_printf(ctx->ifc_dev,
6006		    "Unable to allocate device TX queue\n");
6007		iflib_tx_structures_free(ctx);
6008		free(vaddrs, M_IFLIB);
6009		free(paddrs, M_IFLIB);
6010		goto err_rx_desc;
6011	}
6012	free(vaddrs, M_IFLIB);
6013	free(paddrs, M_IFLIB);
6014
6015	/* RXQs */
6016	vaddrs = malloc(sizeof(caddr_t) * nrxqsets * nrxqs, M_IFLIB, M_WAITOK);
6017	paddrs = malloc(sizeof(uint64_t) * nrxqsets * nrxqs, M_IFLIB, M_WAITOK);
6018	for (i = 0; i < nrxqsets; i++) {
6019		iflib_dma_info_t di = ctx->ifc_rxqs[i].ifr_ifdi;
6020
6021		for (j = 0; j < nrxqs; j++, di++) {
6022			vaddrs[i*nrxqs + j] = di->idi_vaddr;
6023			paddrs[i*nrxqs + j] = di->idi_paddr;
6024		}
6025	}
6026	if ((err = IFDI_RX_QUEUES_ALLOC(ctx, vaddrs, paddrs, nrxqs, nrxqsets)) != 0) {
6027		device_printf(ctx->ifc_dev,
6028		    "Unable to allocate device RX queue\n");
6029		iflib_tx_structures_free(ctx);
6030		free(vaddrs, M_IFLIB);
6031		free(paddrs, M_IFLIB);
6032		goto err_rx_desc;
6033	}
6034	free(vaddrs, M_IFLIB);
6035	free(paddrs, M_IFLIB);
6036
6037	return (0);
6038
6039/* XXX handle allocation failure changes */
6040err_rx_desc:
6041err_tx_desc:
6042rx_fail:
6043	if (ctx->ifc_rxqs != NULL)
6044		free(ctx->ifc_rxqs, M_IFLIB);
6045	ctx->ifc_rxqs = NULL;
6046	if (ctx->ifc_txqs != NULL)
6047		free(ctx->ifc_txqs, M_IFLIB);
6048	ctx->ifc_txqs = NULL;
6049fail:
6050	return (err);
6051}
6052
6053static int
6054iflib_tx_structures_setup(if_ctx_t ctx)
6055{
6056	iflib_txq_t txq = ctx->ifc_txqs;
6057	int i;
6058
6059	for (i = 0; i < NTXQSETS(ctx); i++, txq++)
6060		iflib_txq_setup(txq);
6061
6062	return (0);
6063}
6064
6065static void
6066iflib_tx_structures_free(if_ctx_t ctx)
6067{
6068	iflib_txq_t txq = ctx->ifc_txqs;
6069	if_shared_ctx_t sctx = ctx->ifc_sctx;
6070	int i, j;
6071
6072	for (i = 0; i < NTXQSETS(ctx); i++, txq++) {
6073		for (j = 0; j < sctx->isc_ntxqs; j++)
6074			iflib_dma_free(&txq->ift_ifdi[j]);
6075		iflib_txq_destroy(txq);
6076	}
6077	free(ctx->ifc_txqs, M_IFLIB);
6078	ctx->ifc_txqs = NULL;
6079}
6080
6081/*********************************************************************
6082 *
6083 *  Initialize all receive rings.
6084 *
6085 **********************************************************************/
6086static int
6087iflib_rx_structures_setup(if_ctx_t ctx)
6088{
6089	iflib_rxq_t rxq = ctx->ifc_rxqs;
6090	int q;
6091#if defined(INET6) || defined(INET)
6092	int err, i;
6093#endif
6094
6095	for (q = 0; q < ctx->ifc_softc_ctx.isc_nrxqsets; q++, rxq++) {
6096#if defined(INET6) || defined(INET)
6097		err = tcp_lro_init_args(&rxq->ifr_lc, ctx->ifc_ifp,
6098		    TCP_LRO_ENTRIES, min(1024,
6099		    ctx->ifc_softc_ctx.isc_nrxd[rxq->ifr_fl_offset]));
6100		if (err != 0) {
6101			device_printf(ctx->ifc_dev,
6102			    "LRO Initialization failed!\n");
6103			goto fail;
6104		}
6105#endif
6106		IFDI_RXQ_SETUP(ctx, rxq->ifr_id);
6107	}
6108	return (0);
6109#if defined(INET6) || defined(INET)
6110fail:
6111	/*
6112	 * Free LRO resources allocated so far; we will only handle
6113	 * the rings that completed, as the failing case will have
6114	 * cleaned up for itself.  'q' failed, so it's the terminus.
6115	 */
6116	rxq = ctx->ifc_rxqs;
6117	for (i = 0; i < q; ++i, rxq++) {
6118		tcp_lro_free(&rxq->ifr_lc);
6119	}
6120	return (err);
6121#endif
6122}
6123
6124/*********************************************************************
6125 *
6126 *  Free all receive rings.
6127 *
6128 **********************************************************************/
6129static void
6130iflib_rx_structures_free(if_ctx_t ctx)
6131{
6132	iflib_rxq_t rxq = ctx->ifc_rxqs;
6133	if_shared_ctx_t sctx = ctx->ifc_sctx;
6134	int i, j;
6135
6136	for (i = 0; i < ctx->ifc_softc_ctx.isc_nrxqsets; i++, rxq++) {
6137		for (j = 0; j < sctx->isc_nrxqs; j++)
6138			iflib_dma_free(&rxq->ifr_ifdi[j]);
6139		iflib_rx_sds_free(rxq);
6140#if defined(INET6) || defined(INET)
6141		tcp_lro_free(&rxq->ifr_lc);
6142#endif
6143	}
6144	free(ctx->ifc_rxqs, M_IFLIB);
6145	ctx->ifc_rxqs = NULL;
6146}
6147
6148static int
6149iflib_qset_structures_setup(if_ctx_t ctx)
6150{
6151	int err;
6152
6153	/*
6154	 * It is expected that the caller takes care of freeing queues if this
6155	 * fails.
6156	 */
6157	if ((err = iflib_tx_structures_setup(ctx)) != 0) {
6158		device_printf(ctx->ifc_dev, "iflib_tx_structures_setup failed: %d\n", err);
6159		return (err);
6160	}
6161
6162	if ((err = iflib_rx_structures_setup(ctx)) != 0)
6163		device_printf(ctx->ifc_dev, "iflib_rx_structures_setup failed: %d\n", err);
6164
6165	return (err);
6166}
6167
6168int
6169iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
6170		driver_filter_t filter, void *filter_arg, driver_intr_t handler, void *arg, const char *name)
6171{
6172
6173	return (_iflib_irq_alloc(ctx, irq, rid, filter, handler, arg, name));
6174}
6175
6176/* Just to avoid copy/paste */
6177static inline int
6178iflib_irq_set_affinity(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type,
6179    int qid, struct grouptask *gtask, struct taskqgroup *tqg, void *uniq,
6180    const char *name)
6181{
6182	device_t dev;
6183	unsigned int base_cpuid, cpuid;
6184	int err;
6185
6186	dev = ctx->ifc_dev;
6187	base_cpuid = ctx->ifc_sysctl_core_offset;
6188	cpuid = get_cpuid_for_queue(ctx, base_cpuid, qid, type == IFLIB_INTR_TX);
6189	err = taskqgroup_attach_cpu(tqg, gtask, uniq, cpuid, dev,
6190	    irq ? irq->ii_res : NULL, name);
6191	if (err) {
6192		device_printf(dev, "taskqgroup_attach_cpu failed %d\n", err);
6193		return (err);
6194	}
6195#ifdef notyet
6196	if (cpuid > ctx->ifc_cpuid_highest)
6197		ctx->ifc_cpuid_highest = cpuid;
6198#endif
6199	return (0);
6200}
6201
6202/*
6203 * Allocate a hardware interrupt for subctx using the parent (ctx)'s hardware
6204 * resources.
6205 *
6206 * Similar to iflib_irq_alloc_generic(), but for interrupt type IFLIB_INTR_RXTX
6207 * only.
6208 *
6209 * XXX: Could be removed if subctx's dev has its intr resource allocation
6210 * methods replaced with custom ones?
6211 */
6212int
6213iflib_irq_alloc_generic_subctx(if_ctx_t ctx, if_ctx_t subctx, if_irq_t irq,
6214			       int rid, iflib_intr_type_t type,
6215			       driver_filter_t *filter, void *filter_arg,
6216			       int qid, const char *name)
6217{
6218	device_t dev, subdev;
6219	struct grouptask *gtask;
6220	struct taskqgroup *tqg;
6221	iflib_filter_info_t info;
6222	gtask_fn_t *fn;
6223	int tqrid, err;
6224	driver_filter_t *intr_fast;
6225	void *q;
6226
6227	MPASS(ctx != NULL);
6228	MPASS(subctx != NULL);
6229
6230	tqrid = rid;
6231	dev = ctx->ifc_dev;
6232	subdev = subctx->ifc_dev;
6233
6234	switch (type) {
6235	case IFLIB_INTR_RXTX:
6236		q = &subctx->ifc_rxqs[qid];
6237		info = &subctx->ifc_rxqs[qid].ifr_filter_info;
6238		gtask = &subctx->ifc_rxqs[qid].ifr_task;
6239		tqg = qgroup_if_io_tqg;
6240		fn = _task_fn_rx;
6241		intr_fast = iflib_fast_intr_rxtx;
6242		NET_GROUPTASK_INIT(gtask, 0, fn, q);
6243		break;
6244	default:
6245		device_printf(dev, "%s: unknown net intr type for subctx %s (%d)\n",
6246		    __func__, device_get_nameunit(subdev), type);
6247		return (EINVAL);
6248	}
6249
6250	info->ifi_filter = filter;
6251	info->ifi_filter_arg = filter_arg;
6252	info->ifi_task = gtask;
6253	info->ifi_ctx = q;
6254
6256
6257	/* Allocate interrupts from hardware using parent context */
6258	err = _iflib_irq_alloc(ctx, irq, rid, intr_fast, NULL, info, name);
6259	if (err != 0) {
6260		device_printf(dev, "_iflib_irq_alloc failed for subctx %s: %d\n",
6261		    device_get_nameunit(subdev), err);
6262		return (err);
6263	}
6264
6265	if (tqrid != -1) {
6266		err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q,
6267		    name);
6268		if (err)
6269			return (err);
6270	} else {
6271		taskqgroup_attach(tqg, gtask, q, dev, irq->ii_res, name);
6272	}
6273
6274	return (0);
6275}
6276
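/*
 * Allocate a hardware interrupt of the requested type, initialize the
 * matching grouptask, and attach it to the appropriate taskqgroup with CPU
 * affinity derived from the configured core offset.
 */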
6277int
6278iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid,
6279			iflib_intr_type_t type, driver_filter_t *filter,
6280			void *filter_arg, int qid, const char *name)
6281{
6282	device_t dev;
6283	struct grouptask *gtask;
6284	struct taskqgroup *tqg;
6285	iflib_filter_info_t info;
6286	gtask_fn_t *fn;
6287	int tqrid, err;
6288	driver_filter_t *intr_fast;
6289	void *q;
6290
6291	info = &ctx->ifc_filter_info;
6292	tqrid = rid;
6293
6294	switch (type) {
6295	/* XXX merge tx/rx for netmap? */
6296	case IFLIB_INTR_TX:
6297		q = &ctx->ifc_txqs[qid];
6298		info = &ctx->ifc_txqs[qid].ift_filter_info;
6299		gtask = &ctx->ifc_txqs[qid].ift_task;
6300		tqg = qgroup_if_io_tqg;
6301		fn = _task_fn_tx;
6302		intr_fast = iflib_fast_intr;
6303		GROUPTASK_INIT(gtask, 0, fn, q);
6304		ctx->ifc_flags |= IFC_NETMAP_TX_IRQ;
6305		break;
6306	case IFLIB_INTR_RX:
6307		q = &ctx->ifc_rxqs[qid];
6308		info = &ctx->ifc_rxqs[qid].ifr_filter_info;
6309		gtask = &ctx->ifc_rxqs[qid].ifr_task;
6310		tqg = qgroup_if_io_tqg;
6311		fn = _task_fn_rx;
6312		intr_fast = iflib_fast_intr;
6313		NET_GROUPTASK_INIT(gtask, 0, fn, q);
6314		break;
6315	case IFLIB_INTR_RXTX:
6316		q = &ctx->ifc_rxqs[qid];
6317		info = &ctx->ifc_rxqs[qid].ifr_filter_info;
6318		gtask = &ctx->ifc_rxqs[qid].ifr_task;
6319		tqg = qgroup_if_io_tqg;
6320		fn = _task_fn_rx;
6321		intr_fast = iflib_fast_intr_rxtx;
6322		NET_GROUPTASK_INIT(gtask, 0, fn, q);
6323		break;
6324	case IFLIB_INTR_ADMIN:
6325		q = ctx;
6326		tqrid = -1;
6327		info = &ctx->ifc_filter_info;
6328		gtask = &ctx->ifc_admin_task;
6329		tqg = qgroup_if_config_tqg;
6330		fn = _task_fn_admin;
6331		intr_fast = iflib_fast_intr_ctx;
6332		break;
6333	default:
6334		device_printf(ctx->ifc_dev, "%s: unknown net intr type\n",
6335		    __func__);
6336		return (EINVAL);
6337	}
6338
6339	info->ifi_filter = filter;
6340	info->ifi_filter_arg = filter_arg;
6341	info->ifi_task = gtask;
6342	info->ifi_ctx = q;
6343
6344	dev = ctx->ifc_dev;
6345	err = _iflib_irq_alloc(ctx, irq, rid, intr_fast, NULL, info, name);
6346	if (err != 0) {
6347		device_printf(dev, "_iflib_irq_alloc failed %d\n", err);
6348		return (err);
6349	}
6350	if (type == IFLIB_INTR_ADMIN)
6351		return (0);
6352
6353	if (tqrid != -1) {
6354		err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q,
6355		    name);
6356		if (err)
6357			return (err);
6358	} else {
6359		taskqgroup_attach(tqg, gtask, q, dev, irq->ii_res, name);
6360	}
6361
6362	return (0);
6363}
6364
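/*
 * Like iflib_irq_alloc_generic() but for queues that share an existing
 * interrupt: only the grouptask is initialized and attached, no new
 * hardware vector is allocated.
 */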
6365void
6366iflib_softirq_alloc_generic(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type,
6367			    void *arg, int qid, const char *name)
6368{
6369	device_t dev;
6370	struct grouptask *gtask;
6371	struct taskqgroup *tqg;
6372	gtask_fn_t *fn;
6373	void *q;
6374	int err;
6375
6376	switch (type) {
6377	case IFLIB_INTR_TX:
6378		q = &ctx->ifc_txqs[qid];
6379		gtask = &ctx->ifc_txqs[qid].ift_task;
6380		tqg = qgroup_if_io_tqg;
6381		fn = _task_fn_tx;
6382		GROUPTASK_INIT(gtask, 0, fn, q);
6383		break;
6384	case IFLIB_INTR_RX:
6385		q = &ctx->ifc_rxqs[qid];
6386		gtask = &ctx->ifc_rxqs[qid].ifr_task;
6387		tqg = qgroup_if_io_tqg;
6388		fn = _task_fn_rx;
6389		NET_GROUPTASK_INIT(gtask, 0, fn, q);
6390		break;
6391	case IFLIB_INTR_IOV:
6392		q = ctx;
6393		gtask = &ctx->ifc_vflr_task;
6394		tqg = qgroup_if_config_tqg;
6395		fn = _task_fn_iov;
6396		GROUPTASK_INIT(gtask, 0, fn, q);
6397		break;
6398	default:
6399		panic("unknown net intr type");
6400	}
6401	err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q, name);
6402	if (err) {
6403		dev = ctx->ifc_dev;
6404		taskqgroup_attach(tqg, gtask, q, dev, irq ? irq->ii_res : NULL,
6405		    name);
6406	}
6407}
6408
6409void
6410iflib_irq_free(if_ctx_t ctx, if_irq_t irq)
6411{
6412
6413	if (irq->ii_tag)
6414		bus_teardown_intr(ctx->ifc_dev, irq->ii_res, irq->ii_tag);
6415
6416	if (irq->ii_res)
6417		bus_release_resource(ctx->ifc_dev, SYS_RES_IRQ,
6418		    rman_get_rid(irq->ii_res), irq->ii_res);
6419}
6420
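/*
 * Set up a single shared interrupt for INTx/MSI operation.  One hardware
 * vector services RX queue 0 (and TX completions, unless the driver sets
 * IFLIB_SINGLE_IRQ_RX_ONLY); the RX and TX grouptasks are attached to the
 * same interrupt resource.
 */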
6421static int
6422iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filter_arg, int *rid, const char *name)
6423{
6424	iflib_txq_t txq = ctx->ifc_txqs;
6425	iflib_rxq_t rxq = ctx->ifc_rxqs;
6426	if_irq_t irq = &ctx->ifc_legacy_irq;
6427	iflib_filter_info_t info;
6428	device_t dev;
6429	struct grouptask *gtask;
6430	struct resource *res;
6431	struct taskqgroup *tqg;
6432	void *q;
6433	int err, tqrid;
6434	bool rx_only;
6435
6436	q = &ctx->ifc_rxqs[0];
6437	info = &rxq[0].ifr_filter_info;
6438	gtask = &rxq[0].ifr_task;
6439	tqg = qgroup_if_io_tqg;
6440	tqrid = *rid;
6441	rx_only = (ctx->ifc_sctx->isc_flags & IFLIB_SINGLE_IRQ_RX_ONLY) != 0;
6442
6443	ctx->ifc_flags |= IFC_LEGACY;
6444	info->ifi_filter = filter;
6445	info->ifi_filter_arg = filter_arg;
6446	info->ifi_task = gtask;
6447	info->ifi_ctx = rx_only ? ctx : q;
6448
6449	dev = ctx->ifc_dev;
6450	/* We allocate a single interrupt resource */
6451	err = _iflib_irq_alloc(ctx, irq, tqrid, rx_only ? iflib_fast_intr_ctx :
6452	    iflib_fast_intr_rxtx, NULL, info, name);
6453	if (err != 0)
6454		return (err);
6455	NET_GROUPTASK_INIT(gtask, 0, _task_fn_rx, q);
6456	res = irq->ii_res;
6457	taskqgroup_attach(tqg, gtask, q, dev, res, name);
6458
6459	GROUPTASK_INIT(&txq->ift_task, 0, _task_fn_tx, txq);
6460	taskqgroup_attach(qgroup_if_io_tqg, &txq->ift_task, txq, dev, res,
6461	    "tx");
6462	return (0);
6463}
6464
6465void
6466iflib_led_create(if_ctx_t ctx)
6467{
6468
6469	ctx->ifc_led_dev = led_create(iflib_led_func, ctx,
6470	    device_get_nameunit(ctx->ifc_dev));
6471}
6472
6473void
6474iflib_tx_intr_deferred(if_ctx_t ctx, int txqid)
6475{
6476
6477	GROUPTASK_ENQUEUE(&ctx->ifc_txqs[txqid].ift_task);
6478}
6479
6480void
6481iflib_rx_intr_deferred(if_ctx_t ctx, int rxqid)
6482{
6483
6484	GROUPTASK_ENQUEUE(&ctx->ifc_rxqs[rxqid].ifr_task);
6485}
6486
6487void
6488iflib_admin_intr_deferred(if_ctx_t ctx)
6489{
6490
6491	MPASS(ctx->ifc_admin_task.gt_taskqueue != NULL);
6492	GROUPTASK_ENQUEUE(&ctx->ifc_admin_task);
6493}
6494
6495void
6496iflib_iov_intr_deferred(if_ctx_t ctx)
6497{
6498
6499	GROUPTASK_ENQUEUE(&ctx->ifc_vflr_task);
6500}
6501
6502void
6503iflib_io_tqg_attach(struct grouptask *gt, void *uniq, int cpu, const char *name)
6504{
6505
6506	taskqgroup_attach_cpu(qgroup_if_io_tqg, gt, uniq, cpu, NULL, NULL,
6507	    name);
6508}
6509
6510void
6511iflib_config_gtask_init(void *ctx, struct grouptask *gtask, gtask_fn_t *fn,
6512	const char *name)
6513{
6514
6515	GROUPTASK_INIT(gtask, 0, fn, ctx);
6516	taskqgroup_attach(qgroup_if_config_tqg, gtask, gtask, NULL, NULL,
6517	    name);
6518}
6519
6520void
6521iflib_config_gtask_deinit(struct grouptask *gtask)
6522{
6523
6524	taskqgroup_detach(qgroup_if_config_tqg, gtask);
6525}
6526
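/*
 * Propagate a driver-reported link state change to the stack: update the
 * baudrate, enable prefetching at 10 Gb/s and above, and mark the TX queues
 * idle on link loss so the watchdog does not fire.
 */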
6527void
6528iflib_link_state_change(if_ctx_t ctx, int link_state, uint64_t baudrate)
6529{
6530	if_t ifp = ctx->ifc_ifp;
6531	iflib_txq_t txq = ctx->ifc_txqs;
6532
6533	if_setbaudrate(ifp, baudrate);
6534	if (baudrate >= IF_Gbps(10)) {
6535		STATE_LOCK(ctx);
6536		ctx->ifc_flags |= IFC_PREFETCH;
6537		STATE_UNLOCK(ctx);
6538	}
6539	/* If link down, disable watchdog */
6540	if ((ctx->ifc_link_state == LINK_STATE_UP) && (link_state == LINK_STATE_DOWN)) {
6541		for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxqsets; i++, txq++)
6542			txq->ift_qstatus = IFLIB_QUEUE_IDLE;
6543	}
6544	ctx->ifc_link_state = link_state;
6545	if_link_state_change(ifp, link_state);
6546}
6547
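/*
 * Query the driver for the number of TX descriptors the hardware has
 * completed and advance the queue's processed counts, wrapping the
 * processed consumer index at the ring size.
 */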
6548static int
6549iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq)
6550{
6551	int credits;
6552#ifdef INVARIANTS
6553	int credits_pre = txq->ift_cidx_processed;
6554#endif
6555
6556	bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
6557	    BUS_DMASYNC_POSTREAD);
6558	if ((credits = ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, true)) == 0)
6559		return (0);
6560
6561	txq->ift_processed += credits;
6562	txq->ift_cidx_processed += credits;
6563
6564	MPASS(credits_pre + credits == txq->ift_cidx_processed);
6565	if (txq->ift_cidx_processed >= txq->ift_size)
6566		txq->ift_cidx_processed -= txq->ift_size;
6567	return (credits);
6568}
6569
6570static int
6571iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget)
6572{
6573	iflib_fl_t fl;
6574	u_int i;
6575
6576	for (i = 0, fl = &rxq->ifr_fl[0]; i < rxq->ifr_nfl; i++, fl++)
6577		bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
6578		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
6579	return (ctx->isc_rxd_available(ctx->ifc_softc, rxq->ifr_id, cidx,
6580	    budget));
6581}
6582
6583void
6584iflib_add_int_delay_sysctl(if_ctx_t ctx, const char *name,
6585	const char *description, if_int_delay_info_t info,
6586	int offset, int value)
6587{
6588	info->iidi_ctx = ctx;
6589	info->iidi_offset = offset;
6590	info->iidi_value = value;
6591	SYSCTL_ADD_PROC(device_get_sysctl_ctx(ctx->ifc_dev),
6592	    SYSCTL_CHILDREN(device_get_sysctl_tree(ctx->ifc_dev)),
6593	    OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
6594	    info, 0, iflib_sysctl_int_delay, "I", description);
6595}
6596
6597struct sx *
6598iflib_ctx_lock_get(if_ctx_t ctx)
6599{
6600
6601	return (&ctx->ifc_ctx_sx);
6602}
6603
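/*
 * Determine how many MSI-X vectors to use: map the MSI-X table BAR, size
 * the queue count from the available messages, CPUs and (with RSS) bucket
 * count, and allocate the vectors.  On failure fall back to MSI and
 * finally to a legacy interrupt.
 */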
6604static int
6605iflib_msix_init(if_ctx_t ctx)
6606{
6607	device_t dev = ctx->ifc_dev;
6608	if_shared_ctx_t sctx = ctx->ifc_sctx;
6609	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
6610	int admincnt, bar, err, iflib_num_rx_queues, iflib_num_tx_queues;
6611	int msgs, queuemsgs, queues, rx_queues, tx_queues, vectors;
6612
6613	iflib_num_tx_queues = ctx->ifc_sysctl_ntxqs;
6614	iflib_num_rx_queues = ctx->ifc_sysctl_nrxqs;
6615
6616	if (bootverbose)
6617		device_printf(dev, "msix_init qsets capped at %d\n",
6618		    imax(scctx->isc_ntxqsets, scctx->isc_nrxqsets));
6619
6620	/* Override by tuneable */
6621	if (scctx->isc_disable_msix)
6622		goto msi;
6623
6624	/* First try MSI-X */
6625	if ((msgs = pci_msix_count(dev)) == 0) {
6626		if (bootverbose)
6627			device_printf(dev, "MSI-X not supported or disabled\n");
6628		goto msi;
6629	}
6630
6631	bar = ctx->ifc_softc_ctx.isc_msix_bar;
6632	/*
6633	 * bar == -1 => "trust me I know what I'm doing"
6634	 * Some drivers are for hardware that is so shoddily
6635	 * documented that no one knows which bars are which
6636	 * so the developer has to map all bars. This hack
6637	 * allows shoddy garbage to use MSI-X in this framework.
6638	 */
6639	if (bar != -1) {
6640		ctx->ifc_msix_mem = bus_alloc_resource_any(dev,
6641		    SYS_RES_MEMORY, &bar, RF_ACTIVE);
6642		if (ctx->ifc_msix_mem == NULL) {
6643			device_printf(dev, "Unable to map MSI-X table\n");
6644			goto msi;
6645		}
6646	}
6647
6648	admincnt = sctx->isc_admin_intrcnt;
6649#if IFLIB_DEBUG
6650	/* use only 1 qset in debug mode */
6651	queuemsgs = min(msgs - admincnt, 1);
6652#else
6653	queuemsgs = msgs - admincnt;
6654#endif
6655#ifdef RSS
6656	queues = imin(queuemsgs, rss_getnumbuckets());
6657#else
6658	queues = queuemsgs;
6659#endif
6660	queues = imin(CPU_COUNT(&ctx->ifc_cpus), queues);
6661	if (bootverbose)
6662		device_printf(dev,
6663		    "intr CPUs: %d queue msgs: %d admincnt: %d\n",
6664		    CPU_COUNT(&ctx->ifc_cpus), queuemsgs, admincnt);
6665#ifdef  RSS
6666	/* If we're doing RSS, clamp at the number of RSS buckets */
6667	if (queues > rss_getnumbuckets())
6668		queues = rss_getnumbuckets();
6669#endif
6670	if (iflib_num_rx_queues > 0 && iflib_num_rx_queues < queuemsgs - admincnt)
6671		rx_queues = iflib_num_rx_queues;
6672	else
6673		rx_queues = queues;
6674
6675	if (rx_queues > scctx->isc_nrxqsets)
6676		rx_queues = scctx->isc_nrxqsets;
6677
6678	/*
6679	 * We want this to be all logical CPUs by default
6680	 */
6681	if (iflib_num_tx_queues > 0 && iflib_num_tx_queues < queues)
6682		tx_queues = iflib_num_tx_queues;
6683	else
6684		tx_queues = mp_ncpus;
6685
6686	if (tx_queues > scctx->isc_ntxqsets)
6687		tx_queues = scctx->isc_ntxqsets;
6688
6689	if (ctx->ifc_sysctl_qs_eq_override == 0) {
6690#ifdef INVARIANTS
6691		if (tx_queues != rx_queues)
6692			device_printf(dev,
6693			    "queue equality override not set, capping rx_queues at %d and tx_queues at %d\n",
6694			    min(rx_queues, tx_queues), min(rx_queues, tx_queues));
6695#endif
6696		tx_queues = min(rx_queues, tx_queues);
6697		rx_queues = min(rx_queues, tx_queues);
6698	}
6699
6700	vectors = rx_queues + admincnt;
6701	if (msgs < vectors) {
6702		device_printf(dev,
6703		    "insufficient number of MSI-X vectors "
6704		    "(supported %d, need %d)\n", msgs, vectors);
6705		goto msi;
6706	}
6707
6708	device_printf(dev, "Using %d RX queues %d TX queues\n", rx_queues,
6709	    tx_queues);
6710	msgs = vectors;
6711	if ((err = pci_alloc_msix(dev, &vectors)) == 0) {
6712		if (vectors != msgs) {
6713			device_printf(dev,
6714			    "Unable to allocate sufficient MSI-X vectors "
6715			    "(got %d, need %d)\n", vectors, msgs);
6716			pci_release_msi(dev);
6717			if (bar != -1) {
6718				bus_release_resource(dev, SYS_RES_MEMORY, bar,
6719				    ctx->ifc_msix_mem);
6720				ctx->ifc_msix_mem = NULL;
6721			}
6722			goto msi;
6723		}
6724		device_printf(dev, "Using MSI-X interrupts with %d vectors\n",
6725		    vectors);
6726		scctx->isc_vectors = vectors;
6727		scctx->isc_nrxqsets = rx_queues;
6728		scctx->isc_ntxqsets = tx_queues;
6729		scctx->isc_intr = IFLIB_INTR_MSIX;
6730
6731		return (vectors);
6732	} else {
6733		device_printf(dev,
6734		    "failed to allocate %d MSI-X vectors, err: %d\n", vectors,
6735		    err);
6736		if (bar != -1) {
6737			bus_release_resource(dev, SYS_RES_MEMORY, bar,
6738			    ctx->ifc_msix_mem);
6739			ctx->ifc_msix_mem = NULL;
6740		}
6741	}
6742
6743msi:
6744	vectors = pci_msi_count(dev);
6745	scctx->isc_nrxqsets = 1;
6746	scctx->isc_ntxqsets = 1;
6747	scctx->isc_vectors = vectors;
6748	if (vectors == 1 && pci_alloc_msi(dev, &vectors) == 0) {
6749		device_printf(dev, "Using an MSI interrupt\n");
6750		scctx->isc_intr = IFLIB_INTR_MSI;
6751	} else {
6752		scctx->isc_vectors = 1;
6753		device_printf(dev, "Using a Legacy interrupt\n");
6754		scctx->isc_intr = IFLIB_INTR_LEGACY;
6755	}
6756
6757	return (vectors);
6758}
6759
6760static const char *ring_states[] = { "IDLE", "BUSY", "STALLED", "ABDICATED" };
6761
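/*
 * Sysctl handler that renders an mp_ring's producer/consumer indexes and
 * state word as a human-readable string.
 */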
6762static int
6763mp_ring_state_handler(SYSCTL_HANDLER_ARGS)
6764{
6765	int rc;
6766	uint16_t *state = ((uint16_t *)oidp->oid_arg1);
6767	struct sbuf *sb;
6768	const char *ring_state = "UNKNOWN";
6769
6770	/* XXX needed ? */
6771	rc = sysctl_wire_old_buffer(req, 0);
6772	MPASS(rc == 0);
6773	if (rc != 0)
6774		return (rc);
6775	sb = sbuf_new_for_sysctl(NULL, NULL, 80, req);
6776	MPASS(sb != NULL);
6777	if (sb == NULL)
6778		return (ENOMEM);
6779	if (state[3] <= 3)
6780		ring_state = ring_states[state[3]];
6781
6782	sbuf_printf(sb, "pidx_head: %04hd pidx_tail: %04hd cidx: %04hd state: %s",
6783		    state[0], state[1], state[2], ring_state);
6784	rc = sbuf_finish(sb);
6785	sbuf_delete(sb);
6786	return (rc);
6787}
6788
6789enum iflib_ndesc_handler {
6790	IFLIB_NTXD_HANDLER,
6791	IFLIB_NRXD_HANDLER,
6792};
6793
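/*
 * Sysctl handler for the override_ntxds/override_nrxds tunables: report the
 * per-queue descriptor counts as a comma-separated list and parse a
 * user-supplied list back into the context.
 */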
6794static int
6795mp_ndesc_handler(SYSCTL_HANDLER_ARGS)
6796{
6797	if_ctx_t ctx = (void *)arg1;
6798	enum iflib_ndesc_handler type = arg2;
6799	char buf[256] = {0};
6800	qidx_t *ndesc;
6801	char *p, *next;
6802	int nqs, rc, i;
6803
6804	nqs = 8;
6805	switch (type) {
6806	case IFLIB_NTXD_HANDLER:
6807		ndesc = ctx->ifc_sysctl_ntxds;
6808		if (ctx->ifc_sctx)
6809			nqs = ctx->ifc_sctx->isc_ntxqs;
6810		break;
6811	case IFLIB_NRXD_HANDLER:
6812		ndesc = ctx->ifc_sysctl_nrxds;
6813		if (ctx->ifc_sctx)
6814			nqs = ctx->ifc_sctx->isc_nrxqs;
6815		break;
6816	default:
6817		printf("%s: unhandled type\n", __func__);
6818		return (EINVAL);
6819	}
6820	if (nqs == 0)
6821		nqs = 8;
6822
6823	for (i = 0; i < 8; i++) {
6824		if (i >= nqs)
6825			break;
6826		if (i)
6827			strcat(buf, ",");
6828		sprintf(strchr(buf, 0), "%d", ndesc[i]);
6829	}
6830
6831	rc = sysctl_handle_string(oidp, buf, sizeof(buf), req);
6832	if (rc || req->newptr == NULL)
6833		return (rc);
6834
6835	for (i = 0, next = buf, p = strsep(&next, " ,"); i < 8 && p;
6836	    i++, p = strsep(&next, " ,")) {
6837		ndesc[i] = strtoul(p, NULL, 10);
6838	}
6839
6840	return (rc);
6841}
6842
6843#define NAME_BUFLEN 32
6844static void
6845iflib_add_device_sysctl_pre(if_ctx_t ctx)
6846{
6847	device_t dev = iflib_get_dev(ctx);
6848	struct sysctl_oid_list *child, *oid_list;
6849	struct sysctl_ctx_list *ctx_list;
6850	struct sysctl_oid *node;
6851
6852	ctx_list = device_get_sysctl_ctx(dev);
6853	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
6854	ctx->ifc_sysctl_node = node = SYSCTL_ADD_NODE(ctx_list, child,
6855	    OID_AUTO, "iflib", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
6856	    "IFLIB fields");
6857	oid_list = SYSCTL_CHILDREN(node);
6858
6859	SYSCTL_ADD_CONST_STRING(ctx_list, oid_list, OID_AUTO, "driver_version",
6860	    CTLFLAG_RD, ctx->ifc_sctx->isc_driver_version, "driver version");
6861
6862	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_ntxqs",
6863	    CTLFLAG_RWTUN, &ctx->ifc_sysctl_ntxqs, 0,
6864	    "# of txqs to use, 0 => use default #");
6865	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_nrxqs",
6866	    CTLFLAG_RWTUN, &ctx->ifc_sysctl_nrxqs, 0,
6867	    "# of rxqs to use, 0 => use default #");
6868	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_qs_enable",
6869	    CTLFLAG_RWTUN, &ctx->ifc_sysctl_qs_eq_override, 0,
6870	    "permit #txq != #rxq");
6871	SYSCTL_ADD_INT(ctx_list, oid_list, OID_AUTO, "disable_msix",
6872	    CTLFLAG_RWTUN, &ctx->ifc_softc_ctx.isc_disable_msix, 0,
6873	    "disable MSI-X (default 0)");
6874	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "rx_budget",
6875	    CTLFLAG_RWTUN, &ctx->ifc_sysctl_rx_budget, 0, "set the RX budget");
6876	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "tx_abdicate",
6877	    CTLFLAG_RWTUN, &ctx->ifc_sysctl_tx_abdicate, 0,
6878	    "cause TX to abdicate instead of running to completion");
6879	ctx->ifc_sysctl_core_offset = CORE_OFFSET_UNSPECIFIED;
6880	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "core_offset",
6881	    CTLFLAG_RDTUN, &ctx->ifc_sysctl_core_offset, 0,
6882	    "offset to start using cores at");
6883	SYSCTL_ADD_U8(ctx_list, oid_list, OID_AUTO, "separate_txrx",
6884	    CTLFLAG_RDTUN, &ctx->ifc_sysctl_separate_txrx, 0,
6885	    "use separate cores for TX and RX");
6886	SYSCTL_ADD_U8(ctx_list, oid_list, OID_AUTO, "use_logical_cores",
6887	    CTLFLAG_RDTUN, &ctx->ifc_sysctl_use_logical_cores, 0,
6888	    "try to make use of logical cores for TX and RX");
6889	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "use_extra_msix_vectors",
6890	    CTLFLAG_RDTUN, &ctx->ifc_sysctl_extra_msix_vectors, 0,
6891	    "attempt to reserve the given number of extra MSI-X vectors during driver load for the creation of additional interfaces later");
6892	SYSCTL_ADD_INT(ctx_list, oid_list, OID_AUTO, "allocated_msix_vectors",
6893	    CTLFLAG_RDTUN, &ctx->ifc_softc_ctx.isc_vectors, 0,
6894	    "total # of MSI-X vectors allocated by driver");
6895
6896	/* XXX change for per-queue sizes */
6897	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_ntxds",
6898	    CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, ctx,
6899	    IFLIB_NTXD_HANDLER, mp_ndesc_handler, "A",
6900	    "list of # of TX descriptors to use, 0 = use default #");
6901	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_nrxds",
6902	    CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, ctx,
6903	    IFLIB_NRXD_HANDLER, mp_ndesc_handler, "A",
6904	    "list of # of RX descriptors to use, 0 = use default #");
6905}
6906
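/*
 * Add the per-txq and per-rxq sysctl nodes once the final queue counts are
 * known, exposing ring indexes, mp_ring statistics, and free list state.
 */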
6907static void
6908iflib_add_device_sysctl_post(if_ctx_t ctx)
6909{
6910	if_shared_ctx_t sctx = ctx->ifc_sctx;
6911	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
6912	device_t dev = iflib_get_dev(ctx);
6913	struct sysctl_oid_list *child;
6914	struct sysctl_ctx_list *ctx_list;
6915	iflib_fl_t fl;
6916	iflib_txq_t txq;
6917	iflib_rxq_t rxq;
6918	int i, j;
6919	char namebuf[NAME_BUFLEN];
6920	char *qfmt;
6921	struct sysctl_oid *queue_node, *fl_node, *node;
6922	struct sysctl_oid_list *queue_list, *fl_list;
6923	ctx_list = device_get_sysctl_ctx(dev);
6924
6925	node = ctx->ifc_sysctl_node;
6926	child = SYSCTL_CHILDREN(node);
6927
6928	if (scctx->isc_ntxqsets > 100)
6929		qfmt = "txq%03d";
6930	else if (scctx->isc_ntxqsets > 10)
6931		qfmt = "txq%02d";
6932	else
6933		qfmt = "txq%d";
6934	for (i = 0, txq = ctx->ifc_txqs; i < scctx->isc_ntxqsets; i++, txq++) {
6935		snprintf(namebuf, NAME_BUFLEN, qfmt, i);
6936		queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
6937		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name");
6938		queue_list = SYSCTL_CHILDREN(queue_node);
6939		SYSCTL_ADD_INT(ctx_list, queue_list, OID_AUTO, "cpu",
6940		    CTLFLAG_RD, &txq->ift_task.gt_cpu, 0,
6941		    "cpu this queue is bound to");
6942#if MEMORY_LOGGING
6943		SYSCTL_ADD_UQUAD(ctx_list, queue_list, OID_AUTO, "txq_dequeued",
6944		    CTLFLAG_RD, &txq->ift_dequeued, "total mbufs freed");
6945		SYSCTL_ADD_UQUAD(ctx_list, queue_list, OID_AUTO, "txq_enqueued",
6946		    CTLFLAG_RD, &txq->ift_enqueued, "total mbufs enqueued");
6947#endif
6948		SYSCTL_ADD_UQUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag",
6949		    CTLFLAG_RD, &txq->ift_mbuf_defrag,
6950		    "# of times m_defrag was called");
6951		SYSCTL_ADD_UQUAD(ctx_list, queue_list, OID_AUTO, "m_pullups",
6952		    CTLFLAG_RD, &txq->ift_pullups,
6953		    "# of times m_pullup was called");
6954		SYSCTL_ADD_UQUAD(ctx_list, queue_list, OID_AUTO,
6955		    "mbuf_defrag_failed", CTLFLAG_RD,
6956		    &txq->ift_mbuf_defrag_failed, "# of times m_defrag failed");
6957		SYSCTL_ADD_UQUAD(ctx_list, queue_list, OID_AUTO,
6958		    "no_desc_avail", CTLFLAG_RD, &txq->ift_no_desc_avail,
6959		    "# of times no descriptors were available");
6960		SYSCTL_ADD_UQUAD(ctx_list, queue_list, OID_AUTO,
6961		    "tx_map_failed", CTLFLAG_RD, &txq->ift_map_failed,
6962		    "# of times DMA map failed");
6963		SYSCTL_ADD_UQUAD(ctx_list, queue_list, OID_AUTO,
6964		    "txd_encap_efbig", CTLFLAG_RD, &txq->ift_txd_encap_efbig,
6965		    "# of times txd_encap returned EFBIG");
6966		SYSCTL_ADD_UQUAD(ctx_list, queue_list, OID_AUTO,
6967		    "no_tx_dma_setup", CTLFLAG_RD, &txq->ift_no_tx_dma_setup,
6968		    "# of times map failed for other than EFBIG");
6969		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_pidx",
6970		    CTLFLAG_RD, &txq->ift_pidx, 1, "Producer Index");
6971		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx",
6972		    CTLFLAG_RD, &txq->ift_cidx, 1, "Consumer Index");
6973		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO,
6974		    "txq_cidx_processed", CTLFLAG_RD, &txq->ift_cidx_processed,
6975		    1, "Consumer Index seen by credit update");
6976		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_in_use",
6977		    CTLFLAG_RD, &txq->ift_in_use, 1, "descriptors in use");
6978		SYSCTL_ADD_UQUAD(ctx_list, queue_list, OID_AUTO,
6979		    "txq_processed", CTLFLAG_RD, &txq->ift_processed,
6980		    "descriptors processed for clean");
6981		SYSCTL_ADD_UQUAD(ctx_list, queue_list, OID_AUTO, "txq_cleaned",
6982		    CTLFLAG_RD, &txq->ift_cleaned, "total cleaned");
6983		SYSCTL_ADD_PROC(ctx_list, queue_list, OID_AUTO, "ring_state",
6984		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
6985		    __DEVOLATILE(uint64_t *, &txq->ift_br->state), 0,
6986		    mp_ring_state_handler, "A", "soft ring state");
6987		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO,
6988		    "r_enqueues", CTLFLAG_RD, &txq->ift_br->enqueues,
6989		    "# of enqueues to the mp_ring for this queue");
6990		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO,
6991		    "r_drops", CTLFLAG_RD, &txq->ift_br->drops,
6992		    "# of drops in the mp_ring for this queue");
6993		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO,
6994		    "r_starts", CTLFLAG_RD, &txq->ift_br->starts,
6995		    "# of normal consumer starts in mp_ring for this queue");
6996		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO,
6997		    "r_stalls", CTLFLAG_RD, &txq->ift_br->stalls,
6998		    "# of consumer stalls in the mp_ring for this queue");
6999		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO,
7000		    "r_restarts", CTLFLAG_RD, &txq->ift_br->restarts,
7001		    "# of consumer restarts in the mp_ring for this queue");
7002		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO,
7003		    "r_abdications", CTLFLAG_RD, &txq->ift_br->abdications,
7004		    "# of consumer abdications in the mp_ring for this queue");
7005	}
7006
7007	if (scctx->isc_nrxqsets > 100)
7008		qfmt = "rxq%03d";
7009	else if (scctx->isc_nrxqsets > 10)
7010		qfmt = "rxq%02d";
7011	else
7012		qfmt = "rxq%d";
7013	for (i = 0, rxq = ctx->ifc_rxqs; i < scctx->isc_nrxqsets; i++, rxq++) {
7014		snprintf(namebuf, NAME_BUFLEN, qfmt, i);
7015		queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
7016		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name");
7017		queue_list = SYSCTL_CHILDREN(queue_node);
7018		SYSCTL_ADD_INT(ctx_list, queue_list, OID_AUTO, "cpu",
7019		    CTLFLAG_RD, &rxq->ifr_task.gt_cpu, 0,
7020		    "cpu this queue is bound to");
7021		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
7022			SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO,
7023			    "rxq_cq_cidx", CTLFLAG_RD, &rxq->ifr_cq_cidx, 1,
7024			    "Consumer Index");
7025		}
7026
7027		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
7028			snprintf(namebuf, NAME_BUFLEN, "rxq_fl%d", j);
7029			fl_node = SYSCTL_ADD_NODE(ctx_list, queue_list,
7030			    OID_AUTO, namebuf, CTLFLAG_RD | CTLFLAG_MPSAFE,
7031			    NULL, "freelist Name");
7032			fl_list = SYSCTL_CHILDREN(fl_node);
7033			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "pidx",
7034			    CTLFLAG_RD, &fl->ifl_pidx, 1, "Producer Index");
7035			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "cidx",
7036			    CTLFLAG_RD, &fl->ifl_cidx, 1, "Consumer Index");
7037			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "credits",
7038			    CTLFLAG_RD, &fl->ifl_credits, 1,
7039			    "credits available");
7040			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "buf_size",
7041			    CTLFLAG_RD, &fl->ifl_buf_size, 1, "buffer size");
7042#if MEMORY_LOGGING
7043			SYSCTL_ADD_UQUAD(ctx_list, fl_list, OID_AUTO,
7044			    "fl_m_enqueued", CTLFLAG_RD, &fl->ifl_m_enqueued,
7045			    "mbufs allocated");
7046			SYSCTL_ADD_UQUAD(ctx_list, fl_list, OID_AUTO,
7047			    "fl_m_dequeued", CTLFLAG_RD, &fl->ifl_m_dequeued,
7048			    "mbufs freed");
7049			SYSCTL_ADD_UQUAD(ctx_list, fl_list, OID_AUTO,
7050			    "fl_cl_enqueued", CTLFLAG_RD, &fl->ifl_cl_enqueued,
7051			    "clusters allocated");
7052			SYSCTL_ADD_UQUAD(ctx_list, fl_list, OID_AUTO,
7053			    "fl_cl_dequeued", CTLFLAG_RD, &fl->ifl_cl_dequeued,
7054			    "clusters freed");
7055#endif
7056		}
7057	}
7058
7059}
7060
7061void
7062iflib_request_reset(if_ctx_t ctx)
7063{
7064
7065	STATE_LOCK(ctx);
7066	ctx->ifc_flags |= IFC_DO_RESET;
7067	STATE_UNLOCK(ctx);
7068}
7069
7070#ifndef __NO_STRICT_ALIGNMENT
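/*
 * On strict-alignment platforms received frames may leave the protocol
 * headers misaligned; shift the frame within its mbuf, or copy the Ethernet
 * header into a separate mbuf for frames too large to shift in place.
 */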
7071static struct mbuf *
7072iflib_fixup_rx(struct mbuf *m)
7073{
7074	struct mbuf *n;
7075
7076	if (m->m_len <= (MCLBYTES - ETHER_HDR_LEN)) {
7077		bcopy(m->m_data, m->m_data + ETHER_HDR_LEN, m->m_len);
7078		m->m_data += ETHER_HDR_LEN;
7079		n = m;
7080	} else {
7081		MGETHDR(n, M_NOWAIT, MT_DATA);
7082		if (n == NULL) {
7083			m_freem(m);
7084			return (NULL);
7085		}
7086		bcopy(m->m_data, n->m_data, ETHER_HDR_LEN);
7087		m->m_data += ETHER_HDR_LEN;
7088		m->m_len -= ETHER_HDR_LEN;
7089		n->m_len = ETHER_HDR_LEN;
7090		M_MOVE_PKTHDR(n, m);
7091		n->m_next = m;
7092	}
7093	return (n);
7094}
7095#endif
7096
7097#ifdef DEBUGNET
7098static void
7099iflib_debugnet_init(if_t ifp, int *nrxr, int *ncl, int *clsize)
7100{
7101	if_ctx_t ctx;
7102
7103	ctx = if_getsoftc(ifp);
7104	CTX_LOCK(ctx);
7105	*nrxr = NRXQSETS(ctx);
7106	*ncl = ctx->ifc_rxqs[0].ifr_fl->ifl_size;
7107	*clsize = ctx->ifc_rxqs[0].ifr_fl->ifl_buf_size;
7108	CTX_UNLOCK(ctx);
7109}
7110
7111static void
7112iflib_debugnet_event(if_t ifp, enum debugnet_ev event)
7113{
7114	if_ctx_t ctx;
7115	if_softc_ctx_t scctx;
7116	iflib_fl_t fl;
7117	iflib_rxq_t rxq;
7118	int i, j;
7119
7120	ctx = if_getsoftc(ifp);
7121	scctx = &ctx->ifc_softc_ctx;
7122
7123	switch (event) {
7124	case DEBUGNET_START:
7125		for (i = 0; i < scctx->isc_nrxqsets; i++) {
7126			rxq = &ctx->ifc_rxqs[i];
7127			for (j = 0; j < rxq->ifr_nfl; j++) {
7128				fl = &rxq->ifr_fl[j];
7129				fl->ifl_zone = m_getzone(fl->ifl_buf_size);
7130			}
7131		}
7132		iflib_no_tx_batch = 1;
7133		break;
7134	default:
7135		break;
7136	}
7137}
7138
7139static int
7140iflib_debugnet_transmit(if_t ifp, struct mbuf *m)
7141{
7142	if_ctx_t ctx;
7143	iflib_txq_t txq;
7144	int error;
7145
7146	ctx = if_getsoftc(ifp);
7147	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
7148	    IFF_DRV_RUNNING)
7149		return (EBUSY);
7150
7151	txq = &ctx->ifc_txqs[0];
7152	error = iflib_encap(txq, &m);
7153	if (error == 0)
7154		(void)iflib_txd_db_check(txq, true);
7155	return (error);
7156}
7157
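/*
 * Debugnet poll hook: reclaim completed TX descriptors on queue 0 and run
 * the RX path on every RX queue to harvest incoming packets.
 */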
7158static int
7159iflib_debugnet_poll(if_t ifp, int count)
7160{
7161	struct epoch_tracker et;
7162	if_ctx_t ctx;
7163	if_softc_ctx_t scctx;
7164	iflib_txq_t txq;
7165	int i;
7166
7167	ctx = if_getsoftc(ifp);
7168	scctx = &ctx->ifc_softc_ctx;
7169
7170	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
7171	    IFF_DRV_RUNNING)
7172		return (EBUSY);
7173
7174	txq = &ctx->ifc_txqs[0];
7175	(void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
7176
7177	NET_EPOCH_ENTER(et);
7178	for (i = 0; i < scctx->isc_nrxqsets; i++)
7179		(void)iflib_rxeof(&ctx->ifc_rxqs[i], 16 /* XXX */);
7180	NET_EPOCH_EXIT(et);
7181	return (0);
7182}
7183#endif /* DEBUGNET */
7184