1/*-
2 * Copyright (c) 2011, Bryan Venteicher <bryanv@FreeBSD.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice unmodified, this list of conditions, and the following
10 *    disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27/* Driver for VirtIO network devices. */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD$");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/kernel.h>
35#include <sys/sockio.h>
36#include <sys/mbuf.h>
37#include <sys/malloc.h>
38#include <sys/module.h>
39#include <sys/socket.h>
40#include <sys/sysctl.h>
41#include <sys/random.h>
42#include <sys/sglist.h>
43#include <sys/lock.h>
44#include <sys/mutex.h>
45#include <sys/taskqueue.h>
46#include <sys/smp.h>
47#include <machine/smp.h>
48
49#include <vm/uma.h>
50
51#include <net/ethernet.h>
52#include <net/if.h>
53#include <net/if_arp.h>
54#include <net/if_dl.h>
55#include <net/if_types.h>
56#include <net/if_media.h>
57#include <net/if_vlan_var.h>
58
59#include <net/bpf.h>
60
61#include <netinet/in_systm.h>
62#include <netinet/in.h>
63#include <netinet/ip.h>
64#include <netinet/ip6.h>
65#include <netinet6/ip6_var.h>
66#include <netinet/udp.h>
67#include <netinet/tcp.h>
68#include <netinet/sctp.h>
69
70#include <machine/bus.h>
71#include <machine/resource.h>
72#include <sys/bus.h>
73#include <sys/rman.h>
74
75#include <dev/virtio/virtio.h>
76#include <dev/virtio/virtqueue.h>
77#include <dev/virtio/network/virtio_net.h>
78#include <dev/virtio/network/if_vtnetvar.h>
79
80#include "virtio_if.h"
81
82#include "opt_inet.h"
83#include "opt_inet6.h"
84
85static int	vtnet_modevent(module_t, int, void *);
86
87static int	vtnet_probe(device_t);
88static int	vtnet_attach(device_t);
89static int	vtnet_detach(device_t);
90static int	vtnet_suspend(device_t);
91static int	vtnet_resume(device_t);
92static int	vtnet_shutdown(device_t);
93static int	vtnet_attach_completed(device_t);
94static int	vtnet_config_change(device_t);
95
96static void	vtnet_negotiate_features(struct vtnet_softc *);
97static void	vtnet_setup_features(struct vtnet_softc *);
98static int	vtnet_init_rxq(struct vtnet_softc *, int);
99static int	vtnet_init_txq(struct vtnet_softc *, int);
100static int	vtnet_alloc_rxtx_queues(struct vtnet_softc *);
101static void	vtnet_free_rxtx_queues(struct vtnet_softc *);
102static int	vtnet_alloc_rx_filters(struct vtnet_softc *);
103static void	vtnet_free_rx_filters(struct vtnet_softc *);
104static int	vtnet_alloc_virtqueues(struct vtnet_softc *);
105static int	vtnet_setup_interface(struct vtnet_softc *);
106static int	vtnet_change_mtu(struct vtnet_softc *, int);
107static int	vtnet_ioctl(struct ifnet *, u_long, caddr_t);
108
109static int	vtnet_rxq_populate(struct vtnet_rxq *);
110static void	vtnet_rxq_free_mbufs(struct vtnet_rxq *);
111static struct mbuf *
		vtnet_rx_alloc_buf(struct vtnet_softc *, int, struct mbuf **);
113static int	vtnet_rxq_replace_lro_nomgr_buf(struct vtnet_rxq *,
114		    struct mbuf *, int);
115static int	vtnet_rxq_replace_buf(struct vtnet_rxq *, struct mbuf *, int);
116static int	vtnet_rxq_enqueue_buf(struct vtnet_rxq *, struct mbuf *);
117static int	vtnet_rxq_new_buf(struct vtnet_rxq *);
118static int	vtnet_rxq_csum(struct vtnet_rxq *, struct mbuf *,
119		     struct virtio_net_hdr *);
120static void	vtnet_rxq_discard_merged_bufs(struct vtnet_rxq *, int);
121static void	vtnet_rxq_discard_buf(struct vtnet_rxq *, struct mbuf *);
122static int	vtnet_rxq_merged_eof(struct vtnet_rxq *, struct mbuf *, int);
123static void	vtnet_rxq_input(struct vtnet_rxq *, struct mbuf *,
124		    struct virtio_net_hdr *);
125static int	vtnet_rxq_eof(struct vtnet_rxq *);
126static void	vtnet_rx_vq_intr(void *);
127static void	vtnet_rxq_tq_intr(void *, int);
128
129static int	vtnet_txq_below_threshold(struct vtnet_txq *);
130static int	vtnet_txq_notify(struct vtnet_txq *);
131static void	vtnet_txq_free_mbufs(struct vtnet_txq *);
132static int	vtnet_txq_offload_ctx(struct vtnet_txq *, struct mbuf *,
133		    int *, int *, int *);
134static int	vtnet_txq_offload_tso(struct vtnet_txq *, struct mbuf *, int,
135		    int, struct virtio_net_hdr *);
136static struct mbuf *
137		vtnet_txq_offload(struct vtnet_txq *, struct mbuf *,
138		    struct virtio_net_hdr *);
139static int	vtnet_txq_enqueue_buf(struct vtnet_txq *, struct mbuf **,
140		    struct vtnet_tx_header *);
141static int	vtnet_txq_encap(struct vtnet_txq *, struct mbuf **);
142#ifdef VTNET_LEGACY_TX
143static void	vtnet_start_locked(struct vtnet_txq *, struct ifnet *);
144static void	vtnet_start(struct ifnet *);
145#else
146static int	vtnet_txq_mq_start_locked(struct vtnet_txq *, struct mbuf *);
147static int	vtnet_txq_mq_start(struct ifnet *, struct mbuf *);
148static void	vtnet_txq_tq_deferred(void *, int);
149#endif
150static void	vtnet_txq_start(struct vtnet_txq *);
151static void	vtnet_txq_tq_intr(void *, int);
152static int	vtnet_txq_eof(struct vtnet_txq *);
153static void	vtnet_tx_vq_intr(void *);
154static void	vtnet_tx_start_all(struct vtnet_softc *);
155
156#ifndef VTNET_LEGACY_TX
157static void	vtnet_qflush(struct ifnet *);
158#endif
159
160static int	vtnet_watchdog(struct vtnet_txq *);
161static void	vtnet_rxq_accum_stats(struct vtnet_rxq *,
162		    struct vtnet_rxq_stats *);
163static void	vtnet_txq_accum_stats(struct vtnet_txq *,
164		    struct vtnet_txq_stats *);
165static void	vtnet_accumulate_stats(struct vtnet_softc *);
166static void	vtnet_tick(void *);
167
168static void	vtnet_start_taskqueues(struct vtnet_softc *);
169static void	vtnet_free_taskqueues(struct vtnet_softc *);
170static void	vtnet_drain_taskqueues(struct vtnet_softc *);
171
172static void	vtnet_drain_rxtx_queues(struct vtnet_softc *);
173static void	vtnet_stop_rendezvous(struct vtnet_softc *);
174static void	vtnet_stop(struct vtnet_softc *);
175static int	vtnet_virtio_reinit(struct vtnet_softc *);
176static void	vtnet_init_rx_filters(struct vtnet_softc *);
177static int	vtnet_init_rx_queues(struct vtnet_softc *);
178static int	vtnet_init_tx_queues(struct vtnet_softc *);
179static int	vtnet_init_rxtx_queues(struct vtnet_softc *);
180static void	vtnet_set_active_vq_pairs(struct vtnet_softc *);
181static int	vtnet_reinit(struct vtnet_softc *);
182static void	vtnet_init_locked(struct vtnet_softc *);
183static void	vtnet_init(void *);
184
185static void	vtnet_free_ctrl_vq(struct vtnet_softc *);
186static void	vtnet_exec_ctrl_cmd(struct vtnet_softc *, void *,
187		    struct sglist *, int, int);
188static int	vtnet_ctrl_mac_cmd(struct vtnet_softc *, uint8_t *);
189static int	vtnet_ctrl_mq_cmd(struct vtnet_softc *, uint16_t);
190static int	vtnet_ctrl_rx_cmd(struct vtnet_softc *, int, int);
191static int	vtnet_set_promisc(struct vtnet_softc *, int);
192static int	vtnet_set_allmulti(struct vtnet_softc *, int);
193static void	vtnet_attach_disable_promisc(struct vtnet_softc *);
194static void	vtnet_rx_filter(struct vtnet_softc *);
195static void	vtnet_rx_filter_mac(struct vtnet_softc *);
196static int	vtnet_exec_vlan_filter(struct vtnet_softc *, int, uint16_t);
197static void	vtnet_rx_filter_vlan(struct vtnet_softc *);
198static void	vtnet_update_vlan_filter(struct vtnet_softc *, int, uint16_t);
199static void	vtnet_register_vlan(void *, struct ifnet *, uint16_t);
200static void	vtnet_unregister_vlan(void *, struct ifnet *, uint16_t);
201
202static int	vtnet_is_link_up(struct vtnet_softc *);
203static void	vtnet_update_link_status(struct vtnet_softc *);
204static int	vtnet_ifmedia_upd(struct ifnet *);
205static void	vtnet_ifmedia_sts(struct ifnet *, struct ifmediareq *);
206static void	vtnet_get_hwaddr(struct vtnet_softc *);
207static void	vtnet_set_hwaddr(struct vtnet_softc *);
208static void	vtnet_vlan_tag_remove(struct mbuf *);
209static void	vtnet_set_rx_process_limit(struct vtnet_softc *);
210static void	vtnet_set_tx_intr_threshold(struct vtnet_softc *);
211
212static void	vtnet_setup_rxq_sysctl(struct sysctl_ctx_list *,
213		    struct sysctl_oid_list *, struct vtnet_rxq *);
214static void	vtnet_setup_txq_sysctl(struct sysctl_ctx_list *,
215		    struct sysctl_oid_list *, struct vtnet_txq *);
216static void	vtnet_setup_queue_sysctl(struct vtnet_softc *);
217static void	vtnet_setup_sysctl(struct vtnet_softc *);
218
219static int	vtnet_rxq_enable_intr(struct vtnet_rxq *);
220static void	vtnet_rxq_disable_intr(struct vtnet_rxq *);
221static int	vtnet_txq_enable_intr(struct vtnet_txq *);
222static void	vtnet_txq_disable_intr(struct vtnet_txq *);
223static void	vtnet_enable_rx_interrupts(struct vtnet_softc *);
224static void	vtnet_enable_tx_interrupts(struct vtnet_softc *);
225static void	vtnet_enable_interrupts(struct vtnet_softc *);
226static void	vtnet_disable_rx_interrupts(struct vtnet_softc *);
227static void	vtnet_disable_tx_interrupts(struct vtnet_softc *);
228static void	vtnet_disable_interrupts(struct vtnet_softc *);
229
230static int	vtnet_tunable_int(struct vtnet_softc *, const char *, int);
231
232/* Tunables. */
233static int vtnet_csum_disable = 0;
234TUNABLE_INT("hw.vtnet.csum_disable", &vtnet_csum_disable);
235static int vtnet_tso_disable = 0;
236TUNABLE_INT("hw.vtnet.tso_disable", &vtnet_tso_disable);
237static int vtnet_lro_disable = 0;
238TUNABLE_INT("hw.vtnet.lro_disable", &vtnet_lro_disable);
239static int vtnet_mq_disable = 0;
240TUNABLE_INT("hw.vtnet.mq_disable", &vtnet_mq_disable);
241static int vtnet_mq_max_pairs = 0;
242TUNABLE_INT("hw.vtnet.mq_max_pairs", &vtnet_mq_max_pairs);
243static int vtnet_rx_process_limit = 512;
244TUNABLE_INT("hw.vtnet.rx_process_limit", &vtnet_rx_process_limit);
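
/*
 * Example (illustrative only): these loader tunables are read before
 * the driver attaches, so they are normally set from loader.conf(5),
 * e.g.:
 *
 *	hw.vtnet.csum_disable="1"
 *	hw.vtnet.mq_max_pairs="4"
 *	hw.vtnet.rx_process_limit="256"
 *
 * vtnet_tunable_int() additionally consults per-device overrides of
 * these defaults.
 */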
245
246static uma_zone_t vtnet_tx_header_zone;
247
248static struct virtio_feature_desc vtnet_feature_desc[] = {
249	{ VIRTIO_NET_F_CSUM,		"TxChecksum"	},
250	{ VIRTIO_NET_F_GUEST_CSUM,	"RxChecksum"	},
251	{ VIRTIO_NET_F_MAC,		"MacAddress"	},
252	{ VIRTIO_NET_F_GSO,		"TxAllGSO"	},
253	{ VIRTIO_NET_F_GUEST_TSO4,	"RxTSOv4"	},
254	{ VIRTIO_NET_F_GUEST_TSO6,	"RxTSOv6"	},
255	{ VIRTIO_NET_F_GUEST_ECN,	"RxECN"		},
256	{ VIRTIO_NET_F_GUEST_UFO,	"RxUFO"		},
257	{ VIRTIO_NET_F_HOST_TSO4,	"TxTSOv4"	},
258	{ VIRTIO_NET_F_HOST_TSO6,	"TxTSOv6"	},
259	{ VIRTIO_NET_F_HOST_ECN,	"TxTSOECN"	},
260	{ VIRTIO_NET_F_HOST_UFO,	"TxUFO"		},
261	{ VIRTIO_NET_F_MRG_RXBUF,	"MrgRxBuf"	},
262	{ VIRTIO_NET_F_STATUS,		"Status"	},
263	{ VIRTIO_NET_F_CTRL_VQ,		"ControlVq"	},
264	{ VIRTIO_NET_F_CTRL_RX,		"RxMode"	},
265	{ VIRTIO_NET_F_CTRL_VLAN,	"VLanFilter"	},
266	{ VIRTIO_NET_F_CTRL_RX_EXTRA,	"RxModeExtra"	},
267	{ VIRTIO_NET_F_GUEST_ANNOUNCE,	"GuestAnnounce"	},
268	{ VIRTIO_NET_F_MQ,		"Multiqueue"	},
269	{ VIRTIO_NET_F_CTRL_MAC_ADDR,	"SetMacAddress"	},
270
271	{ 0, NULL }
272};
273
274static device_method_t vtnet_methods[] = {
275	/* Device methods. */
276	DEVMETHOD(device_probe,			vtnet_probe),
277	DEVMETHOD(device_attach,		vtnet_attach),
278	DEVMETHOD(device_detach,		vtnet_detach),
279	DEVMETHOD(device_suspend,		vtnet_suspend),
280	DEVMETHOD(device_resume,		vtnet_resume),
281	DEVMETHOD(device_shutdown,		vtnet_shutdown),
282
283	/* VirtIO methods. */
284	DEVMETHOD(virtio_attach_completed,	vtnet_attach_completed),
285	DEVMETHOD(virtio_config_change,		vtnet_config_change),
286
287	DEVMETHOD_END
288};
289
290#ifdef DEV_NETMAP
291#include <dev/netmap/if_vtnet_netmap.h>
292#endif /* DEV_NETMAP */
293
294static driver_t vtnet_driver = {
295	"vtnet",
296	vtnet_methods,
297	sizeof(struct vtnet_softc)
298};
299static devclass_t vtnet_devclass;
300
301DRIVER_MODULE(vtnet, virtio_pci, vtnet_driver, vtnet_devclass,
302    vtnet_modevent, 0);
303MODULE_VERSION(vtnet, 1);
304MODULE_DEPEND(vtnet, virtio, 1, 1, 1);
305
306static int
307vtnet_modevent(module_t mod, int type, void *unused)
308{
309	int error;
310
311	error = 0;
312
313	switch (type) {
314	case MOD_LOAD:
315		vtnet_tx_header_zone = uma_zcreate("vtnet_tx_hdr",
316		    sizeof(struct vtnet_tx_header),
317		    NULL, NULL, NULL, NULL, 0, 0);
318		break;
319	case MOD_QUIESCE:
320	case MOD_UNLOAD:
321		if (uma_zone_get_cur(vtnet_tx_header_zone) > 0)
322			error = EBUSY;
323		else if (type == MOD_UNLOAD) {
324			uma_zdestroy(vtnet_tx_header_zone);
325			vtnet_tx_header_zone = NULL;
326		}
327		break;
328	case MOD_SHUTDOWN:
329		break;
330	default:
331		error = EOPNOTSUPP;
332		break;
333	}
334
335	return (error);
336}
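
#if 0
/*
 * Illustrative sketch (not compiled): how the per-packet TX headers are
 * managed with the UMA zone created above. The transmit path allocates
 * one header per outgoing packet and the completion/drain paths return
 * it, as vtnet_txq_free_mbufs() does below. The vtnet_example_* names
 * are hypothetical and exist only for this example.
 */
static struct vtnet_tx_header *
vtnet_example_txhdr_alloc(void)
{

	return (uma_zalloc(vtnet_tx_header_zone, M_NOWAIT | M_ZERO));
}

static void
vtnet_example_txhdr_free(struct vtnet_tx_header *txhdr)
{

	uma_zfree(vtnet_tx_header_zone, txhdr);
}
#endif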
337
338static int
339vtnet_probe(device_t dev)
340{
341
342	if (virtio_get_device_type(dev) != VIRTIO_ID_NETWORK)
343		return (ENXIO);
344
345	device_set_desc(dev, "VirtIO Networking Adapter");
346
347	return (BUS_PROBE_DEFAULT);
348}
349
350static int
351vtnet_attach(device_t dev)
352{
353	struct vtnet_softc *sc;
354	int error;
355
356	sc = device_get_softc(dev);
357	sc->vtnet_dev = dev;
358
359	/* Register our feature descriptions. */
360	virtio_set_feature_desc(dev, vtnet_feature_desc);
361
362	VTNET_CORE_LOCK_INIT(sc);
363	callout_init_mtx(&sc->vtnet_tick_ch, VTNET_CORE_MTX(sc), 0);
364
365	vtnet_setup_sysctl(sc);
366	vtnet_setup_features(sc);
367
368	error = vtnet_alloc_rx_filters(sc);
369	if (error) {
370		device_printf(dev, "cannot allocate Rx filters\n");
371		goto fail;
372	}
373
374	error = vtnet_alloc_rxtx_queues(sc);
375	if (error) {
376		device_printf(dev, "cannot allocate queues\n");
377		goto fail;
378	}
379
380	error = vtnet_alloc_virtqueues(sc);
381	if (error) {
382		device_printf(dev, "cannot allocate virtqueues\n");
383		goto fail;
384	}
385
386	error = vtnet_setup_interface(sc);
387	if (error) {
388		device_printf(dev, "cannot setup interface\n");
389		goto fail;
390	}
391
392	error = virtio_setup_intr(dev, INTR_TYPE_NET);
393	if (error) {
394		device_printf(dev, "cannot setup virtqueue interrupts\n");
		/* BMV: This will crash if called during boot! */
396		ether_ifdetach(sc->vtnet_ifp);
397		goto fail;
398	}
399
400#ifdef DEV_NETMAP
401	vtnet_netmap_attach(sc);
402#endif /* DEV_NETMAP */
403
404	vtnet_start_taskqueues(sc);
405
406fail:
407	if (error)
408		vtnet_detach(dev);
409
410	return (error);
411}
412
413static int
414vtnet_detach(device_t dev)
415{
416	struct vtnet_softc *sc;
417	struct ifnet *ifp;
418
419	sc = device_get_softc(dev);
420	ifp = sc->vtnet_ifp;
421
422	if (device_is_attached(dev)) {
423		VTNET_CORE_LOCK(sc);
424		vtnet_stop(sc);
425		VTNET_CORE_UNLOCK(sc);
426
427		callout_drain(&sc->vtnet_tick_ch);
428		vtnet_drain_taskqueues(sc);
429
430		ether_ifdetach(ifp);
431	}
432
433#ifdef DEV_NETMAP
434	netmap_detach(ifp);
435#endif /* DEV_NETMAP */
436
437	vtnet_free_taskqueues(sc);
438
439	if (sc->vtnet_vlan_attach != NULL) {
440		EVENTHANDLER_DEREGISTER(vlan_config, sc->vtnet_vlan_attach);
441		sc->vtnet_vlan_attach = NULL;
442	}
443	if (sc->vtnet_vlan_detach != NULL) {
		EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vtnet_vlan_detach);
445		sc->vtnet_vlan_detach = NULL;
446	}
447
448	ifmedia_removeall(&sc->vtnet_media);
449
450	if (ifp != NULL) {
451		if_free(ifp);
452		sc->vtnet_ifp = NULL;
453	}
454
455	vtnet_free_rxtx_queues(sc);
456	vtnet_free_rx_filters(sc);
457
458	if (sc->vtnet_ctrl_vq != NULL)
459		vtnet_free_ctrl_vq(sc);
460
461	VTNET_CORE_LOCK_DESTROY(sc);
462
463	return (0);
464}
465
466static int
467vtnet_suspend(device_t dev)
468{
469	struct vtnet_softc *sc;
470
471	sc = device_get_softc(dev);
472
473	VTNET_CORE_LOCK(sc);
474	vtnet_stop(sc);
475	sc->vtnet_flags |= VTNET_FLAG_SUSPENDED;
476	VTNET_CORE_UNLOCK(sc);
477
478	return (0);
479}
480
481static int
482vtnet_resume(device_t dev)
483{
484	struct vtnet_softc *sc;
485	struct ifnet *ifp;
486
487	sc = device_get_softc(dev);
488	ifp = sc->vtnet_ifp;
489
490	VTNET_CORE_LOCK(sc);
491	if (ifp->if_flags & IFF_UP)
492		vtnet_init_locked(sc);
493	sc->vtnet_flags &= ~VTNET_FLAG_SUSPENDED;
494	VTNET_CORE_UNLOCK(sc);
495
496	return (0);
497}
498
499static int
500vtnet_shutdown(device_t dev)
501{
502
503	/*
504	 * Suspend already does all of what we need to
505	 * do here; we just never expect to be resumed.
506	 */
507	return (vtnet_suspend(dev));
508}
509
510static int
511vtnet_attach_completed(device_t dev)
512{
513
514	vtnet_attach_disable_promisc(device_get_softc(dev));
515
516	return (0);
517}
518
519static int
520vtnet_config_change(device_t dev)
521{
522	struct vtnet_softc *sc;
523
524	sc = device_get_softc(dev);
525
526	VTNET_CORE_LOCK(sc);
527	vtnet_update_link_status(sc);
528	if (sc->vtnet_link_active != 0)
529		vtnet_tx_start_all(sc);
530	VTNET_CORE_UNLOCK(sc);
531
532	return (0);
533}
534
535static void
536vtnet_negotiate_features(struct vtnet_softc *sc)
537{
538	device_t dev;
539	uint64_t mask, features;
540
541	dev = sc->vtnet_dev;
542	mask = 0;
543
544	/*
545	 * TSO and LRO are only available when their corresponding checksum
546	 * offload feature is also negotiated.
547	 */
548	if (vtnet_tunable_int(sc, "csum_disable", vtnet_csum_disable)) {
549		mask |= VIRTIO_NET_F_CSUM | VIRTIO_NET_F_GUEST_CSUM;
550		mask |= VTNET_TSO_FEATURES | VTNET_LRO_FEATURES;
551	}
552	if (vtnet_tunable_int(sc, "tso_disable", vtnet_tso_disable))
553		mask |= VTNET_TSO_FEATURES;
554	if (vtnet_tunable_int(sc, "lro_disable", vtnet_lro_disable))
555		mask |= VTNET_LRO_FEATURES;
556#ifndef VTNET_LEGACY_TX
557	if (vtnet_tunable_int(sc, "mq_disable", vtnet_mq_disable))
558		mask |= VIRTIO_NET_F_MQ;
559#else
560	mask |= VIRTIO_NET_F_MQ;
561#endif
562
563	features = VTNET_FEATURES & ~mask;
564	sc->vtnet_features = virtio_negotiate_features(dev, features);
565
566	if (virtio_with_feature(dev, VTNET_LRO_FEATURES) &&
567	    virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF) == 0) {
		/*
		 * LRO without mergeable buffers requires special care. This
		 * is not ideal because every receive buffer must be large
		 * enough to hold the maximum TCP packet, the Ethernet header,
		 * and the virtio-net header. This requires up to 34
		 * descriptors with MCLBYTES clusters. If we do not have
		 * indirect descriptors, LRO is disabled since the virtqueue
		 * will not contain very many receive buffers.
		 */
577		if (!virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC)) {
			device_printf(dev,
			    "LRO disabled: neither mergeable buffers nor "
			    "indirect descriptors were negotiated\n");
581
582			features &= ~VTNET_LRO_FEATURES;
583			sc->vtnet_features =
584			    virtio_negotiate_features(dev, features);
585		} else
586			sc->vtnet_flags |= VTNET_FLAG_LRO_NOMRG;
587	}
588}
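
/*
 * Worked example of the negotiation above (default tunables except TSO
 * disabled): with only tso_disable set, mask contains
 * VTNET_TSO_FEATURES, so VTNET_FEATURES & ~VTNET_TSO_FEATURES is
 * offered to the host. The host may reject additional bits;
 * sc->vtnet_features holds only what both sides accepted, which is why
 * the LRO features are re-negotiated away when the LRO_NOMRG case
 * cannot be supported.
 */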
589
590static void
591vtnet_setup_features(struct vtnet_softc *sc)
592{
593	device_t dev;
594	int max_pairs, max;
595
596	dev = sc->vtnet_dev;
597
598	vtnet_negotiate_features(sc);
599
600	if (virtio_with_feature(dev, VIRTIO_RING_F_EVENT_IDX))
601		sc->vtnet_flags |= VTNET_FLAG_EVENT_IDX;
602
603	if (virtio_with_feature(dev, VIRTIO_NET_F_MAC)) {
604		/* This feature should always be negotiated. */
605		sc->vtnet_flags |= VTNET_FLAG_MAC;
606	}
607
608	if (virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF)) {
609		sc->vtnet_flags |= VTNET_FLAG_MRG_RXBUFS;
610		sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
611	} else
612		sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr);
613
614	if (sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS)
615		sc->vtnet_rx_nsegs = VTNET_MRG_RX_SEGS;
616	else if (sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG)
617		sc->vtnet_rx_nsegs = VTNET_MAX_RX_SEGS;
618	else
619		sc->vtnet_rx_nsegs = VTNET_MIN_RX_SEGS;
620
621	if (virtio_with_feature(dev, VIRTIO_NET_F_GSO) ||
622	    virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO4) ||
623	    virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO6))
624		sc->vtnet_tx_nsegs = VTNET_MAX_TX_SEGS;
625	else
626		sc->vtnet_tx_nsegs = VTNET_MIN_TX_SEGS;
627
628	if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VQ)) {
629		sc->vtnet_flags |= VTNET_FLAG_CTRL_VQ;
630
631		if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_RX))
632			sc->vtnet_flags |= VTNET_FLAG_CTRL_RX;
633		if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VLAN))
634			sc->vtnet_flags |= VTNET_FLAG_VLAN_FILTER;
635		if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_MAC_ADDR))
636			sc->vtnet_flags |= VTNET_FLAG_CTRL_MAC;
637	}
638
639	if (virtio_with_feature(dev, VIRTIO_NET_F_MQ) &&
640	    sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) {
641		max_pairs = virtio_read_dev_config_2(dev,
642		    offsetof(struct virtio_net_config, max_virtqueue_pairs));
643		if (max_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
644		    max_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX)
645			max_pairs = 1;
646	} else
647		max_pairs = 1;
648
649	if (max_pairs > 1) {
650		/*
651		 * Limit the maximum number of queue pairs to the number of
652		 * CPUs or the configured maximum. The actual number of
653		 * queues that get used may be less.
654		 */
655		max = vtnet_tunable_int(sc, "mq_max_pairs", vtnet_mq_max_pairs);
656		if (max > 0 && max_pairs > max)
657			max_pairs = max;
658		if (max_pairs > mp_ncpus)
659			max_pairs = mp_ncpus;
660		if (max_pairs > VTNET_MAX_QUEUE_PAIRS)
661			max_pairs = VTNET_MAX_QUEUE_PAIRS;
662		if (max_pairs > 1)
663			sc->vtnet_flags |= VTNET_FLAG_MULTIQ;
664	}
665
666	sc->vtnet_max_vq_pairs = max_pairs;
667}
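
/*
 * Worked example of the pair selection above (hypothetical values): if
 * the device reports max_virtqueue_pairs = 16, the machine has 8 CPUs,
 * and hw.vtnet.mq_max_pairs is 4, max_pairs is clamped to 4 and
 * VTNET_FLAG_MULTIQ is set. With the default mq_max_pairs of 0, the
 * CPU count (8) is the limit instead, subject to VTNET_MAX_QUEUE_PAIRS.
 */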
668
669static int
670vtnet_init_rxq(struct vtnet_softc *sc, int id)
671{
672	struct vtnet_rxq *rxq;
673
674	rxq = &sc->vtnet_rxqs[id];
675
676	snprintf(rxq->vtnrx_name, sizeof(rxq->vtnrx_name), "%s-rx%d",
677	    device_get_nameunit(sc->vtnet_dev), id);
678	mtx_init(&rxq->vtnrx_mtx, rxq->vtnrx_name, NULL, MTX_DEF);
679
680	rxq->vtnrx_sc = sc;
681	rxq->vtnrx_id = id;
682
683	rxq->vtnrx_sg = sglist_alloc(sc->vtnet_rx_nsegs, M_NOWAIT);
684	if (rxq->vtnrx_sg == NULL)
685		return (ENOMEM);
686
687	TASK_INIT(&rxq->vtnrx_intrtask, 0, vtnet_rxq_tq_intr, rxq);
688	rxq->vtnrx_tq = taskqueue_create(rxq->vtnrx_name, M_NOWAIT,
689	    taskqueue_thread_enqueue, &rxq->vtnrx_tq);
690
691	return (rxq->vtnrx_tq == NULL ? ENOMEM : 0);
692}
693
694static int
695vtnet_init_txq(struct vtnet_softc *sc, int id)
696{
697	struct vtnet_txq *txq;
698
699	txq = &sc->vtnet_txqs[id];
700
701	snprintf(txq->vtntx_name, sizeof(txq->vtntx_name), "%s-tx%d",
702	    device_get_nameunit(sc->vtnet_dev), id);
703	mtx_init(&txq->vtntx_mtx, txq->vtntx_name, NULL, MTX_DEF);
704
705	txq->vtntx_sc = sc;
706	txq->vtntx_id = id;
707
708	txq->vtntx_sg = sglist_alloc(sc->vtnet_tx_nsegs, M_NOWAIT);
709	if (txq->vtntx_sg == NULL)
710		return (ENOMEM);
711
712#ifndef VTNET_LEGACY_TX
713	txq->vtntx_br = buf_ring_alloc(VTNET_DEFAULT_BUFRING_SIZE, M_DEVBUF,
714	    M_NOWAIT, &txq->vtntx_mtx);
715	if (txq->vtntx_br == NULL)
716		return (ENOMEM);
717
718	TASK_INIT(&txq->vtntx_defrtask, 0, vtnet_txq_tq_deferred, txq);
719#endif
720	TASK_INIT(&txq->vtntx_intrtask, 0, vtnet_txq_tq_intr, txq);
721	txq->vtntx_tq = taskqueue_create(txq->vtntx_name, M_NOWAIT,
722	    taskqueue_thread_enqueue, &txq->vtntx_tq);
723	if (txq->vtntx_tq == NULL)
724		return (ENOMEM);
725
726	return (0);
727}
728
729static int
730vtnet_alloc_rxtx_queues(struct vtnet_softc *sc)
731{
732	int i, npairs, error;
733
734	npairs = sc->vtnet_max_vq_pairs;
735
736	sc->vtnet_rxqs = malloc(sizeof(struct vtnet_rxq) * npairs, M_DEVBUF,
737	    M_NOWAIT | M_ZERO);
738	sc->vtnet_txqs = malloc(sizeof(struct vtnet_txq) * npairs, M_DEVBUF,
739	    M_NOWAIT | M_ZERO);
740	if (sc->vtnet_rxqs == NULL || sc->vtnet_txqs == NULL)
741		return (ENOMEM);
742
743	for (i = 0; i < npairs; i++) {
744		error = vtnet_init_rxq(sc, i);
745		if (error)
746			return (error);
747		error = vtnet_init_txq(sc, i);
748		if (error)
749			return (error);
750	}
751
752	vtnet_setup_queue_sysctl(sc);
753
754	return (0);
755}
756
757static void
758vtnet_destroy_rxq(struct vtnet_rxq *rxq)
759{
760
761	rxq->vtnrx_sc = NULL;
762	rxq->vtnrx_id = -1;
763
764	if (rxq->vtnrx_sg != NULL) {
765		sglist_free(rxq->vtnrx_sg);
766		rxq->vtnrx_sg = NULL;
767	}
768
769	if (mtx_initialized(&rxq->vtnrx_mtx) != 0)
770		mtx_destroy(&rxq->vtnrx_mtx);
771}
772
773static void
774vtnet_destroy_txq(struct vtnet_txq *txq)
775{
776
777	txq->vtntx_sc = NULL;
778	txq->vtntx_id = -1;
779
780	if (txq->vtntx_sg != NULL) {
781		sglist_free(txq->vtntx_sg);
782		txq->vtntx_sg = NULL;
783	}
784
785#ifndef VTNET_LEGACY_TX
786	if (txq->vtntx_br != NULL) {
787		buf_ring_free(txq->vtntx_br, M_DEVBUF);
788		txq->vtntx_br = NULL;
789	}
790#endif
791
792	if (mtx_initialized(&txq->vtntx_mtx) != 0)
793		mtx_destroy(&txq->vtntx_mtx);
794}
795
796static void
797vtnet_free_rxtx_queues(struct vtnet_softc *sc)
798{
799	int i;
800
801	if (sc->vtnet_rxqs != NULL) {
802		for (i = 0; i < sc->vtnet_max_vq_pairs; i++)
803			vtnet_destroy_rxq(&sc->vtnet_rxqs[i]);
804		free(sc->vtnet_rxqs, M_DEVBUF);
805		sc->vtnet_rxqs = NULL;
806	}
807
808	if (sc->vtnet_txqs != NULL) {
809		for (i = 0; i < sc->vtnet_max_vq_pairs; i++)
810			vtnet_destroy_txq(&sc->vtnet_txqs[i]);
811		free(sc->vtnet_txqs, M_DEVBUF);
812		sc->vtnet_txqs = NULL;
813	}
814}
815
816static int
817vtnet_alloc_rx_filters(struct vtnet_softc *sc)
818{
819
820	if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) {
821		sc->vtnet_mac_filter = malloc(sizeof(struct vtnet_mac_filter),
822		    M_DEVBUF, M_NOWAIT | M_ZERO);
823		if (sc->vtnet_mac_filter == NULL)
824			return (ENOMEM);
825	}
826
827	if (sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER) {
828		sc->vtnet_vlan_filter = malloc(sizeof(uint32_t) *
829		    VTNET_VLAN_FILTER_NWORDS, M_DEVBUF, M_NOWAIT | M_ZERO);
830		if (sc->vtnet_vlan_filter == NULL)
831			return (ENOMEM);
832	}
833
834	return (0);
835}
836
837static void
838vtnet_free_rx_filters(struct vtnet_softc *sc)
839{
840
841	if (sc->vtnet_mac_filter != NULL) {
842		free(sc->vtnet_mac_filter, M_DEVBUF);
843		sc->vtnet_mac_filter = NULL;
844	}
845
846	if (sc->vtnet_vlan_filter != NULL) {
847		free(sc->vtnet_vlan_filter, M_DEVBUF);
848		sc->vtnet_vlan_filter = NULL;
849	}
850}
851
852static int
853vtnet_alloc_virtqueues(struct vtnet_softc *sc)
854{
855	device_t dev;
856	struct vq_alloc_info *info;
857	struct vtnet_rxq *rxq;
858	struct vtnet_txq *txq;
859	int i, idx, flags, nvqs, error;
860
861	dev = sc->vtnet_dev;
862	flags = 0;
863
864	nvqs = sc->vtnet_max_vq_pairs * 2;
865	if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ)
866		nvqs++;
867
868	info = malloc(sizeof(struct vq_alloc_info) * nvqs, M_TEMP, M_NOWAIT);
869	if (info == NULL)
870		return (ENOMEM);
871
872	for (i = 0, idx = 0; i < sc->vtnet_max_vq_pairs; i++, idx+=2) {
873		rxq = &sc->vtnet_rxqs[i];
874		VQ_ALLOC_INFO_INIT(&info[idx], sc->vtnet_rx_nsegs,
875		    vtnet_rx_vq_intr, rxq, &rxq->vtnrx_vq,
876		    "%s-%d rx", device_get_nameunit(dev), rxq->vtnrx_id);
877
878		txq = &sc->vtnet_txqs[i];
879		VQ_ALLOC_INFO_INIT(&info[idx+1], sc->vtnet_tx_nsegs,
880		    vtnet_tx_vq_intr, txq, &txq->vtntx_vq,
881		    "%s-%d tx", device_get_nameunit(dev), txq->vtntx_id);
882	}
883
884	if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) {
885		VQ_ALLOC_INFO_INIT(&info[idx], 0, NULL, NULL,
886		    &sc->vtnet_ctrl_vq, "%s ctrl", device_get_nameunit(dev));
887	}
888
889	/*
890	 * Enable interrupt binding if this is multiqueue. This only matters
891	 * when per-vq MSIX is available.
892	 */
893	if (sc->vtnet_flags & VTNET_FLAG_MULTIQ)
894		flags |= 0;
895
896	error = virtio_alloc_virtqueues(dev, flags, nvqs, info);
897	free(info, M_TEMP);
898
899	return (error);
900}
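
/*
 * Resulting virtqueue layout from the loop above, e.g. with two queue
 * pairs and a control queue: index 0 = rx0, 1 = tx0, 2 = rx1, 3 = tx1,
 * 4 = ctrl. This interleaved ordering matches the VirtIO network
 * device's queue numbering.
 */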
901
902static int
903vtnet_setup_interface(struct vtnet_softc *sc)
904{
905	device_t dev;
906	struct ifnet *ifp;
907
908	dev = sc->vtnet_dev;
909
910	ifp = sc->vtnet_ifp = if_alloc(IFT_ETHER);
911	if (ifp == NULL) {
912		device_printf(dev, "cannot allocate ifnet structure\n");
913		return (ENOSPC);
914	}
915
916	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
917	if_initbaudrate(ifp, IF_Gbps(10));	/* Approx. */
918	ifp->if_softc = sc;
919	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
920	ifp->if_init = vtnet_init;
921	ifp->if_ioctl = vtnet_ioctl;
922
923#ifndef VTNET_LEGACY_TX
924	ifp->if_transmit = vtnet_txq_mq_start;
925	ifp->if_qflush = vtnet_qflush;
926#else
927	struct virtqueue *vq = sc->vtnet_txqs[0].vtntx_vq;
928	ifp->if_start = vtnet_start;
929	IFQ_SET_MAXLEN(&ifp->if_snd, virtqueue_size(vq) - 1);
930	ifp->if_snd.ifq_drv_maxlen = virtqueue_size(vq) - 1;
931	IFQ_SET_READY(&ifp->if_snd);
932#endif
933
934	ifmedia_init(&sc->vtnet_media, IFM_IMASK, vtnet_ifmedia_upd,
935	    vtnet_ifmedia_sts);
936	ifmedia_add(&sc->vtnet_media, VTNET_MEDIATYPE, 0, NULL);
937	ifmedia_set(&sc->vtnet_media, VTNET_MEDIATYPE);
938
939	/* Read (or generate) the MAC address for the adapter. */
940	vtnet_get_hwaddr(sc);
941
942	ether_ifattach(ifp, sc->vtnet_hwaddr);
943
944	if (virtio_with_feature(dev, VIRTIO_NET_F_STATUS))
945		ifp->if_capabilities |= IFCAP_LINKSTATE;
946
947	/* Tell the upper layer(s) we support long frames. */
948	ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header);
949	ifp->if_capabilities |= IFCAP_JUMBO_MTU | IFCAP_VLAN_MTU;
950
951	if (virtio_with_feature(dev, VIRTIO_NET_F_CSUM)) {
952		ifp->if_capabilities |= IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6;
953
954		if (virtio_with_feature(dev, VIRTIO_NET_F_GSO)) {
955			ifp->if_capabilities |= IFCAP_TSO4 | IFCAP_TSO6;
956			sc->vtnet_flags |= VTNET_FLAG_TSO_ECN;
957		} else {
958			if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO4))
959				ifp->if_capabilities |= IFCAP_TSO4;
960			if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO6))
961				ifp->if_capabilities |= IFCAP_TSO6;
962			if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_ECN))
963				sc->vtnet_flags |= VTNET_FLAG_TSO_ECN;
964		}
965
966		if (ifp->if_capabilities & IFCAP_TSO)
967			ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
968	}
969
970	if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_CSUM))
971		ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6;
972
973	if (ifp->if_capabilities & IFCAP_HWCSUM) {
974		/*
975		 * VirtIO does not support VLAN tagging, but we can fake
976		 * it by inserting and removing the 802.1Q header during
977		 * transmit and receive. We are then able to do checksum
978		 * offloading of VLAN frames.
979		 */
980		ifp->if_capabilities |=
981		    IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
982	}
983
984	ifp->if_capenable = ifp->if_capabilities;
985
986	/*
987	 * Capabilities after here are not enabled by default.
988	 */
989
990	if (ifp->if_capabilities & IFCAP_RXCSUM) {
991		if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO4) ||
992		    virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO6))
993			ifp->if_capabilities |= IFCAP_LRO;
994	}
995
996	if (sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER) {
997		ifp->if_capabilities |= IFCAP_VLAN_HWFILTER;
998
999		sc->vtnet_vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
1000		    vtnet_register_vlan, sc, EVENTHANDLER_PRI_FIRST);
1001		sc->vtnet_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
1002		    vtnet_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST);
1003	}
1004
1005	vtnet_set_rx_process_limit(sc);
1006	vtnet_set_tx_intr_threshold(sc);
1007
1008	return (0);
1009}
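
/*
 * Example (illustrative): the capabilities configured above map to the
 * usual ifconfig(8) knobs, so offloads can be toggled at runtime, e.g.:
 *
 *	ifconfig vtnet0 -txcsum -tso
 *	ifconfig vtnet0 lro vlanhwfilter
 *
 * Toggling the Rx-side capabilities lands in the SIOCSIFCAP case of
 * vtnet_ioctl() below, which reinitializes the interface when needed.
 */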
1010
1011static int
1012vtnet_change_mtu(struct vtnet_softc *sc, int new_mtu)
1013{
1014	struct ifnet *ifp;
1015	int frame_size, clsize;
1016
1017	ifp = sc->vtnet_ifp;
1018
1019	if (new_mtu < ETHERMIN || new_mtu > VTNET_MAX_MTU)
1020		return (EINVAL);
1021
1022	frame_size = sc->vtnet_hdr_size + sizeof(struct ether_vlan_header) +
1023	    new_mtu;
1024
1025	/*
1026	 * Based on the new MTU (and hence frame size) determine which
1027	 * cluster size is most appropriate for the receive queues.
1028	 */
1029	if (frame_size <= MCLBYTES) {
1030		clsize = MCLBYTES;
1031	} else if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) {
1032		/* Avoid going past 9K jumbos. */
1033		if (frame_size > MJUM9BYTES)
1034			return (EINVAL);
1035		clsize = MJUM9BYTES;
1036	} else
1037		clsize = MJUMPAGESIZE;
1038
1039	ifp->if_mtu = new_mtu;
1040	sc->vtnet_rx_new_clsize = clsize;
1041
1042	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1043		ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
1044		vtnet_init_locked(sc);
1045	}
1046
1047	return (0);
1048}
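
/*
 * Worked example of the cluster selection above: with a standard
 * 1500-byte MTU, frame_size fits in MCLBYTES (2K) clusters. A
 * 9000-byte MTU without mergeable buffers selects MJUM9BYTES clusters,
 * while the same MTU with mergeable buffers uses MJUMPAGESIZE
 * (typically 4K) clusters since a frame may span several buffers.
 */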
1049
1050static int
1051vtnet_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
1052{
1053	struct vtnet_softc *sc;
1054	struct ifreq *ifr;
1055	int reinit, mask, error;
1056
1057	sc = ifp->if_softc;
1058	ifr = (struct ifreq *) data;
1059	error = 0;
1060
1061	switch (cmd) {
1062	case SIOCSIFMTU:
1063		if (ifp->if_mtu != ifr->ifr_mtu) {
1064			VTNET_CORE_LOCK(sc);
1065			error = vtnet_change_mtu(sc, ifr->ifr_mtu);
1066			VTNET_CORE_UNLOCK(sc);
1067		}
1068		break;
1069
1070	case SIOCSIFFLAGS:
1071		VTNET_CORE_LOCK(sc);
1072		if ((ifp->if_flags & IFF_UP) == 0) {
1073			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1074				vtnet_stop(sc);
1075		} else if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1076			if ((ifp->if_flags ^ sc->vtnet_if_flags) &
1077			    (IFF_PROMISC | IFF_ALLMULTI)) {
1078				if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX)
1079					vtnet_rx_filter(sc);
1080				else
1081					error = ENOTSUP;
1082			}
1083		} else
1084			vtnet_init_locked(sc);
1085
1086		if (error == 0)
1087			sc->vtnet_if_flags = ifp->if_flags;
1088		VTNET_CORE_UNLOCK(sc);
1089		break;
1090
1091	case SIOCADDMULTI:
1092	case SIOCDELMULTI:
1093		if ((sc->vtnet_flags & VTNET_FLAG_CTRL_RX) == 0)
1094			break;
1095		VTNET_CORE_LOCK(sc);
1096		if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1097			vtnet_rx_filter_mac(sc);
1098		VTNET_CORE_UNLOCK(sc);
1099		break;
1100
1101	case SIOCSIFMEDIA:
1102	case SIOCGIFMEDIA:
1103		error = ifmedia_ioctl(ifp, ifr, &sc->vtnet_media, cmd);
1104		break;
1105
1106	case SIOCSIFCAP:
1107		VTNET_CORE_LOCK(sc);
1108		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
1109
1110		if (mask & IFCAP_TXCSUM)
1111			ifp->if_capenable ^= IFCAP_TXCSUM;
1112		if (mask & IFCAP_TXCSUM_IPV6)
1113			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
1114		if (mask & IFCAP_TSO4)
1115			ifp->if_capenable ^= IFCAP_TSO4;
1116		if (mask & IFCAP_TSO6)
1117			ifp->if_capenable ^= IFCAP_TSO6;
1118
1119		if (mask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_LRO |
1120		    IFCAP_VLAN_HWFILTER)) {
1121			/* These Rx features require us to renegotiate. */
1122			reinit = 1;
1123
1124			if (mask & IFCAP_RXCSUM)
1125				ifp->if_capenable ^= IFCAP_RXCSUM;
1126			if (mask & IFCAP_RXCSUM_IPV6)
1127				ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
1128			if (mask & IFCAP_LRO)
1129				ifp->if_capenable ^= IFCAP_LRO;
1130			if (mask & IFCAP_VLAN_HWFILTER)
1131				ifp->if_capenable ^= IFCAP_VLAN_HWFILTER;
1132		} else
1133			reinit = 0;
1134
1135		if (mask & IFCAP_VLAN_HWTSO)
1136			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
1137		if (mask & IFCAP_VLAN_HWTAGGING)
1138			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
1139
1140		if (reinit && (ifp->if_drv_flags & IFF_DRV_RUNNING)) {
1141			ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
1142			vtnet_init_locked(sc);
1143		}
1144
1145		VTNET_CORE_UNLOCK(sc);
1146		VLAN_CAPABILITIES(ifp);
1147
1148		break;
1149
1150	default:
1151		error = ether_ioctl(ifp, cmd, data);
1152		break;
1153	}
1154
1155	VTNET_CORE_LOCK_ASSERT_NOTOWNED(sc);
1156
1157	return (error);
1158}
1159
1160static int
1161vtnet_rxq_populate(struct vtnet_rxq *rxq)
1162{
1163	struct virtqueue *vq;
1164	int nbufs, error;
1165
1166	vq = rxq->vtnrx_vq;
1167	error = ENOSPC;
1168
1169	for (nbufs = 0; !virtqueue_full(vq); nbufs++) {
1170		error = vtnet_rxq_new_buf(rxq);
1171		if (error)
1172			break;
1173	}
1174
1175	if (nbufs > 0) {
1176		virtqueue_notify(vq);
1177		/*
1178		 * EMSGSIZE signifies the virtqueue did not have enough
1179		 * entries available to hold the last mbuf. This is not
1180		 * an error.
1181		 */
1182		if (error == EMSGSIZE)
1183			error = 0;
1184	}
1185
1186	return (error);
1187}
1188
1189static void
1190vtnet_rxq_free_mbufs(struct vtnet_rxq *rxq)
1191{
1192	struct virtqueue *vq;
1193	struct mbuf *m;
1194	int last;
1195
1196	vq = rxq->vtnrx_vq;
1197	last = 0;
1198
1199	while ((m = virtqueue_drain(vq, &last)) != NULL)
1200		m_freem(m);
1201
1202	KASSERT(virtqueue_empty(vq),
1203	    ("%s: mbufs remaining in rx queue %p", __func__, rxq));
1204}
1205
1206static struct mbuf *
1207vtnet_rx_alloc_buf(struct vtnet_softc *sc, int nbufs, struct mbuf **m_tailp)
1208{
1209	struct mbuf *m_head, *m_tail, *m;
1210	int i, clsize;
1211
1212	clsize = sc->vtnet_rx_clsize;
1213
1214	KASSERT(nbufs == 1 || sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG,
1215	    ("%s: chained mbuf %d request without LRO_NOMRG", __func__, nbufs));
1216
1217	m_head = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, clsize);
1218	if (m_head == NULL)
1219		goto fail;
1220
1221	m_head->m_len = clsize;
1222	m_tail = m_head;
1223
1224	/* Allocate the rest of the chain. */
1225	for (i = 1; i < nbufs; i++) {
1226		m = m_getjcl(M_NOWAIT, MT_DATA, 0, clsize);
1227		if (m == NULL)
1228			goto fail;
1229
1230		m->m_len = clsize;
1231		m_tail->m_next = m;
1232		m_tail = m;
1233	}
1234
1235	if (m_tailp != NULL)
1236		*m_tailp = m_tail;
1237
1238	return (m_head);
1239
1240fail:
1241	sc->vtnet_stats.mbuf_alloc_failed++;
1242	m_freem(m_head);
1243
1244	return (NULL);
1245}
1246
1247/*
1248 * Slow path for when LRO without mergeable buffers is negotiated.
1249 */
1250static int
1251vtnet_rxq_replace_lro_nomgr_buf(struct vtnet_rxq *rxq, struct mbuf *m0,
1252    int len0)
1253{
1254	struct vtnet_softc *sc;
1255	struct mbuf *m, *m_prev;
1256	struct mbuf *m_new, *m_tail;
1257	int len, clsize, nreplace, error;
1258
1259	sc = rxq->vtnrx_sc;
1260	clsize = sc->vtnet_rx_clsize;
1261
1262	m_prev = NULL;
1263	m_tail = NULL;
1264	nreplace = 0;
1265
1266	m = m0;
1267	len = len0;
1268
1269	/*
1270	 * Since these mbuf chains are so large, we avoid allocating an
1271	 * entire replacement chain if possible. When the received frame
1272	 * did not consume the entire chain, the unused mbufs are moved
1273	 * to the replacement chain.
1274	 */
1275	while (len > 0) {
1276		/*
1277		 * Something is seriously wrong if we received a frame
1278		 * larger than the chain. Drop it.
1279		 */
1280		if (m == NULL) {
1281			sc->vtnet_stats.rx_frame_too_large++;
1282			return (EMSGSIZE);
1283		}
1284
1285		/* We always allocate the same cluster size. */
1286		KASSERT(m->m_len == clsize,
1287		    ("%s: mbuf size %d is not the cluster size %d",
1288		    __func__, m->m_len, clsize));
1289
1290		m->m_len = MIN(m->m_len, len);
1291		len -= m->m_len;
1292
1293		m_prev = m;
1294		m = m->m_next;
1295		nreplace++;
1296	}
1297
1298	KASSERT(nreplace <= sc->vtnet_rx_nmbufs,
1299	    ("%s: too many replacement mbufs %d max %d", __func__, nreplace,
1300	    sc->vtnet_rx_nmbufs));
1301
1302	m_new = vtnet_rx_alloc_buf(sc, nreplace, &m_tail);
1303	if (m_new == NULL) {
1304		m_prev->m_len = clsize;
1305		return (ENOBUFS);
1306	}
1307
1308	/*
1309	 * Move any unused mbufs from the received chain onto the end
1310	 * of the new chain.
1311	 */
1312	if (m_prev->m_next != NULL) {
1313		m_tail->m_next = m_prev->m_next;
1314		m_prev->m_next = NULL;
1315	}
1316
1317	error = vtnet_rxq_enqueue_buf(rxq, m_new);
1318	if (error) {
		/*
		 * BAD! We could not enqueue the replacement mbuf chain. We
		 * must restore the m0 chain to the original state if it was
		 * modified so we can subsequently discard it.
		 *
		 * NOTE: The replacement is supposed to be an identical copy
		 * of the one just dequeued so this is an unexpected error.
		 */
1327		sc->vtnet_stats.rx_enq_replacement_failed++;
1328
1329		if (m_tail->m_next != NULL) {
1330			m_prev->m_next = m_tail->m_next;
1331			m_tail->m_next = NULL;
1332		}
1333
1334		m_prev->m_len = clsize;
1335		m_freem(m_new);
1336	}
1337
1338	return (error);
1339}
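
/*
 * Worked example of the chain reuse above (hypothetical sizing): if the
 * preallocated LRO_NOMRG chain holds a dozen MCLBYTES clusters and a
 * 3000-byte frame arrives, only the first two clusters were consumed,
 * so nreplace is 2. A two-mbuf replacement chain is allocated and the
 * ten untouched clusters are moved onto its tail, avoiding a full
 * reallocation of the large chain.
 */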
1340
1341static int
1342vtnet_rxq_replace_buf(struct vtnet_rxq *rxq, struct mbuf *m, int len)
1343{
1344	struct vtnet_softc *sc;
1345	struct mbuf *m_new;
1346	int error;
1347
1348	sc = rxq->vtnrx_sc;
1349
1350	KASSERT(sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG || m->m_next == NULL,
1351	    ("%s: chained mbuf without LRO_NOMRG", __func__));
1352
1353	if (m->m_next == NULL) {
1354		/* Fast-path for the common case of just one mbuf. */
1355		if (m->m_len < len)
1356			return (EINVAL);
1357
1358		m_new = vtnet_rx_alloc_buf(sc, 1, NULL);
1359		if (m_new == NULL)
1360			return (ENOBUFS);
1361
1362		error = vtnet_rxq_enqueue_buf(rxq, m_new);
1363		if (error) {
			/*
			 * The new mbuf is supposed to be an identical
			 * copy of the one just dequeued so this is an
			 * unexpected error.
			 */
1369			m_freem(m_new);
1370			sc->vtnet_stats.rx_enq_replacement_failed++;
1371		} else
1372			m->m_len = len;
1373	} else
1374		error = vtnet_rxq_replace_lro_nomgr_buf(rxq, m, len);
1375
1376	return (error);
1377}
1378
1379static int
1380vtnet_rxq_enqueue_buf(struct vtnet_rxq *rxq, struct mbuf *m)
1381{
1382	struct vtnet_softc *sc;
1383	struct sglist *sg;
1384	struct vtnet_rx_header *rxhdr;
1385	uint8_t *mdata;
1386	int offset, error;
1387
1388	sc = rxq->vtnrx_sc;
1389	sg = rxq->vtnrx_sg;
1390	mdata = mtod(m, uint8_t *);
1391
1392	VTNET_RXQ_LOCK_ASSERT(rxq);
1393	KASSERT(sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG || m->m_next == NULL,
1394	    ("%s: chained mbuf without LRO_NOMRG", __func__));
1395	KASSERT(m->m_len == sc->vtnet_rx_clsize,
1396	    ("%s: unexpected cluster size %d/%d", __func__, m->m_len,
1397	     sc->vtnet_rx_clsize));
1398
1399	sglist_reset(sg);
1400	if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) {
1401		MPASS(sc->vtnet_hdr_size == sizeof(struct virtio_net_hdr));
1402		rxhdr = (struct vtnet_rx_header *) mdata;
1403		sglist_append(sg, &rxhdr->vrh_hdr, sc->vtnet_hdr_size);
1404		offset = sizeof(struct vtnet_rx_header);
1405	} else
1406		offset = 0;
1407
1408	sglist_append(sg, mdata + offset, m->m_len - offset);
1409	if (m->m_next != NULL) {
1410		error = sglist_append_mbuf(sg, m->m_next);
1411		MPASS(error == 0);
1412	}
1413
1414	error = virtqueue_enqueue(rxq->vtnrx_vq, m, sg, 0, sg->sg_nseg);
1415
1416	return (error);
1417}
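
/*
 * Example of the descriptor layout built above: without mergeable
 * buffers, the scatter/gather list is [virtio_net_hdr][cluster data
 * following the vtnet_rx_header pad][any chained clusters]; with
 * mergeable buffers the header simply occupies the first bytes of the
 * single cluster, so one segment covers both. All segments are posted
 * as device-writable (the readable count in virtqueue_enqueue() is 0).
 */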
1418
1419static int
1420vtnet_rxq_new_buf(struct vtnet_rxq *rxq)
1421{
1422	struct vtnet_softc *sc;
1423	struct mbuf *m;
1424	int error;
1425
1426	sc = rxq->vtnrx_sc;
1427
1428	m = vtnet_rx_alloc_buf(sc, sc->vtnet_rx_nmbufs, NULL);
1429	if (m == NULL)
1430		return (ENOBUFS);
1431
1432	error = vtnet_rxq_enqueue_buf(rxq, m);
1433	if (error)
1434		m_freem(m);
1435
1436	return (error);
1437}
1438
1439/*
1440 * Use the checksum offset in the VirtIO header to set the
1441 * correct CSUM_* flags.
1442 */
1443static int
1444vtnet_rxq_csum_by_offset(struct vtnet_rxq *rxq, struct mbuf *m,
1445    uint16_t eth_type, int ip_start, struct virtio_net_hdr *hdr)
1446{
1447	struct vtnet_softc *sc;
1448#if defined(INET) || defined(INET6)
1449	int offset = hdr->csum_start + hdr->csum_offset;
1450#endif
1451
1452	sc = rxq->vtnrx_sc;
1453
1454	/* Only do a basic sanity check on the offset. */
1455	switch (eth_type) {
1456#if defined(INET)
1457	case ETHERTYPE_IP:
1458		if (__predict_false(offset < ip_start + sizeof(struct ip)))
1459			return (1);
1460		break;
1461#endif
1462#if defined(INET6)
1463	case ETHERTYPE_IPV6:
1464		if (__predict_false(offset < ip_start + sizeof(struct ip6_hdr)))
1465			return (1);
1466		break;
1467#endif
1468	default:
1469		sc->vtnet_stats.rx_csum_bad_ethtype++;
1470		return (1);
1471	}
1472
	/*
	 * Use the offset to determine the appropriate CSUM_* flags. This is
	 * a bit dirty, but we can get by with it since the checksum offsets
	 * happen to be different. We assume the host does not do IPv4
	 * header checksum offloading.
	 */
1479	switch (hdr->csum_offset) {
1480	case offsetof(struct udphdr, uh_sum):
1481	case offsetof(struct tcphdr, th_sum):
1482		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1483		m->m_pkthdr.csum_data = 0xFFFF;
1484		break;
1485	case offsetof(struct sctphdr, checksum):
1486		m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
1487		break;
1488	default:
1489		sc->vtnet_stats.rx_csum_bad_offset++;
1490		return (1);
1491	}
1492
1493	return (0);
1494}
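
/*
 * Worked example of the offset trick above: csum_offset is
 * offsetof(struct tcphdr, th_sum) == 16 for TCP,
 * offsetof(struct udphdr, uh_sum) == 6 for UDP, and
 * offsetof(struct sctphdr, checksum) == 8 for SCTP. Because the three
 * values differ, the offset alone identifies the L4 protocol without
 * parsing the packet.
 */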
1495
1496static int
1497vtnet_rxq_csum_by_parse(struct vtnet_rxq *rxq, struct mbuf *m,
1498    uint16_t eth_type, int ip_start, struct virtio_net_hdr *hdr)
1499{
1500	struct vtnet_softc *sc;
1501	int offset, proto;
1502
1503	sc = rxq->vtnrx_sc;
1504
1505	switch (eth_type) {
1506#if defined(INET)
1507	case ETHERTYPE_IP: {
1508		struct ip *ip;
1509		if (__predict_false(m->m_len < ip_start + sizeof(struct ip)))
1510			return (1);
1511		ip = (struct ip *)(m->m_data + ip_start);
1512		proto = ip->ip_p;
1513		offset = ip_start + (ip->ip_hl << 2);
1514		break;
1515	}
1516#endif
1517#if defined(INET6)
1518	case ETHERTYPE_IPV6:
1519		if (__predict_false(m->m_len < ip_start +
1520		    sizeof(struct ip6_hdr)))
1521			return (1);
1522		offset = ip6_lasthdr(m, ip_start, IPPROTO_IPV6, &proto);
1523		if (__predict_false(offset < 0))
1524			return (1);
1525		break;
1526#endif
1527	default:
1528		sc->vtnet_stats.rx_csum_bad_ethtype++;
1529		return (1);
1530	}
1531
1532	switch (proto) {
1533	case IPPROTO_TCP:
1534		if (__predict_false(m->m_len < offset + sizeof(struct tcphdr)))
1535			return (1);
1536		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1537		m->m_pkthdr.csum_data = 0xFFFF;
1538		break;
1539	case IPPROTO_UDP:
1540		if (__predict_false(m->m_len < offset + sizeof(struct udphdr)))
1541			return (1);
1542		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1543		m->m_pkthdr.csum_data = 0xFFFF;
1544		break;
1545	case IPPROTO_SCTP:
1546		if (__predict_false(m->m_len < offset + sizeof(struct sctphdr)))
1547			return (1);
1548		m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
1549		break;
1550	default:
1551		/*
1552		 * For the remaining protocols, FreeBSD does not support
1553		 * checksum offloading, so the checksum will be recomputed.
1554		 */
1555#if 0
		if_printf(sc->vtnet_ifp, "%s: cksum offload of unsupported "
		    "protocol eth_type=%#x proto=%d csum_start=%d "
		    "csum_offset=%d\n", __func__, eth_type, proto,
		    hdr->csum_start, hdr->csum_offset);
1560#endif
1561		break;
1562	}
1563
1564	return (0);
1565}
1566
1567/*
1568 * Set the appropriate CSUM_* flags. Unfortunately, the information
1569 * provided is not directly useful to us. The VirtIO header gives the
1570 * offset of the checksum, which is all Linux needs, but this is not
1571 * how FreeBSD does things. We are forced to peek inside the packet
1572 * a bit.
1573 *
1574 * It would be nice if VirtIO gave us the L4 protocol or if FreeBSD
1575 * could accept the offsets and let the stack figure it out.
1576 */
1577static int
1578vtnet_rxq_csum(struct vtnet_rxq *rxq, struct mbuf *m,
1579    struct virtio_net_hdr *hdr)
1580{
1581	struct ether_header *eh;
1582	struct ether_vlan_header *evh;
1583	uint16_t eth_type;
1584	int offset, error;
1585
1586	eh = mtod(m, struct ether_header *);
1587	eth_type = ntohs(eh->ether_type);
1588	if (eth_type == ETHERTYPE_VLAN) {
1589		/* BMV: We should handle nested VLAN tags too. */
1590		evh = mtod(m, struct ether_vlan_header *);
1591		eth_type = ntohs(evh->evl_proto);
1592		offset = sizeof(struct ether_vlan_header);
1593	} else
1594		offset = sizeof(struct ether_header);
1595
1596	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)
1597		error = vtnet_rxq_csum_by_offset(rxq, m, eth_type, offset, hdr);
1598	else
1599		error = vtnet_rxq_csum_by_parse(rxq, m, eth_type, offset, hdr);
1600
1601	return (error);
1602}
1603
1604static void
1605vtnet_rxq_discard_merged_bufs(struct vtnet_rxq *rxq, int nbufs)
1606{
1607	struct mbuf *m;
1608
1609	while (--nbufs > 0) {
1610		m = virtqueue_dequeue(rxq->vtnrx_vq, NULL);
1611		if (m == NULL)
1612			break;
1613		vtnet_rxq_discard_buf(rxq, m);
1614	}
1615}
1616
1617static void
1618vtnet_rxq_discard_buf(struct vtnet_rxq *rxq, struct mbuf *m)
1619{
1620	int error;
1621
1622	/*
1623	 * Requeue the discarded mbuf. This should always be successful
1624	 * since it was just dequeued.
1625	 */
1626	error = vtnet_rxq_enqueue_buf(rxq, m);
1627	KASSERT(error == 0,
1628	    ("%s: cannot requeue discarded mbuf %d", __func__, error));
1629}
1630
1631static int
1632vtnet_rxq_merged_eof(struct vtnet_rxq *rxq, struct mbuf *m_head, int nbufs)
1633{
1634	struct vtnet_softc *sc;
1635	struct ifnet *ifp;
1636	struct virtqueue *vq;
1637	struct mbuf *m, *m_tail;
1638	int len;
1639
1640	sc = rxq->vtnrx_sc;
1641	vq = rxq->vtnrx_vq;
1642	ifp = sc->vtnet_ifp;
1643	m_tail = m_head;
1644
1645	while (--nbufs > 0) {
1646		m = virtqueue_dequeue(vq, &len);
1647		if (m == NULL) {
1648			rxq->vtnrx_stats.vrxs_ierrors++;
1649			goto fail;
1650		}
1651
1652		if (vtnet_rxq_new_buf(rxq) != 0) {
1653			rxq->vtnrx_stats.vrxs_iqdrops++;
1654			vtnet_rxq_discard_buf(rxq, m);
1655			if (nbufs > 1)
1656				vtnet_rxq_discard_merged_bufs(rxq, nbufs);
1657			goto fail;
1658		}
1659
1660		if (m->m_len < len)
1661			len = m->m_len;
1662
1663		m->m_len = len;
1664		m->m_flags &= ~M_PKTHDR;
1665
1666		m_head->m_pkthdr.len += len;
1667		m_tail->m_next = m;
1668		m_tail = m;
1669	}
1670
1671	return (0);
1672
1673fail:
1674	sc->vtnet_stats.rx_mergeable_failed++;
1675	m_freem(m_head);
1676
1677	return (1);
1678}
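
/*
 * Example of the mergeable path above: if the host split a large frame
 * across three buffers, num_buffers is 3, so two more buffers are
 * dequeued here and appended to m_head, with m_pkthdr.len accumulating
 * the total length. Each consumed buffer is immediately replaced by
 * vtnet_rxq_new_buf() to keep the ring populated.
 */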
1679
1680static void
1681vtnet_rxq_input(struct vtnet_rxq *rxq, struct mbuf *m,
1682    struct virtio_net_hdr *hdr)
1683{
1684	struct vtnet_softc *sc;
1685	struct ifnet *ifp;
1686	struct ether_header *eh;
1687
1688	sc = rxq->vtnrx_sc;
1689	ifp = sc->vtnet_ifp;
1690
1691	if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) {
1692		eh = mtod(m, struct ether_header *);
1693		if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
1694			vtnet_vlan_tag_remove(m);
1695			/*
1696			 * With the 802.1Q header removed, update the
1697			 * checksum starting location accordingly.
1698			 */
1699			if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)
1700				hdr->csum_start -= ETHER_VLAN_ENCAP_LEN;
1701		}
1702	}
1703
1704	m->m_pkthdr.flowid = rxq->vtnrx_id;
1705	m->m_flags |= M_FLOWID;
1706
1707	/*
1708	 * BMV: FreeBSD does not have the UNNECESSARY and PARTIAL checksum
1709	 * distinction that Linux does. Need to reevaluate if performing
1710	 * offloading for the NEEDS_CSUM case is really appropriate.
1711	 */
1712	if (hdr->flags & (VIRTIO_NET_HDR_F_NEEDS_CSUM |
1713	    VIRTIO_NET_HDR_F_DATA_VALID)) {
1714		if (vtnet_rxq_csum(rxq, m, hdr) == 0)
1715			rxq->vtnrx_stats.vrxs_csum++;
1716		else
1717			rxq->vtnrx_stats.vrxs_csum_failed++;
1718	}
1719
1720	rxq->vtnrx_stats.vrxs_ipackets++;
1721	rxq->vtnrx_stats.vrxs_ibytes += m->m_pkthdr.len;
1722
1723	VTNET_RXQ_UNLOCK(rxq);
1724	(*ifp->if_input)(ifp, m);
1725	VTNET_RXQ_LOCK(rxq);
1726}
1727
1728static int
1729vtnet_rxq_eof(struct vtnet_rxq *rxq)
1730{
1731	struct virtio_net_hdr lhdr, *hdr;
1732	struct vtnet_softc *sc;
1733	struct ifnet *ifp;
1734	struct virtqueue *vq;
1735	struct mbuf *m;
1736	struct virtio_net_hdr_mrg_rxbuf *mhdr;
1737	int len, deq, nbufs, adjsz, count;
1738
1739	sc = rxq->vtnrx_sc;
1740	vq = rxq->vtnrx_vq;
1741	ifp = sc->vtnet_ifp;
1742	hdr = &lhdr;
1743	deq = 0;
1744	count = sc->vtnet_rx_process_limit;
1745
1746	VTNET_RXQ_LOCK_ASSERT(rxq);
1747
1748#ifdef DEV_NETMAP
1749	if (netmap_rx_irq(ifp, 0, &deq)) {
1750		return (FALSE);
1751	}
1752#endif /* DEV_NETMAP */
1753
1754	while (count-- > 0) {
1755		m = virtqueue_dequeue(vq, &len);
1756		if (m == NULL)
1757			break;
1758		deq++;
1759
1760		if (len < sc->vtnet_hdr_size + ETHER_HDR_LEN) {
1761			rxq->vtnrx_stats.vrxs_ierrors++;
1762			vtnet_rxq_discard_buf(rxq, m);
1763			continue;
1764		}
1765
1766		if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) {
1767			nbufs = 1;
1768			adjsz = sizeof(struct vtnet_rx_header);
1769			/*
1770			 * Account for our pad inserted between the header
1771			 * and the actual start of the frame.
1772			 */
1773			len += VTNET_RX_HEADER_PAD;
1774		} else {
1775			mhdr = mtod(m, struct virtio_net_hdr_mrg_rxbuf *);
1776			nbufs = mhdr->num_buffers;
1777			adjsz = sizeof(struct virtio_net_hdr_mrg_rxbuf);
1778		}
1779
1780		if (vtnet_rxq_replace_buf(rxq, m, len) != 0) {
1781			rxq->vtnrx_stats.vrxs_iqdrops++;
1782			vtnet_rxq_discard_buf(rxq, m);
1783			if (nbufs > 1)
1784				vtnet_rxq_discard_merged_bufs(rxq, nbufs);
1785			continue;
1786		}
1787
1788		m->m_pkthdr.len = len;
1789		m->m_pkthdr.rcvif = ifp;
1790		m->m_pkthdr.csum_flags = 0;
1791
1792		if (nbufs > 1) {
			/* Dequeue the rest of the chain. */
1794			if (vtnet_rxq_merged_eof(rxq, m, nbufs) != 0)
1795				continue;
1796		}
1797
1798		/*
1799		 * Save copy of header before we strip it. For both mergeable
1800		 * and non-mergeable, the header is at the beginning of the
1801		 * mbuf data. We no longer need num_buffers, so always use a
1802		 * regular header.
1803		 *
1804		 * BMV: Is this memcpy() expensive? We know the mbuf data is
1805		 * still valid even after the m_adj().
1806		 */
1807		memcpy(hdr, mtod(m, void *), sizeof(struct virtio_net_hdr));
1808		m_adj(m, adjsz);
1809
1810		vtnet_rxq_input(rxq, m, hdr);
1811
1812		/* Must recheck after dropping the Rx lock. */
1813		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
1814			break;
1815	}
1816
1817	if (deq > 0)
1818		virtqueue_notify(vq);
1819
1820	return (count > 0 ? 0 : EAGAIN);
1821}
1822
1823static void
1824vtnet_rx_vq_intr(void *xrxq)
1825{
1826	struct vtnet_softc *sc;
1827	struct vtnet_rxq *rxq;
1828	struct ifnet *ifp;
1829	int tries, more;
1830
1831	rxq = xrxq;
1832	sc = rxq->vtnrx_sc;
1833	ifp = sc->vtnet_ifp;
1834	tries = 0;
1835
1836	if (__predict_false(rxq->vtnrx_id >= sc->vtnet_act_vq_pairs)) {
1837		/*
1838		 * Ignore this interrupt. Either this is a spurious interrupt
1839		 * or multiqueue without per-VQ MSIX so every queue needs to
1840		 * be polled (a brain dead configuration we could try harder
1841		 * to avoid).
1842		 */
1843		vtnet_rxq_disable_intr(rxq);
1844		return;
1845	}
1846
1847	VTNET_RXQ_LOCK(rxq);
1848
1849again:
1850	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
1851		VTNET_RXQ_UNLOCK(rxq);
1852		return;
1853	}
1854
1855	more = vtnet_rxq_eof(rxq);
1856	if (more || vtnet_rxq_enable_intr(rxq) != 0) {
1857		if (!more)
1858			vtnet_rxq_disable_intr(rxq);
1859		/*
1860		 * This is an occasional condition or race (when !more),
1861		 * so retry a few times before scheduling the taskqueue.
1862		 */
1863		if (tries++ < VTNET_INTR_DISABLE_RETRIES)
1864			goto again;
1865
1866		VTNET_RXQ_UNLOCK(rxq);
1867		rxq->vtnrx_stats.vrxs_rescheduled++;
1868		taskqueue_enqueue(rxq->vtnrx_tq, &rxq->vtnrx_intrtask);
1869	} else
1870		VTNET_RXQ_UNLOCK(rxq);
1871}
1872
1873static void
1874vtnet_rxq_tq_intr(void *xrxq, int pending)
1875{
1876	struct vtnet_softc *sc;
1877	struct vtnet_rxq *rxq;
1878	struct ifnet *ifp;
1879	int more;
1880
1881	rxq = xrxq;
1882	sc = rxq->vtnrx_sc;
1883	ifp = sc->vtnet_ifp;
1884
1885	VTNET_RXQ_LOCK(rxq);
1886
1887	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
1888		VTNET_RXQ_UNLOCK(rxq);
1889		return;
1890	}
1891
1892	more = vtnet_rxq_eof(rxq);
1893	if (more || vtnet_rxq_enable_intr(rxq) != 0) {
1894		if (!more)
1895			vtnet_rxq_disable_intr(rxq);
1896		rxq->vtnrx_stats.vrxs_rescheduled++;
1897		taskqueue_enqueue(rxq->vtnrx_tq, &rxq->vtnrx_intrtask);
1898	}
1899
1900	VTNET_RXQ_UNLOCK(rxq);
1901}
1902
1903static int
1904vtnet_txq_below_threshold(struct vtnet_txq *txq)
1905{
1906	struct vtnet_softc *sc;
1907	struct virtqueue *vq;
1908
1909	sc = txq->vtntx_sc;
1910	vq = txq->vtntx_vq;
1911
1912	return (virtqueue_nfree(vq) <= sc->vtnet_tx_intr_thresh);
1913}
1914
1915static int
1916vtnet_txq_notify(struct vtnet_txq *txq)
1917{
1918	struct virtqueue *vq;
1919
1920	vq = txq->vtntx_vq;
1921
1922	txq->vtntx_watchdog = VTNET_TX_TIMEOUT;
1923	virtqueue_notify(vq);
1924
1925	if (vtnet_txq_enable_intr(txq) == 0)
1926		return (0);
1927
1928	/*
1929	 * Drain frames that were completed since last checked. If this
1930	 * causes the queue to go above the threshold, the caller should
1931	 * continue transmitting.
1932	 */
1933	if (vtnet_txq_eof(txq) != 0 && vtnet_txq_below_threshold(txq) == 0) {
1934		virtqueue_disable_intr(vq);
1935		return (1);
1936	}
1937
1938	return (0);
1939}
1940
1941static void
1942vtnet_txq_free_mbufs(struct vtnet_txq *txq)
1943{
1944	struct virtqueue *vq;
1945	struct vtnet_tx_header *txhdr;
1946	int last;
1947
1948	vq = txq->vtntx_vq;
1949	last = 0;
1950
1951	while ((txhdr = virtqueue_drain(vq, &last)) != NULL) {
1952		m_freem(txhdr->vth_mbuf);
1953		uma_zfree(vtnet_tx_header_zone, txhdr);
1954	}
1955
1956	KASSERT(virtqueue_empty(vq),
1957	    ("%s: mbufs remaining in tx queue %p", __func__, txq));
1958}
1959
1960/*
1961 * BMV: Much of this can go away once we finally have offsets in
1962 * the mbuf packet header. Bug andre@.
1963 */
1964static int
1965vtnet_txq_offload_ctx(struct vtnet_txq *txq, struct mbuf *m,
1966    int *etype, int *proto, int *start)
1967{
1968	struct vtnet_softc *sc;
1969	struct ether_vlan_header *evh;
1970	int offset;
1971
1972	sc = txq->vtntx_sc;
1973
1974	evh = mtod(m, struct ether_vlan_header *);
1975	if (evh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1976		/* BMV: We should handle nested VLAN tags too. */
1977		*etype = ntohs(evh->evl_proto);
1978		offset = sizeof(struct ether_vlan_header);
1979	} else {
1980		*etype = ntohs(evh->evl_encap_proto);
1981		offset = sizeof(struct ether_header);
1982	}
1983
1984	switch (*etype) {
1985#if defined(INET)
1986	case ETHERTYPE_IP: {
1987		struct ip *ip, iphdr;
1988		if (__predict_false(m->m_len < offset + sizeof(struct ip))) {
1989			m_copydata(m, offset, sizeof(struct ip),
1990			    (caddr_t) &iphdr);
1991			ip = &iphdr;
1992		} else
1993			ip = (struct ip *)(m->m_data + offset);
1994		*proto = ip->ip_p;
1995		*start = offset + (ip->ip_hl << 2);
1996		break;
1997	}
1998#endif
1999#if defined(INET6)
2000	case ETHERTYPE_IPV6:
2001		*proto = -1;
2002		*start = ip6_lasthdr(m, offset, IPPROTO_IPV6, proto);
2003		/* Assert the network stack sent us a valid packet. */
2004		KASSERT(*start > offset,
2005		    ("%s: mbuf %p start %d offset %d proto %d", __func__, m,
2006		    *start, offset, *proto));
2007		break;
2008#endif
2009	default:
2010		sc->vtnet_stats.tx_csum_bad_ethtype++;
2011		return (EINVAL);
2012	}
2013
2014	return (0);
2015}
2016
2017static int
2018vtnet_txq_offload_tso(struct vtnet_txq *txq, struct mbuf *m, int eth_type,
2019    int offset, struct virtio_net_hdr *hdr)
2020{
2021	static struct timeval lastecn;
2022	static int curecn;
2023	struct vtnet_softc *sc;
2024	struct tcphdr *tcp, tcphdr;
2025
2026	sc = txq->vtntx_sc;
2027
2028	if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) {
2029		m_copydata(m, offset, sizeof(struct tcphdr), (caddr_t) &tcphdr);
2030		tcp = &tcphdr;
2031	} else
2032		tcp = (struct tcphdr *)(m->m_data + offset);
2033
2034	hdr->hdr_len = offset + (tcp->th_off << 2);
2035	hdr->gso_size = m->m_pkthdr.tso_segsz;
2036	hdr->gso_type = eth_type == ETHERTYPE_IP ? VIRTIO_NET_HDR_GSO_TCPV4 :
2037	    VIRTIO_NET_HDR_GSO_TCPV6;
2038
2039	if (tcp->th_flags & TH_CWR) {
2040		/*
2041		 * Drop if VIRTIO_NET_F_HOST_ECN was not negotiated. In FreeBSD,
2042		 * ECN support is not on a per-interface basis, but globally via
2043		 * the net.inet.tcp.ecn.enable sysctl knob. The default is off.
2044		 */
2045		if ((sc->vtnet_flags & VTNET_FLAG_TSO_ECN) == 0) {
2046			if (ppsratecheck(&lastecn, &curecn, 1))
2047				if_printf(sc->vtnet_ifp,
2048				    "TSO with ECN not negotiated with host\n");
2049			return (ENOTSUP);
2050		}
2051		hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2052	}
2053
2054	txq->vtntx_stats.vtxs_tso++;
2055
2056	return (0);
2057}
2058
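/*
 * Fill in the virtio_net header for checksum offload and TSO based on the
 * mbuf's csum_flags. The mbuf is freed and NULL returned if the frame
 * cannot be offloaded as requested.
 */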
2059static struct mbuf *
2060vtnet_txq_offload(struct vtnet_txq *txq, struct mbuf *m,
2061    struct virtio_net_hdr *hdr)
2062{
2063	struct vtnet_softc *sc;
2064	int flags, etype, csum_start, proto, error;
2065
2066	sc = txq->vtntx_sc;
2067	flags = m->m_pkthdr.csum_flags;
2068
2069	error = vtnet_txq_offload_ctx(txq, m, &etype, &proto, &csum_start);
2070	if (error)
2071		goto drop;
2072
2073	if ((etype == ETHERTYPE_IP && flags & VTNET_CSUM_OFFLOAD) ||
2074	    (etype == ETHERTYPE_IPV6 && flags & VTNET_CSUM_OFFLOAD_IPV6)) {
2075		/*
2076		 * We could compare the IP protocol vs the CSUM_ flag too,
2077		 * but that really should not be necessary.
2078		 */
2079		hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
2080		hdr->csum_start = csum_start;
2081		hdr->csum_offset = m->m_pkthdr.csum_data;
2082		txq->vtntx_stats.vtxs_csum++;
2083	}
2084
2085	if (flags & CSUM_TSO) {
2086		if (__predict_false(proto != IPPROTO_TCP)) {
2087			/* Likely failed to correctly parse the mbuf. */
2088			sc->vtnet_stats.tx_tso_not_tcp++;
2089			goto drop;
2090		}
2091
2092		KASSERT(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM,
2093		    ("%s: mbuf %p TSO without checksum offload %#x",
2094		    __func__, m, flags));
2095
2096		error = vtnet_txq_offload_tso(txq, m, etype, csum_start, hdr);
2097		if (error)
2098			goto drop;
2099	}
2100
2101	return (m);
2102
2103drop:
2104	m_freem(m);
2105	return (NULL);
2106}
2107
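/*
 * Enqueue a frame on the Tx virtqueue: the header is the first segment of
 * the sglist, followed by the mbuf chain. If the chain has too many
 * segments, it is defragmented once before giving up with ENOBUFS.
 */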
2108static int
2109vtnet_txq_enqueue_buf(struct vtnet_txq *txq, struct mbuf **m_head,
2110    struct vtnet_tx_header *txhdr)
2111{
2112	struct vtnet_softc *sc;
2113	struct virtqueue *vq;
2114	struct sglist *sg;
2115	struct mbuf *m;
2116	int error;
2117
2118	sc = txq->vtntx_sc;
2119	vq = txq->vtntx_vq;
2120	sg = txq->vtntx_sg;
2121	m = *m_head;
2122
2123	sglist_reset(sg);
2124	error = sglist_append(sg, &txhdr->vth_uhdr, sc->vtnet_hdr_size);
2125	KASSERT(error == 0 && sg->sg_nseg == 1,
2126	    ("%s: error %d adding header to sglist", __func__, error));
2127
2128	error = sglist_append_mbuf(sg, m);
2129	if (error) {
2130		m = m_defrag(m, M_NOWAIT);
2131		if (m == NULL)
2132			goto fail;
2133
2134		*m_head = m;
2135		sc->vtnet_stats.tx_defragged++;
2136
2137		error = sglist_append_mbuf(sg, m);
2138		if (error)
2139			goto fail;
2140	}
2141
2142	txhdr->vth_mbuf = m;
2143	error = virtqueue_enqueue(vq, txhdr, sg, sg->sg_nseg, 0);
2144
2145	return (error);
2146
2147fail:
2148	sc->vtnet_stats.tx_defrag_failed++;
2149	m_freem(*m_head);
2150	*m_head = NULL;
2151
2152	return (ENOBUFS);
2153}
2154
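/*
 * Encapsulate a frame for transmission: allocate the per-frame header,
 * insert the VLAN tag in software if needed, apply any checksum or TSO
 * offload, and enqueue the result on the Tx virtqueue.
 */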
2155static int
2156vtnet_txq_encap(struct vtnet_txq *txq, struct mbuf **m_head)
2157{
2158	struct vtnet_tx_header *txhdr;
2159	struct virtio_net_hdr *hdr;
2160	struct mbuf *m;
2161	int error;
2162
2163	m = *m_head;
2164	M_ASSERTPKTHDR(m);
2165
2166	txhdr = uma_zalloc(vtnet_tx_header_zone, M_NOWAIT | M_ZERO);
2167	if (txhdr == NULL) {
2168		m_freem(m);
2169		*m_head = NULL;
2170		return (ENOMEM);
2171	}
2172
2173	/*
	 * Always use the non-mergeable header, regardless of whether the
	 * feature was negotiated. For transmit, num_buffers is always zero.
	 * The vtnet_hdr_size is used to enqueue the correct header size.
2177	 */
2178	hdr = &txhdr->vth_uhdr.hdr;
2179
2180	if (m->m_flags & M_VLANTAG) {
2181		m = ether_vlanencap(m, m->m_pkthdr.ether_vtag);
2182		if ((*m_head = m) == NULL) {
2183			error = ENOBUFS;
2184			goto fail;
2185		}
2186		m->m_flags &= ~M_VLANTAG;
2187	}
2188
2189	if (m->m_pkthdr.csum_flags & VTNET_CSUM_ALL_OFFLOAD) {
2190		m = vtnet_txq_offload(txq, m, hdr);
2191		if ((*m_head = m) == NULL) {
2192			error = ENOBUFS;
2193			goto fail;
2194		}
2195	}
2196
2197	error = vtnet_txq_enqueue_buf(txq, m_head, txhdr);
2198	if (error == 0)
2199		return (0);
2200
2201fail:
2202	uma_zfree(vtnet_tx_header_zone, txhdr);
2203
2204	return (error);
2205}
2206
2207#ifdef VTNET_LEGACY_TX
2208
2209static void
2210vtnet_start_locked(struct vtnet_txq *txq, struct ifnet *ifp)
2211{
2212	struct vtnet_softc *sc;
2213	struct virtqueue *vq;
2214	struct mbuf *m0;
2215	int tries, enq;
2216
2217	sc = txq->vtntx_sc;
2218	vq = txq->vtntx_vq;
2219	tries = 0;
2220
2221	VTNET_TXQ_LOCK_ASSERT(txq);
2222
2223	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
2224	    sc->vtnet_link_active == 0)
2225		return;
2226
2227	vtnet_txq_eof(txq);
2228
2229again:
2230	enq = 0;
2231
2232	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
2233		if (virtqueue_full(vq))
2234			break;
2235
2236		IFQ_DRV_DEQUEUE(&ifp->if_snd, m0);
2237		if (m0 == NULL)
2238			break;
2239
2240		if (vtnet_txq_encap(txq, &m0) != 0) {
2241			if (m0 != NULL)
2242				IFQ_DRV_PREPEND(&ifp->if_snd, m0);
2243			break;
2244		}
2245
2246		enq++;
2247		ETHER_BPF_MTAP(ifp, m0);
2248	}
2249
2250	if (enq > 0 && vtnet_txq_notify(txq) != 0) {
2251		if (tries++ < VTNET_NOTIFY_RETRIES)
2252			goto again;
2253
2254		txq->vtntx_stats.vtxs_rescheduled++;
2255		taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_intrtask);
2256	}
2257}
2258
2259static void
2260vtnet_start(struct ifnet *ifp)
2261{
2262	struct vtnet_softc *sc;
2263	struct vtnet_txq *txq;
2264
2265	sc = ifp->if_softc;
2266	txq = &sc->vtnet_txqs[0];
2267
2268	VTNET_TXQ_LOCK(txq);
2269	vtnet_start_locked(txq, ifp);
2270	VTNET_TXQ_UNLOCK(txq);
2271}
2272
2273#else /* !VTNET_LEGACY_TX */
2274
2275static int
2276vtnet_txq_mq_start_locked(struct vtnet_txq *txq, struct mbuf *m)
2277{
2278	struct vtnet_softc *sc;
2279	struct virtqueue *vq;
2280	struct buf_ring *br;
2281	struct ifnet *ifp;
2282	int enq, tries, error;
2283
2284	sc = txq->vtntx_sc;
2285	vq = txq->vtntx_vq;
2286	br = txq->vtntx_br;
2287	ifp = sc->vtnet_ifp;
2288	tries = 0;
2289	error = 0;
2290
2291	VTNET_TXQ_LOCK_ASSERT(txq);
2292
2293	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
2294	    sc->vtnet_link_active == 0) {
2295		if (m != NULL)
2296			error = drbr_enqueue(ifp, br, m);
2297		return (error);
2298	}
2299
2300	if (m != NULL) {
2301		error = drbr_enqueue(ifp, br, m);
2302		if (error)
2303			return (error);
2304	}
2305
2306	vtnet_txq_eof(txq);
2307
2308again:
2309	enq = 0;
2310
2311	while ((m = drbr_peek(ifp, br)) != NULL) {
2312		if (virtqueue_full(vq)) {
2313			drbr_putback(ifp, br, m);
2314			break;
2315		}
2316
2317		if (vtnet_txq_encap(txq, &m) != 0) {
2318			if (m != NULL)
2319				drbr_putback(ifp, br, m);
2320			else
2321				drbr_advance(ifp, br);
2322			break;
2323		}
2324		drbr_advance(ifp, br);
2325
2326		enq++;
2327		ETHER_BPF_MTAP(ifp, m);
2328	}
2329
2330	if (enq > 0 && vtnet_txq_notify(txq) != 0) {
2331		if (tries++ < VTNET_NOTIFY_RETRIES)
2332			goto again;
2333
2334		txq->vtntx_stats.vtxs_rescheduled++;
2335		taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_intrtask);
2336	}
2337
2338	return (0);
2339}
2340
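/*
 * Multiqueue transmit entry point: select a Tx queue from the mbuf's
 * flowid (or the current CPU), transmit directly if the queue lock is
 * available, otherwise enqueue on the buf_ring and defer to the taskqueue.
 */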
2341static int
2342vtnet_txq_mq_start(struct ifnet *ifp, struct mbuf *m)
2343{
2344	struct vtnet_softc *sc;
2345	struct vtnet_txq *txq;
2346	int i, npairs, error;
2347
2348	sc = ifp->if_softc;
2349	npairs = sc->vtnet_act_vq_pairs;
2350
2351	if (m->m_flags & M_FLOWID)
2352		i = m->m_pkthdr.flowid % npairs;
2353	else
2354		i = curcpu % npairs;
2355
2356	txq = &sc->vtnet_txqs[i];
2357
2358	if (VTNET_TXQ_TRYLOCK(txq) != 0) {
2359		error = vtnet_txq_mq_start_locked(txq, m);
2360		VTNET_TXQ_UNLOCK(txq);
2361	} else {
2362		error = drbr_enqueue(ifp, txq->vtntx_br, m);
2363		taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_defrtask);
2364	}
2365
2366	return (error);
2367}
2368
2369static void
2370vtnet_txq_tq_deferred(void *xtxq, int pending)
2371{
2372	struct vtnet_softc *sc;
2373	struct vtnet_txq *txq;
2374
2375	txq = xtxq;
2376	sc = txq->vtntx_sc;
2377
2378	VTNET_TXQ_LOCK(txq);
2379	if (!drbr_empty(sc->vtnet_ifp, txq->vtntx_br))
2380		vtnet_txq_mq_start_locked(txq, NULL);
2381	VTNET_TXQ_UNLOCK(txq);
2382}
2383
2384#endif /* VTNET_LEGACY_TX */
2385
2386static void
2387vtnet_txq_start(struct vtnet_txq *txq)
2388{
2389	struct vtnet_softc *sc;
2390	struct ifnet *ifp;
2391
2392	sc = txq->vtntx_sc;
2393	ifp = sc->vtnet_ifp;
2394
2395#ifdef VTNET_LEGACY_TX
2396	if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
2397		vtnet_start_locked(txq, ifp);
2398#else
2399	if (!drbr_empty(ifp, txq->vtntx_br))
2400		vtnet_txq_mq_start_locked(txq, NULL);
2401#endif
2402}
2403
2404static void
2405vtnet_txq_tq_intr(void *xtxq, int pending)
2406{
2407	struct vtnet_softc *sc;
2408	struct vtnet_txq *txq;
2409	struct ifnet *ifp;
2410
2411	txq = xtxq;
2412	sc = txq->vtntx_sc;
2413	ifp = sc->vtnet_ifp;
2414
2415	VTNET_TXQ_LOCK(txq);
2416
2417	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
2418		VTNET_TXQ_UNLOCK(txq);
2419		return;
2420	}
2421
2422	vtnet_txq_eof(txq);
2423	vtnet_txq_start(txq);
2424
2425	VTNET_TXQ_UNLOCK(txq);
2426}
2427
2428static int
2429vtnet_txq_eof(struct vtnet_txq *txq)
2430{
2431	struct virtqueue *vq;
2432	struct vtnet_tx_header *txhdr;
2433	struct mbuf *m;
2434	int deq;
2435
2436	vq = txq->vtntx_vq;
2437	deq = 0;
2438	VTNET_TXQ_LOCK_ASSERT(txq);
2439
2440#ifdef DEV_NETMAP
2441	if (netmap_tx_irq(txq->vtntx_sc->vtnet_ifp, txq->vtntx_id)) {
		virtqueue_disable_intr(vq); /* XXX luigi */
		return (0); /* XXX or 1 ? */
2444	}
2445#endif /* DEV_NETMAP */
2446
2447	while ((txhdr = virtqueue_dequeue(vq, NULL)) != NULL) {
2448		m = txhdr->vth_mbuf;
2449		deq++;
2450
2451		txq->vtntx_stats.vtxs_opackets++;
2452		txq->vtntx_stats.vtxs_obytes += m->m_pkthdr.len;
2453		if (m->m_flags & M_MCAST)
2454			txq->vtntx_stats.vtxs_omcasts++;
2455
2456		m_freem(m);
2457		uma_zfree(vtnet_tx_header_zone, txhdr);
2458	}
2459
2460	if (virtqueue_empty(vq))
2461		txq->vtntx_watchdog = 0;
2462
2463	return (deq);
2464}
2465
2466static void
2467vtnet_tx_vq_intr(void *xtxq)
2468{
2469	struct vtnet_softc *sc;
2470	struct vtnet_txq *txq;
2471	struct ifnet *ifp;
2472
2473	txq = xtxq;
2474	sc = txq->vtntx_sc;
2475	ifp = sc->vtnet_ifp;
2476
2477	if (__predict_false(txq->vtntx_id >= sc->vtnet_act_vq_pairs)) {
		/*
		 * Ignore this interrupt: either it is spurious, or we are
		 * running multiqueue without per-VQ MSI-X, so every queue
		 * needs to be polled (a brain-dead configuration we could
		 * try harder to avoid).
		 */
2484		vtnet_txq_disable_intr(txq);
2485		return;
2486	}
2487
2488	VTNET_TXQ_LOCK(txq);
2489
2490	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
2491		VTNET_TXQ_UNLOCK(txq);
2492		return;
2493	}
2494
2495	vtnet_txq_eof(txq);
2496	vtnet_txq_start(txq);
2497
2498	VTNET_TXQ_UNLOCK(txq);
2499}
2500
2501static void
2502vtnet_tx_start_all(struct vtnet_softc *sc)
2503{
2504	struct vtnet_txq *txq;
2505	int i;
2506
2507	VTNET_CORE_LOCK_ASSERT(sc);
2508
2509	for (i = 0; i < sc->vtnet_act_vq_pairs; i++) {
2510		txq = &sc->vtnet_txqs[i];
2511
2512		VTNET_TXQ_LOCK(txq);
2513		vtnet_txq_start(txq);
2514		VTNET_TXQ_UNLOCK(txq);
2515	}
2516}
2517
2518#ifndef VTNET_LEGACY_TX
2519static void
2520vtnet_qflush(struct ifnet *ifp)
2521{
2522	struct vtnet_softc *sc;
2523	struct vtnet_txq *txq;
2524	struct mbuf *m;
2525	int i;
2526
2527	sc = ifp->if_softc;
2528
2529	for (i = 0; i < sc->vtnet_act_vq_pairs; i++) {
2530		txq = &sc->vtnet_txqs[i];
2531
2532		VTNET_TXQ_LOCK(txq);
2533		while ((m = buf_ring_dequeue_sc(txq->vtntx_br)) != NULL)
2534			m_freem(m);
2535		VTNET_TXQ_UNLOCK(txq);
2536	}
2537
2538	if_qflush(ifp);
2539}
2540#endif
2541
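/*
 * Per-queue Tx watchdog, called from vtnet_tick(). Returns nonzero if the
 * timer expired with frames still outstanding, in which case the caller
 * reinitializes the interface.
 */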
2542static int
2543vtnet_watchdog(struct vtnet_txq *txq)
2544{
2545	struct ifnet *ifp;
2546
2547	ifp = txq->vtntx_sc->vtnet_ifp;
2548
2549	VTNET_TXQ_LOCK(txq);
2550	if (txq->vtntx_watchdog == 1) {
2551		/*
2552		 * Only drain completed frames if the watchdog is about to
2553		 * expire. If any frames were drained, there may be enough
2554		 * free descriptors now available to transmit queued frames.
2555		 * In that case, the timer will immediately be decremented
		 * below, but the timeout is generous enough that this should
		 * not be a problem.
2558		 */
2559		if (vtnet_txq_eof(txq) != 0)
2560			vtnet_txq_start(txq);
2561	}
2562
2563	if (txq->vtntx_watchdog == 0 || --txq->vtntx_watchdog) {
2564		VTNET_TXQ_UNLOCK(txq);
2565		return (0);
2566	}
2567	VTNET_TXQ_UNLOCK(txq);
2568
2569	if_printf(ifp, "watchdog timeout on queue %d\n", txq->vtntx_id);
2570	return (1);
2571}
2572
2573static void
2574vtnet_rxq_accum_stats(struct vtnet_rxq *rxq, struct vtnet_rxq_stats *accum)
2575{
2576	struct vtnet_rxq_stats *st;
2577
2578	st = &rxq->vtnrx_stats;
2579
2580	accum->vrxs_ipackets += st->vrxs_ipackets;
2581	accum->vrxs_ibytes += st->vrxs_ibytes;
	accum->vrxs_iqdrops += st->vrxs_iqdrops;
	accum->vrxs_ierrors += st->vrxs_ierrors;
2583	accum->vrxs_csum += st->vrxs_csum;
2584	accum->vrxs_csum_failed += st->vrxs_csum_failed;
2585	accum->vrxs_rescheduled += st->vrxs_rescheduled;
2586}
2587
2588static void
2589vtnet_txq_accum_stats(struct vtnet_txq *txq, struct vtnet_txq_stats *accum)
2590{
2591	struct vtnet_txq_stats *st;
2592
2593	st = &txq->vtntx_stats;
2594
2595	accum->vtxs_opackets += st->vtxs_opackets;
2596	accum->vtxs_obytes += st->vtxs_obytes;
2597	accum->vtxs_csum += st->vtxs_csum;
2598	accum->vtxs_tso += st->vtxs_tso;
2599	accum->vtxs_rescheduled += st->vtxs_rescheduled;
2600}
2601
2602static void
2603vtnet_accumulate_stats(struct vtnet_softc *sc)
2604{
2605	struct ifnet *ifp;
2606	struct vtnet_statistics *st;
2607	struct vtnet_rxq_stats rxaccum;
2608	struct vtnet_txq_stats txaccum;
2609	int i;
2610
2611	ifp = sc->vtnet_ifp;
2612	st = &sc->vtnet_stats;
2613	bzero(&rxaccum, sizeof(struct vtnet_rxq_stats));
2614	bzero(&txaccum, sizeof(struct vtnet_txq_stats));
2615
2616	for (i = 0; i < sc->vtnet_max_vq_pairs; i++) {
2617		vtnet_rxq_accum_stats(&sc->vtnet_rxqs[i], &rxaccum);
2618		vtnet_txq_accum_stats(&sc->vtnet_txqs[i], &txaccum);
2619	}
2620
2621	st->rx_csum_offloaded = rxaccum.vrxs_csum;
2622	st->rx_csum_failed = rxaccum.vrxs_csum_failed;
2623	st->rx_task_rescheduled = rxaccum.vrxs_rescheduled;
2624	st->tx_csum_offloaded = txaccum.vtxs_csum;
2625	st->tx_tso_offloaded = txaccum.vtxs_tso;
2626	st->tx_task_rescheduled = txaccum.vtxs_rescheduled;
2627
2628	/*
2629	 * With the exception of if_ierrors, these ifnet statistics are
2630	 * only updated in the driver, so just set them to our accumulated
2631	 * values. if_ierrors is updated in ether_input() for malformed
2632	 * frames that we should have already discarded.
2633	 */
2634	ifp->if_ipackets = rxaccum.vrxs_ipackets;
2635	ifp->if_iqdrops = rxaccum.vrxs_iqdrops;
2636	ifp->if_ierrors = rxaccum.vrxs_ierrors;
2637	ifp->if_opackets = txaccum.vtxs_opackets;
2638#ifndef VTNET_LEGACY_TX
2639	ifp->if_obytes = txaccum.vtxs_obytes;
2640	ifp->if_omcasts = txaccum.vtxs_omcasts;
2641#endif
2642}
2643
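/*
 * Periodic callout, run once a second with the core lock held: accumulate
 * the per-queue statistics and check the Tx watchdogs, reinitializing the
 * interface if any queue timed out.
 */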
2644static void
2645vtnet_tick(void *xsc)
2646{
2647	struct vtnet_softc *sc;
2648	struct ifnet *ifp;
2649	int i, timedout;
2650
2651	sc = xsc;
2652	ifp = sc->vtnet_ifp;
2653	timedout = 0;
2654
2655	VTNET_CORE_LOCK_ASSERT(sc);
2656	vtnet_accumulate_stats(sc);
2657
2658	for (i = 0; i < sc->vtnet_act_vq_pairs; i++)
2659		timedout |= vtnet_watchdog(&sc->vtnet_txqs[i]);
2660
2661	if (timedout != 0) {
2662		ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
2663		vtnet_init_locked(sc);
2664	} else
2665		callout_schedule(&sc->vtnet_tick_ch, hz);
2666}
2667
2668static void
2669vtnet_start_taskqueues(struct vtnet_softc *sc)
2670{
2671	device_t dev;
2672	struct vtnet_rxq *rxq;
2673	struct vtnet_txq *txq;
2674	int i, error;
2675
2676	dev = sc->vtnet_dev;
2677
2678	/*
2679	 * Errors here are very difficult to recover from - we cannot
2680	 * easily fail because, if this is during boot, we will hang
2681	 * when freeing any successfully started taskqueues because
2682	 * the scheduler isn't up yet.
2683	 *
2684	 * Most drivers just ignore the return value - it only fails
2685	 * with ENOMEM so an error is not likely.
2686	 */
2687	for (i = 0; i < sc->vtnet_max_vq_pairs; i++) {
2688		rxq = &sc->vtnet_rxqs[i];
2689		error = taskqueue_start_threads(&rxq->vtnrx_tq, 1, PI_NET,
2690		    "%s rxq %d", device_get_nameunit(dev), rxq->vtnrx_id);
2691		if (error) {
2692			device_printf(dev, "failed to start rx taskq %d\n",
2693			    rxq->vtnrx_id);
2694		}
2695
2696		txq = &sc->vtnet_txqs[i];
2697		error = taskqueue_start_threads(&txq->vtntx_tq, 1, PI_NET,
2698		    "%s txq %d", device_get_nameunit(dev), txq->vtntx_id);
2699		if (error) {
2700			device_printf(dev, "failed to start tx taskq %d\n",
2701			    txq->vtntx_id);
2702		}
2703	}
2704}
2705
2706static void
2707vtnet_free_taskqueues(struct vtnet_softc *sc)
2708{
2709	struct vtnet_rxq *rxq;
2710	struct vtnet_txq *txq;
2711	int i;
2712
2713	for (i = 0; i < sc->vtnet_max_vq_pairs; i++) {
2714		rxq = &sc->vtnet_rxqs[i];
2715		if (rxq->vtnrx_tq != NULL) {
2716			taskqueue_free(rxq->vtnrx_tq);
			rxq->vtnrx_tq = NULL;
2718		}
2719
2720		txq = &sc->vtnet_txqs[i];
2721		if (txq->vtntx_tq != NULL) {
2722			taskqueue_free(txq->vtntx_tq);
2723			txq->vtntx_tq = NULL;
2724		}
2725	}
2726}
2727
2728static void
2729vtnet_drain_taskqueues(struct vtnet_softc *sc)
2730{
2731	struct vtnet_rxq *rxq;
2732	struct vtnet_txq *txq;
2733	int i;
2734
2735	for (i = 0; i < sc->vtnet_max_vq_pairs; i++) {
2736		rxq = &sc->vtnet_rxqs[i];
2737		if (rxq->vtnrx_tq != NULL)
2738			taskqueue_drain(rxq->vtnrx_tq, &rxq->vtnrx_intrtask);
2739
2740		txq = &sc->vtnet_txqs[i];
2741		if (txq->vtntx_tq != NULL) {
2742			taskqueue_drain(txq->vtntx_tq, &txq->vtntx_intrtask);
2743#ifndef VTNET_LEGACY_TX
2744			taskqueue_drain(txq->vtntx_tq, &txq->vtntx_defrtask);
2745#endif
2746		}
2747	}
2748}
2749
2750static void
2751vtnet_drain_rxtx_queues(struct vtnet_softc *sc)
2752{
2753	struct vtnet_rxq *rxq;
2754	struct vtnet_txq *txq;
2755	int i;
2756
2757	for (i = 0; i < sc->vtnet_act_vq_pairs; i++) {
2758		rxq = &sc->vtnet_rxqs[i];
2759		vtnet_rxq_free_mbufs(rxq);
2760
2761		txq = &sc->vtnet_txqs[i];
2762		vtnet_txq_free_mbufs(txq);
2763	}
2764}
2765
2766static void
2767vtnet_stop_rendezvous(struct vtnet_softc *sc)
2768{
2769	struct vtnet_rxq *rxq;
2770	struct vtnet_txq *txq;
2771	int i;
2772
2773	/*
	 * Lock and unlock the per-queue mutex so we know the stop
2775	 * state is visible. Doing only the active queues should be
2776	 * sufficient, but it does not cost much extra to do all the
2777	 * queues. Note we hold the core mutex here too.
2778	 */
2779	for (i = 0; i < sc->vtnet_max_vq_pairs; i++) {
2780		rxq = &sc->vtnet_rxqs[i];
2781		VTNET_RXQ_LOCK(rxq);
2782		VTNET_RXQ_UNLOCK(rxq);
2783
2784		txq = &sc->vtnet_txqs[i];
2785		VTNET_TXQ_LOCK(txq);
2786		VTNET_TXQ_UNLOCK(txq);
2787	}
2788}
2789
2790static void
2791vtnet_stop(struct vtnet_softc *sc)
2792{
2793	device_t dev;
2794	struct ifnet *ifp;
2795
2796	dev = sc->vtnet_dev;
2797	ifp = sc->vtnet_ifp;
2798
2799	VTNET_CORE_LOCK_ASSERT(sc);
2800
2801	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
2802	sc->vtnet_link_active = 0;
2803	callout_stop(&sc->vtnet_tick_ch);
2804
2805	/* Only advisory. */
2806	vtnet_disable_interrupts(sc);
2807
2808	/*
2809	 * Stop the host adapter. This resets it to the pre-initialized
2810	 * state. It will not generate any interrupts until after it is
2811	 * reinitialized.
2812	 */
2813	virtio_stop(dev);
2814	vtnet_stop_rendezvous(sc);
2815
2816	/* Free any mbufs left in the virtqueues. */
2817	vtnet_drain_rxtx_queues(sc);
2818}
2819
2820static int
2821vtnet_virtio_reinit(struct vtnet_softc *sc)
2822{
2823	device_t dev;
2824	struct ifnet *ifp;
2825	uint64_t features;
2826	int mask, error;
2827
2828	dev = sc->vtnet_dev;
2829	ifp = sc->vtnet_ifp;
2830	features = sc->vtnet_features;
2831
2832	mask = 0;
2833#if defined(INET)
2834	mask |= IFCAP_RXCSUM;
2835#endif
#if defined(INET6)
2837	mask |= IFCAP_RXCSUM_IPV6;
2838#endif
2839
2840	/*
2841	 * Re-negotiate with the host, removing any disabled receive
2842	 * features. Transmit features are disabled only on our side
2843	 * via if_capenable and if_hwassist.
2844	 */
2845
2846	if (ifp->if_capabilities & mask) {
2847		/*
2848		 * We require both IPv4 and IPv6 offloading to be enabled
		 * in order to negotiate it: VirtIO does not distinguish
2850		 * between the two.
2851		 */
2852		if ((ifp->if_capenable & mask) != mask)
2853			features &= ~VIRTIO_NET_F_GUEST_CSUM;
2854	}
2855
2856	if (ifp->if_capabilities & IFCAP_LRO) {
2857		if ((ifp->if_capenable & IFCAP_LRO) == 0)
2858			features &= ~VTNET_LRO_FEATURES;
2859	}
2860
2861	if (ifp->if_capabilities & IFCAP_VLAN_HWFILTER) {
2862		if ((ifp->if_capenable & IFCAP_VLAN_HWFILTER) == 0)
2863			features &= ~VIRTIO_NET_F_CTRL_VLAN;
2864	}
2865
2866	error = virtio_reinit(dev, features);
2867	if (error)
2868		device_printf(dev, "virtio reinit error %d\n", error);
2869
2870	return (error);
2871}
2872
2873static void
2874vtnet_init_rx_filters(struct vtnet_softc *sc)
2875{
2876	struct ifnet *ifp;
2877
2878	ifp = sc->vtnet_ifp;
2879
2880	if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) {
2881		/* Restore promiscuous and all-multicast modes. */
2882		vtnet_rx_filter(sc);
2883		/* Restore filtered MAC addresses. */
2884		vtnet_rx_filter_mac(sc);
2885	}
2886
2887	if (ifp->if_capenable & IFCAP_VLAN_HWFILTER)
2888		vtnet_rx_filter_vlan(sc);
2889}
2890
2891static int
2892vtnet_init_rx_queues(struct vtnet_softc *sc)
2893{
2894	device_t dev;
2895	struct vtnet_rxq *rxq;
2896	int i, clsize, error;
2897
2898	dev = sc->vtnet_dev;
2899
2900	/*
	 * Use the new cluster size if one has been set (via an MTU
2902	 * change). Otherwise, use the standard 2K clusters.
2903	 *
2904	 * BMV: It might make sense to use page sized clusters as
2905	 * the default (depending on the features negotiated).
2906	 */
2907	if (sc->vtnet_rx_new_clsize != 0) {
2908		clsize = sc->vtnet_rx_new_clsize;
2909		sc->vtnet_rx_new_clsize = 0;
2910	} else
2911		clsize = MCLBYTES;
2912
2913	sc->vtnet_rx_clsize = clsize;
2914	sc->vtnet_rx_nmbufs = VTNET_NEEDED_RX_MBUFS(sc, clsize);
2915
2916	KASSERT(sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS ||
2917	    sc->vtnet_rx_nmbufs < sc->vtnet_rx_nsegs,
2918	    ("%s: too many rx mbufs %d for %d segments", __func__,
2919	    sc->vtnet_rx_nmbufs, sc->vtnet_rx_nsegs));
2920
2921#ifdef DEV_NETMAP
2922	if (vtnet_netmap_init_rx_buffers(sc))
		return (0);
2924#endif /* DEV_NETMAP */
2925
2926	for (i = 0; i < sc->vtnet_act_vq_pairs; i++) {
2927		rxq = &sc->vtnet_rxqs[i];
2928
2929		/* Hold the lock to satisfy asserts. */
2930		VTNET_RXQ_LOCK(rxq);
2931		error = vtnet_rxq_populate(rxq);
2932		VTNET_RXQ_UNLOCK(rxq);
2933
2934		if (error) {
2935			device_printf(dev,
2936			    "cannot allocate mbufs for Rx queue %d\n", i);
2937			return (error);
2938		}
2939	}
2940
2941	return (0);
2942}
2943
2944static int
2945vtnet_init_tx_queues(struct vtnet_softc *sc)
2946{
2947	struct vtnet_txq *txq;
2948	int i;
2949
2950	for (i = 0; i < sc->vtnet_act_vq_pairs; i++) {
2951		txq = &sc->vtnet_txqs[i];
2952		txq->vtntx_watchdog = 0;
2953	}
2954
2955	return (0);
2956}
2957
2958static int
2959vtnet_init_rxtx_queues(struct vtnet_softc *sc)
2960{
2961	int error;
2962
2963	error = vtnet_init_rx_queues(sc);
2964	if (error)
2965		return (error);
2966
2967	error = vtnet_init_tx_queues(sc);
2968	if (error)
2969		return (error);
2970
2971	return (0);
2972}
2973
2974static void
2975vtnet_set_active_vq_pairs(struct vtnet_softc *sc)
2976{
2977	device_t dev;
2978	int npairs;
2979
2980	dev = sc->vtnet_dev;
2981
2982	if ((sc->vtnet_flags & VTNET_FLAG_MULTIQ) == 0) {
2983		MPASS(sc->vtnet_max_vq_pairs == 1);
2984		sc->vtnet_act_vq_pairs = 1;
2985		return;
2986	}
2987
2988	/* BMV: Just use the maximum configured for now. */
2989	npairs = sc->vtnet_max_vq_pairs;
2990
2991	if (vtnet_ctrl_mq_cmd(sc, npairs) != 0) {
2992		device_printf(dev,
2993		    "cannot set active queue pairs to %d\n", npairs);
2994		npairs = 1;
2995	}
2996
2997	sc->vtnet_act_vq_pairs = npairs;
2998}
2999
3000static int
3001vtnet_reinit(struct vtnet_softc *sc)
3002{
3003	struct ifnet *ifp;
3004	int error;
3005
3006	ifp = sc->vtnet_ifp;
3007
3008	/* Use the current MAC address. */
3009	bcopy(IF_LLADDR(ifp), sc->vtnet_hwaddr, ETHER_ADDR_LEN);
3010	vtnet_set_hwaddr(sc);
3011
3012	vtnet_set_active_vq_pairs(sc);
3013
3014	ifp->if_hwassist = 0;
3015	if (ifp->if_capenable & IFCAP_TXCSUM)
3016		ifp->if_hwassist |= VTNET_CSUM_OFFLOAD;
3017	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3018		ifp->if_hwassist |= VTNET_CSUM_OFFLOAD_IPV6;
3019	if (ifp->if_capenable & IFCAP_TSO4)
3020		ifp->if_hwassist |= CSUM_TSO;
3021	if (ifp->if_capenable & IFCAP_TSO6)
3022		ifp->if_hwassist |= CSUM_TSO; /* No CSUM_TSO_IPV6. */
3023
3024	if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ)
3025		vtnet_init_rx_filters(sc);
3026
3027	error = vtnet_init_rxtx_queues(sc);
3028	if (error)
3029		return (error);
3030
3031	vtnet_enable_interrupts(sc);
3032	ifp->if_drv_flags |= IFF_DRV_RUNNING;
3033
3034	return (0);
3035}
3036
3037static void
3038vtnet_init_locked(struct vtnet_softc *sc)
3039{
3040	device_t dev;
3041	struct ifnet *ifp;
3042
3043	dev = sc->vtnet_dev;
3044	ifp = sc->vtnet_ifp;
3045
3046	VTNET_CORE_LOCK_ASSERT(sc);
3047
3048	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3049		return;
3050
3051	vtnet_stop(sc);
3052
3053	/* Reinitialize with the host. */
3054	if (vtnet_virtio_reinit(sc) != 0)
3055		goto fail;
3056
3057	if (vtnet_reinit(sc) != 0)
3058		goto fail;
3059
3060	virtio_reinit_complete(dev);
3061
3062	vtnet_update_link_status(sc);
3063	callout_reset(&sc->vtnet_tick_ch, hz, vtnet_tick, sc);
3064
3065	return;
3066
3067fail:
3068	vtnet_stop(sc);
3069}
3070
3071static void
3072vtnet_init(void *xsc)
3073{
3074	struct vtnet_softc *sc;
3075
3076	sc = xsc;
3077
3078#ifdef DEV_NETMAP
3079	if (!NA(sc->vtnet_ifp)) {
3080		D("try to attach again");
3081		vtnet_netmap_attach(sc);
3082	}
3083#endif /* DEV_NETMAP */
3084
3085	VTNET_CORE_LOCK(sc);
3086	vtnet_init_locked(sc);
3087	VTNET_CORE_UNLOCK(sc);
3088}
3089
3090static void
3091vtnet_free_ctrl_vq(struct vtnet_softc *sc)
3092{
3093	struct virtqueue *vq;
3094
3095	vq = sc->vtnet_ctrl_vq;
3096
3097	/*
3098	 * The control virtqueue is only polled and therefore it should
3099	 * already be empty.
3100	 */
3101	KASSERT(virtqueue_empty(vq),
3102	    ("%s: ctrl vq %p not empty", __func__, vq));
3103}
3104
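/*
 * Issue a command on the control virtqueue. The sglist contains the
 * 'readable' segments (the command header and arguments) followed by the
 * 'writable' segments (the ack byte written by the host); the virtqueue
 * is polled until the host completes the command.
 */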
3105static void
3106vtnet_exec_ctrl_cmd(struct vtnet_softc *sc, void *cookie,
3107    struct sglist *sg, int readable, int writable)
3108{
3109	struct virtqueue *vq;
3110
3111	vq = sc->vtnet_ctrl_vq;
3112
3113	VTNET_CORE_LOCK_ASSERT(sc);
3114	KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_VQ,
3115	    ("%s: CTRL_VQ feature not negotiated", __func__));
3116
3117	if (!virtqueue_empty(vq))
3118		return;
3119	if (virtqueue_enqueue(vq, cookie, sg, readable, writable) != 0)
3120		return;
3121
3122	/*
3123	 * Poll for the response, but the command is likely already
3124	 * done when we return from the notify.
3125	 */
3126	virtqueue_notify(vq);
3127	virtqueue_poll(vq, NULL);
3128}
3129
3130static int
3131vtnet_ctrl_mac_cmd(struct vtnet_softc *sc, uint8_t *hwaddr)
3132{
3133	struct virtio_net_ctrl_hdr hdr __aligned(2);
3134	struct sglist_seg segs[3];
3135	struct sglist sg;
3136	uint8_t ack;
3137	int error;
3138
3139	hdr.class = VIRTIO_NET_CTRL_MAC;
3140	hdr.cmd = VIRTIO_NET_CTRL_MAC_ADDR_SET;
3141	ack = VIRTIO_NET_ERR;
3142
3143	sglist_init(&sg, 3, segs);
3144	error = 0;
3145	error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr));
3146	error |= sglist_append(&sg, hwaddr, ETHER_ADDR_LEN);
3147	error |= sglist_append(&sg, &ack, sizeof(uint8_t));
3148	KASSERT(error == 0 && sg.sg_nseg == 3,
3149	    ("%s: error %d adding set MAC msg to sglist", __func__, error));
3150
3151	vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1);
3152
3153	return (ack == VIRTIO_NET_OK ? 0 : EIO);
3154}
3155
3156static int
3157vtnet_ctrl_mq_cmd(struct vtnet_softc *sc, uint16_t npairs)
3158{
3159	struct sglist_seg segs[3];
3160	struct sglist sg;
3161	struct {
3162		struct virtio_net_ctrl_hdr hdr;
3163		uint8_t pad1;
3164		struct virtio_net_ctrl_mq mq;
3165		uint8_t pad2;
3166		uint8_t ack;
3167	} s __aligned(2);
3168	int error;
3169
3170	s.hdr.class = VIRTIO_NET_CTRL_MQ;
3171	s.hdr.cmd = VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET;
3172	s.mq.virtqueue_pairs = npairs;
3173	s.ack = VIRTIO_NET_ERR;
3174
3175	sglist_init(&sg, 3, segs);
3176	error = 0;
3177	error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr));
3178	error |= sglist_append(&sg, &s.mq, sizeof(struct virtio_net_ctrl_mq));
3179	error |= sglist_append(&sg, &s.ack, sizeof(uint8_t));
3180	KASSERT(error == 0 && sg.sg_nseg == 3,
3181	    ("%s: error %d adding MQ message to sglist", __func__, error));
3182
3183	vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1);
3184
3185	return (s.ack == VIRTIO_NET_OK ? 0 : EIO);
3186}
3187
3188static int
3189vtnet_ctrl_rx_cmd(struct vtnet_softc *sc, int cmd, int on)
3190{
3191	struct sglist_seg segs[3];
3192	struct sglist sg;
3193	struct {
3194		struct virtio_net_ctrl_hdr hdr;
3195		uint8_t pad1;
3196		uint8_t onoff;
3197		uint8_t pad2;
3198		uint8_t ack;
3199	} s __aligned(2);
3200	int error;
3201
3202	KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_RX,
3203	    ("%s: CTRL_RX feature not negotiated", __func__));
3204
3205	s.hdr.class = VIRTIO_NET_CTRL_RX;
3206	s.hdr.cmd = cmd;
3207	s.onoff = !!on;
3208	s.ack = VIRTIO_NET_ERR;
3209
3210	sglist_init(&sg, 3, segs);
3211	error = 0;
3212	error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr));
3213	error |= sglist_append(&sg, &s.onoff, sizeof(uint8_t));
3214	error |= sglist_append(&sg, &s.ack, sizeof(uint8_t));
3215	KASSERT(error == 0 && sg.sg_nseg == 3,
3216	    ("%s: error %d adding Rx message to sglist", __func__, error));
3217
3218	vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1);
3219
3220	return (s.ack == VIRTIO_NET_OK ? 0 : EIO);
3221}
3222
3223static int
3224vtnet_set_promisc(struct vtnet_softc *sc, int on)
3225{
3226
3227	return (vtnet_ctrl_rx_cmd(sc, VIRTIO_NET_CTRL_RX_PROMISC, on));
3228}
3229
3230static int
3231vtnet_set_allmulti(struct vtnet_softc *sc, int on)
3232{
3233
3234	return (vtnet_ctrl_rx_cmd(sc, VIRTIO_NET_CTRL_RX_ALLMULTI, on));
3235}
3236
3237/*
3238 * The device defaults to promiscuous mode for backwards compatibility.
3239 * Turn it off at attach time if possible.
3240 */
3241static void
3242vtnet_attach_disable_promisc(struct vtnet_softc *sc)
3243{
3244	struct ifnet *ifp;
3245
3246	ifp = sc->vtnet_ifp;
3247
3248	VTNET_CORE_LOCK(sc);
3249	if ((sc->vtnet_flags & VTNET_FLAG_CTRL_RX) == 0) {
3250		ifp->if_flags |= IFF_PROMISC;
3251	} else if (vtnet_set_promisc(sc, 0) != 0) {
3252		ifp->if_flags |= IFF_PROMISC;
3253		device_printf(sc->vtnet_dev,
3254		    "cannot disable default promiscuous mode\n");
3255	}
3256	VTNET_CORE_UNLOCK(sc);
3257}
3258
3259static void
3260vtnet_rx_filter(struct vtnet_softc *sc)
3261{
3262	device_t dev;
3263	struct ifnet *ifp;
3264
3265	dev = sc->vtnet_dev;
3266	ifp = sc->vtnet_ifp;
3267
3268	VTNET_CORE_LOCK_ASSERT(sc);
3269
3270	if (vtnet_set_promisc(sc, ifp->if_flags & IFF_PROMISC) != 0)
3271		device_printf(dev, "cannot %s promiscuous mode\n",
3272		    ifp->if_flags & IFF_PROMISC ? "enable" : "disable");
3273
3274	if (vtnet_set_allmulti(sc, ifp->if_flags & IFF_ALLMULTI) != 0)
3275		device_printf(dev, "cannot %s all-multicast mode\n",
3276		    ifp->if_flags & IFF_ALLMULTI ? "enable" : "disable");
3277}
3278
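/*
 * Program the host's unicast and multicast MAC filter tables from the
 * interface's address lists. If either table would overflow, fall back to
 * promiscuous or all-multicast mode instead.
 */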
3279static void
3280vtnet_rx_filter_mac(struct vtnet_softc *sc)
3281{
3282	struct virtio_net_ctrl_hdr hdr __aligned(2);
3283	struct vtnet_mac_filter *filter;
3284	struct sglist_seg segs[4];
3285	struct sglist sg;
3286	struct ifnet *ifp;
3287	struct ifaddr *ifa;
3288	struct ifmultiaddr *ifma;
3289	int ucnt, mcnt, promisc, allmulti, error;
3290	uint8_t ack;
3291
3292	ifp = sc->vtnet_ifp;
3293	filter = sc->vtnet_mac_filter;
3294	ucnt = 0;
3295	mcnt = 0;
3296	promisc = 0;
3297	allmulti = 0;
3298
3299	VTNET_CORE_LOCK_ASSERT(sc);
3300	KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_RX,
3301	    ("%s: CTRL_RX feature not negotiated", __func__));
3302
3303	/* Unicast MAC addresses: */
3304	if_addr_rlock(ifp);
3305	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
3306		if (ifa->ifa_addr->sa_family != AF_LINK)
3307			continue;
3308		else if (memcmp(LLADDR((struct sockaddr_dl *)ifa->ifa_addr),
3309		    sc->vtnet_hwaddr, ETHER_ADDR_LEN) == 0)
3310			continue;
3311		else if (ucnt == VTNET_MAX_MAC_ENTRIES) {
3312			promisc = 1;
3313			break;
3314		}
3315
3316		bcopy(LLADDR((struct sockaddr_dl *)ifa->ifa_addr),
3317		    &filter->vmf_unicast.macs[ucnt], ETHER_ADDR_LEN);
3318		ucnt++;
3319	}
3320	if_addr_runlock(ifp);
3321
3322	if (promisc != 0) {
3323		filter->vmf_unicast.nentries = 0;
3324		if_printf(ifp, "more than %d MAC addresses assigned, "
3325		    "falling back to promiscuous mode\n",
3326		    VTNET_MAX_MAC_ENTRIES);
3327	} else
3328		filter->vmf_unicast.nentries = ucnt;
3329
3330	/* Multicast MAC addresses: */
3331	if_maddr_rlock(ifp);
3332	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
3333		if (ifma->ifma_addr->sa_family != AF_LINK)
3334			continue;
3335		else if (mcnt == VTNET_MAX_MAC_ENTRIES) {
3336			allmulti = 1;
3337			break;
3338		}
3339
3340		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
3341		    &filter->vmf_multicast.macs[mcnt], ETHER_ADDR_LEN);
3342		mcnt++;
3343	}
3344	if_maddr_runlock(ifp);
3345
3346	if (allmulti != 0) {
3347		filter->vmf_multicast.nentries = 0;
3348		if_printf(ifp, "more than %d multicast MAC addresses "
3349		    "assigned, falling back to all-multicast mode\n",
3350		    VTNET_MAX_MAC_ENTRIES);
3351	} else
3352		filter->vmf_multicast.nentries = mcnt;
3353
3354	if (promisc != 0 && allmulti != 0)
3355		goto out;
3356
3357	hdr.class = VIRTIO_NET_CTRL_MAC;
3358	hdr.cmd = VIRTIO_NET_CTRL_MAC_TABLE_SET;
3359	ack = VIRTIO_NET_ERR;
3360
3361	sglist_init(&sg, 4, segs);
3362	error = 0;
3363	error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr));
3364	error |= sglist_append(&sg, &filter->vmf_unicast,
3365	    sizeof(uint32_t) + filter->vmf_unicast.nentries * ETHER_ADDR_LEN);
3366	error |= sglist_append(&sg, &filter->vmf_multicast,
3367	    sizeof(uint32_t) + filter->vmf_multicast.nentries * ETHER_ADDR_LEN);
3368	error |= sglist_append(&sg, &ack, sizeof(uint8_t));
3369	KASSERT(error == 0 && sg.sg_nseg == 4,
3370	    ("%s: error %d adding MAC filter msg to sglist", __func__, error));
3371
3372	vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1);
3373
3374	if (ack != VIRTIO_NET_OK)
3375		if_printf(ifp, "error setting host MAC filter table\n");
3376
3377out:
3378	if (promisc != 0 && vtnet_set_promisc(sc, 1) != 0)
3379		if_printf(ifp, "cannot enable promiscuous mode\n");
3380	if (allmulti != 0 && vtnet_set_allmulti(sc, 1) != 0)
3381		if_printf(ifp, "cannot enable all-multicast mode\n");
3382}
3383
3384static int
3385vtnet_exec_vlan_filter(struct vtnet_softc *sc, int add, uint16_t tag)
3386{
3387	struct sglist_seg segs[3];
3388	struct sglist sg;
3389	struct {
3390		struct virtio_net_ctrl_hdr hdr;
3391		uint8_t pad1;
3392		uint16_t tag;
3393		uint8_t pad2;
3394		uint8_t ack;
3395	} s __aligned(2);
3396	int error;
3397
3398	s.hdr.class = VIRTIO_NET_CTRL_VLAN;
3399	s.hdr.cmd = add ? VIRTIO_NET_CTRL_VLAN_ADD : VIRTIO_NET_CTRL_VLAN_DEL;
3400	s.tag = tag;
3401	s.ack = VIRTIO_NET_ERR;
3402
3403	sglist_init(&sg, 3, segs);
3404	error = 0;
3405	error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr));
3406	error |= sglist_append(&sg, &s.tag, sizeof(uint16_t));
3407	error |= sglist_append(&sg, &s.ack, sizeof(uint8_t));
3408	KASSERT(error == 0 && sg.sg_nseg == 3,
3409	    ("%s: error %d adding VLAN message to sglist", __func__, error));
3410
3411	vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1);
3412
3413	return (s.ack == VIRTIO_NET_OK ? 0 : EIO);
3414}
3415
3416static void
3417vtnet_rx_filter_vlan(struct vtnet_softc *sc)
3418{
3419	uint32_t w;
3420	uint16_t tag;
3421	int i, bit;
3422
3423	VTNET_CORE_LOCK_ASSERT(sc);
3424	KASSERT(sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER,
3425	    ("%s: VLAN_FILTER feature not negotiated", __func__));
3426
3427	/* Enable the filter for each configured VLAN. */
3428	for (i = 0; i < VTNET_VLAN_FILTER_NWORDS; i++) {
3429		w = sc->vtnet_vlan_filter[i];
3430
3431		while ((bit = ffs(w) - 1) != -1) {
3432			w &= ~(1 << bit);
3433			tag = sizeof(w) * CHAR_BIT * i + bit;
3434
3435			if (vtnet_exec_vlan_filter(sc, 1, tag) != 0) {
3436				device_printf(sc->vtnet_dev,
3437				    "cannot enable VLAN %d filter\n", tag);
3438			}
3439		}
3440	}
3441}
3442
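/*
 * Add or remove a VLAN tag in the local shadow bitmap (one bit per
 * possible tag) and, when hardware VLAN filtering is enabled, update the
 * host's filter table as well.
 */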
3443static void
3444vtnet_update_vlan_filter(struct vtnet_softc *sc, int add, uint16_t tag)
3445{
3446	struct ifnet *ifp;
3447	int idx, bit;
3448
3449	ifp = sc->vtnet_ifp;
3450	idx = (tag >> 5) & 0x7F;
3451	bit = tag & 0x1F;
3452
3453	if (tag == 0 || tag > 4095)
3454		return;
3455
3456	VTNET_CORE_LOCK(sc);
3457
3458	if (add)
3459		sc->vtnet_vlan_filter[idx] |= (1 << bit);
3460	else
3461		sc->vtnet_vlan_filter[idx] &= ~(1 << bit);
3462
3463	if (ifp->if_capenable & IFCAP_VLAN_HWFILTER &&
3464	    vtnet_exec_vlan_filter(sc, add, tag) != 0) {
3465		device_printf(sc->vtnet_dev,
3466		    "cannot %s VLAN %d %s the host filter table\n",
3467		    add ? "add" : "remove", tag, add ? "to" : "from");
3468	}
3469
3470	VTNET_CORE_UNLOCK(sc);
3471}
3472
3473static void
3474vtnet_register_vlan(void *arg, struct ifnet *ifp, uint16_t tag)
3475{
3476
3477	if (ifp->if_softc != arg)
3478		return;
3479
3480	vtnet_update_vlan_filter(arg, 1, tag);
3481}
3482
3483static void
3484vtnet_unregister_vlan(void *arg, struct ifnet *ifp, uint16_t tag)
3485{
3486
3487	if (ifp->if_softc != arg)
3488		return;
3489
3490	vtnet_update_vlan_filter(arg, 0, tag);
3491}
3492
3493static int
3494vtnet_is_link_up(struct vtnet_softc *sc)
3495{
3496	device_t dev;
3497	struct ifnet *ifp;
3498	uint16_t status;
3499
3500	dev = sc->vtnet_dev;
3501	ifp = sc->vtnet_ifp;
3502
3503	if ((ifp->if_capabilities & IFCAP_LINKSTATE) == 0)
3504		status = VIRTIO_NET_S_LINK_UP;
3505	else
3506		status = virtio_read_dev_config_2(dev,
3507		    offsetof(struct virtio_net_config, status));
3508
3509	return ((status & VIRTIO_NET_S_LINK_UP) != 0);
3510}
3511
3512static void
3513vtnet_update_link_status(struct vtnet_softc *sc)
3514{
3515	struct ifnet *ifp;
3516	int link;
3517
3518	ifp = sc->vtnet_ifp;
3519
3520	VTNET_CORE_LOCK_ASSERT(sc);
3521	link = vtnet_is_link_up(sc);
3522
3523	/* Notify if the link status has changed. */
3524	if (link != 0 && sc->vtnet_link_active == 0) {
3525		sc->vtnet_link_active = 1;
3526		if_link_state_change(ifp, LINK_STATE_UP);
3527	} else if (link == 0 && sc->vtnet_link_active != 0) {
3528		sc->vtnet_link_active = 0;
3529		if_link_state_change(ifp, LINK_STATE_DOWN);
3530	}
3531}
3532
3533static int
3534vtnet_ifmedia_upd(struct ifnet *ifp)
3535{
3536	struct vtnet_softc *sc;
3537	struct ifmedia *ifm;
3538
3539	sc = ifp->if_softc;
3540	ifm = &sc->vtnet_media;
3541
3542	if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER)
3543		return (EINVAL);
3544
3545	return (0);
3546}
3547
3548static void
3549vtnet_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
3550{
3551	struct vtnet_softc *sc;
3552
3553	sc = ifp->if_softc;
3554
3555	ifmr->ifm_status = IFM_AVALID;
3556	ifmr->ifm_active = IFM_ETHER;
3557
3558	VTNET_CORE_LOCK(sc);
3559	if (vtnet_is_link_up(sc) != 0) {
3560		ifmr->ifm_status |= IFM_ACTIVE;
3561		ifmr->ifm_active |= VTNET_MEDIATYPE;
3562	} else
3563		ifmr->ifm_active |= IFM_NONE;
3564	VTNET_CORE_UNLOCK(sc);
3565}
3566
3567static void
3568vtnet_set_hwaddr(struct vtnet_softc *sc)
3569{
3570	device_t dev;
3571	int i;
3572
3573	dev = sc->vtnet_dev;
3574
3575	if (sc->vtnet_flags & VTNET_FLAG_CTRL_MAC) {
3576		if (vtnet_ctrl_mac_cmd(sc, sc->vtnet_hwaddr) != 0)
3577			device_printf(dev, "unable to set MAC address\n");
3578	} else if (sc->vtnet_flags & VTNET_FLAG_MAC) {
3579		for (i = 0; i < ETHER_ADDR_LEN; i++) {
3580			virtio_write_dev_config_1(dev,
3581			    offsetof(struct virtio_net_config, mac) + i,
3582			    sc->vtnet_hwaddr[i]);
3583		}
3584	}
3585}
3586
3587static void
3588vtnet_get_hwaddr(struct vtnet_softc *sc)
3589{
3590	device_t dev;
3591	int i;
3592
3593	dev = sc->vtnet_dev;
3594
3595	if ((sc->vtnet_flags & VTNET_FLAG_MAC) == 0) {
3596		/*
3597		 * Generate a random locally administered unicast address.
3598		 *
3599		 * It would be nice to generate the same MAC address across
3600		 * reboots, but it seems all the hosts currently available
3601		 * support the MAC feature, so this isn't too important.
3602		 */
3603		sc->vtnet_hwaddr[0] = 0xB2;
3604		arc4rand(&sc->vtnet_hwaddr[1], ETHER_ADDR_LEN - 1, 0);
3605		vtnet_set_hwaddr(sc);
3606		return;
3607	}
3608
3609	for (i = 0; i < ETHER_ADDR_LEN; i++) {
3610		sc->vtnet_hwaddr[i] = virtio_read_dev_config_1(dev,
3611		    offsetof(struct virtio_net_config, mac) + i);
3612	}
3613}
3614
3615static void
3616vtnet_vlan_tag_remove(struct mbuf *m)
3617{
3618	struct ether_vlan_header *evh;
3619
3620	evh = mtod(m, struct ether_vlan_header *);
3621	m->m_pkthdr.ether_vtag = ntohs(evh->evl_tag);
3622	m->m_flags |= M_VLANTAG;
3623
3624	/* Strip the 802.1Q header. */
3625	bcopy((char *) evh, (char *) evh + ETHER_VLAN_ENCAP_LEN,
3626	    ETHER_HDR_LEN - ETHER_TYPE_LEN);
3627	m_adj(m, ETHER_VLAN_ENCAP_LEN);
3628}
3629
3630static void
3631vtnet_set_rx_process_limit(struct vtnet_softc *sc)
3632{
3633	int limit;
3634
3635	limit = vtnet_tunable_int(sc, "rx_process_limit",
3636	    vtnet_rx_process_limit);
3637	if (limit < 0)
3638		limit = INT_MAX;
3639	sc->vtnet_rx_process_limit = limit;
3640}
3641
3642static void
3643vtnet_set_tx_intr_threshold(struct vtnet_softc *sc)
3644{
3645	device_t dev;
3646	int size, thresh;
3647
3648	dev = sc->vtnet_dev;
3649	size = virtqueue_size(sc->vtnet_txqs[0].vtntx_vq);
3650
3651	/*
3652	 * The Tx interrupt is disabled until the queue free count falls
3653	 * below our threshold. Completed frames are drained from the Tx
3654	 * virtqueue before transmitting new frames and in the watchdog
3655	 * callout, so the frequency of Tx interrupts is greatly reduced,
3656	 * at the cost of not freeing mbufs as quickly as they otherwise
3657	 * would be.
3658	 *
3659	 * N.B. We assume all the Tx queues are the same size.
3660	 */
3661	thresh = size / 4;
3662
3663	/*
	 * Without indirect descriptors, leave enough room for the largest
	 * number of segments we handle.
3666	 */
3667	if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC) == 0 &&
3668	    thresh < sc->vtnet_tx_nsegs)
3669		thresh = sc->vtnet_tx_nsegs;
3670
3671	sc->vtnet_tx_intr_thresh = thresh;
3672}
3673
3674static void
3675vtnet_setup_rxq_sysctl(struct sysctl_ctx_list *ctx,
3676    struct sysctl_oid_list *child, struct vtnet_rxq *rxq)
3677{
3678	struct sysctl_oid *node;
3679	struct sysctl_oid_list *list;
3680	struct vtnet_rxq_stats *stats;
3681	char namebuf[16];
3682
3683	snprintf(namebuf, sizeof(namebuf), "rxq%d", rxq->vtnrx_id);
3684	node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf,
3685	    CTLFLAG_RD, NULL, "Receive Queue");
3686	list = SYSCTL_CHILDREN(node);
3687
3688	stats = &rxq->vtnrx_stats;
3689
3690	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ipackets", CTLFLAG_RD,
3691	    &stats->vrxs_ipackets, "Receive packets");
3692	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ibytes", CTLFLAG_RD,
3693	    &stats->vrxs_ibytes, "Receive bytes");
3694	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "iqdrops", CTLFLAG_RD,
3695	    &stats->vrxs_iqdrops, "Receive drops");
3696	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ierrors", CTLFLAG_RD,
3697	    &stats->vrxs_ierrors, "Receive errors");
3698	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum", CTLFLAG_RD,
3699	    &stats->vrxs_csum, "Receive checksum offloaded");
3700	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum_failed", CTLFLAG_RD,
3701	    &stats->vrxs_csum_failed, "Receive checksum offload failed");
3702	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "rescheduled", CTLFLAG_RD,
3703	    &stats->vrxs_rescheduled,
3704	    "Receive interrupt handler rescheduled");
3705}
3706
3707static void
3708vtnet_setup_txq_sysctl(struct sysctl_ctx_list *ctx,
3709    struct sysctl_oid_list *child, struct vtnet_txq *txq)
3710{
3711	struct sysctl_oid *node;
3712	struct sysctl_oid_list *list;
3713	struct vtnet_txq_stats *stats;
3714	char namebuf[16];
3715
3716	snprintf(namebuf, sizeof(namebuf), "txq%d", txq->vtntx_id);
3717	node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf,
3718	    CTLFLAG_RD, NULL, "Transmit Queue");
3719	list = SYSCTL_CHILDREN(node);
3720
3721	stats = &txq->vtntx_stats;
3722
3723	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "opackets", CTLFLAG_RD,
3724	    &stats->vtxs_opackets, "Transmit packets");
3725	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "obytes", CTLFLAG_RD,
3726	    &stats->vtxs_obytes, "Transmit bytes");
3727	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "omcasts", CTLFLAG_RD,
3728	    &stats->vtxs_omcasts, "Transmit multicasts");
3729	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum", CTLFLAG_RD,
3730	    &stats->vtxs_csum, "Transmit checksum offloaded");
3731	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "tso", CTLFLAG_RD,
3732	    &stats->vtxs_tso, "Transmit segmentation offloaded");
3733	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "rescheduled", CTLFLAG_RD,
3734	    &stats->vtxs_rescheduled,
3735	    "Transmit interrupt handler rescheduled");
3736}
3737
3738static void
3739vtnet_setup_queue_sysctl(struct vtnet_softc *sc)
3740{
3741	device_t dev;
3742	struct sysctl_ctx_list *ctx;
3743	struct sysctl_oid *tree;
3744	struct sysctl_oid_list *child;
3745	int i;
3746
3747	dev = sc->vtnet_dev;
3748	ctx = device_get_sysctl_ctx(dev);
3749	tree = device_get_sysctl_tree(dev);
3750	child = SYSCTL_CHILDREN(tree);
3751
3752	for (i = 0; i < sc->vtnet_max_vq_pairs; i++) {
3753		vtnet_setup_rxq_sysctl(ctx, child, &sc->vtnet_rxqs[i]);
3754		vtnet_setup_txq_sysctl(ctx, child, &sc->vtnet_txqs[i]);
3755	}
3756}
3757
3758static void
3759vtnet_setup_stat_sysctl(struct sysctl_ctx_list *ctx,
3760    struct sysctl_oid_list *child, struct vtnet_softc *sc)
3761{
3762	struct vtnet_statistics *stats;
3763
3764	stats = &sc->vtnet_stats;
3765
3766	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "mbuf_alloc_failed",
3767	    CTLFLAG_RD, &stats->mbuf_alloc_failed,
3768	    "Mbuf cluster allocation failures");
3769
3770	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_frame_too_large",
3771	    CTLFLAG_RD, &stats->rx_frame_too_large,
3772	    "Received frame larger than the mbuf chain");
3773	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_enq_replacement_failed",
3774	    CTLFLAG_RD, &stats->rx_enq_replacement_failed,
3775	    "Enqueuing the replacement receive mbuf failed");
3776	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_mergeable_failed",
3777	    CTLFLAG_RD, &stats->rx_mergeable_failed,
3778	    "Mergeable buffers receive failures");
3779	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_ethtype",
3780	    CTLFLAG_RD, &stats->rx_csum_bad_ethtype,
3781	    "Received checksum offloaded buffer with unsupported "
3782	    "Ethernet type");
3783	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_ipproto",
3784	    CTLFLAG_RD, &stats->rx_csum_bad_ipproto,
3785	    "Received checksum offloaded buffer with incorrect IP protocol");
3786	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_offset",
3787	    CTLFLAG_RD, &stats->rx_csum_bad_offset,
3788	    "Received checksum offloaded buffer with incorrect offset");
3789	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_proto",
3790	    CTLFLAG_RD, &stats->rx_csum_bad_proto,
3791	    "Received checksum offloaded buffer with incorrect protocol");
3792	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_failed",
3793	    CTLFLAG_RD, &stats->rx_csum_failed,
3794	    "Received buffer checksum offload failed");
3795	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_offloaded",
3796	    CTLFLAG_RD, &stats->rx_csum_offloaded,
3797	    "Received buffer checksum offload succeeded");
3798	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_task_rescheduled",
3799	    CTLFLAG_RD, &stats->rx_task_rescheduled,
3800	    "Times the receive interrupt task rescheduled itself");
3801
3802	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_csum_bad_ethtype",
3803	    CTLFLAG_RD, &stats->tx_csum_bad_ethtype,
3804	    "Aborted transmit of checksum offloaded buffer with unknown "
3805	    "Ethernet type");
3806	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_bad_ethtype",
3807	    CTLFLAG_RD, &stats->tx_tso_bad_ethtype,
3808	    "Aborted transmit of TSO buffer with unknown Ethernet type");
3809	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_not_tcp",
3810	    CTLFLAG_RD, &stats->tx_tso_not_tcp,
3811	    "Aborted transmit of TSO buffer with non TCP protocol");
3812	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_defragged",
3813	    CTLFLAG_RD, &stats->tx_defragged,
3814	    "Transmit mbufs defragged");
3815	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_defrag_failed",
3816	    CTLFLAG_RD, &stats->tx_defrag_failed,
3817	    "Aborted transmit of buffer because defrag failed");
3818	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_csum_offloaded",
3819	    CTLFLAG_RD, &stats->tx_csum_offloaded,
3820	    "Offloaded checksum of transmitted buffer");
3821	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_offloaded",
3822	    CTLFLAG_RD, &stats->tx_tso_offloaded,
3823	    "Segmentation offload of transmitted buffer");
3824	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_task_rescheduled",
3825	    CTLFLAG_RD, &stats->tx_task_rescheduled,
3826	    "Times the transmit interrupt task rescheduled itself");
3827}
3828
3829static void
3830vtnet_setup_sysctl(struct vtnet_softc *sc)
3831{
3832	device_t dev;
3833	struct sysctl_ctx_list *ctx;
3834	struct sysctl_oid *tree;
3835	struct sysctl_oid_list *child;
3836
3837	dev = sc->vtnet_dev;
3838	ctx = device_get_sysctl_ctx(dev);
3839	tree = device_get_sysctl_tree(dev);
3840	child = SYSCTL_CHILDREN(tree);
3841
3842	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "max_vq_pairs",
3843	    CTLFLAG_RD, &sc->vtnet_max_vq_pairs, 0,
3844	    "Maximum number of supported virtqueue pairs");
3845	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "act_vq_pairs",
3846	    CTLFLAG_RD, &sc->vtnet_act_vq_pairs, 0,
3847	    "Number of active virtqueue pairs");
3848
3849	vtnet_setup_stat_sysctl(ctx, child, sc);
3850}
3851
3852static int
3853vtnet_rxq_enable_intr(struct vtnet_rxq *rxq)
3854{
3855
3856	return (virtqueue_enable_intr(rxq->vtnrx_vq));
3857}
3858
3859static void
3860vtnet_rxq_disable_intr(struct vtnet_rxq *rxq)
3861{
3862
3863	virtqueue_disable_intr(rxq->vtnrx_vq);
3864}
3865
3866static int
3867vtnet_txq_enable_intr(struct vtnet_txq *txq)
3868{
3869	struct virtqueue *vq;
3870
3871	vq = txq->vtntx_vq;
3872
3873	if (vtnet_txq_below_threshold(txq) != 0)
3874		return (virtqueue_postpone_intr(vq, VQ_POSTPONE_LONG));
3875
3876	/*
3877	 * The free count is above our threshold. Keep the Tx interrupt
3878	 * disabled until the queue is fuller.
3879	 */
3880	return (0);
3881}
3882
3883static void
3884vtnet_txq_disable_intr(struct vtnet_txq *txq)
3885{
3886
3887	virtqueue_disable_intr(txq->vtntx_vq);
3888}
3889
3890static void
3891vtnet_enable_rx_interrupts(struct vtnet_softc *sc)
3892{
3893	int i;
3894
3895	for (i = 0; i < sc->vtnet_act_vq_pairs; i++)
3896		vtnet_rxq_enable_intr(&sc->vtnet_rxqs[i]);
3897}
3898
3899static void
3900vtnet_enable_tx_interrupts(struct vtnet_softc *sc)
3901{
3902	int i;
3903
3904	for (i = 0; i < sc->vtnet_act_vq_pairs; i++)
3905		vtnet_txq_enable_intr(&sc->vtnet_txqs[i]);
3906}
3907
3908static void
3909vtnet_enable_interrupts(struct vtnet_softc *sc)
3910{
3911
3912	vtnet_enable_rx_interrupts(sc);
3913	vtnet_enable_tx_interrupts(sc);
3914}
3915
3916static void
3917vtnet_disable_rx_interrupts(struct vtnet_softc *sc)
3918{
3919	int i;
3920
3921	for (i = 0; i < sc->vtnet_act_vq_pairs; i++)
3922		vtnet_rxq_disable_intr(&sc->vtnet_rxqs[i]);
3923}
3924
3925static void
3926vtnet_disable_tx_interrupts(struct vtnet_softc *sc)
3927{
3928	int i;
3929
3930	for (i = 0; i < sc->vtnet_act_vq_pairs; i++)
3931		vtnet_txq_disable_intr(&sc->vtnet_txqs[i]);
3932}
3933
3934static void
3935vtnet_disable_interrupts(struct vtnet_softc *sc)
3936{
3937
3938	vtnet_disable_rx_interrupts(sc);
3939	vtnet_disable_tx_interrupts(sc);
3940}
3941
3942static int
3943vtnet_tunable_int(struct vtnet_softc *sc, const char *knob, int def)
3944{
3945	char path[64];
3946
3947	snprintf(path, sizeof(path),
3948	    "hw.vtnet.%d.%s", device_get_unit(sc->vtnet_dev), knob);
3949	TUNABLE_INT_FETCH(path, &def);
3950
3951	return (def);
3952}
3953