1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2004-2006 Kip Macy
5 * Copyright (c) 2015 Wei Liu <wei.liu2@citrix.com>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include <sys/cdefs.h>
31#include "opt_inet.h"
32#include "opt_inet6.h"
33
34#include <sys/param.h>
35#include <sys/sockio.h>
36#include <sys/limits.h>
37#include <sys/mbuf.h>
38#include <sys/malloc.h>
39#include <sys/module.h>
40#include <sys/kernel.h>
41#include <sys/socket.h>
42#include <sys/sysctl.h>
43#include <sys/taskqueue.h>
44
45#include <net/if.h>
46#include <net/if_var.h>
47#include <net/if_arp.h>
48#include <net/ethernet.h>
49#include <net/if_media.h>
50#include <net/bpf.h>
51#include <net/if_types.h>
52
53#include <netinet/in.h>
54#include <netinet/ip.h>
55#include <netinet/if_ether.h>
56#include <netinet/tcp.h>
57#include <netinet/tcp_lro.h>
58
59#include <vm/vm.h>
60#include <vm/pmap.h>
61
62#include <sys/bus.h>
63
64#include <xen/xen-os.h>
65#include <xen/hypervisor.h>
66#include <xen/xen_intr.h>
67#include <xen/gnttab.h>
68#include <contrib/xen/memory.h>
69#include <contrib/xen/io/netif.h>
70#include <xen/xenbus/xenbusvar.h>
71
72#include <machine/bus.h>
73
74#include "xenbus_if.h"
75
/* Features supported by all backends.  TSO and LRO can be negotiated */
#define XN_CSUM_FEATURES	(CSUM_TCP | CSUM_UDP)

/*
 * Number of request/response slots in the TX and RX rings.  Each shared
 * ring is a single page (see the PAGE_SIZE allocations in setup_rxqs()
 * and setup_txqs()).
 */
#define NET_TX_RING_SIZE __CONST_RING_SIZE(netif_tx, PAGE_SIZE)
#define NET_RX_RING_SIZE __CONST_RING_SIZE(netif_rx, PAGE_SIZE)

/*
 * Minimum number of outstanding RX requests.  xn_alloc_rx_buffers()
 * schedules a retry instead of pushing requests when it cannot reach it.
 */
#define NET_RX_SLOTS_MIN (XEN_NETIF_NR_SLOTS_MIN + 1)

/*
 * Should the driver do LRO on the RX end
 *  this can be toggled on the fly, but the
 *  interface must be reset (down/up) for it
 *  to take effect.
 */
static int xn_enable_lro = 1;
TUNABLE_INT("hw.xn.enable_lro", &xn_enable_lro);

/*
 * Number of pairs of queues.
 *
 * This is the number the frontend asks for; talk_to_backend() lowers it
 * to the backend's "multi-queue-max-queues" value when that is smaller.
 */
static unsigned long xn_num_queues = 4;
TUNABLE_ULONG("hw.xn.num_queues", &xn_num_queues);

/**
 * \brief The maximum allowed data fragments in a single transmit
 *        request.
 *
 * This limit is imposed by the backend driver.  We assume here that
 * we are dealing with a Linux driver domain and have set our limit
 * to mirror the Linux MAX_SKB_FRAGS constant.
 */
#define	MAX_TX_REQ_FRAGS (65536 / PAGE_SIZE + 2)

/* NOTE(review): no user of this threshold is visible in this part of the file. */
#define RX_COPY_THRESHOLD 256

/* Always evaluates to 0, so anything guarded by net_ratelimit() is disabled. */
#define net_ratelimit() 0
112
/* Forward declarations of the driver's private types. */
struct netfront_rxq;
struct netfront_txq;
struct netfront_info;
struct netfront_rx_info;

/* Ring completion processing and RX buffer refill. */
static void xn_txeof(struct netfront_txq *);
static void xn_rxeof(struct netfront_rxq *);
static void xn_alloc_rx_buffers(struct netfront_rxq *);
static void xn_alloc_rx_buffers_callout(void *arg);

/* Release of buffers still posted on the rings. */
static void xn_release_rx_bufs(struct netfront_rxq *);
static void xn_release_tx_bufs(struct netfront_txq *);

/* Interrupt handling, interface entry points and teardown. */
static void xn_rxq_intr(struct netfront_rxq *);
static void xn_txq_intr(struct netfront_txq *);
static void xn_intr(void *);
static int xn_assemble_tx_request(struct netfront_txq *, struct mbuf *);
static int xn_ioctl(if_t, u_long, caddr_t);
static void xn_ifinit_locked(struct netfront_info *);
static void xn_ifinit(void *);
static void xn_stop(struct netfront_info *);
static void xn_query_features(struct netfront_info *np);
static int xn_configure_features(struct netfront_info *np);
static void netif_free(struct netfront_info *info);
static int netfront_detach(device_t dev);

/* Multi-queue transmit path. */
static int xn_txq_mq_start_locked(struct netfront_txq *, struct mbuf *);
static int xn_txq_mq_start(if_t, struct mbuf *);

/* Backend negotiation and device setup. */
static int talk_to_backend(device_t dev, struct netfront_info *info);
static int create_netdev(device_t dev);
static void netif_disconnect_backend(struct netfront_info *info);
static int setup_device(device_t dev, struct netfront_info *info,
    unsigned long);
static int xn_ifmedia_upd(if_t ifp);
static void xn_ifmedia_sts(if_t ifp, struct ifmediareq *ifmr);

static int xn_connect(struct netfront_info *);
static void xn_kick_rings(struct netfront_info *);

static int xn_get_responses(struct netfront_rxq *,
    struct netfront_rx_info *, RING_IDX, RING_IDX *,
    struct mbuf **);
156
/*
 * Frame number of the physical page backing kernel virtual address x.
 * The result is what setup_rxqs()/setup_txqs() hand to xenbus_grant_ring().
 */
#define virt_to_mfn(x) (vtophys(x) >> PAGE_SHIFT)

/* All-ones sentinel.  NOTE(review): no user is visible in this part of the file. */
#define INVALID_P2M_ENTRY (~0UL)
/* Size of the per-queue name buffer that is also used as the mutex name. */
#define XN_QUEUE_NAME_LEN  8	/* xn{t,r}x_%u, allow for two digits */
/*
 * Per-queue receive state.  One instance exists for every queue pair and
 * is initialised by setup_rxqs().
 */
struct netfront_rxq {
	/* Owning device softc. */
	struct netfront_info 	*info;
	/* Queue index, also used in the "queue-%u" XenStore path. */
	u_int			id;
	/* "xnrx_%u"; passed to mtx_init() as the lock name. */
	char			name[XN_QUEUE_NAME_LEN];
	/* Taken through XN_RX_LOCK()/XN_RX_UNLOCK(). */
	struct mtx		lock;

	/* Grant reference of the shared ring page, or GRANT_REF_INVALID. */
	int			ring_ref;
	netif_rx_front_ring_t 	ring;
	/* Copy of the matching TX queue's handle (no split event channels). */
	xen_intr_handle_t	xen_intr_handle;

	/* Head of the grant references reserved for this queue. */
	grant_ref_t 		gref_head;
	/* Grant reference per ring slot; GRANT_REF_INVALID when unused. */
	grant_ref_t 		grant_ref[NET_RX_RING_SIZE + 1];

	/* mbuf posted in each ring slot; NULL when the slot is empty. */
	struct mbuf		*mbufs[NET_RX_RING_SIZE + 1];

	/* NOTE(review): presumably LRO state; its users are outside this chunk. */
	struct lro_ctrl		lro;

	/* Retries xn_alloc_rx_buffers() when too few buffers could be posted. */
	struct callout		rx_refill;
};
180
/*
 * Per-queue transmit state.  One instance exists for every queue pair and
 * is initialised by setup_txqs().
 */
struct netfront_txq {
	/* Owning device softc. */
	struct netfront_info 	*info;
	/* Queue index. */
	u_int 			id;
	/* "xntx_%u"; used as the lock name and the taskqueue name. */
	char			name[XN_QUEUE_NAME_LEN];
	/* Taken through XN_TX_LOCK()/XN_TX_TRYLOCK()/XN_TX_UNLOCK(). */
	struct mtx		lock;

	/* Grant reference of the shared ring page, or GRANT_REF_INVALID. */
	int			ring_ref;
	netif_tx_front_ring_t	ring;
	/* Event channel handle bound in setup_txqs(); shared with the RX queue. */
	xen_intr_handle_t 	xen_intr_handle;

	/* Head of the grant references reserved for this queue. */
	grant_ref_t		gref_head;
	/* Grant reference per slot; GRANT_REF_INVALID when unused. */
	grant_ref_t		grant_ref[NET_TX_RING_SIZE + 1];

	/*
	 * In-flight mbufs, doubling as a free list: entry 0 is the list
	 * head and any entry whose value is <= NET_TX_RING_SIZE is a
	 * free-list index rather than an mbuf pointer.
	 */
	struct mbuf		*mbufs[NET_TX_RING_SIZE + 1];
	/* Number of slots in mbufs[] that currently hold a real mbuf. */
	int			mbufs_cnt;
	/* Software queue drained by xn_txq_start(). */
	struct buf_ring		*br;

	/* Taskqueue and task that run xn_txq_tq_deferred(). */
	struct taskqueue 	*tq;
	struct task       	defrtask;

	/* NOTE(review): presumably scratch space for DMA loads; users are outside this chunk. */
	bus_dma_segment_t	segs[MAX_TX_REQ_FRAGS];
	/* mbuf tag carrying a reference count and the DMA map of one packet. */
	struct mbuf_xennet {
		struct m_tag 	tag;
		bus_dma_tag_t	dma_tag;
		bus_dmamap_t	dma_map;
		struct netfront_txq *txq;
		SLIST_ENTRY(mbuf_xennet) next;
		u_int 		count;
	}			xennet_tag[NET_TX_RING_SIZE + 1];
	/* Free list of xennet_tag[] entries; tag_free() returns tags to it. */
	SLIST_HEAD(, mbuf_xennet) tags;

	/* NOTE(review): not referenced in this chunk; looks like a ring-full flag. */
	bool			full;
};
214
/* Per-device softc. */
struct netfront_info {
	if_t			xn_ifp;

	/* Taken through XN_LOCK()/XN_UNLOCK(). */
	struct mtx   		sc_lock;

	/* Number of valid entries in rxq[] and txq[]; 0 while none are set up. */
	u_int  num_queues;
	struct netfront_rxq 	*rxq;
	struct netfront_txq 	*txq;

	/* 1 or 0; changed through netfront_carrier_on()/netfront_carrier_off(). */
	u_int			carrier;
	/* NOTE(review): not referenced in this chunk. */
	u_int			maxfrags;

	device_t		xbdev;
	/* Filled in by xen_net_read_mac(). */
	uint8_t			mac[ETHER_ADDR_LEN];

	int			xn_if_flags;

	struct ifmedia		sc_media;

	/* DMA tag used for the per-packet maps created in setup_txqs(). */
	bus_dma_tag_t		dma_tag;

	/*
	 * When set, netfront_backend_changed() reconnects once the backend
	 * reaches XenbusStateClosed, then clears the flag.
	 */
	bool			xn_reset;
};
238
/*
 * One RX response together with storage for its extra-info segments;
 * this is the type xn_get_responses() takes.
 */
struct netfront_rx_info {
	struct netif_rx_response rx;
	struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
};
243
/*
 * Locking helpers.  None of them carries a trailing semicolon, so each
 * expands to a single expression and is safe in unbraced if/else bodies.
 */

/* Per-queue receive lock. */
#define XN_RX_LOCK(_q)         mtx_lock(&(_q)->lock)
#define XN_RX_UNLOCK(_q)       mtx_unlock(&(_q)->lock)

/* Per-queue transmit lock. */
#define XN_TX_LOCK(_q)         mtx_lock(&(_q)->lock)
#define XN_TX_TRYLOCK(_q)      mtx_trylock(&(_q)->lock)
#define XN_TX_UNLOCK(_q)       mtx_unlock(&(_q)->lock)

/* Device-wide softc lock. */
#define XN_LOCK(_sc)           mtx_lock(&(_sc)->sc_lock)
#define XN_UNLOCK(_sc)         mtx_unlock(&(_sc)->sc_lock)

/* Assert that the calling thread owns the corresponding lock. */
#define XN_LOCK_ASSERT(_sc)    mtx_assert(&(_sc)->sc_lock, MA_OWNED)
#define XN_RX_LOCK_ASSERT(_q)  mtx_assert(&(_q)->lock, MA_OWNED)
#define XN_TX_LOCK_ASSERT(_q)  mtx_assert(&(_q)->lock, MA_OWNED)

/* Carrier state kept in the softc: 1 when the link is usable, else 0. */
#define netfront_carrier_on(netif)	((netif)->carrier = 1)
#define netfront_carrier_off(netif)	((netif)->carrier = 0)
#define netfront_carrier_ok(netif)	((netif)->carrier)
261
/* Helpers for acquiring and freeing slots on the free list threaded through an mbufs[] array. */
263
264static inline void
265add_id_to_freelist(struct mbuf **list, uintptr_t id)
266{
267
268	KASSERT(id != 0,
269		("%s: the head item (0) must always be free.", __func__));
270	list[id] = list[0];
271	list[0]  = (struct mbuf *)id;
272}
273
274static inline unsigned short
275get_id_from_freelist(struct mbuf **list)
276{
277	uintptr_t id;
278
279	id = (uintptr_t)list[0];
280	KASSERT(id != 0,
281		("%s: the head item (0) must always remain free.", __func__));
282	list[0] = list[id];
283	return (id);
284}
285
286static inline int
287xn_rxidx(RING_IDX idx)
288{
289
290	return idx & (NET_RX_RING_SIZE - 1);
291}
292
293static inline struct mbuf *
294xn_get_rx_mbuf(struct netfront_rxq *rxq, RING_IDX ri)
295{
296	int i;
297	struct mbuf *m;
298
299	i = xn_rxidx(ri);
300	m = rxq->mbufs[i];
301	rxq->mbufs[i] = NULL;
302	return (m);
303}
304
305static inline grant_ref_t
306xn_get_rx_ref(struct netfront_rxq *rxq, RING_IDX ri)
307{
308	int i = xn_rxidx(ri);
309	grant_ref_t ref = rxq->grant_ref[i];
310
311	KASSERT(ref != GRANT_REF_INVALID, ("Invalid grant reference!\n"));
312	rxq->grant_ref[i] = GRANT_REF_INVALID;
313	return (ref);
314}
315
/*
 * Cookie and type under which the driver's struct mbuf_xennet tag is
 * registered (m_tag_setup() in setup_txqs()) and looked up
 * (m_tag_locate() in mbuf_grab()/mbuf_release()).
 */
#define MTAG_COOKIE 1218492000
#define MTAG_XENNET 0
318
319static void mbuf_grab(struct mbuf *m)
320{
321	struct mbuf_xennet *ref;
322
323	ref = (struct mbuf_xennet *)m_tag_locate(m, MTAG_COOKIE,
324	    MTAG_XENNET, NULL);
325	KASSERT(ref != NULL, ("Cannot find refcount"));
326	ref->count++;
327}
328
329static void mbuf_release(struct mbuf *m)
330{
331	struct mbuf_xennet *ref;
332
333	ref = (struct mbuf_xennet *)m_tag_locate(m, MTAG_COOKIE,
334	    MTAG_XENNET, NULL);
335	KASSERT(ref != NULL, ("Cannot find refcount"));
336	KASSERT(ref->count > 0, ("Invalid reference count"));
337
338	if (--ref->count == 0)
339		m_freem(m);
340}
341
342static void tag_free(struct m_tag *t)
343{
344	struct mbuf_xennet *ref = (struct mbuf_xennet *)t;
345
346	KASSERT(ref->count == 0, ("Free mbuf tag with pending refcnt"));
347	bus_dmamap_sync(ref->dma_tag, ref->dma_map, BUS_DMASYNC_POSTWRITE);
348	bus_dmamap_destroy(ref->dma_tag, ref->dma_map);
349	SLIST_INSERT_HEAD(&ref->txq->tags, ref, next);
350}
351
/* Informational message, always compiled in; prefixed with "[XEN] ". */
#define IPRINTK(fmt, args...) \
    printf("[XEN] " fmt, ##args)
/* Warning message; expands to nothing unless INVARIANTS is defined. */
#ifdef INVARIANTS
#define WPRINTK(fmt, args...) \
    printf("[XEN] " fmt, ##args)
#else
#define WPRINTK(fmt, args...)
#endif
/* Debug message with the calling function's name; needs DEBUG. */
#ifdef DEBUG
#define DPRINTK(fmt, args...) \
    printf("[XEN] %s: " fmt, __func__, ##args)
#else
#define DPRINTK(fmt, args...)
#endif
366
367/**
368 * Read the 'mac' node at the given device's node in the store, and parse that
369 * as colon-separated octets, placing result the given mac array.  mac must be
370 * a preallocated array of length ETH_ALEN (as declared in linux/if_ether.h).
371 * Return 0 on success, or errno on error.
372 */
373static int
374xen_net_read_mac(device_t dev, uint8_t mac[])
375{
376	int error, i;
377	char *s, *e, *macstr;
378	const char *path;
379
380	path = xenbus_get_node(dev);
381	error = xs_read(XST_NIL, path, "mac", NULL, (void **) &macstr);
382	if (error == ENOENT) {
383		/*
384		 * Deal with missing mac XenStore nodes on devices with
385		 * HVM emulation (the 'ioemu' configuration attribute)
386		 * enabled.
387		 *
388		 * The HVM emulator may execute in a stub device model
389		 * domain which lacks the permission, only given to Dom0,
390		 * to update the guest's XenStore tree.  For this reason,
391		 * the HVM emulator doesn't even attempt to write the
392		 * front-side mac node, even when operating in Dom0.
393		 * However, there should always be a mac listed in the
394		 * backend tree.  Fallback to this version if our query
395		 * of the front side XenStore location doesn't find
396		 * anything.
397		 */
398		path = xenbus_get_otherend_path(dev);
399		error = xs_read(XST_NIL, path, "mac", NULL, (void **) &macstr);
400	}
401	if (error != 0) {
402		xenbus_dev_fatal(dev, error, "parsing %s/mac", path);
403		return (error);
404	}
405
406	s = macstr;
407	for (i = 0; i < ETHER_ADDR_LEN; i++) {
408		mac[i] = strtoul(s, &e, 16);
409		if (s == e || (e[0] != ':' && e[0] != 0)) {
410			free(macstr, M_XENBUS);
411			return (ENOENT);
412		}
413		s = &e[1];
414	}
415	free(macstr, M_XENBUS);
416	return (0);
417}
418
419/**
420 * Entry point to this code when a new device is created.  Allocate the basic
421 * structures and the ring buffers for communication with the backend, and
422 * inform the backend of the appropriate details for those.  Switch to
423 * Connected state.
424 */
425static int
426netfront_probe(device_t dev)
427{
428
429	if (xen_pv_nics_disabled())
430		return (ENXIO);
431
432	if (!strcmp(xenbus_get_type(dev), "vif")) {
433		device_set_desc(dev, "Virtual Network Interface");
434		return (0);
435	}
436
437	return (ENXIO);
438}
439
440static int
441netfront_attach(device_t dev)
442{
443	int err;
444
445	err = create_netdev(dev);
446	if (err != 0) {
447		xenbus_dev_fatal(dev, err, "creating netdev");
448		return (err);
449	}
450
451	SYSCTL_ADD_INT(device_get_sysctl_ctx(dev),
452	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
453	    OID_AUTO, "enable_lro", CTLFLAG_RW,
454	    &xn_enable_lro, 0, "Large Receive Offload");
455
456	SYSCTL_ADD_ULONG(device_get_sysctl_ctx(dev),
457	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
458	    OID_AUTO, "num_queues", CTLFLAG_RD,
459	    &xn_num_queues, "Number of pairs of queues");
460
461	return (0);
462}
463
464static int
465netfront_suspend(device_t dev)
466{
467	struct netfront_info *np = device_get_softc(dev);
468	u_int i;
469
470	for (i = 0; i < np->num_queues; i++) {
471		XN_RX_LOCK(&np->rxq[i]);
472		XN_TX_LOCK(&np->txq[i]);
473	}
474	netfront_carrier_off(np);
475	for (i = 0; i < np->num_queues; i++) {
476		XN_RX_UNLOCK(&np->rxq[i]);
477		XN_TX_UNLOCK(&np->txq[i]);
478	}
479	return (0);
480}
481
482/**
483 * We are reconnecting to the backend, due to a suspend/resume, or a backend
484 * driver restart.  We tear down our netif structure and recreate it, but
485 * leave the device-layer structures intact so that this is transparent to the
486 * rest of the kernel.
487 */
488static int
489netfront_resume(device_t dev)
490{
491	struct netfront_info *info = device_get_softc(dev);
492	u_int i;
493
494	if (xen_suspend_cancelled) {
495		for (i = 0; i < info->num_queues; i++) {
496			XN_RX_LOCK(&info->rxq[i]);
497			XN_TX_LOCK(&info->txq[i]);
498		}
499		netfront_carrier_on(info);
500		for (i = 0; i < info->num_queues; i++) {
501			XN_RX_UNLOCK(&info->rxq[i]);
502			XN_TX_UNLOCK(&info->txq[i]);
503		}
504		return (0);
505	}
506
507	netif_disconnect_backend(info);
508	return (0);
509}
510
511static int
512write_queue_xenstore_keys(device_t dev,
513    struct netfront_rxq *rxq,
514    struct netfront_txq *txq,
515    struct xs_transaction *xst, bool hierarchy)
516{
517	int err;
518	const char *message;
519	const char *node = xenbus_get_node(dev);
520	char *path;
521	size_t path_size;
522
523	KASSERT(rxq->id == txq->id, ("Mismatch between RX and TX queue ids"));
524	/* Split event channel support is not yet there. */
525	KASSERT(rxq->xen_intr_handle == txq->xen_intr_handle,
526	    ("Split event channels are not supported"));
527
528	if (hierarchy) {
529		path_size = strlen(node) + 10;
530		path = malloc(path_size, M_DEVBUF, M_WAITOK|M_ZERO);
531		snprintf(path, path_size, "%s/queue-%u", node, rxq->id);
532	} else {
533		path_size = strlen(node) + 1;
534		path = malloc(path_size, M_DEVBUF, M_WAITOK|M_ZERO);
535		snprintf(path, path_size, "%s", node);
536	}
537
538	err = xs_printf(*xst, path, "tx-ring-ref","%u", txq->ring_ref);
539	if (err != 0) {
540		message = "writing tx ring-ref";
541		goto error;
542	}
543	err = xs_printf(*xst, path, "rx-ring-ref","%u", rxq->ring_ref);
544	if (err != 0) {
545		message = "writing rx ring-ref";
546		goto error;
547	}
548	err = xs_printf(*xst, path, "event-channel", "%u",
549	    xen_intr_port(rxq->xen_intr_handle));
550	if (err != 0) {
551		message = "writing event-channel";
552		goto error;
553	}
554
555	free(path, M_DEVBUF);
556
557	return (0);
558
559error:
560	free(path, M_DEVBUF);
561	xenbus_dev_fatal(dev, err, "%s", message);
562
563	return (err);
564}
565
566/* Common code used when first setting up, and when resuming. */
567static int
568talk_to_backend(device_t dev, struct netfront_info *info)
569{
570	const char *message;
571	struct xs_transaction xst;
572	const char *node = xenbus_get_node(dev);
573	int err;
574	unsigned long num_queues, max_queues = 0;
575	unsigned int i;
576
577	err = xen_net_read_mac(dev, info->mac);
578	if (err != 0) {
579		xenbus_dev_fatal(dev, err, "parsing %s/mac", node);
580		goto out;
581	}
582
583	err = xs_scanf(XST_NIL, xenbus_get_otherend_path(info->xbdev),
584	    "multi-queue-max-queues", NULL, "%lu", &max_queues);
585	if (err != 0)
586		max_queues = 1;
587	num_queues = xn_num_queues;
588	if (num_queues > max_queues)
589		num_queues = max_queues;
590
591	err = setup_device(dev, info, num_queues);
592	if (err != 0) {
593		xenbus_dev_fatal(dev, err, "setup device");
594		goto out;
595	}
596
597 again:
598	err = xs_transaction_start(&xst);
599	if (err != 0) {
600		xenbus_dev_fatal(dev, err, "starting transaction");
601		goto free;
602	}
603
604	if (info->num_queues == 1) {
605		err = write_queue_xenstore_keys(dev, &info->rxq[0],
606		    &info->txq[0], &xst, false);
607		if (err != 0)
608			goto abort_transaction_no_def_error;
609	} else {
610		err = xs_printf(xst, node, "multi-queue-num-queues",
611		    "%u", info->num_queues);
612		if (err != 0) {
613			message = "writing multi-queue-num-queues";
614			goto abort_transaction;
615		}
616
617		for (i = 0; i < info->num_queues; i++) {
618			err = write_queue_xenstore_keys(dev, &info->rxq[i],
619			    &info->txq[i], &xst, true);
620			if (err != 0)
621				goto abort_transaction_no_def_error;
622		}
623	}
624
625	err = xs_printf(xst, node, "request-rx-copy", "%u", 1);
626	if (err != 0) {
627		message = "writing request-rx-copy";
628		goto abort_transaction;
629	}
630	err = xs_printf(xst, node, "feature-rx-notify", "%d", 1);
631	if (err != 0) {
632		message = "writing feature-rx-notify";
633		goto abort_transaction;
634	}
635	err = xs_printf(xst, node, "feature-sg", "%d", 1);
636	if (err != 0) {
637		message = "writing feature-sg";
638		goto abort_transaction;
639	}
640	if ((if_getcapenable(info->xn_ifp) & IFCAP_LRO) != 0) {
641		err = xs_printf(xst, node, "feature-gso-tcpv4", "%d", 1);
642		if (err != 0) {
643			message = "writing feature-gso-tcpv4";
644			goto abort_transaction;
645		}
646	}
647	if ((if_getcapenable(info->xn_ifp) & IFCAP_RXCSUM) == 0) {
648		err = xs_printf(xst, node, "feature-no-csum-offload", "%d", 1);
649		if (err != 0) {
650			message = "writing feature-no-csum-offload";
651			goto abort_transaction;
652		}
653	}
654
655	err = xs_transaction_end(xst, 0);
656	if (err != 0) {
657		if (err == EAGAIN)
658			goto again;
659		xenbus_dev_fatal(dev, err, "completing transaction");
660		goto free;
661	}
662
663	return 0;
664
665 abort_transaction:
666	xenbus_dev_fatal(dev, err, "%s", message);
667 abort_transaction_no_def_error:
668	xs_transaction_end(xst, 1);
669 free:
670	netif_free(info);
671 out:
672	return (err);
673}
674
/* Process received packets on one queue with its RX lock held. */
static void
xn_rxq_intr(struct netfront_rxq *rxq)
{

	XN_RX_LOCK(rxq);
	xn_rxeof(rxq);
	XN_RX_UNLOCK(rxq);
}
683
684static void
685xn_txq_start(struct netfront_txq *txq)
686{
687	struct netfront_info *np = txq->info;
688	if_t ifp = np->xn_ifp;
689
690	XN_TX_LOCK_ASSERT(txq);
691	if (!drbr_empty(ifp, txq->br))
692		xn_txq_mq_start_locked(txq, NULL);
693}
694
695static void
696xn_txq_intr(struct netfront_txq *txq)
697{
698
699	XN_TX_LOCK(txq);
700	if (RING_HAS_UNCONSUMED_RESPONSES(&txq->ring))
701		xn_txeof(txq);
702	xn_txq_start(txq);
703	XN_TX_UNLOCK(txq);
704}
705
/*
 * Taskqueue handler registered in setup_txqs(): restarts transmission on
 * the queue passed as context.  The `pending' count is not used.
 */
static void
xn_txq_tq_deferred(void *xtxq, int pending)
{
	struct netfront_txq *queue;

	queue = xtxq;
	XN_TX_LOCK(queue);
	xn_txq_start(queue);
	XN_TX_UNLOCK(queue);
}
715
716static void
717disconnect_rxq(struct netfront_rxq *rxq)
718{
719
720	xn_release_rx_bufs(rxq);
721	gnttab_free_grant_references(rxq->gref_head);
722	if (rxq->ring_ref != GRANT_REF_INVALID) {
723		gnttab_end_foreign_access(rxq->ring_ref, NULL);
724		rxq->ring_ref = GRANT_REF_INVALID;
725	}
726	/*
727	 * No split event channel support at the moment, handle will
728	 * be unbound in tx. So no need to call xen_intr_unbind here,
729	 * but we do want to reset the handler to 0.
730	 */
731	rxq->xen_intr_handle = 0;
732}
733
734static void
735destroy_rxq(struct netfront_rxq *rxq)
736{
737
738	callout_drain(&rxq->rx_refill);
739	free(rxq->ring.sring, M_DEVBUF);
740	rxq->ring.sring = NULL;
741}
742
743static void
744destroy_rxqs(struct netfront_info *np)
745{
746	int i;
747
748	for (i = 0; i < np->num_queues; i++)
749		destroy_rxq(&np->rxq[i]);
750
751	free(np->rxq, M_DEVBUF);
752	np->rxq = NULL;
753}
754
755static int
756setup_rxqs(device_t dev, struct netfront_info *info,
757	   unsigned long num_queues)
758{
759	int q, i;
760	int error;
761	netif_rx_sring_t *rxs;
762	struct netfront_rxq *rxq;
763
764	info->rxq = malloc(sizeof(struct netfront_rxq) * num_queues,
765	    M_DEVBUF, M_WAITOK|M_ZERO);
766
767	for (q = 0; q < num_queues; q++) {
768		rxq = &info->rxq[q];
769
770		rxq->id = q;
771		rxq->info = info;
772
773		rxq->gref_head = GNTTAB_LIST_END;
774		rxq->ring_ref = GRANT_REF_INVALID;
775		rxq->ring.sring = NULL;
776		snprintf(rxq->name, XN_QUEUE_NAME_LEN, "xnrx_%u", q);
777		mtx_init(&rxq->lock, rxq->name, "netfront receive lock",
778		    MTX_DEF);
779
780		for (i = 0; i <= NET_RX_RING_SIZE; i++) {
781			rxq->mbufs[i] = NULL;
782			rxq->grant_ref[i] = GRANT_REF_INVALID;
783		}
784
785		/* Start resources allocation */
786
787		if (gnttab_alloc_grant_references(NET_RX_RING_SIZE,
788		    &rxq->gref_head) != 0) {
789			device_printf(dev, "allocating rx gref");
790			error = ENOMEM;
791			goto fail;
792		}
793
794		rxs = (netif_rx_sring_t *)malloc(PAGE_SIZE, M_DEVBUF,
795		    M_WAITOK|M_ZERO);
796		SHARED_RING_INIT(rxs);
797		FRONT_RING_INIT(&rxq->ring, rxs, PAGE_SIZE);
798
799		error = xenbus_grant_ring(dev, virt_to_mfn(rxs),
800		    &rxq->ring_ref);
801		if (error != 0) {
802			device_printf(dev, "granting rx ring page");
803			goto fail_grant_ring;
804		}
805
806		callout_init(&rxq->rx_refill, 1);
807	}
808
809	return (0);
810
811fail_grant_ring:
812	gnttab_free_grant_references(rxq->gref_head);
813	free(rxq->ring.sring, M_DEVBUF);
814fail:
815	for (; q >= 0; q--) {
816		disconnect_rxq(&info->rxq[q]);
817		destroy_rxq(&info->rxq[q]);
818	}
819
820	free(info->rxq, M_DEVBUF);
821	return (error);
822}
823
824static void
825disconnect_txq(struct netfront_txq *txq)
826{
827
828	xn_release_tx_bufs(txq);
829	gnttab_free_grant_references(txq->gref_head);
830	if (txq->ring_ref != GRANT_REF_INVALID) {
831		gnttab_end_foreign_access(txq->ring_ref, NULL);
832		txq->ring_ref = GRANT_REF_INVALID;
833	}
834	xen_intr_unbind(&txq->xen_intr_handle);
835}
836
837static void
838destroy_txq(struct netfront_txq *txq)
839{
840	unsigned int i;
841
842	free(txq->ring.sring, M_DEVBUF);
843	txq->ring.sring = NULL;
844	buf_ring_free(txq->br, M_DEVBUF);
845	txq->br = NULL;
846	if (txq->tq) {
847		taskqueue_drain_all(txq->tq);
848		taskqueue_free(txq->tq);
849		txq->tq = NULL;
850	}
851
852	for (i = 0; i <= NET_TX_RING_SIZE; i++) {
853		bus_dmamap_destroy(txq->info->dma_tag,
854		    txq->xennet_tag[i].dma_map);
855		txq->xennet_tag[i].dma_map = NULL;
856	}
857}
858
859static void
860destroy_txqs(struct netfront_info *np)
861{
862	int i;
863
864	for (i = 0; i < np->num_queues; i++)
865		destroy_txq(&np->txq[i]);
866
867	free(np->txq, M_DEVBUF);
868	np->txq = NULL;
869}
870
871static int
872setup_txqs(device_t dev, struct netfront_info *info,
873	   unsigned long num_queues)
874{
875	int q, i;
876	int error;
877	netif_tx_sring_t *txs;
878	struct netfront_txq *txq;
879
880	info->txq = malloc(sizeof(struct netfront_txq) * num_queues,
881	    M_DEVBUF, M_WAITOK|M_ZERO);
882
883	for (q = 0; q < num_queues; q++) {
884		txq = &info->txq[q];
885
886		txq->id = q;
887		txq->info = info;
888
889		txq->gref_head = GNTTAB_LIST_END;
890		txq->ring_ref = GRANT_REF_INVALID;
891		txq->ring.sring = NULL;
892
893		snprintf(txq->name, XN_QUEUE_NAME_LEN, "xntx_%u", q);
894
895		mtx_init(&txq->lock, txq->name, "netfront transmit lock",
896		    MTX_DEF);
897		SLIST_INIT(&txq->tags);
898
899		for (i = 0; i <= NET_TX_RING_SIZE; i++) {
900			txq->mbufs[i] = (void *) ((u_long) i+1);
901			txq->grant_ref[i] = GRANT_REF_INVALID;
902			txq->xennet_tag[i].txq = txq;
903			txq->xennet_tag[i].dma_tag = info->dma_tag;
904			error = bus_dmamap_create(info->dma_tag, 0,
905			    &txq->xennet_tag[i].dma_map);
906			if (error != 0) {
907				device_printf(dev,
908				    "failed to allocate dma map\n");
909				goto fail;
910			}
911			m_tag_setup(&txq->xennet_tag[i].tag,
912			    MTAG_COOKIE, MTAG_XENNET,
913			    sizeof(txq->xennet_tag[i]) -
914			    sizeof(txq->xennet_tag[i].tag));
915			txq->xennet_tag[i].tag.m_tag_free = &tag_free;
916			SLIST_INSERT_HEAD(&txq->tags, &txq->xennet_tag[i],
917			    next);
918		}
919		txq->mbufs[NET_TX_RING_SIZE] = (void *)0;
920
921		/* Start resources allocation. */
922
923		if (gnttab_alloc_grant_references(NET_TX_RING_SIZE,
924		    &txq->gref_head) != 0) {
925			device_printf(dev, "failed to allocate tx grant refs\n");
926			error = ENOMEM;
927			goto fail;
928		}
929
930		txs = (netif_tx_sring_t *)malloc(PAGE_SIZE, M_DEVBUF,
931		    M_WAITOK|M_ZERO);
932		SHARED_RING_INIT(txs);
933		FRONT_RING_INIT(&txq->ring, txs, PAGE_SIZE);
934
935		error = xenbus_grant_ring(dev, virt_to_mfn(txs),
936		    &txq->ring_ref);
937		if (error != 0) {
938			device_printf(dev, "failed to grant tx ring\n");
939			goto fail_grant_ring;
940		}
941
942		txq->br = buf_ring_alloc(NET_TX_RING_SIZE, M_DEVBUF,
943		    M_WAITOK, &txq->lock);
944		TASK_INIT(&txq->defrtask, 0, xn_txq_tq_deferred, txq);
945
946		txq->tq = taskqueue_create(txq->name, M_WAITOK,
947		    taskqueue_thread_enqueue, &txq->tq);
948
949		error = taskqueue_start_threads(&txq->tq, 1, PI_NET,
950		    "%s txq %d", device_get_nameunit(dev), txq->id);
951		if (error != 0) {
952			device_printf(dev, "failed to start tx taskq %d\n",
953			    txq->id);
954			goto fail_start_thread;
955		}
956
957		error = xen_intr_alloc_and_bind_local_port(dev,
958		    xenbus_get_otherend_id(dev), /* filter */ NULL, xn_intr,
959		    &info->txq[q], INTR_TYPE_NET | INTR_MPSAFE | INTR_ENTROPY,
960		    &txq->xen_intr_handle);
961
962		if (error != 0) {
963			device_printf(dev, "xen_intr_alloc_and_bind_local_port failed\n");
964			goto fail_bind_port;
965		}
966	}
967
968	return (0);
969
970fail_bind_port:
971	taskqueue_drain_all(txq->tq);
972fail_start_thread:
973	buf_ring_free(txq->br, M_DEVBUF);
974	taskqueue_free(txq->tq);
975	gnttab_end_foreign_access(txq->ring_ref, NULL);
976fail_grant_ring:
977	gnttab_free_grant_references(txq->gref_head);
978	free(txq->ring.sring, M_DEVBUF);
979fail:
980	for (; q >= 0; q--) {
981		disconnect_txq(&info->txq[q]);
982		destroy_txq(&info->txq[q]);
983	}
984
985	free(info->txq, M_DEVBUF);
986	return (error);
987}
988
989static int
990setup_device(device_t dev, struct netfront_info *info,
991    unsigned long num_queues)
992{
993	int error;
994	int q;
995
996	if (info->txq)
997		destroy_txqs(info);
998
999	if (info->rxq)
1000		destroy_rxqs(info);
1001
1002	info->num_queues = 0;
1003
1004	error = setup_rxqs(dev, info, num_queues);
1005	if (error != 0)
1006		goto out;
1007	error = setup_txqs(dev, info, num_queues);
1008	if (error != 0)
1009		goto out;
1010
1011	info->num_queues = num_queues;
1012
1013	/* No split event channel at the moment. */
1014	for (q = 0; q < num_queues; q++)
1015		info->rxq[q].xen_intr_handle = info->txq[q].xen_intr_handle;
1016
1017	return (0);
1018
1019out:
1020	KASSERT(error != 0, ("Error path taken without providing an error code"));
1021	return (error);
1022}
1023
1024#ifdef INET
1025static u_int
1026netfront_addr_cb(void *arg, struct ifaddr *a, u_int count)
1027{
1028	arp_ifinit((if_t)arg, a);
1029	return (1);
1030}
1031/**
1032 * If this interface has an ipv4 address, send an arp for it. This
1033 * helps to get the network going again after migrating hosts.
1034 */
1035static void
1036netfront_send_fake_arp(device_t dev, struct netfront_info *info)
1037{
1038	if_t ifp;
1039
1040	ifp = info->xn_ifp;
1041	if_foreach_addr_type(ifp, AF_INET, netfront_addr_cb, ifp);
1042}
1043#endif
1044
1045/**
1046 * Callback received when the backend's state changes.
1047 */
1048static void
1049netfront_backend_changed(device_t dev, XenbusState newstate)
1050{
1051	struct netfront_info *sc = device_get_softc(dev);
1052
1053	DPRINTK("newstate=%d\n", newstate);
1054
1055	CURVNET_SET(if_getvnet(sc->xn_ifp));
1056
1057	switch (newstate) {
1058	case XenbusStateInitialising:
1059	case XenbusStateInitialised:
1060	case XenbusStateUnknown:
1061	case XenbusStateReconfigured:
1062	case XenbusStateReconfiguring:
1063		break;
1064	case XenbusStateInitWait:
1065		if (xenbus_get_state(dev) != XenbusStateInitialising)
1066			break;
1067		if (xn_connect(sc) != 0)
1068			break;
1069		/* Switch to connected state before kicking the rings. */
1070		xenbus_set_state(sc->xbdev, XenbusStateConnected);
1071		xn_kick_rings(sc);
1072		break;
1073	case XenbusStateClosing:
1074		xenbus_set_state(dev, XenbusStateClosed);
1075		break;
1076	case XenbusStateClosed:
1077		if (sc->xn_reset) {
1078			netif_disconnect_backend(sc);
1079			xenbus_set_state(dev, XenbusStateInitialising);
1080			sc->xn_reset = false;
1081		}
1082		break;
1083	case XenbusStateConnected:
1084#ifdef INET
1085		netfront_send_fake_arp(dev, sc);
1086#endif
1087		break;
1088	}
1089
1090	CURVNET_RESTORE();
1091}
1092
1093/**
1094 * \brief Verify that there is sufficient space in the Tx ring
1095 *        buffer for a maximally sized request to be enqueued.
1096 *
1097 * A transmit request requires a transmit descriptor for each packet
1098 * fragment, plus up to 2 entries for "options" (e.g. TSO).
1099 */
1100static inline int
1101xn_tx_slot_available(struct netfront_txq *txq)
1102{
1103
1104	return (RING_FREE_REQUESTS(&txq->ring) > (MAX_TX_REQ_FRAGS + 2));
1105}
1106
1107static void
1108xn_release_tx_bufs(struct netfront_txq *txq)
1109{
1110	int i;
1111
1112	for (i = 1; i <= NET_TX_RING_SIZE; i++) {
1113		struct mbuf *m;
1114
1115		m = txq->mbufs[i];
1116
1117		/*
1118		 * We assume that no kernel addresses are
1119		 * less than NET_TX_RING_SIZE.  Any entry
1120		 * in the table that is below this number
1121		 * must be an index from free-list tracking.
1122		 */
1123		if (((uintptr_t)m) <= NET_TX_RING_SIZE)
1124			continue;
1125		gnttab_end_foreign_access_ref(txq->grant_ref[i]);
1126		gnttab_release_grant_reference(&txq->gref_head,
1127		    txq->grant_ref[i]);
1128		txq->grant_ref[i] = GRANT_REF_INVALID;
1129		add_id_to_freelist(txq->mbufs, i);
1130		txq->mbufs_cnt--;
1131		if (txq->mbufs_cnt < 0) {
1132			panic("%s: tx_chain_cnt must be >= 0", __func__);
1133		}
1134		mbuf_release(m);
1135	}
1136}
1137
1138static struct mbuf *
1139xn_alloc_one_rx_buffer(struct netfront_rxq *rxq)
1140{
1141	struct mbuf *m;
1142
1143	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, MJUMPAGESIZE);
1144	if (m == NULL)
1145		return NULL;
1146	m->m_len = m->m_pkthdr.len = MJUMPAGESIZE;
1147
1148	return (m);
1149}
1150
/*
 * Refill the receive ring with freshly allocated, granted buffers.
 *
 * Starting at the private request producer, one mbuf is allocated per
 * free ring slot.  The mbuf and a claimed grant reference are recorded in
 * the per-queue tables, the other end is granted access to the buffer's
 * page frame, and the ring request is filled in.  The loop stops early
 * if an mbuf cannot be allocated.
 *
 * If fewer than NET_RX_SLOTS_MIN requests are outstanding afterwards,
 * nothing is pushed and a callout retries the refill after hz/10 ticks.
 * Otherwise the requests are pushed and the backend is signalled when the
 * ring macro reports that a notification is required.
 *
 * The Rx queue lock must be held.  Nothing is done while the carrier is
 * off.
 */
static void
xn_alloc_rx_buffers(struct netfront_rxq *rxq)
{
	RING_IDX req_prod;
	int notify;

	XN_RX_LOCK_ASSERT(rxq);

	if (__predict_false(rxq->info->carrier == 0))
		return;

	/* Post buffers until NET_RX_RING_SIZE requests are outstanding. */
	for (req_prod = rxq->ring.req_prod_pvt;
	     req_prod - rxq->ring.rsp_cons < NET_RX_RING_SIZE;
	     req_prod++) {
		struct mbuf *m;
		unsigned short id;
		grant_ref_t ref;
		struct netif_rx_request *req;
		unsigned long pfn;

		m = xn_alloc_one_rx_buffer(rxq);
		if (m == NULL)
			break;

		id = xn_rxidx(req_prod);

		KASSERT(rxq->mbufs[id] == NULL, ("non-NULL xn_rx_chain"));
		rxq->mbufs[id] = m;

		ref = gnttab_claim_grant_reference(&rxq->gref_head);
		KASSERT(ref != GNTTAB_LIST_END,
		    ("reserved grant references exhuasted"));
		rxq->grant_ref[id] = ref;

		/* Page frame number of the buffer's data area. */
		pfn = atop(vtophys(mtod(m, vm_offset_t)));
		req = RING_GET_REQUEST(&rxq->ring, req_prod);

		gnttab_grant_foreign_access_ref(ref,
		    xenbus_get_otherend_id(rxq->info->xbdev), pfn, 0);
		req->id = id;
		req->gref = ref;
	}

	rxq->ring.req_prod_pvt = req_prod;

	/* Not enough requests? Try again later. */
	if (req_prod - rxq->ring.rsp_cons < NET_RX_SLOTS_MIN) {
		callout_reset_curcpu(&rxq->rx_refill, hz/10,
		    xn_alloc_rx_buffers_callout, rxq);
		return;
	}

	wmb();		/* barrier so the backend sees the requests */

	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&rxq->ring, notify);
	if (notify)
		xen_intr_signal(rxq->xen_intr_handle);
}
1209
/*
 * Callout handler that retries the receive-ring refill.  'arg' is the
 * Rx queue; its lock is taken around the refill.
 */
static void
xn_alloc_rx_buffers_callout(void *arg)
{
	struct netfront_rxq *queue = arg;

	XN_RX_LOCK(queue);
	xn_alloc_rx_buffers(queue);
	XN_RX_UNLOCK(queue);
}
1219
1220static void
1221xn_release_rx_bufs(struct netfront_rxq *rxq)
1222{
1223	int i,  ref;
1224	struct mbuf *m;
1225
1226	for (i = 0; i < NET_RX_RING_SIZE; i++) {
1227		m = rxq->mbufs[i];
1228
1229		if (m == NULL)
1230			continue;
1231
1232		ref = rxq->grant_ref[i];
1233		if (ref == GRANT_REF_INVALID)
1234			continue;
1235
1236		gnttab_end_foreign_access_ref(ref);
1237		gnttab_release_grant_reference(&rxq->gref_head, ref);
1238		rxq->mbufs[i] = NULL;
1239		rxq->grant_ref[i] = GRANT_REF_INVALID;
1240		m_freem(m);
1241	}
1242}
1243
/*
 * Drain the receive ring and pass completed packets up the stack.
 *
 * Responses are consumed up to the producer index published by the other
 * end.  Each packet is assembled by xn_get_responses(); packets that fail
 * are counted as input drops and any partial mbuf chain is parked on an
 * error queue to be freed later.  Good packets get their receive
 * interface, checksum flags and (when GSO extra info is present) TSO
 * segment size set, and are queued locally.  After each pass the ring is
 * refilled and re-checked for new responses.
 *
 * Once the ring is drained, the queued packets are handed to LRO when it
 * is enabled and usable, or directly to if_input() otherwise.
 *
 * The Rx queue lock must be held.  Nothing is done while the carrier is
 * off.
 */
static void
xn_rxeof(struct netfront_rxq *rxq)
{
	if_t ifp;
	struct netfront_info *np = rxq->info;
#if (defined(INET) || defined(INET6))
	struct lro_ctrl *lro = &rxq->lro;
#endif
	struct netfront_rx_info rinfo;
	struct netif_rx_response *rx = &rinfo.rx;
	struct netif_extra_info *extras = rinfo.extras;
	RING_IDX i, rp;
	struct mbuf *m;
	struct mbufq mbufq_rxq, mbufq_errq;
	int err, work_to_do;

	XN_RX_LOCK_ASSERT(rxq);

	if (!netfront_carrier_ok(np))
		return;

	/* XXX: there should be some sane limit. */
	mbufq_init(&mbufq_errq, INT_MAX);
	mbufq_init(&mbufq_rxq, INT_MAX);

	ifp = np->xn_ifp;

	do {
		rp = rxq->ring.sring->rsp_prod;
		rmb();	/* Ensure we see queued responses up to 'rp'. */

		i = rxq->ring.rsp_cons;
		while ((i != rp)) {
			/*
			 * Work on a private copy of the response and start
			 * with a clean set of extra-info slots.
			 */
			memcpy(rx, RING_GET_RESPONSE(&rxq->ring, i), sizeof(*rx));
			memset(extras, 0, sizeof(rinfo.extras));

			/* xn_get_responses() advances 'i' past this packet. */
			m = NULL;
			err = xn_get_responses(rxq, &rinfo, rp, &i, &m);

			if (__predict_false(err)) {
				if (m)
					(void )mbufq_enqueue(&mbufq_errq, m);
				if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
				continue;
			}

			m->m_pkthdr.rcvif = ifp;
			if (rx->flags & NETRXF_data_validated) {
				/*
				 * According to mbuf(9) the correct way to tell
				 * the stack that the checksum of an inbound
				 * packet is correct, without it actually being
				 * present (because the underlying interface
				 * doesn't provide it), is to set the
				 * CSUM_DATA_VALID and CSUM_PSEUDO_HDR flags,
				 * and the csum_data field to 0xffff.
				 */
				m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID
				    | CSUM_PSEUDO_HDR);
				m->m_pkthdr.csum_data = 0xffff;
			}
			/* Carry over the GSO segment size, if one was sent. */
			if ((rx->flags & NETRXF_extra_info) != 0 &&
			    (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type ==
			    XEN_NETIF_EXTRA_TYPE_GSO)) {
				m->m_pkthdr.tso_segsz =
				extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].u.gso.size;
				m->m_pkthdr.csum_flags |= CSUM_TSO;
			}

			(void )mbufq_enqueue(&mbufq_rxq, m);
		}

		rxq->ring.rsp_cons = i;

		/* Repost buffers for the slots that were just consumed. */
		xn_alloc_rx_buffers(rxq);

		RING_FINAL_CHECK_FOR_RESPONSES(&rxq->ring, work_to_do);
	} while (work_to_do);

	/* Free whatever was collected from failed packets. */
	mbufq_drain(&mbufq_errq);
	/*
	 * Deliver the queued packets now that the ring has been drained;
	 * they are dequeued and handed over one at a time.
	 */
	while ((m = mbufq_dequeue(&mbufq_rxq)) != NULL) {
		if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
#if (defined(INET) || defined(INET6))
		/* Use LRO if possible */
		if ((if_getcapenable(ifp) & IFCAP_LRO) == 0 ||
		    lro->lro_cnt == 0 || tcp_lro_rx(lro, m, 0)) {
			/*
			 * If LRO fails, pass up to the stack
			 * directly.
			 */
			if_input(ifp, m);
		}
#else
		if_input(ifp, m);
#endif
	}

#if (defined(INET) || defined(INET6))
	/*
	 * Flush any outstanding LRO work
	 */
	tcp_lro_flush_all(lro);
#endif
}
1352
/*
 * Reap completed transmit requests.
 *
 * For every response up to the producer index published by the other
 * end, the grant on the fragment is revoked and returned to the pool,
 * the slot goes back on the free list and the mbuf reference is dropped.
 * NETIF_RSP_NULL responses are skipped; any status other than
 * NETIF_RSP_OKAY is logged.  The function panics if the backend still
 * holds a grant that it has reported as completed.
 *
 * Afterwards the response event threshold is re-armed, and if the queue
 * had been marked full and there is room again, transmission is
 * restarted.
 *
 * The Tx queue lock must be held.  Nothing is done while the carrier is
 * off.
 */
static void
xn_txeof(struct netfront_txq *txq)
{
	RING_IDX i, prod;
	unsigned short id;
	if_t ifp;
	netif_tx_response_t *txr;
	struct mbuf *m;
	struct netfront_info *np = txq->info;

	XN_TX_LOCK_ASSERT(txq);

	if (!netfront_carrier_ok(np))
		return;

	ifp = np->xn_ifp;

	do {
		prod = txq->ring.sring->rsp_prod;
		rmb(); /* Ensure we see responses up to 'prod'. */

		for (i = txq->ring.rsp_cons; i != prod; i++) {
			txr = RING_GET_RESPONSE(&txq->ring, i);
			if (txr->status == NETIF_RSP_NULL)
				continue;

			if (txr->status != NETIF_RSP_OKAY) {
				printf("%s: WARNING: response is %d!\n",
				       __func__, txr->status);
			}
			id = txr->id;
			m = txq->mbufs[id];
			KASSERT(m != NULL, ("mbuf not found in chain"));
			/*
			 * Values no larger than NET_TX_RING_SIZE are
			 * free-list indices rather than mbuf pointers.
			 */
			KASSERT((uintptr_t)m > NET_TX_RING_SIZE,
				("mbuf already on the free list, but we're "
				"trying to free it again!"));
			M_ASSERTVALID(m);

			if (__predict_false(gnttab_query_foreign_access(
			    txq->grant_ref[id]) != 0)) {
				panic("%s: grant id %u still in use by the "
				    "backend", __func__, id);
			}
			gnttab_end_foreign_access_ref(txq->grant_ref[id]);
			gnttab_release_grant_reference(
				&txq->gref_head, txq->grant_ref[id]);
			txq->grant_ref[id] = GRANT_REF_INVALID;

			txq->mbufs[id] = NULL;
			add_id_to_freelist(txq->mbufs, id);
			txq->mbufs_cnt--;
			mbuf_release(m);
			/*
			 * Only mark the txq active if we've freed up at
			 * least one slot to try.
			 */
			if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
		}
		txq->ring.rsp_cons = prod;

		/*
		 * Set a new event, then check for a race with the other
		 * end advancing rsp_prod. Note that it is essential to
		 * schedule a callback, no matter how few buffers are
		 * pending. Even if there is space in the transmit ring,
		 * higher layers may be blocked because too much data is
		 * outstanding: in such cases notification from Xen is
		 * likely to be the only kick that we'll get.
		 */
		txq->ring.sring->rsp_event =
		    prod + ((txq->ring.sring->req_prod - prod) >> 1) + 1;

		mb();
	} while (prod != txq->ring.sring->rsp_prod);

	/* Restart a queue that stalled on a full ring once there is room. */
	if (txq->full &&
	    ((txq->ring.sring->req_prod - prod) < NET_TX_RING_SIZE)) {
		txq->full = false;
		xn_txq_start(txq);
	}
}
1431
1432static void
1433xn_intr(void *xsc)
1434{
1435	struct netfront_txq *txq = xsc;
1436	struct netfront_info *np = txq->info;
1437	struct netfront_rxq *rxq = &np->rxq[txq->id];
1438
1439	/* kick both tx and rx */
1440	xn_rxq_intr(rxq);
1441	xn_txq_intr(txq);
1442}
1443
1444static void
1445xn_move_rx_slot(struct netfront_rxq *rxq, struct mbuf *m,
1446    grant_ref_t ref)
1447{
1448	int new = xn_rxidx(rxq->ring.req_prod_pvt);
1449
1450	KASSERT(rxq->mbufs[new] == NULL, ("mbufs != NULL"));
1451	rxq->mbufs[new] = m;
1452	rxq->grant_ref[new] = ref;
1453	RING_GET_REQUEST(&rxq->ring, rxq->ring.req_prod_pvt)->id = new;
1454	RING_GET_REQUEST(&rxq->ring, rxq->ring.req_prod_pvt)->gref = ref;
1455	rxq->ring.req_prod_pvt++;
1456}
1457
/*
 * Consume the extra-info slots that follow a receive response.
 *
 * Each extra-info entry is copied into extras[type - 1].  Entries whose
 * type is zero or not below XEN_NETIF_EXTRA_TYPE_MAX are not copied and
 * make the function return EINVAL, but the walk continues.  The buffer
 * that was posted in each consumed slot is re-posted through
 * xn_move_rx_slot().  The walk ends when an entry lacks
 * XEN_NETIF_EXTRA_FLAG_MORE, or with EINVAL when the next slot would be
 * at 'rp' (no more responses available).
 *
 * *cons is advanced to the last slot consumed.  Returns 0 or EINVAL.
 */
static int
xn_get_extras(struct netfront_rxq *rxq,
    struct netif_extra_info *extras, RING_IDX rp, RING_IDX *cons)
{
	struct netif_extra_info *extra;

	int err = 0;

	do {
		struct mbuf *m;
		grant_ref_t ref;

		/* The next slot is not available yet. */
		if (__predict_false(*cons + 1 == rp)) {
			err = EINVAL;
			break;
		}

		extra = (struct netif_extra_info *)
		RING_GET_RESPONSE(&rxq->ring, ++(*cons));

		if (__predict_false(!extra->type ||
			extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
			err = EINVAL;
		} else {
			memcpy(&extras[extra->type - 1], extra, sizeof(*extra));
		}

		/* Hand the slot's buffer and grant straight back to the ring. */
		m = xn_get_rx_mbuf(rxq, *cons);
		ref = xn_get_rx_ref(rxq,  *cons);
		xn_move_rx_slot(rxq, m, ref);
	} while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE);

	return err;
}
1492
/*
 * Gather the receive responses that make up one packet.
 *
 * Starting with the response at *cons (already copied into rinfo->rx by
 * the caller), the function optionally consumes extra-info slots, then
 * walks the fragments linked by NETRXF_more_data.  For each valid
 * fragment the grant is ended and released and the mbuf's length and data
 * offset are set from the response; the mbufs are chained through m_next
 * and the head's packet length accumulates the fragment sizes.
 *
 * A fragment with a negative status, or whose offset plus status exceeds
 * PAGE_SIZE, is re-posted to the ring instead of being chained, and the
 * packet is marked EINVAL.
 *
 * On return *list holds the head of the chain (possibly NULL) and *cons
 * has been advanced past the slots consumed.  Returns 0, EINVAL, or
 * ENOENT when the ring ran out of fragments.
 */
static int
xn_get_responses(struct netfront_rxq *rxq,
    struct netfront_rx_info *rinfo, RING_IDX rp, RING_IDX *cons,
    struct mbuf  **list)
{
	struct netif_rx_response *rx = &rinfo->rx;
	struct netif_extra_info *extras = rinfo->extras;
	struct mbuf *m, *m0, *m_prev;
	grant_ref_t ref = xn_get_rx_ref(rxq, *cons);
	int frags = 1;
	int err = 0;
	u_long ret __diagused;

	m0 = m = m_prev = xn_get_rx_mbuf(rxq, *cons);

	if (rx->flags & NETRXF_extra_info) {
		err = xn_get_extras(rxq, extras, rp, cons);
	}

	if (m0 != NULL) {
		m0->m_pkthdr.len = 0;
		m0->m_next = NULL;
	}

	for (;;) {
#if 0
		DPRINTK("rx->status=%hd rx->offset=%hu frags=%u\n",
			rx->status, rx->offset, frags);
#endif
		/* Bad fragment: give the buffer back and drop it from the chain. */
		if (__predict_false(rx->status < 0 ||
			rx->offset + rx->status > PAGE_SIZE)) {
			xn_move_rx_slot(rxq, m, ref);
			if (m0 == m)
				m0 = NULL;
			m = NULL;
			err = EINVAL;
			goto next_skip_queue;
		}

		/*
		 * This definitely indicates a bug, either in this driver or in
		 * the backend driver. In future this should flag the bad
		 * situation to the system controller to reboot the backend.
		 */
		if (ref == GRANT_REF_INVALID) {
			printf("%s: Bad rx response id %d.\n", __func__, rx->id);
			err = EINVAL;
			goto next;
		}

		ret = gnttab_end_foreign_access_ref(ref);
		KASSERT(ret, ("Unable to end access to grant references"));

		gnttab_release_grant_reference(&rxq->gref_head, ref);

next:
		if (m == NULL)
			break;

		/* The response status carries the fragment length. */
		m->m_len = rx->status;
		m->m_data += rx->offset;
		m0->m_pkthdr.len += rx->status;

next_skip_queue:
		if (!(rx->flags & NETRXF_more_data))
			break;

		if (*cons + frags == rp) {
			if (net_ratelimit())
				WPRINTK("Need more frags\n");
			err = ENOENT;
			printf("%s: cons %u frags %u rp %u, not enough frags\n",
			       __func__, *cons, frags, rp);
			break;
		}
		/*
		 * Note that m can be NULL, if rx->status < 0 or if
		 * rx->offset + rx->status > PAGE_SIZE above.
		 */
		m_prev = m;

		rx = RING_GET_RESPONSE(&rxq->ring, *cons + frags);
		m = xn_get_rx_mbuf(rxq, *cons + frags);

		/*
		 * m_prev == NULL can happen if rx->status < 0 or if
		 * rx->offset + rx->status > PAGE_SIZE above.
		 */
		if (m_prev != NULL)
			m_prev->m_next = m;

		/*
		 * m0 can be NULL if rx->status < 0 or if rx->offset +
		 * rx->status > PAGE_SIZE above.
		 */
		if (m0 == NULL)
			m0 = m;
		m->m_next = NULL;
		ref = xn_get_rx_ref(rxq, *cons + frags);
		frags++;
	}
	*list = m0;
	*cons += frags;

	return (err);
}
1599
/**
 * Given an mbuf chain, make sure we have enough room and then push
 * it onto the transmit ring.
 *
 * The chain is DMA-loaded into segments (after one m_defrag() attempt if
 * it has too many), and one ring request with its own grant reference is
 * produced per segment.  The first request carries the total packet size
 * and the checksum flags, and is followed by a GSO extra-info slot when
 * CSUM_TSO is set.  The private request producer is advanced but the
 * requests are not pushed here.
 *
 * The mbuf is consumed on every path: it is freed on error, and on
 * success it is recorded in the slot table.
 *
 * \return 0 on success; EMSGSIZE if defragmentation fails or the packet
 *         needs more than MAX_TX_REQ_FRAGS segments; otherwise the DMA
 *         load error, or EFBIG if the defragmented chain still has too
 *         many segments.
 */
static int
xn_assemble_tx_request(struct netfront_txq *txq, struct mbuf *m_head)
{
	struct netfront_info *np = txq->info;
	if_t ifp = np->xn_ifp;
	int otherend_id, error, nfrags;
	bus_dma_segment_t *segs = txq->segs;
	struct mbuf_xennet *tag;
	bus_dmamap_t map;
	unsigned int i;

	/* Take a tag (and its DMA map) off the queue's free list. */
	KASSERT(!SLIST_EMPTY(&txq->tags), ("no tags available"));
	tag = SLIST_FIRST(&txq->tags);
	SLIST_REMOVE_HEAD(&txq->tags, next);
	KASSERT(tag->count == 0, ("tag already in-use"));
	map = tag->dma_map;
	error = bus_dmamap_load_mbuf_sg(np->dma_tag, map, m_head, segs,
	    &nfrags, 0);
	if (error == EFBIG || nfrags > np->maxfrags) {
		struct mbuf *m;

		/* Too many segments: compact the chain and try once more. */
		bus_dmamap_unload(np->dma_tag, map);
		m = m_defrag(m_head, M_NOWAIT);
		if (!m) {
			/*
			 * Defrag failed, so free the mbuf and
			 * therefore drop the packet.
			 */
			SLIST_INSERT_HEAD(&txq->tags, tag, next);
			m_freem(m_head);
			return (EMSGSIZE);
		}
		m_head = m;
		error = bus_dmamap_load_mbuf_sg(np->dma_tag, map, m_head, segs,
		    &nfrags, 0);
		if (error != 0 || nfrags > np->maxfrags) {
			bus_dmamap_unload(np->dma_tag, map);
			SLIST_INSERT_HEAD(&txq->tags, tag, next);
			m_freem(m_head);
			return (error ?: EFBIG);
		}
	} else if (error != 0) {
		SLIST_INSERT_HEAD(&txq->tags, tag, next);
		m_freem(m_head);
		return (error);
	}

	/**
	 * The FreeBSD TCP stack, with TSO enabled, can produce a chain
	 * of mbufs longer than Linux can handle.  Make sure we don't
	 * pass a too-long chain over to the other side by dropping the
	 * packet.  It doesn't look like there is currently a way to
	 * tell the TCP stack to generate a shorter chain of packets.
	 */
	if (nfrags > MAX_TX_REQ_FRAGS) {
#ifdef DEBUG
		printf("%s: nfrags %d > MAX_TX_REQ_FRAGS %d, netback "
		       "won't be able to handle it, dropping\n",
		       __func__, nfrags, MAX_TX_REQ_FRAGS);
#endif
		SLIST_INSERT_HEAD(&txq->tags, tag, next);
		bus_dmamap_unload(np->dma_tag, map);
		m_freem(m_head);
		return (EMSGSIZE);
	}

	/*
	 * This check should be redundant.  We've already verified that we
	 * have enough slots in the ring to handle a packet of maximum
	 * size, and that our packet is less than the maximum size.  Keep
	 * it in here as an assert for now just to make certain that
	 * chain_cnt is accurate.
	 */
	KASSERT((txq->mbufs_cnt + nfrags) <= NET_TX_RING_SIZE,
		("%s: chain_cnt (%d) + nfrags (%d) > NET_TX_RING_SIZE "
		 "(%d)!", __func__, (int) txq->mbufs_cnt,
                    (int) nfrags, (int) NET_TX_RING_SIZE));

	/*
	 * Start packing the mbufs in this chain into
	 * the fragment pointers. Stop when we run out
	 * of fragments or hit the end of the mbuf chain.
	 */
	otherend_id = xenbus_get_otherend_id(np->xbdev);
	m_tag_prepend(m_head, &tag->tag);
	for (i = 0; i < nfrags; i++) {
		netif_tx_request_t *tx;
		uintptr_t id;
		grant_ref_t ref;
		u_long mfn; /* XXX Wrong type? */

		tx = RING_GET_REQUEST(&txq->ring, txq->ring.req_prod_pvt);
		id = get_id_from_freelist(txq->mbufs);
		if (id == 0)
			panic("%s: was allocated the freelist head!\n",
			    __func__);
		txq->mbufs_cnt++;
		if (txq->mbufs_cnt > NET_TX_RING_SIZE)
			panic("%s: tx_chain_cnt must be <= NET_TX_RING_SIZE\n",
			    __func__);
		/* Every slot of the packet records the same head mbuf. */
		mbuf_grab(m_head);
		txq->mbufs[id] = m_head;
		tx->id = id;
		ref = gnttab_claim_grant_reference(&txq->gref_head);
		KASSERT((short)ref >= 0, ("Negative ref"));
		mfn = atop(segs[i].ds_addr);
		gnttab_grant_foreign_access_ref(ref, otherend_id,
		    mfn, GNTMAP_readonly);
		tx->gref = txq->grant_ref[id] = ref;
		tx->offset = segs[i].ds_addr & PAGE_MASK;
		KASSERT(tx->offset + segs[i].ds_len <= PAGE_SIZE,
		    ("mbuf segment crosses a page boundary"));
		tx->flags = 0;
		if (i == 0) {
			/*
			 * The first fragment has the entire packet
			 * size, subsequent fragments have just the
			 * fragment size. The backend works out the
			 * true size of the first fragment by
			 * subtracting the sizes of the other
			 * fragments.
			 */
			tx->size = m_head->m_pkthdr.len;

			/*
			 * The first fragment contains the checksum flags
			 * and is optionally followed by extra data for
			 * TSO etc.
			 */
			/**
			 * CSUM_TSO requires checksum offloading.
			 * Some versions of FreeBSD fail to
			 * set CSUM_TCP in the CSUM_TSO case,
			 * so we have to test for CSUM_TSO
			 * explicitly.
			 */
			if (m_head->m_pkthdr.csum_flags
			    & (CSUM_DELAY_DATA | CSUM_TSO)) {
				tx->flags |= (NETTXF_csum_blank
				    | NETTXF_data_validated);
			}
			if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
				/*
				 * The GSO descriptor occupies the ring slot
				 * right after the first request, so the
				 * private producer is bumped here as well.
				 */
				struct netif_extra_info *gso =
					(struct netif_extra_info *)
					RING_GET_REQUEST(&txq->ring,
							 ++txq->ring.req_prod_pvt);

				tx->flags |= NETTXF_extra_info;

				gso->u.gso.size = m_head->m_pkthdr.tso_segsz;
				gso->u.gso.type =
					XEN_NETIF_GSO_TYPE_TCPV4;
				gso->u.gso.pad = 0;
				gso->u.gso.features = 0;

				gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
				gso->flags = 0;
			}
		} else {
			tx->size = segs[i].ds_len;
		}
		/* Every request but the last announces a following fragment. */
		if (i != nfrags - 1)
			tx->flags |= NETTXF_more_data;

		txq->ring.req_prod_pvt++;
	}
	bus_dmamap_sync(np->dma_tag, map, BUS_DMASYNC_PREWRITE);
	BPF_MTAP(ifp, m_head);

	if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
	if_inc_counter(ifp, IFCOUNTER_OBYTES, m_head->m_pkthdr.len);
	if (m_head->m_flags & M_MCAST)
		if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);

	/* Reap any completions while we are here. */
	xn_txeof(txq);

	return (0);
}
1782
1783/* equivalent of network_open() in Linux */
1784static void
1785xn_ifinit_locked(struct netfront_info *np)
1786{
1787	if_t ifp;
1788	int i;
1789	struct netfront_rxq *rxq;
1790
1791	XN_LOCK_ASSERT(np);
1792
1793	ifp = np->xn_ifp;
1794
1795	if (if_getdrvflags(ifp) & IFF_DRV_RUNNING || !netfront_carrier_ok(np))
1796		return;
1797
1798	xn_stop(np);
1799
1800	for (i = 0; i < np->num_queues; i++) {
1801		rxq = &np->rxq[i];
1802		XN_RX_LOCK(rxq);
1803		xn_alloc_rx_buffers(rxq);
1804		rxq->ring.sring->rsp_event = rxq->ring.rsp_cons + 1;
1805		if (RING_HAS_UNCONSUMED_RESPONSES(&rxq->ring))
1806			xn_rxeof(rxq);
1807		XN_RX_UNLOCK(rxq);
1808	}
1809
1810	if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0);
1811	if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
1812	if_link_state_change(ifp, LINK_STATE_UP);
1813}
1814
/*
 * Interface init entry point: initialise the interface with the softc
 * lock held.  'xsc' is the driver softc.
 */
static void
xn_ifinit(void *xsc)
{
	struct netfront_info *np;

	np = xsc;

	XN_LOCK(np);
	xn_ifinit_locked(np);
	XN_UNLOCK(np);
}
1824
/*
 * Interface ioctl handler.
 *
 * Handles address assignment, MTU changes, interface flag changes,
 * capability toggles and media requests.  SIOCADDMULTI and SIOCDELMULTI
 * are accepted and ignored; anything else is passed to ether_ioctl().
 *
 * Returns 0 on success or an errno value.
 */
static int
xn_ioctl(if_t ifp, u_long cmd, caddr_t data)
{
	struct netfront_info *sc = if_getsoftc(ifp);
	struct ifreq *ifr = (struct ifreq *) data;
	device_t dev;
#ifdef INET
	struct ifaddr *ifa = (struct ifaddr *)data;
#endif
	int mask, error = 0, reinit;

	dev = sc->xbdev;

	switch(cmd) {
	case SIOCSIFADDR:
#ifdef INET
		/*
		 * An IPv4 address marks the interface up, initialises it if
		 * it is not running yet and primes ARP.  Other address
		 * families go to ether_ioctl().
		 */
		XN_LOCK(sc);
		if (ifa->ifa_addr->sa_family == AF_INET) {
			if_setflagbits(ifp, IFF_UP, 0);
			if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING))
				xn_ifinit_locked(sc);
			arp_ifinit(ifp, ifa);
			XN_UNLOCK(sc);
		} else {
			XN_UNLOCK(sc);
#endif
			error = ether_ioctl(ifp, cmd, data);
#ifdef INET
		}
#endif
		break;
	case SIOCSIFMTU:
		if (if_getmtu(ifp) == ifr->ifr_mtu)
			break;

		if_setmtu(ifp, ifr->ifr_mtu);
		/*
		 * Clear RUNNING first: xn_ifinit_locked() returns early
		 * when the driver is already marked running.
		 */
		if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING);
		xn_ifinit(sc);
		break;
	case SIOCSIFFLAGS:
		XN_LOCK(sc);
		if (if_getflags(ifp) & IFF_UP) {
			/*
			 * The interface is marked up: (re)initialise it.
			 * xn_ifinit_locked() returns at once when the
			 * driver is already running or the carrier is off,
			 * so flag-only changes cost nothing here.
			 */
			xn_ifinit_locked(sc);
		} else {
			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
				xn_stop(sc);
			}
		}
		/* Remember the flags that were just acted upon. */
		sc->xn_if_flags = if_getflags(ifp);
		XN_UNLOCK(sc);
		break;
	case SIOCSIFCAP:
		/* Bits that differ between the request and the current set. */
		mask = ifr->ifr_reqcap ^ if_getcapenable(ifp);
		reinit = 0;

		if (mask & IFCAP_TXCSUM) {
			if_togglecapenable(ifp, IFCAP_TXCSUM);
			if_togglehwassist(ifp, XN_CSUM_FEATURES);
		}
		if (mask & IFCAP_TSO4) {
			if_togglecapenable(ifp, IFCAP_TSO4);
			if_togglehwassist(ifp, CSUM_TSO);
		}

		if (mask & (IFCAP_RXCSUM | IFCAP_LRO)) {
			/* These Rx features require us to renegotiate. */
			reinit = 1;

			if (mask & IFCAP_RXCSUM)
				if_togglecapenable(ifp, IFCAP_RXCSUM);
			if (mask & IFCAP_LRO)
				if_togglecapenable(ifp, IFCAP_LRO);
		}

		if (reinit == 0)
			break;

		/*
		 * We must reset the interface so the backend picks up the
		 * new features.
		 */
		device_printf(sc->xbdev,
		    "performing interface reset due to feature change\n");
		XN_LOCK(sc);
		netfront_carrier_off(sc);
		sc->xn_reset = true;
		/*
		 * NB: the pending packet queue is not flushed, since
		 * the interface should still support the old options.
		 */
		XN_UNLOCK(sc);
		/*
		 * Delete the xenstore nodes that export features.
		 *
		 * NB: There's a xenbus state called
		 * "XenbusStateReconfiguring", which is what we should set
		 * here. Sadly none of the backends know how to handle it,
		 * and simply disconnect from the frontend, so we force a
		 * reconnection instead.
		 *
		 * NOTE(review): the state set below is XenbusStateClosing;
		 * the move back to XenbusStateInitialising is presumably
		 * driven from elsewhere (xn_reset is set above) - confirm.
		 */
		xs_rm(XST_NIL, xenbus_get_node(dev), "feature-gso-tcpv4");
		xs_rm(XST_NIL, xenbus_get_node(dev), "feature-no-csum-offload");
		xenbus_set_state(dev, XenbusStateClosing);

		/*
		 * Wait for the frontend to reconnect before returning
		 * from the ioctl. 30s should be more than enough for any
		 * sane backend to reconnect.  The sleep is on the softc,
		 * which xn_connect() wakes once the carrier is back on;
		 * the tsleep() result becomes the ioctl's return value.
		 */
		error = tsleep(sc, 0, "xn_rst", 30*hz);
		break;
	case SIOCADDMULTI:
	case SIOCDELMULTI:
		/* Nothing to do for multicast membership changes. */
		break;
	case SIOCSIFMEDIA:
	case SIOCGIFMEDIA:
		error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd);
		break;
	default:
		error = ether_ioctl(ifp, cmd, data);
	}

	return (error);
}
1958
1959static void
1960xn_stop(struct netfront_info *sc)
1961{
1962	if_t ifp;
1963
1964	XN_LOCK_ASSERT(sc);
1965
1966	ifp = sc->xn_ifp;
1967
1968	if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
1969	if_link_state_change(ifp, LINK_STATE_DOWN);
1970}
1971
1972static void
1973xn_rebuild_rx_bufs(struct netfront_rxq *rxq)
1974{
1975	int requeue_idx, i;
1976	grant_ref_t ref;
1977	netif_rx_request_t *req;
1978
1979	for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) {
1980		struct mbuf *m;
1981		u_long pfn;
1982
1983		if (rxq->mbufs[i] == NULL)
1984			continue;
1985
1986		m = rxq->mbufs[requeue_idx] = xn_get_rx_mbuf(rxq, i);
1987		ref = rxq->grant_ref[requeue_idx] = xn_get_rx_ref(rxq, i);
1988
1989		req = RING_GET_REQUEST(&rxq->ring, requeue_idx);
1990		pfn = vtophys(mtod(m, vm_offset_t)) >> PAGE_SHIFT;
1991
1992		gnttab_grant_foreign_access_ref(ref,
1993		    xenbus_get_otherend_id(rxq->info->xbdev),
1994		    pfn, 0);
1995
1996		req->gref = ref;
1997		req->id   = requeue_idx;
1998
1999		requeue_idx++;
2000	}
2001
2002	rxq->ring.req_prod_pvt = requeue_idx;
2003}
2004
2005/* START of Xenolinux helper functions adapted to FreeBSD */
2006static int
2007xn_connect(struct netfront_info *np)
2008{
2009	int i, error;
2010	u_int feature_rx_copy;
2011	struct netfront_rxq *rxq;
2012	struct netfront_txq *txq;
2013
2014	error = xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev),
2015	    "feature-rx-copy", NULL, "%u", &feature_rx_copy);
2016	if (error != 0)
2017		feature_rx_copy = 0;
2018
2019	/* We only support rx copy. */
2020	if (!feature_rx_copy)
2021		return (EPROTONOSUPPORT);
2022
2023	/* Recovery procedure: */
2024	error = talk_to_backend(np->xbdev, np);
2025	if (error != 0)
2026		return (error);
2027
2028	/* Step 1: Reinitialise variables. */
2029	xn_query_features(np);
2030	xn_configure_features(np);
2031
2032	/* Step 2: Release TX buffer */
2033	for (i = 0; i < np->num_queues; i++) {
2034		txq = &np->txq[i];
2035		xn_release_tx_bufs(txq);
2036	}
2037
2038	/* Step 3: Rebuild the RX buffer freelist and the RX ring itself. */
2039	for (i = 0; i < np->num_queues; i++) {
2040		rxq = &np->rxq[i];
2041		xn_rebuild_rx_bufs(rxq);
2042	}
2043
2044	/* Step 4: All public and private state should now be sane.  Get
2045	 * ready to start sending and receiving packets and give the driver
2046	 * domain a kick because we've probably just requeued some
2047	 * packets.
2048	 */
2049	netfront_carrier_on(np);
2050	wakeup(np);
2051
2052	return (0);
2053}
2054
2055static void
2056xn_kick_rings(struct netfront_info *np)
2057{
2058	struct netfront_rxq *rxq;
2059	struct netfront_txq *txq;
2060	int i;
2061
2062	for (i = 0; i < np->num_queues; i++) {
2063		txq = &np->txq[i];
2064		rxq = &np->rxq[i];
2065		xen_intr_signal(txq->xen_intr_handle);
2066		XN_TX_LOCK(txq);
2067		xn_txeof(txq);
2068		XN_TX_UNLOCK(txq);
2069		XN_RX_LOCK(rxq);
2070		xn_alloc_rx_buffers(rxq);
2071		XN_RX_UNLOCK(rxq);
2072	}
2073}
2074
2075static void
2076xn_query_features(struct netfront_info *np)
2077{
2078	int val;
2079
2080	device_printf(np->xbdev, "backend features:");
2081
2082	if (xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev),
2083		"feature-sg", NULL, "%d", &val) != 0)
2084		val = 0;
2085
2086	np->maxfrags = 1;
2087	if (val) {
2088		np->maxfrags = MAX_TX_REQ_FRAGS;
2089		printf(" feature-sg");
2090	}
2091
2092	if (xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev),
2093		"feature-gso-tcpv4", NULL, "%d", &val) != 0)
2094		val = 0;
2095
2096	if_setcapabilitiesbit(np->xn_ifp, 0, IFCAP_TSO4 | IFCAP_LRO);
2097	if (val) {
2098		if_setcapabilitiesbit(np->xn_ifp, IFCAP_TSO4 | IFCAP_LRO, 0);
2099		printf(" feature-gso-tcp4");
2100	}
2101
2102	/*
2103	 * HW CSUM offload is assumed to be available unless
2104	 * feature-no-csum-offload is set in xenstore.
2105	 */
2106	if (xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev),
2107		"feature-no-csum-offload", NULL, "%d", &val) != 0)
2108		val = 0;
2109
2110	if_setcapabilitiesbit(np->xn_ifp, IFCAP_HWCSUM, 0);
2111	if (val) {
2112		if_setcapabilitiesbit(np->xn_ifp, 0, IFCAP_HWCSUM);
2113		printf(" feature-no-csum-offload");
2114	}
2115
2116	printf("\n");
2117}
2118
/*
 * Reconcile the enabled capabilities with what the interface currently
 * advertises.
 *
 * If every enabled capability is still available nothing is changed.
 * Otherwise all capabilities and hardware-assist bits are cleared and
 * each of LRO, TSO4, TXCSUM and RXCSUM is re-enabled only if it was
 * enabled before and is still available.  LRO state is freed on every
 * queue when LRO had been enabled, and initialised again when LRO
 * remains enabled (and the xn_enable_lro tunable allows it).
 *
 * Returns 0, or the error from tcp_lro_init() if LRO setup failed, in
 * which case IFCAP_LRO is left disabled.
 */
static int
xn_configure_features(struct netfront_info *np)
{
	int err, cap_enabled;
#if (defined(INET) || defined(INET6))
	int i;
#endif
	if_t ifp;

	ifp = np->xn_ifp;
	err = 0;

	if ((if_getcapenable(ifp) & if_getcapabilities(ifp)) == if_getcapenable(ifp)) {
		/* Current options are available, no need to do anything. */
		return (0);
	}

	/* Try to preserve as many options as possible. */
	cap_enabled = if_getcapenable(ifp);
	if_setcapenable(ifp, 0);
	if_sethwassist(ifp, 0);

#if (defined(INET) || defined(INET6))
	/* Drop the old LRO state before deciding whether to rebuild it. */
	if ((cap_enabled & IFCAP_LRO) != 0)
		for (i = 0; i < np->num_queues; i++)
			tcp_lro_free(&np->rxq[i].lro);
	if (xn_enable_lro &&
	    (if_getcapabilities(ifp) & cap_enabled & IFCAP_LRO) != 0) {
	    	if_setcapenablebit(ifp, IFCAP_LRO, 0);
		for (i = 0; i < np->num_queues; i++) {
			err = tcp_lro_init(&np->rxq[i].lro);
			if (err != 0) {
				/*
				 * NOTE(review): queues initialised before
				 * the failing one keep their LRO state
				 * while IFCAP_LRO is cleared - confirm that
				 * this is intended.
				 */
				device_printf(np->xbdev,
				    "LRO initialization failed\n");
				if_setcapenablebit(ifp, 0, IFCAP_LRO);
				break;
			}
			np->rxq[i].lro.ifp = ifp;
		}
	}
	if ((if_getcapabilities(ifp) & cap_enabled & IFCAP_TSO4) != 0) {
		if_setcapenablebit(ifp, IFCAP_TSO4, 0);
		if_sethwassistbits(ifp, CSUM_TSO, 0);
	}
#endif
	if ((if_getcapabilities(ifp) & cap_enabled & IFCAP_TXCSUM) != 0) {
		if_setcapenablebit(ifp, IFCAP_TXCSUM, 0);
		if_sethwassistbits(ifp, XN_CSUM_FEATURES, 0);
	}
	if ((if_getcapabilities(ifp) & cap_enabled & IFCAP_RXCSUM) != 0)
		if_setcapenablebit(ifp, IFCAP_RXCSUM, 0);

	return (err);
}
2173
2174static int
2175xn_txq_mq_start_locked(struct netfront_txq *txq, struct mbuf *m)
2176{
2177	struct netfront_info *np;
2178	if_t ifp;
2179	struct buf_ring *br;
2180	int error, notify;
2181
2182	np = txq->info;
2183	br = txq->br;
2184	ifp = np->xn_ifp;
2185	error = 0;
2186
2187	XN_TX_LOCK_ASSERT(txq);
2188
2189	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 ||
2190	    !netfront_carrier_ok(np)) {
2191		if (m != NULL)
2192			error = drbr_enqueue(ifp, br, m);
2193		return (error);
2194	}
2195
2196	if (m != NULL) {
2197		error = drbr_enqueue(ifp, br, m);
2198		if (error != 0)
2199			return (error);
2200	}
2201
2202	while ((m = drbr_peek(ifp, br)) != NULL) {
2203		if (!xn_tx_slot_available(txq)) {
2204			drbr_putback(ifp, br, m);
2205			break;
2206		}
2207
2208		error = xn_assemble_tx_request(txq, m);
2209		/* xn_assemble_tx_request always consumes the mbuf*/
2210		if (error != 0) {
2211			drbr_advance(ifp, br);
2212			break;
2213		}
2214
2215		RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&txq->ring, notify);
2216		if (notify)
2217			xen_intr_signal(txq->xen_intr_handle);
2218
2219		drbr_advance(ifp, br);
2220	}
2221
2222	if (RING_FULL(&txq->ring))
2223		txq->full = true;
2224
2225	return (0);
2226}
2227
2228static int
2229xn_txq_mq_start(if_t ifp, struct mbuf *m)
2230{
2231	struct netfront_info *np;
2232	struct netfront_txq *txq;
2233	int i, npairs, error;
2234
2235	np = if_getsoftc(ifp);
2236	npairs = np->num_queues;
2237
2238	if (!netfront_carrier_ok(np))
2239		return (ENOBUFS);
2240
2241	KASSERT(npairs != 0, ("called with 0 available queues"));
2242
2243	/* check if flowid is set */
2244	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
2245		i = m->m_pkthdr.flowid % npairs;
2246	else
2247		i = curcpu % npairs;
2248
2249	txq = &np->txq[i];
2250
2251	if (XN_TX_TRYLOCK(txq) != 0) {
2252		error = xn_txq_mq_start_locked(txq, m);
2253		XN_TX_UNLOCK(txq);
2254	} else {
2255		error = drbr_enqueue(ifp, txq->br, m);
2256		taskqueue_enqueue(txq->tq, &txq->defrtask);
2257	}
2258
2259	return (error);
2260}
2261
2262static void
2263xn_qflush(if_t ifp)
2264{
2265	struct netfront_info *np;
2266	struct netfront_txq *txq;
2267	struct mbuf *m;
2268	int i;
2269
2270	np = if_getsoftc(ifp);
2271
2272	for (i = 0; i < np->num_queues; i++) {
2273		txq = &np->txq[i];
2274
2275		XN_TX_LOCK(txq);
2276		while ((m = buf_ring_dequeue_sc(txq->br)) != NULL)
2277			m_freem(m);
2278		XN_TX_UNLOCK(txq);
2279	}
2280
2281	if_qflush(ifp);
2282}
2283
/**
 * Create a network device.
 *
 * Initialises the softc lock and media state, reads the MAC address,
 * allocates and configures the ifnet (ioctl/transmit/qflush/init methods,
 * capabilities and TSO limits), attaches it as an Ethernet interface with
 * the carrier off, and creates the DMA tag used for transmit mappings.
 *
 * @param dev  Newbus device representing this virtual NIC.
 *
 * @return 0 on success, otherwise the error from xen_net_read_mac() or
 *         bus_dma_tag_create().
 *
 * NOTE(review): no unwinding of the lock, media or attached ifnet is
 * visible here on the failure paths - presumably the caller cleans up;
 * confirm.
 */
int
create_netdev(device_t dev)
{
	struct netfront_info *np;
	int err, cap_enabled;
	if_t ifp;

	np = device_get_softc(dev);

	np->xbdev         = dev;

	mtx_init(&np->sc_lock, "xnsc", "netfront softc lock", MTX_DEF);

	/* A single manual Ethernet media type is offered. */
	ifmedia_init(&np->sc_media, 0, xn_ifmedia_upd, xn_ifmedia_sts);
	ifmedia_add(&np->sc_media, IFM_ETHER|IFM_MANUAL, 0, NULL);
	ifmedia_set(&np->sc_media, IFM_ETHER|IFM_MANUAL);

	err = xen_net_read_mac(dev, np->mac);
	if (err != 0)
		goto error;

	/* Set up ifnet structure */
	ifp = np->xn_ifp = if_alloc(IFT_ETHER);
	if_setsoftc(ifp, np);
	if_initname(ifp, "xn",  device_get_unit(dev));
	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
	if_setioctlfn(ifp, xn_ioctl);

	if_settransmitfn(ifp, xn_txq_mq_start);
	if_setqflushfn(ifp, xn_qflush);

	if_setinitfn(ifp, xn_ifinit);

	if_sethwassist(ifp, XN_CSUM_FEATURES);
	/* Enable all supported features at device creation. */
	if_setcapabilities(ifp, IFCAP_HWCSUM|IFCAP_TSO4|IFCAP_LRO);
	cap_enabled = if_getcapabilities(ifp);
	/* LRO stays advertised but starts disabled when the tunable is off. */
	if (!xn_enable_lro) {
		cap_enabled &= ~IFCAP_LRO;
	}
	if_setcapenable(ifp, cap_enabled);

	/* TSO limits: segment count and size match the DMA tag below. */
	if_sethwtsomax(ifp, 65536 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
	if_sethwtsomaxsegcount(ifp, MAX_TX_REQ_FRAGS);
	if_sethwtsomaxsegsize(ifp, PAGE_SIZE);

	ether_ifattach(ifp, np->mac);
	netfront_carrier_off(np);

	err = bus_dma_tag_create(
	    bus_get_dma_tag(dev),		/* parent */
	    1, PAGE_SIZE,			/* algnmnt, boundary */
	    BUS_SPACE_MAXADDR,			/* lowaddr */
	    BUS_SPACE_MAXADDR,			/* highaddr */
	    NULL, NULL,				/* filter, filterarg */
	    PAGE_SIZE * MAX_TX_REQ_FRAGS,	/* max request size */
	    MAX_TX_REQ_FRAGS,			/* max segments */
	    PAGE_SIZE,				/* maxsegsize */
	    BUS_DMA_ALLOCNOW,			/* flags */
	    NULL, NULL,				/* lockfunc, lockarg */
	    &np->dma_tag);

	return (err);

error:
	KASSERT(err != 0, ("Error path with no error code specified"));
	return (err);
}
2356
2357static int
2358netfront_detach(device_t dev)
2359{
2360	struct netfront_info *info = device_get_softc(dev);
2361
2362	DPRINTK("%s\n", xenbus_get_node(dev));
2363
2364	netif_free(info);
2365
2366	return 0;
2367}
2368
/*
 * Tear down the network interface owned by the softc: stop it, disconnect
 * the queues from the backend, detach and free the ifnet, and release the
 * queue arrays, media list and DMA tag.
 *
 * NOTE(review): only the rxq/txq arrays themselves are freed here;
 * presumably any per-queue resources are released elsewhere (e.g. by the
 * disconnect path) -- confirm.
 */
static void
netif_free(struct netfront_info *np)
{

	/* Stop the interface with the softc lock held. */
	XN_LOCK(np);
	xn_stop(np);
	XN_UNLOCK(np);
	/* Carrier off and per-queue disconnect happen before the detach. */
	netif_disconnect_backend(np);
	ether_ifdetach(np->xn_ifp);
	free(np->rxq, M_DEVBUF);
	free(np->txq, M_DEVBUF);
	if_free(np->xn_ifp);
	/* Clear the stale pointer now that the ifnet is gone. */
	np->xn_ifp = NULL;
	ifmedia_removeall(&np->sc_media);
	bus_dma_tag_destroy(np->dma_tag);
}
2385
2386static void
2387netif_disconnect_backend(struct netfront_info *np)
2388{
2389	u_int i;
2390
2391	for (i = 0; i < np->num_queues; i++) {
2392		XN_RX_LOCK(&np->rxq[i]);
2393		XN_TX_LOCK(&np->txq[i]);
2394	}
2395	netfront_carrier_off(np);
2396	for (i = 0; i < np->num_queues; i++) {
2397		XN_RX_UNLOCK(&np->rxq[i]);
2398		XN_TX_UNLOCK(&np->txq[i]);
2399	}
2400
2401	for (i = 0; i < np->num_queues; i++) {
2402		disconnect_rxq(&np->rxq[i]);
2403		disconnect_txq(&np->txq[i]);
2404	}
2405}
2406
2407static int
2408xn_ifmedia_upd(if_t ifp)
2409{
2410
2411	return (0);
2412}
2413
2414static void
2415xn_ifmedia_sts(if_t ifp, struct ifmediareq *ifmr)
2416{
2417
2418	ifmr->ifm_status = IFM_AVALID|IFM_ACTIVE;
2419	ifmr->ifm_active = IFM_ETHER|IFM_MANUAL;
2420}
2421
2422/* ** Driver registration ** */
2423static device_method_t netfront_methods[] = {
2424	/* Device interface */
2425	DEVMETHOD(device_probe,         netfront_probe),
2426	DEVMETHOD(device_attach,        netfront_attach),
2427	DEVMETHOD(device_detach,        netfront_detach),
2428	DEVMETHOD(device_shutdown,      bus_generic_shutdown),
2429	DEVMETHOD(device_suspend,       netfront_suspend),
2430	DEVMETHOD(device_resume,        netfront_resume),
2431
2432	/* Xenbus interface */
2433	DEVMETHOD(xenbus_otherend_changed, netfront_backend_changed),
2434
2435	DEVMETHOD_END
2436};
2437
2438static driver_t netfront_driver = {
2439	"xn",
2440	netfront_methods,
2441	sizeof(struct netfront_info),
2442};
2443
/*
 * Register netfront_driver on the xenbusb_front bus under the module name
 * "xe".  The two trailing NULLs are presumably the optional module event
 * handler and its argument (see DRIVER_MODULE(9)) -- none is supplied.
 */
DRIVER_MODULE(xe, xenbusb_front, netfront_driver, NULL, NULL);
2445