if_hn.c revision 308511
1/*-
2 * Copyright (c) 2010-2012 Citrix Inc.
3 * Copyright (c) 2009-2012,2016 Microsoft Corp.
4 * Copyright (c) 2012 NetApp Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice unmodified, this list of conditions, and the following
12 *    disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/*-
30 * Copyright (c) 2004-2006 Kip Macy
31 * All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 *    notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 *    notice, this list of conditions and the following disclaimer in the
40 *    documentation and/or other materials provided with the distribution.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 */
54
55#include <sys/cdefs.h>
56__FBSDID("$FreeBSD: stable/10/sys/dev/hyperv/netvsc/if_hn.c 308511 2016-11-11 07:25:14Z sephe $");
57
58#include "opt_inet6.h"
59#include "opt_inet.h"
60
61#include <sys/param.h>
62#include <sys/bus.h>
63#include <sys/kernel.h>
64#include <sys/limits.h>
65#include <sys/malloc.h>
66#include <sys/mbuf.h>
67#include <sys/module.h>
68#include <sys/proc.h>
69#include <sys/queue.h>
70#include <sys/lock.h>
71#include <sys/smp.h>
72#include <sys/socket.h>
73#include <sys/sockio.h>
74#include <sys/sx.h>
75#include <sys/sysctl.h>
76#include <sys/systm.h>
77#include <sys/taskqueue.h>
78#include <sys/buf_ring.h>
79
80#include <machine/atomic.h>
81#include <machine/in_cksum.h>
82
83#include <net/bpf.h>
84#include <net/ethernet.h>
85#include <net/if.h>
86#include <net/if_arp.h>
87#include <net/if_media.h>
88#include <net/if_types.h>
89#include <net/if_var.h>
90#include <net/if_vlan_var.h>
91#include <net/rndis.h>
92
93#include <netinet/in_systm.h>
94#include <netinet/in.h>
95#include <netinet/ip.h>
96#include <netinet/ip6.h>
97#include <netinet/tcp.h>
98#include <netinet/tcp_lro.h>
99#include <netinet/udp.h>
100
101#include <dev/hyperv/include/hyperv.h>
102#include <dev/hyperv/include/hyperv_busdma.h>
103#include <dev/hyperv/include/vmbus.h>
104#include <dev/hyperv/include/vmbus_xact.h>
105
106#include <dev/hyperv/netvsc/ndis.h>
107#include <dev/hyperv/netvsc/if_hnreg.h>
108#include <dev/hyperv/netvsc/if_hnvar.h>
109#include <dev/hyperv/netvsc/hn_nvs.h>
110#include <dev/hyperv/netvsc/hn_rndis.h>
111
112#include "vmbus_if.h"
113
114#define HN_RING_CNT_DEF_MAX		8
115
116/* YYY should get it from the underlying channel */
117#define HN_TX_DESC_CNT			512
118
119#define HN_RNDIS_PKT_LEN					\
120	(sizeof(struct rndis_packet_msg) +			\
121	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
122	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
123	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
124	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
125#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
126#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
127
128#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
129#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
130#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
131/* -1 for RNDIS packet message */
132#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
133
134#define HN_DIRECT_TX_SIZE_DEF		128
135
136#define HN_EARLY_TXEOF_THRESH		8
137
138#define HN_PKTBUF_LEN_DEF		(16 * 1024)
139
140#define HN_LROENT_CNT_DEF		128
141
142#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
143#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
144/* YYY 2*MTU is a bit rough, but should be good enough. */
145#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
146
147#define HN_LRO_ACKCNT_DEF		1
148
149#define HN_LOCK_INIT(sc)		\
150	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
151#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
152#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
153#define HN_LOCK(sc)			sx_xlock(&(sc)->hn_lock)
154#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
155
156#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
157#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
158#define HN_CSUM_IP_HWASSIST(sc)		\
159	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
160#define HN_CSUM_IP6_HWASSIST(sc)	\
161	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
162
163struct hn_txdesc {
164#ifndef HN_USE_TXDESC_BUFRING
165	SLIST_ENTRY(hn_txdesc)		link;
166#endif
167	struct mbuf			*m;
168	struct hn_tx_ring		*txr;
169	int				refs;
170	uint32_t			flags;	/* HN_TXD_FLAG_ */
171	struct hn_nvs_sendctx		send_ctx;
172	uint32_t			chim_index;
173	int				chim_size;
174
175	bus_dmamap_t			data_dmap;
176
177	bus_addr_t			rndis_pkt_paddr;
178	struct rndis_packet_msg		*rndis_pkt;
179	bus_dmamap_t			rndis_pkt_dmap;
180};
181
182#define HN_TXD_FLAG_ONLIST		0x0001
183#define HN_TXD_FLAG_DMAMAP		0x0002
184
185struct hn_rxinfo {
186	uint32_t			vlan_info;
187	uint32_t			csum_info;
188	uint32_t			hash_info;
189	uint32_t			hash_value;
190};
191
192#define HN_RXINFO_VLAN			0x0001
193#define HN_RXINFO_CSUM			0x0002
194#define HN_RXINFO_HASHINF		0x0004
195#define HN_RXINFO_HASHVAL		0x0008
196#define HN_RXINFO_ALL			\
197	(HN_RXINFO_VLAN |		\
198	 HN_RXINFO_CSUM |		\
199	 HN_RXINFO_HASHINF |		\
200	 HN_RXINFO_HASHVAL)
201
202#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
203#define HN_NDIS_RXCSUM_INFO_INVALID	0
204#define HN_NDIS_HASH_INFO_INVALID	0
205
206static int			hn_probe(device_t);
207static int			hn_attach(device_t);
208static int			hn_detach(device_t);
209static int			hn_shutdown(device_t);
210static void			hn_chan_callback(struct vmbus_channel *,
211				    void *);
212
213static void			hn_init(void *);
214static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
215static void			hn_start(struct ifnet *);
216static int			hn_transmit(struct ifnet *, struct mbuf *);
217static void			hn_xmit_qflush(struct ifnet *);
218static int			hn_ifmedia_upd(struct ifnet *);
219static void			hn_ifmedia_sts(struct ifnet *,
220				    struct ifmediareq *);
221
222static int			hn_rndis_rxinfo(const void *, int,
223				    struct hn_rxinfo *);
224static void			hn_rndis_rx_data(struct hn_rx_ring *,
225				    const void *, int);
226static void			hn_rndis_rx_status(struct hn_softc *,
227				    const void *, int);
228
229static void			hn_nvs_handle_notify(struct hn_softc *,
230				    const struct vmbus_chanpkt_hdr *);
231static void			hn_nvs_handle_comp(struct hn_softc *,
232				    struct vmbus_channel *,
233				    const struct vmbus_chanpkt_hdr *);
234static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
235				    struct vmbus_channel *,
236				    const struct vmbus_chanpkt_hdr *);
237static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
238				    struct vmbus_channel *, uint64_t);
239
240#if __FreeBSD_version >= 1100099
241static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
242static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
243#endif
244static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
245static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
246#if __FreeBSD_version < 1100095
247static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
248#else
249static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
250#endif
251static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
252static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
253static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
254static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
255static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
256static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
257static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
258static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
259static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
260static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
261
262static void			hn_stop(struct hn_softc *);
263static void			hn_init_locked(struct hn_softc *);
264static int			hn_chan_attach(struct hn_softc *,
265				    struct vmbus_channel *);
266static void			hn_chan_detach(struct hn_softc *,
267				    struct vmbus_channel *);
268static int			hn_attach_subchans(struct hn_softc *);
269static void			hn_detach_allchans(struct hn_softc *);
270static void			hn_chan_rollup(struct hn_rx_ring *,
271				    struct hn_tx_ring *);
272static void			hn_set_ring_inuse(struct hn_softc *, int);
273static int			hn_synth_attach(struct hn_softc *, int);
274static void			hn_synth_detach(struct hn_softc *);
275static int			hn_synth_alloc_subchans(struct hn_softc *,
276				    int *);
277static void			hn_suspend(struct hn_softc *);
278static void			hn_suspend_data(struct hn_softc *);
279static void			hn_suspend_mgmt(struct hn_softc *);
280static void			hn_resume(struct hn_softc *);
281static void			hn_resume_data(struct hn_softc *);
282static void			hn_resume_mgmt(struct hn_softc *);
283static void			hn_suspend_mgmt_taskfunc(void *, int);
284static void			hn_chan_drain(struct vmbus_channel *);
285
286static void			hn_update_link_status(struct hn_softc *);
287static void			hn_change_network(struct hn_softc *);
288static void			hn_link_taskfunc(void *, int);
289static void			hn_netchg_init_taskfunc(void *, int);
290static void			hn_netchg_status_taskfunc(void *, int);
291static void			hn_link_status(struct hn_softc *);
292
293static int			hn_create_rx_data(struct hn_softc *, int);
294static void			hn_destroy_rx_data(struct hn_softc *);
295static int			hn_check_iplen(const struct mbuf *, int);
296static int			hn_set_rxfilter(struct hn_softc *);
297static int			hn_rss_reconfig(struct hn_softc *);
298static void			hn_rss_ind_fixup(struct hn_softc *, int);
299static int			hn_rxpkt(struct hn_rx_ring *, const void *,
300				    int, const struct hn_rxinfo *);
301
302static int			hn_tx_ring_create(struct hn_softc *, int);
303static void			hn_tx_ring_destroy(struct hn_tx_ring *);
304static int			hn_create_tx_data(struct hn_softc *, int);
305static void			hn_fixup_tx_data(struct hn_softc *);
306static void			hn_destroy_tx_data(struct hn_softc *);
307static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
308static int			hn_encap(struct hn_tx_ring *,
309				    struct hn_txdesc *, struct mbuf **);
310static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
311				    struct hn_txdesc *);
312static void			hn_set_chim_size(struct hn_softc *, int);
313static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
314static bool			hn_tx_ring_pending(struct hn_tx_ring *);
315static void			hn_tx_ring_qflush(struct hn_tx_ring *);
316static void			hn_resume_tx(struct hn_softc *, int);
317static int			hn_get_txswq_depth(const struct hn_tx_ring *);
318static void			hn_txpkt_done(struct hn_nvs_sendctx *,
319				    struct hn_softc *, struct vmbus_channel *,
320				    const void *, int);
321static int			hn_txpkt_sglist(struct hn_tx_ring *,
322				    struct hn_txdesc *);
323static int			hn_txpkt_chim(struct hn_tx_ring *,
324				    struct hn_txdesc *);
325static int			hn_xmit(struct hn_tx_ring *, int);
326static void			hn_xmit_taskfunc(void *, int);
327static void			hn_xmit_txeof(struct hn_tx_ring *);
328static void			hn_xmit_txeof_taskfunc(void *, int);
329static int			hn_start_locked(struct hn_tx_ring *, int);
330static void			hn_start_taskfunc(void *, int);
331static void			hn_start_txeof(struct hn_tx_ring *);
332static void			hn_start_txeof_taskfunc(void *, int);
333
334SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
335    "Hyper-V network interface");
336
337/* Trust TCP segment verification on the host side. */
338static int			hn_trust_hosttcp = 1;
339SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
340    &hn_trust_hosttcp, 0,
341    "Trust tcp segement verification on host side, "
342    "when csum info is missing (global setting)");
343
344/* Trust UDP datagram verification on the host side. */
345static int			hn_trust_hostudp = 1;
346SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
347    &hn_trust_hostudp, 0,
348    "Trust udp datagram verification on host side, "
349    "when csum info is missing (global setting)");
350
351/* Trust IP packet verification on the host side. */
352static int			hn_trust_hostip = 1;
353SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
354    &hn_trust_hostip, 0,
355    "Trust ip packet verification on host side, "
356    "when csum info is missing (global setting)");
357
358/* Limit TSO burst size */
359static int			hn_tso_maxlen = IP_MAXPACKET;
360SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
361    &hn_tso_maxlen, 0, "TSO burst limit");
362
363/* Limit chimney send size */
364static int			hn_tx_chimney_size = 0;
365SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
366    &hn_tx_chimney_size, 0, "Chimney send packet size limit");
367
368/* Limit the size of packet for direct transmission */
369static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
370SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
371    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
372
373/* # of LRO entries per RX ring */
374#if defined(INET) || defined(INET6)
375#if __FreeBSD_version >= 1100095
376static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
377SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
378    &hn_lro_entry_count, 0, "LRO entry count");
379#endif
380#endif
381
382/* Use shared TX taskqueue */
383static int			hn_share_tx_taskq = 0;
384SYSCTL_INT(_hw_hn, OID_AUTO, share_tx_taskq, CTLFLAG_RDTUN,
385    &hn_share_tx_taskq, 0, "Enable shared TX taskqueue");
386
387#ifndef HN_USE_TXDESC_BUFRING
388static int			hn_use_txdesc_bufring = 0;
389#else
390static int			hn_use_txdesc_bufring = 1;
391#endif
392SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
393    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
394
395/* Bind TX taskqueue to the target CPU */
396static int			hn_bind_tx_taskq = -1;
397SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN,
398    &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu");
399
400/* Use ifnet.if_start instead of ifnet.if_transmit */
401static int			hn_use_if_start = 0;
402SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
403    &hn_use_if_start, 0, "Use if_start TX method");
404
405/* # of channels to use */
406static int			hn_chan_cnt = 0;
407SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
408    &hn_chan_cnt, 0,
409    "# of channels to use; each channel has one RX ring and one TX ring");
410
411/* # of transmit rings to use */
412static int			hn_tx_ring_cnt = 0;
413SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
414    &hn_tx_ring_cnt, 0, "# of TX rings to use");
415
416/* Software TX ring depth */
417static int			hn_tx_swq_depth = 0;
418SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
419    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
420
421/* Enable sorted LRO, and set the depth of the per-channel mbuf queue */
422#if __FreeBSD_version >= 1100095
423static u_int			hn_lro_mbufq_depth = 0;
424SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
425    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
426#endif
427
428static u_int			hn_cpu_index;	/* next CPU for channel */
429static struct taskqueue		*hn_tx_taskq;	/* shared TX taskqueue */
430
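/* Default Toeplitz hash key used for RSS. */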
431static const uint8_t
432hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
433	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
434	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
435	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
436	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
437	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
438};
439
440static device_method_t hn_methods[] = {
441	/* Device interface */
442	DEVMETHOD(device_probe,		hn_probe),
443	DEVMETHOD(device_attach,	hn_attach),
444	DEVMETHOD(device_detach,	hn_detach),
445	DEVMETHOD(device_shutdown,	hn_shutdown),
446	DEVMETHOD_END
447};
448
449static driver_t hn_driver = {
450	"hn",
451	hn_methods,
452	sizeof(struct hn_softc)
453};
454
455static devclass_t hn_devclass;
456
457DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
458MODULE_VERSION(hn, 1);
459MODULE_DEPEND(hn, vmbus, 1, 1, 1);
460
461#if __FreeBSD_version >= 1100099
462static void
463hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
464{
465	int i;
466
467	for (i = 0; i < sc->hn_rx_ring_inuse; ++i)
468		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
469}
470#endif
471
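/*
 * Send an RNDIS data packet using the scatter/gather list already
 * built in txr->hn_gpa; used when the packet was not copied into a
 * chimney sending buffer (chim_index is invalid, chim_size is 0).
 */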
472static int
473hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
474{
475
476	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
477	    txd->chim_size == 0, ("invalid rndis sglist txd"));
478	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
479	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
480}
481
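/*
 * Send a packet that has already been copied into a chimney sending
 * buffer: only a small NVS RNDIS message referencing the chimney
 * slot and its size goes over the channel.
 */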
482static int
483hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
484{
485	struct hn_nvs_rndis rndis;
486
487	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
488	    txd->chim_size > 0, ("invalid rndis chim txd"));
489
490	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
491	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
492	rndis.nvs_chim_idx = txd->chim_index;
493	rndis.nvs_chim_sz = txd->chim_size;
494
495	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
496	    &rndis, sizeof(rndis), &txd->send_ctx));
497}
498
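/*
 * Allocate a chimney sending buffer slot.  Scan the chimney bitmap
 * for a clear bit and claim it with an atomic test-and-set, so no
 * lock is needed.  Returns HN_NVS_CHIM_IDX_INVALID if all slots are
 * currently in use.
 */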
499static __inline uint32_t
500hn_chim_alloc(struct hn_softc *sc)
501{
502	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
503	u_long *bmap = sc->hn_chim_bmap;
504	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
505
506	for (i = 0; i < bmap_cnt; ++i) {
507		int idx;
508
509		idx = ffsl(~bmap[i]);
510		if (idx == 0)
511			continue;
512
513		--idx; /* ffsl is 1-based */
514		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
515		    ("invalid i %d and idx %d", i, idx));
516
517		if (atomic_testandset_long(&bmap[i], idx))
518			continue;
519
520		ret = i * LONG_BIT + idx;
521		break;
522	}
523	return (ret);
524}
525
526static __inline void
527hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
528{
529	u_long mask;
530	uint32_t idx;
531
532	idx = chim_idx / LONG_BIT;
533	KASSERT(idx < sc->hn_chim_bmap_cnt,
534	    ("invalid chimney index 0x%x", chim_idx));
535
536	mask = 1UL << (chim_idx % LONG_BIT);
537	KASSERT(sc->hn_chim_bmap[idx] & mask,
538	    ("index bitmap 0x%lx, chimney index %u, "
539	     "bitmap idx %d, bitmask 0x%lx",
540	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
541
542	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
543}
544
545static int
546hn_set_rxfilter(struct hn_softc *sc)
547{
548	struct ifnet *ifp = sc->hn_ifp;
549	uint32_t filter;
550	int error = 0;
551
552	HN_LOCK_ASSERT(sc);
553
554	if (ifp->if_flags & IFF_PROMISC) {
555		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
556	} else {
557		filter = NDIS_PACKET_TYPE_DIRECTED;
558		if (ifp->if_flags & IFF_BROADCAST)
559			filter |= NDIS_PACKET_TYPE_BROADCAST;
560#ifdef notyet
561		/*
562		 * See the comment in SIOCADDMULTI/SIOCDELMULTI.
563		 */
564		/* TODO: support multicast list */
565		if ((ifp->if_flags & IFF_ALLMULTI) ||
566		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
567			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
568#else
569		/* Always enable ALLMULTI */
570		filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
571#endif
572	}
573
574	if (sc->hn_rx_filter != filter) {
575		error = hn_rndis_set_rxfilter(sc, filter);
576		if (!error)
577			sc->hn_rx_filter = filter;
578	}
579	return (error);
580}
581
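/*
 * Depth of the software TX queue (IFQ or buf_ring): the tunable
 * hw.hn.tx_swq_depth, but never less than the number of TX
 * descriptors on the ring.
 */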
582static int
583hn_get_txswq_depth(const struct hn_tx_ring *txr)
584{
585
586	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
587	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
588		return txr->hn_txdesc_cnt;
589	return hn_tx_swq_depth;
590}
591
592static int
593hn_rss_reconfig(struct hn_softc *sc)
594{
595	int error;
596
597	HN_LOCK_ASSERT(sc);
598
599	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
600		return (ENXIO);
601
602	/*
603	 * Disable RSS first.
604	 *
605	 * NOTE:
606	 * Direct reconfiguration by setting the UNCHG flags does
607	 * _not_ work properly.
608	 */
609	if (bootverbose)
610		if_printf(sc->hn_ifp, "disable RSS\n");
611	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
612	if (error) {
613		if_printf(sc->hn_ifp, "RSS disable failed\n");
614		return (error);
615	}
616
617	/*
618	 * Reenable the RSS w/ the updated RSS key or indirect
619	 * table.
620	 */
621	if (bootverbose)
622		if_printf(sc->hn_ifp, "reconfig RSS\n");
623	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
624	if (error) {
625		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
626		return (error);
627	}
628	return (0);
629}
630
631static void
632hn_rss_ind_fixup(struct hn_softc *sc, int nchan)
633{
634	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
635	int i;
636
637	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
638
639	/*
640	 * Check indirect table to make sure that all channels in it
641	 * can be used.
642	 */
643	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
644		if (rss->rss_ind[i] >= nchan) {
645			if_printf(sc->hn_ifp,
646			    "RSS indirect table %d fixup: %u -> %d\n",
647			    i, rss->rss_ind[i], nchan - 1);
648			rss->rss_ind[i] = nchan - 1;
649		}
650	}
651}
652
653static int
654hn_ifmedia_upd(struct ifnet *ifp __unused)
655{
656
657	return EOPNOTSUPP;
658}
659
660static void
661hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
662{
663	struct hn_softc *sc = ifp->if_softc;
664
665	ifmr->ifm_status = IFM_AVALID;
666	ifmr->ifm_active = IFM_ETHER;
667
668	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
669		ifmr->ifm_active |= IFM_NONE;
670		return;
671	}
672	ifmr->ifm_status |= IFM_ACTIVE;
673	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
674}
675
676/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
677static const struct hyperv_guid g_net_vsc_device_type = {
678	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
679		0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
680};
681
682static int
683hn_probe(device_t dev)
684{
685
686	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
687	    &g_net_vsc_device_type) == 0) {
688		device_set_desc(dev, "Hyper-V Network Interface");
689		return BUS_PROBE_DEFAULT;
690	}
691	return ENXIO;
692}
693
694static void
695hn_cpuset_setthread_task(void *xmask, int pending __unused)
696{
697	cpuset_t *mask = xmask;
698	int error;
699
700	error = cpuset_setthread(curthread->td_tid, mask);
701	if (error) {
702		panic("curthread=%ju: can't pin; error=%d",
703		    (uintmax_t)curthread->td_tid, error);
704	}
705}
706
707static int
708hn_attach(device_t dev)
709{
710	struct hn_softc *sc = device_get_softc(dev);
711	struct sysctl_oid_list *child;
712	struct sysctl_ctx_list *ctx;
713	uint8_t eaddr[ETHER_ADDR_LEN];
714	struct ifnet *ifp = NULL;
715	int error, ring_cnt, tx_ring_cnt;
716
717	sc->hn_dev = dev;
718	sc->hn_prichan = vmbus_get_channel(dev);
719	HN_LOCK_INIT(sc);
720
721	/*
722	 * Set up the taskqueue for transmission.
723	 */
724	if (hn_tx_taskq == NULL) {
725		sc->hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
726		    taskqueue_thread_enqueue, &sc->hn_tx_taskq);
727		taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET, "%s tx",
728		    device_get_nameunit(dev));
729		if (hn_bind_tx_taskq >= 0) {
730			int cpu = hn_bind_tx_taskq;
731			struct task cpuset_task;
732			cpuset_t cpu_set;
733
734			if (cpu > mp_ncpus - 1)
735				cpu = mp_ncpus - 1;
736			CPU_SETOF(cpu, &cpu_set);
737			TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task,
738			    &cpu_set);
739			taskqueue_enqueue(sc->hn_tx_taskq, &cpuset_task);
740			taskqueue_drain(sc->hn_tx_taskq, &cpuset_task);
741		}
742	} else {
743		sc->hn_tx_taskq = hn_tx_taskq;
744	}
745
746	/*
746	 * Set up the taskqueue for management tasks, e.g. link status.
748	 */
749	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
750	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
751	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
752	    device_get_nameunit(dev));
753	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
754	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
755	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
756	    hn_netchg_status_taskfunc, sc);
757
758	/*
759	 * Allocate the ifnet and set up its name early, so that if_printf
760	 * can be used by functions which will be called after
761	 * ether_ifattach().
762	 */
763	ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
764	ifp->if_softc = sc;
765	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
766
767	/*
768	 * Initialize ifmedia early so that it can be unconditionally
769	 * destroyed if an error happens later on.
770	 */
771	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
772
773	/*
774	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
775	 * to use (tx_ring_cnt).
776	 *
777	 * NOTE:
778	 * The # of RX rings to use is same as the # of channels to use.
779	 */
780	ring_cnt = hn_chan_cnt;
781	if (ring_cnt <= 0) {
782		/* Default */
783		ring_cnt = mp_ncpus;
784		if (ring_cnt > HN_RING_CNT_DEF_MAX)
785			ring_cnt = HN_RING_CNT_DEF_MAX;
786	} else if (ring_cnt > mp_ncpus) {
787		ring_cnt = mp_ncpus;
788	}
789
790	tx_ring_cnt = hn_tx_ring_cnt;
791	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
792		tx_ring_cnt = ring_cnt;
793	if (hn_use_if_start) {
794		/* ifnet.if_start only needs one TX ring. */
795		tx_ring_cnt = 1;
796	}
797
798	/*
799	 * Set the leader CPU for channels.
800	 */
801	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
802
803	/*
804	 * Create enough TX/RX rings, even if only a limited number of
805	 * channels can be allocated.
806	 */
807	error = hn_create_tx_data(sc, tx_ring_cnt);
808	if (error)
809		goto failed;
810	error = hn_create_rx_data(sc, ring_cnt);
811	if (error)
812		goto failed;
813
814	/*
815	 * Create transaction context for NVS and RNDIS transactions.
816	 */
817	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
818	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
819	if (sc->hn_xact == NULL)
820		goto failed;
821
822	/*
823	 * Attach the synthetic parts, i.e. NVS and RNDIS.
824	 */
825	error = hn_synth_attach(sc, ETHERMTU);
826	if (error)
827		goto failed;
828
829	error = hn_rndis_get_eaddr(sc, eaddr);
830	if (error)
831		goto failed;
832
833#if __FreeBSD_version >= 1100099
834	if (sc->hn_rx_ring_inuse > 1) {
835		/*
836		 * Reduce TCP segment aggregation limit for multiple
837		 * RX rings to increase ACK timeliness.
838		 */
839		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
840	}
841#endif
842
843	/*
844	 * Fix up TX settings after the synthetic parts are attached.
845	 */
846	hn_fixup_tx_data(sc);
847
848	ctx = device_get_sysctl_ctx(dev);
849	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
850	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
851	    &sc->hn_nvs_ver, 0, "NVS version");
852	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
853	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
854	    hn_ndis_version_sysctl, "A", "NDIS version");
855	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
856	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
857	    hn_caps_sysctl, "A", "capabilities");
858	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
859	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
860	    hn_hwassist_sysctl, "A", "hwassist");
861	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
862	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
863	    hn_rxfilter_sysctl, "A", "rxfilter");
864	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
865	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
866	    hn_rss_hash_sysctl, "A", "RSS hash");
867	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
868	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
869	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
870	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
871	    hn_rss_key_sysctl, "IU", "RSS key");
872	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
873	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
874	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
875
876	/*
877	 * Setup the ifmedia, which has been initialized earlier.
878	 */
879	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
880	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
881	/* XXX ifmedia_set really should do this for us */
882	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
883
884	/*
885	 * Setup the ifnet for this interface.
886	 */
887
888#ifdef __LP64__
889	ifp->if_baudrate = IF_Gbps(10);
890#else
891	/* if_baudrate is 32 bits on 32-bit systems. */
892	ifp->if_baudrate = IF_Gbps(1);
893#endif
894	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
895	ifp->if_ioctl = hn_ioctl;
896	ifp->if_init = hn_init;
897	if (hn_use_if_start) {
898		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
899
900		ifp->if_start = hn_start;
901		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
902		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
903		IFQ_SET_READY(&ifp->if_snd);
904	} else {
905		ifp->if_transmit = hn_transmit;
906		ifp->if_qflush = hn_xmit_qflush;
907	}
908
909	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
910#ifdef foo
911	/* We can't differentiate IPv6 packets from IPv4 packets on the RX path. */
912	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
913#endif
914	if (sc->hn_caps & HN_CAP_VLAN) {
915		/* XXX not sure about VLAN_MTU. */
916		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
917	}
918
919	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
920	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
921		ifp->if_capabilities |= IFCAP_TXCSUM;
922	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
923		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
924	if (sc->hn_caps & HN_CAP_TSO4) {
925		ifp->if_capabilities |= IFCAP_TSO4;
926		ifp->if_hwassist |= CSUM_IP_TSO;
927	}
928	if (sc->hn_caps & HN_CAP_TSO6) {
929		ifp->if_capabilities |= IFCAP_TSO6;
930		ifp->if_hwassist |= CSUM_IP6_TSO;
931	}
932
933	/* Enable all available capabilities by default. */
934	ifp->if_capenable = ifp->if_capabilities;
935
936	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
937		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
938		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
939		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
940	}
941
942	ether_ifattach(ifp, eaddr);
943
944	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
945		if_printf(ifp, "TSO segcnt %u segsz %u\n",
946		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
947	}
948
949	/* Inform the upper layer about the long frame support. */
950	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
951
952	/*
953	 * Kick off link status check.
954	 */
955	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
956	hn_update_link_status(sc);
957
958	return (0);
959failed:
960	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
961		hn_synth_detach(sc);
962	hn_detach(dev);
963	return (error);
964}
965
966static int
967hn_detach(device_t dev)
968{
969	struct hn_softc *sc = device_get_softc(dev);
970	struct ifnet *ifp = sc->hn_ifp;
971
972	if (device_is_attached(dev)) {
973		HN_LOCK(sc);
974		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
975			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
976				hn_stop(sc);
977			/*
978			 * NOTE:
979			 * hn_stop() only suspends data, so management
980			 * tasks have to be suspended manually here.
981			 */
982			hn_suspend_mgmt(sc);
983			hn_synth_detach(sc);
984		}
985		HN_UNLOCK(sc);
986		ether_ifdetach(ifp);
987	}
988
989	ifmedia_removeall(&sc->hn_media);
990	hn_destroy_rx_data(sc);
991	hn_destroy_tx_data(sc);
992
993	if (sc->hn_tx_taskq != hn_tx_taskq)
994		taskqueue_free(sc->hn_tx_taskq);
995	taskqueue_free(sc->hn_mgmt_taskq0);
996
997	if (sc->hn_xact != NULL)
998		vmbus_xact_ctx_destroy(sc->hn_xact);
999
1000	if_free(ifp);
1001
1002	HN_LOCK_DESTROY(sc);
1003	return (0);
1004}
1005
1006static int
1007hn_shutdown(device_t dev)
1008{
1009
1010	return (0);
1011}
1012
1013static void
1014hn_link_status(struct hn_softc *sc)
1015{
1016	uint32_t link_status;
1017	int error;
1018
1019	error = hn_rndis_get_linkstatus(sc, &link_status);
1020	if (error) {
1021		/* XXX what to do? */
1022		return;
1023	}
1024
1025	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
1026		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
1027	else
1028		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1029	if_link_state_change(sc->hn_ifp,
1030	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
1031	    LINK_STATE_UP : LINK_STATE_DOWN);
1032}
1033
1034static void
1035hn_link_taskfunc(void *xsc, int pending __unused)
1036{
1037	struct hn_softc *sc = xsc;
1038
1039	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
1040		return;
1041	hn_link_status(sc);
1042}
1043
1044static void
1045hn_netchg_init_taskfunc(void *xsc, int pending __unused)
1046{
1047	struct hn_softc *sc = xsc;
1048
1049	/* Prevent any link status checks from running. */
1050	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
1051
1052	/*
1053	 * Fake up a [link down --> link up] state change; a 5 second
1054	 * delay is used, which closely simulates the miibus reaction
1055	 * to a link down event.
1056	 */
1057	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1058	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
1059	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
1060	    &sc->hn_netchg_status, 5 * hz);
1061}
1062
1063static void
1064hn_netchg_status_taskfunc(void *xsc, int pending __unused)
1065{
1066	struct hn_softc *sc = xsc;
1067
1068	/* Re-allow link status checks. */
1069	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
1070	hn_link_status(sc);
1071}
1072
1073static void
1074hn_update_link_status(struct hn_softc *sc)
1075{
1076
1077	if (sc->hn_mgmt_taskq != NULL)
1078		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
1079}
1080
1081static void
1082hn_change_network(struct hn_softc *sc)
1083{
1084
1085	if (sc->hn_mgmt_taskq != NULL)
1086		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
1087}
1088
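/*
 * Load the mbuf chain into the descriptor's DMA map.  On EFBIG
 * (too many segments) collapse the chain down to
 * HN_TX_DATA_SEGCNT_MAX segments and retry once.
 */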
1089static __inline int
1090hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
1091    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
1092{
1093	struct mbuf *m = *m_head;
1094	int error;
1095
1096	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
1097
1098	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
1099	    m, segs, nsegs, BUS_DMA_NOWAIT);
1100	if (error == EFBIG) {
1101		struct mbuf *m_new;
1102
1103		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
1104		if (m_new == NULL)
1105			return ENOBUFS;
1106		else
1107			*m_head = m = m_new;
1108		txr->hn_tx_collapsed++;
1109
1110		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
1111		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
1112	}
1113	if (!error) {
1114		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
1115		    BUS_DMASYNC_PREWRITE);
1116		txd->flags |= HN_TXD_FLAG_DMAMAP;
1117	}
1118	return error;
1119}
1120
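/*
 * Drop one reference on the TX descriptor.  When the last reference
 * goes away, free the chimney slot or unload the DMA map, free the
 * attached mbuf, and return the descriptor to the free list (or
 * buf_ring).  Returns 1 if the descriptor was actually freed.
 */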
1121static __inline int
1122hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
1123{
1124
1125	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
1126	    ("put an onlist txd %#x", txd->flags));
1127
1128	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1129	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
1130		return 0;
1131
1132	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
1133		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1134		    ("chim txd uses dmamap"));
1135		hn_chim_free(txr->hn_sc, txd->chim_index);
1136		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1137	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
1138		bus_dmamap_sync(txr->hn_tx_data_dtag,
1139		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
1140		bus_dmamap_unload(txr->hn_tx_data_dtag,
1141		    txd->data_dmap);
1142		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
1143	}
1144
1145	if (txd->m != NULL) {
1146		m_freem(txd->m);
1147		txd->m = NULL;
1148	}
1149
1150	txd->flags |= HN_TXD_FLAG_ONLIST;
1151#ifndef HN_USE_TXDESC_BUFRING
1152	mtx_lock_spin(&txr->hn_txlist_spin);
1153	KASSERT(txr->hn_txdesc_avail >= 0 &&
1154	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
1155	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
1156	txr->hn_txdesc_avail++;
1157	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
1158	mtx_unlock_spin(&txr->hn_txlist_spin);
1159#else
1160	atomic_add_int(&txr->hn_txdesc_avail, 1);
1161	buf_ring_enqueue(txr->hn_txdesc_br, txd);
1162#endif
1163
1164	return 1;
1165}
1166
1167static __inline struct hn_txdesc *
1168hn_txdesc_get(struct hn_tx_ring *txr)
1169{
1170	struct hn_txdesc *txd;
1171
1172#ifndef HN_USE_TXDESC_BUFRING
1173	mtx_lock_spin(&txr->hn_txlist_spin);
1174	txd = SLIST_FIRST(&txr->hn_txlist);
1175	if (txd != NULL) {
1176		KASSERT(txr->hn_txdesc_avail > 0,
1177		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
1178		txr->hn_txdesc_avail--;
1179		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
1180	}
1181	mtx_unlock_spin(&txr->hn_txlist_spin);
1182#else
1183	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
1184#endif
1185
1186	if (txd != NULL) {
1187#ifdef HN_USE_TXDESC_BUFRING
1188		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
1189#endif
1190		KASSERT(txd->m == NULL && txd->refs == 0 &&
1191		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
1192		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
1193		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
1194		txd->flags &= ~HN_TXD_FLAG_ONLIST;
1195		txd->refs = 1;
1196	}
1197	return txd;
1198}
1199
1200static __inline void
1201hn_txdesc_hold(struct hn_txdesc *txd)
1202{
1203
1204	/* 0->1 transition will never work */
1205	KASSERT(txd->refs > 0, ("invalid refs %d", txd->refs));
1206	atomic_add_int(&txd->refs, 1);
1207}
1208
1209static bool
1210hn_tx_ring_pending(struct hn_tx_ring *txr)
1211{
1212	bool pending = false;
1213
1214#ifndef HN_USE_TXDESC_BUFRING
1215	mtx_lock_spin(&txr->hn_txlist_spin);
1216	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
1217		pending = true;
1218	mtx_unlock_spin(&txr->hn_txlist_spin);
1219#else
1220	if (!buf_ring_full(txr->hn_txdesc_br))
1221		pending = true;
1222#endif
1223	return (pending);
1224}
1225
1226static __inline void
1227hn_txeof(struct hn_tx_ring *txr)
1228{
1229	txr->hn_has_txeof = 0;
1230	txr->hn_txeof(txr);
1231}
1232
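/*
 * NVS send-completion callback for a transmitted packet: drop the
 * send reference on the TX descriptor and, after enough completions
 * have accumulated while the ring is oactive, run txeof early to
 * resume transmission.
 */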
1233static void
1234hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
1235    struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
1236{
1237	struct hn_txdesc *txd = sndc->hn_cbarg;
1238	struct hn_tx_ring *txr;
1239
1240	txr = txd->txr;
1241	KASSERT(txr->hn_chan == chan,
1242	    ("channel mismatch, on chan%u, should be chan%u",
1243	     vmbus_chan_subidx(chan), vmbus_chan_subidx(txr->hn_chan)));
1244
1245	txr->hn_has_txeof = 1;
1246	hn_txdesc_put(txr, txd);
1247
1248	++txr->hn_txdone_cnt;
1249	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
1250		txr->hn_txdone_cnt = 0;
1251		if (txr->hn_oactive)
1252			hn_txeof(txr);
1253	}
1254}
1255
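/*
 * Channel rollup, run after a batch of channel events: flush any
 * pending LRO aggregation on the RX ring and, if the paired TX ring
 * saw send completions, run its txeof processing.
 */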
1256static void
1257hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
1258{
1259#if defined(INET) || defined(INET6)
1260	struct lro_ctrl *lro = &rxr->hn_lro;
1261	struct lro_entry *queued;
1262
1263	while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
1264		SLIST_REMOVE_HEAD(&lro->lro_active, next);
1265		tcp_lro_flush(lro, queued);
1266	}
1267#endif
1268
1269	/*
1270	 * NOTE:
1271	 * 'txr' could be NULL if multiple channels and the
1272	 * ifnet.if_start method are enabled.
1273	 */
1274	if (txr == NULL || !txr->hn_has_txeof)
1275		return;
1276
1277	txr->hn_txdone_cnt = 0;
1278	hn_txeof(txr);
1279}
1280
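/*
 * Convert an offset counted from the beginning of the RNDIS packet
 * message into the on-wire form, which counts from the rm_dataoffset
 * field.
 */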
1281static __inline uint32_t
1282hn_rndis_pktmsg_offset(uint32_t ofs)
1283{
1284
1285	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
1286	    ("invalid RNDIS packet msg offset %u", ofs));
1287	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
1288}
1289
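/*
 * Append a per-packet-info record to the RNDIS packet message being
 * built.  The pktinfo area only grows at its tail, so the data
 * offset and total message length are bumped by the record size.
 * Returns a pointer to the record's data area for the caller to fill.
 */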
1290static __inline void *
1291hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
1292    size_t pi_dlen, uint32_t pi_type)
1293{
1294	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
1295	struct rndis_pktinfo *pi;
1296
1297	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
1298	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
1299
1300	/*
1301	 * Per-packet-info does not move; it only grows.
1302	 *
1303	 * NOTE:
1304	 * rm_pktinfooffset in this phase counts from the beginning
1305	 * of rndis_packet_msg.
1306	 */
1307	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
1308	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
1309	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
1310	    pkt->rm_pktinfolen);
1311	pkt->rm_pktinfolen += pi_size;
1312
1313	pi->rm_size = pi_size;
1314	pi->rm_type = pi_type;
1315	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
1316
1317	/* Data immediately follow per-packet-info. */
1318	pkt->rm_dataoffset += pi_size;
1319
1320	/* Update RNDIS packet msg length */
1321	pkt->rm_len += pi_size;
1322
1323	return (pi->rm_data);
1324}
1325
1326/*
1327 * NOTE:
1328 * If this function fails, then both txd and m_head0 will be freed.
1329 */
1330static int
1331hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0)
1332{
1333	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
1334	int error, nsegs, i;
1335	struct mbuf *m_head = *m_head0;
1336	struct rndis_packet_msg *pkt;
1337	uint32_t *pi_data;
1338	int pktlen;
1339
1340	/*
1341	 * Build the RNDIS packet message in the per-descriptor
1342	 * buffer reserved for it; its length and offsets are
1343	 * updated as per-packet-info records are appended below.
1345	 */
1346	pkt = txd->rndis_pkt;
1347	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
1348	pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
1349	pkt->rm_dataoffset = sizeof(*pkt);
1350	pkt->rm_datalen = m_head->m_pkthdr.len;
1351	pkt->rm_pktinfooffset = sizeof(*pkt);
1352	pkt->rm_pktinfolen = 0;
1353
1354	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
1355		/*
1356		 * Set the hash value for this packet, so that the host could
1357		 * dispatch the TX done event for this packet back to this TX
1358		 * ring's channel.
1359		 */
1360		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1361		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
1362		*pi_data = txr->hn_tx_idx;
1363	}
1364
1365	if (m_head->m_flags & M_VLANTAG) {
1366		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1367		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
1368		*pi_data = NDIS_VLAN_INFO_MAKE(
1369		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
1370		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
1371		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
1372	}
1373
1374	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
1375#if defined(INET6) || defined(INET)
1376		struct ether_vlan_header *eh;
1377		int ether_len;
1378
1379		/*
1380		 * XXX need m_pullup and use mtodo
1381		 */
1382		eh = mtod(m_head, struct ether_vlan_header*);
1383		if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN))
1384			ether_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1385		else
1386			ether_len = ETHER_HDR_LEN;
1387
1388		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1389		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
1390#ifdef INET
1391		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
1392			struct ip *ip =
1393			    (struct ip *)(m_head->m_data + ether_len);
1394			unsigned long iph_len = ip->ip_hl << 2;
1395			struct tcphdr *th =
1396			    (struct tcphdr *)((caddr_t)ip + iph_len);
1397
1398			ip->ip_len = 0;
1399			ip->ip_sum = 0;
1400			th->th_sum = in_pseudo(ip->ip_src.s_addr,
1401			    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
1402			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
1403			    m_head->m_pkthdr.tso_segsz);
1404		}
1405#endif
1406#if defined(INET6) && defined(INET)
1407		else
1408#endif
1409#ifdef INET6
1410		{
1411			struct ip6_hdr *ip6 = (struct ip6_hdr *)
1412			    (m_head->m_data + ether_len);
1413			struct tcphdr *th = (struct tcphdr *)(ip6 + 1);
1414
1415			ip6->ip6_plen = 0;
1416			th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
1417			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
1418			    m_head->m_pkthdr.tso_segsz);
1419		}
1420#endif
1421#endif	/* INET6 || INET */
1422	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
1423		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1424		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
1425		if (m_head->m_pkthdr.csum_flags &
1426		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
1427			*pi_data = NDIS_TXCSUM_INFO_IPV6;
1428		} else {
1429			*pi_data = NDIS_TXCSUM_INFO_IPV4;
1430			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
1431				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
1432		}
1433
1434		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
1435			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
1436		else if (m_head->m_pkthdr.csum_flags &
1437		    (CSUM_IP_UDP | CSUM_IP6_UDP))
1438			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
1439	}
1440
1441	pktlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
1442	/* Convert RNDIS packet message offsets */
1443	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
1444	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
1445
1446	/*
1447	 * Chimney send, if the packet could fit into one chimney buffer.
1448	 */
1449	if (pkt->rm_len < txr->hn_chim_size) {
1450		txr->hn_tx_chimney_tried++;
1451		txd->chim_index = hn_chim_alloc(txr->hn_sc);
1452		if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
1453			uint8_t *dest = txr->hn_sc->hn_chim +
1454			    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
1455
1456			memcpy(dest, pkt, pktlen);
1457			dest += pktlen;
1458			m_copydata(m_head, 0, m_head->m_pkthdr.len, dest);
1459
1460			txd->chim_size = pkt->rm_len;
1461			txr->hn_gpa_cnt = 0;
1462			txr->hn_tx_chimney++;
1463			txr->hn_sendpkt = hn_txpkt_chim;
1464			goto done;
1465		}
1466	}
1467
1468	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
1469	if (error) {
1470		int freed;
1471
1472		/*
1473		 * This mbuf is not linked w/ the txd yet, so free it now.
1474		 */
1475		m_freem(m_head);
1476		*m_head0 = NULL;
1477
1478		freed = hn_txdesc_put(txr, txd);
1479		KASSERT(freed != 0,
1480		    ("fail to free txd upon txdma error"));
1481
1482		txr->hn_txdma_failed++;
1483		if_inc_counter(txr->hn_sc->hn_ifp, IFCOUNTER_OERRORS, 1);
1484		return error;
1485	}
1486	*m_head0 = m_head;
1487
1488	/* +1 RNDIS packet message */
1489	txr->hn_gpa_cnt = nsegs + 1;
1490
1491	/* send packet with page buffer */
1492	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
1493	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
1494	txr->hn_gpa[0].gpa_len = pktlen;
1495
1496	/*
1497	 * Fill the page buffers with mbuf info after the page
1498	 * buffer for RNDIS packet message.
1499	 */
1500	for (i = 0; i < nsegs; ++i) {
1501		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
1502
1503		gpa->gpa_page = atop(segs[i].ds_addr);
1504		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
1505		gpa->gpa_len = segs[i].ds_len;
1506	}
1507
1508	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1509	txd->chim_size = 0;
1510	txr->hn_sendpkt = hn_txpkt_sglist;
1511done:
1512	txd->m = m_head;
1513
1514	/* Set the completion routine */
1515	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
1516
1517	return 0;
1518}
1519
1520/*
1521 * NOTE:
1522 * If this function fails, then txd will be freed, but the mbuf
1523 * associated w/ the txd will _not_ be freed.
1524 */
1525static int
1526hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
1527{
1528	int error, send_failed = 0;
1529
1530again:
1531	/*
1532	 * Make sure that txd is not freed before ETHER_BPF_MTAP.
1533	 */
1534	hn_txdesc_hold(txd);
1535	error = txr->hn_sendpkt(txr, txd);
1536	if (!error) {
1537		ETHER_BPF_MTAP(ifp, txd->m);
1538		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
1539		if (!hn_use_if_start) {
1540			if_inc_counter(ifp, IFCOUNTER_OBYTES,
1541			    txd->m->m_pkthdr.len);
1542			if (txd->m->m_flags & M_MCAST)
1543				if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
1544		}
1545		txr->hn_pkts++;
1546	}
1547	hn_txdesc_put(txr, txd);
1548
1549	if (__predict_false(error)) {
1550		int freed;
1551
1552		/*
1553		 * This should "really rarely" happen.
1554		 *
1555		 * XXX Too many RX to be acked or too many sideband
1556		 * commands to run?  Ask netvsc_channel_rollup()
1557		 * to kick start later.
1558		 */
1559		txr->hn_has_txeof = 1;
1560		if (!send_failed) {
1561			txr->hn_send_failed++;
1562			send_failed = 1;
1563			/*
1564			 * Try sending again after setting hn_has_txeof,
1565			 * in case we missed the last
1566			 * netvsc_channel_rollup().
1567			 */
1568			goto again;
1569		}
1570		if_printf(ifp, "send failed\n");
1571
1572		/*
1573		 * Caller will perform further processing on the
1574		 * associated mbuf, so don't free it in hn_txdesc_put();
1575		 * only unload it from the DMA map in hn_txdesc_put(),
1576		 * if it was loaded.
1577		 */
1578		txd->m = NULL;
1579		freed = hn_txdesc_put(txr, txd);
1580		KASSERT(freed != 0,
1581		    ("fail to free txd upon send error"));
1582
1583		txr->hn_send_failed++;
1584	}
1585	return error;
1586}
1587
1588/*
1589 * Start a transmit of one or more packets
1590 */
1591static int
1592hn_start_locked(struct hn_tx_ring *txr, int len)
1593{
1594	struct hn_softc *sc = txr->hn_sc;
1595	struct ifnet *ifp = sc->hn_ifp;
1596
1597	KASSERT(hn_use_if_start,
1598	    ("hn_start_locked is called, when if_start is disabled"));
1599	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
1600	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
1601
1602	if (__predict_false(txr->hn_suspended))
1603		return 0;
1604
1605	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
1606	    IFF_DRV_RUNNING)
1607		return 0;
1608
1609	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
1610		struct hn_txdesc *txd;
1611		struct mbuf *m_head;
1612		int error;
1613
1614		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
1615		if (m_head == NULL)
1616			break;
1617
1618		if (len > 0 && m_head->m_pkthdr.len > len) {
1619			/*
1620			 * This send could be time consuming; let callers
1621			 * dispatch this packet send (and the sending of any
1622			 * follow-up packets) to the TX taskqueue.
1623			 */
1624			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
1625			return 1;
1626		}
1627
1628		txd = hn_txdesc_get(txr);
1629		if (txd == NULL) {
1630			txr->hn_no_txdescs++;
1631			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
1632			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1633			break;
1634		}
1635
1636		error = hn_encap(txr, txd, &m_head);
1637		if (error) {
1638			/* Both txd and m_head are freed */
1639			continue;
1640		}
1641
1642		error = hn_txpkt(ifp, txr, txd);
1643		if (__predict_false(error)) {
1644			/* txd is freed, but m_head is not */
1645			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
1646			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1647			break;
1648		}
1649	}
1650	return 0;
1651}
1652
1653/*
1654 * Append the specified data to the indicated mbuf chain.
1655 * Extend the mbuf chain if the new data does not fit in
1656 * existing space.
1657 *
1658 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
1659 * There should be an equivalent in the kernel mbuf code,
1660 * but there does not appear to be one yet.
1661 *
1662 * Differs from m_append() in that additional mbufs are
1663 * allocated with cluster size MJUMPAGESIZE, and filled
1664 * accordingly.
1665 *
1666 * Return 1 if able to complete the job; otherwise 0.
1667 */
1668static int
1669hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
1670{
1671	struct mbuf *m, *n;
1672	int remainder, space;
1673
1674	for (m = m0; m->m_next != NULL; m = m->m_next)
1675		;
1676	remainder = len;
1677	space = M_TRAILINGSPACE(m);
1678	if (space > 0) {
1679		/*
1680		 * Copy into available space.
1681		 */
1682		if (space > remainder)
1683			space = remainder;
1684		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
1685		m->m_len += space;
1686		cp += space;
1687		remainder -= space;
1688	}
1689	while (remainder > 0) {
1690		/*
1691		 * Allocate a new mbuf; could check space
1692		 * and allocate a cluster instead.
1693		 */
1694		n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
1695		if (n == NULL)
1696			break;
1697		n->m_len = min(MJUMPAGESIZE, remainder);
1698		bcopy(cp, mtod(n, caddr_t), n->m_len);
1699		cp += n->m_len;
1700		remainder -= n->m_len;
1701		m->m_next = n;
1702		m = n;
1703	}
1704	if (m0->m_flags & M_PKTHDR)
1705		m0->m_pkthdr.len += len - remainder;
1706
1707	return (remainder == 0);
1708}
1709
1710#if defined(INET) || defined(INET6)
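/*
 * Hand a received TCP segment to LRO.  On kernels with sorted LRO
 * support the segment is queued to the per-ring mbuf queue when
 * hn_lro_mbufq_depth is non-zero; otherwise it goes straight to
 * tcp_lro_rx().
 */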
1711static __inline int
1712hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
1713{
1714#if __FreeBSD_version >= 1100095
1715	if (hn_lro_mbufq_depth) {
1716		tcp_lro_queue_mbuf(lc, m);
1717		return 0;
1718	}
1719#endif
1720	return tcp_lro_rx(lc, m, 0);
1721}
1722#endif
1723
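/*
 * Build an mbuf from a received RNDIS data packet and pass it up the
 * stack (through LRO where applicable), applying the RX checksum
 * offload, VLAN tag, and RSS hash metadata supplied by the host.
 */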
1724static int
1725hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
1726    const struct hn_rxinfo *info)
1727{
1728	struct ifnet *ifp = rxr->hn_ifp;
1729	struct mbuf *m_new;
1730	int size, do_lro = 0, do_csum = 1;
1731	int hash_type = M_HASHTYPE_OPAQUE;
1732
1733	if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
1734		return (0);
1735
1736	/*
1737	 * Bail out if the packet contains more data than the configured MTU.
1738	 */
1739	if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) {
1740		return (0);
1741	} else if (dlen <= MHLEN) {
1742		m_new = m_gethdr(M_NOWAIT, MT_DATA);
1743		if (m_new == NULL) {
1744			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
1745			return (0);
1746		}
1747		memcpy(mtod(m_new, void *), data, dlen);
1748		m_new->m_pkthdr.len = m_new->m_len = dlen;
1749		rxr->hn_small_pkts++;
1750	} else {
1751		/*
1752		 * Get an mbuf with a cluster.  For packets 2K or less,
1753		 * get a standard 2K cluster.  For anything larger, get a
1754		 * 4K cluster.  Any buffers larger than 4K can cause problems
1755		 * if looped around to the Hyper-V TX channel, so avoid them.
1756		 */
1757		size = MCLBYTES;
1758		if (dlen > MCLBYTES) {
1759			/* 4096 */
1760			size = MJUMPAGESIZE;
1761		}
1762
1763		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
1764		if (m_new == NULL) {
1765			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
1766			return (0);
1767		}
1768
1769		hv_m_append(m_new, dlen, data);
1770	}
1771	m_new->m_pkthdr.rcvif = ifp;
1772
1773	if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
1774		do_csum = 0;
1775
1776	/* receive side checksum offload */
1777	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
1778		/* IP csum offload */
1779		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
1780			m_new->m_pkthdr.csum_flags |=
1781			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
1782			rxr->hn_csum_ip++;
1783		}
1784
1785		/* TCP/UDP csum offload */
1786		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
1787		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
1788			m_new->m_pkthdr.csum_flags |=
1789			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1790			m_new->m_pkthdr.csum_data = 0xffff;
1791			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
1792				rxr->hn_csum_tcp++;
1793			else
1794				rxr->hn_csum_udp++;
1795		}
1796
1797		/*
1798		 * XXX
1799		 * As of this writing (Oct 28th, 2016), the host side turns
1800		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
1801		 * the do_lro setting here is actually _not_ accurate.  We
1802		 * depend on the RSS hash type check to reset do_lro.
1803		 */
1804		if ((info->csum_info &
1805		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
1806		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
1807			do_lro = 1;
1808	} else {
1809		const struct ether_header *eh;
1810		uint16_t etype;
1811		int hoff;
1812
1813		hoff = sizeof(*eh);
1814		if (m_new->m_len < hoff)
1815			goto skip;
1816		eh = mtod(m_new, struct ether_header *);
1817		etype = ntohs(eh->ether_type);
1818		if (etype == ETHERTYPE_VLAN) {
1819			const struct ether_vlan_header *evl;
1820
1821			hoff = sizeof(*evl);
1822			if (m_new->m_len < hoff)
1823				goto skip;
1824			evl = mtod(m_new, struct ether_vlan_header *);
1825			etype = ntohs(evl->evl_proto);
1826		}
1827
1828		if (etype == ETHERTYPE_IP) {
1829			int pr;
1830
1831			pr = hn_check_iplen(m_new, hoff);
1832			if (pr == IPPROTO_TCP) {
1833				if (do_csum &&
1834				    (rxr->hn_trust_hcsum &
1835				     HN_TRUST_HCSUM_TCP)) {
1836					rxr->hn_csum_trusted++;
1837					m_new->m_pkthdr.csum_flags |=
1838					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
1839					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1840					m_new->m_pkthdr.csum_data = 0xffff;
1841				}
1842				do_lro = 1;
1843			} else if (pr == IPPROTO_UDP) {
1844				if (do_csum &&
1845				    (rxr->hn_trust_hcsum &
1846				     HN_TRUST_HCSUM_UDP)) {
1847					rxr->hn_csum_trusted++;
1848					m_new->m_pkthdr.csum_flags |=
1849					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
1850					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1851					m_new->m_pkthdr.csum_data = 0xffff;
1852				}
1853			} else if (pr != IPPROTO_DONE && do_csum &&
1854			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
1855				rxr->hn_csum_trusted++;
1856				m_new->m_pkthdr.csum_flags |=
1857				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
1858			}
1859		}
1860	}
1861skip:
1862	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
1863		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
1864		    NDIS_VLAN_INFO_ID(info->vlan_info),
1865		    NDIS_VLAN_INFO_PRI(info->vlan_info),
1866		    NDIS_VLAN_INFO_CFI(info->vlan_info));
1867		m_new->m_flags |= M_VLANTAG;
1868	}
1869
1870	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
1871		rxr->hn_rss_pkts++;
1872		m_new->m_pkthdr.flowid = info->hash_value;
1873		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
1874		    NDIS_HASH_FUNCTION_TOEPLITZ) {
1875			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
1876
1877			/*
1878			 * NOTE:
1879			 * do_lro is reset if the hash type is not TCP
1880			 * related.  See the comment in the csum_flags
1881			 * setup section above.
1882			 */
1883			switch (type) {
1884			case NDIS_HASH_IPV4:
1885				hash_type = M_HASHTYPE_RSS_IPV4;
1886				do_lro = 0;
1887				break;
1888
1889			case NDIS_HASH_TCP_IPV4:
1890				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
1891				break;
1892
1893			case NDIS_HASH_IPV6:
1894				hash_type = M_HASHTYPE_RSS_IPV6;
1895				do_lro = 0;
1896				break;
1897
1898			case NDIS_HASH_IPV6_EX:
1899				hash_type = M_HASHTYPE_RSS_IPV6_EX;
1900				do_lro = 0;
1901				break;
1902
1903			case NDIS_HASH_TCP_IPV6:
1904				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
1905				break;
1906
1907			case NDIS_HASH_TCP_IPV6_EX:
1908				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
1909				break;
1910			}
1911		}
1912	} else {
1913		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
1914	}
1915	M_HASHTYPE_SET(m_new, hash_type);
1916
1917	/*
1918	 * Note:  Moved RX completion back to hv_nv_on_receive() so all
1919	 * messages (not just data messages) will trigger a response.
1920	 */
1921
1922	ifp->if_ipackets++;
1923	rxr->hn_pkts++;
1924
1925	if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
1926#if defined(INET) || defined(INET6)
1927		struct lro_ctrl *lro = &rxr->hn_lro;
1928
1929		if (lro->lro_cnt) {
1930			rxr->hn_lro_tried++;
1931			if (hn_lro_rx(lro, m_new) == 0) {
1932				/* DONE! */
1933				return 0;
1934			}
1935		}
1936#endif
1937	}
1938
1939	/* We're not holding the lock here, so don't release it */
1940	(*ifp->if_input)(ifp, m_new);
1941
1942	return (0);
1943}
1944
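/*
 * Interface ioctl handler.
 *
 * The MTU can only be changed by tearing the synthetic parts down and
 * re-attaching them: the interface is suspended, NVS/RNDIS are detached
 * and then re-attached with the new MTU, the MTU-derived parameters
 * (chimney size, TSO max size, LRO length limit) are re-validated, and
 * the interface is resumed.  Capability toggles adjust if_hwassist to
 * match; multicast list changes are currently no-ops since ALLMULTI is
 * always enabled (see the SIOCADDMULTI/SIOCDELMULTI comment below).
 */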
1945static int
1946hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
1947{
1948	struct hn_softc *sc = ifp->if_softc;
1949	struct ifreq *ifr = (struct ifreq *)data;
1950	int mask, error = 0;
1951
1952	switch (cmd) {
1953	case SIOCSIFMTU:
1954		if (ifr->ifr_mtu > HN_MTU_MAX) {
1955			error = EINVAL;
1956			break;
1957		}
1958
1959		HN_LOCK(sc);
1960
1961		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
1962			HN_UNLOCK(sc);
1963			break;
1964		}
1965
1966		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
1967			/* Can't change MTU */
1968			HN_UNLOCK(sc);
1969			error = EOPNOTSUPP;
1970			break;
1971		}
1972
1973		if (ifp->if_mtu == ifr->ifr_mtu) {
1974			HN_UNLOCK(sc);
1975			break;
1976		}
1977
1978		/*
1979		 * Suspend this interface before the synthetic parts
1980		 * are torn down.
1981		 */
1982		hn_suspend(sc);
1983
1984		/*
1985		 * Detach the synthetic parts, i.e. NVS and RNDIS.
1986		 */
1987		hn_synth_detach(sc);
1988
1989		/*
1990		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
1991		 * with the new MTU setting.
1992		 */
1993		error = hn_synth_attach(sc, ifr->ifr_mtu);
1994		if (error) {
1995			HN_UNLOCK(sc);
1996			break;
1997		}
1998
1999		/*
2000		 * Commit the requested MTU, after the synthetic parts
2001		 * have been successfully attached.
2002		 */
2003		ifp->if_mtu = ifr->ifr_mtu;
2004
2005		/*
2006		 * Make sure that various parameters based on MTU are
2007		 * still valid, after the MTU change.
2008		 */
2009		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2010			hn_set_chim_size(sc, sc->hn_chim_szmax);
2011		hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2012#if __FreeBSD_version >= 1100099
2013		if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2014		    HN_LRO_LENLIM_MIN(ifp))
2015			hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2016#endif
2017
2018		/*
2019		 * All done!  Resume the interface now.
2020		 */
2021		hn_resume(sc);
2022
2023		HN_UNLOCK(sc);
2024		break;
2025
2026	case SIOCSIFFLAGS:
2027		HN_LOCK(sc);
2028
2029		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2030			HN_UNLOCK(sc);
2031			break;
2032		}
2033
2034		if (ifp->if_flags & IFF_UP) {
2035			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2036				hn_set_rxfilter(sc);
2037			else
2038				hn_init_locked(sc);
2039		} else {
2040			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2041				hn_stop(sc);
2042		}
2043		sc->hn_if_flags = ifp->if_flags;
2044
2045		HN_UNLOCK(sc);
2046		break;
2047
2048	case SIOCSIFCAP:
2049		HN_LOCK(sc);
2050		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2051
2052		if (mask & IFCAP_TXCSUM) {
2053			ifp->if_capenable ^= IFCAP_TXCSUM;
2054			if (ifp->if_capenable & IFCAP_TXCSUM)
2055				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2056			else
2057				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2058		}
2059		if (mask & IFCAP_TXCSUM_IPV6) {
2060			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2061			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2062				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2063			else
2064				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2065		}
2066
2067		/* TODO: flip RNDIS offload parameters for RXCSUM. */
2068		if (mask & IFCAP_RXCSUM)
2069			ifp->if_capenable ^= IFCAP_RXCSUM;
2070#ifdef foo
2071		/* We can't distinguish IPv6 from IPv4 packets on the RX path. */
2072		if (mask & IFCAP_RXCSUM_IPV6)
2073			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2074#endif
2075
2076		if (mask & IFCAP_LRO)
2077			ifp->if_capenable ^= IFCAP_LRO;
2078
2079		if (mask & IFCAP_TSO4) {
2080			ifp->if_capenable ^= IFCAP_TSO4;
2081			if (ifp->if_capenable & IFCAP_TSO4)
2082				ifp->if_hwassist |= CSUM_IP_TSO;
2083			else
2084				ifp->if_hwassist &= ~CSUM_IP_TSO;
2085		}
2086		if (mask & IFCAP_TSO6) {
2087			ifp->if_capenable ^= IFCAP_TSO6;
2088			if (ifp->if_capenable & IFCAP_TSO6)
2089				ifp->if_hwassist |= CSUM_IP6_TSO;
2090			else
2091				ifp->if_hwassist &= ~CSUM_IP6_TSO;
2092		}
2093
2094		HN_UNLOCK(sc);
2095		break;
2096
2097	case SIOCADDMULTI:
2098	case SIOCDELMULTI:
2099#ifdef notyet
2100		/*
2101		 * XXX
2102		 * Multicast handling holds a mutex, while setting the
2103		 * RNDIS RX filter may sleep.  We work around this by
2104		 * always enabling ALLMULTI.  ALLMULTI would always be on
2105		 * anyway, even if SIOCADDMULTI/SIOCDELMULTI were handled
2106		 * here, since this driver does not support multicast
2107		 * address list configuration.
2108		 */
2109		HN_LOCK(sc);
2110
2111		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2112			HN_UNLOCK(sc);
2113			break;
2114		}
2115		if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2116			hn_set_rxfilter(sc);
2117
2118		HN_UNLOCK(sc);
2119#endif
2120		break;
2121
2122	case SIOCSIFMEDIA:
2123	case SIOCGIFMEDIA:
2124		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2125		break;
2126
2127	default:
2128		error = ether_ioctl(ifp, cmd, data);
2129		break;
2130	}
2131	return (error);
2132}
2133
2134static void
2135hn_stop(struct hn_softc *sc)
2136{
2137	struct ifnet *ifp = sc->hn_ifp;
2138	int i;
2139
2140	HN_LOCK_ASSERT(sc);
2141
2142	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2143	    ("synthetic parts were not attached"));
2144
2145	/* Clear RUNNING bit _before_ hn_suspend_data() */
2146	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2147	hn_suspend_data(sc);
2148
2149	/* Clear OACTIVE bit. */
2150	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2151	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2152		sc->hn_tx_ring[i].hn_oactive = 0;
2153}
2154
2155static void
2156hn_start(struct ifnet *ifp)
2157{
2158	struct hn_softc *sc = ifp->if_softc;
2159	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
2160
2161	if (txr->hn_sched_tx)
2162		goto do_sched;
2163
2164	if (mtx_trylock(&txr->hn_tx_lock)) {
2165		int sched;
2166
2167		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
2168		mtx_unlock(&txr->hn_tx_lock);
2169		if (!sched)
2170			return;
2171	}
2172do_sched:
2173	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
2174}
2175
2176static void
2177hn_start_txeof(struct hn_tx_ring *txr)
2178{
2179	struct hn_softc *sc = txr->hn_sc;
2180	struct ifnet *ifp = sc->hn_ifp;
2181
2182	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
2183
2184	if (txr->hn_sched_tx)
2185		goto do_sched;
2186
2187	if (mtx_trylock(&txr->hn_tx_lock)) {
2188		int sched;
2189
2190		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2191		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
2192		mtx_unlock(&txr->hn_tx_lock);
2193		if (sched) {
2194			taskqueue_enqueue(txr->hn_tx_taskq,
2195			    &txr->hn_tx_task);
2196		}
2197	} else {
2198do_sched:
2199		/*
2200		 * Release OACTIVE early, in the hope that others
2201		 * could catch up.  The task will clear the flag
2202		 * again, with hn_tx_lock held, to avoid possible
2203		 * races.
2204		 */
2205		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2206		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
2207	}
2208}
2209
2210static void
2211hn_init_locked(struct hn_softc *sc)
2212{
2213	struct ifnet *ifp = sc->hn_ifp;
2214	int i;
2215
2216	HN_LOCK_ASSERT(sc);
2217
2218	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2219		return;
2220
2221	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2222		return;
2223
2224	/* Configure RX filter */
2225	hn_set_rxfilter(sc);
2226
2227	/* Clear OACTIVE bit. */
2228	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2229	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2230		sc->hn_tx_ring[i].hn_oactive = 0;
2231
2232	/* Clear TX 'suspended' bit. */
2233	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2234
2235	/* Everything is ready; unleash! */
2236	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2237}
2238
2239static void
2240hn_init(void *xsc)
2241{
2242	struct hn_softc *sc = xsc;
2243
2244	HN_LOCK(sc);
2245	hn_init_locked(sc);
2246	HN_UNLOCK(sc);
2247}
2248
2249#if __FreeBSD_version >= 1100099
2250
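/*
 * Handler for the dev.hn.UNIT.lro_length_lim sysctl (see
 * hn_create_rx_data()).  The new limit must lie within
 * [HN_LRO_LENLIM_MIN(ifp), TCP_LRO_LENGTH_MAX] and is applied to all
 * RX rings through hn_set_lro_lenlim(), e.g. (assuming unit 0):
 *
 *	sysctl dev.hn.0.lro_length_lim=<bytes>
 */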
2251static int
2252hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2253{
2254	struct hn_softc *sc = arg1;
2255	unsigned int lenlim;
2256	int error;
2257
2258	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2259	error = sysctl_handle_int(oidp, &lenlim, 0, req);
2260	if (error || req->newptr == NULL)
2261		return error;
2262
2263	HN_LOCK(sc);
2264	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2265	    lenlim > TCP_LRO_LENGTH_MAX) {
2266		HN_UNLOCK(sc);
2267		return EINVAL;
2268	}
2269	hn_set_lro_lenlim(sc, lenlim);
2270	HN_UNLOCK(sc);
2271
2272	return 0;
2273}
2274
2275static int
2276hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2277{
2278	struct hn_softc *sc = arg1;
2279	int ackcnt, error, i;
2280
2281	/*
2282	 * lro_ackcnt_lim is the append count limit;
2283	 * +1 turns it into the aggregation limit.
2284	 */
2285	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2286	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2287	if (error || req->newptr == NULL)
2288		return error;
2289
2290	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2291		return EINVAL;
2292
2293	/*
2294	 * Convert aggregation limit back to append
2295	 * count limit.
2296	 */
2297	--ackcnt;
2298	HN_LOCK(sc);
2299	for (i = 0; i < sc->hn_rx_ring_inuse; ++i)
2300		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
2301	HN_UNLOCK(sc);
2302	return 0;
2303}
2304
2305#endif
2306
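/*
 * Handler backing the dev.hn.UNIT.trust_hosttcp, .trust_hostudp and
 * .trust_hostip sysctls (see hn_create_rx_data()).  arg2 selects the
 * HN_TRUST_HCSUM_* bit; writing a non-zero value sets that bit on all
 * in-use RX rings, writing zero clears it, e.g. (assuming unit 0):
 *
 *	sysctl dev.hn.0.trust_hosttcp=1
 */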
2307static int
2308hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2309{
2310	struct hn_softc *sc = arg1;
2311	int hcsum = arg2;
2312	int on, error, i;
2313
2314	on = 0;
2315	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2316		on = 1;
2317
2318	error = sysctl_handle_int(oidp, &on, 0, req);
2319	if (error || req->newptr == NULL)
2320		return error;
2321
2322	HN_LOCK(sc);
2323	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2324		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2325
2326		if (on)
2327			rxr->hn_trust_hcsum |= hcsum;
2328		else
2329			rxr->hn_trust_hcsum &= ~hcsum;
2330	}
2331	HN_UNLOCK(sc);
2332	return 0;
2333}
2334
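/*
 * Handler for the dev.hn.UNIT.tx_chimney_size sysctl (see
 * hn_create_tx_data()).  The new size must be positive and no larger
 * than the host-provided maximum (hn_chim_szmax); it is applied to all
 * TX rings through hn_set_chim_size().
 */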
2335static int
2336hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2337{
2338	struct hn_softc *sc = arg1;
2339	int chim_size, error;
2340
2341	chim_size = sc->hn_tx_ring[0].hn_chim_size;
2342	error = sysctl_handle_int(oidp, &chim_size, 0, req);
2343	if (error || req->newptr == NULL)
2344		return error;
2345
2346	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2347		return EINVAL;
2348
2349	HN_LOCK(sc);
2350	hn_set_chim_size(sc, chim_size);
2351	HN_UNLOCK(sc);
2352	return 0;
2353}
2354
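/*
 * The statistics handlers below take the byte offset of a per-ring
 * counter (within struct hn_rx_ring or struct hn_tx_ring) as arg2.
 * Reading sums that counter across the rings; writing any value simply
 * resets the counter on every ring (the written value itself is
 * ignored).
 */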
2355#if __FreeBSD_version < 1100095
2356static int
2357hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2358{
2359	struct hn_softc *sc = arg1;
2360	int ofs = arg2, i, error;
2361	struct hn_rx_ring *rxr;
2362	uint64_t stat;
2363
2364	stat = 0;
2365	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2366		rxr = &sc->hn_rx_ring[i];
2367		stat += *((int *)((uint8_t *)rxr + ofs));
2368	}
2369
2370	error = sysctl_handle_64(oidp, &stat, 0, req);
2371	if (error || req->newptr == NULL)
2372		return error;
2373
2374	/* Zero out this stat. */
2375	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2376		rxr = &sc->hn_rx_ring[i];
2377		*((int *)((uint8_t *)rxr + ofs)) = 0;
2378	}
2379	return 0;
2380}
2381#else
2382static int
2383hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2384{
2385	struct hn_softc *sc = arg1;
2386	int ofs = arg2, i, error;
2387	struct hn_rx_ring *rxr;
2388	uint64_t stat;
2389
2390	stat = 0;
2391	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2392		rxr = &sc->hn_rx_ring[i];
2393		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2394	}
2395
2396	error = sysctl_handle_64(oidp, &stat, 0, req);
2397	if (error || req->newptr == NULL)
2398		return error;
2399
2400	/* Zero out this stat. */
2401	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2402		rxr = &sc->hn_rx_ring[i];
2403		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2404	}
2405	return 0;
2406}
2407
2408#endif
2409
2410static int
2411hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2412{
2413	struct hn_softc *sc = arg1;
2414	int ofs = arg2, i, error;
2415	struct hn_rx_ring *rxr;
2416	u_long stat;
2417
2418	stat = 0;
2419	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2420		rxr = &sc->hn_rx_ring[i];
2421		stat += *((u_long *)((uint8_t *)rxr + ofs));
2422	}
2423
2424	error = sysctl_handle_long(oidp, &stat, 0, req);
2425	if (error || req->newptr == NULL)
2426		return error;
2427
2428	/* Zero out this stat. */
2429	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2430		rxr = &sc->hn_rx_ring[i];
2431		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
2432	}
2433	return 0;
2434}
2435
2436static int
2437hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2438{
2439	struct hn_softc *sc = arg1;
2440	int ofs = arg2, i, error;
2441	struct hn_tx_ring *txr;
2442	u_long stat;
2443
2444	stat = 0;
2445	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
2446		txr = &sc->hn_tx_ring[i];
2447		stat += *((u_long *)((uint8_t *)txr + ofs));
2448	}
2449
2450	error = sysctl_handle_long(oidp, &stat, 0, req);
2451	if (error || req->newptr == NULL)
2452		return error;
2453
2454	/* Zero out this stat. */
2455	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
2456		txr = &sc->hn_tx_ring[i];
2457		*((u_long *)((uint8_t *)txr + ofs)) = 0;
2458	}
2459	return 0;
2460}
2461
2462static int
2463hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2464{
2465	struct hn_softc *sc = arg1;
2466	int ofs = arg2, i, error, conf;
2467	struct hn_tx_ring *txr;
2468
2469	txr = &sc->hn_tx_ring[0];
2470	conf = *((int *)((uint8_t *)txr + ofs));
2471
2472	error = sysctl_handle_int(oidp, &conf, 0, req);
2473	if (error || req->newptr == NULL)
2474		return error;
2475
2476	HN_LOCK(sc);
2477	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
2478		txr = &sc->hn_tx_ring[i];
2479		*((int *)((uint8_t *)txr + ofs)) = conf;
2480	}
2481	HN_UNLOCK(sc);
2482
2483	return 0;
2484}
2485
2486static int
2487hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
2488{
2489	struct hn_softc *sc = arg1;
2490	char verstr[16];
2491
2492	snprintf(verstr, sizeof(verstr), "%u.%u",
2493	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
2494	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
2495	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
2496}
2497
2498static int
2499hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
2500{
2501	struct hn_softc *sc = arg1;
2502	char caps_str[128];
2503	uint32_t caps;
2504
2505	HN_LOCK(sc);
2506	caps = sc->hn_caps;
2507	HN_UNLOCK(sc);
2508	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
2509	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
2510}
2511
2512static int
2513hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
2514{
2515	struct hn_softc *sc = arg1;
2516	char assist_str[128];
2517	uint32_t hwassist;
2518
2519	HN_LOCK(sc);
2520	hwassist = sc->hn_ifp->if_hwassist;
2521	HN_UNLOCK(sc);
2522	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
2523	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
2524}
2525
2526static int
2527hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
2528{
2529	struct hn_softc *sc = arg1;
2530	char filter_str[128];
2531	uint32_t filter;
2532
2533	HN_LOCK(sc);
2534	filter = sc->hn_rx_filter;
2535	HN_UNLOCK(sc);
2536	snprintf(filter_str, sizeof(filter_str), "%b", filter,
2537	    NDIS_PACKET_TYPES);
2538	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
2539}
2540
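/*
 * RSS key sysctl handler: reading returns the current RSS key; writing
 * installs a new key and, when more than one RX ring is in use,
 * reconfigures RSS through hn_rss_reconfig().  With a single RX ring
 * the key is only saved for later use.
 */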
2541static int
2542hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
2543{
2544	struct hn_softc *sc = arg1;
2545	int error;
2546
2547	HN_LOCK(sc);
2548
2549	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2550	if (error || req->newptr == NULL)
2551		goto back;
2552
2553	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2554	if (error)
2555		goto back;
2556	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
2557
2558	if (sc->hn_rx_ring_inuse > 1) {
2559		error = hn_rss_reconfig(sc);
2560	} else {
2561		/* Not RSS capable, at least for now; just save the RSS key. */
2562		error = 0;
2563	}
2564back:
2565	HN_UNLOCK(sc);
2566	return (error);
2567}
2568
2569static int
2570hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
2571{
2572	struct hn_softc *sc = arg1;
2573	int error;
2574
2575	HN_LOCK(sc);
2576
2577	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2578	if (error || req->newptr == NULL)
2579		goto back;
2580
2581	/*
2582	 * Don't allow RSS indirect table changes if this interface is not
2583	 * currently RSS capable.
2584	 */
2585	if (sc->hn_rx_ring_inuse == 1) {
2586		error = EOPNOTSUPP;
2587		goto back;
2588	}
2589
2590	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2591	if (error)
2592		goto back;
2593	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
2594
2595	hn_rss_ind_fixup(sc, sc->hn_rx_ring_inuse);
2596	error = hn_rss_reconfig(sc);
2597back:
2598	HN_UNLOCK(sc);
2599	return (error);
2600}
2601
2602static int
2603hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
2604{
2605	struct hn_softc *sc = arg1;
2606	char hash_str[128];
2607	uint32_t hash;
2608
2609	HN_LOCK(sc);
2610	hash = sc->hn_rss_hash;
2611	HN_UNLOCK(sc);
2612	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
2613	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
2614}
2615
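/*
 * Sanity check the IP (and TCP/UDP) headers of a received packet whose
 * checksum information was not supplied by the host.  'hoff' is the
 * offset of the IP header within the mbuf (i.e. past the Ethernet and
 * optional VLAN header).  Returns the IP protocol when the headers are
 * complete and the packet is not a fragment; IPPROTO_DONE otherwise.
 */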
2616static int
2617hn_check_iplen(const struct mbuf *m, int hoff)
2618{
2619	const struct ip *ip;
2620	int len, iphlen, iplen;
2621	const struct tcphdr *th;
2622	int thoff;				/* TCP data offset */
2623
2624	len = hoff + sizeof(struct ip);
2625
2626	/* The packet must be at least the size of an IP header. */
2627	if (m->m_pkthdr.len < len)
2628		return IPPROTO_DONE;
2629
2630	/* The fixed IP header must reside completely in the first mbuf. */
2631	if (m->m_len < len)
2632		return IPPROTO_DONE;
2633
2634	ip = mtodo(m, hoff);
2635
2636	/* Bound check the packet's stated IP header length. */
2637	iphlen = ip->ip_hl << 2;
2638	if (iphlen < sizeof(struct ip))		/* minimum header length */
2639		return IPPROTO_DONE;
2640
2641	/* The full IP header must reside completely in the one mbuf. */
2642	if (m->m_len < hoff + iphlen)
2643		return IPPROTO_DONE;
2644
2645	iplen = ntohs(ip->ip_len);
2646
2647	/*
2648	 * Check that the amount of data in the buffers is
2649	 * at least as much as the IP header would have us expect.
2650	 */
2651	if (m->m_pkthdr.len < hoff + iplen)
2652		return IPPROTO_DONE;
2653
2654	/*
2655	 * Ignore IP fragments.
2656	 */
2657	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
2658		return IPPROTO_DONE;
2659
2660	/*
2661	 * The TCP/IP or UDP/IP header must be entirely contained within
2662	 * the first fragment of a packet.
2663	 */
2664	switch (ip->ip_p) {
2665	case IPPROTO_TCP:
2666		if (iplen < iphlen + sizeof(struct tcphdr))
2667			return IPPROTO_DONE;
2668		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
2669			return IPPROTO_DONE;
2670		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
2671		thoff = th->th_off << 2;
2672		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
2673			return IPPROTO_DONE;
2674		if (m->m_len < hoff + iphlen + thoff)
2675			return IPPROTO_DONE;
2676		break;
2677	case IPPROTO_UDP:
2678		if (iplen < iphlen + sizeof(struct udphdr))
2679			return IPPROTO_DONE;
2680		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
2681			return IPPROTO_DONE;
2682		break;
2683	default:
2684		if (iplen < iphlen)
2685			return IPPROTO_DONE;
2686		break;
2687	}
2688	return ip->ip_p;
2689}
2690
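/*
 * Allocate the RX side data structures: the RXBUF shared by all
 * channels, the per-ring channel bufring memory, LRO state and the
 * host-checksum trust flags, plus the dev.hn.UNIT.rx sysctl tree and
 * the aggregated RX statistics nodes.
 */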
2691static int
2692hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
2693{
2694	struct sysctl_oid_list *child;
2695	struct sysctl_ctx_list *ctx;
2696	device_t dev = sc->hn_dev;
2697#if defined(INET) || defined(INET6)
2698#if __FreeBSD_version >= 1100095
2699	int lroent_cnt;
2700#endif
2701#endif
2702	int i;
2703
2704	/*
2705	 * Create RXBUF for reception.
2706	 *
2707	 * NOTE:
2708	 * - It is shared by all channels.
2709	 * - A large enough buffer is allocated; certain versions of the NVS
2710	 *   may further limit the usable space.
2711	 */
2712	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
2713	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
2714	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
2715	if (sc->hn_rxbuf == NULL) {
2716		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
2717		return (ENOMEM);
2718	}
2719
2720	sc->hn_rx_ring_cnt = ring_cnt;
2721	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
2722
2723	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
2724	    M_DEVBUF, M_WAITOK | M_ZERO);
2725
2726#if defined(INET) || defined(INET6)
2727#if __FreeBSD_version >= 1100095
2728	lroent_cnt = hn_lro_entry_count;
2729	if (lroent_cnt < TCP_LRO_ENTRIES)
2730		lroent_cnt = TCP_LRO_ENTRIES;
2731	if (bootverbose)
2732		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
2733#endif
2734#endif	/* INET || INET6 */
2735
2736	ctx = device_get_sysctl_ctx(dev);
2737	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2738
2739	/* Create dev.hn.UNIT.rx sysctl tree */
2740	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
2741	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
2742
2743	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2744		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2745
2746		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
2747		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
2748		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
2749		if (rxr->hn_br == NULL) {
2750			device_printf(dev, "allocate bufring failed\n");
2751			return (ENOMEM);
2752		}
2753
2754		if (hn_trust_hosttcp)
2755			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
2756		if (hn_trust_hostudp)
2757			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
2758		if (hn_trust_hostip)
2759			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
2760		rxr->hn_ifp = sc->hn_ifp;
2761		if (i < sc->hn_tx_ring_cnt)
2762			rxr->hn_txr = &sc->hn_tx_ring[i];
2763		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
2764		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
2765		rxr->hn_rx_idx = i;
2766		rxr->hn_rxbuf = sc->hn_rxbuf;
2767
2768		/*
2769		 * Initialize LRO.
2770		 */
2771#if defined(INET) || defined(INET6)
2772#if __FreeBSD_version >= 1100095
2773		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
2774		    hn_lro_mbufq_depth);
2775#else
2776		tcp_lro_init(&rxr->hn_lro);
2777		rxr->hn_lro.ifp = sc->hn_ifp;
2778#endif
2779#if __FreeBSD_version >= 1100099
2780		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
2781		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
2782#endif
2783#endif	/* INET || INET6 */
2784
2785		if (sc->hn_rx_sysctl_tree != NULL) {
2786			char name[16];
2787
2788			/*
2789			 * Create per RX ring sysctl tree:
2790			 * dev.hn.UNIT.rx.RINGID
2791			 */
2792			snprintf(name, sizeof(name), "%d", i);
2793			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
2794			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
2795			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
2796
2797			if (rxr->hn_rx_sysctl_tree != NULL) {
2798				SYSCTL_ADD_ULONG(ctx,
2799				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
2800				    OID_AUTO, "packets", CTLFLAG_RW,
2801				    &rxr->hn_pkts, "# of packets received");
2802				SYSCTL_ADD_ULONG(ctx,
2803				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
2804				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
2805				    &rxr->hn_rss_pkts,
2806				    "# of packets w/ RSS info received");
2807				SYSCTL_ADD_INT(ctx,
2808				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
2809				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
2810				    &rxr->hn_pktbuf_len, 0,
2811				    "Temporary channel packet buffer length");
2812			}
2813		}
2814	}
2815
2816	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
2817	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2818	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
2819#if __FreeBSD_version < 1100095
2820	    hn_rx_stat_int_sysctl,
2821#else
2822	    hn_rx_stat_u64_sysctl,
2823#endif
2824	    "LU", "LRO queued");
2825	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
2826	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2827	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
2828#if __FreeBSD_version < 1100095
2829	    hn_rx_stat_int_sysctl,
2830#else
2831	    hn_rx_stat_u64_sysctl,
2832#endif
2833	    "LU", "LRO flushed");
2834	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
2835	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2836	    __offsetof(struct hn_rx_ring, hn_lro_tried),
2837	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
2838#if __FreeBSD_version >= 1100099
2839	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
2840	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2841	    hn_lro_lenlim_sysctl, "IU",
2842	    "Max # of data bytes to be aggregated by LRO");
2843	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
2844	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2845	    hn_lro_ackcnt_sysctl, "I",
2846	    "Max # of ACKs to be aggregated by LRO");
2847#endif
2848	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
2849	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
2850	    hn_trust_hcsum_sysctl, "I",
2851	    "Trust tcp segment verification on host side, "
2852	    "when csum info is missing");
2853	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
2854	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
2855	    hn_trust_hcsum_sysctl, "I",
2856	    "Trust udp datagram verification on host side, "
2857	    "when csum info is missing");
2858	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
2859	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
2860	    hn_trust_hcsum_sysctl, "I",
2861	    "Trust ip packet verification on host side, "
2862	    "when csum info is missing");
2863	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
2864	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2865	    __offsetof(struct hn_rx_ring, hn_csum_ip),
2866	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
2867	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
2868	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2869	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
2870	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
2871	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
2872	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2873	    __offsetof(struct hn_rx_ring, hn_csum_udp),
2874	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
2875	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
2876	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2877	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
2878	    hn_rx_stat_ulong_sysctl, "LU",
2879	    "# of packets that we trust host's csum verification");
2880	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
2881	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2882	    __offsetof(struct hn_rx_ring, hn_small_pkts),
2883	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
2884	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
2885	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2886	    __offsetof(struct hn_rx_ring, hn_ack_failed),
2887	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
2888	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
2889	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
2890	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
2891	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
2892
2893	return (0);
2894}
2895
2896static void
2897hn_destroy_rx_data(struct hn_softc *sc)
2898{
2899	int i;
2900
2901	if (sc->hn_rxbuf != NULL) {
2902		hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
2903		sc->hn_rxbuf = NULL;
2904	}
2905
2906	if (sc->hn_rx_ring_cnt == 0)
2907		return;
2908
2909	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2910		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2911
2912		if (rxr->hn_br == NULL)
2913			continue;
2914		hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
2915		rxr->hn_br = NULL;
2916
2917#if defined(INET) || defined(INET6)
2918		tcp_lro_free(&rxr->hn_lro);
2919#endif
2920		free(rxr->hn_pktbuf, M_DEVBUF);
2921	}
2922	free(sc->hn_rx_ring, M_DEVBUF);
2923	sc->hn_rx_ring = NULL;
2924
2925	sc->hn_rx_ring_cnt = 0;
2926	sc->hn_rx_ring_inuse = 0;
2927}
2928
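/*
 * Set up one TX ring: the TX descriptor array (kept on a bufring or an
 * SLIST depending on HN_USE_TXDESC_BUFRING), the DMA tags and maps for
 * the RNDIS packet messages and the packet data, the transmit/txeof
 * tasks, and the per-ring dev.hn.UNIT.tx.RINGID sysctl nodes.
 */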
2929static int
2930hn_tx_ring_create(struct hn_softc *sc, int id)
2931{
2932	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
2933	device_t dev = sc->hn_dev;
2934	bus_dma_tag_t parent_dtag;
2935	int error, i;
2936
2937	txr->hn_sc = sc;
2938	txr->hn_tx_idx = id;
2939
2940#ifndef HN_USE_TXDESC_BUFRING
2941	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
2942#endif
2943	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
2944
2945	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
2946	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
2947	    M_DEVBUF, M_WAITOK | M_ZERO);
2948#ifndef HN_USE_TXDESC_BUFRING
2949	SLIST_INIT(&txr->hn_txlist);
2950#else
2951	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
2952	    M_WAITOK, &txr->hn_tx_lock);
2953#endif
2954
2955	txr->hn_tx_taskq = sc->hn_tx_taskq;
2956
2957	if (hn_use_if_start) {
2958		txr->hn_txeof = hn_start_txeof;
2959		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
2960		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
2961	} else {
2962		int br_depth;
2963
2964		txr->hn_txeof = hn_xmit_txeof;
2965		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
2966		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
2967
2968		br_depth = hn_get_txswq_depth(txr);
2969		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
2970		    M_WAITOK, &txr->hn_tx_lock);
2971	}
2972
2973	txr->hn_direct_tx_size = hn_direct_tx_size;
2974
2975	/*
2976	 * Always schedule transmission instead of trying to do direct
2977	 * transmission.  This gives the best performance so far.
2978	 */
2979	txr->hn_sched_tx = 1;
2980
2981	parent_dtag = bus_get_dma_tag(dev);
2982
2983	/* DMA tag for RNDIS packet messages. */
2984	error = bus_dma_tag_create(parent_dtag, /* parent */
2985	    HN_RNDIS_PKT_ALIGN,		/* alignment */
2986	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
2987	    BUS_SPACE_MAXADDR,		/* lowaddr */
2988	    BUS_SPACE_MAXADDR,		/* highaddr */
2989	    NULL, NULL,			/* filter, filterarg */
2990	    HN_RNDIS_PKT_LEN,		/* maxsize */
2991	    1,				/* nsegments */
2992	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
2993	    0,				/* flags */
2994	    NULL,			/* lockfunc */
2995	    NULL,			/* lockfuncarg */
2996	    &txr->hn_tx_rndis_dtag);
2997	if (error) {
2998		device_printf(dev, "failed to create rndis dmatag\n");
2999		return error;
3000	}
3001
3002	/* DMA tag for data. */
3003	error = bus_dma_tag_create(parent_dtag, /* parent */
3004	    1,				/* alignment */
3005	    HN_TX_DATA_BOUNDARY,	/* boundary */
3006	    BUS_SPACE_MAXADDR,		/* lowaddr */
3007	    BUS_SPACE_MAXADDR,		/* highaddr */
3008	    NULL, NULL,			/* filter, filterarg */
3009	    HN_TX_DATA_MAXSIZE,		/* maxsize */
3010	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
3011	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
3012	    0,				/* flags */
3013	    NULL,			/* lockfunc */
3014	    NULL,			/* lockfuncarg */
3015	    &txr->hn_tx_data_dtag);
3016	if (error) {
3017		device_printf(dev, "failed to create data dmatag\n");
3018		return error;
3019	}
3020
3021	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3022		struct hn_txdesc *txd = &txr->hn_txdesc[i];
3023
3024		txd->txr = txr;
3025		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3026
3027		/*
3028		 * Allocate and load RNDIS packet message.
3029		 */
3030		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3031		    (void **)&txd->rndis_pkt,
3032		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3033		    &txd->rndis_pkt_dmap);
3034		if (error) {
3035			device_printf(dev,
3036			    "failed to allocate rndis_packet_msg, %d\n", i);
3037			return error;
3038		}
3039
3040		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3041		    txd->rndis_pkt_dmap,
3042		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3043		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
3044		    BUS_DMA_NOWAIT);
3045		if (error) {
3046			device_printf(dev,
3047			    "failed to load rndis_packet_msg, %d\n", i);
3048			bus_dmamem_free(txr->hn_tx_rndis_dtag,
3049			    txd->rndis_pkt, txd->rndis_pkt_dmap);
3050			return error;
3051		}
3052
3053		/* DMA map for TX data. */
3054		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
3055		    &txd->data_dmap);
3056		if (error) {
3057			device_printf(dev,
3058			    "failed to allocate tx data dmamap\n");
3059			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
3060			    txd->rndis_pkt_dmap);
3061			bus_dmamem_free(txr->hn_tx_rndis_dtag,
3062			    txd->rndis_pkt, txd->rndis_pkt_dmap);
3063			return error;
3064		}
3065
3066		/* All set, put it to list */
3067		/* All set; put it on the list. */
3068#ifndef HN_USE_TXDESC_BUFRING
3069		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
3070#else
3071		buf_ring_enqueue(txr->hn_txdesc_br, txd);
3072#endif
3073	}
3074	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
3075
3076	if (sc->hn_tx_sysctl_tree != NULL) {
3077		struct sysctl_oid_list *child;
3078		struct sysctl_ctx_list *ctx;
3079		char name[16];
3080
3081		/*
3082		 * Create per TX ring sysctl tree:
3083		 * dev.hn.UNIT.tx.RINGID
3084		 */
3085		ctx = device_get_sysctl_ctx(dev);
3086		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
3087
3088		snprintf(name, sizeof(name), "%d", id);
3089		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
3090		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3091
3092		if (txr->hn_tx_sysctl_tree != NULL) {
3093			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
3094
3095			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
3096			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
3097			    "# of available TX descs");
3098			if (!hn_use_if_start) {
3099				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
3100				    CTLFLAG_RD, &txr->hn_oactive, 0,
3101				    "over active");
3102			}
3103			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
3104			    CTLFLAG_RW, &txr->hn_pkts,
3105			    "# of packets transmitted");
3106		}
3107	}
3108
3109	return 0;
3110}
3111
3112static void
3113hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3114{
3115	struct hn_tx_ring *txr = txd->txr;
3116
3117	KASSERT(txd->m == NULL, ("still has mbuf installed"));
3118	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3119
3120	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3121	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3122	    txd->rndis_pkt_dmap);
3123	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
3124}
3125
3126static void
3127hn_tx_ring_destroy(struct hn_tx_ring *txr)
3128{
3129	struct hn_txdesc *txd;
3130
3131	if (txr->hn_txdesc == NULL)
3132		return;
3133
3134#ifndef HN_USE_TXDESC_BUFRING
3135	while ((txd = SLIST_FIRST(&txr->hn_txlist)) != NULL) {
3136		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
3137		hn_txdesc_dmamap_destroy(txd);
3138	}
3139#else
3140	mtx_lock(&txr->hn_tx_lock);
3141	while ((txd = buf_ring_dequeue_sc(txr->hn_txdesc_br)) != NULL)
3142		hn_txdesc_dmamap_destroy(txd);
3143	mtx_unlock(&txr->hn_tx_lock);
3144#endif
3145
3146	if (txr->hn_tx_data_dtag != NULL)
3147		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3148	if (txr->hn_tx_rndis_dtag != NULL)
3149		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3150
3151#ifdef HN_USE_TXDESC_BUFRING
3152	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3153#endif
3154
3155	free(txr->hn_txdesc, M_DEVBUF);
3156	txr->hn_txdesc = NULL;
3157
3158	if (txr->hn_mbuf_br != NULL)
3159		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3160
3161#ifndef HN_USE_TXDESC_BUFRING
3162	mtx_destroy(&txr->hn_txlist_spin);
3163#endif
3164	mtx_destroy(&txr->hn_tx_lock);
3165}
3166
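/*
 * Allocate the TX side data structures: the chimney sending buffer
 * (TXBUF) shared by all channels, the TX rings themselves, and the
 * dev.hn.UNIT.tx sysctl tree with the aggregated TX statistics and
 * tunables.
 */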
3167static int
3168hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
3169{
3170	struct sysctl_oid_list *child;
3171	struct sysctl_ctx_list *ctx;
3172	int i;
3173
3174	/*
3175	 * Create TXBUF for chimney sending.
3176	 *
3177	 * NOTE: It is shared by all channels.
3178	 */
3179	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
3180	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
3181	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
3182	if (sc->hn_chim == NULL) {
3183		device_printf(sc->hn_dev, "allocate txbuf failed\n");
3184		return (ENOMEM);
3185	}
3186
3187	sc->hn_tx_ring_cnt = ring_cnt;
3188	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3189
3190	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
3191	    M_DEVBUF, M_WAITOK | M_ZERO);
3192
3193	ctx = device_get_sysctl_ctx(sc->hn_dev);
3194	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
3195
3196	/* Create dev.hn.UNIT.tx sysctl tree */
3197	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
3198	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3199
3200	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3201		int error;
3202
3203		error = hn_tx_ring_create(sc, i);
3204		if (error)
3205			return error;
3206	}
3207
3208	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
3209	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3210	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
3211	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
3212	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
3213	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3214	    __offsetof(struct hn_tx_ring, hn_send_failed),
3215	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
3216	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
3217	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3218	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
3219	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
3220	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
3221	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3222	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
3223	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
3224	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
3225	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3226	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
3227	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney sends");
3228	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
3229	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3230	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
3231	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
3232	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
3233	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
3234	    "# of total TX descs");
3235	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
3236	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
3237	    "Chimney send packet size upper boundary");
3238	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
3239	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3240	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
3241	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
3242	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3243	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
3244	    hn_tx_conf_int_sysctl, "I",
3245	    "Size of the packet for direct transmission");
3246	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
3247	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3248	    __offsetof(struct hn_tx_ring, hn_sched_tx),
3249	    hn_tx_conf_int_sysctl, "I",
3250	    "Always schedule transmission "
3251	    "instead of doing direct transmission");
3252	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
3253	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
3254	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
3255	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
3256
3257	return 0;
3258}
3259
3260static void
3261hn_set_chim_size(struct hn_softc *sc, int chim_size)
3262{
3263	int i;
3264
3265	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3266		sc->hn_tx_ring[i].hn_chim_size = chim_size;
3267}
3268
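/*
 * Clamp the TSO maximum payload size between the NDIS-imposed minimum
 * (hn_ndis_tso_sgmin * MTU) and min(IP_MAXPACKET, hn_ndis_tso_szmax),
 * then subtract the Ethernet + VLAN header length to set if_hw_tsomax.
 */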
3269static void
3270hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
3271{
3272	struct ifnet *ifp = sc->hn_ifp;
3273	int tso_minlen;
3274
3275	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
3276		return;
3277
3278	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
3279	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
3280	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
3281
3282	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
3283	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
3284	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
3285
3286	if (tso_maxlen < tso_minlen)
3287		tso_maxlen = tso_minlen;
3288	else if (tso_maxlen > IP_MAXPACKET)
3289		tso_maxlen = IP_MAXPACKET;
3290	if (tso_maxlen > sc->hn_ndis_tso_szmax)
3291		tso_maxlen = sc->hn_ndis_tso_szmax;
3292	ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
3293	if (bootverbose)
3294		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
3295}
3296
3297static void
3298hn_fixup_tx_data(struct hn_softc *sc)
3299{
3300	uint64_t csum_assist;
3301	int i;
3302
3303	hn_set_chim_size(sc, sc->hn_chim_szmax);
3304	if (hn_tx_chimney_size > 0 &&
3305	    hn_tx_chimney_size < sc->hn_chim_szmax)
3306		hn_set_chim_size(sc, hn_tx_chimney_size);
3307
3308	csum_assist = 0;
3309	if (sc->hn_caps & HN_CAP_IPCS)
3310		csum_assist |= CSUM_IP;
3311	if (sc->hn_caps & HN_CAP_TCP4CS)
3312		csum_assist |= CSUM_IP_TCP;
3313	if (sc->hn_caps & HN_CAP_UDP4CS)
3314		csum_assist |= CSUM_IP_UDP;
3315#ifdef notyet
3316	if (sc->hn_caps & HN_CAP_TCP6CS)
3317		csum_assist |= CSUM_IP6_TCP;
3318	if (sc->hn_caps & HN_CAP_UDP6CS)
3319		csum_assist |= CSUM_IP6_UDP;
3320#endif
3321	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3322		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
3323
3324	if (sc->hn_caps & HN_CAP_HASHVAL) {
3325		/*
3326		 * Support HASHVAL pktinfo on TX path.
3327		 */
3328		if (bootverbose)
3329			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
3330		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3331			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
3332	}
3333}
3334
3335static void
3336hn_destroy_tx_data(struct hn_softc *sc)
3337{
3338	int i;
3339
3340	if (sc->hn_chim != NULL) {
3341		hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
3342		sc->hn_chim = NULL;
3343	}
3344
3345	if (sc->hn_tx_ring_cnt == 0)
3346		return;
3347
3348	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3349		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
3350
3351	free(sc->hn_tx_ring, M_DEVBUF);
3352	sc->hn_tx_ring = NULL;
3353
3354	sc->hn_tx_ring_cnt = 0;
3355	sc->hn_tx_ring_inuse = 0;
3356}
3357
3358static void
3359hn_start_taskfunc(void *xtxr, int pending __unused)
3360{
3361	struct hn_tx_ring *txr = xtxr;
3362
3363	mtx_lock(&txr->hn_tx_lock);
3364	hn_start_locked(txr, 0);
3365	mtx_unlock(&txr->hn_tx_lock);
3366}
3367
3368static void
3369hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
3370{
3371	struct hn_tx_ring *txr = xtxr;
3372
3373	mtx_lock(&txr->hn_tx_lock);
3374	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
3375	hn_start_locked(txr, 0);
3376	mtx_unlock(&txr->hn_tx_lock);
3377}
3378
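/*
 * Drain the per-ring mbuf bufring and transmit the queued packets.
 * 'len' > 0 means "direct transmission": if a packet larger than 'len'
 * is encountered it is put back and 1 is returned so that the caller
 * defers the rest of the work to the TX taskqueue.  The ring is marked
 * oactive when TX descriptors run out or a send fails.
 */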
3379static int
3380hn_xmit(struct hn_tx_ring *txr, int len)
3381{
3382	struct hn_softc *sc = txr->hn_sc;
3383	struct ifnet *ifp = sc->hn_ifp;
3384	struct mbuf *m_head;
3385
3386	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3387	KASSERT(hn_use_if_start == 0,
3388	    ("hn_xmit is called, when if_start is enabled"));
3389
3390	if (__predict_false(txr->hn_suspended))
3391		return 0;
3392
3393	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
3394		return 0;
3395
3396	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
3397		struct hn_txdesc *txd;
3398		int error;
3399
3400		if (len > 0 && m_head->m_pkthdr.len > len) {
3401			/*
3402			 * Sending this packet could be time consuming; let
3403			 * the caller dispatch it (and any follow-up packets)
3404			 * to the TX taskqueue.
3405			 */
3406			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
3407			return 1;
3408		}
3409
3410		txd = hn_txdesc_get(txr);
3411		if (txd == NULL) {
3412			txr->hn_no_txdescs++;
3413			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
3414			txr->hn_oactive = 1;
3415			break;
3416		}
3417
3418		error = hn_encap(txr, txd, &m_head);
3419		if (error) {
3420			/* Both txd and m_head are freed; discard */
3421			drbr_advance(ifp, txr->hn_mbuf_br);
3422			continue;
3423		}
3424
3425		error = hn_txpkt(ifp, txr, txd);
3426		if (__predict_false(error)) {
3427			/* txd is freed, but m_head is not */
3428			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
3429			txr->hn_oactive = 1;
3430			break;
3431		}
3432
3433		/* Sent */
3434		drbr_advance(ifp, txr->hn_mbuf_br);
3435	}
3436	return 0;
3437}
3438
3439static int
3440hn_transmit(struct ifnet *ifp, struct mbuf *m)
3441{
3442	struct hn_softc *sc = ifp->if_softc;
3443	struct hn_tx_ring *txr;
3444	int error, idx = 0;
3445
3446	/*
3447	 * Select the TX ring based on flowid
3448	 */
3449	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
3450		idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
3451	txr = &sc->hn_tx_ring[idx];
3452
3453	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
3454	if (error) {
3455		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
3456		return error;
3457	}
3458
3459	if (txr->hn_oactive)
3460		return 0;
3461
3462	if (txr->hn_sched_tx)
3463		goto do_sched;
3464
3465	if (mtx_trylock(&txr->hn_tx_lock)) {
3466		int sched;
3467
3468		sched = hn_xmit(txr, txr->hn_direct_tx_size);
3469		mtx_unlock(&txr->hn_tx_lock);
3470		if (!sched)
3471			return 0;
3472	}
3473do_sched:
3474	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
3475	return 0;
3476}
3477
3478static void
3479hn_tx_ring_qflush(struct hn_tx_ring *txr)
3480{
3481	struct mbuf *m;
3482
3483	mtx_lock(&txr->hn_tx_lock);
3484	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
3485		m_freem(m);
3486	mtx_unlock(&txr->hn_tx_lock);
3487}
3488
3489static void
3490hn_xmit_qflush(struct ifnet *ifp)
3491{
3492	struct hn_softc *sc = ifp->if_softc;
3493	int i;
3494
3495	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3496		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
3497	if_qflush(ifp);
3498}
3499
3500static void
3501hn_xmit_txeof(struct hn_tx_ring *txr)
3502{
3503
3504	if (txr->hn_sched_tx)
3505		goto do_sched;
3506
3507	if (mtx_trylock(&txr->hn_tx_lock)) {
3508		int sched;
3509
3510		txr->hn_oactive = 0;
3511		sched = hn_xmit(txr, txr->hn_direct_tx_size);
3512		mtx_unlock(&txr->hn_tx_lock);
3513		if (sched) {
3514			taskqueue_enqueue(txr->hn_tx_taskq,
3515			    &txr->hn_tx_task);
3516		}
3517	} else {
3518do_sched:
3519		/*
3520		 * Release oactive early, in the hope that others
3521		 * could catch up.  The task will clear oactive
3522		 * again, with hn_tx_lock held, to avoid possible
3523		 * races.
3524		 */
3525		txr->hn_oactive = 0;
3526		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
3527	}
3528}
3529
3530static void
3531hn_xmit_taskfunc(void *xtxr, int pending __unused)
3532{
3533	struct hn_tx_ring *txr = xtxr;
3534
3535	mtx_lock(&txr->hn_tx_lock);
3536	hn_xmit(txr, 0);
3537	mtx_unlock(&txr->hn_tx_lock);
3538}
3539
3540static void
3541hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
3542{
3543	struct hn_tx_ring *txr = xtxr;
3544
3545	mtx_lock(&txr->hn_tx_lock);
3546	txr->hn_oactive = 0;
3547	hn_xmit(txr, 0);
3548	mtx_unlock(&txr->hn_tx_lock);
3549}
3550
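/*
 * Associate a VMBus channel with its RX ring (and TX ring, when the
 * sub-channel index is within the TX ring range), bind the channel to
 * a CPU, and open it on the ring's TX/RX bufring memory with
 * hn_chan_callback() as the packet handler.
 */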
3551static int
3552hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
3553{
3554	struct vmbus_chan_br cbr;
3555	struct hn_rx_ring *rxr;
3556	struct hn_tx_ring *txr = NULL;
3557	int idx, error;
3558
3559	idx = vmbus_chan_subidx(chan);
3560
3561	/*
3562	 * Link this channel to RX/TX ring.
3563	 */
3564	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
3565	    ("invalid channel index %d, should be >= 0 && < %d",
3566	     idx, sc->hn_rx_ring_inuse));
3567	rxr = &sc->hn_rx_ring[idx];
3568	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
3569	    ("RX ring %d already attached", idx));
3570	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
3571
3572	if (bootverbose) {
3573		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
3574		    idx, vmbus_chan_id(chan));
3575	}
3576
3577	if (idx < sc->hn_tx_ring_inuse) {
3578		txr = &sc->hn_tx_ring[idx];
3579		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
3580		    ("TX ring %d already attached", idx));
3581		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
3582
3583		txr->hn_chan = chan;
3584		if (bootverbose) {
3585			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
3586			    idx, vmbus_chan_id(chan));
3587		}
3588	}
3589
3590	/* Bind this channel to a proper CPU. */
3591	vmbus_chan_cpu_set(chan, (sc->hn_cpu + idx) % mp_ncpus);
3592
3593	/*
3594	 * Open this channel
3595	 */
3596	cbr.cbr = rxr->hn_br;
3597	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
3598	cbr.cbr_txsz = HN_TXBR_SIZE;
3599	cbr.cbr_rxsz = HN_RXBR_SIZE;
3600	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
3601	if (error) {
3602		if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
3603		    vmbus_chan_id(chan), error);
3604		rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
3605		if (txr != NULL)
3606			txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
3607	}
3608	return (error);
3609}
3610
3611static void
3612hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
3613{
3614	struct hn_rx_ring *rxr;
3615	int idx;
3616
3617	idx = vmbus_chan_subidx(chan);
3618
3619	/*
3620	 * Unlink this channel from its RX/TX ring.
3621	 */
3622	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
3623	    ("invalid channel index %d, should be >= 0 && < %d",
3624	     idx, sc->hn_rx_ring_inuse));
3625	rxr = &sc->hn_rx_ring[idx];
3626	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
3627	    ("RX ring %d is not attached", idx));
3628	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
3629
3630	if (idx < sc->hn_tx_ring_inuse) {
3631		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
3632
3633		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
3634		    ("TX ring %d is not attached", idx));
3635		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
3636	}
3637
3638	/*
3639	 * Close this channel.
3640	 *
3641	 * NOTE:
3642	 * Channel closing does _not_ destroy the target channel.
3643	 */
3644	vmbus_chan_close(chan);
3645}
3646
3647static int
3648hn_attach_subchans(struct hn_softc *sc)
3649{
3650	struct vmbus_channel **subchans;
3651	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
3652	int i, error = 0;
3653
3654	if (subchan_cnt == 0)
3655		return (0);
3656
3657	/* Attach the sub-channels. */
3658	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
3659	for (i = 0; i < subchan_cnt; ++i) {
3660		error = hn_chan_attach(sc, subchans[i]);
3661		if (error)
3662			break;
3663	}
3664	vmbus_subchan_rel(subchans, subchan_cnt);
3665
3666	if (error) {
3667		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
3668	} else {
3669		if (bootverbose) {
3670			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
3671			    subchan_cnt);
3672		}
3673	}
3674	return (error);
3675}
3676
3677static void
3678hn_detach_allchans(struct hn_softc *sc)
3679{
3680	struct vmbus_channel **subchans;
3681	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
3682	int i;
3683
3684	if (subchan_cnt == 0)
3685		goto back;
3686
3687	/* Detach the sub-channels. */
3688	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
3689	for (i = 0; i < subchan_cnt; ++i)
3690		hn_chan_detach(sc, subchans[i]);
3691	vmbus_subchan_rel(subchans, subchan_cnt);
3692
3693back:
3694	/*
3695	 * Detach the primary channel, _after_ all sub-channels
3696	 * are detached.
3697	 */
3698	hn_chan_detach(sc, sc->hn_prichan);
3699
3700	/* Wait for sub-channels to be destroyed, if any. */
3701	vmbus_subchan_drain(sc->hn_prichan);
3702
3703#ifdef INVARIANTS
3704	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3705		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
3706		    HN_RX_FLAG_ATTACHED) == 0,
3707		    ("%dth RX ring is still attached", i));
3708	}
3709	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3710		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
3711		    HN_TX_FLAG_ATTACHED) == 0,
3712		    ("%dth TX ring is still attached", i));
3713	}
3714#endif
3715}
3716
3717static int
3718hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
3719{
3720	struct vmbus_channel **subchans;
3721	int nchan, rxr_cnt, error;
3722
3723	nchan = *nsubch + 1;
3724	if (nchan == 1) {
3725		/*
3726		 * Multiple RX/TX rings are not requested.
3727		 */
3728		*nsubch = 0;
3729		return (0);
3730	}
3731
3732	/*
3733	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
3734	 * table entries.
3735	 */
3736	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
3737	if (error) {
3738		/* No RSS; this is benign. */
3739		*nsubch = 0;
3740		return (0);
3741	}
3742	if (bootverbose) {
3743		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
3744		    rxr_cnt, nchan);
3745	}
3746
3747	if (nchan > rxr_cnt)
3748		nchan = rxr_cnt;
3749	if (nchan == 1) {
3750		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
3751		*nsubch = 0;
3752		return (0);
3753	}
3754
3755	/*
3756	 * Allocate sub-channels from NVS.
3757	 */
3758	*nsubch = nchan - 1;
3759	error = hn_nvs_alloc_subchans(sc, nsubch);
3760	if (error || *nsubch == 0) {
3761		/* Failed to allocate sub-channels. */
3762		*nsubch = 0;
3763		return (0);
3764	}
3765
3766	/*
3767	 * Wait for all sub-channels to become ready before moving on.
3768	 */
3769	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
3770	vmbus_subchan_rel(subchans, *nsubch);
3771	return (0);
3772}
3773
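/*
 * Bring up the synthetic parts in order: the primary channel first,
 * then NVS, then RNDIS, and finally the optional sub-channels along
 * with the RSS key and indirect table.  HN_FLAG_SYNTH_ATTACHED is set
 * only after all of the above have succeeded.
 */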
3774static int
3775hn_synth_attach(struct hn_softc *sc, int mtu)
3776{
3777	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
3778	int error, nsubch, nchan, i;
3779	uint32_t old_caps;
3780
3781	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
3782	    ("synthetic parts were attached"));
3783
3784	/* Save capabilities for later verification. */
3785	old_caps = sc->hn_caps;
3786	sc->hn_caps = 0;
3787
3788	/* Clear RSS stuff. */
3789	sc->hn_rss_ind_size = 0;
3790	sc->hn_rss_hash = 0;
3791
3792	/*
3793	 * Attach the primary channel _before_ attaching NVS and RNDIS.
3794	 */
3795	error = hn_chan_attach(sc, sc->hn_prichan);
3796	if (error)
3797		return (error);
3798
3799	/*
3800	 * Attach NVS.
3801	 */
3802	error = hn_nvs_attach(sc, mtu);
3803	if (error)
3804		return (error);
3805
3806	/*
3807	 * Attach RNDIS _after_ NVS is attached.
3808	 */
3809	error = hn_rndis_attach(sc, mtu);
3810	if (error)
3811		return (error);
3812
3813	/*
3814	 * Make sure the capabilities have not changed.
3815	 */
3816	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
3817		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
3818		    old_caps, sc->hn_caps);
3819		/* Restore old capabilities and abort. */
3820		sc->hn_caps = old_caps;
3821		return (ENXIO);
3822	}
3823
3824	/*
3825	 * Allocate sub-channels for multi-TX/RX rings.
3826	 *
3827	 * NOTE:
3828	 * The # of RX rings that can be used is equivalent to the # of
3829	 * channels to be requested.
3830	 */
3831	nsubch = sc->hn_rx_ring_cnt - 1;
3832	error = hn_synth_alloc_subchans(sc, &nsubch);
3833	if (error)
3834		return (error);
3835
3836	nchan = nsubch + 1;
3837	if (nchan == 1) {
3838		/* Only the primary channel can be used; done */
3839		goto back;
3840	}
3841
3842	/*
3843	 * Configure RSS key and indirect table _after_ all sub-channels
3844	 * are allocated.
3845	 */
3846
3847	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
3848		/*
3849		 * RSS key is not set yet; set it to the default RSS key.
3850		 */
3851		if (bootverbose)
3852			if_printf(sc->hn_ifp, "setup default RSS key\n");
3853		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
3854		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
3855	}
3856
3857	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
3858		/*
3859		 * RSS indirect table is not set yet; set it up in round-
3860		 * robin fashion.
3861		 */
3862		if (bootverbose) {
3863			if_printf(sc->hn_ifp, "setup default RSS indirect "
3864			    "table\n");
3865		}
3866		for (i = 0; i < NDIS_HASH_INDCNT; ++i)
3867			rss->rss_ind[i] = i % nchan;
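		/*
		 * E.g. with nchan == 4 the fill above yields
		 * rss_ind[] = { 0, 1, 2, 3, 0, 1, 2, 3, ... }, spreading
		 * the hash buckets evenly across the usable channels.
		 */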
3868		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
3869	} else {
3870		/*
3871		 * # of usable channels may have changed, so we have to
3872		 * make sure that all entries in the RSS indirect table
3873		 * are valid.
3874		 */
3875		hn_rss_ind_fixup(sc, nchan);
3876	}
3877
3878	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
3879	if (error) {
3880		/*
3881		 * Failed to configure RSS key or indirect table; only
3882		 * the primary channel can be used.
3883		 */
3884		nchan = 1;
3885	}
3886back:
3887	/*
3888	 * Set the # of TX/RX rings that could be used according to
3889	 * the # of channels that NVS offered.
3890	 */
3891	hn_set_ring_inuse(sc, nchan);
3892
3893	/*
3894	 * Attach the sub-channels, if any.
3895	 */
3896	error = hn_attach_subchans(sc);
3897	if (error)
3898		return (error);
3899
3900	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
3901	return (0);
3902}
3903
3904/*
3905 * NOTE:
3906 * The interface must have been suspended through hn_suspend(), before
3907 * this function gets called.
3908 */
3909static void
3910hn_synth_detach(struct hn_softc *sc)
3911{
3912	HN_LOCK_ASSERT(sc);
3913
3914	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
3915	    ("synthetic parts were not attached"));
3916
3917	/* Detach the RNDIS first. */
3918	hn_rndis_detach(sc);
3919
3920	/* Detach NVS. */
3921	hn_nvs_detach(sc);
3922
3923	/* Detach all of the channels. */
3924	hn_detach_allchans(sc);
3925
3926	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
3927}
3928
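/*
 * Record how many RX and TX rings will actually be used; ring_cnt is
 * the number of usable channels, and the TX ring count is clamped
 * to it.
 */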
3929static void
3930hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
3931{
3932	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
3933	    ("invalid ring count %d", ring_cnt));
3934
3935	if (sc->hn_tx_ring_cnt > ring_cnt)
3936		sc->hn_tx_ring_inuse = ring_cnt;
3937	else
3938		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3939	sc->hn_rx_ring_inuse = ring_cnt;
3940
3941	if (bootverbose) {
3942		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
3943		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
3944	}
3945}
3946
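/*
 * Wait, one tick at a time, until both the RX and TX bufrings of the
 * channel are empty, then drain the channel's interrupt processing.
 */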
3947static void
3948hn_chan_drain(struct vmbus_channel *chan)
3949{
3950
3951	while (!vmbus_chan_rx_empty(chan) || !vmbus_chan_tx_empty(chan))
3952		pause("waitch", 1);
3953	vmbus_chan_intr_drain(chan);
3954}
3955
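/*
 * Quiesce the data path: mark all in-use TX rings suspended and wait
 * for their pending sends, clear the RX filter so the host stops
 * delivering packets, give RNDIS roughly 200ms to flush, then drain
 * the bufrings and interrupts of every channel in use.
 */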
3956static void
3957hn_suspend_data(struct hn_softc *sc)
3958{
3959	struct vmbus_channel **subch = NULL;
3960	int i, nsubch;
3961
3962	HN_LOCK_ASSERT(sc);
3963
3964	/*
3965	 * Suspend TX.
3966	 */
3967	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
3968		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
3969
3970		mtx_lock(&txr->hn_tx_lock);
3971		txr->hn_suspended = 1;
3972		mtx_unlock(&txr->hn_tx_lock);
3973		/* No one is able to send more packets now. */
3974
3975		/* Wait for all pending sends to finish. */
3976		while (hn_tx_ring_pending(txr))
3977			pause("hnwtx", 1 /* 1 tick */);
3978
3979		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
3980		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
3981	}
3982
3983	/*
3984	 * Disable RX by clearing RX filter.
3985	 */
3986	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
3987	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter);
3988
3989	/*
3990	 * Give RNDIS enough time to flush all pending data packets.
3991	 */
3992	pause("waitrx", (200 * hz) / 1000);
3993
3994	/*
3995	 * Drain RX/TX bufrings and interrupts.
3996	 */
3997	nsubch = sc->hn_rx_ring_inuse - 1;
3998	if (nsubch > 0)
3999		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4000
4001	if (subch != NULL) {
4002		for (i = 0; i < nsubch; ++i)
4003			hn_chan_drain(subch[i]);
4004	}
4005	hn_chan_drain(sc->hn_prichan);
4006
4007	if (subch != NULL)
4008		vmbus_subchan_rel(subch, nsubch);
4009}
4010
4011static void
4012hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
4013{
4014
4015	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
4016}
4017
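/*
 * Quiesce management operations: clear hn_mgmt_taskq on the primary
 * channel's task context, so that hn_mgmt_taskq0 can no longer be
 * reached through it, then wait for all queued management tasks to
 * complete.
 */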
4018static void
4019hn_suspend_mgmt(struct hn_softc *sc)
4020{
4021	struct task task;
4022
4023	HN_LOCK_ASSERT(sc);
4024
4025	/*
4026	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
4027	 * through hn_mgmt_taskq.
4028	 */
4029	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
4030	vmbus_chan_run_task(sc->hn_prichan, &task);
4031
4032	/*
4033	 * Make sure that all pending management tasks are completed.
4034	 */
4035	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
4036	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
4037	taskqueue_drain_all(sc->hn_mgmt_taskq0);
4038}
4039
4040static void
4041hn_suspend(struct hn_softc *sc)
4042{
4043
4044	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
4045		hn_suspend_data(sc);
4046	hn_suspend_mgmt(sc);
4047}
4048
4049static void
4050hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
4051{
4052	int i;
4053
4054	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
4055	    ("invalid TX ring count %d", tx_ring_cnt));
4056
4057	for (i = 0; i < tx_ring_cnt; ++i) {
4058		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4059
4060		mtx_lock(&txr->hn_tx_lock);
4061		txr->hn_suspended = 0;
4062		mtx_unlock(&txr->hn_tx_lock);
4063	}
4064}
4065
4066static void
4067hn_resume_data(struct hn_softc *sc)
4068{
4069	int i;
4070
4071	HN_LOCK_ASSERT(sc);
4072
4073	/*
4074	 * Re-enable RX.
4075	 */
4076	hn_set_rxfilter(sc);
4077
4078	/*
4079	 * Make sure to clear suspend status on "all" TX rings,
4080	 * since hn_tx_ring_inuse can be changed after
4081	 * hn_suspend_data().
4082	 */
4083	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
4084
4085	if (!hn_use_if_start) {
4086		/*
4087		 * Flush unused drbrs, since hn_tx_ring_inuse may be
4088		 * reduced.
4089		 */
4090		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
4091			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4092	}
4093
4094	/*
4095	 * Kick start TX.
4096	 */
4097	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4098		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4099
4100		/*
4101		 * Use the txeof task, so that any pending OACTIVE state
4102		 * can be cleared properly.
4103		 */
4104		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4105	}
4106}
4107
4108static void
4109hn_resume_mgmt(struct hn_softc *sc)
4110{
4111
4112	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
4113
4114	/*
4115	 * Kick off network change detection, if it was pending.
4116	 * If no network change was pending, start link status
4117	 * checks, which is more lightweight than network change
4118	 * detection.
4119	 */
4120	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
4121		hn_change_network(sc);
4122	else
4123		hn_update_link_status(sc);
4124}
4125
4126static void
4127hn_resume(struct hn_softc *sc)
4128{
4129
4130	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
4131		hn_resume_data(sc);
4132	hn_resume_mgmt(sc);
4133}
4134
4135static void
4136hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
4137{
4138	const struct rndis_status_msg *msg;
4139	int ofs;
4140
4141	if (dlen < sizeof(*msg)) {
4142		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
4143		return;
4144	}
4145	msg = data;
4146
4147	switch (msg->rm_status) {
4148	case RNDIS_STATUS_MEDIA_CONNECT:
4149	case RNDIS_STATUS_MEDIA_DISCONNECT:
4150		hn_update_link_status(sc);
4151		break;
4152
4153	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
4154		/* Not really useful; ignore. */
4155		break;
4156
4157	case RNDIS_STATUS_NETWORK_CHANGE:
4158		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
4159		if (dlen < ofs + msg->rm_stbuflen ||
4160		    msg->rm_stbuflen < sizeof(uint32_t)) {
4161			if_printf(sc->hn_ifp, "network changed\n");
4162		} else {
4163			uint32_t change;
4164
4165			memcpy(&change, ((const uint8_t *)msg) + ofs,
4166			    sizeof(change));
4167			if_printf(sc->hn_ifp, "network changed, change %u\n",
4168			    change);
4169		}
4170		hn_change_network(sc);
4171		break;
4172
4173	default:
4174		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
4175		    msg->rm_status);
4176		break;
4177	}
4178}
4179
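/*
 * Walk the RNDIS per-packet-info list and extract the pieces the RX
 * path cares about (VLAN, checksum, hash value and hash info) into
 * 'info'.  Unknown types are skipped, malformed elements return
 * EINVAL, and the walk stops early once all interesting types have
 * been found.
 */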
4180static int
4181hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
4182{
4183	const struct rndis_pktinfo *pi = info_data;
4184	uint32_t mask = 0;
4185
4186	while (info_dlen != 0) {
4187		const void *data;
4188		uint32_t dlen;
4189
4190		if (__predict_false(info_dlen < sizeof(*pi)))
4191			return (EINVAL);
4192		if (__predict_false(info_dlen < pi->rm_size))
4193			return (EINVAL);
4194		info_dlen -= pi->rm_size;
4195
4196		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
4197			return (EINVAL);
4198		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
4199			return (EINVAL);
4200		dlen = pi->rm_size - pi->rm_pktinfooffset;
4201		data = pi->rm_data;
4202
4203		switch (pi->rm_type) {
4204		case NDIS_PKTINFO_TYPE_VLAN:
4205			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
4206				return (EINVAL);
4207			info->vlan_info = *((const uint32_t *)data);
4208			mask |= HN_RXINFO_VLAN;
4209			break;
4210
4211		case NDIS_PKTINFO_TYPE_CSUM:
4212			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
4213				return (EINVAL);
4214			info->csum_info = *((const uint32_t *)data);
4215			mask |= HN_RXINFO_CSUM;
4216			break;
4217
4218		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
4219			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
4220				return (EINVAL);
4221			info->hash_value = *((const uint32_t *)data);
4222			mask |= HN_RXINFO_HASHVAL;
4223			break;
4224
4225		case HN_NDIS_PKTINFO_TYPE_HASHINF:
4226			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
4227				return (EINVAL);
4228			info->hash_info = *((const uint32_t *)data);
4229			mask |= HN_RXINFO_HASHINF;
4230			break;
4231
4232		default:
4233			goto next;
4234		}
4235
4236		if (mask == HN_RXINFO_ALL) {
4237			/* All found; done */
4238			break;
4239		}
4240next:
4241		pi = (const struct rndis_pktinfo *)
4242		    ((const uint8_t *)pi + pi->rm_size);
4243	}
4244
4245	/*
4246	 * Final fixup.
4247	 * - If there is no hash value, invalidate the hash info.
4248	 */
4249	if ((mask & HN_RXINFO_HASHVAL) == 0)
4250		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
4251	return (0);
4252}
4253
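/*
 * Return true if the region [off, off + len) overlaps the region
 * [check_off, check_off + check_len); both are offsets into the same
 * RNDIS packet message.
 */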
4254static __inline bool
4255hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
4256{
4257
4258	if (off < check_off) {
4259		if (__predict_true(off + len <= check_off))
4260			return (false);
4261	} else if (off > check_off) {
4262		if (__predict_true(check_off + check_len <= off))
4263			return (false);
4264	}
4265	return (true);
4266}
4267
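/*
 * Validate an RNDIS data message: check the overall length, the data,
 * OOB and per-packet-info offsets, and that those three regions do not
 * overlap, then hand the Ethernet frame along with the extracted RX
 * info to hn_rxpkt().
 */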
4268static void
4269hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
4270{
4271	const struct rndis_packet_msg *pkt;
4272	struct hn_rxinfo info;
4273	int data_off, pktinfo_off, data_len, pktinfo_len;
4274
4275	/*
4276	 * Check length.
4277	 */
4278	if (__predict_false(dlen < sizeof(*pkt))) {
4279		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
4280		return;
4281	}
4282	pkt = data;
4283
4284	if (__predict_false(dlen < pkt->rm_len)) {
4285		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
4286		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
4287		return;
4288	}
4289	if (__predict_false(pkt->rm_len <
4290	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
4291		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
4292		    "msglen %u, data %u, oob %u, pktinfo %u\n",
4293		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
4294		    pkt->rm_pktinfolen);
4295		return;
4296	}
4297	if (__predict_false(pkt->rm_datalen == 0)) {
4298		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
4299		return;
4300	}
4301
4302	/*
4303	 * Check offsets.
4304	 */
4305#define IS_OFFSET_INVALID(ofs)			\
4306	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
4307	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
4308
4309	/* XXX Hyper-V does not meet data offset alignment requirement */
4310	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
4311		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4312		    "data offset %u\n", pkt->rm_dataoffset);
4313		return;
4314	}
4315	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
4316	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
4317		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4318		    "oob offset %u\n", pkt->rm_oobdataoffset);
4319		return;
4320	}
4321	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
4322	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
4323		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4324		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
4325		return;
4326	}
4327
4328#undef IS_OFFSET_INVALID
4329
4330	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
4331	data_len = pkt->rm_datalen;
4332	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
4333	pktinfo_len = pkt->rm_pktinfolen;
4334
4335	/*
4336	 * Check OOB coverage.
4337	 */
4338	if (__predict_false(pkt->rm_oobdatalen != 0)) {
4339		int oob_off, oob_len;
4340
4341		if_printf(rxr->hn_ifp, "got oobdata\n");
4342		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
4343		oob_len = pkt->rm_oobdatalen;
4344
4345		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
4346			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4347			    "oob overflow, msglen %u, oob abs %d len %d\n",
4348			    pkt->rm_len, oob_off, oob_len);
4349			return;
4350		}
4351
4352		/*
4353		 * Check against data.
4354		 */
4355		if (hn_rndis_check_overlap(oob_off, oob_len,
4356		    data_off, data_len)) {
4357			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4358			    "oob overlaps data, oob abs %d len %d, "
4359			    "data abs %d len %d\n",
4360			    oob_off, oob_len, data_off, data_len);
4361			return;
4362		}
4363
4364		/*
4365		 * Check against pktinfo.
4366		 */
4367		if (pktinfo_len != 0 &&
4368		    hn_rndis_check_overlap(oob_off, oob_len,
4369		    pktinfo_off, pktinfo_len)) {
4370			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4371			    "oob overlaps pktinfo, oob abs %d len %d, "
4372			    "pktinfo abs %d len %d\n",
4373			    oob_off, oob_len, pktinfo_off, pktinfo_len);
4374			return;
4375		}
4376	}
4377
4378	/*
4379	 * Check per-packet-info coverage and find useful per-packet-info.
4380	 */
4381	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
4382	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
4383	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
4384	if (__predict_true(pktinfo_len != 0)) {
4385		bool overlap;
4386		int error;
4387
4388		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
4389			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4390			    "pktinfo overflow, msglen %u, "
4391			    "pktinfo abs %d len %d\n",
4392			    pkt->rm_len, pktinfo_off, pktinfo_len);
4393			return;
4394		}
4395
4396		/*
4397		 * Check packet info coverage.
4398		 */
4399		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
4400		    data_off, data_len);
4401		if (__predict_false(overlap)) {
4402			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4403			    "pktinfo overlaps data, pktinfo abs %d len %d, "
4404			    "data abs %d len %d\n",
4405			    pktinfo_off, pktinfo_len, data_off, data_len);
4406			return;
4407		}
4408
4409		/*
4410		 * Find useful per-packet-info.
4411		 */
4412		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
4413		    pktinfo_len, &info);
4414		if (__predict_false(error)) {
4415			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
4416			    "pktinfo\n");
4417			return;
4418		}
4419	}
4420
4421	if (__predict_false(data_off + data_len > pkt->rm_len)) {
4422		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4423		    "data overflow, msglen %u, data abs %d len %d\n",
4424		    pkt->rm_len, data_off, data_len);
4425		return;
4426	}
4427	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
4428}
4429
4430static __inline void
4431hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
4432{
4433	const struct rndis_msghdr *hdr;
4434
4435	if (__predict_false(dlen < sizeof(*hdr))) {
4436		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
4437		return;
4438	}
4439	hdr = data;
4440
4441	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
4442		/* Hot data path. */
4443		hn_rndis_rx_data(rxr, data, dlen);
4444		/* Done! */
4445		return;
4446	}
4447
4448	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
4449		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
4450	else
4451		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
4452}
4453
4454static void
4455hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
4456{
4457	const struct hn_nvs_hdr *hdr;
4458
4459	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
4460		if_printf(sc->hn_ifp, "invalid nvs notify\n");
4461		return;
4462	}
4463	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
4464
4465	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
4466		/* Useless; ignore */
4467		return;
4468	}
4469	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
4470}
4471
4472static void
4473hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
4474    const struct vmbus_chanpkt_hdr *pkt)
4475{
4476	struct hn_nvs_sendctx *sndc;
4477
4478	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
4479	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
4480	    VMBUS_CHANPKT_DATALEN(pkt));
4481	/*
4482	 * NOTE:
4483	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
4484	 * its callback.
4485	 */
4486}
4487
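/*
 * An RXBUF channel packet carries an array of (offset, length) ranges
 * into the RX buffer; each range is one RNDIS message holding one
 * Ethernet frame.  Validate the channel packet, process every range,
 * and ack the RXBUF so that the hypervisor can recycle it.
 */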
4488static void
4489hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
4490    const struct vmbus_chanpkt_hdr *pkthdr)
4491{
4492	const struct vmbus_chanpkt_rxbuf *pkt;
4493	const struct hn_nvs_hdr *nvs_hdr;
4494	int count, i, hlen;
4495
4496	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
4497		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
4498		return;
4499	}
4500	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
4501
4502	/* Make sure that this is a RNDIS message. */
4503	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
4504		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
4505		    nvs_hdr->nvs_type);
4506		return;
4507	}
4508
4509	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
4510	if (__predict_false(hlen < sizeof(*pkt))) {
4511		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
4512		return;
4513	}
4514	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
4515
4516	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
4517		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
4518		    pkt->cp_rxbuf_id);
4519		return;
4520	}
4521
4522	count = pkt->cp_rxbuf_cnt;
4523	if (__predict_false(hlen <
4524	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
4525		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
4526		return;
4527	}
4528
4529	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
4530	for (i = 0; i < count; ++i) {
4531		int ofs, len;
4532
4533		ofs = pkt->cp_rxbuf[i].rb_ofs;
4534		len = pkt->cp_rxbuf[i].rb_len;
4535		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
4536			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflows rxbuf, "
4537			    "ofs %d, len %d\n", i, ofs, len);
4538			continue;
4539		}
4540		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
4541	}
4542
4543	/*
4544	 * Ack the consumed RXBUF associated w/ this channel packet,
4545	 * so that this RXBUF can be recycled by the hypervisor.
4546	 */
4547	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
4548}
4549
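/*
 * Send a completion (RNDIS ack) for the given transaction id, so that
 * the host can reuse the RXBUF.  EAGAIN from vmbus_chan_send() is
 * retried up to 10 times with a 100us delay; if it still fails, the
 * RXBUF is leaked.
 */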
4550static void
4551hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
4552    uint64_t tid)
4553{
4554	struct hn_nvs_rndis_ack ack;
4555	int retries, error;
4556
4557	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
4558	ack.nvs_status = HN_NVS_STATUS_OK;
4559
4560	retries = 0;
4561again:
4562	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
4563	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
4564	if (__predict_false(error == EAGAIN)) {
4565		/*
4566		 * NOTE:
4567		 * This should _not_ happen in the real world, since the
4568		 * consumption of the TX bufring from the TX path is
4569		 * controlled.
4570		 */
4571		if (rxr->hn_ack_failed == 0)
4572			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
4573		rxr->hn_ack_failed++;
4574		retries++;
4575		if (retries < 10) {
4576			DELAY(100);
4577			goto again;
4578		}
4579		/* RXBUF leaks! */
4580		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
4581	}
4582}
4583
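/*
 * Per-channel callback: pull channel packets out of the bufring one at
 * a time, growing the per-ring packet buffer on ENOBUFS, and dispatch
 * each packet by type (completion, RXBUF, inband notify) until EAGAIN
 * indicates that the bufring is empty.
 */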
4584static void
4585hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
4586{
4587	struct hn_rx_ring *rxr = xrxr;
4588	struct hn_softc *sc = rxr->hn_ifp->if_softc;
4589
4590	for (;;) {
4591		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
4592		int error, pktlen;
4593
4594		pktlen = rxr->hn_pktbuf_len;
4595		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
4596		if (__predict_false(error == ENOBUFS)) {
4597			void *nbuf;
4598			int nlen;
4599
4600			/*
4601			 * Expand channel packet buffer.
4602			 *
4603			 * XXX
4604			 * Use M_WAITOK here, since allocation failure
4605			 * is fatal.
4606			 */
4607			nlen = rxr->hn_pktbuf_len * 2;
4608			while (nlen < pktlen)
4609				nlen *= 2;
4610			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
4611
4612			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
4613			    rxr->hn_pktbuf_len, nlen);
4614
4615			free(rxr->hn_pktbuf, M_DEVBUF);
4616			rxr->hn_pktbuf = nbuf;
4617			rxr->hn_pktbuf_len = nlen;
4618			/* Retry! */
4619			continue;
4620		} else if (__predict_false(error == EAGAIN)) {
4621			/* No more channel packets; done! */
4622			break;
4623		}
4624		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
4625
4626		switch (pkt->cph_type) {
4627		case VMBUS_CHANPKT_TYPE_COMP:
4628			hn_nvs_handle_comp(sc, chan, pkt);
4629			break;
4630
4631		case VMBUS_CHANPKT_TYPE_RXBUF:
4632			hn_nvs_handle_rxbuf(rxr, chan, pkt);
4633			break;
4634
4635		case VMBUS_CHANPKT_TYPE_INBAND:
4636			hn_nvs_handle_notify(sc, pkt);
4637			break;
4638
4639		default:
4640			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
4641			    pkt->cph_type);
4642			break;
4643		}
4644	}
4645	hn_chan_rollup(rxr, rxr->hn_txr);
4646}
4647
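/*
 * If hn_share_tx_taskq is set, create a single TX taskqueue
 * (hn_tx_taskq) shared by all instances of the driver, optionally
 * pinning its thread to the CPU given by hn_bind_tx_taskq.
 */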
4648static void
4649hn_tx_taskq_create(void *arg __unused)
4650{
4651
4652	if (vm_guest != VM_GUEST_HV)
4653		return;
4654
4655	if (!hn_share_tx_taskq)
4656		return;
4657
4658	hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
4659	    taskqueue_thread_enqueue, &hn_tx_taskq);
4660	taskqueue_start_threads(&hn_tx_taskq, 1, PI_NET, "hn tx");
4661	if (hn_bind_tx_taskq >= 0) {
4662		int cpu = hn_bind_tx_taskq;
4663		struct task cpuset_task;
4664		cpuset_t cpu_set;
4665
4666		if (cpu > mp_ncpus - 1)
4667			cpu = mp_ncpus - 1;
4668		CPU_SETOF(cpu, &cpu_set);
4669		TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task, &cpu_set);
4670		taskqueue_enqueue(hn_tx_taskq, &cpuset_task);
4671		taskqueue_drain(hn_tx_taskq, &cpuset_task);
4672	}
4673}
4674SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
4675    hn_tx_taskq_create, NULL);
4676
4677static void
4678hn_tx_taskq_destroy(void *arg __unused)
4679{
4680
4681	if (hn_tx_taskq != NULL)
4682		taskqueue_free(hn_tx_taskq);
4683}
4684SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
4685    hn_tx_taskq_destroy, NULL);
4686