1/*-
2 * Copyright (c) 2010-2012 Citrix Inc.
3 * Copyright (c) 2009-2012,2016 Microsoft Corp.
4 * Copyright (c) 2012 NetApp Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice unmodified, this list of conditions, and the following
12 *    disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/*-
30 * Copyright (c) 2004-2006 Kip Macy
31 * All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 *    notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 *    notice, this list of conditions and the following disclaimer in the
40 *    documentation and/or other materials provided with the distribution.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 */
54
55#include <sys/cdefs.h>
56__FBSDID("$FreeBSD: stable/10/sys/dev/hyperv/netvsc/if_hn.c 310799 2016-12-30 01:59:19Z sephe $");
57
58#include "opt_inet6.h"
59#include "opt_inet.h"
60#include "opt_hn.h"
61
62#include <sys/param.h>
63#include <sys/bus.h>
64#include <sys/kernel.h>
65#include <sys/limits.h>
66#include <sys/malloc.h>
67#include <sys/mbuf.h>
68#include <sys/module.h>
69#include <sys/proc.h>
70#include <sys/queue.h>
71#include <sys/lock.h>
72#include <sys/smp.h>
73#include <sys/socket.h>
74#include <sys/sockio.h>
75#include <sys/sx.h>
76#include <sys/sysctl.h>
77#include <sys/systm.h>
78#include <sys/taskqueue.h>
79#include <sys/buf_ring.h>
80
81#include <machine/atomic.h>
82#include <machine/in_cksum.h>
83
84#include <net/bpf.h>
85#include <net/ethernet.h>
86#include <net/if.h>
87#include <net/if_arp.h>
88#include <net/if_media.h>
89#include <net/if_types.h>
90#include <net/if_var.h>
91#include <net/if_vlan_var.h>
92#include <net/rndis.h>
93
94#include <netinet/in_systm.h>
95#include <netinet/in.h>
96#include <netinet/ip.h>
97#include <netinet/ip6.h>
98#include <netinet/tcp.h>
99#include <netinet/tcp_lro.h>
100#include <netinet/udp.h>
101
102#include <dev/hyperv/include/hyperv.h>
103#include <dev/hyperv/include/hyperv_busdma.h>
104#include <dev/hyperv/include/vmbus.h>
105#include <dev/hyperv/include/vmbus_xact.h>
106
107#include <dev/hyperv/netvsc/ndis.h>
108#include <dev/hyperv/netvsc/if_hnreg.h>
109#include <dev/hyperv/netvsc/if_hnvar.h>
110#include <dev/hyperv/netvsc/hn_nvs.h>
111#include <dev/hyperv/netvsc/hn_rndis.h>
112
113#include "vmbus_if.h"
114
115#define HN_IFSTART_SUPPORT
116
117#define HN_RING_CNT_DEF_MAX		8
118
119/* YYY should get it from the underlying channel */
120#define HN_TX_DESC_CNT			512
121
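/*
 * Worst-case space reserved for the RNDIS packet message of a single TX
 * packet: the fixed header plus per-packet-info entries for the hash
 * value, VLAN tag, LSOv2 and TX checksum, which are the only pktinfo
 * types that hn_encap() appends on the transmit path.
 */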
122#define HN_RNDIS_PKT_LEN					\
123	(sizeof(struct rndis_packet_msg) +			\
124	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
125	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
126	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
127	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
128#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
129#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
130
131#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
132#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
133#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
134/* -1 for RNDIS packet message */
135#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
136
137#define HN_DIRECT_TX_SIZE_DEF		128
138
139#define HN_EARLY_TXEOF_THRESH		8
140
141#define HN_PKTBUF_LEN_DEF		(16 * 1024)
142
143#define HN_LROENT_CNT_DEF		128
144
145#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
146#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
147/* YYY 2*MTU is a bit rough, but should be good enough. */
148#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
149
150#define HN_LRO_ACKCNT_DEF		1
151
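/*
 * The softc is serialized by an sx lock.  Note that HN_LOCK() polls
 * sx_try_xlock() with a 1ms DELAY() between attempts instead of
 * blocking in sx_xlock().
 */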
152#define HN_LOCK_INIT(sc)		\
153	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
154#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
155#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
156#define HN_LOCK(sc)					\
157do {							\
158	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
159		DELAY(1000);				\
160} while (0)
161#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
162
163#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
164#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
165#define HN_CSUM_IP_HWASSIST(sc)		\
166	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
167#define HN_CSUM_IP6_HWASSIST(sc)	\
168	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
169
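/*
 * HN_PKTSIZE_MIN()/HN_PKTSIZE() give the buffer space a packet consumes
 * for aggregation/chimney sending: the RNDIS packet message reservation
 * plus the frame payload, rounded up to the aggregation alignment.
 * HN_RING_IDX2CPU() maps a ring index to a CPU, round-robin starting
 * from the per-device leader CPU (sc->hn_cpu).
 */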
170#define HN_PKTSIZE_MIN(align)		\
171	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
172	    HN_RNDIS_PKT_LEN, (align))
173#define HN_PKTSIZE(m, align)		\
174	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
175
176#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
177
178struct hn_txdesc {
179#ifndef HN_USE_TXDESC_BUFRING
180	SLIST_ENTRY(hn_txdesc)		link;
181#endif
182	STAILQ_ENTRY(hn_txdesc)		agg_link;
183
184	/* Aggregated txdescs, in sending order. */
185	STAILQ_HEAD(, hn_txdesc)	agg_list;
186
187	/* The oldest packet, if transmission aggregation happens. */
188	struct mbuf			*m;
189	struct hn_tx_ring		*txr;
190	int				refs;
191	uint32_t			flags;	/* HN_TXD_FLAG_ */
192	struct hn_nvs_sendctx		send_ctx;
193	uint32_t			chim_index;
194	int				chim_size;
195
196	bus_dmamap_t			data_dmap;
197
198	bus_addr_t			rndis_pkt_paddr;
199	struct rndis_packet_msg		*rndis_pkt;
200	bus_dmamap_t			rndis_pkt_dmap;
201};
202
203#define HN_TXD_FLAG_ONLIST		0x0001
204#define HN_TXD_FLAG_DMAMAP		0x0002
205#define HN_TXD_FLAG_ONAGG		0x0004
206
207struct hn_rxinfo {
208	uint32_t			vlan_info;
209	uint32_t			csum_info;
210	uint32_t			hash_info;
211	uint32_t			hash_value;
212};
213
214#define HN_RXINFO_VLAN			0x0001
215#define HN_RXINFO_CSUM			0x0002
216#define HN_RXINFO_HASHINF		0x0004
217#define HN_RXINFO_HASHVAL		0x0008
218#define HN_RXINFO_ALL			\
219	(HN_RXINFO_VLAN |		\
220	 HN_RXINFO_CSUM |		\
221	 HN_RXINFO_HASHINF |		\
222	 HN_RXINFO_HASHVAL)
223
224#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
225#define HN_NDIS_RXCSUM_INFO_INVALID	0
226#define HN_NDIS_HASH_INFO_INVALID	0
227
228static int			hn_probe(device_t);
229static int			hn_attach(device_t);
230static int			hn_detach(device_t);
231static int			hn_shutdown(device_t);
232static void			hn_chan_callback(struct vmbus_channel *,
233				    void *);
234
235static void			hn_init(void *);
236static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
237#ifdef HN_IFSTART_SUPPORT
238static void			hn_start(struct ifnet *);
239#endif
240static int			hn_transmit(struct ifnet *, struct mbuf *);
241static void			hn_xmit_qflush(struct ifnet *);
242static int			hn_ifmedia_upd(struct ifnet *);
243static void			hn_ifmedia_sts(struct ifnet *,
244				    struct ifmediareq *);
245
246static int			hn_rndis_rxinfo(const void *, int,
247				    struct hn_rxinfo *);
248static void			hn_rndis_rx_data(struct hn_rx_ring *,
249				    const void *, int);
250static void			hn_rndis_rx_status(struct hn_softc *,
251				    const void *, int);
252
253static void			hn_nvs_handle_notify(struct hn_softc *,
254				    const struct vmbus_chanpkt_hdr *);
255static void			hn_nvs_handle_comp(struct hn_softc *,
256				    struct vmbus_channel *,
257				    const struct vmbus_chanpkt_hdr *);
258static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
259				    struct vmbus_channel *,
260				    const struct vmbus_chanpkt_hdr *);
261static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
262				    struct vmbus_channel *, uint64_t);
263
264#if __FreeBSD_version >= 1100099
265static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
266static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
267#endif
268static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
269static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
270#if __FreeBSD_version < 1100095
271static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
272#else
273static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
274#endif
275static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
276static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
277static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
278static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
279static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
280static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
281static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
282static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
283static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
284static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
285static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
286static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
287static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
288static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
289
290static void			hn_stop(struct hn_softc *);
291static void			hn_init_locked(struct hn_softc *);
292static int			hn_chan_attach(struct hn_softc *,
293				    struct vmbus_channel *);
294static void			hn_chan_detach(struct hn_softc *,
295				    struct vmbus_channel *);
296static int			hn_attach_subchans(struct hn_softc *);
297static void			hn_detach_allchans(struct hn_softc *);
298static void			hn_chan_rollup(struct hn_rx_ring *,
299				    struct hn_tx_ring *);
300static void			hn_set_ring_inuse(struct hn_softc *, int);
301static int			hn_synth_attach(struct hn_softc *, int);
302static void			hn_synth_detach(struct hn_softc *);
303static int			hn_synth_alloc_subchans(struct hn_softc *,
304				    int *);
305static bool			hn_synth_attachable(const struct hn_softc *);
306static void			hn_suspend(struct hn_softc *);
307static void			hn_suspend_data(struct hn_softc *);
308static void			hn_suspend_mgmt(struct hn_softc *);
309static void			hn_resume(struct hn_softc *);
310static void			hn_resume_data(struct hn_softc *);
311static void			hn_resume_mgmt(struct hn_softc *);
312static void			hn_suspend_mgmt_taskfunc(void *, int);
313static void			hn_chan_drain(struct hn_softc *,
314				    struct vmbus_channel *);
315
316static void			hn_update_link_status(struct hn_softc *);
317static void			hn_change_network(struct hn_softc *);
318static void			hn_link_taskfunc(void *, int);
319static void			hn_netchg_init_taskfunc(void *, int);
320static void			hn_netchg_status_taskfunc(void *, int);
321static void			hn_link_status(struct hn_softc *);
322
323static int			hn_create_rx_data(struct hn_softc *, int);
324static void			hn_destroy_rx_data(struct hn_softc *);
325static int			hn_check_iplen(const struct mbuf *, int);
326static int			hn_set_rxfilter(struct hn_softc *);
327static int			hn_rss_reconfig(struct hn_softc *);
328static void			hn_rss_ind_fixup(struct hn_softc *);
329static int			hn_rxpkt(struct hn_rx_ring *, const void *,
330				    int, const struct hn_rxinfo *);
331
332static int			hn_tx_ring_create(struct hn_softc *, int);
333static void			hn_tx_ring_destroy(struct hn_tx_ring *);
334static int			hn_create_tx_data(struct hn_softc *, int);
335static void			hn_fixup_tx_data(struct hn_softc *);
336static void			hn_destroy_tx_data(struct hn_softc *);
337static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
338static void			hn_txdesc_gc(struct hn_tx_ring *,
339				    struct hn_txdesc *);
340static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
341				    struct hn_txdesc *, struct mbuf **);
342static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
343				    struct hn_txdesc *);
344static void			hn_set_chim_size(struct hn_softc *, int);
345static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
346static bool			hn_tx_ring_pending(struct hn_tx_ring *);
347static void			hn_tx_ring_qflush(struct hn_tx_ring *);
348static void			hn_resume_tx(struct hn_softc *, int);
349static void			hn_set_txagg(struct hn_softc *);
350static void			*hn_try_txagg(struct ifnet *,
351				    struct hn_tx_ring *, struct hn_txdesc *,
352				    int);
353static int			hn_get_txswq_depth(const struct hn_tx_ring *);
354static void			hn_txpkt_done(struct hn_nvs_sendctx *,
355				    struct hn_softc *, struct vmbus_channel *,
356				    const void *, int);
357static int			hn_txpkt_sglist(struct hn_tx_ring *,
358				    struct hn_txdesc *);
359static int			hn_txpkt_chim(struct hn_tx_ring *,
360				    struct hn_txdesc *);
361static int			hn_xmit(struct hn_tx_ring *, int);
362static void			hn_xmit_taskfunc(void *, int);
363static void			hn_xmit_txeof(struct hn_tx_ring *);
364static void			hn_xmit_txeof_taskfunc(void *, int);
365#ifdef HN_IFSTART_SUPPORT
366static int			hn_start_locked(struct hn_tx_ring *, int);
367static void			hn_start_taskfunc(void *, int);
368static void			hn_start_txeof(struct hn_tx_ring *);
369static void			hn_start_txeof_taskfunc(void *, int);
370#endif
371
372SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
373    "Hyper-V network interface");
374
375/* Trust tcp segment verification on host side. */
376static int			hn_trust_hosttcp = 1;
377SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
378    &hn_trust_hosttcp, 0,
379    "Trust tcp segment verification on host side, "
380    "when csum info is missing (global setting)");
381
382/* Trust udp datagram verification on host side. */
383static int			hn_trust_hostudp = 1;
384SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
385    &hn_trust_hostudp, 0,
386    "Trust udp datagram verification on host side, "
387    "when csum info is missing (global setting)");
388
389/* Trust ip packet verification on host side. */
390static int			hn_trust_hostip = 1;
391SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
392    &hn_trust_hostip, 0,
393    "Trust ip packet verification on host side, "
394    "when csum info is missing (global setting)");
395
396/* Limit TSO burst size */
397static int			hn_tso_maxlen = IP_MAXPACKET;
398SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
399    &hn_tso_maxlen, 0, "TSO burst limit");
400
401/* Limit chimney send size */
402static int			hn_tx_chimney_size = 0;
403SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
404    &hn_tx_chimney_size, 0, "Chimney send packet size limit");
405
406/* Limit the size of packet for direct transmission */
407static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
408SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
409    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
410
411/* # of LRO entries per RX ring */
412#if defined(INET) || defined(INET6)
413#if __FreeBSD_version >= 1100095
414static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
415SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
416    &hn_lro_entry_count, 0, "LRO entry count");
417#endif
418#endif
419
420static int			hn_tx_taskq_cnt = 1;
421SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
422    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
423
424#define HN_TX_TASKQ_M_INDEP	0
425#define HN_TX_TASKQ_M_GLOBAL	1
426#define HN_TX_TASKQ_M_EVTTQ	2
427
428static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
429SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
430    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
431    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
432
433#ifndef HN_USE_TXDESC_BUFRING
434static int			hn_use_txdesc_bufring = 0;
435#else
436static int			hn_use_txdesc_bufring = 1;
437#endif
438SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
439    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
440
441#ifdef HN_IFSTART_SUPPORT
442/* Use ifnet.if_start instead of ifnet.if_transmit */
443static int			hn_use_if_start = 0;
444SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
445    &hn_use_if_start, 0, "Use if_start TX method");
446#endif
447
448/* # of channels to use */
449static int			hn_chan_cnt = 0;
450SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
451    &hn_chan_cnt, 0,
452    "# of channels to use; each channel has one RX ring and one TX ring");
453
454/* # of transmit rings to use */
455static int			hn_tx_ring_cnt = 0;
456SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
457    &hn_tx_ring_cnt, 0, "# of TX rings to use");
458
459/* Software TX ring depth */
460static int			hn_tx_swq_depth = 0;
461SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
462    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
463
464/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
465#if __FreeBSD_version >= 1100095
466static u_int			hn_lro_mbufq_depth = 0;
467SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
468    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
469#endif
470
471/* Packet transmission aggregation size limit */
472static int			hn_tx_agg_size = -1;
473SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
474    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
475
476/* Packet transmission aggregation count limit */
477static int			hn_tx_agg_pkts = -1;
478SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
479    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
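
/*
 * All of the CTLFLAG_RDTUN knobs above are loader tunables.  An
 * illustrative /boot/loader.conf setup (values are examples only):
 *
 *	hw.hn.chan_cnt=4		# use 4 channels (RX/TX ring pairs)
 *	hw.hn.trust_hosttcp=0		# don't trust host TCP csum verification
 *	hw.hn.tx_agg_size=0		# disable packet transmission aggregation
 */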
480
481static u_int			hn_cpu_index;	/* next CPU for channel */
482static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
483
484static const uint8_t
485hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
486	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
487	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
488	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
489	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
490	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
491};
492
493static device_method_t hn_methods[] = {
494	/* Device interface */
495	DEVMETHOD(device_probe,		hn_probe),
496	DEVMETHOD(device_attach,	hn_attach),
497	DEVMETHOD(device_detach,	hn_detach),
498	DEVMETHOD(device_shutdown,	hn_shutdown),
499	DEVMETHOD_END
500};
501
502static driver_t hn_driver = {
503	"hn",
504	hn_methods,
505	sizeof(struct hn_softc)
506};
507
508static devclass_t hn_devclass;
509
510DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
511MODULE_VERSION(hn, 1);
512MODULE_DEPEND(hn, vmbus, 1, 1, 1);
513
514#if __FreeBSD_version >= 1100099
515static void
516hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
517{
518	int i;
519
520	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
521		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
522}
523#endif
524
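/*
 * Two NVS transmission paths:  hn_txpkt_sglist() hands the RNDIS packet
 * to the host as a guest physical address (gpa) array, while
 * hn_txpkt_chim() refers to data that hn_encap() has already copied into
 * a chimney (pre-allocated send buffer) slot, identified by index and size.
 */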
525static int
526hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
527{
528
529	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
530	    txd->chim_size == 0, ("invalid rndis sglist txd"));
531	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
532	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
533}
534
535static int
536hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
537{
538	struct hn_nvs_rndis rndis;
539
540	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
541	    txd->chim_size > 0, ("invalid rndis chim txd"));
542
543	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
544	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
545	rndis.nvs_chim_idx = txd->chim_index;
546	rndis.nvs_chim_sz = txd->chim_size;
547
548	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
549	    &rndis, sizeof(rndis), &txd->send_ctx));
550}
551
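/*
 * Lock-free allocation of a chimney sending buffer slot: scan the bitmap
 * words for a clear bit, claim it with atomic_testandset_long(), and skip
 * to the next word if another CPU wins the race.  HN_NVS_CHIM_IDX_INVALID
 * is returned when all slots are in use.
 */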
552static __inline uint32_t
553hn_chim_alloc(struct hn_softc *sc)
554{
555	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
556	u_long *bmap = sc->hn_chim_bmap;
557	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
558
559	for (i = 0; i < bmap_cnt; ++i) {
560		int idx;
561
562		idx = ffsl(~bmap[i]);
563		if (idx == 0)
564			continue;
565
566		--idx; /* ffsl is 1-based */
567		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
568		    ("invalid i %d and idx %d", i, idx));
569
570		if (atomic_testandset_long(&bmap[i], idx))
571			continue;
572
573		ret = i * LONG_BIT + idx;
574		break;
575	}
576	return (ret);
577}
578
579static __inline void
580hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
581{
582	u_long mask;
583	uint32_t idx;
584
585	idx = chim_idx / LONG_BIT;
586	KASSERT(idx < sc->hn_chim_bmap_cnt,
587	    ("invalid chimney index 0x%x", chim_idx));
588
589	mask = 1UL << (chim_idx % LONG_BIT);
590	KASSERT(sc->hn_chim_bmap[idx] & mask,
591	    ("index bitmap 0x%lx, chimney index %u, "
592	     "bitmap idx %d, bitmask 0x%lx",
593	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
594
595	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
596}
597
598#if defined(INET6) || defined(INET)
599/*
600 * NOTE: If this function fails, the m_head will be freed.
601 */
602static __inline struct mbuf *
603hn_tso_fixup(struct mbuf *m_head)
604{
605	struct ether_vlan_header *evl;
606	struct tcphdr *th;
607	int ehlen;
608
609	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
610
611#define PULLUP_HDR(m, len)				\
612do {							\
613	if (__predict_false((m)->m_len < (len))) {	\
614		(m) = m_pullup((m), (len));		\
615		if ((m) == NULL)			\
616			return (NULL);			\
617	}						\
618} while (0)
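
/*
 * PULLUP_HDR() makes sure that at least 'len' bytes are contiguous in
 * the first mbuf; on failure m_pullup() has already freed the chain,
 * so NULL is returned to the caller.
 */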
619
620	PULLUP_HDR(m_head, sizeof(*evl));
621	evl = mtod(m_head, struct ether_vlan_header *);
622	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
623		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
624	else
625		ehlen = ETHER_HDR_LEN;
626
627#ifdef INET
628	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
629		struct ip *ip;
630		int iphlen;
631
632		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
633		ip = mtodo(m_head, ehlen);
634		iphlen = ip->ip_hl << 2;
635
636		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
637		th = mtodo(m_head, ehlen + iphlen);
638
639		ip->ip_len = 0;
640		ip->ip_sum = 0;
641		th->th_sum = in_pseudo(ip->ip_src.s_addr,
642		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
643	}
644#endif
645#if defined(INET6) && defined(INET)
646	else
647#endif
648#ifdef INET6
649	{
650		struct ip6_hdr *ip6;
651
652		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
653		ip6 = mtodo(m_head, ehlen);
654		if (ip6->ip6_nxt != IPPROTO_TCP) {
655			m_freem(m_head);
656			return (NULL);
657		}
658
659		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
660		th = mtodo(m_head, ehlen + sizeof(*ip6));
661
662		ip6->ip6_plen = 0;
663		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
664	}
665#endif
666	return (m_head);
667
668#undef PULLUP_HDR
669}
670#endif	/* INET6 || INET */
671
672static int
673hn_set_rxfilter(struct hn_softc *sc)
674{
675	struct ifnet *ifp = sc->hn_ifp;
676	uint32_t filter;
677	int error = 0;
678
679	HN_LOCK_ASSERT(sc);
680
681	if (ifp->if_flags & IFF_PROMISC) {
682		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
683	} else {
684		filter = NDIS_PACKET_TYPE_DIRECTED;
685		if (ifp->if_flags & IFF_BROADCAST)
686			filter |= NDIS_PACKET_TYPE_BROADCAST;
687		/* TODO: support multicast list */
688		if ((ifp->if_flags & IFF_ALLMULTI) ||
689		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
690			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
691	}
692
693	if (sc->hn_rx_filter != filter) {
694		error = hn_rndis_set_rxfilter(sc, filter);
695		if (!error)
696			sc->hn_rx_filter = filter;
697	}
698	return (error);
699}
700
701static void
702hn_set_txagg(struct hn_softc *sc)
703{
704	uint32_t size, pkts;
705	int i;
706
707	/*
708	 * Setup aggregation size.
709	 */
710	if (sc->hn_agg_size < 0)
711		size = UINT32_MAX;
712	else
713		size = sc->hn_agg_size;
714
715	if (sc->hn_rndis_agg_size < size)
716		size = sc->hn_rndis_agg_size;
717
718	/* NOTE: We only aggregate packets using chimney sending buffers. */
719	if (size > (uint32_t)sc->hn_chim_szmax)
720		size = sc->hn_chim_szmax;
721
722	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
723		/* Disable */
724		size = 0;
725		pkts = 0;
726		goto done;
727	}
728
729	/* NOTE: Type of the per TX ring setting is 'int'. */
730	if (size > INT_MAX)
731		size = INT_MAX;
732
733	/*
734	 * Setup aggregation packet count.
735	 */
736	if (sc->hn_agg_pkts < 0)
737		pkts = UINT32_MAX;
738	else
739		pkts = sc->hn_agg_pkts;
740
741	if (sc->hn_rndis_agg_pkts < pkts)
742		pkts = sc->hn_rndis_agg_pkts;
743
744	if (pkts <= 1) {
745		/* Disable */
746		size = 0;
747		pkts = 0;
748		goto done;
749	}
750
751	/* NOTE: Type of the per TX ring setting is 'short'. */
752	if (pkts > SHRT_MAX)
753		pkts = SHRT_MAX;
754
755done:
756	/* NOTE: Type of the per TX ring setting is 'short'. */
757	if (sc->hn_rndis_agg_align > SHRT_MAX) {
758		/* Disable */
759		size = 0;
760		pkts = 0;
761	}
762
763	if (bootverbose) {
764		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
765		    size, pkts, sc->hn_rndis_agg_align);
766	}
767
768	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
769		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
770
771		mtx_lock(&txr->hn_tx_lock);
772		txr->hn_agg_szmax = size;
773		txr->hn_agg_pktmax = pkts;
774		txr->hn_agg_align = sc->hn_rndis_agg_align;
775		mtx_unlock(&txr->hn_tx_lock);
776	}
777}
778
779static int
780hn_get_txswq_depth(const struct hn_tx_ring *txr)
781{
782
783	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
784	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
785		return txr->hn_txdesc_cnt;
786	return hn_tx_swq_depth;
787}
788
789static int
790hn_rss_reconfig(struct hn_softc *sc)
791{
792	int error;
793
794	HN_LOCK_ASSERT(sc);
795
796	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
797		return (ENXIO);
798
799	/*
800	 * Disable RSS first.
801	 *
802	 * NOTE:
803	 * Direct reconfiguration by setting the UNCHG flags does
804	 * _not_ work properly.
805	 */
806	if (bootverbose)
807		if_printf(sc->hn_ifp, "disable RSS\n");
808	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
809	if (error) {
810		if_printf(sc->hn_ifp, "RSS disable failed\n");
811		return (error);
812	}
813
814	/*
815	 * Reenable the RSS w/ the updated RSS key or indirect
816	 * table.
817	 */
818	if (bootverbose)
819		if_printf(sc->hn_ifp, "reconfig RSS\n");
820	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
821	if (error) {
822		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
823		return (error);
824	}
825	return (0);
826}
827
828static void
829hn_rss_ind_fixup(struct hn_softc *sc)
830{
831	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
832	int i, nchan;
833
834	nchan = sc->hn_rx_ring_inuse;
835	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
836
837	/*
838	 * Check indirect table to make sure that all channels in it
839	 * can be used.
840	 */
841	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
842		if (rss->rss_ind[i] >= nchan) {
843			if_printf(sc->hn_ifp,
844			    "RSS indirect table %d fixup: %u -> %d\n",
845			    i, rss->rss_ind[i], nchan - 1);
846			rss->rss_ind[i] = nchan - 1;
847		}
848	}
849}
850
851static int
852hn_ifmedia_upd(struct ifnet *ifp __unused)
853{
854
855	return EOPNOTSUPP;
856}
857
858static void
859hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
860{
861	struct hn_softc *sc = ifp->if_softc;
862
863	ifmr->ifm_status = IFM_AVALID;
864	ifmr->ifm_active = IFM_ETHER;
865
866	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
867		ifmr->ifm_active |= IFM_NONE;
868		return;
869	}
870	ifmr->ifm_status |= IFM_ACTIVE;
871	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
872}
873
874/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
875static const struct hyperv_guid g_net_vsc_device_type = {
876	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
877		0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
878};
879
880static int
881hn_probe(device_t dev)
882{
883
884	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
885	    &g_net_vsc_device_type) == 0) {
886		device_set_desc(dev, "Hyper-V Network Interface");
887		return BUS_PROBE_DEFAULT;
888	}
889	return ENXIO;
890}
891
892static int
893hn_attach(device_t dev)
894{
895	struct hn_softc *sc = device_get_softc(dev);
896	struct sysctl_oid_list *child;
897	struct sysctl_ctx_list *ctx;
898	uint8_t eaddr[ETHER_ADDR_LEN];
899	struct ifnet *ifp = NULL;
900	int error, ring_cnt, tx_ring_cnt;
901
902	sc->hn_dev = dev;
903	sc->hn_prichan = vmbus_get_channel(dev);
904	HN_LOCK_INIT(sc);
905
906	/*
907	 * Initialize these tunables once.
908	 */
909	sc->hn_agg_size = hn_tx_agg_size;
910	sc->hn_agg_pkts = hn_tx_agg_pkts;
911
912	/*
913	 * Setup taskqueue for transmission.
914	 */
915	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
916		int i;
917
918		sc->hn_tx_taskqs =
919		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
920		    M_DEVBUF, M_WAITOK);
921		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
922			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
923			    M_WAITOK, taskqueue_thread_enqueue,
924			    &sc->hn_tx_taskqs[i]);
925			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
926			    "%s tx%d", device_get_nameunit(dev), i);
927		}
928	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
929		sc->hn_tx_taskqs = hn_tx_taskque;
930	}
931
932	/*
933	 * Setup taskqueue for management tasks, e.g. link status.
934	 */
935	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
936	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
937	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
938	    device_get_nameunit(dev));
939	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
940	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
941	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
942	    hn_netchg_status_taskfunc, sc);
943
944	/*
945	 * Allocate ifnet and setup its name earlier, so that if_printf
946	 * can be used by functions that will be called after
947	 * ether_ifattach().
948	 */
949	ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
950	ifp->if_softc = sc;
951	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
952
953	/*
954	 * Initialize ifmedia earlier so that it can be unconditionally
955	 * destroyed, if an error happens later on.
956	 */
957	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
958
959	/*
960	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
961	 * to use (tx_ring_cnt).
962	 *
963	 * NOTE:
964	 * The # of RX rings to use is same as the # of channels to use.
965	 */
966	ring_cnt = hn_chan_cnt;
967	if (ring_cnt <= 0) {
968		/* Default */
969		ring_cnt = mp_ncpus;
970		if (ring_cnt > HN_RING_CNT_DEF_MAX)
971			ring_cnt = HN_RING_CNT_DEF_MAX;
972	} else if (ring_cnt > mp_ncpus) {
973		ring_cnt = mp_ncpus;
974	}
975
976	tx_ring_cnt = hn_tx_ring_cnt;
977	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
978		tx_ring_cnt = ring_cnt;
979#ifdef HN_IFSTART_SUPPORT
980	if (hn_use_if_start) {
981		/* ifnet.if_start only needs one TX ring. */
982		tx_ring_cnt = 1;
983	}
984#endif
985
986	/*
987	 * Set the leader CPU for channels.
988	 */
989	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
990
991	/*
992	 * Create enough TX/RX rings, even if only limited number of
993	 * channels can be allocated.
994	 */
995	error = hn_create_tx_data(sc, tx_ring_cnt);
996	if (error)
997		goto failed;
998	error = hn_create_rx_data(sc, ring_cnt);
999	if (error)
1000		goto failed;
1001
1002	/*
1003	 * Create transaction context for NVS and RNDIS transactions.
1004	 */
1005	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
1006	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
1007	if (sc->hn_xact == NULL) {
1008		error = ENXIO;
1009		goto failed;
1010	}
1011
1012	/*
1013	 * Install orphan handler for the revocation of this device's
1014	 * primary channel.
1015	 *
1016	 * NOTE:
1017	 * The processing order is critical here:
1018	 * Install the orphan handler, _before_ testing whether this
1019	 * device's primary channel has been revoked or not.
1020	 */
1021	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
1022	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
1023		error = ENXIO;
1024		goto failed;
1025	}
1026
1027	/*
1028	 * Attach the synthetic parts, i.e. NVS and RNDIS.
1029	 */
1030	error = hn_synth_attach(sc, ETHERMTU);
1031	if (error)
1032		goto failed;
1033
1034	error = hn_rndis_get_eaddr(sc, eaddr);
1035	if (error)
1036		goto failed;
1037
1038#if __FreeBSD_version >= 1100099
1039	if (sc->hn_rx_ring_inuse > 1) {
1040		/*
1041		 * Reduce TCP segment aggregation limit for multiple
1042		 * RX rings to increase ACK timeliness.
1043		 */
1044		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
1045	}
1046#endif
1047
1048	/*
1049	 * Fixup TX stuffs after synthetic parts are attached.
1050	 */
1051	hn_fixup_tx_data(sc);
1052
1053	ctx = device_get_sysctl_ctx(dev);
1054	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
1055	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
1056	    &sc->hn_nvs_ver, 0, "NVS version");
1057	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
1058	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1059	    hn_ndis_version_sysctl, "A", "NDIS version");
1060	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
1061	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1062	    hn_caps_sysctl, "A", "capabilities");
1063	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
1064	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1065	    hn_hwassist_sysctl, "A", "hwassist");
1066	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
1067	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1068	    hn_rxfilter_sysctl, "A", "rxfilter");
1069	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
1070	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1071	    hn_rss_hash_sysctl, "A", "RSS hash");
1072	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
1073	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
1074	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
1075	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1076	    hn_rss_key_sysctl, "IU", "RSS key");
1077	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
1078	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1079	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
1080	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
1081	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
1082	    "RNDIS offered packet transmission aggregation size limit");
1083	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
1084	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
1085	    "RNDIS offered packet transmission aggregation count limit");
1086	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
1087	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
1088	    "RNDIS packet transmission aggregation alignment");
1089	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
1090	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1091	    hn_txagg_size_sysctl, "I",
1092	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
1093	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
1094	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1095	    hn_txagg_pkts_sysctl, "I",
1096	    "Packet transmission aggregation packets, "
1097	    "0 -- disable, -1 -- auto");
1098
1099	/*
1100	 * Setup the ifmedia, which has been initialized earlier.
1101	 */
1102	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
1103	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
1104	/* XXX ifmedia_set really should do this for us */
1105	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
1106
1107	/*
1108	 * Setup the ifnet for this interface.
1109	 */
1110
1111#ifdef __LP64__
1112	ifp->if_baudrate = IF_Gbps(10);
1113#else
1114	/* if_baudrate is 32 bits on a 32-bit system. */
1115	ifp->if_baudrate = IF_Gbps(1);
1116#endif
1117	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
1118	ifp->if_ioctl = hn_ioctl;
1119	ifp->if_init = hn_init;
1120#ifdef HN_IFSTART_SUPPORT
1121	if (hn_use_if_start) {
1122		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
1123
1124		ifp->if_start = hn_start;
1125		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
1126		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
1127		IFQ_SET_READY(&ifp->if_snd);
1128	} else
1129#endif
1130	{
1131		ifp->if_transmit = hn_transmit;
1132		ifp->if_qflush = hn_xmit_qflush;
1133	}
1134
1135	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
1136#ifdef foo
1137	/* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
1138	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
1139#endif
1140	if (sc->hn_caps & HN_CAP_VLAN) {
1141		/* XXX not sure about VLAN_MTU. */
1142		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
1143	}
1144
1145	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
1146	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
1147		ifp->if_capabilities |= IFCAP_TXCSUM;
1148	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
1149		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
1150	if (sc->hn_caps & HN_CAP_TSO4) {
1151		ifp->if_capabilities |= IFCAP_TSO4;
1152		ifp->if_hwassist |= CSUM_IP_TSO;
1153	}
1154	if (sc->hn_caps & HN_CAP_TSO6) {
1155		ifp->if_capabilities |= IFCAP_TSO6;
1156		ifp->if_hwassist |= CSUM_IP6_TSO;
1157	}
1158
1159	/* Enable all available capabilities by default. */
1160	ifp->if_capenable = ifp->if_capabilities;
1161
1162	/*
1163	 * Disable IPv6 TSO and TXCSUM by default, they still can
1164	 * be enabled through SIOCSIFCAP.
1165	 */
1166	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
1167	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
1168
1169	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
1170		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
1171		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
1172		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
1173	}
1174
1175	ether_ifattach(ifp, eaddr);
1176
1177	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
1178		if_printf(ifp, "TSO segcnt %u segsz %u\n",
1179		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
1180	}
1181
1182	/* Inform the upper layer about the long frame support. */
1183	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
1184
1185	/*
1186	 * Kick off link status check.
1187	 */
1188	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
1189	hn_update_link_status(sc);
1190
1191	return (0);
1192failed:
1193	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
1194		hn_synth_detach(sc);
1195	hn_detach(dev);
1196	return (error);
1197}
1198
1199static int
1200hn_detach(device_t dev)
1201{
1202	struct hn_softc *sc = device_get_softc(dev);
1203	struct ifnet *ifp = sc->hn_ifp;
1204
1205	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
1206		/*
1207		 * In case that the vmbus missed the orphan handler
1208		 * installation.
1209		 */
1210		vmbus_xact_ctx_orphan(sc->hn_xact);
1211	}
1212
1213	if (device_is_attached(dev)) {
1214		HN_LOCK(sc);
1215		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
1216			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1217				hn_stop(sc);
1218			/*
1219			 * NOTE:
1220			 * hn_stop() only suspends data, so management
1221			 * stuff has to be suspended manually here.
1222			 */
1223			hn_suspend_mgmt(sc);
1224			hn_synth_detach(sc);
1225		}
1226		HN_UNLOCK(sc);
1227		ether_ifdetach(ifp);
1228	}
1229
1230	ifmedia_removeall(&sc->hn_media);
1231	hn_destroy_rx_data(sc);
1232	hn_destroy_tx_data(sc);
1233
1234	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
1235		int i;
1236
1237		for (i = 0; i < hn_tx_taskq_cnt; ++i)
1238			taskqueue_free(sc->hn_tx_taskqs[i]);
1239		free(sc->hn_tx_taskqs, M_DEVBUF);
1240	}
1241	taskqueue_free(sc->hn_mgmt_taskq0);
1242
1243	if (sc->hn_xact != NULL) {
1244		/*
1245		 * Uninstall the orphan handler _before_ the xact is
1246		 * destructed.
1247		 */
1248		vmbus_chan_unset_orphan(sc->hn_prichan);
1249		vmbus_xact_ctx_destroy(sc->hn_xact);
1250	}
1251
1252	if_free(ifp);
1253
1254	HN_LOCK_DESTROY(sc);
1255	return (0);
1256}
1257
1258static int
1259hn_shutdown(device_t dev)
1260{
1261
1262	return (0);
1263}
1264
1265static void
1266hn_link_status(struct hn_softc *sc)
1267{
1268	uint32_t link_status;
1269	int error;
1270
1271	error = hn_rndis_get_linkstatus(sc, &link_status);
1272	if (error) {
1273		/* XXX what to do? */
1274		return;
1275	}
1276
1277	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
1278		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
1279	else
1280		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1281	if_link_state_change(sc->hn_ifp,
1282	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
1283	    LINK_STATE_UP : LINK_STATE_DOWN);
1284}
1285
1286static void
1287hn_link_taskfunc(void *xsc, int pending __unused)
1288{
1289	struct hn_softc *sc = xsc;
1290
1291	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
1292		return;
1293	hn_link_status(sc);
1294}
1295
1296static void
1297hn_netchg_init_taskfunc(void *xsc, int pending __unused)
1298{
1299	struct hn_softc *sc = xsc;
1300
1301	/* Prevent any link status checks from running. */
1302	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
1303
1304	/*
1305	 * Fake up a [link down --> link up] state change; a 5 second
1306	 * delay is used, which closely simulates the miibus reaction
1307	 * to a link down event.
1308	 */
1309	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1310	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
1311	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
1312	    &sc->hn_netchg_status, 5 * hz);
1313}
1314
1315static void
1316hn_netchg_status_taskfunc(void *xsc, int pending __unused)
1317{
1318	struct hn_softc *sc = xsc;
1319
1320	/* Re-allow link status checks. */
1321	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
1322	hn_link_status(sc);
1323}
1324
1325static void
1326hn_update_link_status(struct hn_softc *sc)
1327{
1328
1329	if (sc->hn_mgmt_taskq != NULL)
1330		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
1331}
1332
1333static void
1334hn_change_network(struct hn_softc *sc)
1335{
1336
1337	if (sc->hn_mgmt_taskq != NULL)
1338		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
1339}
1340
1341static __inline int
1342hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
1343    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
1344{
1345	struct mbuf *m = *m_head;
1346	int error;
1347
1348	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
1349
1350	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
1351	    m, segs, nsegs, BUS_DMA_NOWAIT);
1352	if (error == EFBIG) {
1353		struct mbuf *m_new;
1354
1355		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
1356		if (m_new == NULL)
1357			return ENOBUFS;
1358		else
1359			*m_head = m = m_new;
1360		txr->hn_tx_collapsed++;
1361
1362		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
1363		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
1364	}
1365	if (!error) {
1366		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
1367		    BUS_DMASYNC_PREWRITE);
1368		txd->flags |= HN_TXD_FLAG_DMAMAP;
1369	}
1370	return error;
1371}
1372
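/*
 * Drop one reference on the txdesc.  Only the release of the last
 * reference actually recycles it: any txdescs hanging off agg_list are
 * released first, then the chimney slot or DMA map is freed, the mbuf is
 * freed, and the txdesc is returned to the free list (or buf_ring).
 * Returns 1 if the txdesc was recycled, 0 if references remain.
 */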
1373static __inline int
1374hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
1375{
1376
1377	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
1378	    ("put an onlist txd %#x", txd->flags));
1379	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1380	    ("put an onagg txd %#x", txd->flags));
1381
1382	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1383	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
1384		return 0;
1385
1386	if (!STAILQ_EMPTY(&txd->agg_list)) {
1387		struct hn_txdesc *tmp_txd;
1388
1389		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
1390			int freed;
1391
1392			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
1393			    ("recursive aggregation on aggregated txdesc"),
1394			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
1395			    ("not aggregated txdesc"));
1396			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1397			    ("aggregated txdesc uses dmamap"));
1398			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
1399			    ("aggregated txdesc consumes "
1400			     "chimney sending buffer"));
1401			KASSERT(tmp_txd->chim_size == 0,
1402			    ("aggregated txdesc has non-zero "
1403			     "chimney sending size"));
1404
1405			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
1406			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
1407			freed = hn_txdesc_put(txr, tmp_txd);
1408			KASSERT(freed, ("failed to free aggregated txdesc"));
1409		}
1410	}
1411
1412	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
1413		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1414		    ("chim txd uses dmamap"));
1415		hn_chim_free(txr->hn_sc, txd->chim_index);
1416		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1417		txd->chim_size = 0;
1418	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
1419		bus_dmamap_sync(txr->hn_tx_data_dtag,
1420		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
1421		bus_dmamap_unload(txr->hn_tx_data_dtag,
1422		    txd->data_dmap);
1423		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
1424	}
1425
1426	if (txd->m != NULL) {
1427		m_freem(txd->m);
1428		txd->m = NULL;
1429	}
1430
1431	txd->flags |= HN_TXD_FLAG_ONLIST;
1432#ifndef HN_USE_TXDESC_BUFRING
1433	mtx_lock_spin(&txr->hn_txlist_spin);
1434	KASSERT(txr->hn_txdesc_avail >= 0 &&
1435	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
1436	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
1437	txr->hn_txdesc_avail++;
1438	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
1439	mtx_unlock_spin(&txr->hn_txlist_spin);
1440#else	/* HN_USE_TXDESC_BUFRING */
1441#ifdef HN_DEBUG
1442	atomic_add_int(&txr->hn_txdesc_avail, 1);
1443#endif
1444	buf_ring_enqueue(txr->hn_txdesc_br, txd);
1445#endif	/* !HN_USE_TXDESC_BUFRING */
1446
1447	return 1;
1448}
1449
1450static __inline struct hn_txdesc *
1451hn_txdesc_get(struct hn_tx_ring *txr)
1452{
1453	struct hn_txdesc *txd;
1454
1455#ifndef HN_USE_TXDESC_BUFRING
1456	mtx_lock_spin(&txr->hn_txlist_spin);
1457	txd = SLIST_FIRST(&txr->hn_txlist);
1458	if (txd != NULL) {
1459		KASSERT(txr->hn_txdesc_avail > 0,
1460		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
1461		txr->hn_txdesc_avail--;
1462		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
1463	}
1464	mtx_unlock_spin(&txr->hn_txlist_spin);
1465#else
1466	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
1467#endif
1468
1469	if (txd != NULL) {
1470#ifdef HN_USE_TXDESC_BUFRING
1471#ifdef HN_DEBUG
1472		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
1473#endif
1474#endif	/* HN_USE_TXDESC_BUFRING */
1475		KASSERT(txd->m == NULL && txd->refs == 0 &&
1476		    STAILQ_EMPTY(&txd->agg_list) &&
1477		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
1478		    txd->chim_size == 0 &&
1479		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
1480		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
1481		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
1482		txd->flags &= ~HN_TXD_FLAG_ONLIST;
1483		txd->refs = 1;
1484	}
1485	return txd;
1486}
1487
1488static __inline void
1489hn_txdesc_hold(struct hn_txdesc *txd)
1490{
1491
1492	/* 0->1 transition will never work */
1493	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1494	atomic_add_int(&txd->refs, 1);
1495}
1496
1497static __inline void
1498hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
1499{
1500
1501	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1502	    ("recursive aggregation on aggregating txdesc"));
1503
1504	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1505	    ("already aggregated"));
1506	KASSERT(STAILQ_EMPTY(&txd->agg_list),
1507	    ("recursive aggregation on to-be-aggregated txdesc"));
1508
1509	txd->flags |= HN_TXD_FLAG_ONAGG;
1510	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
1511}
1512
1513static bool
1514hn_tx_ring_pending(struct hn_tx_ring *txr)
1515{
1516	bool pending = false;
1517
1518#ifndef HN_USE_TXDESC_BUFRING
1519	mtx_lock_spin(&txr->hn_txlist_spin);
1520	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
1521		pending = true;
1522	mtx_unlock_spin(&txr->hn_txlist_spin);
1523#else
1524	if (!buf_ring_full(txr->hn_txdesc_br))
1525		pending = true;
1526#endif
1527	return (pending);
1528}
1529
1530static __inline void
1531hn_txeof(struct hn_tx_ring *txr)
1532{
1533	txr->hn_has_txeof = 0;
1534	txr->hn_txeof(txr);
1535}
1536
1537static void
1538hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
1539    struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
1540{
1541	struct hn_txdesc *txd = sndc->hn_cbarg;
1542	struct hn_tx_ring *txr;
1543
1544	txr = txd->txr;
1545	KASSERT(txr->hn_chan == chan,
1546	    ("channel mismatch, on chan%u, should be chan%u",
1547	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
1548
1549	txr->hn_has_txeof = 1;
1550	hn_txdesc_put(txr, txd);
1551
1552	++txr->hn_txdone_cnt;
1553	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
1554		txr->hn_txdone_cnt = 0;
1555		if (txr->hn_oactive)
1556			hn_txeof(txr);
1557	}
1558}
1559
1560static void
1561hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
1562{
1563#if defined(INET) || defined(INET6)
1564	struct lro_ctrl *lro = &rxr->hn_lro;
1565	struct lro_entry *queued;
1566
1567	while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
1568		SLIST_REMOVE_HEAD(&lro->lro_active, next);
1569		tcp_lro_flush(lro, queued);
1570	}
1571#endif
1572
1573	/*
1574	 * NOTE:
1575	 * 'txr' could be NULL, if multiple channels and
1576	 * ifnet.if_start method are enabled.
1577	 */
1578	if (txr == NULL || !txr->hn_has_txeof)
1579		return;
1580
1581	txr->hn_txdone_cnt = 0;
1582	hn_txeof(txr);
1583}
1584
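/*
 * The data and pktinfo offsets in an RNDIS packet message are carried on
 * the wire relative to the rm_dataoffset field rather than the start of
 * the message; this converts an offset counted from the start of the
 * message into that form.
 */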
1585static __inline uint32_t
1586hn_rndis_pktmsg_offset(uint32_t ofs)
1587{
1588
1589	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
1590	    ("invalid RNDIS packet msg offset %u", ofs));
1591	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
1592}
1593
1594static __inline void *
1595hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
1596    size_t pi_dlen, uint32_t pi_type)
1597{
1598	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
1599	struct rndis_pktinfo *pi;
1600
1601	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
1602	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
1603
1604	/*
1605	 * Per-packet-info does not move; it only grows.
1606	 *
1607	 * NOTE:
1608	 * rm_pktinfooffset in this phase counts from the beginning
1609	 * of rndis_packet_msg.
1610	 */
1611	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
1612	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
1613	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
1614	    pkt->rm_pktinfolen);
1615	pkt->rm_pktinfolen += pi_size;
1616
1617	pi->rm_size = pi_size;
1618	pi->rm_type = pi_type;
1619	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
1620
1621	/* Data immediately follow per-packet-info. */
1622	pkt->rm_dataoffset += pi_size;
1623
1624	/* Update RNDIS packet msg length */
1625	pkt->rm_len += pi_size;
1626
1627	return (pi->rm_data);
1628}
1629
1630static __inline int
1631hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
1632{
1633	struct hn_txdesc *txd;
1634	struct mbuf *m;
1635	int error, pkts;
1636
1637	txd = txr->hn_agg_txd;
1638	KASSERT(txd != NULL, ("no aggregate txdesc"));
1639
1640	/*
1641	 * Since hn_txpkt() will reset this temporary stat, save
1642	 * it now, so that oerrors can be updated properly, if
1643	 * hn_txpkt() ever fails.
1644	 */
1645	pkts = txr->hn_stat_pkts;
1646
1647	/*
1648	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
1649	 * failure, save it for later freeing, if hn_txpkt() ever
1650	 * fails.
1651	 */
1652	m = txd->m;
1653	error = hn_txpkt(ifp, txr, txd);
1654	if (__predict_false(error)) {
1655		/* txd is freed, but m is not. */
1656		m_freem(m);
1657
1658		txr->hn_flush_failed++;
1659		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
1660	}
1661
1662	/* Reset all aggregation states. */
1663	txr->hn_agg_txd = NULL;
1664	txr->hn_agg_szleft = 0;
1665	txr->hn_agg_pktleft = 0;
1666	txr->hn_agg_prevpkt = NULL;
1667
1668	return (error);
1669}
1670
1671static void *
1672hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
1673    int pktsize)
1674{
1675	void *chim;
1676
1677	if (txr->hn_agg_txd != NULL) {
1678		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
1679			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
1680			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
1681			int olen;
1682
1683			/*
1684			 * Update the previous RNDIS packet's total length,
1685			 * it can be increased due to the mandatory alignment
1686			 * padding for this RNDIS packet.  And update the
1687			 * aggregating txdesc's chimney sending buffer size
1688			 * accordingly.
1689			 *
1690			 * XXX
1691			 * Zero-out the padding, as required by the RNDIS spec.
1692			 */
1693			olen = pkt->rm_len;
1694			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
1695			agg_txd->chim_size += pkt->rm_len - olen;
1696
1697			/* Link this txdesc to the parent. */
1698			hn_txdesc_agg(agg_txd, txd);
1699
1700			chim = (uint8_t *)pkt + pkt->rm_len;
1701			/* Save the current packet for later fixup. */
1702			txr->hn_agg_prevpkt = chim;
1703
1704			txr->hn_agg_pktleft--;
1705			txr->hn_agg_szleft -= pktsize;
1706			if (txr->hn_agg_szleft <=
1707			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1708				/*
1709				 * Probably can't aggregate more packets,
1710				 * flush this aggregating txdesc proactively.
1711				 */
1712				txr->hn_agg_pktleft = 0;
1713			}
1714			/* Done! */
1715			return (chim);
1716		}
1717		hn_flush_txagg(ifp, txr);
1718	}
1719	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
1720
1721	txr->hn_tx_chimney_tried++;
1722	txd->chim_index = hn_chim_alloc(txr->hn_sc);
1723	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
1724		return (NULL);
1725	txr->hn_tx_chimney++;
1726
1727	chim = txr->hn_sc->hn_chim +
1728	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
1729
1730	if (txr->hn_agg_pktmax > 1 &&
1731	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1732		txr->hn_agg_txd = txd;
1733		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
1734		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
1735		txr->hn_agg_prevpkt = chim;
1736	}
1737	return (chim);
1738}
1739
1740/*
1741 * NOTE:
1742 * If this function fails, then both txd and m_head0 will be freed.
1743 */
1744static int
1745hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
1746    struct mbuf **m_head0)
1747{
1748	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
1749	int error, nsegs, i;
1750	struct mbuf *m_head = *m_head0;
1751	struct rndis_packet_msg *pkt;
1752	uint32_t *pi_data;
1753	void *chim = NULL;
1754	int pkt_hlen, pkt_size;
1755
1756	pkt = txd->rndis_pkt;
1757	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
1758	if (pkt_size < txr->hn_chim_size) {
1759		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
1760		if (chim != NULL)
1761			pkt = chim;
1762	} else {
1763		if (txr->hn_agg_txd != NULL)
1764			hn_flush_txagg(ifp, txr);
1765	}
1766
1767	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
1768	pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
1769	pkt->rm_dataoffset = sizeof(*pkt);
1770	pkt->rm_datalen = m_head->m_pkthdr.len;
1771	pkt->rm_oobdataoffset = 0;
1772	pkt->rm_oobdatalen = 0;
1773	pkt->rm_oobdataelements = 0;
1774	pkt->rm_pktinfooffset = sizeof(*pkt);
1775	pkt->rm_pktinfolen = 0;
1776	pkt->rm_vchandle = 0;
1777	pkt->rm_reserved = 0;
1778
1779	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
1780		/*
1781		 * Set the hash value for this packet, so that the host could
1782		 * dispatch the TX done event for this packet back to this TX
1783		 * ring's channel.
1784		 */
1785		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1786		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
1787		*pi_data = txr->hn_tx_idx;
1788	}
1789
1790	if (m_head->m_flags & M_VLANTAG) {
1791		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1792		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
1793		*pi_data = NDIS_VLAN_INFO_MAKE(
1794		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
1795		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
1796		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
1797	}
1798
1799	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
1800#if defined(INET6) || defined(INET)
1801		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1802		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
1803#ifdef INET
1804		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
1805			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
1806			    m_head->m_pkthdr.tso_segsz);
1807		}
1808#endif
1809#if defined(INET6) && defined(INET)
1810		else
1811#endif
1812#ifdef INET6
1813		{
1814			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
1815			    m_head->m_pkthdr.tso_segsz);
1816		}
1817#endif
1818#endif	/* INET6 || INET */
1819	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
1820		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1821		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
1822		if (m_head->m_pkthdr.csum_flags &
1823		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
1824			*pi_data = NDIS_TXCSUM_INFO_IPV6;
1825		} else {
1826			*pi_data = NDIS_TXCSUM_INFO_IPV4;
1827			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
1828				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
1829		}
1830
1831		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
1832			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
1833		else if (m_head->m_pkthdr.csum_flags &
1834		    (CSUM_IP_UDP | CSUM_IP6_UDP))
1835			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
1836	}
1837
1838	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
1839	/* Convert RNDIS packet message offsets */
1840	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
1841	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
1842
1843	/*
1844	 * Fast path: Chimney sending.
1845	 */
1846	if (chim != NULL) {
1847		struct hn_txdesc *tgt_txd = txd;
1848
1849		if (txr->hn_agg_txd != NULL) {
1850			tgt_txd = txr->hn_agg_txd;
1851#ifdef INVARIANTS
1852			*m_head0 = NULL;
1853#endif
1854		}
1855
1856		KASSERT(pkt == chim,
1857		    ("RNDIS pkt not in chimney sending buffer"));
1858		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
1859		    ("chimney sending buffer is not used"));
1860		tgt_txd->chim_size += pkt->rm_len;
1861
1862		m_copydata(m_head, 0, m_head->m_pkthdr.len,
1863		    ((uint8_t *)chim) + pkt_hlen);
1864
1865		txr->hn_gpa_cnt = 0;
1866		txr->hn_sendpkt = hn_txpkt_chim;
1867		goto done;
1868	}
1869
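	/*
	 * Slower path: the RNDIS message stays in the txdesc and the
	 * payload is sent as a scatter/gather list of guest physical
	 * address (GPA) ranges built below.
	 */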
1870	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
1871	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
1872	    ("chimney buffer is used"));
1873	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
1874
1875	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
1876	if (__predict_false(error)) {
1877		int freed;
1878
1879		/*
1880		 * This mbuf is not linked w/ the txd yet, so free it now.
1881		 */
1882		m_freem(m_head);
1883		*m_head0 = NULL;
1884
1885		freed = hn_txdesc_put(txr, txd);
1886		KASSERT(freed != 0,
1887		    ("fail to free txd upon txdma error"));
1888
1889		txr->hn_txdma_failed++;
1890		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
1891		return error;
1892	}
1893	*m_head0 = m_head;
1894
1895	/* +1 RNDIS packet message */
1896	txr->hn_gpa_cnt = nsegs + 1;
1897
1898	/* send packet with page buffer */
1899	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
1900	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
1901	txr->hn_gpa[0].gpa_len = pkt_hlen;
1902
1903	/*
1904	 * Fill the page buffers with mbuf info after the page
1905	 * buffer for RNDIS packet message.
1906	 */
1907	for (i = 0; i < nsegs; ++i) {
1908		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
1909
1910		gpa->gpa_page = atop(segs[i].ds_addr);
1911		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
1912		gpa->gpa_len = segs[i].ds_len;
1913	}
1914
1915	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1916	txd->chim_size = 0;
1917	txr->hn_sendpkt = hn_txpkt_sglist;
1918done:
1919	txd->m = m_head;
1920
1921	/* Set the completion routine */
1922	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
1923
1924	/* Update temporary stats for later use. */
1925	txr->hn_stat_pkts++;
1926	txr->hn_stat_size += m_head->m_pkthdr.len;
1927	if (m_head->m_flags & M_MCAST)
1928		txr->hn_stat_mcasts++;
1929
1930	return 0;
1931}
1932
1933/*
1934 * NOTE:
1935 * If this function fails, then txd will be freed, but the mbuf
1936 * associated w/ the txd will _not_ be freed.
1937 */
1938static int
1939hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
1940{
1941	int error, send_failed = 0, has_bpf;
1942
1943again:
1944	has_bpf = bpf_peers_present(ifp->if_bpf);
1945	if (has_bpf) {
1946		/*
1947		 * Make sure that this txd and any aggregated txds are not
1948		 * freed before ETHER_BPF_MTAP.
1949		 */
1950		hn_txdesc_hold(txd);
1951	}
1952	error = txr->hn_sendpkt(txr, txd);
1953	if (!error) {
1954		if (has_bpf) {
1955			const struct hn_txdesc *tmp_txd;
1956
1957			ETHER_BPF_MTAP(ifp, txd->m);
1958			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
1959				ETHER_BPF_MTAP(ifp, tmp_txd->m);
1960		}
1961
1962		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
1963#ifdef HN_IFSTART_SUPPORT
1964		if (!hn_use_if_start)
1965#endif
1966		{
1967			if_inc_counter(ifp, IFCOUNTER_OBYTES,
1968			    txr->hn_stat_size);
1969			if (txr->hn_stat_mcasts != 0) {
1970				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
1971				    txr->hn_stat_mcasts);
1972			}
1973		}
1974		txr->hn_pkts += txr->hn_stat_pkts;
1975		txr->hn_sends++;
1976	}
1977	if (has_bpf)
1978		hn_txdesc_put(txr, txd);
1979
1980	if (__predict_false(error)) {
1981		int freed;
1982
1983		/*
1984		 * This should "really rarely" happen.
1985		 *
1986		 * XXX Too many RX to be acked or too many sideband
1987		 * commands to run?  Ask netvsc_channel_rollup()
1988		 * to kick start later.
1989		 */
1990		txr->hn_has_txeof = 1;
1991		if (!send_failed) {
1992			txr->hn_send_failed++;
1993			send_failed = 1;
1994			/*
1995			 * Try sending again after setting hn_has_txeof,
1996			 * in case we missed the last
1997			 * netvsc_channel_rollup().
1998			 */
1999			goto again;
2000		}
2001		if_printf(ifp, "send failed\n");
2002
2003		/*
2004		 * Caller will perform further processing on the
2005		 * associated mbuf, so don't free it in hn_txdesc_put();
2006		 * only unload it from the DMA map in hn_txdesc_put(),
2007		 * if it was loaded.
2008		 */
2009		txd->m = NULL;
2010		freed = hn_txdesc_put(txr, txd);
2011		KASSERT(freed != 0,
2012		    ("fail to free txd upon send error"));
2013
2014		txr->hn_send_failed++;
2015	}
2016
2017	/* Reset temporary stats, after this sending is done. */
2018	txr->hn_stat_size = 0;
2019	txr->hn_stat_pkts = 0;
2020	txr->hn_stat_mcasts = 0;
2021
2022	return (error);
2023}
2024
2025/*
2026 * Append the specified data to the indicated mbuf chain.
2027 * Extend the mbuf chain if the new data does not fit in
2028 * existing space.
2029 *
2030 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
2031 * There should be an equivalent in the kernel mbuf code,
2032 * but there does not appear to be one yet.
2033 *
2034 * Differs from m_append() in that additional mbufs are
2035 * allocated with cluster size MJUMPAGESIZE, and filled
2036 * accordingly.
2037 *
2038 * Return 1 if able to complete the job; otherwise 0.
2039 */
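/*
 * Typical use (see hn_rxpkt() below), sketched:
 *
 *	m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, MJUMPAGESIZE);
 *	if (m_new != NULL)
 *		hv_m_append(m_new, dlen, data);
 *
 * Additional MJUMPAGESIZE-clustered mbufs are chained on as needed.
 */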
2040static int
2041hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
2042{
2043	struct mbuf *m, *n;
2044	int remainder, space;
2045
2046	for (m = m0; m->m_next != NULL; m = m->m_next)
2047		;
2048	remainder = len;
2049	space = M_TRAILINGSPACE(m);
2050	if (space > 0) {
2051		/*
2052		 * Copy into available space.
2053		 */
2054		if (space > remainder)
2055			space = remainder;
2056		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
2057		m->m_len += space;
2058		cp += space;
2059		remainder -= space;
2060	}
2061	while (remainder > 0) {
2062		/*
2063		 * Allocate a new mbuf with an MJUMPAGESIZE cluster
2064		 * and fill it from the remaining data.
2065		 */
2066		n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
2067		if (n == NULL)
2068			break;
2069		n->m_len = min(MJUMPAGESIZE, remainder);
2070		bcopy(cp, mtod(n, caddr_t), n->m_len);
2071		cp += n->m_len;
2072		remainder -= n->m_len;
2073		m->m_next = n;
2074		m = n;
2075	}
2076	if (m0->m_flags & M_PKTHDR)
2077		m0->m_pkthdr.len += len - remainder;
2078
2079	return (remainder == 0);
2080}
2081
2082#if defined(INET) || defined(INET6)
2083static __inline int
2084hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
2085{
2086#if __FreeBSD_version >= 1100095
2087	if (hn_lro_mbufq_depth) {
2088		tcp_lro_queue_mbuf(lc, m);
2089		return 0;
2090	}
2091#endif
2092	return tcp_lro_rx(lc, m, 0);
2093}
2094#endif
2095
2096static int
2097hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
2098    const struct hn_rxinfo *info)
2099{
2100	struct ifnet *ifp = rxr->hn_ifp;
2101	struct mbuf *m_new;
2102	int size, do_lro = 0, do_csum = 1;
2103	int hash_type = M_HASHTYPE_OPAQUE;
2104
2105	if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
2106		return (0);
2107
2108	/*
2109	 * Bail out if the packet contains more data than the configured MTU.
2110	 */
2111	if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) {
2112		return (0);
2113	} else if (dlen <= MHLEN) {
2114		m_new = m_gethdr(M_NOWAIT, MT_DATA);
2115		if (m_new == NULL) {
2116			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2117			return (0);
2118		}
2119		memcpy(mtod(m_new, void *), data, dlen);
2120		m_new->m_pkthdr.len = m_new->m_len = dlen;
2121		rxr->hn_small_pkts++;
2122	} else {
2123		/*
2124		 * Get an mbuf with a cluster.  For packets 2K or less,
2125		 * get a standard 2K cluster.  For anything larger, get a
2126		 * 4K cluster.  Any buffers larger than 4K can cause problems
2127		 * if looped around to the Hyper-V TX channel, so avoid them.
2128		 */
2129		size = MCLBYTES;
2130		if (dlen > MCLBYTES) {
2131			/* 4096 */
2132			size = MJUMPAGESIZE;
2133		}
2134
2135		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
2136		if (m_new == NULL) {
2137			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2138			return (0);
2139		}
2140
2141		hv_m_append(m_new, dlen, data);
2142	}
2143	m_new->m_pkthdr.rcvif = ifp;
2144
2145	if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
2146		do_csum = 0;
2147
2148	/* receive side checksum offload */
2149	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
2150		/* IP csum offload */
2151		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
2152			m_new->m_pkthdr.csum_flags |=
2153			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
2154			rxr->hn_csum_ip++;
2155		}
2156
2157		/* TCP/UDP csum offload */
2158		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
2159		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
2160			m_new->m_pkthdr.csum_flags |=
2161			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2162			m_new->m_pkthdr.csum_data = 0xffff;
2163			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
2164				rxr->hn_csum_tcp++;
2165			else
2166				rxr->hn_csum_udp++;
2167		}
2168
2169		/*
2170		 * XXX
2171		 * As of this writing (Oct 28th, 2016), the host side will turn
2172		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
2173		 * the do_lro setting here is actually _not_ accurate.  We
2174		 * depend on the RSS hash type check to reset do_lro.
2175		 */
2176		if ((info->csum_info &
2177		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
2178		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
2179			do_lro = 1;
2180	} else {
2181		const struct ether_header *eh;
2182		uint16_t etype;
2183		int hoff;
2184
2185		hoff = sizeof(*eh);
2186		if (m_new->m_len < hoff)
2187			goto skip;
2188		eh = mtod(m_new, struct ether_header *);
2189		etype = ntohs(eh->ether_type);
2190		if (etype == ETHERTYPE_VLAN) {
2191			const struct ether_vlan_header *evl;
2192
2193			hoff = sizeof(*evl);
2194			if (m_new->m_len < hoff)
2195				goto skip;
2196			evl = mtod(m_new, struct ether_vlan_header *);
2197			etype = ntohs(evl->evl_proto);
2198		}
2199
2200		if (etype == ETHERTYPE_IP) {
2201			int pr;
2202
2203			pr = hn_check_iplen(m_new, hoff);
2204			if (pr == IPPROTO_TCP) {
2205				if (do_csum &&
2206				    (rxr->hn_trust_hcsum &
2207				     HN_TRUST_HCSUM_TCP)) {
2208					rxr->hn_csum_trusted++;
2209					m_new->m_pkthdr.csum_flags |=
2210					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
2211					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2212					m_new->m_pkthdr.csum_data = 0xffff;
2213				}
2214				do_lro = 1;
2215			} else if (pr == IPPROTO_UDP) {
2216				if (do_csum &&
2217				    (rxr->hn_trust_hcsum &
2218				     HN_TRUST_HCSUM_UDP)) {
2219					rxr->hn_csum_trusted++;
2220					m_new->m_pkthdr.csum_flags |=
2221					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
2222					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2223					m_new->m_pkthdr.csum_data = 0xffff;
2224				}
2225			} else if (pr != IPPROTO_DONE && do_csum &&
2226			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
2227				rxr->hn_csum_trusted++;
2228				m_new->m_pkthdr.csum_flags |=
2229				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
2230			}
2231		}
2232	}
2233skip:
2234	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2235		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2236		    NDIS_VLAN_INFO_ID(info->vlan_info),
2237		    NDIS_VLAN_INFO_PRI(info->vlan_info),
2238		    NDIS_VLAN_INFO_CFI(info->vlan_info));
2239		m_new->m_flags |= M_VLANTAG;
2240	}
2241
2242	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2243		rxr->hn_rss_pkts++;
2244		m_new->m_pkthdr.flowid = info->hash_value;
2245		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2246		    NDIS_HASH_FUNCTION_TOEPLITZ) {
2247			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2248
2249			/*
2250			 * NOTE:
2251			 * do_lro is reset if the hash type is not TCP
2252			 * related.  See the comment in the above csum_flags
2253			 * setup section.
2254			 */
2255			switch (type) {
2256			case NDIS_HASH_IPV4:
2257				hash_type = M_HASHTYPE_RSS_IPV4;
2258				do_lro = 0;
2259				break;
2260
2261			case NDIS_HASH_TCP_IPV4:
2262				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2263				break;
2264
2265			case NDIS_HASH_IPV6:
2266				hash_type = M_HASHTYPE_RSS_IPV6;
2267				do_lro = 0;
2268				break;
2269
2270			case NDIS_HASH_IPV6_EX:
2271				hash_type = M_HASHTYPE_RSS_IPV6_EX;
2272				do_lro = 0;
2273				break;
2274
2275			case NDIS_HASH_TCP_IPV6:
2276				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2277				break;
2278
2279			case NDIS_HASH_TCP_IPV6_EX:
2280				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
2281				break;
2282			}
2283		}
2284	} else {
2285		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2286	}
2287	M_HASHTYPE_SET(m_new, hash_type);
2288
2289	/*
2290	 * Note:  Moved RX completion back to hv_nv_on_receive() so all
2291	 * messages (not just data messages) will trigger a response.
2292	 */
2293
2294	ifp->if_ipackets++;
2295	rxr->hn_pkts++;
2296
2297	if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2298#if defined(INET) || defined(INET6)
2299		struct lro_ctrl *lro = &rxr->hn_lro;
2300
2301		if (lro->lro_cnt) {
2302			rxr->hn_lro_tried++;
2303			if (hn_lro_rx(lro, m_new) == 0) {
2304				/* DONE! */
2305				return 0;
2306			}
2307		}
2308#endif
2309	}
2310
2311	/* We're not holding the lock here, so don't release it */
2312	(*ifp->if_input)(ifp, m_new);
2313
2314	return (0);
2315}
2316
2317static int
2318hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2319{
2320	struct hn_softc *sc = ifp->if_softc;
2321	struct ifreq *ifr = (struct ifreq *)data;
2322	int mask, error = 0;
2323
2324	switch (cmd) {
2325	case SIOCSIFMTU:
2326		if (ifr->ifr_mtu > HN_MTU_MAX) {
2327			error = EINVAL;
2328			break;
2329		}
2330
2331		HN_LOCK(sc);
2332
2333		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2334			HN_UNLOCK(sc);
2335			break;
2336		}
2337
2338		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2339			/* Can't change MTU */
2340			HN_UNLOCK(sc);
2341			error = EOPNOTSUPP;
2342			break;
2343		}
2344
2345		if (ifp->if_mtu == ifr->ifr_mtu) {
2346			HN_UNLOCK(sc);
2347			break;
2348		}
2349
2350		/*
2351		 * Suspend this interface before the synthetic parts
2352		 * are torn down.
2353		 */
2354		hn_suspend(sc);
2355
2356		/*
2357		 * Detach the synthetic parts, i.e. NVS and RNDIS.
2358		 */
2359		hn_synth_detach(sc);
2360
2361		/*
2362		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2363		 * with the new MTU setting.
2364		 */
2365		error = hn_synth_attach(sc, ifr->ifr_mtu);
2366		if (error) {
2367			HN_UNLOCK(sc);
2368			break;
2369		}
2370
2371		/*
2372		 * Commit the requested MTU, after the synthetic parts
2373		 * have been successfully attached.
2374		 */
2375		ifp->if_mtu = ifr->ifr_mtu;
2376
2377		/*
2378		 * Make sure that various parameters based on MTU are
2379		 * still valid, after the MTU change.
2380		 */
2381		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2382			hn_set_chim_size(sc, sc->hn_chim_szmax);
2383		hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2384#if __FreeBSD_version >= 1100099
2385		if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2386		    HN_LRO_LENLIM_MIN(ifp))
2387			hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2388#endif
2389
2390		/*
2391		 * All done!  Resume the interface now.
2392		 */
2393		hn_resume(sc);
2394
2395		HN_UNLOCK(sc);
2396		break;
2397
2398	case SIOCSIFFLAGS:
2399		HN_LOCK(sc);
2400
2401		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2402			HN_UNLOCK(sc);
2403			break;
2404		}
2405
2406		if (ifp->if_flags & IFF_UP) {
2407			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2408				/*
2409				 * Caller might hold a mutex, e.g.
2410				 * bpf; use busy-wait for the RNDIS
2411				 * reply.
2412				 */
2413				HN_NO_SLEEPING(sc);
2414				hn_set_rxfilter(sc);
2415				HN_SLEEPING_OK(sc);
2416			} else {
2417				hn_init_locked(sc);
2418			}
2419		} else {
2420			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2421				hn_stop(sc);
2422		}
2423		sc->hn_if_flags = ifp->if_flags;
2424
2425		HN_UNLOCK(sc);
2426		break;
2427
2428	case SIOCSIFCAP:
2429		HN_LOCK(sc);
2430		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2431
2432		if (mask & IFCAP_TXCSUM) {
2433			ifp->if_capenable ^= IFCAP_TXCSUM;
2434			if (ifp->if_capenable & IFCAP_TXCSUM)
2435				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2436			else
2437				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2438		}
2439		if (mask & IFCAP_TXCSUM_IPV6) {
2440			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2441			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2442				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2443			else
2444				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2445		}
2446
2447		/* TODO: flip RNDIS offload parameters for RXCSUM. */
2448		if (mask & IFCAP_RXCSUM)
2449			ifp->if_capenable ^= IFCAP_RXCSUM;
2450#ifdef foo
2451		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2452		if (mask & IFCAP_RXCSUM_IPV6)
2453			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2454#endif
2455
2456		if (mask & IFCAP_LRO)
2457			ifp->if_capenable ^= IFCAP_LRO;
2458
2459		if (mask & IFCAP_TSO4) {
2460			ifp->if_capenable ^= IFCAP_TSO4;
2461			if (ifp->if_capenable & IFCAP_TSO4)
2462				ifp->if_hwassist |= CSUM_IP_TSO;
2463			else
2464				ifp->if_hwassist &= ~CSUM_IP_TSO;
2465		}
2466		if (mask & IFCAP_TSO6) {
2467			ifp->if_capenable ^= IFCAP_TSO6;
2468			if (ifp->if_capenable & IFCAP_TSO6)
2469				ifp->if_hwassist |= CSUM_IP6_TSO;
2470			else
2471				ifp->if_hwassist &= ~CSUM_IP6_TSO;
2472		}
2473
2474		HN_UNLOCK(sc);
2475		break;
2476
2477	case SIOCADDMULTI:
2478	case SIOCDELMULTI:
2479		HN_LOCK(sc);
2480
2481		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2482			HN_UNLOCK(sc);
2483			break;
2484		}
2485		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2486			/*
2487			 * The multicast code holds a mutex; use
2488			 * busy-wait for the RNDIS reply.
2489			 */
2490			HN_NO_SLEEPING(sc);
2491			hn_set_rxfilter(sc);
2492			HN_SLEEPING_OK(sc);
2493		}
2494
2495		HN_UNLOCK(sc);
2496		break;
2497
2498	case SIOCSIFMEDIA:
2499	case SIOCGIFMEDIA:
2500		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2501		break;
2502
2503	default:
2504		error = ether_ioctl(ifp, cmd, data);
2505		break;
2506	}
2507	return (error);
2508}
2509
2510static void
2511hn_stop(struct hn_softc *sc)
2512{
2513	struct ifnet *ifp = sc->hn_ifp;
2514	int i;
2515
2516	HN_LOCK_ASSERT(sc);
2517
2518	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2519	    ("synthetic parts were not attached"));
2520
2521	/* Clear RUNNING bit _before_ hn_suspend_data() */
2522	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2523	hn_suspend_data(sc);
2524
2525	/* Clear OACTIVE bit. */
2526	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2527	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2528		sc->hn_tx_ring[i].hn_oactive = 0;
2529}
2530
2531static void
2532hn_init_locked(struct hn_softc *sc)
2533{
2534	struct ifnet *ifp = sc->hn_ifp;
2535	int i;
2536
2537	HN_LOCK_ASSERT(sc);
2538
2539	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2540		return;
2541
2542	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2543		return;
2544
2545	/* Configure RX filter */
2546	hn_set_rxfilter(sc);
2547
2548	/* Clear OACTIVE bit. */
2549	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2550	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2551		sc->hn_tx_ring[i].hn_oactive = 0;
2552
2553	/* Clear TX 'suspended' bit. */
2554	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2555
2556	/* Everything is ready; unleash! */
2557	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2558}
2559
2560static void
2561hn_init(void *xsc)
2562{
2563	struct hn_softc *sc = xsc;
2564
2565	HN_LOCK(sc);
2566	hn_init_locked(sc);
2567	HN_UNLOCK(sc);
2568}
2569
2570#if __FreeBSD_version >= 1100099
2571
2572static int
2573hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2574{
2575	struct hn_softc *sc = arg1;
2576	unsigned int lenlim;
2577	int error;
2578
2579	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2580	error = sysctl_handle_int(oidp, &lenlim, 0, req);
2581	if (error || req->newptr == NULL)
2582		return error;
2583
2584	HN_LOCK(sc);
2585	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2586	    lenlim > TCP_LRO_LENGTH_MAX) {
2587		HN_UNLOCK(sc);
2588		return EINVAL;
2589	}
2590	hn_set_lro_lenlim(sc, lenlim);
2591	HN_UNLOCK(sc);
2592
2593	return 0;
2594}
2595
2596static int
2597hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2598{
2599	struct hn_softc *sc = arg1;
2600	int ackcnt, error, i;
2601
2602	/*
2603	 * lro_ackcnt_lim is the append count limit;
2604	 * +1 turns it into the aggregation limit.
2605	 */
2606	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2607	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2608	if (error || req->newptr == NULL)
2609		return error;
2610
2611	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2612		return EINVAL;
2613
2614	/*
2615	 * Convert aggregation limit back to append
2616	 * count limit.
2617	 */
2618	--ackcnt;
2619	HN_LOCK(sc);
2620	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
2621		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
2622	HN_UNLOCK(sc);
2623	return 0;
2624}
2625
2626#endif
2627
2628static int
2629hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2630{
2631	struct hn_softc *sc = arg1;
2632	int hcsum = arg2;
2633	int on, error, i;
2634
2635	on = 0;
2636	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2637		on = 1;
2638
2639	error = sysctl_handle_int(oidp, &on, 0, req);
2640	if (error || req->newptr == NULL)
2641		return error;
2642
2643	HN_LOCK(sc);
2644	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2645		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2646
2647		if (on)
2648			rxr->hn_trust_hcsum |= hcsum;
2649		else
2650			rxr->hn_trust_hcsum &= ~hcsum;
2651	}
2652	HN_UNLOCK(sc);
2653	return 0;
2654}
2655
2656static int
2657hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2658{
2659	struct hn_softc *sc = arg1;
2660	int chim_size, error;
2661
2662	chim_size = sc->hn_tx_ring[0].hn_chim_size;
2663	error = sysctl_handle_int(oidp, &chim_size, 0, req);
2664	if (error || req->newptr == NULL)
2665		return error;
2666
2667	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2668		return EINVAL;
2669
2670	HN_LOCK(sc);
2671	hn_set_chim_size(sc, chim_size);
2672	HN_UNLOCK(sc);
2673	return 0;
2674}
2675
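/*
 * The statistics sysctl handlers below (hn_rx_stat_*_sysctl and
 * hn_tx_stat_ulong_sysctl) sum the counter at byte offset 'ofs'
 * across all RX/TX rings on read, and zero the per-ring counters
 * when a value is written.  hn_tx_conf_int_sysctl instead reads the
 * setting from TX ring 0 and propagates a written value to every
 * TX ring.
 */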
2676#if __FreeBSD_version < 1100095
2677static int
2678hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2679{
2680	struct hn_softc *sc = arg1;
2681	int ofs = arg2, i, error;
2682	struct hn_rx_ring *rxr;
2683	uint64_t stat;
2684
2685	stat = 0;
2686	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2687		rxr = &sc->hn_rx_ring[i];
2688		stat += *((int *)((uint8_t *)rxr + ofs));
2689	}
2690
2691	error = sysctl_handle_64(oidp, &stat, 0, req);
2692	if (error || req->newptr == NULL)
2693		return error;
2694
2695	/* Zero out this stat. */
2696	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2697		rxr = &sc->hn_rx_ring[i];
2698		*((int *)((uint8_t *)rxr + ofs)) = 0;
2699	}
2700	return 0;
2701}
2702#else
2703static int
2704hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2705{
2706	struct hn_softc *sc = arg1;
2707	int ofs = arg2, i, error;
2708	struct hn_rx_ring *rxr;
2709	uint64_t stat;
2710
2711	stat = 0;
2712	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2713		rxr = &sc->hn_rx_ring[i];
2714		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2715	}
2716
2717	error = sysctl_handle_64(oidp, &stat, 0, req);
2718	if (error || req->newptr == NULL)
2719		return error;
2720
2721	/* Zero out this stat. */
2722	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2723		rxr = &sc->hn_rx_ring[i];
2724		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2725	}
2726	return 0;
2727}
2728
2729#endif
2730
2731static int
2732hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2733{
2734	struct hn_softc *sc = arg1;
2735	int ofs = arg2, i, error;
2736	struct hn_rx_ring *rxr;
2737	u_long stat;
2738
2739	stat = 0;
2740	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2741		rxr = &sc->hn_rx_ring[i];
2742		stat += *((u_long *)((uint8_t *)rxr + ofs));
2743	}
2744
2745	error = sysctl_handle_long(oidp, &stat, 0, req);
2746	if (error || req->newptr == NULL)
2747		return error;
2748
2749	/* Zero out this stat. */
2750	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2751		rxr = &sc->hn_rx_ring[i];
2752		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
2753	}
2754	return 0;
2755}
2756
2757static int
2758hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2759{
2760	struct hn_softc *sc = arg1;
2761	int ofs = arg2, i, error;
2762	struct hn_tx_ring *txr;
2763	u_long stat;
2764
2765	stat = 0;
2766	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2767		txr = &sc->hn_tx_ring[i];
2768		stat += *((u_long *)((uint8_t *)txr + ofs));
2769	}
2770
2771	error = sysctl_handle_long(oidp, &stat, 0, req);
2772	if (error || req->newptr == NULL)
2773		return error;
2774
2775	/* Zero out this stat. */
2776	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2777		txr = &sc->hn_tx_ring[i];
2778		*((u_long *)((uint8_t *)txr + ofs)) = 0;
2779	}
2780	return 0;
2781}
2782
2783static int
2784hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2785{
2786	struct hn_softc *sc = arg1;
2787	int ofs = arg2, i, error, conf;
2788	struct hn_tx_ring *txr;
2789
2790	txr = &sc->hn_tx_ring[0];
2791	conf = *((int *)((uint8_t *)txr + ofs));
2792
2793	error = sysctl_handle_int(oidp, &conf, 0, req);
2794	if (error || req->newptr == NULL)
2795		return error;
2796
2797	HN_LOCK(sc);
2798	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2799		txr = &sc->hn_tx_ring[i];
2800		*((int *)((uint8_t *)txr + ofs)) = conf;
2801	}
2802	HN_UNLOCK(sc);
2803
2804	return 0;
2805}
2806
2807static int
2808hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
2809{
2810	struct hn_softc *sc = arg1;
2811	int error, size;
2812
2813	size = sc->hn_agg_size;
2814	error = sysctl_handle_int(oidp, &size, 0, req);
2815	if (error || req->newptr == NULL)
2816		return (error);
2817
2818	HN_LOCK(sc);
2819	sc->hn_agg_size = size;
2820	hn_set_txagg(sc);
2821	HN_UNLOCK(sc);
2822
2823	return (0);
2824}
2825
2826static int
2827hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
2828{
2829	struct hn_softc *sc = arg1;
2830	int error, pkts;
2831
2832	pkts = sc->hn_agg_pkts;
2833	error = sysctl_handle_int(oidp, &pkts, 0, req);
2834	if (error || req->newptr == NULL)
2835		return (error);
2836
2837	HN_LOCK(sc);
2838	sc->hn_agg_pkts = pkts;
2839	hn_set_txagg(sc);
2840	HN_UNLOCK(sc);
2841
2842	return (0);
2843}
2844
2845static int
2846hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
2847{
2848	struct hn_softc *sc = arg1;
2849	int pkts;
2850
2851	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
2852	return (sysctl_handle_int(oidp, &pkts, 0, req));
2853}
2854
2855static int
2856hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
2857{
2858	struct hn_softc *sc = arg1;
2859	int align;
2860
2861	align = sc->hn_tx_ring[0].hn_agg_align;
2862	return (sysctl_handle_int(oidp, &align, 0, req));
2863}
2864
2865static int
2866hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
2867{
2868	struct hn_softc *sc = arg1;
2869	char verstr[16];
2870
2871	snprintf(verstr, sizeof(verstr), "%u.%u",
2872	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
2873	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
2874	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
2875}
2876
2877static int
2878hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
2879{
2880	struct hn_softc *sc = arg1;
2881	char caps_str[128];
2882	uint32_t caps;
2883
2884	HN_LOCK(sc);
2885	caps = sc->hn_caps;
2886	HN_UNLOCK(sc);
2887	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
2888	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
2889}
2890
2891static int
2892hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
2893{
2894	struct hn_softc *sc = arg1;
2895	char assist_str[128];
2896	uint32_t hwassist;
2897
2898	HN_LOCK(sc);
2899	hwassist = sc->hn_ifp->if_hwassist;
2900	HN_UNLOCK(sc);
2901	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
2902	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
2903}
2904
2905static int
2906hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
2907{
2908	struct hn_softc *sc = arg1;
2909	char filter_str[128];
2910	uint32_t filter;
2911
2912	HN_LOCK(sc);
2913	filter = sc->hn_rx_filter;
2914	HN_UNLOCK(sc);
2915	snprintf(filter_str, sizeof(filter_str), "%b", filter,
2916	    NDIS_PACKET_TYPES);
2917	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
2918}
2919
2920static int
2921hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
2922{
2923	struct hn_softc *sc = arg1;
2924	int error;
2925
2926	HN_LOCK(sc);
2927
2928	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2929	if (error || req->newptr == NULL)
2930		goto back;
2931
2932	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2933	if (error)
2934		goto back;
2935	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
2936
2937	if (sc->hn_rx_ring_inuse > 1) {
2938		error = hn_rss_reconfig(sc);
2939	} else {
2940		/* Not RSS capable, at least for now; just save the RSS key. */
2941		error = 0;
2942	}
2943back:
2944	HN_UNLOCK(sc);
2945	return (error);
2946}
2947
2948static int
2949hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
2950{
2951	struct hn_softc *sc = arg1;
2952	int error;
2953
2954	HN_LOCK(sc);
2955
2956	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2957	if (error || req->newptr == NULL)
2958		goto back;
2959
2960	/*
2961	 * Don't allow the RSS indirection table to be changed if this
2962	 * interface is currently not RSS capable.
2963	 */
2964	if (sc->hn_rx_ring_inuse == 1) {
2965		error = EOPNOTSUPP;
2966		goto back;
2967	}
2968
2969	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2970	if (error)
2971		goto back;
2972	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
2973
2974	hn_rss_ind_fixup(sc);
2975	error = hn_rss_reconfig(sc);
2976back:
2977	HN_UNLOCK(sc);
2978	return (error);
2979}
2980
2981static int
2982hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
2983{
2984	struct hn_softc *sc = arg1;
2985	char hash_str[128];
2986	uint32_t hash;
2987
2988	HN_LOCK(sc);
2989	hash = sc->hn_rss_hash;
2990	HN_UNLOCK(sc);
2991	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
2992	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
2993}
2994
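/*
 * Sanity check an IPv4 packet starting at byte offset 'hoff' of the
 * first mbuf: the IP header (and, for TCP/UDP, the transport header)
 * must be wholly contained in that mbuf and internally consistent,
 * and IP fragments are rejected.  Return the IP protocol number on
 * success, or IPPROTO_DONE to indicate that the packet should not be
 * trusted for host-checksum/LRO purposes.
 */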
2995static int
2996hn_check_iplen(const struct mbuf *m, int hoff)
2997{
2998	const struct ip *ip;
2999	int len, iphlen, iplen;
3000	const struct tcphdr *th;
3001	int thoff;				/* TCP data offset */
3002
3003	len = hoff + sizeof(struct ip);
3004
3005	/* The packet must be at least the size of an IP header. */
3006	if (m->m_pkthdr.len < len)
3007		return IPPROTO_DONE;
3008
3009	/* The fixed IP header must reside completely in the first mbuf. */
3010	if (m->m_len < len)
3011		return IPPROTO_DONE;
3012
3013	ip = mtodo(m, hoff);
3014
3015	/* Bound check the packet's stated IP header length. */
3016	iphlen = ip->ip_hl << 2;
3017	if (iphlen < sizeof(struct ip))		/* minimum header length */
3018		return IPPROTO_DONE;
3019
3020	/* The full IP header must reside completely in the one mbuf. */
3021	if (m->m_len < hoff + iphlen)
3022		return IPPROTO_DONE;
3023
3024	iplen = ntohs(ip->ip_len);
3025
3026	/*
3027	 * Check that the amount of data in the buffers is at
3028	 * least as much as the IP header would have us expect.
3029	 */
3030	if (m->m_pkthdr.len < hoff + iplen)
3031		return IPPROTO_DONE;
3032
3033	/*
3034	 * Ignore IP fragments.
3035	 */
3036	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
3037		return IPPROTO_DONE;
3038
3039	/*
3040	 * The TCP/IP or UDP/IP header must be entirely contained within
3041	 * the first fragment of a packet.
3042	 */
3043	switch (ip->ip_p) {
3044	case IPPROTO_TCP:
3045		if (iplen < iphlen + sizeof(struct tcphdr))
3046			return IPPROTO_DONE;
3047		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3048			return IPPROTO_DONE;
3049		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3050		thoff = th->th_off << 2;
3051		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3052			return IPPROTO_DONE;
3053		if (m->m_len < hoff + iphlen + thoff)
3054			return IPPROTO_DONE;
3055		break;
3056	case IPPROTO_UDP:
3057		if (iplen < iphlen + sizeof(struct udphdr))
3058			return IPPROTO_DONE;
3059		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3060			return IPPROTO_DONE;
3061		break;
3062	default:
3063		if (iplen < iphlen)
3064			return IPPROTO_DONE;
3065		break;
3066	}
3067	return ip->ip_p;
3068}
3069
3070static int
3071hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3072{
3073	struct sysctl_oid_list *child;
3074	struct sysctl_ctx_list *ctx;
3075	device_t dev = sc->hn_dev;
3076#if defined(INET) || defined(INET6)
3077#if __FreeBSD_version >= 1100095
3078	int lroent_cnt;
3079#endif
3080#endif
3081	int i;
3082
3083	/*
3084	 * Create RXBUF for reception.
3085	 *
3086	 * NOTE:
3087	 * - It is shared by all channels.
3088	 * - A large enough buffer is allocated; certain versions of the
3089	 *   NVS may further limit the usable space.
3090	 */
3091	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3092	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3093	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
3094	if (sc->hn_rxbuf == NULL) {
3095		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3096		return (ENOMEM);
3097	}
3098
3099	sc->hn_rx_ring_cnt = ring_cnt;
3100	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3101
3102	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3103	    M_DEVBUF, M_WAITOK | M_ZERO);
3104
3105#if defined(INET) || defined(INET6)
3106#if __FreeBSD_version >= 1100095
3107	lroent_cnt = hn_lro_entry_count;
3108	if (lroent_cnt < TCP_LRO_ENTRIES)
3109		lroent_cnt = TCP_LRO_ENTRIES;
3110	if (bootverbose)
3111		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3112#endif
3113#endif	/* INET || INET6 */
3114
3115	ctx = device_get_sysctl_ctx(dev);
3116	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3117
3118	/* Create dev.hn.UNIT.rx sysctl tree */
3119	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3120	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3121
3122	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3123		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3124
3125		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3126		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3127		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
3128		if (rxr->hn_br == NULL) {
3129			device_printf(dev, "allocate bufring failed\n");
3130			return (ENOMEM);
3131		}
3132
3133		if (hn_trust_hosttcp)
3134			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3135		if (hn_trust_hostudp)
3136			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3137		if (hn_trust_hostip)
3138			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3139		rxr->hn_ifp = sc->hn_ifp;
3140		if (i < sc->hn_tx_ring_cnt)
3141			rxr->hn_txr = &sc->hn_tx_ring[i];
3142		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3143		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3144		rxr->hn_rx_idx = i;
3145		rxr->hn_rxbuf = sc->hn_rxbuf;
3146
3147		/*
3148		 * Initialize LRO.
3149		 */
3150#if defined(INET) || defined(INET6)
3151#if __FreeBSD_version >= 1100095
3152		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3153		    hn_lro_mbufq_depth);
3154#else
3155		tcp_lro_init(&rxr->hn_lro);
3156		rxr->hn_lro.ifp = sc->hn_ifp;
3157#endif
3158#if __FreeBSD_version >= 1100099
3159		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3160		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3161#endif
3162#endif	/* INET || INET6 */
3163
3164		if (sc->hn_rx_sysctl_tree != NULL) {
3165			char name[16];
3166
3167			/*
3168			 * Create per RX ring sysctl tree:
3169			 * dev.hn.UNIT.rx.RINGID
3170			 */
3171			snprintf(name, sizeof(name), "%d", i);
3172			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3173			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3174			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3175
3176			if (rxr->hn_rx_sysctl_tree != NULL) {
3177				SYSCTL_ADD_ULONG(ctx,
3178				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3179				    OID_AUTO, "packets", CTLFLAG_RW,
3180				    &rxr->hn_pkts, "# of packets received");
3181				SYSCTL_ADD_ULONG(ctx,
3182				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3183				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
3184				    &rxr->hn_rss_pkts,
3185				    "# of packets w/ RSS info received");
3186				SYSCTL_ADD_INT(ctx,
3187				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3188				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3189				    &rxr->hn_pktbuf_len, 0,
3190				    "Temporary channel packet buffer length");
3191			}
3192		}
3193	}
3194
3195	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3196	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3197	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3198#if __FreeBSD_version < 1100095
3199	    hn_rx_stat_int_sysctl,
3200#else
3201	    hn_rx_stat_u64_sysctl,
3202#endif
3203	    "LU", "LRO queued");
3204	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3205	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3206	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3207#if __FreeBSD_version < 1100095
3208	    hn_rx_stat_int_sysctl,
3209#else
3210	    hn_rx_stat_u64_sysctl,
3211#endif
3212	    "LU", "LRO flushed");
3213	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3214	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3215	    __offsetof(struct hn_rx_ring, hn_lro_tried),
3216	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3217#if __FreeBSD_version >= 1100099
3218	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3219	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3220	    hn_lro_lenlim_sysctl, "IU",
3221	    "Max # of data bytes to be aggregated by LRO");
3222	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3223	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3224	    hn_lro_ackcnt_sysctl, "I",
3225	    "Max # of ACKs to be aggregated by LRO");
3226#endif
3227	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3228	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3229	    hn_trust_hcsum_sysctl, "I",
3230	    "Trust tcp segment verification on host side, "
3231	    "when csum info is missing");
3232	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3233	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3234	    hn_trust_hcsum_sysctl, "I",
3235	    "Trust udp datagram verification on host side, "
3236	    "when csum info is missing");
3237	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3238	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3239	    hn_trust_hcsum_sysctl, "I",
3240	    "Trust ip packet verification on host side, "
3241	    "when csum info is missing");
3242	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3243	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3244	    __offsetof(struct hn_rx_ring, hn_csum_ip),
3245	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3246	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3247	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3248	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
3249	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3250	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3251	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3252	    __offsetof(struct hn_rx_ring, hn_csum_udp),
3253	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3254	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3255	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3256	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
3257	    hn_rx_stat_ulong_sysctl, "LU",
3258	    "# of packets that we trust host's csum verification");
3259	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3260	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3261	    __offsetof(struct hn_rx_ring, hn_small_pkts),
3262	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3263	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3264	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3265	    __offsetof(struct hn_rx_ring, hn_ack_failed),
3266	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3267	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3268	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3269	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3270	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
3271
3272	return (0);
3273}
3274
3275static void
3276hn_destroy_rx_data(struct hn_softc *sc)
3277{
3278	int i;
3279
3280	if (sc->hn_rxbuf != NULL) {
3281		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
3282			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3283		else
3284			device_printf(sc->hn_dev, "RXBUF is referenced\n");
3285		sc->hn_rxbuf = NULL;
3286	}
3287
3288	if (sc->hn_rx_ring_cnt == 0)
3289		return;
3290
3291	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3292		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3293
3294		if (rxr->hn_br == NULL)
3295			continue;
3296		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
3297			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3298		} else {
3299			device_printf(sc->hn_dev,
3300			    "%dth channel bufring is referenced\n", i);
3301		}
3302		rxr->hn_br = NULL;
3303
3304#if defined(INET) || defined(INET6)
3305		tcp_lro_free(&rxr->hn_lro);
3306#endif
3307		free(rxr->hn_pktbuf, M_DEVBUF);
3308	}
3309	free(sc->hn_rx_ring, M_DEVBUF);
3310	sc->hn_rx_ring = NULL;
3311
3312	sc->hn_rx_ring_cnt = 0;
3313	sc->hn_rx_ring_inuse = 0;
3314}
3315
3316static int
3317hn_tx_ring_create(struct hn_softc *sc, int id)
3318{
3319	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3320	device_t dev = sc->hn_dev;
3321	bus_dma_tag_t parent_dtag;
3322	int error, i;
3323
3324	txr->hn_sc = sc;
3325	txr->hn_tx_idx = id;
3326
3327#ifndef HN_USE_TXDESC_BUFRING
3328	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3329#endif
3330	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3331
3332	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3333	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3334	    M_DEVBUF, M_WAITOK | M_ZERO);
3335#ifndef HN_USE_TXDESC_BUFRING
3336	SLIST_INIT(&txr->hn_txlist);
3337#else
3338	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3339	    M_WAITOK, &txr->hn_tx_lock);
3340#endif
3341
3342	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
3343		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
3344		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
3345	} else {
3346		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
3347	}
3348
3349#ifdef HN_IFSTART_SUPPORT
3350	if (hn_use_if_start) {
3351		txr->hn_txeof = hn_start_txeof;
3352		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3353		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
3354	} else
3355#endif
3356	{
3357		int br_depth;
3358
3359		txr->hn_txeof = hn_xmit_txeof;
3360		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3361		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3362
3363		br_depth = hn_get_txswq_depth(txr);
3364		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3365		    M_WAITOK, &txr->hn_tx_lock);
3366	}
3367
3368	txr->hn_direct_tx_size = hn_direct_tx_size;
3369
3370	/*
3371	 * Always schedule transmission instead of trying to do direct
3372	 * transmission.  This one gives the best performance so far.
3373	 */
3374	txr->hn_sched_tx = 1;
3375
3376	parent_dtag = bus_get_dma_tag(dev);
3377
3378	/* DMA tag for RNDIS packet messages. */
3379	error = bus_dma_tag_create(parent_dtag, /* parent */
3380	    HN_RNDIS_PKT_ALIGN,		/* alignment */
3381	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
3382	    BUS_SPACE_MAXADDR,		/* lowaddr */
3383	    BUS_SPACE_MAXADDR,		/* highaddr */
3384	    NULL, NULL,			/* filter, filterarg */
3385	    HN_RNDIS_PKT_LEN,		/* maxsize */
3386	    1,				/* nsegments */
3387	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
3388	    0,				/* flags */
3389	    NULL,			/* lockfunc */
3390	    NULL,			/* lockfuncarg */
3391	    &txr->hn_tx_rndis_dtag);
3392	if (error) {
3393		device_printf(dev, "failed to create rndis dmatag\n");
3394		return error;
3395	}
3396
3397	/* DMA tag for data. */
3398	error = bus_dma_tag_create(parent_dtag, /* parent */
3399	    1,				/* alignment */
3400	    HN_TX_DATA_BOUNDARY,	/* boundary */
3401	    BUS_SPACE_MAXADDR,		/* lowaddr */
3402	    BUS_SPACE_MAXADDR,		/* highaddr */
3403	    NULL, NULL,			/* filter, filterarg */
3404	    HN_TX_DATA_MAXSIZE,		/* maxsize */
3405	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
3406	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
3407	    0,				/* flags */
3408	    NULL,			/* lockfunc */
3409	    NULL,			/* lockfuncarg */
3410	    &txr->hn_tx_data_dtag);
3411	if (error) {
3412		device_printf(dev, "failed to create data dmatag\n");
3413		return error;
3414	}
3415
3416	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3417		struct hn_txdesc *txd = &txr->hn_txdesc[i];
3418
3419		txd->txr = txr;
3420		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3421		STAILQ_INIT(&txd->agg_list);
3422
3423		/*
3424		 * Allocate and load RNDIS packet message.
3425		 */
3426		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3427		    (void **)&txd->rndis_pkt,
3428		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3429		    &txd->rndis_pkt_dmap);
3430		if (error) {
3431			device_printf(dev,
3432			    "failed to allocate rndis_packet_msg, %d\n", i);
3433			return error;
3434		}
3435
3436		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3437		    txd->rndis_pkt_dmap,
3438		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3439		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
3440		    BUS_DMA_NOWAIT);
3441		if (error) {
3442			device_printf(dev,
3443			    "failed to load rndis_packet_msg, %d\n", i);
3444			bus_dmamem_free(txr->hn_tx_rndis_dtag,
3445			    txd->rndis_pkt, txd->rndis_pkt_dmap);
3446			return error;
3447		}
3448
3449		/* DMA map for TX data. */
3450		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
3451		    &txd->data_dmap);
3452		if (error) {
3453			device_printf(dev,
3454			    "failed to allocate tx data dmamap\n");
3455			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
3456			    txd->rndis_pkt_dmap);
3457			bus_dmamem_free(txr->hn_tx_rndis_dtag,
3458			    txd->rndis_pkt, txd->rndis_pkt_dmap);
3459			return error;
3460		}
3461
3462		/* All set, put it to list */
3463		txd->flags |= HN_TXD_FLAG_ONLIST;
3464#ifndef HN_USE_TXDESC_BUFRING
3465		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
3466#else
3467		buf_ring_enqueue(txr->hn_txdesc_br, txd);
3468#endif
3469	}
3470	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
3471
3472	if (sc->hn_tx_sysctl_tree != NULL) {
3473		struct sysctl_oid_list *child;
3474		struct sysctl_ctx_list *ctx;
3475		char name[16];
3476
3477		/*
3478		 * Create per TX ring sysctl tree:
3479		 * dev.hn.UNIT.tx.RINGID
3480		 */
3481		ctx = device_get_sysctl_ctx(dev);
3482		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
3483
3484		snprintf(name, sizeof(name), "%d", id);
3485		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
3486		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3487
3488		if (txr->hn_tx_sysctl_tree != NULL) {
3489			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
3490
3491#ifdef HN_DEBUG
3492			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
3493			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
3494			    "# of available TX descs");
3495#endif
3496#ifdef HN_IFSTART_SUPPORT
3497			if (!hn_use_if_start)
3498#endif
3499			{
3500				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
3501				    CTLFLAG_RD, &txr->hn_oactive, 0,
3502				    "over active");
3503			}
3504			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
3505			    CTLFLAG_RW, &txr->hn_pkts,
3506			    "# of packets transmitted");
3507			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
3508			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
3509		}
3510	}
3511
3512	return 0;
3513}
3514
3515static void
3516hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3517{
3518	struct hn_tx_ring *txr = txd->txr;
3519
3520	KASSERT(txd->m == NULL, ("still has mbuf installed"));
3521	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3522
3523	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3524	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3525	    txd->rndis_pkt_dmap);
3526	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
3527}
3528
3529static void
3530hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
3531{
3532
3533	KASSERT(txd->refs == 0 || txd->refs == 1,
3534	    ("invalid txd refs %d", txd->refs));
3535
3536	/* Aggregated txds will be freed by their aggregating txd. */
3537	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
3538		int freed;
3539
3540		freed = hn_txdesc_put(txr, txd);
3541		KASSERT(freed, ("can't free txdesc"));
3542	}
3543}
3544
3545static void
3546hn_tx_ring_destroy(struct hn_tx_ring *txr)
3547{
3548	int i;
3549
3550	if (txr->hn_txdesc == NULL)
3551		return;
3552
3553	/*
3554	 * NOTE:
3555	 * Because the freeing of aggregated txds will be deferred
3556	 * to the aggregating txd, two passes are used here:
3557	 * - The first pass GCes any pending txds.  This GC is necessary,
3558	 *   since if the channels are revoked, the hypervisor will not
3559	 *   deliver send-done for all pending txds.
3560	 * - The second pass frees the busdma resources, i.e. after all
3561	 *   txds have been freed.
3562	 */
3563	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3564		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
3565	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3566		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
3567
3568	if (txr->hn_tx_data_dtag != NULL)
3569		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3570	if (txr->hn_tx_rndis_dtag != NULL)
3571		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3572
3573#ifdef HN_USE_TXDESC_BUFRING
3574	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3575#endif
3576
3577	free(txr->hn_txdesc, M_DEVBUF);
3578	txr->hn_txdesc = NULL;
3579
3580	if (txr->hn_mbuf_br != NULL)
3581		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3582
3583#ifndef HN_USE_TXDESC_BUFRING
3584	mtx_destroy(&txr->hn_txlist_spin);
3585#endif
3586	mtx_destroy(&txr->hn_tx_lock);
3587}
3588
3589static int
3590hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
3591{
3592	struct sysctl_oid_list *child;
3593	struct sysctl_ctx_list *ctx;
3594	int i;
3595
3596	/*
3597	 * Create TXBUF for chimney sending.
3598	 *
3599	 * NOTE: It is shared by all channels.
3600	 */
3601	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
3602	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
3603	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
3604	if (sc->hn_chim == NULL) {
3605		device_printf(sc->hn_dev, "allocate txbuf failed\n");
3606		return (ENOMEM);
3607	}
3608
3609	sc->hn_tx_ring_cnt = ring_cnt;
3610	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3611
3612	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
3613	    M_DEVBUF, M_WAITOK | M_ZERO);
3614
3615	ctx = device_get_sysctl_ctx(sc->hn_dev);
3616	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
3617
3618	/* Create dev.hn.UNIT.tx sysctl tree */
3619	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
3620	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3621
3622	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3623		int error;
3624
3625		error = hn_tx_ring_create(sc, i);
3626		if (error)
3627			return error;
3628	}
3629
3630	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
3631	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3632	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
3633	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
3634	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
3635	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3636	    __offsetof(struct hn_tx_ring, hn_send_failed),
3637	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
3638	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
3639	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3640	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
3641	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
3642	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
3643	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3644	    __offsetof(struct hn_tx_ring, hn_flush_failed),
3645	    hn_tx_stat_ulong_sysctl, "LU",
3646	    "# of packet transmission aggregation flush failures");
3647	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
3648	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3649	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
3650	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
3651	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
3652	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3653	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
3654	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
3655	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
3656	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3657	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
3658	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
3659	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
3660	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
3661	    "# of total TX descs");
3662	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
3663	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
3664	    "Chimney send packet size upper boundary");
3665	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
3666	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3667	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
3668	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
3669	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3670	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
3671	    hn_tx_conf_int_sysctl, "I",
3672	    "Size of the packet for direct transmission");
3673	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
3674	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3675	    __offsetof(struct hn_tx_ring, hn_sched_tx),
3676	    hn_tx_conf_int_sysctl, "I",
3677	    "Always schedule transmission "
3678	    "instead of doing direct transmission");
3679	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
3680	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
3681	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
3682	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
3683	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
3684	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
3685	    "Applied packet transmission aggregation size");
3686	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
3687	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3688	    hn_txagg_pktmax_sysctl, "I",
3689	    "Applied packet transmission aggregation packets");
3690	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
3691	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3692	    hn_txagg_align_sysctl, "I",
3693	    "Applied packet transmission aggregation alignment");
3694
3695	return 0;
3696}
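/*
 * Illustration (assuming the sysctl ctx/child used above belong to the
 * device's sysctl tree, so the nodes land under dev.hn.<unit>): the
 * per-ring statistics and knobs created here can be inspected and tuned
 * from userland with sysctl(8), e.g.
 *
 *	# sysctl dev.hn.0.no_txdescs dev.hn.0.send_failed
 *	# sysctl dev.hn.0.tx_chimney_size=4096
 *
 * The _PROC handlers take an offset into struct hn_tx_ring, so a single
 * node covers the corresponding counter across all TX rings.
 */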
3697
3698static void
3699hn_set_chim_size(struct hn_softc *sc, int chim_size)
3700{
3701	int i;
3702
3703	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3704		sc->hn_tx_ring[i].hn_chim_size = chim_size;
3705}
3706
3707static void
3708hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
3709{
3710	struct ifnet *ifp = sc->hn_ifp;
3711	int tso_minlen;
3712
3713	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
3714		return;
3715
3716	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
3717	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
3718	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
3719
3720	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
3721	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
3722	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
3723
3724	if (tso_maxlen < tso_minlen)
3725		tso_maxlen = tso_minlen;
3726	else if (tso_maxlen > IP_MAXPACKET)
3727		tso_maxlen = IP_MAXPACKET;
3728	if (tso_maxlen > sc->hn_ndis_tso_szmax)
3729		tso_maxlen = sc->hn_ndis_tso_szmax;
3730	ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
3731	if (bootverbose)
3732		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
3733}
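/*
 * Worked example for the clamping above: with hn_ndis_tso_sgmin == 2 and
 * mtu == 1500, tso_minlen is 3000.  A requested tso_maxlen of 65536 is
 * first clamped to IP_MAXPACKET (65535), then to hn_ndis_tso_szmax if the
 * host advertises a smaller limit, and finally reduced by
 * ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN (14 + 4 = 18 bytes) before being
 * published as if_hw_tsomax.
 */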
3734
3735static void
3736hn_fixup_tx_data(struct hn_softc *sc)
3737{
3738	uint64_t csum_assist;
3739	int i;
3740
3741	hn_set_chim_size(sc, sc->hn_chim_szmax);
3742	if (hn_tx_chimney_size > 0 &&
3743	    hn_tx_chimney_size < sc->hn_chim_szmax)
3744		hn_set_chim_size(sc, hn_tx_chimney_size);
3745
3746	csum_assist = 0;
3747	if (sc->hn_caps & HN_CAP_IPCS)
3748		csum_assist |= CSUM_IP;
3749	if (sc->hn_caps & HN_CAP_TCP4CS)
3750		csum_assist |= CSUM_IP_TCP;
3751	if (sc->hn_caps & HN_CAP_UDP4CS)
3752		csum_assist |= CSUM_IP_UDP;
3753	if (sc->hn_caps & HN_CAP_TCP6CS)
3754		csum_assist |= CSUM_IP6_TCP;
3755	if (sc->hn_caps & HN_CAP_UDP6CS)
3756		csum_assist |= CSUM_IP6_UDP;
3757	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3758		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
3759
3760	if (sc->hn_caps & HN_CAP_HASHVAL) {
3761		/*
3762		 * Support HASHVAL pktinfo on TX path.
3763		 */
3764		if (bootverbose)
3765			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
3766		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3767			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
3768	}
3769}
3770
3771static void
3772hn_destroy_tx_data(struct hn_softc *sc)
3773{
3774	int i;
3775
3776	if (sc->hn_chim != NULL) {
3777		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
3778			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
3779		} else {
3780			device_printf(sc->hn_dev,
3781			    "chimney sending buffer is referenced");
3782		}
3783		sc->hn_chim = NULL;
3784	}
3785
3786	if (sc->hn_tx_ring_cnt == 0)
3787		return;
3788
3789	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3790		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
3791
3792	free(sc->hn_tx_ring, M_DEVBUF);
3793	sc->hn_tx_ring = NULL;
3794
3795	sc->hn_tx_ring_cnt = 0;
3796	sc->hn_tx_ring_inuse = 0;
3797}
3798
3799#ifdef HN_IFSTART_SUPPORT
3800
3801static void
3802hn_start_taskfunc(void *xtxr, int pending __unused)
3803{
3804	struct hn_tx_ring *txr = xtxr;
3805
3806	mtx_lock(&txr->hn_tx_lock);
3807	hn_start_locked(txr, 0);
3808	mtx_unlock(&txr->hn_tx_lock);
3809}
3810
3811static int
3812hn_start_locked(struct hn_tx_ring *txr, int len)
3813{
3814	struct hn_softc *sc = txr->hn_sc;
3815	struct ifnet *ifp = sc->hn_ifp;
3816	int sched = 0;
3817
3818	KASSERT(hn_use_if_start,
3819	    ("hn_start_locked is called when if_start is disabled"));
3820	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3821	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3822	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3823
3824	if (__predict_false(txr->hn_suspended))
3825		return (0);
3826
3827	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
3828	    IFF_DRV_RUNNING)
3829		return (0);
3830
3831	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
3832		struct hn_txdesc *txd;
3833		struct mbuf *m_head;
3834		int error;
3835
3836		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
3837		if (m_head == NULL)
3838			break;
3839
3840		if (len > 0 && m_head->m_pkthdr.len > len) {
3841			/*
3842			 * Sending this packet could be time consuming; let
3843			 * the caller dispatch this packet (and any follow-up
3844			 * packets) to the TX taskqueue.
3845			 */
3846			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3847			sched = 1;
3848			break;
3849		}
3850
3851#if defined(INET6) || defined(INET)
3852		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3853			m_head = hn_tso_fixup(m_head);
3854			if (__predict_false(m_head == NULL)) {
3855				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3856				continue;
3857			}
3858		}
3859#endif
3860
3861		txd = hn_txdesc_get(txr);
3862		if (txd == NULL) {
3863			txr->hn_no_txdescs++;
3864			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3865			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3866			break;
3867		}
3868
3869		error = hn_encap(ifp, txr, txd, &m_head);
3870		if (error) {
3871			/* Both txd and m_head are freed */
3872			KASSERT(txr->hn_agg_txd == NULL,
3873			    ("encap failed w/ pending aggregating txdesc"));
3874			continue;
3875		}
3876
3877		if (txr->hn_agg_pktleft == 0) {
3878			if (txr->hn_agg_txd != NULL) {
3879				KASSERT(m_head == NULL,
3880				    ("pending mbuf for aggregating txdesc"));
3881				error = hn_flush_txagg(ifp, txr);
3882				if (__predict_false(error)) {
3883					atomic_set_int(&ifp->if_drv_flags,
3884					    IFF_DRV_OACTIVE);
3885					break;
3886				}
3887			} else {
3888				KASSERT(m_head != NULL, ("mbuf was freed"));
3889				error = hn_txpkt(ifp, txr, txd);
3890				if (__predict_false(error)) {
3891					/* txd is freed, but m_head is not */
3892					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3893					atomic_set_int(&ifp->if_drv_flags,
3894					    IFF_DRV_OACTIVE);
3895					break;
3896				}
3897			}
3898		}
3899#ifdef INVARIANTS
3900		else {
3901			KASSERT(txr->hn_agg_txd != NULL,
3902			    ("no aggregating txdesc"));
3903			KASSERT(m_head == NULL,
3904			    ("pending mbuf for aggregating txdesc"));
3905		}
3906#endif
3907	}
3908
3909	/* Flush pending aggregated transmission. */
3910	if (txr->hn_agg_txd != NULL)
3911		hn_flush_txagg(ifp, txr);
3912	return (sched);
3913}
3914
3915static void
3916hn_start(struct ifnet *ifp)
3917{
3918	struct hn_softc *sc = ifp->if_softc;
3919	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
3920
3921	if (txr->hn_sched_tx)
3922		goto do_sched;
3923
3924	if (mtx_trylock(&txr->hn_tx_lock)) {
3925		int sched;
3926
3927		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3928		mtx_unlock(&txr->hn_tx_lock);
3929		if (!sched)
3930			return;
3931	}
3932do_sched:
3933	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
3934}
3935
3936static void
3937hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
3938{
3939	struct hn_tx_ring *txr = xtxr;
3940
3941	mtx_lock(&txr->hn_tx_lock);
3942	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
3943	hn_start_locked(txr, 0);
3944	mtx_unlock(&txr->hn_tx_lock);
3945}
3946
3947static void
3948hn_start_txeof(struct hn_tx_ring *txr)
3949{
3950	struct hn_softc *sc = txr->hn_sc;
3951	struct ifnet *ifp = sc->hn_ifp;
3952
3953	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3954
3955	if (txr->hn_sched_tx)
3956		goto do_sched;
3957
3958	if (mtx_trylock(&txr->hn_tx_lock)) {
3959		int sched;
3960
3961		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3962		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3963		mtx_unlock(&txr->hn_tx_lock);
3964		if (sched) {
3965			taskqueue_enqueue(txr->hn_tx_taskq,
3966			    &txr->hn_tx_task);
3967		}
3968	} else {
3969do_sched:
3970		/*
3971		 * Release OACTIVE earlier, in the hope that others
3972		 * can catch up.  The task will clear the flag again
3973		 * while holding hn_tx_lock, to avoid possible
3974		 * races.
3975		 */
3976		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3977		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
3978	}
3979}
3980
3981#endif	/* HN_IFSTART_SUPPORT */
3982
3983static int
3984hn_xmit(struct hn_tx_ring *txr, int len)
3985{
3986	struct hn_softc *sc = txr->hn_sc;
3987	struct ifnet *ifp = sc->hn_ifp;
3988	struct mbuf *m_head;
3989	int sched = 0;
3990
3991	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3992#ifdef HN_IFSTART_SUPPORT
3993	KASSERT(hn_use_if_start == 0,
3994	    ("hn_xmit is called when if_start is enabled"));
3995#endif
3996	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3997
3998	if (__predict_false(txr->hn_suspended))
3999		return (0);
4000
4001	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
4002		return (0);
4003
4004	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
4005		struct hn_txdesc *txd;
4006		int error;
4007
4008		if (len > 0 && m_head->m_pkthdr.len > len) {
4009			/*
4010			 * Sending this packet could be time consuming; let
4011			 * the caller dispatch this packet (and any follow-up
4012			 * packets) to the TX taskqueue.
4013			 */
4014			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4015			sched = 1;
4016			break;
4017		}
4018
4019		txd = hn_txdesc_get(txr);
4020		if (txd == NULL) {
4021			txr->hn_no_txdescs++;
4022			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4023			txr->hn_oactive = 1;
4024			break;
4025		}
4026
4027		error = hn_encap(ifp, txr, txd, &m_head);
4028		if (error) {
4029			/* Both txd and m_head are freed; discard */
4030			KASSERT(txr->hn_agg_txd == NULL,
4031			    ("encap failed w/ pending aggregating txdesc"));
4032			drbr_advance(ifp, txr->hn_mbuf_br);
4033			continue;
4034		}
4035
4036		if (txr->hn_agg_pktleft == 0) {
4037			if (txr->hn_agg_txd != NULL) {
4038				KASSERT(m_head == NULL,
4039				    ("pending mbuf for aggregating txdesc"));
4040				error = hn_flush_txagg(ifp, txr);
4041				if (__predict_false(error)) {
4042					txr->hn_oactive = 1;
4043					break;
4044				}
4045			} else {
4046				KASSERT(m_head != NULL, ("mbuf was freed"));
4047				error = hn_txpkt(ifp, txr, txd);
4048				if (__predict_false(error)) {
4049					/* txd is freed, but m_head is not */
4050					drbr_putback(ifp, txr->hn_mbuf_br,
4051					    m_head);
4052					txr->hn_oactive = 1;
4053					break;
4054				}
4055			}
4056		}
4057#ifdef INVARIANTS
4058		else {
4059			KASSERT(txr->hn_agg_txd != NULL,
4060			    ("no aggregating txdesc"));
4061			KASSERT(m_head == NULL,
4062			    ("pending mbuf for aggregating txdesc"));
4063		}
4064#endif
4065
4066		/* Sent */
4067		drbr_advance(ifp, txr->hn_mbuf_br);
4068	}
4069
4070	/* Flush pending aggregated transmission. */
4071	if (txr->hn_agg_txd != NULL)
4072		hn_flush_txagg(ifp, txr);
4073	return (sched);
4074}
4075
4076static int
4077hn_transmit(struct ifnet *ifp, struct mbuf *m)
4078{
4079	struct hn_softc *sc = ifp->if_softc;
4080	struct hn_tx_ring *txr;
4081	int error, idx = 0;
4082
4083#if defined(INET6) || defined(INET)
4084	/*
4085	 * Perform TSO packet header fixup now, since the TSO
4086	 * packet header should be cache-hot.
4087	 */
4088	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4089		m = hn_tso_fixup(m);
4090		if (__predict_false(m == NULL)) {
4091			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4092			return EIO;
4093		}
4094	}
4095#endif
4096
4097	/*
4098	 * Select the TX ring based on flowid
4099	 */
4100	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
4101		idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4102	txr = &sc->hn_tx_ring[idx];
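	/*
	 * E.g. with 4 TX rings in use, a packet whose flowid is 0x1234567a
	 * (0x1234567a % 4 == 2) is enqueued on TX ring 2, while packets
	 * without a hash type stay on ring 0.
	 */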
4103
4104	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4105	if (error) {
4106		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4107		return error;
4108	}
4109
4110	if (txr->hn_oactive)
4111		return 0;
4112
4113	if (txr->hn_sched_tx)
4114		goto do_sched;
4115
4116	if (mtx_trylock(&txr->hn_tx_lock)) {
4117		int sched;
4118
4119		sched = hn_xmit(txr, txr->hn_direct_tx_size);
4120		mtx_unlock(&txr->hn_tx_lock);
4121		if (!sched)
4122			return 0;
4123	}
4124do_sched:
4125	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4126	return 0;
4127}
4128
4129static void
4130hn_tx_ring_qflush(struct hn_tx_ring *txr)
4131{
4132	struct mbuf *m;
4133
4134	mtx_lock(&txr->hn_tx_lock);
4135	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4136		m_freem(m);
4137	mtx_unlock(&txr->hn_tx_lock);
4138}
4139
4140static void
4141hn_xmit_qflush(struct ifnet *ifp)
4142{
4143	struct hn_softc *sc = ifp->if_softc;
4144	int i;
4145
4146	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4147		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4148	if_qflush(ifp);
4149}
4150
4151static void
4152hn_xmit_txeof(struct hn_tx_ring *txr)
4153{
4154
4155	if (txr->hn_sched_tx)
4156		goto do_sched;
4157
4158	if (mtx_trylock(&txr->hn_tx_lock)) {
4159		int sched;
4160
4161		txr->hn_oactive = 0;
4162		sched = hn_xmit(txr, txr->hn_direct_tx_size);
4163		mtx_unlock(&txr->hn_tx_lock);
4164		if (sched) {
4165			taskqueue_enqueue(txr->hn_tx_taskq,
4166			    &txr->hn_tx_task);
4167		}
4168	} else {
4169do_sched:
4170		/*
4171		 * Release oactive earlier, in the hope that others
4172		 * can catch up.  The task will clear oactive again
4173		 * while holding hn_tx_lock, to avoid possible
4174		 * races.
4175		 */
4176		txr->hn_oactive = 0;
4177		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4178	}
4179}
4180
4181static void
4182hn_xmit_taskfunc(void *xtxr, int pending __unused)
4183{
4184	struct hn_tx_ring *txr = xtxr;
4185
4186	mtx_lock(&txr->hn_tx_lock);
4187	hn_xmit(txr, 0);
4188	mtx_unlock(&txr->hn_tx_lock);
4189}
4190
4191static void
4192hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4193{
4194	struct hn_tx_ring *txr = xtxr;
4195
4196	mtx_lock(&txr->hn_tx_lock);
4197	txr->hn_oactive = 0;
4198	hn_xmit(txr, 0);
4199	mtx_unlock(&txr->hn_tx_lock);
4200}
4201
4202static int
4203hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4204{
4205	struct vmbus_chan_br cbr;
4206	struct hn_rx_ring *rxr;
4207	struct hn_tx_ring *txr = NULL;
4208	int idx, error;
4209
4210	idx = vmbus_chan_subidx(chan);
4211
4212	/*
4213	 * Link this channel to RX/TX ring.
4214	 */
4215	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4216	    ("invalid channel index %d, should be >= 0 && < %d",
4217	     idx, sc->hn_rx_ring_inuse));
4218	rxr = &sc->hn_rx_ring[idx];
4219	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4220	    ("RX ring %d already attached", idx));
4221	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4222
4223	if (bootverbose) {
4224		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4225		    idx, vmbus_chan_id(chan));
4226	}
4227
4228	if (idx < sc->hn_tx_ring_inuse) {
4229		txr = &sc->hn_tx_ring[idx];
4230		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4231		    ("TX ring %d already attached", idx));
4232		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4233
4234		txr->hn_chan = chan;
4235		if (bootverbose) {
4236			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4237			    idx, vmbus_chan_id(chan));
4238		}
4239	}
4240
4241	/* Bind this channel to a proper CPU. */
4242	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
4243
4244	/*
4245	 * Open this channel
4246	 */
4247	cbr.cbr = rxr->hn_br;
4248	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4249	cbr.cbr_txsz = HN_TXBR_SIZE;
4250	cbr.cbr_rxsz = HN_RXBR_SIZE;
4251	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4252	if (error) {
4253		if (error == EISCONN) {
4254			if_printf(sc->hn_ifp, "bufring is connected after "
4255			    "chan%u open failure\n", vmbus_chan_id(chan));
4256			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4257		} else {
4258			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4259			    vmbus_chan_id(chan), error);
4260		}
4261	}
4262	return (error);
4263}
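/*
 * NOTE: rxr->hn_br is one contiguous buffer shared by both directions;
 * the vmbus_chan_br filled in above merely tells vmbus how to split it
 * into an HN_TXBR_SIZE byte TX bufring and an HN_RXBR_SIZE byte RX
 * bufring when the channel is opened.
 */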
4264
4265static void
4266hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4267{
4268	struct hn_rx_ring *rxr;
4269	int idx, error;
4270
4271	idx = vmbus_chan_subidx(chan);
4272
4273	/*
4274	 * Link this channel to RX/TX ring.
4275	 */
4276	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4277	    ("invalid channel index %d, should be >= 0 && < %d",
4278	     idx, sc->hn_rx_ring_inuse));
4279	rxr = &sc->hn_rx_ring[idx];
4280	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4281	    ("RX ring %d is not attached", idx));
4282	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4283
4284	if (idx < sc->hn_tx_ring_inuse) {
4285		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4286
4287		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4288		    ("TX ring %d is not attached", idx));
4289		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4290	}
4291
4292	/*
4293	 * Close this channel.
4294	 *
4295	 * NOTE:
4296	 * Channel closing does _not_ destroy the target channel.
4297	 */
4298	error = vmbus_chan_close_direct(chan);
4299	if (error == EISCONN) {
4300		if_printf(sc->hn_ifp, "chan%u bufring is connected "
4301		    "after being closed\n", vmbus_chan_id(chan));
4302		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4303	} else if (error) {
4304		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
4305		    vmbus_chan_id(chan), error);
4306	}
4307}
4308
4309static int
4310hn_attach_subchans(struct hn_softc *sc)
4311{
4312	struct vmbus_channel **subchans;
4313	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4314	int i, error = 0;
4315
4316	KASSERT(subchan_cnt > 0, ("no sub-channels"));
4317
4318	/* Attach the sub-channels. */
4319	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4320	for (i = 0; i < subchan_cnt; ++i) {
4321		int error1;
4322
4323		error1 = hn_chan_attach(sc, subchans[i]);
4324		if (error1) {
4325			error = error1;
4326			/* Move on; all channels will be detached later. */
4327		}
4328	}
4329	vmbus_subchan_rel(subchans, subchan_cnt);
4330
4331	if (error) {
4332		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4333	} else {
4334		if (bootverbose) {
4335			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
4336			    subchan_cnt);
4337		}
4338	}
4339	return (error);
4340}
4341
4342static void
4343hn_detach_allchans(struct hn_softc *sc)
4344{
4345	struct vmbus_channel **subchans;
4346	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4347	int i;
4348
4349	if (subchan_cnt == 0)
4350		goto back;
4351
4352	/* Detach the sub-channels. */
4353	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4354	for (i = 0; i < subchan_cnt; ++i)
4355		hn_chan_detach(sc, subchans[i]);
4356	vmbus_subchan_rel(subchans, subchan_cnt);
4357
4358back:
4359	/*
4360	 * Detach the primary channel, _after_ all sub-channels
4361	 * are detached.
4362	 */
4363	hn_chan_detach(sc, sc->hn_prichan);
4364
4365	/* Wait for sub-channels to be destroyed, if any. */
4366	vmbus_subchan_drain(sc->hn_prichan);
4367
4368#ifdef INVARIANTS
4369	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4370		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4371		    HN_RX_FLAG_ATTACHED) == 0,
4372		    ("%dth RX ring is still attached", i));
4373	}
4374	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4375		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4376		    HN_TX_FLAG_ATTACHED) == 0,
4377		    ("%dth TX ring is still attached", i));
4378	}
4379#endif
4380}
4381
4382static int
4383hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4384{
4385	struct vmbus_channel **subchans;
4386	int nchan, rxr_cnt, error;
4387
4388	nchan = *nsubch + 1;
4389	if (nchan == 1) {
4390		/*
4391		 * Multiple RX/TX rings are not requested.
4392		 */
4393		*nsubch = 0;
4394		return (0);
4395	}
4396
4397	/*
4398	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
4399	 * table entries.
4400	 */
4401	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
4402	if (error) {
4403		/* No RSS; this is benign. */
4404		*nsubch = 0;
4405		return (0);
4406	}
4407	if (bootverbose) {
4408		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
4409		    rxr_cnt, nchan);
4410	}
4411
4412	if (nchan > rxr_cnt)
4413		nchan = rxr_cnt;
4414	if (nchan == 1) {
4415		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
4416		*nsubch = 0;
4417		return (0);
4418	}
4419
4420	/*
4421	 * Allocate sub-channels from NVS.
4422	 */
4423	*nsubch = nchan - 1;
4424	error = hn_nvs_alloc_subchans(sc, nsubch);
4425	if (error || *nsubch == 0) {
4426		/* Failed to allocate sub-channels. */
4427		*nsubch = 0;
4428		return (0);
4429	}
4430
4431	/*
4432	 * Wait for all sub-channels to become ready before moving on.
4433	 */
4434	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
4435	vmbus_subchan_rel(subchans, *nsubch);
4436	return (0);
4437}
4438
4439static bool
4440hn_synth_attachable(const struct hn_softc *sc)
4441{
4442	int i;
4443
4444	if (sc->hn_flags & HN_FLAG_ERRORS)
4445		return (false);
4446
4447	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4448		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4449
4450		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
4451			return (false);
4452	}
4453	return (true);
4454}
4455
4456static int
4457hn_synth_attach(struct hn_softc *sc, int mtu)
4458{
4459#define ATTACHED_NVS		0x0002
4460#define ATTACHED_RNDIS		0x0004
4461
4462	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
4463	int error, nsubch, nchan, i;
4464	uint32_t old_caps, attached = 0;
4465
4466	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
4467	    ("synthetic parts were attached"));
4468
4469	if (!hn_synth_attachable(sc))
4470		return (ENXIO);
4471
4472	/* Save capabilities for later verification. */
4473	old_caps = sc->hn_caps;
4474	sc->hn_caps = 0;
4475
4476	/* Clear RSS stuffs. */
4477	sc->hn_rss_ind_size = 0;
4478	sc->hn_rss_hash = 0;
4479
4480	/*
4481	 * Attach the primary channel _before_ attaching NVS and RNDIS.
4482	 */
4483	error = hn_chan_attach(sc, sc->hn_prichan);
4484	if (error)
4485		goto failed;
4486
4487	/*
4488	 * Attach NVS.
4489	 */
4490	error = hn_nvs_attach(sc, mtu);
4491	if (error)
4492		goto failed;
4493	attached |= ATTACHED_NVS;
4494
4495	/*
4496	 * Attach RNDIS _after_ NVS is attached.
4497	 */
4498	error = hn_rndis_attach(sc, mtu);
4499	if (error)
4500		goto failed;
4501	attached |= ATTACHED_RNDIS;
4502
4503	/*
4504	 * Make sure capabilities are not changed.
4505	 */
4506	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
4507		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
4508		    old_caps, sc->hn_caps);
4509		error = ENXIO;
4510		goto failed;
4511	}
4512
4513	/*
4514	 * Allocate sub-channels for multi-TX/RX rings.
4515	 *
4516	 * NOTE:
4517	 * The # of RX rings that can be used is equivalent to the # of
4518	 * channels to be requested.
4519	 */
4520	nsubch = sc->hn_rx_ring_cnt - 1;
4521	error = hn_synth_alloc_subchans(sc, &nsubch);
4522	if (error)
4523		goto failed;
4524	/* NOTE: _Full_ synthetic parts detach is required now. */
4525	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
4526
4527	/*
4528	 * Set the # of TX/RX rings that could be used according to
4529	 * the # of channels that NVS offered.
4530	 */
4531	nchan = nsubch + 1;
4532	hn_set_ring_inuse(sc, nchan);
4533	if (nchan == 1) {
4534		/* Only the primary channel can be used; done */
4535		goto back;
4536	}
4537
4538	/*
4539	 * Attach the sub-channels.
4540	 *
4541	 * NOTE: hn_set_ring_inuse() _must_ have been called.
4542	 */
4543	error = hn_attach_subchans(sc);
4544	if (error)
4545		goto failed;
4546
4547	/*
4548	 * Configure RSS key and indirect table _after_ all sub-channels
4549	 * are attached.
4550	 */
4551	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
4552		/*
4553		 * RSS key is not set yet; set it to the default RSS key.
4554		 */
4555		if (bootverbose)
4556			if_printf(sc->hn_ifp, "setup default RSS key\n");
4557		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
4558		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4559	}
4560
4561	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
4562		/*
4563		 * RSS indirect table is not set yet; set it up in round-
4564		 * robin fashion.
4565		 */
4566		if (bootverbose) {
4567			if_printf(sc->hn_ifp, "setup default RSS indirect "
4568			    "table\n");
4569		}
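		/*
		 * E.g. with nchan == 4 the indirect table below becomes
		 * 0,1,2,3,0,1,2,3,... so RX traffic is spread evenly
		 * across the usable channels.
		 */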
4570		for (i = 0; i < NDIS_HASH_INDCNT; ++i)
4571			rss->rss_ind[i] = i % nchan;
4572		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4573	} else {
4574		/*
4575		 * # of usable channels may be changed, so we have to
4576		 * make sure that all entries in RSS indirect table
4577		 * are valid.
4578		 *
4579		 * NOTE: hn_set_ring_inuse() _must_ have been called.
4580		 */
4581		hn_rss_ind_fixup(sc);
4582	}
4583
4584	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
4585	if (error)
4586		goto failed;
4587back:
4588	/*
4589	 * Fixup transmission aggregation setup.
4590	 */
4591	hn_set_txagg(sc);
4592	return (0);
4593
4594failed:
4595	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
4596		hn_synth_detach(sc);
4597	} else {
4598		if (attached & ATTACHED_RNDIS)
4599			hn_rndis_detach(sc);
4600		if (attached & ATTACHED_NVS)
4601			hn_nvs_detach(sc);
4602		hn_chan_detach(sc, sc->hn_prichan);
4603		/* Restore old capabilities. */
4604		sc->hn_caps = old_caps;
4605	}
4606	return (error);
4607
4608#undef ATTACHED_RNDIS
4609#undef ATTACHED_NVS
4610}
4611
4612/*
4613 * NOTE:
4614 * The interface must have been suspended through hn_suspend(), before
4615 * this function gets called.
4616 */
4617static void
4618hn_synth_detach(struct hn_softc *sc)
4619{
4620
4621	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4622	    ("synthetic parts were not attached"));
4623
4624	/* Detach the RNDIS first. */
4625	hn_rndis_detach(sc);
4626
4627	/* Detach NVS. */
4628	hn_nvs_detach(sc);
4629
4630	/* Detach all of the channels. */
4631	hn_detach_allchans(sc);
4632
4633	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
4634}
4635
4636static void
4637hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
4638{
4639	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
4640	    ("invalid ring count %d", ring_cnt));
4641
4642	if (sc->hn_tx_ring_cnt > ring_cnt)
4643		sc->hn_tx_ring_inuse = ring_cnt;
4644	else
4645		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4646	sc->hn_rx_ring_inuse = ring_cnt;
4647
4648	if (bootverbose) {
4649		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
4650		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
4651	}
4652}
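/*
 * E.g. if 8 channels are usable but only 4 TX rings were created,
 * hn_set_ring_inuse(sc, 8) leaves hn_tx_ring_inuse at 4 while
 * hn_rx_ring_inuse becomes 8.
 */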
4653
4654static void
4655hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
4656{
4657
4658	/*
4659	 * NOTE:
4660	 * The TX bufring will not be drained by the hypervisor
4661	 * if the primary channel is revoked.
4662	 */
4663	while (!vmbus_chan_rx_empty(chan) ||
4664	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
4665	     !vmbus_chan_tx_empty(chan)))
4666		pause("waitch", 1);
4667	vmbus_chan_intr_drain(chan);
4668}
4669
4670static void
4671hn_suspend_data(struct hn_softc *sc)
4672{
4673	struct vmbus_channel **subch = NULL;
4674	struct hn_tx_ring *txr;
4675	int i, nsubch;
4676
4677	HN_LOCK_ASSERT(sc);
4678
4679	/*
4680	 * Suspend TX.
4681	 */
4682	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4683		txr = &sc->hn_tx_ring[i];
4684
4685		mtx_lock(&txr->hn_tx_lock);
4686		txr->hn_suspended = 1;
4687		mtx_unlock(&txr->hn_tx_lock);
4688		/* No one is able to send more packets now. */
4689
4690		/*
4691		 * Wait for all pending sends to finish.
4692		 *
4693		 * NOTE:
4694		 * We will _not_ receive all pending send-done if the
4695		 * primary channel is revoked.
4696		 */
4697		while (hn_tx_ring_pending(txr) &&
4698		    !vmbus_chan_is_revoked(sc->hn_prichan))
4699			pause("hnwtx", 1 /* 1 tick */);
4700	}
4701
4702	/*
4703	 * Disable RX by clearing RX filter.
4704	 */
4705	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
4706	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter);
4707
4708	/*
4709	 * Give RNDIS enough time to flush all pending data packets.
4710	 */
4711	pause("waitrx", (200 * hz) / 1000);
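	/* (200 * hz) / 1000 ticks is 200ms, regardless of the tick rate. */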
4712
4713	/*
4714	 * Drain RX/TX bufrings and interrupts.
4715	 */
4716	nsubch = sc->hn_rx_ring_inuse - 1;
4717	if (nsubch > 0)
4718		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4719
4720	if (subch != NULL) {
4721		for (i = 0; i < nsubch; ++i)
4722			hn_chan_drain(sc, subch[i]);
4723	}
4724	hn_chan_drain(sc, sc->hn_prichan);
4725
4726	if (subch != NULL)
4727		vmbus_subchan_rel(subch, nsubch);
4728
4729	/*
4730	 * Drain any pending TX tasks.
4731	 *
4732	 * NOTE:
4733	 * The above hn_chan_drain() can dispatch TX tasks, so the TX
4734	 * tasks will have to be drained _after_ the above hn_chan_drain()
4735	 * calls.
4736	 */
4737	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4738		txr = &sc->hn_tx_ring[i];
4739
4740		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
4741		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
4742	}
4743}
4744
4745static void
4746hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
4747{
4748
4749	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
4750}
4751
4752static void
4753hn_suspend_mgmt(struct hn_softc *sc)
4754{
4755	struct task task;
4756
4757	HN_LOCK_ASSERT(sc);
4758
4759	/*
4760	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
4761	 * through hn_mgmt_taskq.
4762	 */
4763	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
4764	vmbus_chan_run_task(sc->hn_prichan, &task);
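	/*
	 * Running the clearing task on the primary channel's task thread
	 * (and waiting for it to complete) serializes the NULL assignment
	 * with any channel callback that may still read hn_mgmt_taskq.
	 */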
4765
4766	/*
4767	 * Make sure that all pending management tasks are completed.
4768	 */
4769	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
4770	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
4771	taskqueue_drain_all(sc->hn_mgmt_taskq0);
4772}
4773
4774static void
4775hn_suspend(struct hn_softc *sc)
4776{
4777
4778	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
4779		hn_suspend_data(sc);
4780	hn_suspend_mgmt(sc);
4781}
4782
4783static void
4784hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
4785{
4786	int i;
4787
4788	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
4789	    ("invalid TX ring count %d", tx_ring_cnt));
4790
4791	for (i = 0; i < tx_ring_cnt; ++i) {
4792		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4793
4794		mtx_lock(&txr->hn_tx_lock);
4795		txr->hn_suspended = 0;
4796		mtx_unlock(&txr->hn_tx_lock);
4797	}
4798}
4799
4800static void
4801hn_resume_data(struct hn_softc *sc)
4802{
4803	int i;
4804
4805	HN_LOCK_ASSERT(sc);
4806
4807	/*
4808	 * Re-enable RX.
4809	 */
4810	hn_set_rxfilter(sc);
4811
4812	/*
4813	 * Make sure to clear suspend status on "all" TX rings,
4814	 * since hn_tx_ring_inuse can be changed after
4815	 * hn_suspend_data().
4816	 */
4817	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
4818
4819#ifdef HN_IFSTART_SUPPORT
4820	if (!hn_use_if_start)
4821#endif
4822	{
4823		/*
4824		 * Flush unused drbrs, since hn_tx_ring_inuse may be
4825		 * reduced.
4826		 */
4827		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
4828			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4829	}
4830
4831	/*
4832	 * Kick start TX.
4833	 */
4834	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4835		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4836
4837		/*
4838		 * Use txeof task, so that any pending oactive can be
4839		 * cleared properly.
4840		 */
4841		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4842	}
4843}
4844
4845static void
4846hn_resume_mgmt(struct hn_softc *sc)
4847{
4848
4849	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
4850
4851	/*
4852	 * Kick off network change detection, if it was pending.
4853	 * If no network change was pending, start link status
4854	 * checks, which is more lightweight than network change
4855	 * detection.
4856	 */
4857	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
4858		hn_change_network(sc);
4859	else
4860		hn_update_link_status(sc);
4861}
4862
4863static void
4864hn_resume(struct hn_softc *sc)
4865{
4866
4867	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
4868		hn_resume_data(sc);
4869	hn_resume_mgmt(sc);
4870}
4871
4872static void
4873hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
4874{
4875	const struct rndis_status_msg *msg;
4876	int ofs;
4877
4878	if (dlen < sizeof(*msg)) {
4879		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
4880		return;
4881	}
4882	msg = data;
4883
4884	switch (msg->rm_status) {
4885	case RNDIS_STATUS_MEDIA_CONNECT:
4886	case RNDIS_STATUS_MEDIA_DISCONNECT:
4887		hn_update_link_status(sc);
4888		break;
4889
4890	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
4891		/* Not really useful; ignore. */
4892		break;
4893
4894	case RNDIS_STATUS_NETWORK_CHANGE:
4895		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
4896		if (dlen < ofs + msg->rm_stbuflen ||
4897		    msg->rm_stbuflen < sizeof(uint32_t)) {
4898			if_printf(sc->hn_ifp, "network changed\n");
4899		} else {
4900			uint32_t change;
4901
4902			memcpy(&change, ((const uint8_t *)msg) + ofs,
4903			    sizeof(change));
4904			if_printf(sc->hn_ifp, "network changed, change %u\n",
4905			    change);
4906		}
4907		hn_change_network(sc);
4908		break;
4909
4910	default:
4911		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
4912		    msg->rm_status);
4913		break;
4914	}
4915}
4916
4917static int
4918hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
4919{
4920	const struct rndis_pktinfo *pi = info_data;
4921	uint32_t mask = 0;
4922
4923	while (info_dlen != 0) {
4924		const void *data;
4925		uint32_t dlen;
4926
4927		if (__predict_false(info_dlen < sizeof(*pi)))
4928			return (EINVAL);
4929		if (__predict_false(info_dlen < pi->rm_size))
4930			return (EINVAL);
4931		info_dlen -= pi->rm_size;
4932
4933		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
4934			return (EINVAL);
4935		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
4936			return (EINVAL);
4937		dlen = pi->rm_size - pi->rm_pktinfooffset;
4938		data = pi->rm_data;
4939
4940		switch (pi->rm_type) {
4941		case NDIS_PKTINFO_TYPE_VLAN:
4942			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
4943				return (EINVAL);
4944			info->vlan_info = *((const uint32_t *)data);
4945			mask |= HN_RXINFO_VLAN;
4946			break;
4947
4948		case NDIS_PKTINFO_TYPE_CSUM:
4949			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
4950				return (EINVAL);
4951			info->csum_info = *((const uint32_t *)data);
4952			mask |= HN_RXINFO_CSUM;
4953			break;
4954
4955		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
4956			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
4957				return (EINVAL);
4958			info->hash_value = *((const uint32_t *)data);
4959			mask |= HN_RXINFO_HASHVAL;
4960			break;
4961
4962		case HN_NDIS_PKTINFO_TYPE_HASHINF:
4963			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
4964				return (EINVAL);
4965			info->hash_info = *((const uint32_t *)data);
4966			mask |= HN_RXINFO_HASHINF;
4967			break;
4968
4969		default:
4970			goto next;
4971		}
4972
4973		if (mask == HN_RXINFO_ALL) {
4974			/* All found; done */
4975			break;
4976		}
4977next:
4978		pi = (const struct rndis_pktinfo *)
4979		    ((const uint8_t *)pi + pi->rm_size);
4980	}
4981
4982	/*
4983	 * Final fixup.
4984	 * - If there is no hash value, invalidate the hash info.
4985	 */
4986	if ((mask & HN_RXINFO_HASHVAL) == 0)
4987		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
4988	return (0);
4989}
4990
4991static __inline bool
4992hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
4993{
4994
4995	if (off < check_off) {
4996		if (__predict_true(off + len <= check_off))
4997			return (false);
4998	} else if (off > check_off) {
4999		if (__predict_true(check_off + check_len <= off))
5000			return (false);
5001	}
5002	return (true);
5003}
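/*
 * Worked example: a 5 byte OOB region at offset 10 (covering [10, 15))
 * checked against a data region starting at offset 12 returns true
 * (overlap), while the same region checked against data starting at
 * offset 15 returns false.
 */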
5004
5005static void
5006hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
5007{
5008	const struct rndis_packet_msg *pkt;
5009	struct hn_rxinfo info;
5010	int data_off, pktinfo_off, data_len, pktinfo_len;
5011
5012	/*
5013	 * Check length.
5014	 */
5015	if (__predict_false(dlen < sizeof(*pkt))) {
5016		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
5017		return;
5018	}
5019	pkt = data;
5020
5021	if (__predict_false(dlen < pkt->rm_len)) {
5022		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
5023		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
5024		return;
5025	}
5026	if (__predict_false(pkt->rm_len <
5027	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
5028		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
5029		    "msglen %u, data %u, oob %u, pktinfo %u\n",
5030		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
5031		    pkt->rm_pktinfolen);
5032		return;
5033	}
5034	if (__predict_false(pkt->rm_datalen == 0)) {
5035		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
5036		return;
5037	}
5038
5039	/*
5040	 * Check offsets.
5041	 */
5042#define IS_OFFSET_INVALID(ofs)			\
5043	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
5044	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
5045
5046	/* XXX Hyper-V does not meet data offset alignment requirement */
5047	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
5048		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5049		    "data offset %u\n", pkt->rm_dataoffset);
5050		return;
5051	}
5052	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
5053	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
5054		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5055		    "oob offset %u\n", pkt->rm_oobdataoffset);
5056		return;
5057	}
5058	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
5059	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
5060		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5061		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
5062		return;
5063	}
5064
5065#undef IS_OFFSET_INVALID
5066
5067	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
5068	data_len = pkt->rm_datalen;
5069	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
5070	pktinfo_len = pkt->rm_pktinfolen;
5071
5072	/*
5073	 * Check OOB coverage.
5074	 */
5075	if (__predict_false(pkt->rm_oobdatalen != 0)) {
5076		int oob_off, oob_len;
5077
5078		if_printf(rxr->hn_ifp, "got oobdata\n");
5079		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
5080		oob_len = pkt->rm_oobdatalen;
5081
5082		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
5083			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5084			    "oob overflow, msglen %u, oob abs %d len %d\n",
5085			    pkt->rm_len, oob_off, oob_len);
5086			return;
5087		}
5088
5089		/*
5090		 * Check against data.
5091		 */
5092		if (hn_rndis_check_overlap(oob_off, oob_len,
5093		    data_off, data_len)) {
5094			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5095			    "oob overlaps data, oob abs %d len %d, "
5096			    "data abs %d len %d\n",
5097			    oob_off, oob_len, data_off, data_len);
5098			return;
5099		}
5100
5101		/*
5102		 * Check against pktinfo.
5103		 */
5104		if (pktinfo_len != 0 &&
5105		    hn_rndis_check_overlap(oob_off, oob_len,
5106		    pktinfo_off, pktinfo_len)) {
5107			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5108			    "oob overlaps pktinfo, oob abs %d len %d, "
5109			    "pktinfo abs %d len %d\n",
5110			    oob_off, oob_len, pktinfo_off, pktinfo_len);
5111			return;
5112		}
5113	}
5114
5115	/*
5116	 * Check per-packet-info coverage and find useful per-packet-info.
5117	 */
5118	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
5119	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
5120	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
5121	if (__predict_true(pktinfo_len != 0)) {
5122		bool overlap;
5123		int error;
5124
5125		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
5126			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5127			    "pktinfo overflow, msglen %u, "
5128			    "pktinfo abs %d len %d\n",
5129			    pkt->rm_len, pktinfo_off, pktinfo_len);
5130			return;
5131		}
5132
5133		/*
5134		 * Check packet info coverage.
5135		 */
5136		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
5137		    data_off, data_len);
5138		if (__predict_false(overlap)) {
5139			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5140			    "pktinfo overlaps data, pktinfo abs %d len %d, "
5141			    "data abs %d len %d\n",
5142			    pktinfo_off, pktinfo_len, data_off, data_len);
5143			return;
5144		}
5145
5146		/*
5147		 * Find useful per-packet-info.
5148		 */
5149		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
5150		    pktinfo_len, &info);
5151		if (__predict_false(error)) {
5152			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
5153			    "pktinfo\n");
5154			return;
5155		}
5156	}
5157
5158	if (__predict_false(data_off + data_len > pkt->rm_len)) {
5159		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5160		    "data overflow, msglen %u, data abs %d len %d\n",
5161		    pkt->rm_len, data_off, data_len);
5162		return;
5163	}
5164	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
5165}
5166
5167static __inline void
5168hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
5169{
5170	const struct rndis_msghdr *hdr;
5171
5172	if (__predict_false(dlen < sizeof(*hdr))) {
5173		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
5174		return;
5175	}
5176	hdr = data;
5177
5178	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
5179		/* Hot data path. */
5180		hn_rndis_rx_data(rxr, data, dlen);
5181		/* Done! */
5182		return;
5183	}
5184
5185	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
5186		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
5187	else
5188		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
5189}
5190
5191static void
5192hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
5193{
5194	const struct hn_nvs_hdr *hdr;
5195
5196	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
5197		if_printf(sc->hn_ifp, "invalid nvs notify\n");
5198		return;
5199	}
5200	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
5201
5202	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
5203		/* Useless; ignore */
5204		return;
5205	}
5206	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
5207}
5208
5209static void
5210hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
5211    const struct vmbus_chanpkt_hdr *pkt)
5212{
5213	struct hn_nvs_sendctx *sndc;
5214
5215	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
5216	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
5217	    VMBUS_CHANPKT_DATALEN(pkt));
5218	/*
5219	 * NOTE:
5220	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
5221	 * its callback.
5222	 */
5223}
5224
5225static void
5226hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5227    const struct vmbus_chanpkt_hdr *pkthdr)
5228{
5229	const struct vmbus_chanpkt_rxbuf *pkt;
5230	const struct hn_nvs_hdr *nvs_hdr;
5231	int count, i, hlen;
5232
5233	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
5234		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
5235		return;
5236	}
5237	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
5238
5239	/* Make sure that this is a RNDIS message. */
5240	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
5241		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
5242		    nvs_hdr->nvs_type);
5243		return;
5244	}
5245
5246	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
5247	if (__predict_false(hlen < sizeof(*pkt))) {
5248		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
5249		return;
5250	}
5251	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
5252
5253	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
5254		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
5255		    pkt->cp_rxbuf_id);
5256		return;
5257	}
5258
5259	count = pkt->cp_rxbuf_cnt;
5260	if (__predict_false(hlen <
5261	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
5262		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
5263		return;
5264	}
5265
5266	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
5267	for (i = 0; i < count; ++i) {
5268		int ofs, len;
5269
5270		ofs = pkt->cp_rxbuf[i].rb_ofs;
5271		len = pkt->cp_rxbuf[i].rb_len;
5272		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
5273			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
5274			    "ofs %d, len %d\n", i, ofs, len);
5275			continue;
5276		}
5277		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
5278	}
5279
5280	/*
5281	 * Ack the consumed RXBUF associated w/ this channel packet,
5282	 * so that this RXBUF can be recycled by the hypervisor.
5283	 */
5284	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
5285}
5286
5287static void
5288hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5289    uint64_t tid)
5290{
5291	struct hn_nvs_rndis_ack ack;
5292	int retries, error;
5293
5294	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
5295	ack.nvs_status = HN_NVS_STATUS_OK;
5296
5297	retries = 0;
5298again:
5299	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
5300	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
5301	if (__predict_false(error == EAGAIN)) {
5302		/*
5303		 * NOTE:
5304		 * This should _not_ happen in real world, since the
5305		 * This should _not_ happen in the real world, since the
5306		 * controlled.
5307		 */
5308		if (rxr->hn_ack_failed == 0)
5309			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
5310		rxr->hn_ack_failed++;
5311		retries++;
5312		if (retries < 10) {
5313			DELAY(100);
5314			goto again;
5315		}
5316		/* RXBUF leaks! */
5317		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
5318	}
5319}
5320
5321static void
5322hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
5323{
5324	struct hn_rx_ring *rxr = xrxr;
5325	struct hn_softc *sc = rxr->hn_ifp->if_softc;
5326
5327	for (;;) {
5328		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
5329		int error, pktlen;
5330
5331		pktlen = rxr->hn_pktbuf_len;
5332		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
5333		if (__predict_false(error == ENOBUFS)) {
5334			void *nbuf;
5335			int nlen;
5336
5337			/*
5338			 * Expand channel packet buffer.
5339			 *
5340			 * XXX
5341			 * Use M_WAITOK here, since allocation failure
5342			 * is fatal.
5343			 */
5344			nlen = rxr->hn_pktbuf_len * 2;
5345			while (nlen < pktlen)
5346				nlen *= 2;
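			/*
			 * E.g. a 4KB pktbuf faced with a 9KB channel packet
			 * is doubled to 8KB and then to 16KB before retrying.
			 */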
5347			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
5348
5349			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
5350			    rxr->hn_pktbuf_len, nlen);
5351
5352			free(rxr->hn_pktbuf, M_DEVBUF);
5353			rxr->hn_pktbuf = nbuf;
5354			rxr->hn_pktbuf_len = nlen;
5355			/* Retry! */
5356			continue;
5357		} else if (__predict_false(error == EAGAIN)) {
5358			/* No more channel packets; done! */
5359			break;
5360		}
5361		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
5362
5363		switch (pkt->cph_type) {
5364		case VMBUS_CHANPKT_TYPE_COMP:
5365			hn_nvs_handle_comp(sc, chan, pkt);
5366			break;
5367
5368		case VMBUS_CHANPKT_TYPE_RXBUF:
5369			hn_nvs_handle_rxbuf(rxr, chan, pkt);
5370			break;
5371
5372		case VMBUS_CHANPKT_TYPE_INBAND:
5373			hn_nvs_handle_notify(sc, pkt);
5374			break;
5375
5376		default:
5377			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
5378			    pkt->cph_type);
5379			break;
5380		}
5381	}
5382	hn_chan_rollup(rxr, rxr->hn_txr);
5383}
5384
5385static void
5386hn_tx_taskq_create(void *arg __unused)
5387{
5388	int i;
5389
5390	/*
5391	 * Fix the # of TX taskqueues.
5392	 */
5393	if (hn_tx_taskq_cnt <= 0)
5394		hn_tx_taskq_cnt = 1;
5395	else if (hn_tx_taskq_cnt > mp_ncpus)
5396		hn_tx_taskq_cnt = mp_ncpus;
5397
5398	/*
5399	 * Fix the TX taskqueue mode.
5400	 */
5401	switch (hn_tx_taskq_mode) {
5402	case HN_TX_TASKQ_M_INDEP:
5403	case HN_TX_TASKQ_M_GLOBAL:
5404	case HN_TX_TASKQ_M_EVTTQ:
5405		break;
5406	default:
5407		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
5408		break;
5409	}
5410
5411	if (vm_guest != VM_GUEST_HV)
5412		return;
5413
5414	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
5415		return;
5416
5417	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
5418	    M_DEVBUF, M_WAITOK);
5419	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
5420		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
5421		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
5422		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
5423		    "hn tx%d", i);
5424	}
5425}
5426SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
5427    hn_tx_taskq_create, NULL);
5428
5429static void
5430hn_tx_taskq_destroy(void *arg __unused)
5431{
5432
5433	if (hn_tx_taskque != NULL) {
5434		int i;
5435
5436		for (i = 0; i < hn_tx_taskq_cnt; ++i)
5437			taskqueue_free(hn_tx_taskque[i]);
5438		free(hn_tx_taskque, M_DEVBUF);
5439	}
5440}
5441SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
5442    hn_tx_taskq_destroy, NULL);
5443