if_hn.c revision 310799
/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/dev/hyperv/netvsc/if_hn.c 310799 2016-12-30 01:59:19Z sephe $");

#include "opt_inet6.h"
#include "opt_inet.h"
#include "opt_hn.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_vlan_var.h>
#include <net/rndis.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"

#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
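/*
 * Example: with the standard ETHERMTU of 1500 bytes, the default LRO
 * aggregation limit above is 25 * 1500 = 37500 bytes per merged segment,
 * reduced to 12 * 1500 = 18000 bytes once multiple RX rings are in use.
 */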
/* YYY 2 * MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1

#define HN_LOCK_INIT(sc)		\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
		DELAY(1000);				\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)

struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004

struct hn_rxinfo {
	uint32_t			vlan_info;
	uint32_t			csum_info;
	uint32_t			hash_info;
	uint32_t			hash_value;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0

static int			hn_probe(device_t);
static int			hn_attach(device_t);
static int			hn_detach(device_t);
static int			hn_shutdown(device_t);
static void			hn_chan_callback(struct vmbus_channel *,
				    void *);

static void			hn_init(void *);
static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void			hn_start(struct ifnet *);
#endif
static int			hn_transmit(struct ifnet *, struct mbuf *);
static void			hn_xmit_qflush(struct ifnet *);
static int			hn_ifmedia_upd(struct ifnet *);
static void			hn_ifmedia_sts(struct ifnet *,
				    struct ifmediareq *);

static int			hn_rndis_rxinfo(const void *, int,
				    struct hn_rxinfo *);
static void			hn_rndis_rx_data(struct hn_rx_ring *,
				    const void *, int);
static void			hn_rndis_rx_status(struct hn_softc *,
				    const void *, int);

static void			hn_nvs_handle_notify(struct hn_softc *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_comp(struct hn_softc *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);

static void			hn_stop(struct hn_softc *);
static void			hn_init_locked(struct hn_softc *);
static int			hn_chan_attach(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_chan_detach(struct hn_softc *,
				    struct vmbus_channel *);
static int			hn_attach_subchans(struct hn_softc *);
static void			hn_detach_allchans(struct hn_softc *);
static void			hn_chan_rollup(struct hn_rx_ring *,
				    struct hn_tx_ring *);
static void			hn_set_ring_inuse(struct hn_softc *, int);
static int			hn_synth_attach(struct hn_softc *, int);
static void			hn_synth_detach(struct hn_softc *);
static int			hn_synth_alloc_subchans(struct hn_softc *,
				    int *);
static bool			hn_synth_attachable(const struct hn_softc *);
static void			hn_suspend(struct hn_softc *);
static void			hn_suspend_data(struct hn_softc *);
static void			hn_suspend_mgmt(struct hn_softc *);
static void			hn_resume(struct hn_softc *);
static void			hn_resume_data(struct hn_softc *);
static void			hn_resume_mgmt(struct hn_softc *);
static void			hn_suspend_mgmt_taskfunc(void *, int);
static void			hn_chan_drain(struct hn_softc *,
				    struct vmbus_channel *);

static void			hn_update_link_status(struct hn_softc *);
static void			hn_change_network(struct hn_softc *);
static void			hn_link_taskfunc(void *, int);
static void			hn_netchg_init_taskfunc(void *, int);
static void			hn_netchg_status_taskfunc(void *, int);
static void			hn_link_status(struct hn_softc *);

static int			hn_create_rx_data(struct hn_softc *, int);
static void			hn_destroy_rx_data(struct hn_softc *);
static int			hn_check_iplen(const struct mbuf *, int);
static int			hn_set_rxfilter(struct hn_softc *);
static int			hn_rss_reconfig(struct hn_softc *);
static void			hn_rss_ind_fixup(struct hn_softc *);
static int			hn_rxpkt(struct hn_rx_ring *, const void *,
				    int, const struct hn_rxinfo *);

static int			hn_tx_ring_create(struct hn_softc *, int);
static void			hn_tx_ring_destroy(struct hn_tx_ring *);
static int			hn_create_tx_data(struct hn_softc *, int);
static void			hn_fixup_tx_data(struct hn_softc *);
static void			hn_destroy_tx_data(struct hn_softc *);
static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void			hn_txdesc_gc(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *, struct mbuf **);
static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *);
static void			hn_set_chim_size(struct hn_softc *, int);
static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool			hn_tx_ring_pending(struct hn_tx_ring *);
static void			hn_tx_ring_qflush(struct hn_tx_ring *);
static void			hn_resume_tx(struct hn_softc *, int);
static void			hn_set_txagg(struct hn_softc *);
static void			*hn_try_txagg(struct ifnet *,
				    struct hn_tx_ring *, struct hn_txdesc *,
				    int);
static int			hn_get_txswq_depth(const struct hn_tx_ring *);
static void			hn_txpkt_done(struct hn_nvs_sendctx *,
				    struct hn_softc *, struct vmbus_channel *,
				    const void *, int);
static int			hn_txpkt_sglist(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_txpkt_chim(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_xmit(struct hn_tx_ring *, int);
static void			hn_xmit_taskfunc(void *, int);
static void			hn_xmit_txeof(struct hn_tx_ring *);
static void			hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int			hn_start_locked(struct hn_tx_ring *, int);
static void			hn_start_taskfunc(void *, int);
static void			hn_start_txeof(struct hn_tx_ring *);
static void			hn_start_txeof_taskfunc(void *, int);
#endif

SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust TCP segment verification on the host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust UDP datagram verification on the host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");
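/*
 * These CTLFLAG_RDTUN knobs are loader tunables; for example, a line
 * such as hw.hn.trust_hosttcp="0" in /boot/loader.conf disables trusting
 * the host's TCP segment verification when checksum info is missing.
 */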
/* Trust IP packet verification on the host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packets for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
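/*
 * Channel-to-CPU assignment is round-robin across devices: each device
 * claims ring_cnt consecutive CPU slots from hn_cpu_index below (see
 * hn_attach()), and ring idx then runs on CPU (hn_cpu + idx) % mp_ncpus
 * per HN_RING_IDX2CPU.  For example, on an 8-CPU guest with 4 rings per
 * device, the first device uses CPUs 0-3 and the second uses CPUs 4-7.
 */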
static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue **hn_tx_taskque;	/* shared TX taskqueues */

static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}

static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}

static __inline void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}
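/*
 * The chimney buffer index allocator above is a lock-free bitmap: a
 * cleared bit marks a free slot.  hn_chim_alloc() scans each word with
 * ffsl(~word) and claims the slot with atomic_testandset_long(); if a
 * racing thread wins the bit first, the scan moves on to the next word.
 * For example, on LP64 (LONG_BIT == 64) chimney index 67 maps to word
 * 67 / 64 = 1, bit 67 % 64 = 3, which hn_chim_free() clears atomically.
 */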
#if defined(INET6) || defined(INET)
/*
 * NOTE: If this function fails, m_head will have been freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);

#undef PULLUP_HDR
}
#endif	/* INET6 || INET */

static int
hn_set_rxfilter(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (ifp->if_flags & IFF_PROMISC) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}

static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}
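	/*
	 * At this point both limits reflect the tightest of the
	 * administrator's tunables and the host's RNDIS offer; e.g. with
	 * hn_agg_size == -1 and a hypothetical host offer of 8 packets
	 * over a 6KB chimney buffer, aggregation would run with a 6KB
	 * size limit and 8 packets per chimney send.
	 */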
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}

static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}

static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}
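/*
 * Example of the fixup below: if the indirect table was built for 4
 * channels but only 2 could be allocated, any entry referencing ring 2
 * or 3 is rewritten to the last usable ring, i.e. ring 1.
 */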
static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check the indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}

static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}

/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
static const struct hyperv_guid g_net_vsc_device_type = {
	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
	    0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
};

static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
	    &g_net_vsc_device_type) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return BUS_PROBE_DEFAULT;
	}
	return ENXIO;
}

static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);

	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
		int i;

		sc->hn_tx_taskqs =
		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
		    M_DEVBUF, M_WAITOK);
		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
			    M_WAITOK, taskqueue_thread_enqueue,
			    &sc->hn_tx_taskqs[i]);
			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
			    "%s tx%d", device_get_nameunit(dev), i);
		}
	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
		sc->hn_tx_taskqs = hn_tx_taskque;
	}

	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);

	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions, which will be called after
	 * ether_ifattach().
	 */
	ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed, if an error happens later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * NOTE:
	 * The # of RX rings to use is the same as the # of channels to use.
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		/* Default */
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}

	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif

	/*
	 * Set the leader CPU for channels.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

	/*
	 * Create enough TX/RX rings, even if only a limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;

	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Install orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;

#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif
	/*
	 * Fix up TX stuffs after the synthetic parts are attached.
	 */
	hn_fixup_tx_data(sc);

	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");

	/*
	 * Setup the ifmedia, which has been initialized earlier.
	 */
	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
	/* XXX ifmedia_set really should do this for us */
	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;

	/*
	 * Setup the ifnet for this interface.
	 */

#ifdef __LP64__
	ifp->if_baudrate = IF_Gbps(10);
#else
	/* if_baudrate is 32 bits on a 32-bit system. */
	ifp->if_baudrate = IF_Gbps(1);
#endif
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = hn_ioctl;
	ifp->if_init = hn_init;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

		ifp->if_start = hn_start;
		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
		IFQ_SET_READY(&ifp->if_snd);
	} else
#endif
	{
		ifp->if_transmit = hn_transmit;
		ifp->if_qflush = hn_xmit_qflush;
	}

	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
#ifdef foo
	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
#endif
	if (sc->hn_caps & HN_CAP_VLAN) {
		/* XXX not sure about VLAN_MTU. */
		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
	}

	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM;
	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
	if (sc->hn_caps & HN_CAP_TSO4) {
		ifp->if_capabilities |= IFCAP_TSO4;
		ifp->if_hwassist |= CSUM_IP_TSO;
	}
	if (sc->hn_caps & HN_CAP_TSO6) {
		ifp->if_capabilities |= IFCAP_TSO6;
		ifp->if_hwassist |= CSUM_IP6_TSO;
	}

	/* Enable all available capabilities by default. */
	ifp->if_capenable = ifp->if_capabilities;

	/*
	 * Disable IPv6 TSO and TXCSUM by default, they still can
	 * be enabled through SIOCSIFCAP.
	 */
	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);

	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
	}

	ether_ifattach(ifp, eaddr);

	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
		if_printf(ifp, "TSO segcnt %u segsz %u\n",
		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
	}

	/* Inform the upper layer about the long frame support. */
	ifp->if_hdrlen = sizeof(struct ether_vlan_header);

	/*
	 * Kick off the link status check.
	 */
	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
	hn_update_link_status(sc);

	return (0);
failed:
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
		hn_synth_detach(sc);
	hn_detach(dev);
	return (error);
}

static int
hn_detach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct ifnet *ifp = sc->hn_ifp;

	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
		/*
		 * In case the vmbus missed the orphan handler
		 * installation.
		 */
		vmbus_xact_ctx_orphan(sc->hn_xact);
	}

	if (device_is_attached(dev)) {
		HN_LOCK(sc);
		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
				hn_stop(sc);
			/*
			 * NOTE:
			 * hn_stop() only suspends data, so the management
			 * stuffs have to be suspended manually here.
			 */
1222 */ 1223 hn_suspend_mgmt(sc); 1224 hn_synth_detach(sc); 1225 } 1226 HN_UNLOCK(sc); 1227 ether_ifdetach(ifp); 1228 } 1229 1230 ifmedia_removeall(&sc->hn_media); 1231 hn_destroy_rx_data(sc); 1232 hn_destroy_tx_data(sc); 1233 1234 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { 1235 int i; 1236 1237 for (i = 0; i < hn_tx_taskq_cnt; ++i) 1238 taskqueue_free(sc->hn_tx_taskqs[i]); 1239 free(sc->hn_tx_taskqs, M_DEVBUF); 1240 } 1241 taskqueue_free(sc->hn_mgmt_taskq0); 1242 1243 if (sc->hn_xact != NULL) { 1244 /* 1245 * Uninstall the orphan handler _before_ the xact is 1246 * destructed. 1247 */ 1248 vmbus_chan_unset_orphan(sc->hn_prichan); 1249 vmbus_xact_ctx_destroy(sc->hn_xact); 1250 } 1251 1252 if_free(ifp); 1253 1254 HN_LOCK_DESTROY(sc); 1255 return (0); 1256} 1257 1258static int 1259hn_shutdown(device_t dev) 1260{ 1261 1262 return (0); 1263} 1264 1265static void 1266hn_link_status(struct hn_softc *sc) 1267{ 1268 uint32_t link_status; 1269 int error; 1270 1271 error = hn_rndis_get_linkstatus(sc, &link_status); 1272 if (error) { 1273 /* XXX what to do? */ 1274 return; 1275 } 1276 1277 if (link_status == NDIS_MEDIA_STATE_CONNECTED) 1278 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; 1279 else 1280 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 1281 if_link_state_change(sc->hn_ifp, 1282 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? 1283 LINK_STATE_UP : LINK_STATE_DOWN); 1284} 1285 1286static void 1287hn_link_taskfunc(void *xsc, int pending __unused) 1288{ 1289 struct hn_softc *sc = xsc; 1290 1291 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 1292 return; 1293 hn_link_status(sc); 1294} 1295 1296static void 1297hn_netchg_init_taskfunc(void *xsc, int pending __unused) 1298{ 1299 struct hn_softc *sc = xsc; 1300 1301 /* Prevent any link status checks from running. */ 1302 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; 1303 1304 /* 1305 * Fake up a [link down --> link up] state change; 5 seconds 1306 * delay is used, which closely simulates miibus reaction 1307 * upon link down event. 1308 */ 1309 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 1310 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); 1311 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, 1312 &sc->hn_netchg_status, 5 * hz); 1313} 1314 1315static void 1316hn_netchg_status_taskfunc(void *xsc, int pending __unused) 1317{ 1318 struct hn_softc *sc = xsc; 1319 1320 /* Re-allow link status checks. 
	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
	hn_link_status(sc);
}

static void
hn_update_link_status(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
}

static void
hn_change_network(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
}

static __inline int
hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
	struct mbuf *m = *m_head;
	int error;

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));

	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
	    m, segs, nsegs, BUS_DMA_NOWAIT);
	if (error == EFBIG) {
		struct mbuf *m_new;

		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
		if (m_new == NULL)
			return ENOBUFS;
		else
			*m_head = m = m_new;
		txr->hn_tx_collapsed++;

		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
	}
	if (!error) {
		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
		    BUS_DMASYNC_PREWRITE);
		txd->flags |= HN_TXD_FLAG_DMAMAP;
	}
	return error;
}

static __inline int
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
	    ("put an onlist txd %#x", txd->flags));
	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("put an onagg txd %#x", txd->flags));

	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
		return 0;

	if (!STAILQ_EMPTY(&txd->agg_list)) {
		struct hn_txdesc *tmp_txd;

		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
			int freed;

			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
			    ("recursive aggregation on aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
			    ("not aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
			    ("aggregated txdesc uses dmamap"));
			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
			    ("aggregated txdesc consumes "
			     "chimney sending buffer"));
			KASSERT(tmp_txd->chim_size == 0,
			    ("aggregated txdesc has non-zero "
			     "chimney sending size"));

			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
			freed = hn_txdesc_put(txr, tmp_txd);
			KASSERT(freed, ("failed to free aggregated txdesc"));
		}
	}

	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
		    ("chim txd uses dmamap"));
		hn_chim_free(txr->hn_sc, txd->chim_index);
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		txd->chim_size = 0;
	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
		bus_dmamap_sync(txr->hn_tx_data_dtag,
		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
		bus_dmamap_unload(txr->hn_tx_data_dtag,
		    txd->data_dmap);
		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
	}

	if (txd->m != NULL) {
		m_freem(txd->m);
		txd->m = NULL;
	}

	txd->flags |= HN_TXD_FLAG_ONLIST;
("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); 1437 txr->hn_txdesc_avail++; 1438 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 1439 mtx_unlock_spin(&txr->hn_txlist_spin); 1440#else /* HN_USE_TXDESC_BUFRING */ 1441#ifdef HN_DEBUG 1442 atomic_add_int(&txr->hn_txdesc_avail, 1); 1443#endif 1444 buf_ring_enqueue(txr->hn_txdesc_br, txd); 1445#endif /* !HN_USE_TXDESC_BUFRING */ 1446 1447 return 1; 1448} 1449 1450static __inline struct hn_txdesc * 1451hn_txdesc_get(struct hn_tx_ring *txr) 1452{ 1453 struct hn_txdesc *txd; 1454 1455#ifndef HN_USE_TXDESC_BUFRING 1456 mtx_lock_spin(&txr->hn_txlist_spin); 1457 txd = SLIST_FIRST(&txr->hn_txlist); 1458 if (txd != NULL) { 1459 KASSERT(txr->hn_txdesc_avail > 0, 1460 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); 1461 txr->hn_txdesc_avail--; 1462 SLIST_REMOVE_HEAD(&txr->hn_txlist, link); 1463 } 1464 mtx_unlock_spin(&txr->hn_txlist_spin); 1465#else 1466 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); 1467#endif 1468 1469 if (txd != NULL) { 1470#ifdef HN_USE_TXDESC_BUFRING 1471#ifdef HN_DEBUG 1472 atomic_subtract_int(&txr->hn_txdesc_avail, 1); 1473#endif 1474#endif /* HN_USE_TXDESC_BUFRING */ 1475 KASSERT(txd->m == NULL && txd->refs == 0 && 1476 STAILQ_EMPTY(&txd->agg_list) && 1477 txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 1478 txd->chim_size == 0 && 1479 (txd->flags & HN_TXD_FLAG_ONLIST) && 1480 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && 1481 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); 1482 txd->flags &= ~HN_TXD_FLAG_ONLIST; 1483 txd->refs = 1; 1484 } 1485 return txd; 1486} 1487 1488static __inline void 1489hn_txdesc_hold(struct hn_txdesc *txd) 1490{ 1491 1492 /* 0->1 transition will never work */ 1493 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 1494 atomic_add_int(&txd->refs, 1); 1495} 1496 1497static __inline void 1498hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) 1499{ 1500 1501 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, 1502 ("recursive aggregation on aggregating txdesc")); 1503 1504 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 1505 ("already aggregated")); 1506 KASSERT(STAILQ_EMPTY(&txd->agg_list), 1507 ("recursive aggregation on to-be-aggregated txdesc")); 1508 1509 txd->flags |= HN_TXD_FLAG_ONAGG; 1510 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); 1511} 1512 1513static bool 1514hn_tx_ring_pending(struct hn_tx_ring *txr) 1515{ 1516 bool pending = false; 1517 1518#ifndef HN_USE_TXDESC_BUFRING 1519 mtx_lock_spin(&txr->hn_txlist_spin); 1520 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) 1521 pending = true; 1522 mtx_unlock_spin(&txr->hn_txlist_spin); 1523#else 1524 if (!buf_ring_full(txr->hn_txdesc_br)) 1525 pending = true; 1526#endif 1527 return (pending); 1528} 1529 1530static __inline void 1531hn_txeof(struct hn_tx_ring *txr) 1532{ 1533 txr->hn_has_txeof = 0; 1534 txr->hn_txeof(txr); 1535} 1536 1537static void 1538hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, 1539 struct vmbus_channel *chan, const void *data __unused, int dlen __unused) 1540{ 1541 struct hn_txdesc *txd = sndc->hn_cbarg; 1542 struct hn_tx_ring *txr; 1543 1544 txr = txd->txr; 1545 KASSERT(txr->hn_chan == chan, 1546 ("channel mismatch, on chan%u, should be chan%u", 1547 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); 1548 1549 txr->hn_has_txeof = 1; 1550 hn_txdesc_put(txr, txd); 1551 1552 ++txr->hn_txdone_cnt; 1553 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { 1554 txr->hn_txdone_cnt = 0; 1555 if (txr->hn_oactive) 1556 hn_txeof(txr); 1557 } 1558} 1559 1560static void 
static void
hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
{
#if defined(INET) || defined(INET6)
	struct lro_ctrl *lro = &rxr->hn_lro;
	struct lro_entry *queued;

	while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
		SLIST_REMOVE_HEAD(&lro->lro_active, next);
		tcp_lro_flush(lro, queued);
	}
#endif

	/*
	 * NOTE:
	 * 'txr' could be NULL, if multiple channels and
	 * the ifnet.if_start method are enabled.
	 */
	if (txr == NULL || !txr->hn_has_txeof)
		return;

	txr->hn_txdone_cnt = 0;
	hn_txeof(txr);
}

static __inline uint32_t
hn_rndis_pktmsg_offset(uint32_t ofs)
{

	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
	    ("invalid RNDIS packet msg offset %u", ofs));
	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
}

static __inline void *
hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
    size_t pi_dlen, uint32_t pi_type)
{
	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
	struct rndis_pktinfo *pi;

	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));

	/*
	 * Per-packet-info does not move; it only grows.
	 *
	 * NOTE:
	 * rm_pktinfooffset in this phase counts from the beginning
	 * of rndis_packet_msg.
	 */
	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
	    pkt->rm_pktinfolen);
	pkt->rm_pktinfolen += pi_size;

	pi->rm_size = pi_size;
	pi->rm_type = pi_type;
	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;

	/* Data immediately follows the per-packet-info. */
	pkt->rm_dataoffset += pi_size;

	/* Update the RNDIS packet msg length */
	pkt->rm_len += pi_size;

	return (pi->rm_data);
}
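/*
 * Layout sketch of an RNDIS data message as built by hn_encap() below:
 *
 *   +--------------------------+
 *   | struct rndis_packet_msg  |
 *   +--------------------------+ <- rm_pktinfooffset (counts from the
 *   | pktinfo 0 | pktinfo 1 ...|    start of the message while building)
 *   +--------------------------+ <- rm_dataoffset
 *   | packet data              |
 *   +--------------------------+
 *
 * Just before sending, hn_rndis_pktmsg_offset() rebases rm_dataoffset
 * and rm_pktinfooffset so that they count from the rm_dataoffset field
 * instead, which is what the host expects.
 */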
static __inline int
hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;
	struct mbuf *m;
	int error, pkts;

	txd = txr->hn_agg_txd;
	KASSERT(txd != NULL, ("no aggregate txdesc"));

	/*
	 * Since hn_txpkt() will reset this temporary stat, save
	 * it now, so that oerrors can be updated properly, if
	 * hn_txpkt() ever fails.
	 */
	pkts = txr->hn_stat_pkts;

	/*
	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
	 * failure, save it for later freeing, if hn_txpkt() ever
	 * fails.
	 */
	m = txd->m;
	error = hn_txpkt(ifp, txr, txd);
	if (__predict_false(error)) {
		/* txd is freed, but m is not. */
		m_freem(m);

		txr->hn_flush_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
	}

	/* Reset all aggregation states. */
	txr->hn_agg_txd = NULL;
	txr->hn_agg_szleft = 0;
	txr->hn_agg_pktleft = 0;
	txr->hn_agg_prevpkt = NULL;

	return (error);
}

static void *
hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    int pktsize)
{
	void *chim;

	if (txr->hn_agg_txd != NULL) {
		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
			int olen;

			/*
			 * Update the previous RNDIS packet's total length,
			 * it can be increased due to the mandatory alignment
			 * padding for this RNDIS packet.  And update the
			 * aggregating txdesc's chimney sending buffer size
			 * accordingly.
			 *
			 * XXX
			 * Zero-out the padding, as required by the RNDIS spec.
			 */
			olen = pkt->rm_len;
			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
			agg_txd->chim_size += pkt->rm_len - olen;

			/* Link this txdesc to the parent. */
			hn_txdesc_agg(agg_txd, txd);

			chim = (uint8_t *)pkt + pkt->rm_len;
			/* Save the current packet for later fixup. */
			txr->hn_agg_prevpkt = chim;

			txr->hn_agg_pktleft--;
			txr->hn_agg_szleft -= pktsize;
			if (txr->hn_agg_szleft <=
			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
				/*
				 * Probably can't aggregate more packets,
				 * flush this aggregating txdesc proactively.
				 */
				txr->hn_agg_pktleft = 0;
			}
			/* Done! */
			return (chim);
		}
		hn_flush_txagg(ifp, txr);
	}
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	txr->hn_tx_chimney_tried++;
	txd->chim_index = hn_chim_alloc(txr->hn_sc);
	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
		return (NULL);
	txr->hn_tx_chimney++;

	chim = txr->hn_sc->hn_chim +
	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);

	if (txr->hn_agg_pktmax > 1 &&
	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
		txr->hn_agg_txd = txd;
		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
		txr->hn_agg_prevpkt = chim;
	}
	return (chim);
}
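/*
 * Aggregation flow, in short: the first packet that fits a chimney slot
 * becomes the aggregating txdesc, and later packets are appended into
 * the same chimney buffer right behind it, until either hn_agg_pktleft
 * or hn_agg_szleft runs out, at which point hn_flush_txagg() sends the
 * whole batch as a single chimney send.
 */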
/*
 * NOTE:
 * If this function fails, then both txd and m_head0 will be freed.
 */
static int
hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head0)
{
	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
	int error, nsegs, i;
	struct mbuf *m_head = *m_head0;
	struct rndis_packet_msg *pkt;
	uint32_t *pi_data;
	void *chim = NULL;
	int pkt_hlen, pkt_size;

	pkt = txd->rndis_pkt;
	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
	if (pkt_size < txr->hn_chim_size) {
		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
		if (chim != NULL)
			pkt = chim;
	} else {
		if (txr->hn_agg_txd != NULL)
			hn_flush_txagg(ifp, txr);
	}

	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
	pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
	pkt->rm_dataoffset = sizeof(*pkt);
	pkt->rm_datalen = m_head->m_pkthdr.len;
	pkt->rm_oobdataoffset = 0;
	pkt->rm_oobdatalen = 0;
	pkt->rm_oobdataelements = 0;
	pkt->rm_pktinfooffset = sizeof(*pkt);
	pkt->rm_pktinfolen = 0;
	pkt->rm_vchandle = 0;
	pkt->rm_reserved = 0;

	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
		/*
		 * Set the hash value for this packet, so that the host could
		 * dispatch the TX done event for this packet back to this TX
		 * ring's channel.
		 */
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
		*pi_data = txr->hn_tx_idx;
	}

	if (m_head->m_flags & M_VLANTAG) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
		*pi_data = NDIS_VLAN_INFO_MAKE(
		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
	}

	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
#if defined(INET6) || defined(INET)
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
#ifdef INET
		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#if defined(INET6) && defined(INET)
		else
#endif
#ifdef INET6
		{
			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#endif	/* INET6 || INET */
	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
		if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
			*pi_data = NDIS_TXCSUM_INFO_IPV6;
		} else {
			*pi_data = NDIS_TXCSUM_INFO_IPV4;
			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
		}

		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
		else if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP_UDP | CSUM_IP6_UDP))
			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
	}

	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
	/* Convert RNDIS packet message offsets */
	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);

	/*
	 * Fast path: Chimney sending.
	 */
1845 */ 1846 if (chim != NULL) { 1847 struct hn_txdesc *tgt_txd = txd; 1848 1849 if (txr->hn_agg_txd != NULL) { 1850 tgt_txd = txr->hn_agg_txd; 1851#ifdef INVARIANTS 1852 *m_head0 = NULL; 1853#endif 1854 } 1855 1856 KASSERT(pkt == chim, 1857 ("RNDIS pkt not in chimney sending buffer")); 1858 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, 1859 ("chimney sending buffer is not used")); 1860 tgt_txd->chim_size += pkt->rm_len; 1861 1862 m_copydata(m_head, 0, m_head->m_pkthdr.len, 1863 ((uint8_t *)chim) + pkt_hlen); 1864 1865 txr->hn_gpa_cnt = 0; 1866 txr->hn_sendpkt = hn_txpkt_chim; 1867 goto done; 1868 } 1869 1870 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); 1871 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 1872 ("chimney buffer is used")); 1873 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); 1874 1875 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); 1876 if (__predict_false(error)) { 1877 int freed; 1878 1879 /* 1880 * This mbuf is not linked w/ the txd yet, so free it now. 1881 */ 1882 m_freem(m_head); 1883 *m_head0 = NULL; 1884 1885 freed = hn_txdesc_put(txr, txd); 1886 KASSERT(freed != 0, 1887 ("fail to free txd upon txdma error")); 1888 1889 txr->hn_txdma_failed++; 1890 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 1891 return error; 1892 } 1893 *m_head0 = m_head; 1894 1895 /* +1 RNDIS packet message */ 1896 txr->hn_gpa_cnt = nsegs + 1; 1897 1898 /* send packet with page buffer */ 1899 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 1900 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 1901 txr->hn_gpa[0].gpa_len = pkt_hlen; 1902 1903 /* 1904 * Fill the page buffers with mbuf info after the page 1905 * buffer for RNDIS packet message. 1906 */ 1907 for (i = 0; i < nsegs; ++i) { 1908 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 1909 1910 gpa->gpa_page = atop(segs[i].ds_addr); 1911 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 1912 gpa->gpa_len = segs[i].ds_len; 1913 } 1914 1915 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 1916 txd->chim_size = 0; 1917 txr->hn_sendpkt = hn_txpkt_sglist; 1918done: 1919 txd->m = m_head; 1920 1921 /* Set the completion routine */ 1922 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 1923 1924 /* Update temporary stats for later use. */ 1925 txr->hn_stat_pkts++; 1926 txr->hn_stat_size += m_head->m_pkthdr.len; 1927 if (m_head->m_flags & M_MCAST) 1928 txr->hn_stat_mcasts++; 1929 1930 return 0; 1931} 1932 1933/* 1934 * NOTE: 1935 * If this function fails, then txd will be freed, but the mbuf 1936 * associated w/ the txd will _not_ be freed. 1937 */ 1938static int 1939hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 1940{ 1941 int error, send_failed = 0, has_bpf; 1942 1943again: 1944 has_bpf = bpf_peers_present(ifp->if_bpf); 1945 if (has_bpf) { 1946 /* 1947 * Make sure that this txd and any aggregated txds are not 1948 * freed before ETHER_BPF_MTAP. 
1949 */ 1950 hn_txdesc_hold(txd); 1951 } 1952 error = txr->hn_sendpkt(txr, txd); 1953 if (!error) { 1954 if (has_bpf) { 1955 const struct hn_txdesc *tmp_txd; 1956 1957 ETHER_BPF_MTAP(ifp, txd->m); 1958 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 1959 ETHER_BPF_MTAP(ifp, tmp_txd->m); 1960 } 1961 1962 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 1963#ifdef HN_IFSTART_SUPPORT 1964 if (!hn_use_if_start) 1965#endif 1966 { 1967 if_inc_counter(ifp, IFCOUNTER_OBYTES, 1968 txr->hn_stat_size); 1969 if (txr->hn_stat_mcasts != 0) { 1970 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1971 txr->hn_stat_mcasts); 1972 } 1973 } 1974 txr->hn_pkts += txr->hn_stat_pkts; 1975 txr->hn_sends++; 1976 } 1977 if (has_bpf) 1978 hn_txdesc_put(txr, txd); 1979 1980 if (__predict_false(error)) { 1981 int freed; 1982 1983 /* 1984 * This should "really rarely" happen. 1985 * 1986 * XXX Too many RX to be acked or too many sideband 1987 * commands to run? Ask netvsc_channel_rollup() 1988 * to kick start later. 1989 */ 1990 txr->hn_has_txeof = 1; 1991 if (!send_failed) { 1992 txr->hn_send_failed++; 1993 send_failed = 1; 1994 /* 1995 * Try sending again after set hn_has_txeof; 1996 * in case that we missed the last 1997 * netvsc_channel_rollup(). 1998 */ 1999 goto again; 2000 } 2001 if_printf(ifp, "send failed\n"); 2002 2003 /* 2004 * Caller will perform further processing on the 2005 * associated mbuf, so don't free it in hn_txdesc_put(); 2006 * only unload it from the DMA map in hn_txdesc_put(), 2007 * if it was loaded. 2008 */ 2009 txd->m = NULL; 2010 freed = hn_txdesc_put(txr, txd); 2011 KASSERT(freed != 0, 2012 ("fail to free txd upon send error")); 2013 2014 txr->hn_send_failed++; 2015 } 2016 2017 /* Reset temporary stats, after this sending is done. */ 2018 txr->hn_stat_size = 0; 2019 txr->hn_stat_pkts = 0; 2020 txr->hn_stat_mcasts = 0; 2021 2022 return (error); 2023} 2024 2025/* 2026 * Append the specified data to the indicated mbuf chain, 2027 * Extend the mbuf chain if the new data does not fit in 2028 * existing space. 2029 * 2030 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 2031 * There should be an equivalent in the kernel mbuf code, 2032 * but there does not appear to be one yet. 2033 * 2034 * Differs from m_append() in that additional mbufs are 2035 * allocated with cluster size MJUMPAGESIZE, and filled 2036 * accordingly. 2037 * 2038 * Return 1 if able to complete the job; otherwise 0. 2039 */ 2040static int 2041hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 2042{ 2043 struct mbuf *m, *n; 2044 int remainder, space; 2045 2046 for (m = m0; m->m_next != NULL; m = m->m_next) 2047 ; 2048 remainder = len; 2049 space = M_TRAILINGSPACE(m); 2050 if (space > 0) { 2051 /* 2052 * Copy into available space. 2053 */ 2054 if (space > remainder) 2055 space = remainder; 2056 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 2057 m->m_len += space; 2058 cp += space; 2059 remainder -= space; 2060 } 2061 while (remainder > 0) { 2062 /* 2063 * Allocate a new mbuf; could check space 2064 * and allocate a cluster instead. 
2065 */ 2066 n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE); 2067 if (n == NULL) 2068 break; 2069 n->m_len = min(MJUMPAGESIZE, remainder); 2070 bcopy(cp, mtod(n, caddr_t), n->m_len); 2071 cp += n->m_len; 2072 remainder -= n->m_len; 2073 m->m_next = n; 2074 m = n; 2075 } 2076 if (m0->m_flags & M_PKTHDR) 2077 m0->m_pkthdr.len += len - remainder; 2078 2079 return (remainder == 0); 2080} 2081 2082#if defined(INET) || defined(INET6) 2083static __inline int 2084hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 2085{ 2086#if __FreeBSD_version >= 1100095 2087 if (hn_lro_mbufq_depth) { 2088 tcp_lro_queue_mbuf(lc, m); 2089 return 0; 2090 } 2091#endif 2092 return tcp_lro_rx(lc, m, 0); 2093} 2094#endif 2095 2096static int 2097hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen, 2098 const struct hn_rxinfo *info) 2099{ 2100 struct ifnet *ifp = rxr->hn_ifp; 2101 struct mbuf *m_new; 2102 int size, do_lro = 0, do_csum = 1; 2103 int hash_type = M_HASHTYPE_OPAQUE; 2104 2105 if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) 2106 return (0); 2107 2108 /* 2109 * Bail out if packet contains more data than configured MTU. 2110 */ 2111 if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) { 2112 return (0); 2113 } else if (dlen <= MHLEN) { 2114 m_new = m_gethdr(M_NOWAIT, MT_DATA); 2115 if (m_new == NULL) { 2116 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); 2117 return (0); 2118 } 2119 memcpy(mtod(m_new, void *), data, dlen); 2120 m_new->m_pkthdr.len = m_new->m_len = dlen; 2121 rxr->hn_small_pkts++; 2122 } else { 2123 /* 2124 * Get an mbuf with a cluster. For packets 2K or less, 2125 * get a standard 2K cluster. For anything larger, get a 2126 * 4K cluster. Any buffers larger than 4K can cause problems 2127 * if looped around to the Hyper-V TX channel, so avoid them. 2128 */ 2129 size = MCLBYTES; 2130 if (dlen > MCLBYTES) { 2131 /* 4096 */ 2132 size = MJUMPAGESIZE; 2133 } 2134 2135 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 2136 if (m_new == NULL) { 2137 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); 2138 return (0); 2139 } 2140 2141 hv_m_append(m_new, dlen, data); 2142 } 2143 m_new->m_pkthdr.rcvif = ifp; 2144 2145 if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0)) 2146 do_csum = 0; 2147 2148 /* receive side checksum offload */ 2149 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) { 2150 /* IP csum offload */ 2151 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 2152 m_new->m_pkthdr.csum_flags |= 2153 (CSUM_IP_CHECKED | CSUM_IP_VALID); 2154 rxr->hn_csum_ip++; 2155 } 2156 2157 /* TCP/UDP csum offload */ 2158 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK | 2159 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 2160 m_new->m_pkthdr.csum_flags |= 2161 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2162 m_new->m_pkthdr.csum_data = 0xffff; 2163 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK) 2164 rxr->hn_csum_tcp++; 2165 else 2166 rxr->hn_csum_udp++; 2167 } 2168 2169 /* 2170 * XXX 2171 * As of this write (Oct 28th, 2016), host side will turn 2172 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 2173 * the do_lro setting here is actually _not_ accurate. We 2174 * depend on the RSS hash type check to reset do_lro. 
2175 */ 2176 if ((info->csum_info & 2177 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 2178 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 2179 do_lro = 1; 2180 } else { 2181 const struct ether_header *eh; 2182 uint16_t etype; 2183 int hoff; 2184 2185 hoff = sizeof(*eh); 2186 if (m_new->m_len < hoff) 2187 goto skip; 2188 eh = mtod(m_new, struct ether_header *); 2189 etype = ntohs(eh->ether_type); 2190 if (etype == ETHERTYPE_VLAN) { 2191 const struct ether_vlan_header *evl; 2192 2193 hoff = sizeof(*evl); 2194 if (m_new->m_len < hoff) 2195 goto skip; 2196 evl = mtod(m_new, struct ether_vlan_header *); 2197 etype = ntohs(evl->evl_proto); 2198 } 2199 2200 if (etype == ETHERTYPE_IP) { 2201 int pr; 2202 2203 pr = hn_check_iplen(m_new, hoff); 2204 if (pr == IPPROTO_TCP) { 2205 if (do_csum && 2206 (rxr->hn_trust_hcsum & 2207 HN_TRUST_HCSUM_TCP)) { 2208 rxr->hn_csum_trusted++; 2209 m_new->m_pkthdr.csum_flags |= 2210 (CSUM_IP_CHECKED | CSUM_IP_VALID | 2211 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2212 m_new->m_pkthdr.csum_data = 0xffff; 2213 } 2214 do_lro = 1; 2215 } else if (pr == IPPROTO_UDP) { 2216 if (do_csum && 2217 (rxr->hn_trust_hcsum & 2218 HN_TRUST_HCSUM_UDP)) { 2219 rxr->hn_csum_trusted++; 2220 m_new->m_pkthdr.csum_flags |= 2221 (CSUM_IP_CHECKED | CSUM_IP_VALID | 2222 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2223 m_new->m_pkthdr.csum_data = 0xffff; 2224 } 2225 } else if (pr != IPPROTO_DONE && do_csum && 2226 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 2227 rxr->hn_csum_trusted++; 2228 m_new->m_pkthdr.csum_flags |= 2229 (CSUM_IP_CHECKED | CSUM_IP_VALID); 2230 } 2231 } 2232 } 2233skip: 2234 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) { 2235 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 2236 NDIS_VLAN_INFO_ID(info->vlan_info), 2237 NDIS_VLAN_INFO_PRI(info->vlan_info), 2238 NDIS_VLAN_INFO_CFI(info->vlan_info)); 2239 m_new->m_flags |= M_VLANTAG; 2240 } 2241 2242 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) { 2243 rxr->hn_rss_pkts++; 2244 m_new->m_pkthdr.flowid = info->hash_value; 2245 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) == 2246 NDIS_HASH_FUNCTION_TOEPLITZ) { 2247 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK); 2248 2249 /* 2250 * NOTE: 2251 * do_lro is resetted, if the hash types are not TCP 2252 * related. See the comment in the above csum_flags 2253 * setup section. 2254 */ 2255 switch (type) { 2256 case NDIS_HASH_IPV4: 2257 hash_type = M_HASHTYPE_RSS_IPV4; 2258 do_lro = 0; 2259 break; 2260 2261 case NDIS_HASH_TCP_IPV4: 2262 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 2263 break; 2264 2265 case NDIS_HASH_IPV6: 2266 hash_type = M_HASHTYPE_RSS_IPV6; 2267 do_lro = 0; 2268 break; 2269 2270 case NDIS_HASH_IPV6_EX: 2271 hash_type = M_HASHTYPE_RSS_IPV6_EX; 2272 do_lro = 0; 2273 break; 2274 2275 case NDIS_HASH_TCP_IPV6: 2276 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 2277 break; 2278 2279 case NDIS_HASH_TCP_IPV6_EX: 2280 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 2281 break; 2282 } 2283 } 2284 } else { 2285 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 2286 } 2287 M_HASHTYPE_SET(m_new, hash_type); 2288 2289 /* 2290 * Note: Moved RX completion back to hv_nv_on_receive() so all 2291 * messages (not just data messages) will trigger a response. 2292 */ 2293 2294 ifp->if_ipackets++; 2295 rxr->hn_pkts++; 2296 2297 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) { 2298#if defined(INET) || defined(INET6) 2299 struct lro_ctrl *lro = &rxr->hn_lro; 2300 2301 if (lro->lro_cnt) { 2302 rxr->hn_lro_tried++; 2303 if (hn_lro_rx(lro, m_new) == 0) { 2304 /* DONE! 
*/ 2305 return 0; 2306 } 2307 } 2308#endif 2309 } 2310 2311 /* We're not holding the lock here, so don't release it */ 2312 (*ifp->if_input)(ifp, m_new); 2313 2314 return (0); 2315} 2316 2317static int 2318hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) 2319{ 2320 struct hn_softc *sc = ifp->if_softc; 2321 struct ifreq *ifr = (struct ifreq *)data; 2322 int mask, error = 0; 2323 2324 switch (cmd) { 2325 case SIOCSIFMTU: 2326 if (ifr->ifr_mtu > HN_MTU_MAX) { 2327 error = EINVAL; 2328 break; 2329 } 2330 2331 HN_LOCK(sc); 2332 2333 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2334 HN_UNLOCK(sc); 2335 break; 2336 } 2337 2338 if ((sc->hn_caps & HN_CAP_MTU) == 0) { 2339 /* Can't change MTU */ 2340 HN_UNLOCK(sc); 2341 error = EOPNOTSUPP; 2342 break; 2343 } 2344 2345 if (ifp->if_mtu == ifr->ifr_mtu) { 2346 HN_UNLOCK(sc); 2347 break; 2348 } 2349 2350 /* 2351 * Suspend this interface before the synthetic parts 2352 * are ripped out. 2353 */ 2354 hn_suspend(sc); 2355 2356 /* 2357 * Detach the synthetic parts, i.e. NVS and RNDIS. 2358 */ 2359 hn_synth_detach(sc); 2360 2361 /* 2362 * Reattach the synthetic parts, i.e. NVS and RNDIS, 2363 * with the new MTU setting. 2364 */ 2365 error = hn_synth_attach(sc, ifr->ifr_mtu); 2366 if (error) { 2367 HN_UNLOCK(sc); 2368 break; 2369 } 2370 2371 /* 2372 * Commit the requested MTU, after the synthetic parts 2373 * have been successfully attached. 2374 */ 2375 ifp->if_mtu = ifr->ifr_mtu; 2376 2377 /* 2378 * Make sure that various parameters based on MTU are 2379 * still valid, after the MTU change. 2380 */ 2381 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) 2382 hn_set_chim_size(sc, sc->hn_chim_szmax); 2383 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu); 2384#if __FreeBSD_version >= 1100099 2385 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < 2386 HN_LRO_LENLIM_MIN(ifp)) 2387 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); 2388#endif 2389 2390 /* 2391 * All done! Resume the interface now. 2392 */ 2393 hn_resume(sc); 2394 2395 HN_UNLOCK(sc); 2396 break; 2397 2398 case SIOCSIFFLAGS: 2399 HN_LOCK(sc); 2400 2401 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2402 HN_UNLOCK(sc); 2403 break; 2404 } 2405 2406 if (ifp->if_flags & IFF_UP) { 2407 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 2408 /* 2409 * Caller might hold a mutex, e.g. 2410 * bpf; use busy-wait for the RNDIS 2411 * reply. 2412 */ 2413 HN_NO_SLEEPING(sc); 2414 hn_set_rxfilter(sc); 2415 HN_SLEEPING_OK(sc); 2416 } else { 2417 hn_init_locked(sc); 2418 } 2419 } else { 2420 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2421 hn_stop(sc); 2422 } 2423 sc->hn_if_flags = ifp->if_flags; 2424 2425 HN_UNLOCK(sc); 2426 break; 2427 2428 case SIOCSIFCAP: 2429 HN_LOCK(sc); 2430 mask = ifr->ifr_reqcap ^ ifp->if_capenable; 2431 2432 if (mask & IFCAP_TXCSUM) { 2433 ifp->if_capenable ^= IFCAP_TXCSUM; 2434 if (ifp->if_capenable & IFCAP_TXCSUM) 2435 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); 2436 else 2437 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); 2438 } 2439 if (mask & IFCAP_TXCSUM_IPV6) { 2440 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; 2441 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 2442 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); 2443 else 2444 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); 2445 } 2446 2447 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 2448 if (mask & IFCAP_RXCSUM) 2449 ifp->if_capenable ^= IFCAP_RXCSUM; 2450#ifdef foo 2451 /* We can't diff IPv6 packets from IPv4 packets on RX path.
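 * hence this knob is compiled out (#ifdef foo above).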
*/ 2452 if (mask & IFCAP_RXCSUM_IPV6) 2453 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; 2454#endif 2455 2456 if (mask & IFCAP_LRO) 2457 ifp->if_capenable ^= IFCAP_LRO; 2458 2459 if (mask & IFCAP_TSO4) { 2460 ifp->if_capenable ^= IFCAP_TSO4; 2461 if (ifp->if_capenable & IFCAP_TSO4) 2462 ifp->if_hwassist |= CSUM_IP_TSO; 2463 else 2464 ifp->if_hwassist &= ~CSUM_IP_TSO; 2465 } 2466 if (mask & IFCAP_TSO6) { 2467 ifp->if_capenable ^= IFCAP_TSO6; 2468 if (ifp->if_capenable & IFCAP_TSO6) 2469 ifp->if_hwassist |= CSUM_IP6_TSO; 2470 else 2471 ifp->if_hwassist &= ~CSUM_IP6_TSO; 2472 } 2473 2474 HN_UNLOCK(sc); 2475 break; 2476 2477 case SIOCADDMULTI: 2478 case SIOCDELMULTI: 2479 HN_LOCK(sc); 2480 2481 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2482 HN_UNLOCK(sc); 2483 break; 2484 } 2485 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 2486 /* 2487 * Multicast uses mutex; use busy-wait for 2488 * the RNDIS reply. 2489 */ 2490 HN_NO_SLEEPING(sc); 2491 hn_set_rxfilter(sc); 2492 HN_SLEEPING_OK(sc); 2493 } 2494 2495 HN_UNLOCK(sc); 2496 break; 2497 2498 case SIOCSIFMEDIA: 2499 case SIOCGIFMEDIA: 2500 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 2501 break; 2502 2503 default: 2504 error = ether_ioctl(ifp, cmd, data); 2505 break; 2506 } 2507 return (error); 2508} 2509 2510static void 2511hn_stop(struct hn_softc *sc) 2512{ 2513 struct ifnet *ifp = sc->hn_ifp; 2514 int i; 2515 2516 HN_LOCK_ASSERT(sc); 2517 2518 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 2519 ("synthetic parts were not attached")); 2520 2521 /* Clear RUNNING bit _before_ hn_suspend_data() */ 2522 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 2523 hn_suspend_data(sc); 2524 2525 /* Clear OACTIVE bit. */ 2526 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 2527 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 2528 sc->hn_tx_ring[i].hn_oactive = 0; 2529} 2530 2531static void 2532hn_init_locked(struct hn_softc *sc) 2533{ 2534 struct ifnet *ifp = sc->hn_ifp; 2535 int i; 2536 2537 HN_LOCK_ASSERT(sc); 2538 2539 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 2540 return; 2541 2542 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2543 return; 2544 2545 /* Configure RX filter */ 2546 hn_set_rxfilter(sc); 2547 2548 /* Clear OACTIVE bit. */ 2549 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 2550 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 2551 sc->hn_tx_ring[i].hn_oactive = 0; 2552 2553 /* Clear TX 'suspended' bit. */ 2554 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 2555 2556 /* Everything is ready; unleash! 
*/ 2557 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 2558} 2559 2560static void 2561hn_init(void *xsc) 2562{ 2563 struct hn_softc *sc = xsc; 2564 2565 HN_LOCK(sc); 2566 hn_init_locked(sc); 2567 HN_UNLOCK(sc); 2568} 2569 2570#if __FreeBSD_version >= 1100099 2571 2572static int 2573hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 2574{ 2575 struct hn_softc *sc = arg1; 2576 unsigned int lenlim; 2577 int error; 2578 2579 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 2580 error = sysctl_handle_int(oidp, &lenlim, 0, req); 2581 if (error || req->newptr == NULL) 2582 return error; 2583 2584 HN_LOCK(sc); 2585 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 2586 lenlim > TCP_LRO_LENGTH_MAX) { 2587 HN_UNLOCK(sc); 2588 return EINVAL; 2589 } 2590 hn_set_lro_lenlim(sc, lenlim); 2591 HN_UNLOCK(sc); 2592 2593 return 0; 2594} 2595 2596static int 2597hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 2598{ 2599 struct hn_softc *sc = arg1; 2600 int ackcnt, error, i; 2601 2602 /* 2603 * lro_ackcnt_lim is append count limit, 2604 * +1 to turn it into aggregation limit. 2605 */ 2606 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 2607 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 2608 if (error || req->newptr == NULL) 2609 return error; 2610 2611 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 2612 return EINVAL; 2613 2614 /* 2615 * Convert aggregation limit back to append 2616 * count limit. 2617 */ 2618 --ackcnt; 2619 HN_LOCK(sc); 2620 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 2621 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 2622 HN_UNLOCK(sc); 2623 return 0; 2624} 2625 2626#endif 2627 2628static int 2629hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 2630{ 2631 struct hn_softc *sc = arg1; 2632 int hcsum = arg2; 2633 int on, error, i; 2634 2635 on = 0; 2636 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 2637 on = 1; 2638 2639 error = sysctl_handle_int(oidp, &on, 0, req); 2640 if (error || req->newptr == NULL) 2641 return error; 2642 2643 HN_LOCK(sc); 2644 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2645 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 2646 2647 if (on) 2648 rxr->hn_trust_hcsum |= hcsum; 2649 else 2650 rxr->hn_trust_hcsum &= ~hcsum; 2651 } 2652 HN_UNLOCK(sc); 2653 return 0; 2654} 2655 2656static int 2657hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 2658{ 2659 struct hn_softc *sc = arg1; 2660 int chim_size, error; 2661 2662 chim_size = sc->hn_tx_ring[0].hn_chim_size; 2663 error = sysctl_handle_int(oidp, &chim_size, 0, req); 2664 if (error || req->newptr == NULL) 2665 return error; 2666 2667 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 2668 return EINVAL; 2669 2670 HN_LOCK(sc); 2671 hn_set_chim_size(sc, chim_size); 2672 HN_UNLOCK(sc); 2673 return 0; 2674} 2675 2676#if __FreeBSD_version < 1100095 2677static int 2678hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) 2679{ 2680 struct hn_softc *sc = arg1; 2681 int ofs = arg2, i, error; 2682 struct hn_rx_ring *rxr; 2683 uint64_t stat; 2684 2685 stat = 0; 2686 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { 2687 rxr = &sc->hn_rx_ring[i]; 2688 stat += *((int *)((uint8_t *)rxr + ofs)); 2689 } 2690 2691 error = sysctl_handle_64(oidp, &stat, 0, req); 2692 if (error || req->newptr == NULL) 2693 return error; 2694 2695 /* Zero out this stat. 
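 * (Reached only on a write; writing any value through this sysctl
 * clears the per-ring counters.)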
*/ 2696 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { 2697 rxr = &sc->hn_rx_ring[i]; 2698 *((int *)((uint8_t *)rxr + ofs)) = 0; 2699 } 2700 return 0; 2701} 2702#else 2703static int 2704hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 2705{ 2706 struct hn_softc *sc = arg1; 2707 int ofs = arg2, i, error; 2708 struct hn_rx_ring *rxr; 2709 uint64_t stat; 2710 2711 stat = 0; 2712 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2713 rxr = &sc->hn_rx_ring[i]; 2714 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 2715 } 2716 2717 error = sysctl_handle_64(oidp, &stat, 0, req); 2718 if (error || req->newptr == NULL) 2719 return error; 2720 2721 /* Zero out this stat. */ 2722 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2723 rxr = &sc->hn_rx_ring[i]; 2724 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 2725 } 2726 return 0; 2727} 2728 2729#endif 2730 2731static int 2732hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 2733{ 2734 struct hn_softc *sc = arg1; 2735 int ofs = arg2, i, error; 2736 struct hn_rx_ring *rxr; 2737 u_long stat; 2738 2739 stat = 0; 2740 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2741 rxr = &sc->hn_rx_ring[i]; 2742 stat += *((u_long *)((uint8_t *)rxr + ofs)); 2743 } 2744 2745 error = sysctl_handle_long(oidp, &stat, 0, req); 2746 if (error || req->newptr == NULL) 2747 return error; 2748 2749 /* Zero out this stat. */ 2750 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2751 rxr = &sc->hn_rx_ring[i]; 2752 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 2753 } 2754 return 0; 2755} 2756 2757static int 2758hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 2759{ 2760 struct hn_softc *sc = arg1; 2761 int ofs = arg2, i, error; 2762 struct hn_tx_ring *txr; 2763 u_long stat; 2764 2765 stat = 0; 2766 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2767 txr = &sc->hn_tx_ring[i]; 2768 stat += *((u_long *)((uint8_t *)txr + ofs)); 2769 } 2770 2771 error = sysctl_handle_long(oidp, &stat, 0, req); 2772 if (error || req->newptr == NULL) 2773 return error; 2774 2775 /* Zero out this stat. 
*/ 2776 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2777 txr = &sc->hn_tx_ring[i]; 2778 *((u_long *)((uint8_t *)txr + ofs)) = 0; 2779 } 2780 return 0; 2781} 2782 2783static int 2784hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 2785{ 2786 struct hn_softc *sc = arg1; 2787 int ofs = arg2, i, error, conf; 2788 struct hn_tx_ring *txr; 2789 2790 txr = &sc->hn_tx_ring[0]; 2791 conf = *((int *)((uint8_t *)txr + ofs)); 2792 2793 error = sysctl_handle_int(oidp, &conf, 0, req); 2794 if (error || req->newptr == NULL) 2795 return error; 2796 2797 HN_LOCK(sc); 2798 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2799 txr = &sc->hn_tx_ring[i]; 2800 *((int *)((uint8_t *)txr + ofs)) = conf; 2801 } 2802 HN_UNLOCK(sc); 2803 2804 return 0; 2805} 2806 2807static int 2808hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 2809{ 2810 struct hn_softc *sc = arg1; 2811 int error, size; 2812 2813 size = sc->hn_agg_size; 2814 error = sysctl_handle_int(oidp, &size, 0, req); 2815 if (error || req->newptr == NULL) 2816 return (error); 2817 2818 HN_LOCK(sc); 2819 sc->hn_agg_size = size; 2820 hn_set_txagg(sc); 2821 HN_UNLOCK(sc); 2822 2823 return (0); 2824} 2825 2826static int 2827hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 2828{ 2829 struct hn_softc *sc = arg1; 2830 int error, pkts; 2831 2832 pkts = sc->hn_agg_pkts; 2833 error = sysctl_handle_int(oidp, &pkts, 0, req); 2834 if (error || req->newptr == NULL) 2835 return (error); 2836 2837 HN_LOCK(sc); 2838 sc->hn_agg_pkts = pkts; 2839 hn_set_txagg(sc); 2840 HN_UNLOCK(sc); 2841 2842 return (0); 2843} 2844 2845static int 2846hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 2847{ 2848 struct hn_softc *sc = arg1; 2849 int pkts; 2850 2851 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 2852 return (sysctl_handle_int(oidp, &pkts, 0, req)); 2853} 2854 2855static int 2856hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 2857{ 2858 struct hn_softc *sc = arg1; 2859 int align; 2860 2861 align = sc->hn_tx_ring[0].hn_agg_align; 2862 return (sysctl_handle_int(oidp, &align, 0, req)); 2863} 2864 2865static int 2866hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 2867{ 2868 struct hn_softc *sc = arg1; 2869 char verstr[16]; 2870 2871 snprintf(verstr, sizeof(verstr), "%u.%u", 2872 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 2873 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 2874 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 2875} 2876 2877static int 2878hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 2879{ 2880 struct hn_softc *sc = arg1; 2881 char caps_str[128]; 2882 uint32_t caps; 2883 2884 HN_LOCK(sc); 2885 caps = sc->hn_caps; 2886 HN_UNLOCK(sc); 2887 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 2888 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 2889} 2890 2891static int 2892hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 2893{ 2894 struct hn_softc *sc = arg1; 2895 char assist_str[128]; 2896 uint32_t hwassist; 2897 2898 HN_LOCK(sc); 2899 hwassist = sc->hn_ifp->if_hwassist; 2900 HN_UNLOCK(sc); 2901 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 2902 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 2903} 2904 2905static int 2906hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 2907{ 2908 struct hn_softc *sc = arg1; 2909 char filter_str[128]; 2910 uint32_t filter; 2911 2912 HN_LOCK(sc); 2913 filter = sc->hn_rx_filter; 2914 HN_UNLOCK(sc); 2915 snprintf(filter_str, sizeof(filter_str), "%b", filter, 2916 NDIS_PACKET_TYPES); 2917 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 2918} 2919 2920static int 2921hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 2922{ 
2923 struct hn_softc *sc = arg1; 2924 int error; 2925 2926 HN_LOCK(sc); 2927 2928 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 2929 if (error || req->newptr == NULL) 2930 goto back; 2931 2932 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 2933 if (error) 2934 goto back; 2935 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 2936 2937 if (sc->hn_rx_ring_inuse > 1) { 2938 error = hn_rss_reconfig(sc); 2939 } else { 2940 /* Not RSS capable, at least for now; just save the RSS key. */ 2941 error = 0; 2942 } 2943back: 2944 HN_UNLOCK(sc); 2945 return (error); 2946} 2947 2948static int 2949hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 2950{ 2951 struct hn_softc *sc = arg1; 2952 int error; 2953 2954 HN_LOCK(sc); 2955 2956 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 2957 if (error || req->newptr == NULL) 2958 goto back; 2959 2960 /* 2961 * Don't allow RSS indirect table changes if this interface 2962 * is not currently RSS capable. 2963 */ 2964 if (sc->hn_rx_ring_inuse == 1) { 2965 error = EOPNOTSUPP; 2966 goto back; 2967 } 2968 2969 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 2970 if (error) 2971 goto back; 2972 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 2973 2974 hn_rss_ind_fixup(sc); 2975 error = hn_rss_reconfig(sc); 2976back: 2977 HN_UNLOCK(sc); 2978 return (error); 2979} 2980 2981static int 2982hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 2983{ 2984 struct hn_softc *sc = arg1; 2985 char hash_str[128]; 2986 uint32_t hash; 2987 2988 HN_LOCK(sc); 2989 hash = sc->hn_rss_hash; 2990 HN_UNLOCK(sc); 2991 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 2992 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 2993} 2994 2995static int 2996hn_check_iplen(const struct mbuf *m, int hoff) 2997{ 2998 const struct ip *ip; 2999 int len, iphlen, iplen; 3000 const struct tcphdr *th; 3001 int thoff; /* TCP data offset */ 3002 3003 len = hoff + sizeof(struct ip); 3004 3005 /* The packet must be at least the size of an IP header. */ 3006 if (m->m_pkthdr.len < len) 3007 return IPPROTO_DONE; 3008 3009 /* The fixed IP header must reside completely in the first mbuf. */ 3010 if (m->m_len < len) 3011 return IPPROTO_DONE; 3012 3013 ip = mtodo(m, hoff); 3014 3015 /* Bound check the packet's stated IP header length. */ 3016 iphlen = ip->ip_hl << 2; 3017 if (iphlen < sizeof(struct ip)) /* minimum header length */ 3018 return IPPROTO_DONE; 3019 3020 /* The full IP header must reside completely in the one mbuf. */ 3021 if (m->m_len < hoff + iphlen) 3022 return IPPROTO_DONE; 3023 3024 iplen = ntohs(ip->ip_len); 3025 3026 /* 3027 * Check that the amount of data in the buffers is at 3028 * least as much as the IP header would have us expect. 3029 */ 3030 if (m->m_pkthdr.len < hoff + iplen) 3031 return IPPROTO_DONE; 3032 3033 /* 3034 * Ignore IP fragments. 3035 */ 3036 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) 3037 return IPPROTO_DONE; 3038 3039 /* 3040 * The TCP/IP or UDP/IP header must be entirely contained within 3041 * the first fragment of a packet.
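 * Otherwise the protocol-specific checks below could read past
 * the first mbuf.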
3042 */ 3043 switch (ip->ip_p) { 3044 case IPPROTO_TCP: 3045 if (iplen < iphlen + sizeof(struct tcphdr)) 3046 return IPPROTO_DONE; 3047 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) 3048 return IPPROTO_DONE; 3049 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); 3050 thoff = th->th_off << 2; 3051 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) 3052 return IPPROTO_DONE; 3053 if (m->m_len < hoff + iphlen + thoff) 3054 return IPPROTO_DONE; 3055 break; 3056 case IPPROTO_UDP: 3057 if (iplen < iphlen + sizeof(struct udphdr)) 3058 return IPPROTO_DONE; 3059 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) 3060 return IPPROTO_DONE; 3061 break; 3062 default: 3063 if (iplen < iphlen) 3064 return IPPROTO_DONE; 3065 break; 3066 } 3067 return ip->ip_p; 3068} 3069 3070static int 3071hn_create_rx_data(struct hn_softc *sc, int ring_cnt) 3072{ 3073 struct sysctl_oid_list *child; 3074 struct sysctl_ctx_list *ctx; 3075 device_t dev = sc->hn_dev; 3076#if defined(INET) || defined(INET6) 3077#if __FreeBSD_version >= 1100095 3078 int lroent_cnt; 3079#endif 3080#endif 3081 int i; 3082 3083 /* 3084 * Create RXBUF for reception. 3085 * 3086 * NOTE: 3087 * - It is shared by all channels. 3088 * - A large enough buffer is allocated, certain version of NVSes 3089 * may further limit the usable space. 3090 */ 3091 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 3092 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, 3093 BUS_DMA_WAITOK | BUS_DMA_ZERO); 3094 if (sc->hn_rxbuf == NULL) { 3095 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 3096 return (ENOMEM); 3097 } 3098 3099 sc->hn_rx_ring_cnt = ring_cnt; 3100 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 3101 3102 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 3103 M_DEVBUF, M_WAITOK | M_ZERO); 3104 3105#if defined(INET) || defined(INET6) 3106#if __FreeBSD_version >= 1100095 3107 lroent_cnt = hn_lro_entry_count; 3108 if (lroent_cnt < TCP_LRO_ENTRIES) 3109 lroent_cnt = TCP_LRO_ENTRIES; 3110 if (bootverbose) 3111 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 3112#endif 3113#endif /* INET || INET6 */ 3114 3115 ctx = device_get_sysctl_ctx(dev); 3116 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 3117 3118 /* Create dev.hn.UNIT.rx sysctl tree */ 3119 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 3120 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3121 3122 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 3123 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 3124 3125 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 3126 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, 3127 &rxr->hn_br_dma, BUS_DMA_WAITOK); 3128 if (rxr->hn_br == NULL) { 3129 device_printf(dev, "allocate bufring failed\n"); 3130 return (ENOMEM); 3131 } 3132 3133 if (hn_trust_hosttcp) 3134 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 3135 if (hn_trust_hostudp) 3136 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 3137 if (hn_trust_hostip) 3138 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 3139 rxr->hn_ifp = sc->hn_ifp; 3140 if (i < sc->hn_tx_ring_cnt) 3141 rxr->hn_txr = &sc->hn_tx_ring[i]; 3142 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 3143 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 3144 rxr->hn_rx_idx = i; 3145 rxr->hn_rxbuf = sc->hn_rxbuf; 3146 3147 /* 3148 * Initialize LRO. 
3149 */ 3150#if defined(INET) || defined(INET6) 3151#if __FreeBSD_version >= 1100095 3152 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 3153 hn_lro_mbufq_depth); 3154#else 3155 tcp_lro_init(&rxr->hn_lro); 3156 rxr->hn_lro.ifp = sc->hn_ifp; 3157#endif 3158#if __FreeBSD_version >= 1100099 3159 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 3160 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 3161#endif 3162#endif /* INET || INET6 */ 3163 3164 if (sc->hn_rx_sysctl_tree != NULL) { 3165 char name[16]; 3166 3167 /* 3168 * Create per RX ring sysctl tree: 3169 * dev.hn.UNIT.rx.RINGID 3170 */ 3171 snprintf(name, sizeof(name), "%d", i); 3172 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 3173 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 3174 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3175 3176 if (rxr->hn_rx_sysctl_tree != NULL) { 3177 SYSCTL_ADD_ULONG(ctx, 3178 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3179 OID_AUTO, "packets", CTLFLAG_RW, 3180 &rxr->hn_pkts, "# of packets received"); 3181 SYSCTL_ADD_ULONG(ctx, 3182 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3183 OID_AUTO, "rss_pkts", CTLFLAG_RW, 3184 &rxr->hn_rss_pkts, 3185 "# of packets w/ RSS info received"); 3186 SYSCTL_ADD_INT(ctx, 3187 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3188 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 3189 &rxr->hn_pktbuf_len, 0, 3190 "Temporary channel packet buffer length"); 3191 } 3192 } 3193 } 3194 3195 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 3196 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3197 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 3198#if __FreeBSD_version < 1100095 3199 hn_rx_stat_int_sysctl, 3200#else 3201 hn_rx_stat_u64_sysctl, 3202#endif 3203 "LU", "LRO queued"); 3204 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 3205 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3206 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 3207#if __FreeBSD_version < 1100095 3208 hn_rx_stat_int_sysctl, 3209#else 3210 hn_rx_stat_u64_sysctl, 3211#endif 3212 "LU", "LRO flushed"); 3213 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 3214 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3215 __offsetof(struct hn_rx_ring, hn_lro_tried), 3216 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 3217#if __FreeBSD_version >= 1100099 3218 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 3219 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3220 hn_lro_lenlim_sysctl, "IU", 3221 "Max # of data bytes to be aggregated by LRO"); 3222 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 3223 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3224 hn_lro_ackcnt_sysctl, "I", 3225 "Max # of ACKs to be aggregated by LRO"); 3226#endif 3227 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 3228 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 3229 hn_trust_hcsum_sysctl, "I", 3230 "Trust tcp segment verification on host side, " 3231 "when csum info is missing"); 3232 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 3233 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 3234 hn_trust_hcsum_sysctl, "I", 3235 "Trust udp datagram verification on host side, " 3236 "when csum info is missing"); 3237 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 3238 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 3239 hn_trust_hcsum_sysctl, "I", 3240 "Trust ip packet verification on host side, " 3241 "when csum info is missing"); 3242 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", 3243 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3244
__offsetof(struct hn_rx_ring, hn_csum_ip), 3245 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 3246 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 3247 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3248 __offsetof(struct hn_rx_ring, hn_csum_tcp), 3249 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 3250 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 3251 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3252 __offsetof(struct hn_rx_ring, hn_csum_udp), 3253 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 3254 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 3255 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3256 __offsetof(struct hn_rx_ring, hn_csum_trusted), 3257 hn_rx_stat_ulong_sysctl, "LU", 3258 "# of packets that we trust host's csum verification"); 3259 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 3260 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3261 __offsetof(struct hn_rx_ring, hn_small_pkts), 3262 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 3263 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 3264 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3265 __offsetof(struct hn_rx_ring, hn_ack_failed), 3266 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 3267 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 3268 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 3269 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 3270 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 3271 3272 return (0); 3273} 3274 3275static void 3276hn_destroy_rx_data(struct hn_softc *sc) 3277{ 3278 int i; 3279 3280 if (sc->hn_rxbuf != NULL) { 3281 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 3282 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); 3283 else 3284 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 3285 sc->hn_rxbuf = NULL; 3286 } 3287 3288 if (sc->hn_rx_ring_cnt == 0) 3289 return; 3290 3291 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 3292 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 3293 3294 if (rxr->hn_br == NULL) 3295 continue; 3296 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 3297 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); 3298 } else { 3299 device_printf(sc->hn_dev, 3300 "%dth channel bufring is referenced", i); 3301 } 3302 rxr->hn_br = NULL; 3303 3304#if defined(INET) || defined(INET6) 3305 tcp_lro_free(&rxr->hn_lro); 3306#endif 3307 free(rxr->hn_pktbuf, M_DEVBUF); 3308 } 3309 free(sc->hn_rx_ring, M_DEVBUF); 3310 sc->hn_rx_ring = NULL; 3311 3312 sc->hn_rx_ring_cnt = 0; 3313 sc->hn_rx_ring_inuse = 0; 3314} 3315 3316static int 3317hn_tx_ring_create(struct hn_softc *sc, int id) 3318{ 3319 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 3320 device_t dev = sc->hn_dev; 3321 bus_dma_tag_t parent_dtag; 3322 int error, i; 3323 3324 txr->hn_sc = sc; 3325 txr->hn_tx_idx = id; 3326 3327#ifndef HN_USE_TXDESC_BUFRING 3328 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 3329#endif 3330 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 3331 3332 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 3333 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 3334 M_DEVBUF, M_WAITOK | M_ZERO); 3335#ifndef HN_USE_TXDESC_BUFRING 3336 SLIST_INIT(&txr->hn_txlist); 3337#else 3338 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 3339 M_WAITOK, &txr->hn_tx_lock); 3340#endif 3341 3342 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { 3343 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 3344 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 3345 } else { 3346 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % 
hn_tx_taskq_cnt]; 3347 } 3348 3349#ifdef HN_IFSTART_SUPPORT 3350 if (hn_use_if_start) { 3351 txr->hn_txeof = hn_start_txeof; 3352 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 3353 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 3354 } else 3355#endif 3356 { 3357 int br_depth; 3358 3359 txr->hn_txeof = hn_xmit_txeof; 3360 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 3361 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 3362 3363 br_depth = hn_get_txswq_depth(txr); 3364 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 3365 M_WAITOK, &txr->hn_tx_lock); 3366 } 3367 3368 txr->hn_direct_tx_size = hn_direct_tx_size; 3369 3370 /* 3371 * Always schedule transmission instead of trying to do direct 3372 * transmission. This one gives the best performance so far. 3373 */ 3374 txr->hn_sched_tx = 1; 3375 3376 parent_dtag = bus_get_dma_tag(dev); 3377 3378 /* DMA tag for RNDIS packet messages. */ 3379 error = bus_dma_tag_create(parent_dtag, /* parent */ 3380 HN_RNDIS_PKT_ALIGN, /* alignment */ 3381 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 3382 BUS_SPACE_MAXADDR, /* lowaddr */ 3383 BUS_SPACE_MAXADDR, /* highaddr */ 3384 NULL, NULL, /* filter, filterarg */ 3385 HN_RNDIS_PKT_LEN, /* maxsize */ 3386 1, /* nsegments */ 3387 HN_RNDIS_PKT_LEN, /* maxsegsize */ 3388 0, /* flags */ 3389 NULL, /* lockfunc */ 3390 NULL, /* lockfuncarg */ 3391 &txr->hn_tx_rndis_dtag); 3392 if (error) { 3393 device_printf(dev, "failed to create rndis dmatag\n"); 3394 return error; 3395 } 3396 3397 /* DMA tag for data. */ 3398 error = bus_dma_tag_create(parent_dtag, /* parent */ 3399 1, /* alignment */ 3400 HN_TX_DATA_BOUNDARY, /* boundary */ 3401 BUS_SPACE_MAXADDR, /* lowaddr */ 3402 BUS_SPACE_MAXADDR, /* highaddr */ 3403 NULL, NULL, /* filter, filterarg */ 3404 HN_TX_DATA_MAXSIZE, /* maxsize */ 3405 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 3406 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 3407 0, /* flags */ 3408 NULL, /* lockfunc */ 3409 NULL, /* lockfuncarg */ 3410 &txr->hn_tx_data_dtag); 3411 if (error) { 3412 device_printf(dev, "failed to create data dmatag\n"); 3413 return error; 3414 } 3415 3416 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 3417 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 3418 3419 txd->txr = txr; 3420 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3421 STAILQ_INIT(&txd->agg_list); 3422 3423 /* 3424 * Allocate and load RNDIS packet message. 3425 */ 3426 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 3427 (void **)&txd->rndis_pkt, 3428 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 3429 &txd->rndis_pkt_dmap); 3430 if (error) { 3431 device_printf(dev, 3432 "failed to allocate rndis_packet_msg, %d\n", i); 3433 return error; 3434 } 3435 3436 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 3437 txd->rndis_pkt_dmap, 3438 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 3439 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 3440 BUS_DMA_NOWAIT); 3441 if (error) { 3442 device_printf(dev, 3443 "failed to load rndis_packet_msg, %d\n", i); 3444 bus_dmamem_free(txr->hn_tx_rndis_dtag, 3445 txd->rndis_pkt, txd->rndis_pkt_dmap); 3446 return error; 3447 } 3448 3449 /* DMA map for TX data. 
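 * One map per txdesc, created here; hn_encap() loads it per
 * packet and hn_txdesc_put() unloads it, if it was loaded.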
*/ 3450 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 3451 &txd->data_dmap); 3452 if (error) { 3453 device_printf(dev, 3454 "failed to allocate tx data dmamap\n"); 3455 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 3456 txd->rndis_pkt_dmap); 3457 bus_dmamem_free(txr->hn_tx_rndis_dtag, 3458 txd->rndis_pkt, txd->rndis_pkt_dmap); 3459 return error; 3460 } 3461 3462 /* All set, put it to list */ 3463 txd->flags |= HN_TXD_FLAG_ONLIST; 3464#ifndef HN_USE_TXDESC_BUFRING 3465 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 3466#else 3467 buf_ring_enqueue(txr->hn_txdesc_br, txd); 3468#endif 3469 } 3470 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 3471 3472 if (sc->hn_tx_sysctl_tree != NULL) { 3473 struct sysctl_oid_list *child; 3474 struct sysctl_ctx_list *ctx; 3475 char name[16]; 3476 3477 /* 3478 * Create per TX ring sysctl tree: 3479 * dev.hn.UNIT.tx.RINGID 3480 */ 3481 ctx = device_get_sysctl_ctx(dev); 3482 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 3483 3484 snprintf(name, sizeof(name), "%d", id); 3485 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 3486 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3487 3488 if (txr->hn_tx_sysctl_tree != NULL) { 3489 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 3490 3491#ifdef HN_DEBUG 3492 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 3493 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 3494 "# of available TX descs"); 3495#endif 3496#ifdef HN_IFSTART_SUPPORT 3497 if (!hn_use_if_start) 3498#endif 3499 { 3500 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 3501 CTLFLAG_RD, &txr->hn_oactive, 0, 3502 "over active"); 3503 } 3504 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 3505 CTLFLAG_RW, &txr->hn_pkts, 3506 "# of packets transmitted"); 3507 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 3508 CTLFLAG_RW, &txr->hn_sends, "# of sends"); 3509 } 3510 } 3511 3512 return 0; 3513} 3514 3515static void 3516hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 3517{ 3518 struct hn_tx_ring *txr = txd->txr; 3519 3520 KASSERT(txd->m == NULL, ("still has mbuf installed")); 3521 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 3522 3523 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 3524 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 3525 txd->rndis_pkt_dmap); 3526 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 3527} 3528 3529static void 3530hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 3531{ 3532 3533 KASSERT(txd->refs == 0 || txd->refs == 1, 3534 ("invalid txd refs %d", txd->refs)); 3535 3536 /* Aggregated txds will be freed by their aggregating txd. */ 3537 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 3538 int freed; 3539 3540 freed = hn_txdesc_put(txr, txd); 3541 KASSERT(freed, ("can't free txdesc")); 3542 } 3543} 3544 3545static void 3546hn_tx_ring_destroy(struct hn_tx_ring *txr) 3547{ 3548 int i; 3549 3550 if (txr->hn_txdesc == NULL) 3551 return; 3552 3553 /* 3554 * NOTE: 3555 * Because the freeing of aggregated txds will be deferred 3556 * to the aggregating txd, two passes are used here: 3557 * - The first pass GCes any pending txds. This GC is necessary, 3558 * since if the channels are revoked, hypervisor will not 3559 * deliver send-done for all pending txds. 3560 * - The second pass frees the busdma stuffs, i.e. after all txds 3561 * were freed. 
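 * hn_txdesc_gc() drops the reference that the missing
 * send-done would otherwise have released.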
3562 */ 3563 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 3564 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 3565 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 3566 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 3567 3568 if (txr->hn_tx_data_dtag != NULL) 3569 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 3570 if (txr->hn_tx_rndis_dtag != NULL) 3571 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 3572 3573#ifdef HN_USE_TXDESC_BUFRING 3574 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 3575#endif 3576 3577 free(txr->hn_txdesc, M_DEVBUF); 3578 txr->hn_txdesc = NULL; 3579 3580 if (txr->hn_mbuf_br != NULL) 3581 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 3582 3583#ifndef HN_USE_TXDESC_BUFRING 3584 mtx_destroy(&txr->hn_txlist_spin); 3585#endif 3586 mtx_destroy(&txr->hn_tx_lock); 3587} 3588 3589static int 3590hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 3591{ 3592 struct sysctl_oid_list *child; 3593 struct sysctl_ctx_list *ctx; 3594 int i; 3595 3596 /* 3597 * Create TXBUF for chimney sending. 3598 * 3599 * NOTE: It is shared by all channels. 3600 */ 3601 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), 3602 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, 3603 BUS_DMA_WAITOK | BUS_DMA_ZERO); 3604 if (sc->hn_chim == NULL) { 3605 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 3606 return (ENOMEM); 3607 } 3608 3609 sc->hn_tx_ring_cnt = ring_cnt; 3610 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 3611 3612 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 3613 M_DEVBUF, M_WAITOK | M_ZERO); 3614 3615 ctx = device_get_sysctl_ctx(sc->hn_dev); 3616 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 3617 3618 /* Create dev.hn.UNIT.tx sysctl tree */ 3619 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 3620 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3621 3622 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 3623 int error; 3624 3625 error = hn_tx_ring_create(sc, i); 3626 if (error) 3627 return error; 3628 } 3629 3630 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 3631 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3632 __offsetof(struct hn_tx_ring, hn_no_txdescs), 3633 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 3634 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 3635 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3636 __offsetof(struct hn_tx_ring, hn_send_failed), 3637 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 3638 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 3639 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3640 __offsetof(struct hn_tx_ring, hn_txdma_failed), 3641 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 3642 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 3643 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3644 __offsetof(struct hn_tx_ring, hn_flush_failed), 3645 hn_tx_stat_ulong_sysctl, "LU", 3646 "# of packet transmission aggregation flush failure"); 3647 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 3648 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3649 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 3650 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 3651 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 3652 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3653 __offsetof(struct hn_tx_ring, hn_tx_chimney), 3654 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 3655 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 3656 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3657 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 3658 
hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 3659 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 3660 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 3661 "# of total TX descs"); 3662 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 3663 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 3664 "Chimney send packet size upper boundary"); 3665 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 3666 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3667 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 3668 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 3669 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3670 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 3671 hn_tx_conf_int_sysctl, "I", 3672 "Size of the packet for direct transmission"); 3673 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 3674 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3675 __offsetof(struct hn_tx_ring, hn_sched_tx), 3676 hn_tx_conf_int_sysctl, "I", 3677 "Always schedule transmission " 3678 "instead of doing direct transmission"); 3679 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 3680 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 3681 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 3682 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 3683 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 3684 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 3685 "Applied packet transmission aggregation size"); 3686 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 3687 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 3688 hn_txagg_pktmax_sysctl, "I", 3689 "Applied packet transmission aggregation packets"); 3690 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 3691 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 3692 hn_txagg_align_sysctl, "I", 3693 "Applied packet transmission aggregation alignment"); 3694 3695 return 0; 3696} 3697 3698static void 3699hn_set_chim_size(struct hn_softc *sc, int chim_size) 3700{ 3701 int i; 3702 3703 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 3704 sc->hn_tx_ring[i].hn_chim_size = chim_size; 3705} 3706 3707static void 3708hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 3709{ 3710 struct ifnet *ifp = sc->hn_ifp; 3711 int tso_minlen; 3712 3713 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 3714 return; 3715 3716 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 3717 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 3718 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 3719 3720 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 3721 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 3722 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 3723 3724 if (tso_maxlen < tso_minlen) 3725 tso_maxlen = tso_minlen; 3726 else if (tso_maxlen > IP_MAXPACKET) 3727 tso_maxlen = IP_MAXPACKET; 3728 if (tso_maxlen > sc->hn_ndis_tso_szmax) 3729 tso_maxlen = sc->hn_ndis_tso_szmax; 3730 ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 3731 if (bootverbose) 3732 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); 3733} 3734 3735static void 3736hn_fixup_tx_data(struct hn_softc *sc) 3737{ 3738 uint64_t csum_assist; 3739 int i; 3740 3741 hn_set_chim_size(sc, sc->hn_chim_szmax); 3742 if (hn_tx_chimney_size > 0 && 3743 hn_tx_chimney_size < sc->hn_chim_szmax) 3744 hn_set_chim_size(sc, hn_tx_chimney_size); 3745 3746 csum_assist = 0; 3747 if (sc->hn_caps & HN_CAP_IPCS) 3748 csum_assist |= CSUM_IP; 3749 if (sc->hn_caps & HN_CAP_TCP4CS) 3750 csum_assist |= CSUM_IP_TCP; 3751 if (sc->hn_caps & HN_CAP_UDP4CS) 3752 csum_assist |= 
CSUM_IP_UDP; 3753 if (sc->hn_caps & HN_CAP_TCP6CS) 3754 csum_assist |= CSUM_IP6_TCP; 3755 if (sc->hn_caps & HN_CAP_UDP6CS) 3756 csum_assist |= CSUM_IP6_UDP; 3757 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 3758 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 3759 3760 if (sc->hn_caps & HN_CAP_HASHVAL) { 3761 /* 3762 * Support HASHVAL pktinfo on TX path. 3763 */ 3764 if (bootverbose) 3765 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 3766 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 3767 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 3768 } 3769} 3770 3771static void 3772hn_destroy_tx_data(struct hn_softc *sc) 3773{ 3774 int i; 3775 3776 if (sc->hn_chim != NULL) { 3777 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 3778 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); 3779 } else { 3780 device_printf(sc->hn_dev, 3781 "chimney sending buffer is referenced"); 3782 } 3783 sc->hn_chim = NULL; 3784 } 3785 3786 if (sc->hn_tx_ring_cnt == 0) 3787 return; 3788 3789 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 3790 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 3791 3792 free(sc->hn_tx_ring, M_DEVBUF); 3793 sc->hn_tx_ring = NULL; 3794 3795 sc->hn_tx_ring_cnt = 0; 3796 sc->hn_tx_ring_inuse = 0; 3797} 3798 3799#ifdef HN_IFSTART_SUPPORT 3800 3801static void 3802hn_start_taskfunc(void *xtxr, int pending __unused) 3803{ 3804 struct hn_tx_ring *txr = xtxr; 3805 3806 mtx_lock(&txr->hn_tx_lock); 3807 hn_start_locked(txr, 0); 3808 mtx_unlock(&txr->hn_tx_lock); 3809} 3810 3811static int 3812hn_start_locked(struct hn_tx_ring *txr, int len) 3813{ 3814 struct hn_softc *sc = txr->hn_sc; 3815 struct ifnet *ifp = sc->hn_ifp; 3816 int sched = 0; 3817 3818 KASSERT(hn_use_if_start, 3819 ("hn_start_locked is called when if_start is disabled")); 3820 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 3821 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 3822 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 3823 3824 if (__predict_false(txr->hn_suspended)) 3825 return (0); 3826 3827 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 3828 IFF_DRV_RUNNING) 3829 return (0); 3830 3831 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { 3832 struct hn_txdesc *txd; 3833 struct mbuf *m_head; 3834 int error; 3835 3836 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); 3837 if (m_head == NULL) 3838 break; 3839 3840 if (len > 0 && m_head->m_pkthdr.len > len) { 3841 /* 3842 * This sending could be time consuming; let callers 3843 * dispatch this packet sending (and sending of any 3844 * follow-up packets) to tx taskqueue.
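 * (hn_start() passes len = hn_direct_tx_size for the direct
 * path; the taskqueue entry points call back with len = 0,
 * i.e. no limit.)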
3845 */ 3846 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 3847 sched = 1; 3848 break; 3849 } 3850 3851#if defined(INET6) || defined(INET) 3852 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 3853 m_head = hn_tso_fixup(m_head); 3854 if (__predict_false(m_head == NULL)) { 3855 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 3856 continue; 3857 } 3858 } 3859#endif 3860 3861 txd = hn_txdesc_get(txr); 3862 if (txd == NULL) { 3863 txr->hn_no_txdescs++; 3864 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 3865 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 3866 break; 3867 } 3868 3869 error = hn_encap(ifp, txr, txd, &m_head); 3870 if (error) { 3871 /* Both txd and m_head are freed */ 3872 KASSERT(txr->hn_agg_txd == NULL, 3873 ("encap failed w/ pending aggregating txdesc")); 3874 continue; 3875 } 3876 3877 if (txr->hn_agg_pktleft == 0) { 3878 if (txr->hn_agg_txd != NULL) { 3879 KASSERT(m_head == NULL, 3880 ("pending mbuf for aggregating txdesc")); 3881 error = hn_flush_txagg(ifp, txr); 3882 if (__predict_false(error)) { 3883 atomic_set_int(&ifp->if_drv_flags, 3884 IFF_DRV_OACTIVE); 3885 break; 3886 } 3887 } else { 3888 KASSERT(m_head != NULL, ("mbuf was freed")); 3889 error = hn_txpkt(ifp, txr, txd); 3890 if (__predict_false(error)) { 3891 /* txd is freed, but m_head is not */ 3892 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 3893 atomic_set_int(&ifp->if_drv_flags, 3894 IFF_DRV_OACTIVE); 3895 break; 3896 } 3897 } 3898 } 3899#ifdef INVARIANTS 3900 else { 3901 KASSERT(txr->hn_agg_txd != NULL, 3902 ("no aggregating txdesc")); 3903 KASSERT(m_head == NULL, 3904 ("pending mbuf for aggregating txdesc")); 3905 } 3906#endif 3907 } 3908 3909 /* Flush pending aggregated transmission. */ 3910 if (txr->hn_agg_txd != NULL) 3911 hn_flush_txagg(ifp, txr); 3912 return (sched); 3913} 3914 3915static void 3916hn_start(struct ifnet *ifp) 3917{ 3918 struct hn_softc *sc = ifp->if_softc; 3919 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 3920 3921 if (txr->hn_sched_tx) 3922 goto do_sched; 3923 3924 if (mtx_trylock(&txr->hn_tx_lock)) { 3925 int sched; 3926 3927 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 3928 mtx_unlock(&txr->hn_tx_lock); 3929 if (!sched) 3930 return; 3931 } 3932do_sched: 3933 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 3934} 3935 3936static void 3937hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 3938{ 3939 struct hn_tx_ring *txr = xtxr; 3940 3941 mtx_lock(&txr->hn_tx_lock); 3942 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); 3943 hn_start_locked(txr, 0); 3944 mtx_unlock(&txr->hn_tx_lock); 3945} 3946 3947static void 3948hn_start_txeof(struct hn_tx_ring *txr) 3949{ 3950 struct hn_softc *sc = txr->hn_sc; 3951 struct ifnet *ifp = sc->hn_ifp; 3952 3953 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 3954 3955 if (txr->hn_sched_tx) 3956 goto do_sched; 3957 3958 if (mtx_trylock(&txr->hn_tx_lock)) { 3959 int sched; 3960 3961 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 3962 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 3963 mtx_unlock(&txr->hn_tx_lock); 3964 if (sched) { 3965 taskqueue_enqueue(txr->hn_tx_taskq, 3966 &txr->hn_tx_task); 3967 } 3968 } else { 3969do_sched: 3970 /* 3971 * Release OACTIVE earlier, in the hope that 3972 * others can catch up. The task will clear the 3973 * flag again under the hn_tx_lock to avoid possible 3974 * races.
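 * (Clearing IFF_DRV_OACTIVE outside of hn_tx_lock may race with
 * hn_start_locked() setting it again on txdesc shortage; the
 * re-clear under hn_tx_lock in hn_start_txeof_taskfunc() closes
 * that window.)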
3975 */ 3976 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 3977 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 3978 } 3979} 3980 3981#endif /* HN_IFSTART_SUPPORT */ 3982 3983static int 3984hn_xmit(struct hn_tx_ring *txr, int len) 3985{ 3986 struct hn_softc *sc = txr->hn_sc; 3987 struct ifnet *ifp = sc->hn_ifp; 3988 struct mbuf *m_head; 3989 int sched = 0; 3990 3991 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 3992#ifdef HN_IFSTART_SUPPORT 3993 KASSERT(hn_use_if_start == 0, 3994 ("hn_xmit is called when if_start is enabled")); 3995#endif 3996 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 3997 3998 if (__predict_false(txr->hn_suspended)) 3999 return (0); 4000 4001 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) 4002 return (0); 4003 4004 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { 4005 struct hn_txdesc *txd; 4006 int error; 4007 4008 if (len > 0 && m_head->m_pkthdr.len > len) { 4009 /* 4010 * This send could be time consuming; let callers 4011 * dispatch this packet (and any packets following 4012 * it) to the tx taskqueue. 4013 */ 4014 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 4015 sched = 1; 4016 break; 4017 } 4018 4019 txd = hn_txdesc_get(txr); 4020 if (txd == NULL) { 4021 txr->hn_no_txdescs++; 4022 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 4023 txr->hn_oactive = 1; 4024 break; 4025 } 4026 4027 error = hn_encap(ifp, txr, txd, &m_head); 4028 if (error) { 4029 /* Both txd and m_head are freed; discard */ 4030 KASSERT(txr->hn_agg_txd == NULL, 4031 ("encap failed w/ pending aggregating txdesc")); 4032 drbr_advance(ifp, txr->hn_mbuf_br); 4033 continue; 4034 } 4035 4036 if (txr->hn_agg_pktleft == 0) { 4037 if (txr->hn_agg_txd != NULL) { 4038 KASSERT(m_head == NULL, 4039 ("pending mbuf for aggregating txdesc")); 4040 error = hn_flush_txagg(ifp, txr); 4041 if (__predict_false(error)) { 4042 txr->hn_oactive = 1; 4043 break; 4044 } 4045 } else { 4046 KASSERT(m_head != NULL, ("mbuf was freed")); 4047 error = hn_txpkt(ifp, txr, txd); 4048 if (__predict_false(error)) { 4049 /* txd is freed, but m_head is not */ 4050 drbr_putback(ifp, txr->hn_mbuf_br, 4051 m_head); 4052 txr->hn_oactive = 1; 4053 break; 4054 } 4055 } 4056 } 4057#ifdef INVARIANTS 4058 else { 4059 KASSERT(txr->hn_agg_txd != NULL, 4060 ("no aggregating txdesc")); 4061 KASSERT(m_head == NULL, 4062 ("pending mbuf for aggregating txdesc")); 4063 } 4064#endif 4065 4066 /* Sent */ 4067 drbr_advance(ifp, txr->hn_mbuf_br); 4068 } 4069 4070 /* Flush pending aggregated transmission. */ 4071 if (txr->hn_agg_txd != NULL) 4072 hn_flush_txagg(ifp, txr); 4073 return (sched); 4074} 4075 4076static int 4077hn_transmit(struct ifnet *ifp, struct mbuf *m) 4078{ 4079 struct hn_softc *sc = ifp->if_softc; 4080 struct hn_tx_ring *txr; 4081 int error, idx = 0; 4082 4083#if defined(INET6) || defined(INET) 4084 /* 4085 * Perform TSO packet header fixup now, since the TSO 4086 * packet header should be cache-hot.
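 * hn_tso_fixup() may replace the mbuf chain; it returns NULL on
 * failure, in which case the packet is dropped, counted as an
 * output error, and EIO is returned below.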
4087 */ 4088 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 4089 m = hn_tso_fixup(m); 4090 if (__predict_false(m == NULL)) { 4091 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 4092 return EIO; 4093 } 4094 } 4095#endif 4096 4097 /* 4098 * Select the TX ring based on flowid 4099 */ 4100 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) 4101 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 4102 txr = &sc->hn_tx_ring[idx]; 4103 4104 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 4105 if (error) { 4106 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 4107 return error; 4108 } 4109 4110 if (txr->hn_oactive) 4111 return 0; 4112 4113 if (txr->hn_sched_tx) 4114 goto do_sched; 4115 4116 if (mtx_trylock(&txr->hn_tx_lock)) { 4117 int sched; 4118 4119 sched = hn_xmit(txr, txr->hn_direct_tx_size); 4120 mtx_unlock(&txr->hn_tx_lock); 4121 if (!sched) 4122 return 0; 4123 } 4124do_sched: 4125 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 4126 return 0; 4127} 4128 4129static void 4130hn_tx_ring_qflush(struct hn_tx_ring *txr) 4131{ 4132 struct mbuf *m; 4133 4134 mtx_lock(&txr->hn_tx_lock); 4135 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 4136 m_freem(m); 4137 mtx_unlock(&txr->hn_tx_lock); 4138} 4139 4140static void 4141hn_xmit_qflush(struct ifnet *ifp) 4142{ 4143 struct hn_softc *sc = ifp->if_softc; 4144 int i; 4145 4146 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4147 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 4148 if_qflush(ifp); 4149} 4150 4151static void 4152hn_xmit_txeof(struct hn_tx_ring *txr) 4153{ 4154 4155 if (txr->hn_sched_tx) 4156 goto do_sched; 4157 4158 if (mtx_trylock(&txr->hn_tx_lock)) { 4159 int sched; 4160 4161 txr->hn_oactive = 0; 4162 sched = hn_xmit(txr, txr->hn_direct_tx_size); 4163 mtx_unlock(&txr->hn_tx_lock); 4164 if (sched) { 4165 taskqueue_enqueue(txr->hn_tx_taskq, 4166 &txr->hn_tx_task); 4167 } 4168 } else { 4169do_sched: 4170 /* 4171 * Release oactive earlier, in the hope that 4172 * others can catch up. The task will clear 4173 * oactive again under the hn_tx_lock to avoid possible 4174 * races. 4175 */ 4176 txr->hn_oactive = 0; 4177 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 4178 } 4179} 4180 4181static void 4182hn_xmit_taskfunc(void *xtxr, int pending __unused) 4183{ 4184 struct hn_tx_ring *txr = xtxr; 4185 4186 mtx_lock(&txr->hn_tx_lock); 4187 hn_xmit(txr, 0); 4188 mtx_unlock(&txr->hn_tx_lock); 4189} 4190 4191static void 4192hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) 4193{ 4194 struct hn_tx_ring *txr = xtxr; 4195 4196 mtx_lock(&txr->hn_tx_lock); 4197 txr->hn_oactive = 0; 4198 hn_xmit(txr, 0); 4199 mtx_unlock(&txr->hn_tx_lock); 4200} 4201 4202static int 4203hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) 4204{ 4205 struct vmbus_chan_br cbr; 4206 struct hn_rx_ring *rxr; 4207 struct hn_tx_ring *txr = NULL; 4208 int idx, error; 4209 4210 idx = vmbus_chan_subidx(chan); 4211 4212 /* 4213 * Link this channel to RX/TX ring.
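 * The channel sub-index maps 1:1 onto the ring index.  Every
 * channel is backed by an RX ring; a TX ring is linked only when
 * the sub-index is below hn_tx_ring_inuse, since fewer TX rings
 * than RX rings may be in use.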
4214 */ 4215 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 4216 ("invalid channel index %d, should be >= 0 && < %d", 4217 idx, sc->hn_rx_ring_inuse)); 4218 rxr = &sc->hn_rx_ring[idx]; 4219 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, 4220 ("RX ring %d already attached", idx)); 4221 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; 4222 4223 if (bootverbose) { 4224 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", 4225 idx, vmbus_chan_id(chan)); 4226 } 4227 4228 if (idx < sc->hn_tx_ring_inuse) { 4229 txr = &sc->hn_tx_ring[idx]; 4230 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, 4231 ("TX ring %d already attached", idx)); 4232 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; 4233 4234 txr->hn_chan = chan; 4235 if (bootverbose) { 4236 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", 4237 idx, vmbus_chan_id(chan)); 4238 } 4239 } 4240 4241 /* Bind this channel to a proper CPU. */ 4242 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); 4243 4244 /* 4245 * Open this channel 4246 */ 4247 cbr.cbr = rxr->hn_br; 4248 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr; 4249 cbr.cbr_txsz = HN_TXBR_SIZE; 4250 cbr.cbr_rxsz = HN_RXBR_SIZE; 4251 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); 4252 if (error) { 4253 if (error == EISCONN) { 4254 if_printf(sc->hn_ifp, "bufring is connected after " 4255 "chan%u open failure\n", vmbus_chan_id(chan)); 4256 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 4257 } else { 4258 if_printf(sc->hn_ifp, "open chan%u failed: %d\n", 4259 vmbus_chan_id(chan), error); 4260 } 4261 } 4262 return (error); 4263} 4264 4265static void 4266hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) 4267{ 4268 struct hn_rx_ring *rxr; 4269 int idx, error; 4270 4271 idx = vmbus_chan_subidx(chan); 4272 4273 /* 4274 * Unlink this channel from its RX/TX ring. 4275 */ 4276 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 4277 ("invalid channel index %d, should be >= 0 && < %d", 4278 idx, sc->hn_rx_ring_inuse)); 4279 rxr = &sc->hn_rx_ring[idx]; 4280 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), 4281 ("RX ring %d is not attached", idx)); 4282 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; 4283 4284 if (idx < sc->hn_tx_ring_inuse) { 4285 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; 4286 4287 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), 4288 ("TX ring %d is not attached", idx)); 4289 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; 4290 } 4291 4292 /* 4293 * Close this channel. 4294 * 4295 * NOTE: 4296 * Channel closing does _not_ destroy the target channel. 4297 */ 4298 error = vmbus_chan_close_direct(chan); 4299 if (error == EISCONN) { 4300 if_printf(sc->hn_ifp, "chan%u bufring is connected " 4301 "after being closed\n", vmbus_chan_id(chan)); 4302 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 4303 } else if (error) { 4304 if_printf(sc->hn_ifp, "chan%u close failed: %d\n", 4305 vmbus_chan_id(chan), error); 4306 } 4307} 4308 4309static int 4310hn_attach_subchans(struct hn_softc *sc) 4311{ 4312 struct vmbus_channel **subchans; 4313 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 4314 int i, error = 0; 4315 4316 KASSERT(subchan_cnt > 0, ("no sub-channels")); 4317 4318 /* Attach the sub-channels. */ 4319 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 4320 for (i = 0; i < subchan_cnt; ++i) { 4321 int error1; 4322 4323 error1 = hn_chan_attach(sc, subchans[i]); 4324 if (error1) { 4325 error = error1; 4326 /* Move on; all channels will be detached later.
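 * An attach error is only recorded here; it is returned after
 * all sub-channels have been tried, so a single failure does not
 * leave the remaining channels in an inconsistent state.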
*/ 4327 } 4328 } 4329 vmbus_subchan_rel(subchans, subchan_cnt); 4330 4331 if (error) { 4332 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); 4333 } else { 4334 if (bootverbose) { 4335 if_printf(sc->hn_ifp, "%d sub-channels attached\n", 4336 subchan_cnt); 4337 } 4338 } 4339 return (error); 4340} 4341 4342static void 4343hn_detach_allchans(struct hn_softc *sc) 4344{ 4345 struct vmbus_channel **subchans; 4346 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 4347 int i; 4348 4349 if (subchan_cnt == 0) 4350 goto back; 4351 4352 /* Detach the sub-channels. */ 4353 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 4354 for (i = 0; i < subchan_cnt; ++i) 4355 hn_chan_detach(sc, subchans[i]); 4356 vmbus_subchan_rel(subchans, subchan_cnt); 4357 4358back: 4359 /* 4360 * Detach the primary channel, _after_ all sub-channels 4361 * are detached. 4362 */ 4363 hn_chan_detach(sc, sc->hn_prichan); 4364 4365 /* Wait for sub-channels to be destroyed, if any. */ 4366 vmbus_subchan_drain(sc->hn_prichan); 4367 4368#ifdef INVARIANTS 4369 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4370 KASSERT((sc->hn_rx_ring[i].hn_rx_flags & 4371 HN_RX_FLAG_ATTACHED) == 0, 4372 ("%dth RX ring is still attached", i)); 4373 } 4374 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4375 KASSERT((sc->hn_tx_ring[i].hn_tx_flags & 4376 HN_TX_FLAG_ATTACHED) == 0, 4377 ("%dth TX ring is still attached", i)); 4378 } 4379#endif 4380} 4381 4382static int 4383hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) 4384{ 4385 struct vmbus_channel **subchans; 4386 int nchan, rxr_cnt, error; 4387 4388 nchan = *nsubch + 1; 4389 if (nchan == 1) { 4390 /* 4391 * Multiple RX/TX rings are not requested. 4392 */ 4393 *nsubch = 0; 4394 return (0); 4395 } 4396 4397 /* 4398 * Query RSS capabilities, e.g. # of RX rings, and # of indirect 4399 * table entries. 4400 */ 4401 error = hn_rndis_query_rsscaps(sc, &rxr_cnt); 4402 if (error) { 4403 /* No RSS; this is benign. */ 4404 *nsubch = 0; 4405 return (0); 4406 } 4407 if (bootverbose) { 4408 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", 4409 rxr_cnt, nchan); 4410 } 4411 4412 if (nchan > rxr_cnt) 4413 nchan = rxr_cnt; 4414 if (nchan == 1) { 4415 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); 4416 *nsubch = 0; 4417 return (0); 4418 } 4419 4420 /* 4421 * Allocate sub-channels from NVS. 4422 */ 4423 *nsubch = nchan - 1; 4424 error = hn_nvs_alloc_subchans(sc, nsubch); 4425 if (error || *nsubch == 0) { 4426 /* Failed to allocate sub-channels. */ 4427 *nsubch = 0; 4428 return (0); 4429 } 4430 4431 /* 4432 * Wait for all sub-channels to become ready before moving on. 
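 * vmbus_subchan_get() blocks until the requested number of
 * sub-channels have been offered by the hypervisor; the get/rel
 * pair below is used purely for that synchronization.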
4433 */ 4434 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); 4435 vmbus_subchan_rel(subchans, *nsubch); 4436 return (0); 4437} 4438 4439static bool 4440hn_synth_attachable(const struct hn_softc *sc) 4441{ 4442 int i; 4443 4444 if (sc->hn_flags & HN_FLAG_ERRORS) 4445 return (false); 4446 4447 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4448 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4449 4450 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) 4451 return (false); 4452 } 4453 return (true); 4454} 4455 4456static int 4457hn_synth_attach(struct hn_softc *sc, int mtu) 4458{ 4459#define ATTACHED_NVS 0x0002 4460#define ATTACHED_RNDIS 0x0004 4461 4462 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 4463 int error, nsubch, nchan, i; 4464 uint32_t old_caps, attached = 0; 4465 4466 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, 4467 ("synthetic parts were attached")); 4468 4469 if (!hn_synth_attachable(sc)) 4470 return (ENXIO); 4471 4472 /* Save capabilities for later verification. */ 4473 old_caps = sc->hn_caps; 4474 sc->hn_caps = 0; 4475 4476 /* Clear RSS state. */ 4477 sc->hn_rss_ind_size = 0; 4478 sc->hn_rss_hash = 0; 4479 4480 /* 4481 * Attach the primary channel _before_ attaching NVS and RNDIS. 4482 */ 4483 error = hn_chan_attach(sc, sc->hn_prichan); 4484 if (error) 4485 goto failed; 4486 4487 /* 4488 * Attach NVS. 4489 */ 4490 error = hn_nvs_attach(sc, mtu); 4491 if (error) 4492 goto failed; 4493 attached |= ATTACHED_NVS; 4494 4495 /* 4496 * Attach RNDIS _after_ NVS is attached. 4497 */ 4498 error = hn_rndis_attach(sc, mtu); 4499 if (error) 4500 goto failed; 4501 attached |= ATTACHED_RNDIS; 4502 4503 /* 4504 * Make sure capabilities are not changed. 4505 */ 4506 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { 4507 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", 4508 old_caps, sc->hn_caps); 4509 error = ENXIO; 4510 goto failed; 4511 } 4512 4513 /* 4514 * Allocate sub-channels for multi-TX/RX rings. 4515 * 4516 * NOTE: 4517 * The # of RX rings that can be used is equivalent to the # of 4518 * channels to be requested. 4519 */ 4520 nsubch = sc->hn_rx_ring_cnt - 1; 4521 error = hn_synth_alloc_subchans(sc, &nsubch); 4522 if (error) 4523 goto failed; 4524 /* NOTE: _Full_ synthetic parts detach is required now. */ 4525 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; 4526 4527 /* 4528 * Set the # of TX/RX rings that could be used according to 4529 * the # of channels that NVS offered. 4530 */ 4531 nchan = nsubch + 1; 4532 hn_set_ring_inuse(sc, nchan); 4533 if (nchan == 1) { 4534 /* Only the primary channel can be used; done */ 4535 goto back; 4536 } 4537 4538 /* 4539 * Attach the sub-channels. 4540 * 4541 * NOTE: hn_set_ring_inuse() _must_ have been called. 4542 */ 4543 error = hn_attach_subchans(sc); 4544 if (error) 4545 goto failed; 4546 4547 /* 4548 * Configure RSS key and indirect table _after_ all sub-channels 4549 * are attached. 4550 */ 4551 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { 4552 /* 4553 * RSS key is not set yet; set it to the default RSS key. 4554 */ 4555 if (bootverbose) 4556 if_printf(sc->hn_ifp, "setup default RSS key\n"); 4557 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); 4558 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4559 } 4560 4561 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { 4562 /* 4563 * RSS indirect table is not set yet; set it up in round- 4564 * robin fashion.
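 * That is, entry i of the indirect table points at channel
 * (i % nchan), which spreads the RSS hash buckets evenly across
 * all usable channels.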
4565 */ 4566 if (bootverbose) { 4567 if_printf(sc->hn_ifp, "setup default RSS indirect " 4568 "table\n"); 4569 } 4570 for (i = 0; i < NDIS_HASH_INDCNT; ++i) 4571 rss->rss_ind[i] = i % nchan; 4572 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 4573 } else { 4574 /* 4575 * # of usable channels may be changed, so we have to 4576 * make sure that all entries in RSS indirect table 4577 * are valid. 4578 * 4579 * NOTE: hn_set_ring_inuse() _must_ have been called. 4580 */ 4581 hn_rss_ind_fixup(sc); 4582 } 4583 4584 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 4585 if (error) 4586 goto failed; 4587back: 4588 /* 4589 * Fixup transmission aggregation setup. 4590 */ 4591 hn_set_txagg(sc); 4592 return (0); 4593 4594failed: 4595 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 4596 hn_synth_detach(sc); 4597 } else { 4598 if (attached & ATTACHED_RNDIS) 4599 hn_rndis_detach(sc); 4600 if (attached & ATTACHED_NVS) 4601 hn_nvs_detach(sc); 4602 hn_chan_detach(sc, sc->hn_prichan); 4603 /* Restore old capabilities. */ 4604 sc->hn_caps = old_caps; 4605 } 4606 return (error); 4607 4608#undef ATTACHED_RNDIS 4609#undef ATTACHED_NVS 4610} 4611 4612/* 4613 * NOTE: 4614 * The interface must have been suspended through hn_suspend(), before 4615 * this function gets called. 4616 */ 4617static void 4618hn_synth_detach(struct hn_softc *sc) 4619{ 4620 4621 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 4622 ("synthetic parts were not attached")); 4623 4624 /* Detach the RNDIS first. */ 4625 hn_rndis_detach(sc); 4626 4627 /* Detach NVS. */ 4628 hn_nvs_detach(sc); 4629 4630 /* Detach all of the channels. */ 4631 hn_detach_allchans(sc); 4632 4633 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; 4634} 4635 4636static void 4637hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) 4638{ 4639 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, 4640 ("invalid ring count %d", ring_cnt)); 4641 4642 if (sc->hn_tx_ring_cnt > ring_cnt) 4643 sc->hn_tx_ring_inuse = ring_cnt; 4644 else 4645 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 4646 sc->hn_rx_ring_inuse = ring_cnt; 4647 4648 if (bootverbose) { 4649 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", 4650 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); 4651 } 4652} 4653 4654static void 4655hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) 4656{ 4657 4658 /* 4659 * NOTE: 4660 * The TX bufring will not be drained by the hypervisor if 4661 * the primary channel is revoked. 4662 */ 4663 while (!vmbus_chan_rx_empty(chan) || 4664 (!vmbus_chan_is_revoked(sc->hn_prichan) && 4665 !vmbus_chan_tx_empty(chan))) 4666 pause("waitch", 1); 4667 vmbus_chan_intr_drain(chan); 4668} 4669 4670static void 4671hn_suspend_data(struct hn_softc *sc) 4672{ 4673 struct vmbus_channel **subch = NULL; 4674 struct hn_tx_ring *txr; 4675 int i, nsubch; 4676 4677 HN_LOCK_ASSERT(sc); 4678 4679 /* 4680 * Suspend TX. 4681 */ 4682 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 4683 txr = &sc->hn_tx_ring[i]; 4684 4685 mtx_lock(&txr->hn_tx_lock); 4686 txr->hn_suspended = 1; 4687 mtx_unlock(&txr->hn_tx_lock); 4688 /* No one can send more packets now. */ 4689 4690 /* 4691 * Wait for all pending sends to finish. 4692 * 4693 * NOTE: 4694 * We will _not_ receive all pending send-dones if the 4695 * primary channel is revoked. 4696 */ 4697 while (hn_tx_ring_pending(txr) && 4698 !vmbus_chan_is_revoked(sc->hn_prichan)) 4699 pause("hnwtx", 1 /* 1 tick */); 4700 } 4701 4702 /* 4703 * Disable RX by clearing RX filter. 4704 
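 * Setting the filter to NDIS_PACKET_TYPE_NONE tells the host to
 * stop delivering new RX packets; the pause below then gives the
 * packets already in flight a chance to drain before the bufrings
 * are drained.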
4704 */ 4705 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; 4706 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); 4707 4708 /* 4709 * Give RNDIS enough time to flush all pending data packets. 4710 */ 4711 pause("waitrx", (200 * hz) / 1000); 4712 4713 /* 4714 * Drain RX/TX bufrings and interrupts. 4715 */ 4716 nsubch = sc->hn_rx_ring_inuse - 1; 4717 if (nsubch > 0) 4718 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 4719 4720 if (subch != NULL) { 4721 for (i = 0; i < nsubch; ++i) 4722 hn_chan_drain(sc, subch[i]); 4723 } 4724 hn_chan_drain(sc, sc->hn_prichan); 4725 4726 if (subch != NULL) 4727 vmbus_subchan_rel(subch, nsubch); 4728 4729 /* 4730 * Drain any pending TX tasks. 4731 * 4732 * NOTE: 4733 * The above hn_chan_drain() can dispatch TX tasks, so the TX 4734 * tasks will have to be drained _after_ the above hn_chan_drain() 4735 * calls. 4736 */ 4737 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 4738 txr = &sc->hn_tx_ring[i]; 4739 4740 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); 4741 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); 4742 } 4743} 4744 4745static void 4746hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) 4747{ 4748 4749 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; 4750} 4751 4752static void 4753hn_suspend_mgmt(struct hn_softc *sc) 4754{ 4755 struct task task; 4756 4757 HN_LOCK_ASSERT(sc); 4758 4759 /* 4760 * Make sure that hn_mgmt_taskq0 can nolonger be accessed 4761 * through hn_mgmt_taskq. 4762 */ 4763 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); 4764 vmbus_chan_run_task(sc->hn_prichan, &task); 4765 4766 /* 4767 * Make sure that all pending management tasks are completed. 4768 */ 4769 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); 4770 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); 4771 taskqueue_drain_all(sc->hn_mgmt_taskq0); 4772} 4773 4774static void 4775hn_suspend(struct hn_softc *sc) 4776{ 4777 4778 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) 4779 hn_suspend_data(sc); 4780 hn_suspend_mgmt(sc); 4781} 4782 4783static void 4784hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) 4785{ 4786 int i; 4787 4788 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, 4789 ("invalid TX ring count %d", tx_ring_cnt)); 4790 4791 for (i = 0; i < tx_ring_cnt; ++i) { 4792 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 4793 4794 mtx_lock(&txr->hn_tx_lock); 4795 txr->hn_suspended = 0; 4796 mtx_unlock(&txr->hn_tx_lock); 4797 } 4798} 4799 4800static void 4801hn_resume_data(struct hn_softc *sc) 4802{ 4803 int i; 4804 4805 HN_LOCK_ASSERT(sc); 4806 4807 /* 4808 * Re-enable RX. 4809 */ 4810 hn_set_rxfilter(sc); 4811 4812 /* 4813 * Make sure to clear suspend status on "all" TX rings, 4814 * since hn_tx_ring_inuse can be changed after 4815 * hn_suspend_data(). 4816 */ 4817 hn_resume_tx(sc, sc->hn_tx_ring_cnt); 4818 4819#ifdef HN_IFSTART_SUPPORT 4820 if (!hn_use_if_start) 4821#endif 4822 { 4823 /* 4824 * Flush unused drbrs, since hn_tx_ring_inuse may be 4825 * reduced. 4826 */ 4827 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) 4828 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 4829 } 4830 4831 /* 4832 * Kick start TX. 4833 */ 4834 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 4835 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 4836 4837 /* 4838 * Use txeof task, so that any pending oactive can be 4839 * cleared properly. 
4840 */ 4841 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 4842 } 4843} 4844 4845static void 4846hn_resume_mgmt(struct hn_softc *sc) 4847{ 4848 4849 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 4850 4851 /* 4852 * Kick off network change detection, if it was pending. 4853 * If no network change was pending, start link status 4854 * checks, which is more lightweight than network change 4855 * detection. 4856 */ 4857 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 4858 hn_change_network(sc); 4859 else 4860 hn_update_link_status(sc); 4861} 4862 4863static void 4864hn_resume(struct hn_softc *sc) 4865{ 4866 4867 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) 4868 hn_resume_data(sc); 4869 hn_resume_mgmt(sc); 4870} 4871 4872static void 4873hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) 4874{ 4875 const struct rndis_status_msg *msg; 4876 int ofs; 4877 4878 if (dlen < sizeof(*msg)) { 4879 if_printf(sc->hn_ifp, "invalid RNDIS status\n"); 4880 return; 4881 } 4882 msg = data; 4883 4884 switch (msg->rm_status) { 4885 case RNDIS_STATUS_MEDIA_CONNECT: 4886 case RNDIS_STATUS_MEDIA_DISCONNECT: 4887 hn_update_link_status(sc); 4888 break; 4889 4890 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: 4891 /* Not really useful; ignore. */ 4892 break; 4893 4894 case RNDIS_STATUS_NETWORK_CHANGE: 4895 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); 4896 if (dlen < ofs + msg->rm_stbuflen || 4897 msg->rm_stbuflen < sizeof(uint32_t)) { 4898 if_printf(sc->hn_ifp, "network changed\n"); 4899 } else { 4900 uint32_t change; 4901 4902 memcpy(&change, ((const uint8_t *)msg) + ofs, 4903 sizeof(change)); 4904 if_printf(sc->hn_ifp, "network changed, change %u\n", 4905 change); 4906 } 4907 hn_change_network(sc); 4908 break; 4909 4910 default: 4911 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", 4912 msg->rm_status); 4913 break; 4914 } 4915} 4916 4917static int 4918hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) 4919{ 4920 const struct rndis_pktinfo *pi = info_data; 4921 uint32_t mask = 0; 4922 4923 while (info_dlen != 0) { 4924 const void *data; 4925 uint32_t dlen; 4926 4927 if (__predict_false(info_dlen < sizeof(*pi))) 4928 return (EINVAL); 4929 if (__predict_false(info_dlen < pi->rm_size)) 4930 return (EINVAL); 4931 info_dlen -= pi->rm_size; 4932 4933 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) 4934 return (EINVAL); 4935 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) 4936 return (EINVAL); 4937 dlen = pi->rm_size - pi->rm_pktinfooffset; 4938 data = pi->rm_data; 4939 4940 switch (pi->rm_type) { 4941 case NDIS_PKTINFO_TYPE_VLAN: 4942 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE)) 4943 return (EINVAL); 4944 info->vlan_info = *((const uint32_t *)data); 4945 mask |= HN_RXINFO_VLAN; 4946 break; 4947 4948 case NDIS_PKTINFO_TYPE_CSUM: 4949 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE)) 4950 return (EINVAL); 4951 info->csum_info = *((const uint32_t *)data); 4952 mask |= HN_RXINFO_CSUM; 4953 break; 4954 4955 case HN_NDIS_PKTINFO_TYPE_HASHVAL: 4956 if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE)) 4957 return (EINVAL); 4958 info->hash_value = *((const uint32_t *)data); 4959 mask |= HN_RXINFO_HASHVAL; 4960 break; 4961 4962 case HN_NDIS_PKTINFO_TYPE_HASHINF: 4963 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE)) 4964 return (EINVAL); 4965 info->hash_info = *((const uint32_t *)data); 4966 mask |= HN_RXINFO_HASHINF; 4967 break; 4968 4969 default: 4970 goto next; 4971 } 4972 4973 if (mask == HN_RXINFO_ALL) { 4974 /* All found; done */ 4975 
break; 4976 } 4977next: 4978 pi = (const struct rndis_pktinfo *) 4979 ((const uint8_t *)pi + pi->rm_size); 4980 } 4981 4982 /* 4983 * Final fixup. 4984 * - If there is no hash value, invalidate the hash info. 4985 */ 4986 if ((mask & HN_RXINFO_HASHVAL) == 0) 4987 info->hash_info = HN_NDIS_HASH_INFO_INVALID; 4988 return (0); 4989} 4990 4991static __inline bool 4992hn_rndis_check_overlap(int off, int len, int check_off, int check_len) 4993{ 4994 4995 if (off < check_off) { 4996 if (__predict_true(off + len <= check_off)) 4997 return (false); 4998 } else if (off > check_off) { 4999 if (__predict_true(check_off + check_len <= off)) 5000 return (false); 5001 } 5002 return (true); 5003} 5004 5005static void 5006hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) 5007{ 5008 const struct rndis_packet_msg *pkt; 5009 struct hn_rxinfo info; 5010 int data_off, pktinfo_off, data_len, pktinfo_len; 5011 5012 /* 5013 * Check length. 5014 */ 5015 if (__predict_false(dlen < sizeof(*pkt))) { 5016 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); 5017 return; 5018 } 5019 pkt = data; 5020 5021 if (__predict_false(dlen < pkt->rm_len)) { 5022 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " 5023 "dlen %d, msglen %u\n", dlen, pkt->rm_len); 5024 return; 5025 } 5026 if (__predict_false(pkt->rm_len < 5027 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { 5028 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " 5029 "msglen %u, data %u, oob %u, pktinfo %u\n", 5030 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, 5031 pkt->rm_pktinfolen); 5032 return; 5033 } 5034 if (__predict_false(pkt->rm_datalen == 0)) { 5035 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); 5036 return; 5037 } 5038 5039 /* 5040 * Check offsets. 5041 */ 5042#define IS_OFFSET_INVALID(ofs) \ 5043 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ 5044 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) 5045 5046 /* XXX Hyper-V does not meet the data offset alignment requirement */ 5047 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { 5048 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5049 "data offset %u\n", pkt->rm_dataoffset); 5050 return; 5051 } 5052 if (__predict_false(pkt->rm_oobdataoffset > 0 && 5053 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { 5054 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5055 "oob offset %u\n", pkt->rm_oobdataoffset); 5056 return; 5057 } 5058 if (__predict_true(pkt->rm_pktinfooffset > 0) && 5059 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { 5060 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5061 "pktinfo offset %u\n", pkt->rm_pktinfooffset); 5062 return; 5063 } 5064 5065#undef IS_OFFSET_INVALID 5066 5067 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); 5068 data_len = pkt->rm_datalen; 5069 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); 5070 pktinfo_len = pkt->rm_pktinfolen; 5071 5072 /* 5073 * Check OOB coverage. 5074 */ 5075 if (__predict_false(pkt->rm_oobdatalen != 0)) { 5076 int oob_off, oob_len; 5077 5078 if_printf(rxr->hn_ifp, "got oobdata\n"); 5079 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); 5080 oob_len = pkt->rm_oobdatalen; 5081 5082 if (__predict_false(oob_off + oob_len > pkt->rm_len)) { 5083 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5084 "oob overflow, msglen %u, oob abs %d len %d\n", 5085 pkt->rm_len, oob_off, oob_len); 5086 return; 5087 } 5088 5089 /* 5090 * Check against data. 5091 
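 * The overlap checks guard against a buggy or malicious host
 * handing out OOB/pktinfo/data regions that alias each other
 * within the same message.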
5091 */ 5092 if (hn_rndis_check_overlap(oob_off, oob_len, 5093 data_off, data_len)) { 5094 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5095 "oob overlaps data, oob abs %d len %d, " 5096 "data abs %d len %d\n", 5097 oob_off, oob_len, data_off, data_len); 5098 return; 5099 } 5100 5101 /* 5102 * Check against pktinfo. 5103 */ 5104 if (pktinfo_len != 0 && 5105 hn_rndis_check_overlap(oob_off, oob_len, 5106 pktinfo_off, pktinfo_len)) { 5107 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5108 "oob overlaps pktinfo, oob abs %d len %d, " 5109 "pktinfo abs %d len %d\n", 5110 oob_off, oob_len, pktinfo_off, pktinfo_len); 5111 return; 5112 } 5113 } 5114 5115 /* 5116 * Check per-packet-info coverage and find useful per-packet-info. 5117 */ 5118 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID; 5119 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID; 5120 info.hash_info = HN_NDIS_HASH_INFO_INVALID; 5121 if (__predict_true(pktinfo_len != 0)) { 5122 bool overlap; 5123 int error; 5124 5125 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { 5126 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5127 "pktinfo overflow, msglen %u, " 5128 "pktinfo abs %d len %d\n", 5129 pkt->rm_len, pktinfo_off, pktinfo_len); 5130 return; 5131 } 5132 5133 /* 5134 * Check packet info coverage. 5135 */ 5136 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, 5137 data_off, data_len); 5138 if (__predict_false(overlap)) { 5139 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5140 "pktinfo overlap data, pktinfo abs %d len %d, " 5141 "data abs %d len %d\n", 5142 pktinfo_off, pktinfo_len, data_off, data_len); 5143 return; 5144 } 5145 5146 /* 5147 * Find useful per-packet-info. 5148 */ 5149 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, 5150 pktinfo_len, &info); 5151 if (__predict_false(error)) { 5152 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " 5153 "pktinfo\n"); 5154 return; 5155 } 5156 } 5157 5158 if (__predict_false(data_off + data_len > pkt->rm_len)) { 5159 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5160 "data overflow, msglen %u, data abs %d len %d\n", 5161 pkt->rm_len, data_off, data_len); 5162 return; 5163 } 5164 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info); 5165} 5166 5167static __inline void 5168hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) 5169{ 5170 const struct rndis_msghdr *hdr; 5171 5172 if (__predict_false(dlen < sizeof(*hdr))) { 5173 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); 5174 return; 5175 } 5176 hdr = data; 5177 5178 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { 5179 /* Hot data path. */ 5180 hn_rndis_rx_data(rxr, data, dlen); 5181 /* Done! 
*/ 5182 return; 5183 } 5184 5185 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) 5186 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen); 5187 else 5188 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen); 5189} 5190 5191static void 5192hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) 5193{ 5194 const struct hn_nvs_hdr *hdr; 5195 5196 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { 5197 if_printf(sc->hn_ifp, "invalid nvs notify\n"); 5198 return; 5199 } 5200 hdr = VMBUS_CHANPKT_CONST_DATA(pkt); 5201 5202 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { 5203 /* Useless; ignore */ 5204 return; 5205 } 5206 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); 5207} 5208 5209static void 5210hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, 5211 const struct vmbus_chanpkt_hdr *pkt) 5212{ 5213 struct hn_nvs_sendctx *sndc; 5214 5215 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; 5216 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), 5217 VMBUS_CHANPKT_DATALEN(pkt)); 5218 /* 5219 * NOTE: 5220 * 'sndc' CAN NOT be accessed anymore, since it can be freed by 5221 * its callback. 5222 */ 5223} 5224 5225static void 5226hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 5227 const struct vmbus_chanpkt_hdr *pkthdr) 5228{ 5229 const struct vmbus_chanpkt_rxbuf *pkt; 5230 const struct hn_nvs_hdr *nvs_hdr; 5231 int count, i, hlen; 5232 5233 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { 5234 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); 5235 return; 5236 } 5237 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); 5238 5239 /* Make sure that this is a RNDIS message. */ 5240 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { 5241 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", 5242 nvs_hdr->nvs_type); 5243 return; 5244 } 5245 5246 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); 5247 if (__predict_false(hlen < sizeof(*pkt))) { 5248 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); 5249 return; 5250 } 5251 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; 5252 5253 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { 5254 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", 5255 pkt->cp_rxbuf_id); 5256 return; 5257 } 5258 5259 count = pkt->cp_rxbuf_cnt; 5260 if (__predict_false(hlen < 5261 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { 5262 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); 5263 return; 5264 } 5265 5266 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ 5267 for (i = 0; i < count; ++i) { 5268 int ofs, len; 5269 5270 ofs = pkt->cp_rxbuf[i].rb_ofs; 5271 len = pkt->cp_rxbuf[i].rb_len; 5272 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { 5273 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflows rxbuf, " 5274 "ofs %d, len %d\n", i, ofs, len); 5275 continue; 5276 } 5277 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); 5278 } 5279 5280 /* 5281 * Ack the consumed RXBUF associated w/ this channel packet, 5282 * so that this RXBUF can be recycled by the hypervisor. 5283 
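 * The ack is sent as a completion packet (VMBUS_CHANPKT_TYPE_COMP)
 * carrying the transaction id of the original RXBUF packet, which
 * lets the host match the ack to the buffer being released.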
5283 */ 5284 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); 5285} 5286 5287static void 5288hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 5289 uint64_t tid) 5290{ 5291 struct hn_nvs_rndis_ack ack; 5292 int retries, error; 5293 5294 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; 5295 ack.nvs_status = HN_NVS_STATUS_OK; 5296 5297 retries = 0; 5298again: 5299 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 5300 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); 5301 if (__predict_false(error == EAGAIN)) { 5302 /* 5303 * NOTE: 5304 * This should _not_ happen in real world, since the 5305 * consumption of the TX bufring from the TX path is 5306 * controlled. 5307 */ 5308 if (rxr->hn_ack_failed == 0) 5309 if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); 5310 rxr->hn_ack_failed++; 5311 retries++; 5312 if (retries < 10) { 5313 DELAY(100); 5314 goto again; 5315 } 5316 /* RXBUF leaks! */ 5317 if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); 5318 } 5319} 5320 5321static void 5322hn_chan_callback(struct vmbus_channel *chan, void *xrxr) 5323{ 5324 struct hn_rx_ring *rxr = xrxr; 5325 struct hn_softc *sc = rxr->hn_ifp->if_softc; 5326 5327 for (;;) { 5328 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; 5329 int error, pktlen; 5330 5331 pktlen = rxr->hn_pktbuf_len; 5332 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); 5333 if (__predict_false(error == ENOBUFS)) { 5334 void *nbuf; 5335 int nlen; 5336 5337 /* 5338 * Expand channel packet buffer. 5339 * 5340 * XXX 5341 * Use M_WAITOK here, since allocation failure 5342 * is fatal. 5343 */ 5344 nlen = rxr->hn_pktbuf_len * 2; 5345 while (nlen < pktlen) 5346 nlen *= 2; 5347 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); 5348 5349 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", 5350 rxr->hn_pktbuf_len, nlen); 5351 5352 free(rxr->hn_pktbuf, M_DEVBUF); 5353 rxr->hn_pktbuf = nbuf; 5354 rxr->hn_pktbuf_len = nlen; 5355 /* Retry! */ 5356 continue; 5357 } else if (__predict_false(error == EAGAIN)) { 5358 /* No more channel packets; done! */ 5359 break; 5360 } 5361 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); 5362 5363 switch (pkt->cph_type) { 5364 case VMBUS_CHANPKT_TYPE_COMP: 5365 hn_nvs_handle_comp(sc, chan, pkt); 5366 break; 5367 5368 case VMBUS_CHANPKT_TYPE_RXBUF: 5369 hn_nvs_handle_rxbuf(rxr, chan, pkt); 5370 break; 5371 5372 case VMBUS_CHANPKT_TYPE_INBAND: 5373 hn_nvs_handle_notify(sc, pkt); 5374 break; 5375 5376 default: 5377 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", 5378 pkt->cph_type); 5379 break; 5380 } 5381 } 5382 hn_chan_rollup(rxr, rxr->hn_txr); 5383} 5384 5385static void 5386hn_tx_taskq_create(void *arg __unused) 5387{ 5388 int i; 5389 5390 /* 5391 * Fix the # of TX taskqueues. 5392 */ 5393 if (hn_tx_taskq_cnt <= 0) 5394 hn_tx_taskq_cnt = 1; 5395 else if (hn_tx_taskq_cnt > mp_ncpus) 5396 hn_tx_taskq_cnt = mp_ncpus; 5397 5398 /* 5399 * Fix the TX taskqueue mode. 
5400 */ 5401 switch (hn_tx_taskq_mode) { 5402 case HN_TX_TASKQ_M_INDEP: 5403 case HN_TX_TASKQ_M_GLOBAL: 5404 case HN_TX_TASKQ_M_EVTTQ: 5405 break; 5406 default: 5407 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 5408 break; 5409 } 5410 5411 if (vm_guest != VM_GUEST_HV) 5412 return; 5413 5414 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) 5415 return; 5416 5417 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 5418 M_DEVBUF, M_WAITOK); 5419 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 5420 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, 5421 taskqueue_thread_enqueue, &hn_tx_taskque[i]); 5422 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, 5423 "hn tx%d", i); 5424 } 5425} 5426SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND, 5427 hn_tx_taskq_create, NULL); 5428 5429static void 5430hn_tx_taskq_destroy(void *arg __unused) 5431{ 5432 5433 if (hn_tx_taskque != NULL) { 5434 int i; 5435 5436 for (i = 0; i < hn_tx_taskq_cnt; ++i) 5437 taskqueue_free(hn_tx_taskque[i]); 5438 free(hn_tx_taskque, M_DEVBUF); 5439 } 5440} 5441SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND, 5442 hn_tx_taskq_destroy, NULL); 5443