1/*	$OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $	*/
2
3/*
4 * Copyright (c) 2005, 2006 Reyk Floeter <reyk@openbsd.org>
5 * Copyright (c) 2007 Andrew Thompson <thompsa@FreeBSD.org>
6 * Copyright (c) 2014, 2016 Marcelo Araujo <araujo@FreeBSD.org>
7 *
8 * Permission to use, copy, modify, and distribute this software for any
9 * purpose with or without fee is hereby granted, provided that the above
10 * copyright notice and this permission notice appear in all copies.
11 *
12 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
13 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
14 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
15 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
16 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
17 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
18 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
19 */
20
21#include <sys/cdefs.h>
22__FBSDID("$FreeBSD$");
23
24#include "opt_inet.h"
25#include "opt_inet6.h"
26#include "opt_kern_tls.h"
27#include "opt_ratelimit.h"
28
29#include <sys/param.h>
30#include <sys/kernel.h>
31#include <sys/malloc.h>
32#include <sys/mbuf.h>
33#include <sys/queue.h>
34#include <sys/socket.h>
35#include <sys/sockio.h>
36#include <sys/sysctl.h>
37#include <sys/module.h>
38#include <sys/priv.h>
39#include <sys/systm.h>
40#include <sys/proc.h>
41#include <sys/lock.h>
42#include <sys/rmlock.h>
43#include <sys/sx.h>
44#include <sys/taskqueue.h>
45#include <sys/eventhandler.h>
46
47#include <net/ethernet.h>
48#include <net/if.h>
49#include <net/if_clone.h>
50#include <net/if_arp.h>
51#include <net/if_dl.h>
52#include <net/if_media.h>
53#include <net/if_types.h>
54#include <net/if_var.h>
55#include <net/bpf.h>
56#include <net/route.h>
57#include <net/vnet.h>
58#include <net/infiniband.h>
59
60#if defined(INET) || defined(INET6)
61#include <netinet/in.h>
62#include <netinet/ip.h>
63#endif
64#ifdef INET
65#include <netinet/in_systm.h>
66#include <netinet/if_ether.h>
67#endif
68
69#ifdef INET6
70#include <netinet/ip6.h>
71#include <netinet6/in6_var.h>
72#include <netinet6/in6_ifattach.h>
73#endif
74
75#include <net/if_vlan_var.h>
76#include <net/if_lagg.h>
77#include <net/ieee8023ad_lacp.h>
78
79#ifdef INET6
/*
 * XXX: declared here to avoid including many inet6-related files.
 * Should this be generalized?
 */
84extern void	nd6_setmtu(struct ifnet *);
85#endif
86
87#define	LAGG_SX_INIT(_sc)	sx_init(&(_sc)->sc_sx, "if_lagg sx")
88#define	LAGG_SX_DESTROY(_sc)	sx_destroy(&(_sc)->sc_sx)
89#define	LAGG_XLOCK(_sc)		sx_xlock(&(_sc)->sc_sx)
90#define	LAGG_XUNLOCK(_sc)	sx_xunlock(&(_sc)->sc_sx)
91#define	LAGG_SXLOCK_ASSERT(_sc)	sx_assert(&(_sc)->sc_sx, SA_LOCKED)
92#define	LAGG_XLOCK_ASSERT(_sc)	sx_assert(&(_sc)->sc_sx, SA_XLOCKED)
93
94/* Special flags we should propagate to the lagg ports. */
95static struct {
96	int flag;
97	int (*func)(struct ifnet *, int);
98} lagg_pflags[] = {
99	{IFF_PROMISC, ifpromisc},
100	{IFF_ALLMULTI, if_allmulti},
101	{0, NULL}
102};
103
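/*
 * Send tags returned by lagg wrap the tag allocated from the selected
 * egress port (see lagg_snd_tag_alloc()), so the per-flow state follows
 * the flow through the lagg interface down to the real hardware.
 */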
104struct lagg_snd_tag {
105	struct m_snd_tag com;
106	struct m_snd_tag *tag;
107};
108
109VNET_DEFINE(SLIST_HEAD(__trhead, lagg_softc), lagg_list); /* list of laggs */
110#define	V_lagg_list	VNET(lagg_list)
111VNET_DEFINE_STATIC(struct mtx, lagg_list_mtx);
112#define	V_lagg_list_mtx	VNET(lagg_list_mtx)
113#define	LAGG_LIST_LOCK_INIT(x)		mtx_init(&V_lagg_list_mtx, \
114					"if_lagg list", NULL, MTX_DEF)
115#define	LAGG_LIST_LOCK_DESTROY(x)	mtx_destroy(&V_lagg_list_mtx)
116#define	LAGG_LIST_LOCK(x)		mtx_lock(&V_lagg_list_mtx)
117#define	LAGG_LIST_UNLOCK(x)		mtx_unlock(&V_lagg_list_mtx)
118eventhandler_tag	lagg_detach_cookie = NULL;
119
120static int	lagg_clone_create(struct if_clone *, int, caddr_t);
121static void	lagg_clone_destroy(struct ifnet *);
122VNET_DEFINE_STATIC(struct if_clone *, lagg_cloner);
123#define	V_lagg_cloner	VNET(lagg_cloner)
124static const char laggname[] = "lagg";
125static MALLOC_DEFINE(M_LAGG, laggname, "802.3AD Link Aggregation Interface");
126
127static void	lagg_capabilities(struct lagg_softc *);
128static int	lagg_port_create(struct lagg_softc *, struct ifnet *);
129static int	lagg_port_destroy(struct lagg_port *, int);
130static struct mbuf *lagg_input_ethernet(struct ifnet *, struct mbuf *);
131static struct mbuf *lagg_input_infiniband(struct ifnet *, struct mbuf *);
132static void	lagg_linkstate(struct lagg_softc *);
133static void	lagg_port_state(struct ifnet *, int);
134static int	lagg_port_ioctl(struct ifnet *, u_long, caddr_t);
135static int	lagg_port_output(struct ifnet *, struct mbuf *,
136		    const struct sockaddr *, struct route *);
137static void	lagg_port_ifdetach(void *arg __unused, struct ifnet *);
138#ifdef LAGG_PORT_STACKING
139static int	lagg_port_checkstacking(struct lagg_softc *);
140#endif
141static void	lagg_port2req(struct lagg_port *, struct lagg_reqport *);
142static void	lagg_init(void *);
143static void	lagg_stop(struct lagg_softc *);
144static int	lagg_ioctl(struct ifnet *, u_long, caddr_t);
145#if defined(KERN_TLS) || defined(RATELIMIT)
146static int	lagg_snd_tag_alloc(struct ifnet *,
147		    union if_snd_tag_alloc_params *,
148		    struct m_snd_tag **);
149static int	lagg_snd_tag_modify(struct m_snd_tag *,
150		    union if_snd_tag_modify_params *);
151static int	lagg_snd_tag_query(struct m_snd_tag *,
152		    union if_snd_tag_query_params *);
153static void	lagg_snd_tag_free(struct m_snd_tag *);
154static struct m_snd_tag *lagg_next_snd_tag(struct m_snd_tag *);
155static void     lagg_ratelimit_query(struct ifnet *,
156		    struct if_ratelimit_query_results *);
157#endif
158static int	lagg_setmulti(struct lagg_port *);
159static int	lagg_clrmulti(struct lagg_port *);
160static	int	lagg_setcaps(struct lagg_port *, int cap);
161static	int	lagg_setflag(struct lagg_port *, int, int,
162		    int (*func)(struct ifnet *, int));
163static	int	lagg_setflags(struct lagg_port *, int status);
164static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt);
165static int	lagg_transmit_ethernet(struct ifnet *, struct mbuf *);
166static int	lagg_transmit_infiniband(struct ifnet *, struct mbuf *);
167static void	lagg_qflush(struct ifnet *);
168static int	lagg_media_change(struct ifnet *);
169static void	lagg_media_status(struct ifnet *, struct ifmediareq *);
170static struct lagg_port *lagg_link_active(struct lagg_softc *,
171	    struct lagg_port *);
172
173/* Simple round robin */
174static void	lagg_rr_attach(struct lagg_softc *);
175static int	lagg_rr_start(struct lagg_softc *, struct mbuf *);
176static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *,
177		    struct mbuf *);
178
179/* Active failover */
180static int	lagg_fail_start(struct lagg_softc *, struct mbuf *);
181static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *,
182		    struct mbuf *);
183
184/* Loadbalancing */
185static void	lagg_lb_attach(struct lagg_softc *);
186static void	lagg_lb_detach(struct lagg_softc *);
187static int	lagg_lb_port_create(struct lagg_port *);
188static void	lagg_lb_port_destroy(struct lagg_port *);
189static int	lagg_lb_start(struct lagg_softc *, struct mbuf *);
190static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *,
191		    struct mbuf *);
192static int	lagg_lb_porttable(struct lagg_softc *, struct lagg_port *);
193
194/* Broadcast */
195static int    lagg_bcast_start(struct lagg_softc *, struct mbuf *);
196static struct mbuf *lagg_bcast_input(struct lagg_softc *, struct lagg_port *,
197		    struct mbuf *);
198
199/* 802.3ad LACP */
200static void	lagg_lacp_attach(struct lagg_softc *);
201static void	lagg_lacp_detach(struct lagg_softc *);
202static int	lagg_lacp_start(struct lagg_softc *, struct mbuf *);
203static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *,
204		    struct mbuf *);
205static void	lagg_lacp_lladdr(struct lagg_softc *);
206
207/* lagg protocol table */
208static const struct lagg_proto {
209	lagg_proto	pr_num;
210	void		(*pr_attach)(struct lagg_softc *);
211	void		(*pr_detach)(struct lagg_softc *);
212	int		(*pr_start)(struct lagg_softc *, struct mbuf *);
213	struct mbuf *	(*pr_input)(struct lagg_softc *, struct lagg_port *,
214			    struct mbuf *);
215	int		(*pr_addport)(struct lagg_port *);
216	void		(*pr_delport)(struct lagg_port *);
217	void		(*pr_linkstate)(struct lagg_port *);
218	void 		(*pr_init)(struct lagg_softc *);
219	void 		(*pr_stop)(struct lagg_softc *);
220	void 		(*pr_lladdr)(struct lagg_softc *);
221	void		(*pr_request)(struct lagg_softc *, void *);
222	void		(*pr_portreq)(struct lagg_port *, void *);
223} lagg_protos[] = {
224    {
225	.pr_num = LAGG_PROTO_NONE
226    },
227    {
228	.pr_num = LAGG_PROTO_ROUNDROBIN,
229	.pr_attach = lagg_rr_attach,
230	.pr_start = lagg_rr_start,
231	.pr_input = lagg_rr_input,
232    },
233    {
234	.pr_num = LAGG_PROTO_FAILOVER,
235	.pr_start = lagg_fail_start,
236	.pr_input = lagg_fail_input,
237    },
238    {
239	.pr_num = LAGG_PROTO_LOADBALANCE,
240	.pr_attach = lagg_lb_attach,
241	.pr_detach = lagg_lb_detach,
242	.pr_start = lagg_lb_start,
243	.pr_input = lagg_lb_input,
244	.pr_addport = lagg_lb_port_create,
245	.pr_delport = lagg_lb_port_destroy,
246    },
247    {
248	.pr_num = LAGG_PROTO_LACP,
249	.pr_attach = lagg_lacp_attach,
250	.pr_detach = lagg_lacp_detach,
251	.pr_start = lagg_lacp_start,
252	.pr_input = lagg_lacp_input,
253	.pr_addport = lacp_port_create,
254	.pr_delport = lacp_port_destroy,
255	.pr_linkstate = lacp_linkstate,
256	.pr_init = lacp_init,
257	.pr_stop = lacp_stop,
258	.pr_lladdr = lagg_lacp_lladdr,
259	.pr_request = lacp_req,
260	.pr_portreq = lacp_portreq,
261    },
262    {
263	.pr_num = LAGG_PROTO_BROADCAST,
264	.pr_start = lagg_bcast_start,
265	.pr_input = lagg_bcast_input,
266    },
267};
268
269SYSCTL_DECL(_net_link);
270SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
271    "Link Aggregation");
272
273/* Allow input on any failover links */
274VNET_DEFINE_STATIC(int, lagg_failover_rx_all);
275#define	V_lagg_failover_rx_all	VNET(lagg_failover_rx_all)
276SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW | CTLFLAG_VNET,
277    &VNET_NAME(lagg_failover_rx_all), 0,
278    "Accept input from any interface in a failover lagg");
279
280/* Default value for using flowid */
281VNET_DEFINE_STATIC(int, def_use_flowid) = 0;
282#define	V_def_use_flowid	VNET(def_use_flowid)
283SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RWTUN,
284    &VNET_NAME(def_use_flowid), 0,
285    "Default setting for using flow id for load sharing");
286
287/* Default value for using numa */
288VNET_DEFINE_STATIC(int, def_use_numa) = 1;
289#define	V_def_use_numa	VNET(def_use_numa)
290SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_numa, CTLFLAG_RWTUN,
291    &VNET_NAME(def_use_numa), 0,
292    "Use numa to steer flows");
293
294/* Default value for flowid shift */
295VNET_DEFINE_STATIC(int, def_flowid_shift) = 16;
296#define	V_def_flowid_shift	VNET(def_flowid_shift)
297SYSCTL_INT(_net_link_lagg, OID_AUTO, default_flowid_shift, CTLFLAG_RWTUN,
298    &VNET_NAME(def_flowid_shift), 0,
299    "Default setting for flowid shift for load sharing");
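/*
 * The default_* knobs above apply to newly created lagg interfaces.
 * Being CTLFLAG_RWTUN sysctls, they may be set as loader tunables or at
 * runtime, e.g.:
 *
 *	sysctl net.link.lagg.default_use_flowid=1
 *	sysctl net.link.lagg.default_flowid_shift=16
 */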
300
301static void
302vnet_lagg_init(const void *unused __unused)
303{
304
305	LAGG_LIST_LOCK_INIT();
306	SLIST_INIT(&V_lagg_list);
307	V_lagg_cloner = if_clone_simple(laggname, lagg_clone_create,
308	    lagg_clone_destroy, 0);
309}
310VNET_SYSINIT(vnet_lagg_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
311    vnet_lagg_init, NULL);
312
313static void
314vnet_lagg_uninit(const void *unused __unused)
315{
316
317	if_clone_detach(V_lagg_cloner);
318	LAGG_LIST_LOCK_DESTROY();
319}
320VNET_SYSUNINIT(vnet_lagg_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY,
321    vnet_lagg_uninit, NULL);
322
323static int
324lagg_modevent(module_t mod, int type, void *data)
325{
326
327	switch (type) {
328	case MOD_LOAD:
329		lagg_input_ethernet_p = lagg_input_ethernet;
330		lagg_input_infiniband_p = lagg_input_infiniband;
331		lagg_linkstate_p = lagg_port_state;
332		lagg_detach_cookie = EVENTHANDLER_REGISTER(
333		    ifnet_departure_event, lagg_port_ifdetach, NULL,
334		    EVENTHANDLER_PRI_ANY);
335		break;
336	case MOD_UNLOAD:
337		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
338		    lagg_detach_cookie);
339		lagg_input_ethernet_p = NULL;
340		lagg_input_infiniband_p = NULL;
341		lagg_linkstate_p = NULL;
342		break;
343	default:
344		return (EOPNOTSUPP);
345	}
346	return (0);
347}
348
349static moduledata_t lagg_mod = {
350	"if_lagg",
351	lagg_modevent,
352	0
353};
354
355DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
356MODULE_VERSION(if_lagg, 1);
357MODULE_DEPEND(if_lagg, if_infiniband, 1, 1, 1);
358
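/*
 * Protocol dispatch helpers.  Each lagg_proto_*() wrapper below forwards
 * to the handler in lagg_protos[] for the softc's current protocol,
 * skipping handlers the protocol does not implement.
 */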
359static void
360lagg_proto_attach(struct lagg_softc *sc, lagg_proto pr)
361{
362
363	LAGG_XLOCK_ASSERT(sc);
364	KASSERT(sc->sc_proto == LAGG_PROTO_NONE, ("%s: sc %p has proto",
365	    __func__, sc));
366
367	if (sc->sc_ifflags & IFF_DEBUG)
368		if_printf(sc->sc_ifp, "using proto %u\n", pr);
369
370	if (lagg_protos[pr].pr_attach != NULL)
371		lagg_protos[pr].pr_attach(sc);
372	sc->sc_proto = pr;
373}
374
375static void
376lagg_proto_detach(struct lagg_softc *sc)
377{
378	lagg_proto pr;
379
380	LAGG_XLOCK_ASSERT(sc);
381	pr = sc->sc_proto;
382	sc->sc_proto = LAGG_PROTO_NONE;
383
384	if (lagg_protos[pr].pr_detach != NULL)
385		lagg_protos[pr].pr_detach(sc);
386}
387
388static int
389lagg_proto_start(struct lagg_softc *sc, struct mbuf *m)
390{
391
392	return (lagg_protos[sc->sc_proto].pr_start(sc, m));
393}
394
395static struct mbuf *
396lagg_proto_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
397{
398
399	return (lagg_protos[sc->sc_proto].pr_input(sc, lp, m));
400}
401
402static int
403lagg_proto_addport(struct lagg_softc *sc, struct lagg_port *lp)
404{
405
406	if (lagg_protos[sc->sc_proto].pr_addport == NULL)
407		return (0);
408	else
409		return (lagg_protos[sc->sc_proto].pr_addport(lp));
410}
411
412static void
413lagg_proto_delport(struct lagg_softc *sc, struct lagg_port *lp)
414{
415
416	if (lagg_protos[sc->sc_proto].pr_delport != NULL)
417		lagg_protos[sc->sc_proto].pr_delport(lp);
418}
419
420static void
421lagg_proto_linkstate(struct lagg_softc *sc, struct lagg_port *lp)
422{
423
424	if (lagg_protos[sc->sc_proto].pr_linkstate != NULL)
425		lagg_protos[sc->sc_proto].pr_linkstate(lp);
426}
427
428static void
429lagg_proto_init(struct lagg_softc *sc)
430{
431
432	if (lagg_protos[sc->sc_proto].pr_init != NULL)
433		lagg_protos[sc->sc_proto].pr_init(sc);
434}
435
436static void
437lagg_proto_stop(struct lagg_softc *sc)
438{
439
440	if (lagg_protos[sc->sc_proto].pr_stop != NULL)
441		lagg_protos[sc->sc_proto].pr_stop(sc);
442}
443
444static void
445lagg_proto_lladdr(struct lagg_softc *sc)
446{
447
448	if (lagg_protos[sc->sc_proto].pr_lladdr != NULL)
449		lagg_protos[sc->sc_proto].pr_lladdr(sc);
450}
451
452static void
453lagg_proto_request(struct lagg_softc *sc, void *v)
454{
455
456	if (lagg_protos[sc->sc_proto].pr_request != NULL)
457		lagg_protos[sc->sc_proto].pr_request(sc, v);
458}
459
460static void
461lagg_proto_portreq(struct lagg_softc *sc, struct lagg_port *lp, void *v)
462{
463
464	if (lagg_protos[sc->sc_proto].pr_portreq != NULL)
465		lagg_protos[sc->sc_proto].pr_portreq(lp, v);
466}
467
/*
 * This routine is run via a vlan
 * config EVENT
 */
472static void
473lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
474{
475	struct lagg_softc *sc = ifp->if_softc;
476	struct lagg_port *lp;
477
478	if (ifp->if_softc !=  arg)   /* Not our event */
479		return;
480
481	LAGG_XLOCK(sc);
482	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
483		EVENTHANDLER_INVOKE(vlan_config, lp->lp_ifp, vtag);
484	LAGG_XUNLOCK(sc);
485}
486
/*
 * This routine is run via a vlan
 * unconfig EVENT
 */
491static void
492lagg_unregister_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
493{
494	struct lagg_softc *sc = ifp->if_softc;
495	struct lagg_port *lp;
496
497	if (ifp->if_softc !=  arg)   /* Not our event */
498		return;
499
500	LAGG_XLOCK(sc);
501	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
502		EVENTHANDLER_INVOKE(vlan_unconfig, lp->lp_ifp, vtag);
503	LAGG_XUNLOCK(sc);
504}
505
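/*
 * A lagg interface is created and configured from userland, typically:
 *
 *	ifconfig lagg0 create
 *	ifconfig lagg0 laggproto lacp laggport ix0 laggport ix1 up
 *
 * The cloner invokes lagg_clone_create() below; protocol and port
 * configuration then arrives through lagg_ioctl() and lagg_port_create().
 */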
506static int
507lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params)
508{
509	struct iflaggparam iflp;
510	struct lagg_softc *sc;
511	struct ifnet *ifp;
512	int if_type;
513	int error;
514	static const uint8_t eaddr[LAGG_ADDR_LEN];
515
516	if (params != NULL) {
517		error = copyin(params, &iflp, sizeof(iflp));
518		if (error)
519			return (error);
520
521		switch (iflp.lagg_type) {
522		case LAGG_TYPE_ETHERNET:
523			if_type = IFT_ETHER;
524			break;
525		case LAGG_TYPE_INFINIBAND:
526			if_type = IFT_INFINIBAND;
527			break;
528		default:
529			return (EINVAL);
530		}
531	} else {
532		if_type = IFT_ETHER;
533	}
534
535	sc = malloc(sizeof(*sc), M_LAGG, M_WAITOK|M_ZERO);
536	ifp = sc->sc_ifp = if_alloc(if_type);
537	if (ifp == NULL) {
538		free(sc, M_LAGG);
539		return (ENOSPC);
540	}
541	LAGG_SX_INIT(sc);
542
543	mtx_init(&sc->sc_mtx, "lagg-mtx", NULL, MTX_DEF);
544	callout_init_mtx(&sc->sc_watchdog, &sc->sc_mtx, 0);
545
546	LAGG_XLOCK(sc);
547	if (V_def_use_flowid)
548		sc->sc_opts |= LAGG_OPT_USE_FLOWID;
549	if (V_def_use_numa)
550		sc->sc_opts |= LAGG_OPT_USE_NUMA;
551	sc->flowid_shift = V_def_flowid_shift;
552
553	/* Hash all layers by default */
554	sc->sc_flags = MBUF_HASHFLAG_L2|MBUF_HASHFLAG_L3|MBUF_HASHFLAG_L4;
555
556	lagg_proto_attach(sc, LAGG_PROTO_DEFAULT);
557
558	CK_SLIST_INIT(&sc->sc_ports);
559
560	switch (if_type) {
561	case IFT_ETHER:
562		/* Initialise pseudo media types */
563		ifmedia_init(&sc->sc_media, 0, lagg_media_change,
564		    lagg_media_status);
565		ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL);
566		ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO);
567
568		if_initname(ifp, laggname, unit);
569		ifp->if_transmit = lagg_transmit_ethernet;
570		break;
571	case IFT_INFINIBAND:
572		if_initname(ifp, laggname, unit);
573		ifp->if_transmit = lagg_transmit_infiniband;
574		break;
575	default:
576		break;
577	}
578	ifp->if_softc = sc;
579	ifp->if_qflush = lagg_qflush;
580	ifp->if_init = lagg_init;
581	ifp->if_ioctl = lagg_ioctl;
582	ifp->if_get_counter = lagg_get_counter;
583	ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST;
584#if defined(KERN_TLS) || defined(RATELIMIT)
585	ifp->if_snd_tag_alloc = lagg_snd_tag_alloc;
586	ifp->if_snd_tag_modify = lagg_snd_tag_modify;
587	ifp->if_snd_tag_query = lagg_snd_tag_query;
588	ifp->if_snd_tag_free = lagg_snd_tag_free;
589	ifp->if_next_snd_tag = lagg_next_snd_tag;
590	ifp->if_ratelimit_query = lagg_ratelimit_query;
591#endif
592	ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS;
593
	/*
	 * Attach as an ordinary ethernet device; children will be attached
	 * as special devices IFT_IEEE8023ADLAG or IFT_INFINIBANDLAG.
	 */
598	switch (if_type) {
599	case IFT_ETHER:
600		ether_ifattach(ifp, eaddr);
601		break;
602	case IFT_INFINIBAND:
603		infiniband_ifattach(ifp, eaddr, sc->sc_bcast_addr);
604		break;
605	default:
606		break;
607	}
608
609	sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
610		lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST);
611	sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
612		lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST);
613
614	/* Insert into the global list of laggs */
615	LAGG_LIST_LOCK();
616	SLIST_INSERT_HEAD(&V_lagg_list, sc, sc_entries);
617	LAGG_LIST_UNLOCK();
618	LAGG_XUNLOCK(sc);
619
620	return (0);
621}
622
623static void
624lagg_clone_destroy(struct ifnet *ifp)
625{
626	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
627	struct lagg_port *lp;
628
629	LAGG_XLOCK(sc);
630	sc->sc_destroying = 1;
631	lagg_stop(sc);
632	ifp->if_flags &= ~IFF_UP;
633
634	EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach);
635	EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach);
636
	/* Shut down and remove lagg ports */
638	while ((lp = CK_SLIST_FIRST(&sc->sc_ports)) != NULL)
639		lagg_port_destroy(lp, 1);
640
641	/* Unhook the aggregation protocol */
642	lagg_proto_detach(sc);
643	LAGG_XUNLOCK(sc);
644
645	switch (ifp->if_type) {
646	case IFT_ETHER:
647		ifmedia_removeall(&sc->sc_media);
648		ether_ifdetach(ifp);
649		break;
650	case IFT_INFINIBAND:
651		infiniband_ifdetach(ifp);
652		break;
653	default:
654		break;
655	}
656	if_free(ifp);
657
658	LAGG_LIST_LOCK();
659	SLIST_REMOVE(&V_lagg_list, sc, lagg_softc, sc_entries);
660	LAGG_LIST_UNLOCK();
661
662	mtx_destroy(&sc->sc_mtx);
663	LAGG_SX_DESTROY(sc);
664	free(sc, M_LAGG);
665}
666
667static void
668lagg_capabilities(struct lagg_softc *sc)
669{
670	struct lagg_port *lp;
671	int cap, ena, pena;
672	uint64_t hwa;
673	struct ifnet_hw_tsomax hw_tsomax;
674
675	LAGG_XLOCK_ASSERT(sc);
676
677	/* Get common enabled capabilities for the lagg ports */
678	ena = ~0;
679	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
680		ena &= lp->lp_ifp->if_capenable;
681	ena = (ena == ~0 ? 0 : ena);
682
683	/*
684	 * Apply common enabled capabilities back to the lagg ports.
685	 * May require several iterations if they are dependent.
686	 */
687	do {
688		pena = ena;
689		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
690			lagg_setcaps(lp, ena);
691			ena &= lp->lp_ifp->if_capenable;
692		}
693	} while (pena != ena);
694
695	/* Get other capabilities from the lagg ports */
696	cap = ~0;
697	hwa = ~(uint64_t)0;
698	memset(&hw_tsomax, 0, sizeof(hw_tsomax));
699	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
700		cap &= lp->lp_ifp->if_capabilities;
701		hwa &= lp->lp_ifp->if_hwassist;
702		if_hw_tsomax_common(lp->lp_ifp, &hw_tsomax);
703	}
704	cap = (cap == ~0 ? 0 : cap);
705	hwa = (hwa == ~(uint64_t)0 ? 0 : hwa);
706
707	if (sc->sc_ifp->if_capabilities != cap ||
708	    sc->sc_ifp->if_capenable != ena ||
709	    sc->sc_ifp->if_hwassist != hwa ||
710	    if_hw_tsomax_update(sc->sc_ifp, &hw_tsomax) != 0) {
711		sc->sc_ifp->if_capabilities = cap;
712		sc->sc_ifp->if_capenable = ena;
713		sc->sc_ifp->if_hwassist = hwa;
714		getmicrotime(&sc->sc_ifp->if_lastchange);
715
716		if (sc->sc_ifflags & IFF_DEBUG)
717			if_printf(sc->sc_ifp,
718			    "capabilities 0x%08x enabled 0x%08x\n", cap, ena);
719	}
720}
721
722static int
723lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp)
724{
725	struct lagg_softc *sc_ptr;
726	struct lagg_port *lp, *tlp;
727	struct ifreq ifr;
728	int error, i, oldmtu;
729	int if_type;
730	uint64_t *pval;
731
732	LAGG_XLOCK_ASSERT(sc);
733
734	if (sc->sc_ifp == ifp) {
735		if_printf(sc->sc_ifp,
736		    "cannot add a lagg to itself as a port\n");
737		return (EINVAL);
738	}
739
740	if (sc->sc_destroying == 1)
741		return (ENXIO);
742
	/* Limit the maximum number of lagg ports */
744	if (sc->sc_count >= LAGG_MAX_PORTS)
745		return (ENOSPC);
746
	/* Check if the port has already been associated with a lagg */
748	if (ifp->if_lagg != NULL) {
749		/* Port is already in the current lagg? */
750		lp = (struct lagg_port *)ifp->if_lagg;
751		if (lp->lp_softc == sc)
752			return (EEXIST);
753		return (EBUSY);
754	}
755
756	switch (sc->sc_ifp->if_type) {
757	case IFT_ETHER:
		/* XXX Disallow non-ethernet interfaces (this should allow any IEEE 802 type) */
759		if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_L2VLAN)
760			return (EPROTONOSUPPORT);
761		if_type = IFT_IEEE8023ADLAG;
762		break;
763	case IFT_INFINIBAND:
764		/* XXX Disallow non-infiniband interfaces */
765		if (ifp->if_type != IFT_INFINIBAND)
766			return (EPROTONOSUPPORT);
767		if_type = IFT_INFINIBANDLAG;
768		break;
769	default:
770		break;
771	}
772
773	/* Allow the first Ethernet member to define the MTU */
774	oldmtu = -1;
775	if (CK_SLIST_EMPTY(&sc->sc_ports)) {
776		sc->sc_ifp->if_mtu = ifp->if_mtu;
777	} else if (sc->sc_ifp->if_mtu != ifp->if_mtu) {
778		if (ifp->if_ioctl == NULL) {
779			if_printf(sc->sc_ifp, "cannot change MTU for %s\n",
780			    ifp->if_xname);
781			return (EINVAL);
782		}
783		oldmtu = ifp->if_mtu;
784		strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name));
785		ifr.ifr_mtu = sc->sc_ifp->if_mtu;
786		error = (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr);
787		if (error != 0) {
788			if_printf(sc->sc_ifp, "invalid MTU for %s\n",
789			    ifp->if_xname);
790			return (error);
791		}
792		ifr.ifr_mtu = oldmtu;
793	}
794
795	lp = malloc(sizeof(struct lagg_port), M_LAGG, M_WAITOK|M_ZERO);
796	lp->lp_softc = sc;
797
798	/* Check if port is a stacked lagg */
799	LAGG_LIST_LOCK();
800	SLIST_FOREACH(sc_ptr, &V_lagg_list, sc_entries) {
801		if (ifp == sc_ptr->sc_ifp) {
802			LAGG_LIST_UNLOCK();
803			free(lp, M_LAGG);
804			if (oldmtu != -1)
805				(*ifp->if_ioctl)(ifp, SIOCSIFMTU,
806				    (caddr_t)&ifr);
807			return (EINVAL);
			/* XXX disable stacking for the moment, it's untested */
809#ifdef LAGG_PORT_STACKING
810			lp->lp_flags |= LAGG_PORT_STACK;
811			if (lagg_port_checkstacking(sc_ptr) >=
812			    LAGG_MAX_STACKING) {
813				LAGG_LIST_UNLOCK();
814				free(lp, M_LAGG);
815				if (oldmtu != -1)
816					(*ifp->if_ioctl)(ifp, SIOCSIFMTU,
817					    (caddr_t)&ifr);
818				return (E2BIG);
819			}
820#endif
821		}
822	}
823	LAGG_LIST_UNLOCK();
824
825	if_ref(ifp);
826	lp->lp_ifp = ifp;
827
828	bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ifp->if_addrlen);
829	lp->lp_ifcapenable = ifp->if_capenable;
830	if (CK_SLIST_EMPTY(&sc->sc_ports)) {
831		bcopy(IF_LLADDR(ifp), IF_LLADDR(sc->sc_ifp), ifp->if_addrlen);
832		lagg_proto_lladdr(sc);
833		EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp);
834	} else {
835		if_setlladdr(ifp, IF_LLADDR(sc->sc_ifp), ifp->if_addrlen);
836	}
837	lagg_setflags(lp, 1);
838
839	if (CK_SLIST_EMPTY(&sc->sc_ports))
840		sc->sc_primary = lp;
841
842	/* Change the interface type */
843	lp->lp_iftype = ifp->if_type;
844	ifp->if_type = if_type;
845	ifp->if_lagg = lp;
846	lp->lp_ioctl = ifp->if_ioctl;
847	ifp->if_ioctl = lagg_port_ioctl;
848	lp->lp_output = ifp->if_output;
849	ifp->if_output = lagg_port_output;
850
851	/* Read port counters */
852	pval = lp->port_counters.val;
853	for (i = 0; i < IFCOUNTERS; i++, pval++)
854		*pval = ifp->if_get_counter(ifp, i);
855
	/*
	 * Insert into the list of ports.
	 * Keep ports sorted by if_index so that the configuration is
	 * predictable and the same `ifconfig laggN create ...` command
	 * leads to the same result each time.
	 */
862	CK_SLIST_FOREACH(tlp, &sc->sc_ports, lp_entries) {
863		if (tlp->lp_ifp->if_index < ifp->if_index && (
864		    CK_SLIST_NEXT(tlp, lp_entries) == NULL ||
865		    ((struct  lagg_port*)CK_SLIST_NEXT(tlp, lp_entries))->lp_ifp->if_index >
866		    ifp->if_index))
867			break;
868	}
869	if (tlp != NULL)
870		CK_SLIST_INSERT_AFTER(tlp, lp, lp_entries);
871	else
872		CK_SLIST_INSERT_HEAD(&sc->sc_ports, lp, lp_entries);
873	sc->sc_count++;
874
875	lagg_setmulti(lp);
876
877	if ((error = lagg_proto_addport(sc, lp)) != 0) {
878		/* Remove the port, without calling pr_delport. */
879		lagg_port_destroy(lp, 0);
880		if (oldmtu != -1)
881			(*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr);
882		return (error);
883	}
884
885	/* Update lagg capabilities */
886	lagg_capabilities(sc);
887	lagg_linkstate(sc);
888
889	return (0);
890}
891
892#ifdef LAGG_PORT_STACKING
893static int
894lagg_port_checkstacking(struct lagg_softc *sc)
895{
896	struct lagg_softc *sc_ptr;
897	struct lagg_port *lp;
898	int m = 0;
899
900	LAGG_SXLOCK_ASSERT(sc);
901	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
902		if (lp->lp_flags & LAGG_PORT_STACK) {
903			sc_ptr = (struct lagg_softc *)lp->lp_ifp->if_softc;
904			m = MAX(m, lagg_port_checkstacking(sc_ptr));
905		}
906	}
907
908	return (m + 1);
909}
910#endif
911
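/*
 * Deferred port teardown: runs after a network epoch grace period, so
 * readers traversing sc_ports have drained before the ifnet reference is
 * dropped and the port memory is freed.
 */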
912static void
913lagg_port_destroy_cb(epoch_context_t ec)
914{
915	struct lagg_port *lp;
916	struct ifnet *ifp;
917
918	lp = __containerof(ec, struct lagg_port, lp_epoch_ctx);
919	ifp = lp->lp_ifp;
920
921	if_rele(ifp);
922	free(lp, M_LAGG);
923}
924
925static int
926lagg_port_destroy(struct lagg_port *lp, int rundelport)
927{
928	struct lagg_softc *sc = lp->lp_softc;
929	struct lagg_port *lp_ptr, *lp0;
930	struct ifnet *ifp = lp->lp_ifp;
931	uint64_t *pval, vdiff;
932	int i;
933
934	LAGG_XLOCK_ASSERT(sc);
935
936	if (rundelport)
937		lagg_proto_delport(sc, lp);
938
939	if (lp->lp_detaching == 0)
940		lagg_clrmulti(lp);
941
942	/* Restore interface */
943	ifp->if_type = lp->lp_iftype;
944	ifp->if_ioctl = lp->lp_ioctl;
945	ifp->if_output = lp->lp_output;
946	ifp->if_lagg = NULL;
947
948	/* Update detached port counters */
949	pval = lp->port_counters.val;
950	for (i = 0; i < IFCOUNTERS; i++, pval++) {
951		vdiff = ifp->if_get_counter(ifp, i) - *pval;
952		sc->detached_counters.val[i] += vdiff;
953	}
954
955	/* Finally, remove the port from the lagg */
956	CK_SLIST_REMOVE(&sc->sc_ports, lp, lagg_port, lp_entries);
957	sc->sc_count--;
958
959	/* Update the primary interface */
960	if (lp == sc->sc_primary) {
961		uint8_t lladdr[LAGG_ADDR_LEN];
962
963		if ((lp0 = CK_SLIST_FIRST(&sc->sc_ports)) == NULL)
964			bzero(&lladdr, LAGG_ADDR_LEN);
965		else
966			bcopy(lp0->lp_lladdr, lladdr, LAGG_ADDR_LEN);
967		sc->sc_primary = lp0;
968		if (sc->sc_destroying == 0) {
969			bcopy(lladdr, IF_LLADDR(sc->sc_ifp), sc->sc_ifp->if_addrlen);
970			lagg_proto_lladdr(sc);
971			EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp);
972		}
973
974		/*
975		 * Update lladdr for each port (new primary needs update
976		 * as well, to switch from old lladdr to its 'real' one)
977		 */
978		CK_SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries)
979			if_setlladdr(lp_ptr->lp_ifp, lladdr, lp_ptr->lp_ifp->if_addrlen);
980	}
981
982	if (lp->lp_ifflags)
983		if_printf(ifp, "%s: lp_ifflags unclean\n", __func__);
984
985	if (lp->lp_detaching == 0) {
986		lagg_setflags(lp, 0);
987		lagg_setcaps(lp, lp->lp_ifcapenable);
988		if_setlladdr(ifp, lp->lp_lladdr, ifp->if_addrlen);
989	}
990
	/*
	 * Free the port and release its ifnet reference after a grace period
	 * has elapsed.
	 */
995	NET_EPOCH_CALL(lagg_port_destroy_cb, &lp->lp_epoch_ctx);
996	/* Update lagg capabilities */
997	lagg_capabilities(sc);
998	lagg_linkstate(sc);
999
1000	return (0);
1001}
1002
1003static int
1004lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
1005{
1006	struct epoch_tracker et;
1007	struct lagg_reqport *rp = (struct lagg_reqport *)data;
1008	struct lagg_softc *sc;
1009	struct lagg_port *lp = NULL;
1010	int error = 0;
1011
1012	/* Should be checked by the caller */
1013	switch (ifp->if_type) {
1014	case IFT_IEEE8023ADLAG:
1015	case IFT_INFINIBANDLAG:
1016		if ((lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL)
1017			goto fallback;
1018		break;
1019	default:
1020		goto fallback;
1021	}
1022
1023	switch (cmd) {
1024	case SIOCGLAGGPORT:
1025		if (rp->rp_portname[0] == '\0' ||
1026		    ifunit(rp->rp_portname) != ifp) {
1027			error = EINVAL;
1028			break;
1029		}
1030
1031		NET_EPOCH_ENTER(et);
1032		if ((lp = ifp->if_lagg) == NULL || lp->lp_softc != sc) {
1033			error = ENOENT;
1034			NET_EPOCH_EXIT(et);
1035			break;
1036		}
1037
1038		lagg_port2req(lp, rp);
1039		NET_EPOCH_EXIT(et);
1040		break;
1041
1042	case SIOCSIFCAP:
1043		if (lp->lp_ioctl == NULL) {
1044			error = EINVAL;
1045			break;
1046		}
1047		error = (*lp->lp_ioctl)(ifp, cmd, data);
1048		if (error)
1049			break;
1050
1051		/* Update lagg interface capabilities */
1052		LAGG_XLOCK(sc);
1053		lagg_capabilities(sc);
1054		LAGG_XUNLOCK(sc);
1055		VLAN_CAPABILITIES(sc->sc_ifp);
1056		break;
1057
1058	case SIOCSIFMTU:
1059		/* Do not allow the MTU to be changed once joined */
1060		error = EINVAL;
1061		break;
1062
1063	default:
1064		goto fallback;
1065	}
1066
1067	return (error);
1068
1069fallback:
1070	if (lp != NULL && lp->lp_ioctl != NULL)
1071		return ((*lp->lp_ioctl)(ifp, cmd, data));
1072
1073	return (EINVAL);
1074}
1075
/*
 * Request counter @cnt data.
 *
 * The counter value is calculated as follows:
 * 1) for each port, sum the difference between the current and "initial"
 *    measurements.
 * 2) add the lagg logical interface counters.
 * 3) add the data from the detached_counters array.
 *
 * We also do the following on port attach/detach:
 * 1) On port attach we store all of its counters in the port_counters array.
 * 2) On port detach we add the difference between the "initial" and
 *    current counter data to the detached_counters array.
 */
1089static uint64_t
1090lagg_get_counter(struct ifnet *ifp, ift_counter cnt)
1091{
1092	struct epoch_tracker et;
1093	struct lagg_softc *sc;
1094	struct lagg_port *lp;
1095	struct ifnet *lpifp;
1096	uint64_t newval, oldval, vsum;
1097
1098	/* Revise this when we've got non-generic counters. */
1099	KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));
1100
1101	sc = (struct lagg_softc *)ifp->if_softc;
1102
1103	vsum = 0;
1104	NET_EPOCH_ENTER(et);
1105	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		/* Value saved when the port was attached */
		oldval = lp->port_counters.val[cnt];
		/* Current value */
		lpifp = lp->lp_ifp;
		newval = lpifp->if_get_counter(lpifp, cnt);
		/* Accumulate the difference */
		vsum += newval - oldval;
1113	}
1114	NET_EPOCH_EXIT(et);
1115
	/*
	 * Add counter data which might be added by upper
	 * layer protocols operating on the logical interface.
	 */
1120	vsum += if_get_counter_default(ifp, cnt);
1121
	/*
	 * Add counter data accumulated from detached ports.
	 */
1125	vsum += sc->detached_counters.val[cnt];
1126
1127	return (vsum);
1128}
1129
1130/*
1131 * For direct output to child ports.
1132 */
1133static int
1134lagg_port_output(struct ifnet *ifp, struct mbuf *m,
1135	const struct sockaddr *dst, struct route *ro)
1136{
1137	struct lagg_port *lp = ifp->if_lagg;
1138
1139	switch (dst->sa_family) {
1140		case pseudo_AF_HDRCMPLT:
1141		case AF_UNSPEC:
1142			if (lp != NULL)
1143				return ((*lp->lp_output)(ifp, m, dst, ro));
1144	}
1145
1146	/* drop any other frames */
1147	m_freem(m);
1148	return (ENETDOWN);
1149}
1150
1151static void
1152lagg_port_ifdetach(void *arg __unused, struct ifnet *ifp)
1153{
1154	struct lagg_port *lp;
1155	struct lagg_softc *sc;
1156
1157	if ((lp = ifp->if_lagg) == NULL)
1158		return;
1159	/* If the ifnet is just being renamed, don't do anything. */
1160	if (ifp->if_flags & IFF_RENAMING)
1161		return;
1162
1163	sc = lp->lp_softc;
1164
1165	LAGG_XLOCK(sc);
1166	lp->lp_detaching = 1;
1167	lagg_port_destroy(lp, 1);
1168	LAGG_XUNLOCK(sc);
1169	VLAN_CAPABILITIES(sc->sc_ifp);
1170}
1171
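/* Export the state of a port into a lagg_reqport for the ioctl interface. */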
1172static void
1173lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp)
1174{
1175	struct lagg_softc *sc = lp->lp_softc;
1176
1177	strlcpy(rp->rp_ifname, sc->sc_ifname, sizeof(rp->rp_ifname));
1178	strlcpy(rp->rp_portname, lp->lp_ifp->if_xname, sizeof(rp->rp_portname));
1179	rp->rp_prio = lp->lp_prio;
1180	rp->rp_flags = lp->lp_flags;
1181	lagg_proto_portreq(sc, lp, &rp->rp_psc);
1182
1183	/* Add protocol specific flags */
1184	switch (sc->sc_proto) {
1185		case LAGG_PROTO_FAILOVER:
1186			if (lp == sc->sc_primary)
1187				rp->rp_flags |= LAGG_PORT_MASTER;
1188			if (lp == lagg_link_active(sc, sc->sc_primary))
1189				rp->rp_flags |= LAGG_PORT_ACTIVE;
1190			break;
1191
1192		case LAGG_PROTO_ROUNDROBIN:
1193		case LAGG_PROTO_LOADBALANCE:
1194		case LAGG_PROTO_BROADCAST:
1195			if (LAGG_PORTACTIVE(lp))
1196				rp->rp_flags |= LAGG_PORT_ACTIVE;
1197			break;
1198
1199		case LAGG_PROTO_LACP:
1200			/* LACP has a different definition of active */
1201			if (lacp_isactive(lp))
1202				rp->rp_flags |= LAGG_PORT_ACTIVE;
1203			if (lacp_iscollecting(lp))
1204				rp->rp_flags |= LAGG_PORT_COLLECTING;
1205			if (lacp_isdistributing(lp))
1206				rp->rp_flags |= LAGG_PORT_DISTRIBUTING;
1207			break;
1208	}
1209
1210}
1211
1212static void
1213lagg_watchdog_infiniband(void *arg)
1214{
1215	struct epoch_tracker et;
1216	struct lagg_softc *sc;
1217	struct lagg_port *lp;
1218	struct ifnet *ifp;
1219	struct ifnet *lp_ifp;
1220
1221	sc = arg;
1222
	/*
	 * Because infiniband nodes have a fixed MAC address, which is
	 * generated by the so-called GID, we need to regularly update
	 * the link-level address of the parent lagg<N> device when
	 * the active port changes. Possibly we could piggy-back on
	 * link up/down events as well, but using a timer also provides
	 * a guarantee against too frequent events. This operation
	 * does not have to be atomic.
	 */
1232	NET_EPOCH_ENTER(et);
1233	lp = lagg_link_active(sc, sc->sc_primary);
1234	if (lp != NULL) {
1235		ifp = sc->sc_ifp;
1236		lp_ifp = lp->lp_ifp;
1237
1238		if (ifp != NULL && lp_ifp != NULL &&
1239		    (memcmp(IF_LLADDR(ifp), IF_LLADDR(lp_ifp), ifp->if_addrlen) != 0 ||
1240		     memcmp(sc->sc_bcast_addr, lp_ifp->if_broadcastaddr, ifp->if_addrlen) != 0)) {
1241			memcpy(IF_LLADDR(ifp), IF_LLADDR(lp_ifp), ifp->if_addrlen);
1242			memcpy(sc->sc_bcast_addr, lp_ifp->if_broadcastaddr, ifp->if_addrlen);
1243
1244			CURVNET_SET(ifp->if_vnet);
1245			EVENTHANDLER_INVOKE(iflladdr_event, ifp);
1246			CURVNET_RESTORE();
1247		}
1248	}
1249	NET_EPOCH_EXIT(et);
1250
1251	callout_reset(&sc->sc_watchdog, hz, &lagg_watchdog_infiniband, arg);
1252}
1253
1254static void
1255lagg_init(void *xsc)
1256{
1257	struct lagg_softc *sc = (struct lagg_softc *)xsc;
1258	struct ifnet *ifp = sc->sc_ifp;
1259	struct lagg_port *lp;
1260
1261	LAGG_XLOCK(sc);
1262	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1263		LAGG_XUNLOCK(sc);
1264		return;
1265	}
1266
1267	ifp->if_drv_flags |= IFF_DRV_RUNNING;
1268
	/*
	 * Update the port lladdrs if needed.
	 * This might be an if_setlladdr() notification
	 * that the lladdr has been changed.
	 */
1274	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1275		if (memcmp(IF_LLADDR(ifp), IF_LLADDR(lp->lp_ifp),
1276		    ifp->if_addrlen) != 0)
1277			if_setlladdr(lp->lp_ifp, IF_LLADDR(ifp), ifp->if_addrlen);
1278	}
1279
1280	lagg_proto_init(sc);
1281
1282	if (ifp->if_type == IFT_INFINIBAND) {
1283		mtx_lock(&sc->sc_mtx);
1284		lagg_watchdog_infiniband(sc);
1285		mtx_unlock(&sc->sc_mtx);
1286	}
1287
1288	LAGG_XUNLOCK(sc);
1289}
1290
1291static void
1292lagg_stop(struct lagg_softc *sc)
1293{
1294	struct ifnet *ifp = sc->sc_ifp;
1295
1296	LAGG_XLOCK_ASSERT(sc);
1297
1298	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
1299		return;
1300
1301	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
1302
1303	lagg_proto_stop(sc);
1304
1305	mtx_lock(&sc->sc_mtx);
1306	callout_stop(&sc->sc_watchdog);
1307	mtx_unlock(&sc->sc_mtx);
1308
1309	callout_drain(&sc->sc_watchdog);
1310}
1311
1312static int
1313lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
1314{
1315	struct epoch_tracker et;
1316	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
1317	struct lagg_reqall *ra = (struct lagg_reqall *)data;
1318	struct lagg_reqopts *ro = (struct lagg_reqopts *)data;
1319	struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf;
1320	struct lagg_reqflags *rf = (struct lagg_reqflags *)data;
1321	struct ifreq *ifr = (struct ifreq *)data;
1322	struct lagg_port *lp;
1323	struct ifnet *tpif;
1324	struct thread *td = curthread;
1325	char *buf, *outbuf;
1326	int count, buflen, len, error = 0, oldmtu;
1327
1328	bzero(&rpbuf, sizeof(rpbuf));
1329
1330	/* XXX: This can race with lagg_clone_destroy. */
1331
1332	switch (cmd) {
1333	case SIOCGLAGG:
1334		LAGG_XLOCK(sc);
1335		buflen = sc->sc_count * sizeof(struct lagg_reqport);
1336		outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO);
1337		ra->ra_proto = sc->sc_proto;
1338		lagg_proto_request(sc, &ra->ra_psc);
1339		count = 0;
1340		buf = outbuf;
1341		len = min(ra->ra_size, buflen);
1342		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1343			if (len < sizeof(rpbuf))
1344				break;
1345
1346			lagg_port2req(lp, &rpbuf);
1347			memcpy(buf, &rpbuf, sizeof(rpbuf));
1348			count++;
1349			buf += sizeof(rpbuf);
1350			len -= sizeof(rpbuf);
1351		}
1352		LAGG_XUNLOCK(sc);
1353		ra->ra_ports = count;
1354		ra->ra_size = count * sizeof(rpbuf);
1355		error = copyout(outbuf, ra->ra_port, ra->ra_size);
1356		free(outbuf, M_TEMP);
1357		break;
1358	case SIOCSLAGG:
1359		error = priv_check(td, PRIV_NET_LAGG);
1360		if (error)
1361			break;
1362		if (ra->ra_proto >= LAGG_PROTO_MAX) {
1363			error = EPROTONOSUPPORT;
1364			break;
1365		}
1366		/* Infiniband only supports the failover protocol. */
1367		if (ra->ra_proto != LAGG_PROTO_FAILOVER &&
1368		    ifp->if_type == IFT_INFINIBAND) {
1369			error = EPROTONOSUPPORT;
1370			break;
1371		}
1372		LAGG_XLOCK(sc);
1373		lagg_proto_detach(sc);
1374		lagg_proto_attach(sc, ra->ra_proto);
1375		LAGG_XUNLOCK(sc);
1376		break;
1377	case SIOCGLAGGOPTS:
1378		LAGG_XLOCK(sc);
1379		ro->ro_opts = sc->sc_opts;
1380		if (sc->sc_proto == LAGG_PROTO_LACP) {
1381			struct lacp_softc *lsc;
1382
1383			lsc = (struct lacp_softc *)sc->sc_psc;
1384			if (lsc->lsc_debug.lsc_tx_test != 0)
1385				ro->ro_opts |= LAGG_OPT_LACP_TXTEST;
1386			if (lsc->lsc_debug.lsc_rx_test != 0)
1387				ro->ro_opts |= LAGG_OPT_LACP_RXTEST;
1388			if (lsc->lsc_strict_mode != 0)
1389				ro->ro_opts |= LAGG_OPT_LACP_STRICT;
1390			if (lsc->lsc_fast_timeout != 0)
1391				ro->ro_opts |= LAGG_OPT_LACP_FAST_TIMO;
1392
1393			ro->ro_active = sc->sc_active;
1394		} else {
1395			ro->ro_active = 0;
1396			CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
1397				ro->ro_active += LAGG_PORTACTIVE(lp);
1398		}
1399		ro->ro_bkt = sc->sc_stride;
1400		ro->ro_flapping = sc->sc_flapping;
1401		ro->ro_flowid_shift = sc->flowid_shift;
1402		LAGG_XUNLOCK(sc);
1403		break;
1404	case SIOCSLAGGOPTS:
1405		error = priv_check(td, PRIV_NET_LAGG);
1406		if (error)
1407			break;
1408
1409		/*
1410		 * The stride option was added without defining a corresponding
1411		 * LAGG_OPT flag, so handle a non-zero value before checking
1412		 * anything else to preserve compatibility.
1413		 */
1414		LAGG_XLOCK(sc);
1415		if (ro->ro_opts == 0 && ro->ro_bkt != 0) {
1416			if (sc->sc_proto != LAGG_PROTO_ROUNDROBIN) {
1417				LAGG_XUNLOCK(sc);
1418				error = EINVAL;
1419				break;
1420			}
1421			sc->sc_stride = ro->ro_bkt;
1422		}
1423		if (ro->ro_opts == 0) {
1424			LAGG_XUNLOCK(sc);
1425			break;
1426		}
1427
1428		/*
1429		 * Set options.  LACP options are stored in sc->sc_psc,
1430		 * not in sc_opts.
1431		 */
1432		int valid, lacp;
1433
1434		switch (ro->ro_opts) {
1435		case LAGG_OPT_USE_FLOWID:
1436		case -LAGG_OPT_USE_FLOWID:
1437		case LAGG_OPT_USE_NUMA:
1438		case -LAGG_OPT_USE_NUMA:
1439		case LAGG_OPT_FLOWIDSHIFT:
1440		case LAGG_OPT_RR_LIMIT:
1441			valid = 1;
1442			lacp = 0;
1443			break;
1444		case LAGG_OPT_LACP_TXTEST:
1445		case -LAGG_OPT_LACP_TXTEST:
1446		case LAGG_OPT_LACP_RXTEST:
1447		case -LAGG_OPT_LACP_RXTEST:
1448		case LAGG_OPT_LACP_STRICT:
1449		case -LAGG_OPT_LACP_STRICT:
1450		case LAGG_OPT_LACP_FAST_TIMO:
1451		case -LAGG_OPT_LACP_FAST_TIMO:
1452			valid = lacp = 1;
1453			break;
1454		default:
1455			valid = lacp = 0;
1456			break;
1457		}
1458
1459		if (valid == 0 ||
1460		    (lacp == 1 && sc->sc_proto != LAGG_PROTO_LACP)) {
1461			/* Invalid combination of options specified. */
1462			error = EINVAL;
1463			LAGG_XUNLOCK(sc);
1464			break;	/* Return from SIOCSLAGGOPTS. */
1465		}
1466
1467		/*
1468		 * Store new options into sc->sc_opts except for
1469		 * FLOWIDSHIFT, RR and LACP options.
1470		 */
1471		if (lacp == 0) {
1472			if (ro->ro_opts == LAGG_OPT_FLOWIDSHIFT)
1473				sc->flowid_shift = ro->ro_flowid_shift;
1474			else if (ro->ro_opts == LAGG_OPT_RR_LIMIT) {
1475				if (sc->sc_proto != LAGG_PROTO_ROUNDROBIN ||
1476				    ro->ro_bkt == 0) {
1477					error = EINVAL;
1478					LAGG_XUNLOCK(sc);
1479					break;
1480				}
1481				sc->sc_stride = ro->ro_bkt;
1482			} else if (ro->ro_opts > 0)
1483				sc->sc_opts |= ro->ro_opts;
1484			else
1485				sc->sc_opts &= ~ro->ro_opts;
1486		} else {
1487			struct lacp_softc *lsc;
1488			struct lacp_port *lp;
1489
1490			lsc = (struct lacp_softc *)sc->sc_psc;
1491
1492			switch (ro->ro_opts) {
1493			case LAGG_OPT_LACP_TXTEST:
1494				lsc->lsc_debug.lsc_tx_test = 1;
1495				break;
1496			case -LAGG_OPT_LACP_TXTEST:
1497				lsc->lsc_debug.lsc_tx_test = 0;
1498				break;
1499			case LAGG_OPT_LACP_RXTEST:
1500				lsc->lsc_debug.lsc_rx_test = 1;
1501				break;
1502			case -LAGG_OPT_LACP_RXTEST:
1503				lsc->lsc_debug.lsc_rx_test = 0;
1504				break;
1505			case LAGG_OPT_LACP_STRICT:
1506				lsc->lsc_strict_mode = 1;
1507				break;
1508			case -LAGG_OPT_LACP_STRICT:
1509				lsc->lsc_strict_mode = 0;
1510				break;
1511			case LAGG_OPT_LACP_FAST_TIMO:
1512				LACP_LOCK(lsc);
				LIST_FOREACH(lp, &lsc->lsc_ports, lp_next)
					lp->lp_state |= LACP_STATE_TIMEOUT;
1515				LACP_UNLOCK(lsc);
1516				lsc->lsc_fast_timeout = 1;
1517				break;
1518			case -LAGG_OPT_LACP_FAST_TIMO:
1519				LACP_LOCK(lsc);
				LIST_FOREACH(lp, &lsc->lsc_ports, lp_next)
					lp->lp_state &= ~LACP_STATE_TIMEOUT;
1522				LACP_UNLOCK(lsc);
1523				lsc->lsc_fast_timeout = 0;
1524				break;
1525			}
1526		}
1527		LAGG_XUNLOCK(sc);
1528		break;
1529	case SIOCGLAGGFLAGS:
1530		rf->rf_flags = 0;
1531		LAGG_XLOCK(sc);
1532		if (sc->sc_flags & MBUF_HASHFLAG_L2)
1533			rf->rf_flags |= LAGG_F_HASHL2;
1534		if (sc->sc_flags & MBUF_HASHFLAG_L3)
1535			rf->rf_flags |= LAGG_F_HASHL3;
1536		if (sc->sc_flags & MBUF_HASHFLAG_L4)
1537			rf->rf_flags |= LAGG_F_HASHL4;
1538		LAGG_XUNLOCK(sc);
1539		break;
1540	case SIOCSLAGGHASH:
1541		error = priv_check(td, PRIV_NET_LAGG);
1542		if (error)
1543			break;
1544		if ((rf->rf_flags & LAGG_F_HASHMASK) == 0) {
1545			error = EINVAL;
1546			break;
1547		}
1548		LAGG_XLOCK(sc);
1549		sc->sc_flags = 0;
1550		if (rf->rf_flags & LAGG_F_HASHL2)
1551			sc->sc_flags |= MBUF_HASHFLAG_L2;
1552		if (rf->rf_flags & LAGG_F_HASHL3)
1553			sc->sc_flags |= MBUF_HASHFLAG_L3;
1554		if (rf->rf_flags & LAGG_F_HASHL4)
1555			sc->sc_flags |= MBUF_HASHFLAG_L4;
1556		LAGG_XUNLOCK(sc);
1557		break;
1558	case SIOCGLAGGPORT:
1559		if (rp->rp_portname[0] == '\0' ||
1560		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
1561			error = EINVAL;
1562			break;
1563		}
1564
1565		NET_EPOCH_ENTER(et);
1566		if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL ||
1567		    lp->lp_softc != sc) {
1568			error = ENOENT;
1569			NET_EPOCH_EXIT(et);
1570			if_rele(tpif);
1571			break;
1572		}
1573
1574		lagg_port2req(lp, rp);
1575		NET_EPOCH_EXIT(et);
1576		if_rele(tpif);
1577		break;
1578	case SIOCSLAGGPORT:
1579		error = priv_check(td, PRIV_NET_LAGG);
1580		if (error)
1581			break;
1582		if (rp->rp_portname[0] == '\0' ||
1583		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
1584			error = EINVAL;
1585			break;
1586		}
1587#ifdef INET6
1588		/*
1589		 * A laggport interface should not have inet6 address
1590		 * because two interfaces with a valid link-local
1591		 * scope zone must not be merged in any form.  This
1592		 * restriction is needed to prevent violation of
1593		 * link-local scope zone.  Attempts to add a laggport
1594		 * interface which has inet6 addresses triggers
1595		 * removal of all inet6 addresses on the member
1596		 * interface.
1597		 */
1598		if (in6ifa_llaonifp(tpif)) {
1599			in6_ifdetach(tpif);
			if_printf(sc->sc_ifp,
			    "IPv6 addresses on %s have been removed "
			    "before adding it as a member to prevent "
			    "IPv6 address scope violation.\n",
			    tpif->if_xname);
1605		}
1606#endif
1607		oldmtu = ifp->if_mtu;
1608		LAGG_XLOCK(sc);
1609		error = lagg_port_create(sc, tpif);
1610		LAGG_XUNLOCK(sc);
1611		if_rele(tpif);
1612
		/*
		 * The lagg MTU may change during addition of the first port.
		 * If it did, perform the network layer specific procedures.
		 */
1617		if (ifp->if_mtu != oldmtu) {
1618#ifdef INET6
1619			nd6_setmtu(ifp);
1620#endif
1621			rt_updatemtu(ifp);
1622		}
1623
1624		VLAN_CAPABILITIES(ifp);
1625		break;
1626	case SIOCSLAGGDELPORT:
1627		error = priv_check(td, PRIV_NET_LAGG);
1628		if (error)
1629			break;
1630		if (rp->rp_portname[0] == '\0' ||
1631		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
1632			error = EINVAL;
1633			break;
1634		}
1635
1636		LAGG_XLOCK(sc);
1637		if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL ||
1638		    lp->lp_softc != sc) {
1639			error = ENOENT;
1640			LAGG_XUNLOCK(sc);
1641			if_rele(tpif);
1642			break;
1643		}
1644
1645		error = lagg_port_destroy(lp, 1);
1646		LAGG_XUNLOCK(sc);
1647		if_rele(tpif);
1648		VLAN_CAPABILITIES(ifp);
1649		break;
1650	case SIOCSIFFLAGS:
1651		/* Set flags on ports too */
1652		LAGG_XLOCK(sc);
1653		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1654			lagg_setflags(lp, 1);
1655		}
1656
1657		if (!(ifp->if_flags & IFF_UP) &&
1658		    (ifp->if_drv_flags & IFF_DRV_RUNNING)) {
1659			/*
1660			 * If interface is marked down and it is running,
1661			 * then stop and disable it.
1662			 */
1663			lagg_stop(sc);
1664			LAGG_XUNLOCK(sc);
1665		} else if ((ifp->if_flags & IFF_UP) &&
1666		    !(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
1667			/*
1668			 * If interface is marked up and it is stopped, then
1669			 * start it.
1670			 */
1671			LAGG_XUNLOCK(sc);
1672			(*ifp->if_init)(sc);
1673		} else
1674			LAGG_XUNLOCK(sc);
1675		break;
1676	case SIOCADDMULTI:
1677	case SIOCDELMULTI:
1678		LAGG_XLOCK(sc);
1679		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1680			lagg_clrmulti(lp);
1681			lagg_setmulti(lp);
1682		}
1683		LAGG_XUNLOCK(sc);
1684		error = 0;
1685		break;
1686	case SIOCSIFMEDIA:
1687	case SIOCGIFMEDIA:
1688		if (ifp->if_type == IFT_INFINIBAND)
1689			error = EINVAL;
1690		else
1691			error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd);
1692		break;
1693
1694	case SIOCSIFCAP:
1695		LAGG_XLOCK(sc);
1696		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1697			if (lp->lp_ioctl != NULL)
1698				(*lp->lp_ioctl)(lp->lp_ifp, cmd, data);
1699		}
1700		lagg_capabilities(sc);
1701		LAGG_XUNLOCK(sc);
1702		VLAN_CAPABILITIES(ifp);
1703		error = 0;
1704		break;
1705
1706	case SIOCSIFMTU:
1707		LAGG_XLOCK(sc);
1708		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1709			if (lp->lp_ioctl != NULL)
1710				error = (*lp->lp_ioctl)(lp->lp_ifp, cmd, data);
1711			else
1712				error = EINVAL;
1713			if (error != 0) {
1714				if_printf(ifp,
1715				    "failed to change MTU to %d on port %s, "
1716				    "reverting all ports to original MTU (%d)\n",
1717				    ifr->ifr_mtu, lp->lp_ifp->if_xname, ifp->if_mtu);
1718				break;
1719			}
1720		}
1721		if (error == 0) {
1722			ifp->if_mtu = ifr->ifr_mtu;
1723		} else {
1724			/* set every port back to the original MTU */
1725			ifr->ifr_mtu = ifp->if_mtu;
1726			CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1727				if (lp->lp_ioctl != NULL)
1728					(*lp->lp_ioctl)(lp->lp_ifp, cmd, data);
1729			}
1730		}
1731		LAGG_XUNLOCK(sc);
1732		break;
1733
1734	default:
1735		error = ether_ioctl(ifp, cmd, data);
1736		break;
1737	}
1738	return (error);
1739}
1740
1741#if defined(KERN_TLS) || defined(RATELIMIT)
1742static inline struct lagg_snd_tag *
1743mst_to_lst(struct m_snd_tag *mst)
1744{
1745
1746	return (__containerof(mst, struct lagg_snd_tag, com));
1747}
1748
/*
 * Look up the port used by a specific flow.  This only works for lagg
 * protocols with deterministic port mappings (e.g. not roundrobin).
 * In addition, protocols which use a hash to map flows to ports must
 * be configured to use the mbuf flowid rather than hashing packet
 * contents.
 */
1756static struct lagg_port *
1757lookup_snd_tag_port(struct ifnet *ifp, uint32_t flowid, uint32_t flowtype,
1758    uint8_t numa_domain)
1759{
1760	struct lagg_softc *sc;
1761	struct lagg_port *lp;
1762	struct lagg_lb *lb;
1763	uint32_t hash, p;
1764	int err;
1765
1766	sc = ifp->if_softc;
1767
1768	switch (sc->sc_proto) {
1769	case LAGG_PROTO_FAILOVER:
1770		return (lagg_link_active(sc, sc->sc_primary));
1771	case LAGG_PROTO_LOADBALANCE:
1772		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
1773		    flowtype == M_HASHTYPE_NONE)
1774			return (NULL);
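		/* Use the upper flowid bits, modulo the port count, to pick a port. */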
1775		p = flowid >> sc->flowid_shift;
1776		p %= sc->sc_count;
1777		lb = (struct lagg_lb *)sc->sc_psc;
1778		lp = lb->lb_ports[p];
1779		return (lagg_link_active(sc, lp));
1780	case LAGG_PROTO_LACP:
1781		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
1782		    flowtype == M_HASHTYPE_NONE)
1783			return (NULL);
1784		hash = flowid >> sc->flowid_shift;
1785		return (lacp_select_tx_port_by_hash(sc, hash, numa_domain, &err));
1786	default:
1787		return (NULL);
1788	}
1789}
1790
1791static int
1792lagg_snd_tag_alloc(struct ifnet *ifp,
1793    union if_snd_tag_alloc_params *params,
1794    struct m_snd_tag **ppmt)
1795{
1796	struct epoch_tracker et;
1797	struct lagg_snd_tag *lst;
1798	struct lagg_softc *sc;
1799	struct lagg_port *lp;
1800	struct ifnet *lp_ifp;
1801	int error;
1802
1803	sc = ifp->if_softc;
1804
1805	NET_EPOCH_ENTER(et);
1806	lp = lookup_snd_tag_port(ifp, params->hdr.flowid,
1807	    params->hdr.flowtype, params->hdr.numa_domain);
1808	if (lp == NULL) {
1809		NET_EPOCH_EXIT(et);
1810		return (EOPNOTSUPP);
1811	}
1812	if (lp->lp_ifp == NULL) {
1813		NET_EPOCH_EXIT(et);
1814		return (EOPNOTSUPP);
1815	}
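	/*
	 * Take a reference on the selected port's ifnet so it remains valid
	 * after the epoch section is exited below.
	 */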
1816	lp_ifp = lp->lp_ifp;
1817	if_ref(lp_ifp);
1818	NET_EPOCH_EXIT(et);
1819
1820	lst = malloc(sizeof(*lst), M_LAGG, M_NOWAIT);
1821	if (lst == NULL) {
1822		if_rele(lp_ifp);
1823		return (ENOMEM);
1824	}
1825
1826	error = m_snd_tag_alloc(lp_ifp, params, &lst->tag);
1827	if_rele(lp_ifp);
1828	if (error) {
1829		free(lst, M_LAGG);
1830		return (error);
1831	}
1832
1833	m_snd_tag_init(&lst->com, ifp, lst->tag->type);
1834
1835	*ppmt = &lst->com;
1836	return (0);
1837}
1838
1839static struct m_snd_tag *
1840lagg_next_snd_tag(struct m_snd_tag *mst)
1841{
1842	struct lagg_snd_tag *lst;
1843
1844	lst = mst_to_lst(mst);
1845	return (lst->tag);
1846}
1847
1848static int
1849lagg_snd_tag_modify(struct m_snd_tag *mst,
1850    union if_snd_tag_modify_params *params)
1851{
1852	struct lagg_snd_tag *lst;
1853
1854	lst = mst_to_lst(mst);
1855	return (lst->tag->ifp->if_snd_tag_modify(lst->tag, params));
1856}
1857
1858static int
1859lagg_snd_tag_query(struct m_snd_tag *mst,
1860    union if_snd_tag_query_params *params)
1861{
1862	struct lagg_snd_tag *lst;
1863
1864	lst = mst_to_lst(mst);
1865	return (lst->tag->ifp->if_snd_tag_query(lst->tag, params));
1866}
1867
1868static void
1869lagg_snd_tag_free(struct m_snd_tag *mst)
1870{
1871	struct lagg_snd_tag *lst;
1872
1873	lst = mst_to_lst(mst);
1874	m_snd_tag_rele(lst->tag);
1875	free(lst, M_LAGG);
1876}
1877
1878static void
1879lagg_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q)
1880{
	/*
	 * For lagg, we have an indirect interface.  The caller needs to
	 * get a ratelimit tag on the actual interface the flow will go on.
	 */
1887	q->rate_table = NULL;
1888	q->flags = RT_IS_INDIRECT;
1889	q->max_flows = 0;
1890	q->number_of_rates = 0;
1891}
1892#endif
1893
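/*
 * Copy the lagg interface's link-layer multicast memberships onto a port,
 * tracking them in lp_mc_head so they can be released later.
 */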
1894static int
1895lagg_setmulti(struct lagg_port *lp)
1896{
1897	struct lagg_softc *sc = lp->lp_softc;
1898	struct ifnet *ifp = lp->lp_ifp;
1899	struct ifnet *scifp = sc->sc_ifp;
1900	struct lagg_mc *mc;
1901	struct ifmultiaddr *ifma;
1902	int error;
1903
1904	IF_ADDR_WLOCK(scifp);
1905	CK_STAILQ_FOREACH(ifma, &scifp->if_multiaddrs, ifma_link) {
1906		if (ifma->ifma_addr->sa_family != AF_LINK)
1907			continue;
1908		mc = malloc(sizeof(struct lagg_mc), M_LAGG, M_NOWAIT);
1909		if (mc == NULL) {
1910			IF_ADDR_WUNLOCK(scifp);
1911			return (ENOMEM);
1912		}
1913		bcopy(ifma->ifma_addr, &mc->mc_addr,
1914		    ifma->ifma_addr->sa_len);
1915		mc->mc_addr.sdl_index = ifp->if_index;
1916		mc->mc_ifma = NULL;
1917		SLIST_INSERT_HEAD(&lp->lp_mc_head, mc, mc_entries);
1918	}
1919	IF_ADDR_WUNLOCK(scifp);
1920	SLIST_FOREACH (mc, &lp->lp_mc_head, mc_entries) {
1921		error = if_addmulti(ifp,
1922		    (struct sockaddr *)&mc->mc_addr, &mc->mc_ifma);
1923		if (error)
1924			return (error);
1925	}
1926	return (0);
1927}
1928
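/* Release the multicast memberships previously programmed onto a port. */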
1929static int
1930lagg_clrmulti(struct lagg_port *lp)
1931{
1932	struct lagg_mc *mc;
1933
1934	LAGG_XLOCK_ASSERT(lp->lp_softc);
1935	while ((mc = SLIST_FIRST(&lp->lp_mc_head)) != NULL) {
1936		SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries);
1937		if (mc->mc_ifma && lp->lp_detaching == 0)
1938			if_delmulti_ifma(mc->mc_ifma);
1939		free(mc, M_LAGG);
1940	}
1941	return (0);
1942}
1943
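/* Apply a capability set to a port through its saved SIOCSIFCAP handler. */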
1944static int
1945lagg_setcaps(struct lagg_port *lp, int cap)
1946{
1947	struct ifreq ifr;
1948
1949	if (lp->lp_ifp->if_capenable == cap)
1950		return (0);
1951	if (lp->lp_ioctl == NULL)
1952		return (ENXIO);
1953	ifr.ifr_reqcap = cap;
1954	return ((*lp->lp_ioctl)(lp->lp_ifp, SIOCSIFCAP, (caddr_t)&ifr));
1955}
1956
/* Handle a ref-counted flag that should be set on the lagg port as well */
1958static int
1959lagg_setflag(struct lagg_port *lp, int flag, int status,
1960    int (*func)(struct ifnet *, int))
1961{
1962	struct lagg_softc *sc = lp->lp_softc;
1963	struct ifnet *scifp = sc->sc_ifp;
1964	struct ifnet *ifp = lp->lp_ifp;
1965	int error;
1966
1967	LAGG_XLOCK_ASSERT(sc);
1968
1969	status = status ? (scifp->if_flags & flag) : 0;
1970	/* Now "status" contains the flag value or 0 */
1971
	/*
	 * See if the recorded port status differs from what we want
	 * it to be.  If it does, flip it.  We record the port status
	 * in lp_ifflags so that we never clear a port flag that we
	 * did not set.  In fact, we do not set or clear port flags
	 * directly; we acquire or release references to them instead.
	 * That is why we can be sure that the recorded flags still
	 * agree with the actual port flags.
	 */
1981	if (status != (lp->lp_ifflags & flag)) {
1982		error = (*func)(ifp, status);
1983		if (error)
1984			return (error);
1985		lp->lp_ifflags &= ~flag;
1986		lp->lp_ifflags |= status;
1987	}
1988	return (0);
1989}
1990
/*
 * Handle IFF_* flags that require certain changes on the lagg port.
 * If "status" is true, update the port's flags to match the lagg's;
 * if "status" is false, forcibly clear the flags set on the port.
 */
1996static int
1997lagg_setflags(struct lagg_port *lp, int status)
1998{
1999	int error, i;
2000
2001	for (i = 0; lagg_pflags[i].flag; i++) {
2002		error = lagg_setflag(lp, lagg_pflags[i].flag,
2003		    status, lagg_pflags[i].func);
2004		if (error)
2005			return (error);
2006	}
2007	return (0);
2008}
2009
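/*
 * if_transmit entry point for Ethernet laggs: require a Tx protocol and
 * at least one port, tap BPF, and hand the mbuf to the protocol's start
 * routine.
 */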
2010static int
2011lagg_transmit_ethernet(struct ifnet *ifp, struct mbuf *m)
2012{
2013	struct epoch_tracker et;
2014	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
2015	int error;
2016
2017#if defined(KERN_TLS) || defined(RATELIMIT)
2018	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG)
2019		MPASS(m->m_pkthdr.snd_tag->ifp == ifp);
2020#endif
2021	NET_EPOCH_ENTER(et);
2022	/* We need a Tx algorithm and at least one port */
2023	if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) {
2024		NET_EPOCH_EXIT(et);
2025		m_freem(m);
2026		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2027		return (ENXIO);
2028	}
2029
2030	ETHER_BPF_MTAP(ifp, m);
2031
2032	error = lagg_proto_start(sc, m);
2033	NET_EPOCH_EXIT(et);
2034	return (error);
2035}
2036
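/* InfiniBand variant of the if_transmit entry point. */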
2037static int
2038lagg_transmit_infiniband(struct ifnet *ifp, struct mbuf *m)
2039{
2040	struct epoch_tracker et;
2041	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
2042	int error;
2043
2044#if defined(KERN_TLS) || defined(RATELIMIT)
2045	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG)
2046		MPASS(m->m_pkthdr.snd_tag->ifp == ifp);
2047#endif
2048	NET_EPOCH_ENTER(et);
2049	/* We need a Tx algorithm and at least one port */
2050	if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) {
2051		NET_EPOCH_EXIT(et);
2052		m_freem(m);
2053		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2054		return (ENXIO);
2055	}
2056
2057	INFINIBAND_BPF_MTAP(ifp, m);
2058
2059	error = lagg_proto_start(sc, m);
2060	NET_EPOCH_EXIT(et);
2061	return (error);
2062}
2063
/*
 * The ifp->if_qflush entry point for lagg(4) is a no-op.
 */
2067static void
2068lagg_qflush(struct ifnet *ifp __unused)
2069{
2070}
2071
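/*
 * Input path from an Ethernet port: drop the frame unless the lagg is
 * running and has a protocol, tap BPF on the lagg interface, and pass
 * the mbuf to the protocol's input routine.
 */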
2072static struct mbuf *
2073lagg_input_ethernet(struct ifnet *ifp, struct mbuf *m)
2074{
2075	struct epoch_tracker et;
2076	struct lagg_port *lp = ifp->if_lagg;
2077	struct lagg_softc *sc = lp->lp_softc;
2078	struct ifnet *scifp = sc->sc_ifp;
2079
2080	NET_EPOCH_ENTER(et);
2081	if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
2082	    lp->lp_detaching != 0 ||
2083	    sc->sc_proto == LAGG_PROTO_NONE) {
2084		NET_EPOCH_EXIT(et);
2085		m_freem(m);
2086		return (NULL);
2087	}
2088
2089	ETHER_BPF_MTAP(scifp, m);
2090
2091	m = lagg_proto_input(sc, lp, m);
2092	if (m != NULL && (scifp->if_flags & IFF_MONITOR) != 0) {
2093		m_freem(m);
2094		m = NULL;
2095	}
2096
2097	NET_EPOCH_EXIT(et);
2098	return (m);
2099}
2100
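/* InfiniBand variant of the port input path. */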
2101static struct mbuf *
2102lagg_input_infiniband(struct ifnet *ifp, struct mbuf *m)
2103{
2104	struct epoch_tracker et;
2105	struct lagg_port *lp = ifp->if_lagg;
2106	struct lagg_softc *sc = lp->lp_softc;
2107	struct ifnet *scifp = sc->sc_ifp;
2108
2109	NET_EPOCH_ENTER(et);
2110	if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
2111	    lp->lp_detaching != 0 ||
2112	    sc->sc_proto == LAGG_PROTO_NONE) {
2113		NET_EPOCH_EXIT(et);
2114		m_freem(m);
2115		return (NULL);
2116	}
2117
2118	INFINIBAND_BPF_MTAP(scifp, m);
2119
2120	m = lagg_proto_input(sc, lp, m);
2121	if (m != NULL && (scifp->if_flags & IFF_MONITOR) != 0) {
2122		m_freem(m);
2123		m = NULL;
2124	}
2125
2126	NET_EPOCH_EXIT(et);
2127	return (m);
2128}
2129
2130static int
2131lagg_media_change(struct ifnet *ifp)
2132{
2133	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
2134
2135	if (sc->sc_ifflags & IFF_DEBUG)
2136		printf("%s\n", __func__);
2137
2138	/* Ignore */
2139	return (0);
2140}
2141
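/*
 * Report the lagg media status: the interface is marked active if at
 * least one port has an active link.
 */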
2142static void
2143lagg_media_status(struct ifnet *ifp, struct ifmediareq *imr)
2144{
2145	struct epoch_tracker et;
2146	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
2147	struct lagg_port *lp;
2148
2149	imr->ifm_status = IFM_AVALID;
2150	imr->ifm_active = IFM_ETHER | IFM_AUTO;
2151
2152	NET_EPOCH_ENTER(et);
2153	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
2154		if (LAGG_PORTACTIVE(lp))
2155			imr->ifm_status |= IFM_ACTIVE;
2156	}
2157	NET_EPOCH_EXIT(et);
2158}
2159
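/*
 * Recompute the lagg link state from the member ports and update
 * if_baudrate according to the active protocol.
 */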
2160static void
2161lagg_linkstate(struct lagg_softc *sc)
2162{
2163	struct epoch_tracker et;
2164	struct lagg_port *lp;
2165	int new_link = LINK_STATE_DOWN;
2166	uint64_t speed;
2167
2168	LAGG_XLOCK_ASSERT(sc);
2169
2170	/* LACP handles link state itself */
2171	if (sc->sc_proto == LAGG_PROTO_LACP)
2172		return;
2173
2174	/* Our link is considered up if at least one of our ports is active */
2175	NET_EPOCH_ENTER(et);
2176	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
2177		if (lp->lp_ifp->if_link_state == LINK_STATE_UP) {
2178			new_link = LINK_STATE_UP;
2179			break;
2180		}
2181	}
2182	NET_EPOCH_EXIT(et);
2183	if_link_state_change(sc->sc_ifp, new_link);
2184
2185	/* Update if_baudrate to reflect the max possible speed */
2186	switch (sc->sc_proto) {
2187		case LAGG_PROTO_FAILOVER:
2188			sc->sc_ifp->if_baudrate = sc->sc_primary != NULL ?
2189			    sc->sc_primary->lp_ifp->if_baudrate : 0;
2190			break;
2191		case LAGG_PROTO_ROUNDROBIN:
2192		case LAGG_PROTO_LOADBALANCE:
2193		case LAGG_PROTO_BROADCAST:
2194			speed = 0;
2195			NET_EPOCH_ENTER(et);
2196			CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
2197				speed += lp->lp_ifp->if_baudrate;
2198			NET_EPOCH_EXIT(et);
2199			sc->sc_ifp->if_baudrate = speed;
2200			break;
2201		case LAGG_PROTO_LACP:
2202			/* LACP updates if_baudrate itself */
2203			break;
2204	}
2205}
2206
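/*
 * Called when a member port changes its link state; recompute the lagg
 * link state and notify the protocol.
 */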
2207static void
2208lagg_port_state(struct ifnet *ifp, int state)
2209{
2210	struct lagg_port *lp = (struct lagg_port *)ifp->if_lagg;
2211	struct lagg_softc *sc = NULL;
2212
2213	if (lp != NULL)
2214		sc = lp->lp_softc;
2215	if (sc == NULL)
2216		return;
2217
2218	LAGG_XLOCK(sc);
2219	lagg_linkstate(sc);
2220	lagg_proto_linkstate(sc, lp);
2221	LAGG_XUNLOCK(sc);
2222}
2223
2224struct lagg_port *
2225lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp)
2226{
2227	struct lagg_port *lp_next, *rval = NULL;
2228
	/*
	 * Search for a port which reports an active link state.
	 */
2232
2233#ifdef INVARIANTS
	/*
	 * This is called either in the network epoch or with
	 * LAGG_XLOCK(sc) held.
	 */
2238	if (!in_epoch(net_epoch_preempt))
2239		LAGG_XLOCK_ASSERT(sc);
2240#endif
2241
2242	if (lp == NULL)
2243		goto search;
2244	if (LAGG_PORTACTIVE(lp)) {
2245		rval = lp;
2246		goto found;
2247	}
2248	if ((lp_next = CK_SLIST_NEXT(lp, lp_entries)) != NULL &&
2249	    LAGG_PORTACTIVE(lp_next)) {
2250		rval = lp_next;
2251		goto found;
2252	}
2253
2254search:
2255	CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) {
2256		if (LAGG_PORTACTIVE(lp_next)) {
2257			return (lp_next);
2258		}
2259	}
2260found:
2261	return (rval);
2262}
2263
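/*
 * Hand an mbuf to the selected port's if_transmit.  If the mbuf carries
 * a lagg send tag, replace it with the underlying port tag first.
 */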
2264int
2265lagg_enqueue(struct ifnet *ifp, struct mbuf *m)
2266{
2267
2268#if defined(KERN_TLS) || defined(RATELIMIT)
2269	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) {
2270		struct lagg_snd_tag *lst;
2271		struct m_snd_tag *mst;
2272
2273		mst = m->m_pkthdr.snd_tag;
2274		lst = mst_to_lst(mst);
2275		if (lst->tag->ifp != ifp) {
2276			m_freem(m);
2277			return (EAGAIN);
2278		}
2279		m->m_pkthdr.snd_tag = m_snd_tag_ref(lst->tag);
2280		m_snd_tag_rele(mst);
2281	}
2282#endif
2283	return (ifp->if_transmit)(ifp, m);
2284}
2285
2286/*
2287 * Simple round robin aggregation
2288 */
2289static void
2290lagg_rr_attach(struct lagg_softc *sc)
2291{
2292	sc->sc_seq = 0;
2293	sc->sc_stride = 1;
2294}
2295
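/*
 * Select the output port in round-robin order: an atomically incremented
 * sequence number, divided by the stride and taken modulo the port count,
 * indexes the port list.
 */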
2296static int
2297lagg_rr_start(struct lagg_softc *sc, struct mbuf *m)
2298{
2299	struct lagg_port *lp;
2300	uint32_t p;
2301
2302	p = atomic_fetchadd_32(&sc->sc_seq, 1);
2303	p /= sc->sc_stride;
2304	p %= sc->sc_count;
2305	lp = CK_SLIST_FIRST(&sc->sc_ports);
2306
2307	while (p--)
2308		lp = CK_SLIST_NEXT(lp, lp_entries);
2309
	/*
	 * Check the port's link state.  lagg_link_active() will return
	 * the next active port if this one's link is down or lp is NULL.
	 */
2314	if ((lp = lagg_link_active(sc, lp)) == NULL) {
2315		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
2316		m_freem(m);
2317		return (ENETDOWN);
2318	}
2319
2320	/* Send mbuf */
2321	return (lagg_enqueue(lp->lp_ifp, m));
2322}
2323
2324static struct mbuf *
2325lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
2326{
2327	struct ifnet *ifp = sc->sc_ifp;
2328
2329	/* Just pass in the packet to our lagg device */
2330	m->m_pkthdr.rcvif = ifp;
2331
2332	return (m);
2333}
2334
2335/*
2336 * Broadcast mode
2337 */
2338static int
2339lagg_bcast_start(struct lagg_softc *sc, struct mbuf *m)
2340{
2341	int active_ports = 0;
2342	int errors = 0;
2343	int ret;
2344	struct lagg_port *lp, *last = NULL;
2345	struct mbuf *m0;
2346
2347	NET_EPOCH_ASSERT();
2348	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
2349		if (!LAGG_PORTACTIVE(lp))
2350			continue;
2351
2352		active_ports++;
2353
2354		if (last != NULL) {
2355			m0 = m_copym(m, 0, M_COPYALL, M_NOWAIT);
2356			if (m0 == NULL) {
2357				ret = ENOBUFS;
2358				errors++;
2359				break;
2360			}
2361			lagg_enqueue(last->lp_ifp, m0);
2362		}
2363		last = lp;
2364	}
2365
2366	if (last == NULL) {
2367		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
2368		m_freem(m);
2369		return (ENOENT);
2370	}
2371	if ((last = lagg_link_active(sc, last)) == NULL) {
2372		errors++;
2373		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, errors);
2374		m_freem(m);
2375		return (ENETDOWN);
2376	}
2377
2378	ret = lagg_enqueue(last->lp_ifp, m);
2379	if (errors != 0)
2380		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, errors);
2381
2382	return (ret);
2383}
2384
static struct mbuf *
2386lagg_bcast_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
2387{
2388	struct ifnet *ifp = sc->sc_ifp;
2389
2390	/* Just pass in the packet to our lagg device */
2391	m->m_pkthdr.rcvif = ifp;
2392	return (m);
2393}
2394
2395/*
2396 * Active failover
2397 */
2398static int
2399lagg_fail_start(struct lagg_softc *sc, struct mbuf *m)
2400{
2401	struct lagg_port *lp;
2402
	/* Use the primary port if active, otherwise the next available port */
2404	if ((lp = lagg_link_active(sc, sc->sc_primary)) == NULL) {
2405		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
2406		m_freem(m);
2407		return (ENETDOWN);
2408	}
2409
2410	/* Send mbuf */
2411	return (lagg_enqueue(lp->lp_ifp, m));
2412}
2413
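/*
 * In failover mode only traffic received on the primary port (or, if the
 * primary is down, the first active port) is accepted, unless
 * lagg_failover_rx_all is enabled.
 */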
2414static struct mbuf *
2415lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
2416{
2417	struct ifnet *ifp = sc->sc_ifp;
2418	struct lagg_port *tmp_tp;
2419
2420	if (lp == sc->sc_primary || V_lagg_failover_rx_all) {
2421		m->m_pkthdr.rcvif = ifp;
2422		return (m);
2423	}
2424
2425	if (!LAGG_PORTACTIVE(sc->sc_primary)) {
2426		tmp_tp = lagg_link_active(sc, sc->sc_primary);
		/*
		 * If tmp_tp is NULL, we have received a packet while all
		 * of our links are down.  Weird, but process it anyway.
		 */
2431		if ((tmp_tp == NULL || tmp_tp == lp)) {
2432			m->m_pkthdr.rcvif = ifp;
2433			return (m);
2434		}
2435	}
2436
2437	m_freem(m);
2438	return (NULL);
2439}
2440
2441/*
2442 * Loadbalancing
2443 */
2444static void
2445lagg_lb_attach(struct lagg_softc *sc)
2446{
2447	struct lagg_port *lp;
2448	struct lagg_lb *lb;
2449
2450	LAGG_XLOCK_ASSERT(sc);
2451	lb = malloc(sizeof(struct lagg_lb), M_LAGG, M_WAITOK | M_ZERO);
2452	lb->lb_key = m_ether_tcpip_hash_init();
2453	sc->sc_psc = lb;
2454
2455	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
2456		lagg_lb_port_create(lp);
2457}
2458
2459static void
2460lagg_lb_detach(struct lagg_softc *sc)
2461{
2462	struct lagg_lb *lb;
2463
2464	lb = (struct lagg_lb *)sc->sc_psc;
2465	if (lb != NULL)
2466		free(lb, M_LAGG);
2467}
2468
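/*
 * Rebuild the load-balancing port table, skipping the port that is being
 * removed (if any).
 */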
2469static int
2470lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp)
2471{
2472	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
2473	struct lagg_port *lp_next;
2474	int i = 0, rv;
2475
2476	rv = 0;
2477	bzero(&lb->lb_ports, sizeof(lb->lb_ports));
2478	LAGG_XLOCK_ASSERT(sc);
2479	CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) {
2480		if (lp_next == lp)
2481			continue;
2482		if (i >= LAGG_MAX_PORTS) {
2483			rv = EINVAL;
2484			break;
2485		}
2486		if (sc->sc_ifflags & IFF_DEBUG)
2487			printf("%s: port %s at index %d\n",
2488			    sc->sc_ifname, lp_next->lp_ifp->if_xname, i);
2489		lb->lb_ports[i++] = lp_next;
2490	}
2491
2492	return (rv);
2493}
2494
2495static int
2496lagg_lb_port_create(struct lagg_port *lp)
2497{
2498	struct lagg_softc *sc = lp->lp_softc;
2499	return (lagg_lb_porttable(sc, NULL));
2500}
2501
2502static void
2503lagg_lb_port_destroy(struct lagg_port *lp)
2504{
2505	struct lagg_softc *sc = lp->lp_softc;
2506	lagg_lb_porttable(sc, lp);
2507}
2508
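/*
 * Select the output port from a hash of the packet: use the mbuf flowid
 * when allowed, otherwise hash the Ethernet/IP headers, then index the
 * port table modulo the port count.
 */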
2509static int
2510lagg_lb_start(struct lagg_softc *sc, struct mbuf *m)
2511{
2512	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
2513	struct lagg_port *lp = NULL;
2514	uint32_t p = 0;
2515
2516	if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) &&
2517	    M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
2518		p = m->m_pkthdr.flowid >> sc->flowid_shift;
2519	else
2520		p = m_ether_tcpip_hash(sc->sc_flags, m, lb->lb_key);
2521	p %= sc->sc_count;
2522	lp = lb->lb_ports[p];
2523
	/*
	 * Check the port's link state.  lagg_link_active() will return
	 * the next active port if this one's link is down or lp is NULL.
	 */
2528	if ((lp = lagg_link_active(sc, lp)) == NULL) {
2529		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
2530		m_freem(m);
2531		return (ENETDOWN);
2532	}
2533
2534	/* Send mbuf */
2535	return (lagg_enqueue(lp->lp_ifp, m));
2536}
2537
2538static struct mbuf *
2539lagg_lb_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
2540{
2541	struct ifnet *ifp = sc->sc_ifp;
2542
2543	/* Just pass in the packet to our lagg device */
2544	m->m_pkthdr.rcvif = ifp;
2545
2546	return (m);
2547}
2548
2549/*
2550 * 802.3ad LACP
2551 */
2552static void
2553lagg_lacp_attach(struct lagg_softc *sc)
2554{
2555	struct lagg_port *lp;
2556
2557	lacp_attach(sc);
2558	LAGG_XLOCK_ASSERT(sc);
2559	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
2560		lacp_port_create(lp);
2561}
2562
2563static void
2564lagg_lacp_detach(struct lagg_softc *sc)
2565{
2566	struct lagg_port *lp;
2567	void *psc;
2568
2569	LAGG_XLOCK_ASSERT(sc);
2570	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
2571		lacp_port_destroy(lp);
2572
2573	psc = sc->sc_psc;
2574	sc->sc_psc = NULL;
2575	lacp_detach(psc);
2576}
2577
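/*
 * Re-create the LACP state of all ports, e.g. after the link-layer
 * address of the lagg has changed.
 */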
2578static void
2579lagg_lacp_lladdr(struct lagg_softc *sc)
2580{
2581	struct lagg_port *lp;
2582
2583	LAGG_SXLOCK_ASSERT(sc);
2584
2585	/* purge all the lacp ports */
2586	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
2587		lacp_port_destroy(lp);
2588
2589	/* add them back in */
2590	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
2591		lacp_port_create(lp);
2592}
2593
2594static int
2595lagg_lacp_start(struct lagg_softc *sc, struct mbuf *m)
2596{
2597	struct lagg_port *lp;
2598	int err;
2599
2600	lp = lacp_select_tx_port(sc, m, &err);
2601	if (lp == NULL) {
2602		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
2603		m_freem(m);
2604		return (err);
2605	}
2606
2607	/* Send mbuf */
2608	return (lagg_enqueue(lp->lp_ifp, m));
2609}
2610
2611static struct mbuf *
2612lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
2613{
2614	struct ifnet *ifp = sc->sc_ifp;
2615	struct ether_header *eh;
2616	u_short etype;
2617
2618	eh = mtod(m, struct ether_header *);
2619	etype = ntohs(eh->ether_type);
2620
2621	/* Tap off LACP control messages */
2622	if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_SLOW) {
2623		m = lacp_input(lp, m);
2624		if (m == NULL)
2625			return (NULL);
2626	}
2627
2628	/*
2629	 * If the port is not collecting or not in the active aggregator then
2630	 * free and return.
2631	 */
2632	if (lacp_iscollecting(lp) == 0 || lacp_isactive(lp) == 0) {
2633		m_freem(m);
2634		return (NULL);
2635	}
2636
2637	m->m_pkthdr.rcvif = ifp;
2638	return (m);
2639}
2640