/*	$OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $	*/

/*
 * Copyright (c) 2005, 2006 Reyk Floeter <reyk@openbsd.org>
 * Copyright (c) 2007 Andrew Thompson <thompsa@FreeBSD.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/hash.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <sys/taskqueue.h>
#include <sys/eventhandler.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_clone.h>
#include <net/if_arp.h>
#include <net/if_dl.h>
#include <net/if_llc.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/bpf.h>

#if defined(INET) || defined(INET6)
#include <netinet/in.h>
#include <netinet/ip.h>
#endif
#ifdef INET
#include <netinet/in_systm.h>
#include <netinet/if_ether.h>
#endif

#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/in6_var.h>
#include <netinet6/in6_ifattach.h>
#endif

#include <net/if_vlan_var.h>
#include <net/if_lagg.h>
#include <net/ieee8023ad_lacp.h>

/* Special flags we should propagate to the lagg ports. */
static struct {
	int flag;
	int (*func)(struct ifnet *, int);
} lagg_pflags[] = {
	{IFF_PROMISC, ifpromisc},
	{IFF_ALLMULTI, if_allmulti},
	{0, NULL}
};

SLIST_HEAD(__trhead, lagg_softc) lagg_list;	/* list of laggs */
static struct mtx	lagg_list_mtx;
eventhandler_tag	lagg_detach_cookie = NULL;

static int	lagg_clone_create(struct if_clone *, int, caddr_t);
static void	lagg_clone_destroy(struct ifnet *);
static void	lagg_lladdr(struct lagg_softc *, uint8_t *);
static void	lagg_capabilities(struct lagg_softc *);
static void	lagg_port_lladdr(struct lagg_port *, uint8_t *);
static void	lagg_port_setlladdr(void *, int);
static int	lagg_port_create(struct lagg_softc *, struct ifnet *);
static int	lagg_port_destroy(struct lagg_port *, int);
static struct mbuf *lagg_input(struct ifnet *, struct mbuf *);
static void	lagg_linkstate(struct lagg_softc *);
static void	lagg_port_state(struct ifnet *, int);
static int	lagg_port_ioctl(struct ifnet *, u_long, caddr_t);
static int	lagg_port_output(struct ifnet *, struct mbuf *,
		    struct sockaddr *, struct route *);
static void	lagg_port_ifdetach(void *arg __unused, struct ifnet *);
#ifdef LAGG_PORT_STACKING
static int	lagg_port_checkstacking(struct lagg_softc *);
#endif
static void	lagg_port2req(struct lagg_port *, struct lagg_reqport *);
static void	lagg_init(void *);
static void	lagg_stop(struct lagg_softc *);
static int	lagg_ioctl(struct ifnet *, u_long, caddr_t);
static int	lagg_ether_setmulti(struct lagg_softc *);
static int	lagg_ether_cmdmulti(struct lagg_port *, int);
static	int	lagg_setflag(struct lagg_port *, int, int,
		    int (*func)(struct ifnet *, int));
static	int	lagg_setflags(struct lagg_port *, int status);
static int	lagg_transmit(struct ifnet *, struct mbuf *);
static void	lagg_qflush(struct ifnet *);
static int	lagg_media_change(struct ifnet *);
static void	lagg_media_status(struct ifnet *, struct ifmediareq *);
static struct lagg_port *lagg_link_active(struct lagg_softc *,
	    struct lagg_port *);
static const void *lagg_gethdr(struct mbuf *, u_int, u_int, void *);

IFC_SIMPLE_DECLARE(lagg, 0);

/* Simple round robin */
static int	lagg_rr_attach(struct lagg_softc *);
static int	lagg_rr_detach(struct lagg_softc *);
static int	lagg_rr_start(struct lagg_softc *, struct mbuf *);
static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *,
		    struct mbuf *);

/* Active failover */
static int	lagg_fail_attach(struct lagg_softc *);
static int	lagg_fail_detach(struct lagg_softc *);
static int	lagg_fail_start(struct lagg_softc *, struct mbuf *);
static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *,
		    struct mbuf *);

/* Loadbalancing */
static int	lagg_lb_attach(struct lagg_softc *);
static int	lagg_lb_detach(struct lagg_softc *);
static int	lagg_lb_port_create(struct lagg_port *);
static void	lagg_lb_port_destroy(struct lagg_port *);
static int	lagg_lb_start(struct lagg_softc *, struct mbuf *);
static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *,
		    struct mbuf *);
static int	lagg_lb_porttable(struct lagg_softc *, struct lagg_port *);

/* 802.3ad LACP */
static int	lagg_lacp_attach(struct lagg_softc *);
static int	lagg_lacp_detach(struct lagg_softc *);
static int	lagg_lacp_start(struct lagg_softc *, struct mbuf *);
static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *,
		    struct mbuf *);
static void	lagg_lacp_lladdr(struct lagg_softc *);

/* lagg protocol table */
static const struct {
	int			ti_proto;
	int			(*ti_attach)(struct lagg_softc *);
} lagg_protos[] = {
	{ LAGG_PROTO_ROUNDROBIN,	lagg_rr_attach },
	{ LAGG_PROTO_FAILOVER,		lagg_fail_attach },
	{ LAGG_PROTO_LOADBALANCE,	lagg_lb_attach },
	{ LAGG_PROTO_ETHERCHANNEL,	lagg_lb_attach },
	{ LAGG_PROTO_LACP,		lagg_lacp_attach },
	{ LAGG_PROTO_NONE,		NULL }
};
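
/*
 * The attach routine of the selected protocol fills in sc_start, sc_input
 * and the other sc_* method pointers on the softc; all protocol specific
 * behaviour is dispatched through those pointers.
 */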

SYSCTL_DECL(_net_link);
static SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW, 0,
    "Link Aggregation");

static int lagg_failover_rx_all = 0; /* Allow input on any failover links */
SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW,
    &lagg_failover_rx_all, 0,
    "Accept input from any interface in a failover lagg");
static int def_use_flowid = 1; /* Default value for using M_FLOWID */
TUNABLE_INT("net.link.lagg.default_use_flowid", &def_use_flowid);
SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RW,
    &def_use_flowid, 0,
    "Default setting for using flow id for load sharing");

static int
lagg_modevent(module_t mod, int type, void *data)
{

	switch (type) {
	case MOD_LOAD:
		mtx_init(&lagg_list_mtx, "if_lagg list", NULL, MTX_DEF);
		SLIST_INIT(&lagg_list);
		if_clone_attach(&lagg_cloner);
		lagg_input_p = lagg_input;
		lagg_linkstate_p = lagg_port_state;
		lagg_detach_cookie = EVENTHANDLER_REGISTER(
		    ifnet_departure_event, lagg_port_ifdetach, NULL,
		    EVENTHANDLER_PRI_ANY);
		break;
	case MOD_UNLOAD:
		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
		    lagg_detach_cookie);
		if_clone_detach(&lagg_cloner);
		lagg_input_p = NULL;
		lagg_linkstate_p = NULL;
		mtx_destroy(&lagg_list_mtx);
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (0);
}

static moduledata_t lagg_mod = {
	"if_lagg",
	lagg_modevent,
	0
};

DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(if_lagg, 1);
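
/*
 * Typical userland usage, per lagg(4) (device names are examples):
 *	ifconfig lagg0 create
 *	ifconfig lagg0 up laggproto lacp laggport em0 laggport em1
 */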

#if __FreeBSD_version >= 800000
/*
 * This routine is run via a vlan
 * config EVENT
 */
static void
lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
{
	struct lagg_softc	*sc = ifp->if_softc;
	struct lagg_port	*lp;

	if (ifp->if_softc != arg)	/* Not our event */
		return;

	LAGG_RLOCK(sc);
	if (!SLIST_EMPTY(&sc->sc_ports)) {
		SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
			EVENTHANDLER_INVOKE(vlan_config, lp->lp_ifp, vtag);
	}
	LAGG_RUNLOCK(sc);
}

/*
 * This routine is run via a vlan
 * unconfig EVENT
 */
static void
lagg_unregister_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
{
	struct lagg_softc	*sc = ifp->if_softc;
	struct lagg_port	*lp;

	if (ifp->if_softc != arg)	/* Not our event */
		return;

	LAGG_RLOCK(sc);
	if (!SLIST_EMPTY(&sc->sc_ports)) {
		SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
			EVENTHANDLER_INVOKE(vlan_unconfig, lp->lp_ifp, vtag);
	}
	LAGG_RUNLOCK(sc);
}
#endif

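/*
 * Create a new lagg interface: allocate the softc, attach per-interface
 * sysctl nodes, select and attach the default protocol and register the
 * pseudo media type before hooking the interface into the global list.
 */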
static int
lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params)
{
	struct lagg_softc *sc;
	struct ifnet *ifp;
	int i, error = 0;
	static const u_char eaddr[6];	/* 00:00:00:00:00:00 */
	struct sysctl_oid *oid;
	char num[14];			/* sufficient for 32 bits */

	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
	ifp = sc->sc_ifp = if_alloc(IFT_ETHER);
	if (ifp == NULL) {
		free(sc, M_DEVBUF);
		return (ENOSPC);
	}

	sysctl_ctx_init(&sc->ctx);
	snprintf(num, sizeof(num), "%u", unit);
	sc->use_flowid = def_use_flowid;
	oid = SYSCTL_ADD_NODE(&sc->ctx, &SYSCTL_NODE_CHILDREN(_net_link, lagg),
		OID_AUTO, num, CTLFLAG_RD, NULL, "");
	SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
		"use_flowid", CTLTYPE_INT|CTLFLAG_RW, &sc->use_flowid, sc->use_flowid,
		"Use flow id for load sharing");
	SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
		"count", CTLTYPE_INT|CTLFLAG_RD, &sc->sc_count, sc->sc_count,
		"Total number of ports");
	/* Hash all layers by default */
	sc->sc_flags = LAGG_F_HASHL2|LAGG_F_HASHL3|LAGG_F_HASHL4;

	sc->sc_proto = LAGG_PROTO_NONE;
	for (i = 0; lagg_protos[i].ti_proto != LAGG_PROTO_NONE; i++) {
		if (lagg_protos[i].ti_proto == LAGG_PROTO_DEFAULT) {
			sc->sc_proto = lagg_protos[i].ti_proto;
			if ((error = lagg_protos[i].ti_attach(sc)) != 0) {
				if_free_type(ifp, IFT_ETHER);
				free(sc, M_DEVBUF);
				return (error);
			}
			break;
		}
	}
	LAGG_LOCK_INIT(sc);
	SLIST_INIT(&sc->sc_ports);
	TASK_INIT(&sc->sc_lladdr_task, 0, lagg_port_setlladdr, sc);

	/* Initialise pseudo media types */
	ifmedia_init(&sc->sc_media, 0, lagg_media_change,
	    lagg_media_status);
	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO);

	if_initname(ifp, ifc->ifc_name, unit);
	ifp->if_type = IFT_ETHER;
	ifp->if_softc = sc;
	ifp->if_transmit = lagg_transmit;
	ifp->if_qflush = lagg_qflush;
	ifp->if_init = lagg_init;
	ifp->if_ioctl = lagg_ioctl;
	ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST;

	/*
	 * Attach as an ordinary ethernet device, children will be attached
	 * as special IFT_IEEE8023ADLAG devices.
	 */
	ether_ifattach(ifp, eaddr);

#if __FreeBSD_version >= 800000
	sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
		lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST);
	sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
		lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST);
#endif

	/* Insert into the global list of laggs */
	mtx_lock(&lagg_list_mtx);
	SLIST_INSERT_HEAD(&lagg_list, sc, sc_entries);
	mtx_unlock(&lagg_list_mtx);

	return (0);
}

static void
lagg_clone_destroy(struct ifnet *ifp)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
	struct lagg_port *lp;

	LAGG_WLOCK(sc);

	lagg_stop(sc);
	ifp->if_flags &= ~IFF_UP;

#if __FreeBSD_version >= 800000
	EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach);
	EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach);
#endif

	/* Shutdown and remove lagg ports */
	while ((lp = SLIST_FIRST(&sc->sc_ports)) != NULL)
		lagg_port_destroy(lp, 1);
	/* Unhook the aggregation protocol */
	if (sc->sc_detach != NULL)
		(*sc->sc_detach)(sc);

	LAGG_WUNLOCK(sc);

	sysctl_ctx_free(&sc->ctx);
	ifmedia_removeall(&sc->sc_media);
	ether_ifdetach(ifp);
	if_free_type(ifp, IFT_ETHER);

	mtx_lock(&lagg_list_mtx);
	SLIST_REMOVE(&lagg_list, sc, lagg_softc, sc_entries);
	mtx_unlock(&lagg_list_mtx);

	taskqueue_drain(taskqueue_swi, &sc->sc_lladdr_task);
	LAGG_LOCK_DESTROY(sc);
	free(sc, M_DEVBUF);
}

static void
lagg_lladdr(struct lagg_softc *sc, uint8_t *lladdr)
{
	struct ifnet *ifp = sc->sc_ifp;

	if (memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0)
		return;

	bcopy(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN);
	/* Let the protocol know the MAC has changed */
	if (sc->sc_lladdr != NULL)
		(*sc->sc_lladdr)(sc);
	EVENTHANDLER_INVOKE(iflladdr_event, ifp);
}

static void
lagg_capabilities(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	int cap = ~0, ena = ~0;
	u_long hwa = ~0UL;
#if defined(INET) || defined(INET6)
	u_int hw_tsomax = IP_MAXPACKET;	/* Initialize to the maximum value. */
#else
	u_int hw_tsomax = ~0;	/* if_hw_tsomax is only for INET/INET6, but.. */
#endif

	LAGG_WLOCK_ASSERT(sc);

	/* Get capabilities from the lagg ports */
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		cap &= lp->lp_ifp->if_capabilities;
		ena &= lp->lp_ifp->if_capenable;
		hwa &= lp->lp_ifp->if_hwassist;
		/* Set to the minimum value of the lagg ports. */
		if (lp->lp_ifp->if_hw_tsomax < hw_tsomax &&
		    lp->lp_ifp->if_hw_tsomax > 0)
			hw_tsomax = lp->lp_ifp->if_hw_tsomax;
	}
	cap = (cap == ~0 ? 0 : cap);
	ena = (ena == ~0 ? 0 : ena);
	hwa = (hwa == ~0 ? 0 : hwa);

	if (sc->sc_ifp->if_capabilities != cap ||
	    sc->sc_ifp->if_capenable != ena ||
	    sc->sc_ifp->if_hwassist != hwa ||
	    sc->sc_ifp->if_hw_tsomax != hw_tsomax) {
		sc->sc_ifp->if_capabilities = cap;
		sc->sc_ifp->if_capenable = ena;
		sc->sc_ifp->if_hwassist = hwa;
		sc->sc_ifp->if_hw_tsomax = hw_tsomax;
		getmicrotime(&sc->sc_ifp->if_lastchange);

		if (sc->sc_ifflags & IFF_DEBUG)
			if_printf(sc->sc_ifp,
			    "capabilities 0x%08x enabled 0x%08x\n", cap, ena);
	}
}

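/*
 * Queue a link layer address change for a port.  The actual update is
 * deferred to lagg_port_setlladdr() on a taskqueue so that it runs
 * without the lagg lock held.
 */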
static void
lagg_port_lladdr(struct lagg_port *lp, uint8_t *lladdr)
{
	struct lagg_softc *sc = lp->lp_softc;
	struct ifnet *ifp = lp->lp_ifp;
	struct lagg_llq *llq;
	int pending = 0;

	LAGG_WLOCK_ASSERT(sc);

	if (lp->lp_detaching ||
	    memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0)
		return;

	/* Check to make sure it's not already queued to be changed */
	SLIST_FOREACH(llq, &sc->sc_llq_head, llq_entries) {
		if (llq->llq_ifp == ifp) {
			pending = 1;
			break;
		}
	}

	if (!pending) {
		llq = malloc(sizeof(struct lagg_llq), M_DEVBUF, M_NOWAIT);
		if (llq == NULL)	/* XXX what to do */
			return;
	}

	/* Update the lladdr even if pending, it may have changed */
	llq->llq_ifp = ifp;
	bcopy(lladdr, llq->llq_lladdr, ETHER_ADDR_LEN);

	if (!pending)
		SLIST_INSERT_HEAD(&sc->sc_llq_head, llq, llq_entries);

	taskqueue_enqueue(taskqueue_swi, &sc->sc_lladdr_task);
}

/*
 * Set the interface MAC address from a taskqueue to avoid a LOR.
 */
static void
lagg_port_setlladdr(void *arg, int pending)
{
	struct lagg_softc *sc = (struct lagg_softc *)arg;
	struct lagg_llq *llq, *head;
	struct ifnet *ifp;
	int error;

	/* Grab a local reference of the queue and remove it from the softc */
	LAGG_WLOCK(sc);
	head = SLIST_FIRST(&sc->sc_llq_head);
	SLIST_FIRST(&sc->sc_llq_head) = NULL;
	LAGG_WUNLOCK(sc);

	/*
	 * Traverse the queue and set the lladdr on each ifp. It is safe to do
	 * this unlocked as we have the only reference to it.
	 */
	for (llq = head; llq != NULL; llq = head) {
		ifp = llq->llq_ifp;

		/* Set the link layer address */
		CURVNET_SET(ifp->if_vnet);
		error = if_setlladdr(ifp, llq->llq_lladdr, ETHER_ADDR_LEN);
		CURVNET_RESTORE();
		if (error)
			printf("%s: setlladdr failed on %s\n", __func__,
			    ifp->if_xname);

		head = SLIST_NEXT(llq, llq_entries);
		free(llq, M_DEVBUF);
	}
}

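/*
 * Add the interface ifp as a new port on the lagg.  The first port
 * donates its MAC address and MTU; subsequent ports must match the
 * MTU and are reprogrammed to use the lagg's address.
 */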
static int
lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp)
{
	struct lagg_softc *sc_ptr;
	struct lagg_port *lp;
	int error = 0;

	LAGG_WLOCK_ASSERT(sc);

	/* Limit the maximal number of lagg ports */
	if (sc->sc_count >= LAGG_MAX_PORTS)
		return (ENOSPC);

	/* Check if port has already been associated with a lagg */
	if (ifp->if_lagg != NULL) {
		/* Port is already in the current lagg? */
		lp = (struct lagg_port *)ifp->if_lagg;
		if (lp->lp_softc == sc)
			return (EEXIST);
		return (EBUSY);
	}

	/* XXX Disallow non-ethernet interfaces (this should be any of 802) */
	if (ifp->if_type != IFT_ETHER)
		return (EPROTONOSUPPORT);

#ifdef INET6
	/*
	 * The member interface should not have inet6 addresses because
	 * two interfaces with a valid link-local scope zone must not be
	 * merged in any form.  This restriction is needed to prevent
	 * violation of the link-local scope zone.  An attempt to add a
	 * member interface which has inet6 addresses triggers removal of
	 * all inet6 addresses on the member interface.
	 */
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		if (in6ifa_llaonifp(lp->lp_ifp)) {
			in6_ifdetach(lp->lp_ifp);
			if_printf(sc->sc_ifp,
			    "IPv6 addresses on %s have been removed "
			    "before adding it as a member to prevent "
			    "IPv6 address scope violation.\n",
			    lp->lp_ifp->if_xname);
		}
	}
	if (in6ifa_llaonifp(ifp)) {
		in6_ifdetach(ifp);
		if_printf(sc->sc_ifp,
		    "IPv6 addresses on %s have been removed "
		    "before adding it as a member to prevent "
		    "IPv6 address scope violation.\n",
		    ifp->if_xname);
	}
#endif
	/* Allow the first Ethernet member to define the MTU */
	if (SLIST_EMPTY(&sc->sc_ports))
		sc->sc_ifp->if_mtu = ifp->if_mtu;
	else if (sc->sc_ifp->if_mtu != ifp->if_mtu) {
		if_printf(sc->sc_ifp, "invalid MTU for %s\n",
		    ifp->if_xname);
		return (EINVAL);
	}

	if ((lp = malloc(sizeof(struct lagg_port),
	    M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL)
		return (ENOMEM);

	/* Check if port is a stacked lagg */
	mtx_lock(&lagg_list_mtx);
	SLIST_FOREACH(sc_ptr, &lagg_list, sc_entries) {
		if (ifp == sc_ptr->sc_ifp) {
			mtx_unlock(&lagg_list_mtx);
			free(lp, M_DEVBUF);
			return (EINVAL);
			/* XXX disable stacking for the moment, it's untested */
#ifdef LAGG_PORT_STACKING
			lp->lp_flags |= LAGG_PORT_STACK;
			if (lagg_port_checkstacking(sc_ptr) >=
			    LAGG_MAX_STACKING) {
				mtx_unlock(&lagg_list_mtx);
				free(lp, M_DEVBUF);
				return (E2BIG);
			}
#endif
		}
	}
	mtx_unlock(&lagg_list_mtx);

	/* Change the interface type */
	lp->lp_iftype = ifp->if_type;
	ifp->if_type = IFT_IEEE8023ADLAG;
	ifp->if_lagg = lp;
	lp->lp_ioctl = ifp->if_ioctl;
	ifp->if_ioctl = lagg_port_ioctl;
	lp->lp_output = ifp->if_output;
	ifp->if_output = lagg_port_output;

	lp->lp_ifp = ifp;
	lp->lp_softc = sc;

	/* Save port link layer address */
	bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ETHER_ADDR_LEN);

	if (SLIST_EMPTY(&sc->sc_ports)) {
		sc->sc_primary = lp;
		lagg_lladdr(sc, IF_LLADDR(ifp));
	} else {
		/* Update link layer address for this port */
		lagg_port_lladdr(lp, IF_LLADDR(sc->sc_ifp));
	}

	/* Insert into the list of ports */
	SLIST_INSERT_HEAD(&sc->sc_ports, lp, lp_entries);
	sc->sc_count++;

	/* Update lagg capabilities */
	lagg_capabilities(sc);
	lagg_linkstate(sc);

	/* Add multicast addresses and interface flags to this port */
	lagg_ether_cmdmulti(lp, 1);
	lagg_setflags(lp, 1);

	if (sc->sc_port_create != NULL)
		error = (*sc->sc_port_create)(lp);
	if (error) {
		/* remove the port again, without calling sc_port_destroy */
		lagg_port_destroy(lp, 0);
		return (error);
	}

	return (error);
}

#ifdef LAGG_PORT_STACKING
static int
lagg_port_checkstacking(struct lagg_softc *sc)
{
	struct lagg_softc *sc_ptr;
	struct lagg_port *lp;
	int m = 0;

	LAGG_WLOCK_ASSERT(sc);

	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		if (lp->lp_flags & LAGG_PORT_STACK) {
			sc_ptr = (struct lagg_softc *)lp->lp_ifp->if_softc;
			m = MAX(m, lagg_port_checkstacking(sc_ptr));
		}
	}

	return (m + 1);
}
#endif

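/*
 * Remove a port from the lagg and restore the saved ifnet state.  If
 * runpd is set, the protocol's port destroy hook is invoked as well.
 */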
static int
lagg_port_destroy(struct lagg_port *lp, int runpd)
{
	struct lagg_softc *sc = lp->lp_softc;
	struct lagg_port *lp_ptr;
	struct lagg_llq *llq;
	struct ifnet *ifp = lp->lp_ifp;

	LAGG_WLOCK_ASSERT(sc);

	if (runpd && sc->sc_port_destroy != NULL)
		(*sc->sc_port_destroy)(lp);

	/*
	 * Remove multicast addresses and interface flags from this port and
	 * reset the MAC address, skip if the interface is being detached.
	 */
	if (!lp->lp_detaching) {
		lagg_ether_cmdmulti(lp, 0);
		lagg_setflags(lp, 0);
		lagg_port_lladdr(lp, lp->lp_lladdr);
	}

	/* Restore interface */
	ifp->if_type = lp->lp_iftype;
	ifp->if_ioctl = lp->lp_ioctl;
	ifp->if_output = lp->lp_output;
	ifp->if_lagg = NULL;

	/* Finally, remove the port from the lagg */
	SLIST_REMOVE(&sc->sc_ports, lp, lagg_port, lp_entries);
	sc->sc_count--;

	/* Update the primary interface */
	if (lp == sc->sc_primary) {
		uint8_t lladdr[ETHER_ADDR_LEN];

		if ((lp_ptr = SLIST_FIRST(&sc->sc_ports)) == NULL) {
			bzero(&lladdr, ETHER_ADDR_LEN);
		} else {
			bcopy(lp_ptr->lp_lladdr,
			    lladdr, ETHER_ADDR_LEN);
		}
		lagg_lladdr(sc, lladdr);
		sc->sc_primary = lp_ptr;

		/* Update link layer address for each port */
		SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries)
			lagg_port_lladdr(lp_ptr, lladdr);
	}

	/* Remove any pending lladdr changes from the queue */
	if (lp->lp_detaching) {
		SLIST_FOREACH(llq, &sc->sc_llq_head, llq_entries) {
			if (llq->llq_ifp == ifp) {
				SLIST_REMOVE(&sc->sc_llq_head, llq, lagg_llq,
				    llq_entries);
				free(llq, M_DEVBUF);
				break;	/* Only appears once */
			}
		}
	}

	if (lp->lp_ifflags)
		if_printf(ifp, "%s: lp_ifflags unclean\n", __func__);

	free(lp, M_DEVBUF);

	/* Update lagg capabilities */
	lagg_capabilities(sc);
	lagg_linkstate(sc);

	return (0);
}

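/*
 * Ioctl handler installed on member ports; intercepts the lagg-specific
 * requests and passes everything else to the port's original handler.
 */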
static int
lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	struct lagg_reqport *rp = (struct lagg_reqport *)data;
	struct lagg_softc *sc;
	struct lagg_port *lp = NULL;
	int error = 0;

	/* Should be checked by the caller */
	if (ifp->if_type != IFT_IEEE8023ADLAG ||
	    (lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL)
		goto fallback;

	switch (cmd) {
	case SIOCGLAGGPORT:
		if (rp->rp_portname[0] == '\0' ||
		    ifunit(rp->rp_portname) != ifp) {
			error = EINVAL;
			break;
		}

		LAGG_RLOCK(sc);
		if ((lp = ifp->if_lagg) == NULL || lp->lp_softc != sc) {
			error = ENOENT;
			LAGG_RUNLOCK(sc);
			break;
		}

		lagg_port2req(lp, rp);
		LAGG_RUNLOCK(sc);
		break;

	case SIOCSIFCAP:
		if (lp->lp_ioctl == NULL) {
			error = EINVAL;
			break;
		}
		error = (*lp->lp_ioctl)(ifp, cmd, data);
		if (error)
			break;

		/* Update lagg interface capabilities */
		LAGG_WLOCK(sc);
		lagg_capabilities(sc);
		LAGG_WUNLOCK(sc);
		break;

	case SIOCSIFMTU:
		/* Do not allow the MTU to be changed once joined */
		error = EINVAL;
		break;

	default:
		goto fallback;
	}

	return (error);

fallback:
	/* lp may still be NULL here if the ifnet was not a lagg port */
	if (lp != NULL && lp->lp_ioctl != NULL)
		return ((*lp->lp_ioctl)(ifp, cmd, data));

	return (EINVAL);
}

/*
 * For direct output to child ports.
 */
static int
lagg_port_output(struct ifnet *ifp, struct mbuf *m,
	struct sockaddr *dst, struct route *ro)
{
	struct lagg_port *lp = ifp->if_lagg;

	switch (dst->sa_family) {
		case pseudo_AF_HDRCMPLT:
		case AF_UNSPEC:
			return ((*lp->lp_output)(ifp, m, dst, ro));
	}

	/* drop any other frames */
	m_freem(m);
	return (ENETDOWN);
}

static void
lagg_port_ifdetach(void *arg __unused, struct ifnet *ifp)
{
	struct lagg_port *lp;
	struct lagg_softc *sc;

	if ((lp = ifp->if_lagg) == NULL)
		return;
	/* If the ifnet is just being renamed, don't do anything. */
	if (ifp->if_flags & IFF_RENAMING)
		return;

	sc = lp->lp_softc;

	LAGG_WLOCK(sc);
	lp->lp_detaching = 1;
	lagg_port_destroy(lp, 1);
	LAGG_WUNLOCK(sc);
}

static void
lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp)
{
	struct lagg_softc *sc = lp->lp_softc;

	strlcpy(rp->rp_ifname, sc->sc_ifname, sizeof(rp->rp_ifname));
	strlcpy(rp->rp_portname, lp->lp_ifp->if_xname, sizeof(rp->rp_portname));
	rp->rp_prio = lp->lp_prio;
	rp->rp_flags = lp->lp_flags;
	if (sc->sc_portreq != NULL)
		(*sc->sc_portreq)(lp, (caddr_t)&rp->rp_psc);

	/* Add protocol specific flags */
	switch (sc->sc_proto) {
		case LAGG_PROTO_FAILOVER:
			if (lp == sc->sc_primary)
				rp->rp_flags |= LAGG_PORT_MASTER;
			if (lp == lagg_link_active(sc, sc->sc_primary))
				rp->rp_flags |= LAGG_PORT_ACTIVE;
			break;

		case LAGG_PROTO_ROUNDROBIN:
		case LAGG_PROTO_LOADBALANCE:
		case LAGG_PROTO_ETHERCHANNEL:
			if (LAGG_PORTACTIVE(lp))
				rp->rp_flags |= LAGG_PORT_ACTIVE;
			break;

		case LAGG_PROTO_LACP:
			/* LACP has a different definition of active */
			if (lacp_isactive(lp))
				rp->rp_flags |= LAGG_PORT_ACTIVE;
			if (lacp_iscollecting(lp))
				rp->rp_flags |= LAGG_PORT_COLLECTING;
			if (lacp_isdistributing(lp))
				rp->rp_flags |= LAGG_PORT_DISTRIBUTING;
			break;
	}
}

static void
lagg_init(void *xsc)
{
	struct lagg_softc *sc = (struct lagg_softc *)xsc;
	struct lagg_port *lp;
	struct ifnet *ifp = sc->sc_ifp;

	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
		return;

	LAGG_WLOCK(sc);

	ifp->if_drv_flags |= IFF_DRV_RUNNING;
	/* Update the port lladdrs */
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lagg_port_lladdr(lp, IF_LLADDR(ifp));

	if (sc->sc_init != NULL)
		(*sc->sc_init)(sc);

	LAGG_WUNLOCK(sc);
}

static void
lagg_stop(struct lagg_softc *sc)
{
	struct ifnet *ifp = sc->sc_ifp;

	LAGG_WLOCK_ASSERT(sc);

	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
		return;

	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;

	if (sc->sc_stop != NULL)
		(*sc->sc_stop)(sc);
}

static int
lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
	struct lagg_reqall *ra = (struct lagg_reqall *)data;
	struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf;
	struct lagg_reqflags *rf = (struct lagg_reqflags *)data;
	struct ifreq *ifr = (struct ifreq *)data;
	struct lagg_port *lp;
	struct ifnet *tpif;
	struct thread *td = curthread;
	char *buf, *outbuf;
	int count, buflen, len, error = 0;

	bzero(&rpbuf, sizeof(rpbuf));

	switch (cmd) {
	case SIOCGLAGG:
		LAGG_RLOCK(sc);
		count = 0;
		SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
			count++;
		buflen = count * sizeof(struct lagg_reqport);
		LAGG_RUNLOCK(sc);

		outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO);

		LAGG_RLOCK(sc);
		ra->ra_proto = sc->sc_proto;
		if (sc->sc_req != NULL)
			(*sc->sc_req)(sc, (caddr_t)&ra->ra_psc);

		count = 0;
		buf = outbuf;
		len = min(ra->ra_size, buflen);
		SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
			if (len < sizeof(rpbuf))
				break;

			lagg_port2req(lp, &rpbuf);
			memcpy(buf, &rpbuf, sizeof(rpbuf));
			count++;
			buf += sizeof(rpbuf);
			len -= sizeof(rpbuf);
		}
		LAGG_RUNLOCK(sc);
		ra->ra_ports = count;
		ra->ra_size = count * sizeof(rpbuf);
		error = copyout(outbuf, ra->ra_port, ra->ra_size);
		free(outbuf, M_TEMP);
		break;
	case SIOCSLAGG:
		error = priv_check(td, PRIV_NET_LAGG);
		if (error)
			break;
		if (ra->ra_proto >= LAGG_PROTO_MAX) {
			error = EPROTONOSUPPORT;
			break;
		}
		LAGG_WLOCK(sc);
		if (sc->sc_proto != LAGG_PROTO_NONE) {
			/* Reset protocol first in case detach unlocks */
			sc->sc_proto = LAGG_PROTO_NONE;
			error = sc->sc_detach(sc);
			sc->sc_detach = NULL;
			sc->sc_start = NULL;
			sc->sc_input = NULL;
			sc->sc_port_create = NULL;
			sc->sc_port_destroy = NULL;
			sc->sc_linkstate = NULL;
			sc->sc_init = NULL;
			sc->sc_stop = NULL;
			sc->sc_lladdr = NULL;
			sc->sc_req = NULL;
			sc->sc_portreq = NULL;
		} else if (sc->sc_input != NULL) {
			/* Still detaching */
			error = EBUSY;
		}
		if (error != 0) {
			LAGG_WUNLOCK(sc);
			break;
		}
		for (int i = 0; i < (sizeof(lagg_protos) /
		    sizeof(lagg_protos[0])); i++) {
			if (lagg_protos[i].ti_proto == ra->ra_proto) {
				if (sc->sc_ifflags & IFF_DEBUG)
					printf("%s: using proto %u\n",
					    sc->sc_ifname,
					    lagg_protos[i].ti_proto);
				sc->sc_proto = lagg_protos[i].ti_proto;
				if (sc->sc_proto != LAGG_PROTO_NONE)
					error = lagg_protos[i].ti_attach(sc);
				LAGG_WUNLOCK(sc);
				return (error);
			}
		}
		LAGG_WUNLOCK(sc);
		error = EPROTONOSUPPORT;
		break;
	case SIOCGLAGGFLAGS:
		rf->rf_flags = sc->sc_flags;
		break;
	case SIOCSLAGGHASH:
		error = priv_check(td, PRIV_NET_LAGG);
		if (error)
			break;
		if ((rf->rf_flags & LAGG_F_HASHMASK) == 0) {
			error = EINVAL;
			break;
		}
		LAGG_WLOCK(sc);
		sc->sc_flags &= ~LAGG_F_HASHMASK;
		sc->sc_flags |= rf->rf_flags & LAGG_F_HASHMASK;
		LAGG_WUNLOCK(sc);
		break;
	case SIOCGLAGGPORT:
		if (rp->rp_portname[0] == '\0' ||
		    (tpif = ifunit(rp->rp_portname)) == NULL) {
			error = EINVAL;
			break;
		}

		LAGG_RLOCK(sc);
		if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL ||
		    lp->lp_softc != sc) {
			error = ENOENT;
			LAGG_RUNLOCK(sc);
			break;
		}

		lagg_port2req(lp, rp);
		LAGG_RUNLOCK(sc);
		break;
	case SIOCSLAGGPORT:
		error = priv_check(td, PRIV_NET_LAGG);
		if (error)
			break;
		if (rp->rp_portname[0] == '\0' ||
		    (tpif = ifunit(rp->rp_portname)) == NULL) {
			error = EINVAL;
			break;
		}
		LAGG_WLOCK(sc);
		error = lagg_port_create(sc, tpif);
		LAGG_WUNLOCK(sc);
		break;
	case SIOCSLAGGDELPORT:
		error = priv_check(td, PRIV_NET_LAGG);
		if (error)
			break;
		if (rp->rp_portname[0] == '\0' ||
		    (tpif = ifunit(rp->rp_portname)) == NULL) {
			error = EINVAL;
			break;
		}

		LAGG_WLOCK(sc);
		if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL ||
		    lp->lp_softc != sc) {
			error = ENOENT;
			LAGG_WUNLOCK(sc);
			break;
		}

		error = lagg_port_destroy(lp, 1);
		LAGG_WUNLOCK(sc);
		break;
	case SIOCSIFFLAGS:
		/* Set flags on ports too */
		LAGG_WLOCK(sc);
		SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
			lagg_setflags(lp, 1);
		}
		LAGG_WUNLOCK(sc);

		if (!(ifp->if_flags & IFF_UP) &&
		    (ifp->if_drv_flags & IFF_DRV_RUNNING)) {
			/*
			 * If interface is marked down and it is running,
			 * then stop and disable it.
			 */
			LAGG_WLOCK(sc);
			lagg_stop(sc);
			LAGG_WUNLOCK(sc);
		} else if ((ifp->if_flags & IFF_UP) &&
		    !(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
			/*
			 * If interface is marked up and it is stopped, then
			 * start it.
			 */
			(*ifp->if_init)(sc);
		}
		break;
	case SIOCADDMULTI:
	case SIOCDELMULTI:
		LAGG_WLOCK(sc);
		error = lagg_ether_setmulti(sc);
		LAGG_WUNLOCK(sc);
		break;
	case SIOCSIFMEDIA:
	case SIOCGIFMEDIA:
		error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd);
		break;

	case SIOCSIFCAP:
	case SIOCSIFMTU:
		/* Do not allow the MTU or caps to be directly changed */
		error = EINVAL;
		break;

	default:
		error = ether_ioctl(ifp, cmd, data);
		break;
	}
	return (error);
}

static int
lagg_ether_setmulti(struct lagg_softc *sc)
{
	struct lagg_port *lp;

	LAGG_WLOCK_ASSERT(sc);

	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		/* First, remove any existing filter entries. */
		lagg_ether_cmdmulti(lp, 0);
		/* copy all addresses from the lagg interface to the port */
		lagg_ether_cmdmulti(lp, 1);
	}
	return (0);
}

static int
lagg_ether_cmdmulti(struct lagg_port *lp, int set)
{
	struct lagg_softc *sc = lp->lp_softc;
	struct ifnet *ifp = lp->lp_ifp;
	struct ifnet *scifp = sc->sc_ifp;
	struct lagg_mc *mc;
	struct ifmultiaddr *ifma, *rifma = NULL;
	struct sockaddr_dl sdl;
	int error;

	LAGG_WLOCK_ASSERT(sc);

	bzero((char *)&sdl, sizeof(sdl));
	sdl.sdl_len = sizeof(sdl);
	sdl.sdl_family = AF_LINK;
	sdl.sdl_type = IFT_ETHER;
	sdl.sdl_alen = ETHER_ADDR_LEN;
	sdl.sdl_index = ifp->if_index;

	if (set) {
		TAILQ_FOREACH(ifma, &scifp->if_multiaddrs, ifma_link) {
			if (ifma->ifma_addr->sa_family != AF_LINK)
				continue;
			bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
			    LLADDR(&sdl), ETHER_ADDR_LEN);

			error = if_addmulti(ifp, (struct sockaddr *)&sdl, &rifma);
			if (error)
				return (error);
			mc = malloc(sizeof(struct lagg_mc), M_DEVBUF, M_NOWAIT);
			if (mc == NULL)
				return (ENOMEM);
			mc->mc_ifma = rifma;
			SLIST_INSERT_HEAD(&lp->lp_mc_head, mc, mc_entries);
		}
	} else {
		while ((mc = SLIST_FIRST(&lp->lp_mc_head)) != NULL) {
			SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries);
			if_delmulti_ifma(mc->mc_ifma);
			free(mc, M_DEVBUF);
		}
	}
	return (0);
}

/* Handle a ref counted flag that should be set on the lagg port as well */
static int
lagg_setflag(struct lagg_port *lp, int flag, int status,
	     int (*func)(struct ifnet *, int))
{
	struct lagg_softc *sc = lp->lp_softc;
	struct ifnet *scifp = sc->sc_ifp;
	struct ifnet *ifp = lp->lp_ifp;
	int error;

	LAGG_WLOCK_ASSERT(sc);

	status = status ? (scifp->if_flags & flag) : 0;
	/* Now "status" contains the flag value or 0 */

	/*
	 * See if the recorded port status is different from what
	 * we want it to be.  If it is, flip it.  We record port
	 * status in lp_ifflags so that we won't clear a port flag
	 * we haven't set.  In fact, we don't clear or set port
	 * flags directly, but get or release references to them.
	 * That's why we can be sure that recorded flags still are
	 * in accord with actual port flags.
	 */
	if (status != (lp->lp_ifflags & flag)) {
		error = (*func)(ifp, status);
		if (error)
			return (error);
		lp->lp_ifflags &= ~flag;
		lp->lp_ifflags |= status;
	}
	return (0);
}

/*
 * Handle IFF_* flags that require certain changes on the lagg port:
 * if "status" is true, update the port flags to match the lagg;
 * if "status" is false, forcibly clear the flags set on the port.
 */
static int
lagg_setflags(struct lagg_port *lp, int status)
{
	int error, i;

	for (i = 0; lagg_pflags[i].flag; i++) {
		error = lagg_setflag(lp, lagg_pflags[i].flag,
		    status, lagg_pflags[i].func);
		if (error)
			return (error);
	}
	return (0);
}

static int
lagg_transmit(struct ifnet *ifp, struct mbuf *m)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
	int error, len, mcast;

	len = m->m_pkthdr.len;
	mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;

	LAGG_RLOCK(sc);
	/* We need a Tx algorithm and at least one port */
	if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) {
		LAGG_RUNLOCK(sc);
		m_freem(m);
		ifp->if_oerrors++;
		return (ENXIO);
	}

	ETHER_BPF_MTAP(ifp, m);

	error = (*sc->sc_start)(sc, m);
	LAGG_RUNLOCK(sc);

	if (error == 0) {
		ifp->if_opackets++;
		ifp->if_omcasts += mcast;
		ifp->if_obytes += len;
	} else
		ifp->if_oerrors++;

	return (error);
}

/*
 * The ifp->if_qflush entry point for lagg(4) is a no-op.
 */
static void
lagg_qflush(struct ifnet *ifp __unused)
{
}

static struct mbuf *
lagg_input(struct ifnet *ifp, struct mbuf *m)
{
	struct lagg_port *lp = ifp->if_lagg;
	struct lagg_softc *sc = lp->lp_softc;
	struct ifnet *scifp = sc->sc_ifp;

	LAGG_RLOCK(sc);
	if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
	    (lp->lp_flags & LAGG_PORT_DISABLED) ||
	    sc->sc_proto == LAGG_PROTO_NONE) {
		LAGG_RUNLOCK(sc);
		m_freem(m);
		return (NULL);
	}

	ETHER_BPF_MTAP(scifp, m);

	m = (*sc->sc_input)(sc, lp, m);

	if (m != NULL) {
		scifp->if_ipackets++;
		scifp->if_ibytes += m->m_pkthdr.len;

		if (scifp->if_flags & IFF_MONITOR) {
			m_freem(m);
			m = NULL;
		}
	}

	LAGG_RUNLOCK(sc);
	return (m);
}

static int
lagg_media_change(struct ifnet *ifp)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;

	if (sc->sc_ifflags & IFF_DEBUG)
		printf("%s\n", __func__);

	/* Ignore */
	return (0);
}

static void
lagg_media_status(struct ifnet *ifp, struct ifmediareq *imr)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
	struct lagg_port *lp;

	imr->ifm_status = IFM_AVALID;
	imr->ifm_active = IFM_ETHER | IFM_AUTO;

	LAGG_RLOCK(sc);
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		if (LAGG_PORTACTIVE(lp))
			imr->ifm_status |= IFM_ACTIVE;
	}
	LAGG_RUNLOCK(sc);
}

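/*
 * Recompute the lagg link state (up if any port is up) and derive
 * if_baudrate from the member ports according to the active protocol.
 */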
static void
lagg_linkstate(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	int new_link = LINK_STATE_DOWN;
	uint64_t speed;

	/* Our link is considered up if at least one of our ports is active */
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		if (lp->lp_link_state == LINK_STATE_UP) {
			new_link = LINK_STATE_UP;
			break;
		}
	}
	if_link_state_change(sc->sc_ifp, new_link);

	/* Update if_baudrate to reflect the max possible speed */
	switch (sc->sc_proto) {
		case LAGG_PROTO_FAILOVER:
			sc->sc_ifp->if_baudrate = sc->sc_primary != NULL ?
			    sc->sc_primary->lp_ifp->if_baudrate : 0;
			break;
		case LAGG_PROTO_ROUNDROBIN:
		case LAGG_PROTO_LOADBALANCE:
		case LAGG_PROTO_ETHERCHANNEL:
			speed = 0;
			SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
				speed += lp->lp_ifp->if_baudrate;
			sc->sc_ifp->if_baudrate = speed;
			break;
		case LAGG_PROTO_LACP:
			/* LACP updates if_baudrate itself */
			break;
	}
}

static void
lagg_port_state(struct ifnet *ifp, int state)
{
	struct lagg_port *lp = (struct lagg_port *)ifp->if_lagg;
	struct lagg_softc *sc = NULL;

	if (lp != NULL)
		sc = lp->lp_softc;
	if (sc == NULL)
		return;

	LAGG_WLOCK(sc);
	lagg_linkstate(sc);
	if (sc->sc_linkstate != NULL)
		(*sc->sc_linkstate)(lp);
	LAGG_WUNLOCK(sc);
}

struct lagg_port *
lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp)
{
	struct lagg_port *lp_next, *rval = NULL;
	// int new_link = LINK_STATE_DOWN;

	LAGG_RLOCK_ASSERT(sc);
	/*
	 * Search for a port which reports an active link state.
	 */

	if (lp == NULL)
		goto search;
	if (LAGG_PORTACTIVE(lp)) {
		rval = lp;
		goto found;
	}
	if ((lp_next = SLIST_NEXT(lp, lp_entries)) != NULL &&
	    LAGG_PORTACTIVE(lp_next)) {
		rval = lp_next;
		goto found;
	}

search:
	SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) {
		if (LAGG_PORTACTIVE(lp_next)) {
			rval = lp_next;
			goto found;
		}
	}

found:
	if (rval != NULL) {
		/*
		 * The IEEE 802.1D standard assumes that a lagg with
		 * multiple ports is always full duplex. This is valid
		 * for load sharing laggs and if at least two links
		 * are active. Unfortunately, checking the latter would
		 * be too expensive at this point.
		 XXX
		if ((sc->sc_capabilities & IFCAP_LAGG_FULLDUPLEX) &&
		    (sc->sc_count > 1))
			new_link = LINK_STATE_FULL_DUPLEX;
		else
			new_link = rval->lp_link_state;
		 */
	}

	return (rval);
}

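/*
 * Return a pointer to a contiguous header at the given offset.  If the
 * header spans mbufs it is copied into the caller-supplied buffer.
 */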
static const void *
lagg_gethdr(struct mbuf *m, u_int off, u_int len, void *buf)
{
	if (m->m_pkthdr.len < (off + len)) {
		return (NULL);
	} else if (m->m_len < (off + len)) {
		m_copydata(m, off, len, buf);
		return (buf);
	}
	return (mtod(m, char *) + off);
}

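/*
 * Hash the Ethernet, IP and port/flow-label fields of the frame, as
 * enabled by the LAGG_F_HASHL[234] flags, into a 32 bit value used by
 * the load sharing protocols to select an outgoing port.
 */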
uint32_t
lagg_hashmbuf(struct lagg_softc *sc, struct mbuf *m, uint32_t key)
{
	uint16_t etype;
	uint32_t p = key;
	int off;
	struct ether_header *eh;
	const struct ether_vlan_header *vlan;
#ifdef INET
	const struct ip *ip;
	const uint32_t *ports;
	int iphlen;
#endif
#ifdef INET6
	const struct ip6_hdr *ip6;
	uint32_t flow;
#endif
	union {
#ifdef INET
		struct ip ip;
#endif
#ifdef INET6
		struct ip6_hdr ip6;
#endif
		struct ether_vlan_header vlan;
		uint32_t port;
	} buf;

	off = sizeof(*eh);
	if (m->m_len < off)
		goto out;
	eh = mtod(m, struct ether_header *);
	etype = ntohs(eh->ether_type);
	if (sc->sc_flags & LAGG_F_HASHL2) {
		p = hash32_buf(&eh->ether_shost, ETHER_ADDR_LEN, p);
		p = hash32_buf(&eh->ether_dhost, ETHER_ADDR_LEN, p);
	}

	/* Special handling for encapsulating VLAN frames */
	if ((m->m_flags & M_VLANTAG) && (sc->sc_flags & LAGG_F_HASHL2)) {
		p = hash32_buf(&m->m_pkthdr.ether_vtag,
		    sizeof(m->m_pkthdr.ether_vtag), p);
	} else if (etype == ETHERTYPE_VLAN) {
		vlan = lagg_gethdr(m, off, sizeof(*vlan), &buf);
		if (vlan == NULL)
			goto out;

		if (sc->sc_flags & LAGG_F_HASHL2)
			p = hash32_buf(&vlan->evl_tag, sizeof(vlan->evl_tag), p);
		etype = ntohs(vlan->evl_proto);
		off += sizeof(*vlan) - sizeof(*eh);
	}

	switch (etype) {
#ifdef INET
	case ETHERTYPE_IP:
		ip = lagg_gethdr(m, off, sizeof(*ip), &buf);
		if (ip == NULL)
			goto out;

		if (sc->sc_flags & LAGG_F_HASHL3) {
			p = hash32_buf(&ip->ip_src, sizeof(struct in_addr), p);
			p = hash32_buf(&ip->ip_dst, sizeof(struct in_addr), p);
		}
		if (!(sc->sc_flags & LAGG_F_HASHL4))
			break;
		switch (ip->ip_p) {
			case IPPROTO_TCP:
			case IPPROTO_UDP:
			case IPPROTO_SCTP:
				iphlen = ip->ip_hl << 2;
				if (iphlen < sizeof(*ip))
					break;
				off += iphlen;
				ports = lagg_gethdr(m, off, sizeof(*ports), &buf);
				if (ports == NULL)
					break;
				p = hash32_buf(ports, sizeof(*ports), p);
				break;
		}
		break;
#endif
#ifdef INET6
	case ETHERTYPE_IPV6:
		if (!(sc->sc_flags & LAGG_F_HASHL3))
			break;
		ip6 = lagg_gethdr(m, off, sizeof(*ip6), &buf);
		if (ip6 == NULL)
			goto out;

		p = hash32_buf(&ip6->ip6_src, sizeof(struct in6_addr), p);
		p = hash32_buf(&ip6->ip6_dst, sizeof(struct in6_addr), p);
		flow = ip6->ip6_flow & IPV6_FLOWLABEL_MASK;
		p = hash32_buf(&flow, sizeof(flow), p);	/* IPv6 flow label */
		break;
#endif
	}
out:
	return (p);
}

int
lagg_enqueue(struct ifnet *ifp, struct mbuf *m)
{

	return (ifp->if_transmit)(ifp, m);
}

/*
 * Simple round robin aggregation
 */

static int
lagg_rr_attach(struct lagg_softc *sc)
{
	sc->sc_detach = lagg_rr_detach;
	sc->sc_start = lagg_rr_start;
	sc->sc_input = lagg_rr_input;
	sc->sc_port_create = NULL;
	sc->sc_capabilities = IFCAP_LAGG_FULLDUPLEX;
	sc->sc_seq = 0;

	return (0);
}

static int
lagg_rr_detach(struct lagg_softc *sc)
{
	return (0);
}

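/*
 * Pick the output port by walking the port list to the slot selected
 * by an atomically incremented sequence number modulo the port count.
 */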
static int
lagg_rr_start(struct lagg_softc *sc, struct mbuf *m)
{
	struct lagg_port *lp;
	uint32_t p;

	p = atomic_fetchadd_32(&sc->sc_seq, 1);
	p %= sc->sc_count;
	lp = SLIST_FIRST(&sc->sc_ports);
	while (p--)
		lp = SLIST_NEXT(lp, lp_entries);

	/*
	 * Check the port's link state. This will return the next active
	 * port if the link is down or the port is NULL.
	 */
	if ((lp = lagg_link_active(sc, lp)) == NULL) {
		m_freem(m);
		return (ENETDOWN);
	}

	/* Send mbuf */
	return (lagg_enqueue(lp->lp_ifp, m));
}

static struct mbuf *
lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
{
	struct ifnet *ifp = sc->sc_ifp;

	/* Just pass in the packet to our lagg device */
	m->m_pkthdr.rcvif = ifp;

	return (m);
}

/*
 * Active failover
 */

static int
lagg_fail_attach(struct lagg_softc *sc)
{
	sc->sc_detach = lagg_fail_detach;
	sc->sc_start = lagg_fail_start;
	sc->sc_input = lagg_fail_input;
	sc->sc_port_create = NULL;
	sc->sc_port_destroy = NULL;

	return (0);
}

static int
lagg_fail_detach(struct lagg_softc *sc)
{
	return (0);
}

static int
lagg_fail_start(struct lagg_softc *sc, struct mbuf *m)
{
	struct lagg_port *lp;

	/* Use the master port if active or the next available port */
	if ((lp = lagg_link_active(sc, sc->sc_primary)) == NULL) {
		m_freem(m);
		return (ENETDOWN);
	}

	/* Send mbuf */
	return (lagg_enqueue(lp->lp_ifp, m));
}

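/*
 * In failover mode, frames are normally only accepted from the primary
 * port (or from any port if net.link.lagg.failover_rx_all is set); if
 * the primary link is down, the currently active port is accepted.
 */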
static struct mbuf *
lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
{
	struct ifnet *ifp = sc->sc_ifp;
	struct lagg_port *tmp_tp;

	if (lp == sc->sc_primary || lagg_failover_rx_all) {
		m->m_pkthdr.rcvif = ifp;
		return (m);
	}

	if (!LAGG_PORTACTIVE(sc->sc_primary)) {
		tmp_tp = lagg_link_active(sc, sc->sc_primary);
		/*
		 * If tmp_tp is null, we've received a packet when all
		 * our links are down. Weird, but process it anyway.
		 */
		if (tmp_tp == NULL || tmp_tp == lp) {
			m->m_pkthdr.rcvif = ifp;
			return (m);
		}
	}

	m_freem(m);
	return (NULL);
}

/*
 * Loadbalancing
 */

static int
lagg_lb_attach(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	struct lagg_lb *lb;

	if ((lb = (struct lagg_lb *)malloc(sizeof(struct lagg_lb),
	    M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL)
		return (ENOMEM);

	sc->sc_detach = lagg_lb_detach;
	sc->sc_start = lagg_lb_start;
	sc->sc_input = lagg_lb_input;
	sc->sc_port_create = lagg_lb_port_create;
	sc->sc_port_destroy = lagg_lb_port_destroy;
	sc->sc_capabilities = IFCAP_LAGG_FULLDUPLEX;

	lb->lb_key = arc4random();
	sc->sc_psc = (caddr_t)lb;

	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lagg_lb_port_create(lp);

	return (0);
}

static int
lagg_lb_detach(struct lagg_softc *sc)
{
	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
	if (lb != NULL)
		free(lb, M_DEVBUF);
	return (0);
}

static int
lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp)
{
	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
	struct lagg_port *lp_next;
	int i = 0;

	bzero(&lb->lb_ports, sizeof(lb->lb_ports));
	SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) {
		if (lp_next == lp)
			continue;
		if (i >= LAGG_MAX_PORTS)
			return (EINVAL);
		if (sc->sc_ifflags & IFF_DEBUG)
			printf("%s: port %s at index %d\n",
			    sc->sc_ifname, lp_next->lp_ifname, i);
		lb->lb_ports[i++] = lp_next;
	}

	return (0);
}

static int
lagg_lb_port_create(struct lagg_port *lp)
{
	struct lagg_softc *sc = lp->lp_softc;
	return (lagg_lb_porttable(sc, NULL));
}

static void
lagg_lb_port_destroy(struct lagg_port *lp)
{
	struct lagg_softc *sc = lp->lp_softc;
	lagg_lb_porttable(sc, lp);
}

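/*
 * Select the output port from the port table, using the hardware
 * supplied flow id when available and enabled, otherwise a hash of
 * the frame headers.
 */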
static int
lagg_lb_start(struct lagg_softc *sc, struct mbuf *m)
{
	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
	struct lagg_port *lp = NULL;
	uint32_t p = 0;

	if (sc->use_flowid && (m->m_flags & M_FLOWID))
		p = m->m_pkthdr.flowid;
	else
		p = lagg_hashmbuf(sc, m, lb->lb_key);
	p %= sc->sc_count;
	lp = lb->lb_ports[p];

	/*
	 * Check the port's link state. This will return the next active
	 * port if the link is down or the port is NULL.
	 */
	if ((lp = lagg_link_active(sc, lp)) == NULL) {
		m_freem(m);
		return (ENETDOWN);
	}

	/* Send mbuf */
	return (lagg_enqueue(lp->lp_ifp, m));
}

static struct mbuf *
lagg_lb_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
{
	struct ifnet *ifp = sc->sc_ifp;

	/* Just pass in the packet to our lagg device */
	m->m_pkthdr.rcvif = ifp;

	return (m);
}

/*
 * 802.3ad LACP
 */

static int
lagg_lacp_attach(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	int error;

	sc->sc_detach = lagg_lacp_detach;
	sc->sc_port_create = lacp_port_create;
	sc->sc_port_destroy = lacp_port_destroy;
	sc->sc_linkstate = lacp_linkstate;
	sc->sc_start = lagg_lacp_start;
	sc->sc_input = lagg_lacp_input;
	sc->sc_init = lacp_init;
	sc->sc_stop = lacp_stop;
	sc->sc_lladdr = lagg_lacp_lladdr;
	sc->sc_req = lacp_req;
	sc->sc_portreq = lacp_portreq;

	error = lacp_attach(sc);
	if (error)
		return (error);

	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lacp_port_create(lp);

	return (error);
}

static int
lagg_lacp_detach(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	int error;

	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lacp_port_destroy(lp);

	/* unlocking is safe here */
	LAGG_WUNLOCK(sc);
	error = lacp_detach(sc);
	LAGG_WLOCK(sc);

	return (error);
}

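/*
 * Called when the lagg MAC address changes; recreate the LACP ports so
 * the protocol state is rebuilt with the new address.
 */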
static void
lagg_lacp_lladdr(struct lagg_softc *sc)
{
	struct lagg_port *lp;

	/* purge all the lacp ports */
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lacp_port_destroy(lp);

	/* add them back in */
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lacp_port_create(lp);
}

static int
lagg_lacp_start(struct lagg_softc *sc, struct mbuf *m)
{
	struct lagg_port *lp;

	lp = lacp_select_tx_port(sc, m);
	if (lp == NULL) {
		m_freem(m);
		return (ENETDOWN);
	}

	/* Send mbuf */
	return (lagg_enqueue(lp->lp_ifp, m));
}

static struct mbuf *
lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
{
	struct ifnet *ifp = sc->sc_ifp;
	struct ether_header *eh;
	u_short etype;

	eh = mtod(m, struct ether_header *);
	etype = ntohs(eh->ether_type);

	/* Tap off LACP control messages */
	if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_SLOW) {
		m = lacp_input(lp, m);
		if (m == NULL)
			return (NULL);
	}

	/*
	 * If the port is not collecting or not in the active aggregator then
	 * free and return.
	 */
	if (lacp_iscollecting(lp) == 0 || lacp_isactive(lp) == 0) {
		m_freem(m);
		return (NULL);
	}

	m->m_pkthdr.rcvif = ifp;
	return (m);
}