1/*-
2 * Copyright (c) 2014, Bryan Venteicher <bryanv@FreeBSD.org>
3 * All rights reserved.
4 * Copyright (c) 2020, Chelsio Communications.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice unmodified, this list of conditions, and the following
11 *    disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "opt_inet.h"
29#include "opt_inet6.h"
30
31#include <sys/param.h>
32#include <sys/eventhandler.h>
33#include <sys/kernel.h>
34#include <sys/lock.h>
35#include <sys/hash.h>
36#include <sys/malloc.h>
37#include <sys/mbuf.h>
38#include <sys/module.h>
39#include <sys/refcount.h>
40#include <sys/rmlock.h>
41#include <sys/priv.h>
42#include <sys/proc.h>
43#include <sys/queue.h>
44#include <sys/sbuf.h>
45#include <sys/socket.h>
46#include <sys/socketvar.h>
47#include <sys/sockio.h>
48#include <sys/sysctl.h>
49#include <sys/systm.h>
50
51#include <net/bpf.h>
52#include <net/ethernet.h>
53#include <net/if.h>
54#include <net/if_var.h>
55#include <net/if_private.h>
56#include <net/if_clone.h>
57#include <net/if_dl.h>
58#include <net/if_media.h>
59#include <net/if_types.h>
60#include <net/if_vxlan.h>
61#include <net/netisr.h>
62#include <net/route.h>
63#include <net/route/nhop.h>
64
65#include <netinet/in.h>
66#include <netinet/in_systm.h>
67#include <netinet/in_var.h>
68#include <netinet/in_pcb.h>
69#include <netinet/ip.h>
70#include <netinet/ip6.h>
71#include <netinet/ip_var.h>
72#include <netinet/udp.h>
73#include <netinet/udp_var.h>
74#include <netinet/in_fib.h>
75#include <netinet6/in6_fib.h>
76
77#include <netinet6/ip6_var.h>
78#include <netinet6/scope6_var.h>
79
80struct vxlan_softc;
81LIST_HEAD(vxlan_softc_head, vxlan_softc);
82
83struct sx vxlan_sx;
84SX_SYSINIT(vxlan, &vxlan_sx, "VXLAN global start/stop lock");
85
86struct vxlan_socket_mc_info {
87	union vxlan_sockaddr		 vxlsomc_saddr;
88	union vxlan_sockaddr		 vxlsomc_gaddr;
89	int				 vxlsomc_ifidx;
90	int				 vxlsomc_users;
91};
92
/*
 * The maximum MTU of an encapsulated Ethernet frame within an IPv4/UDP
 * packet.
 */
96#define VXLAN_MAX_MTU	(IP_MAXPACKET - \
97		60 /* Maximum IPv4 header len */ - \
98		sizeof(struct udphdr) - \
99		sizeof(struct vxlan_header) - \
100		ETHER_HDR_LEN - ETHER_CRC_LEN - ETHER_VLAN_ENCAP_LEN)
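/*
 * Worked example, for reference only (assuming the usual sizes: a maximal
 * 60-byte IPv4 header, 8-byte UDP header, 8-byte VXLAN header, 14-byte
 * Ethernet header, 4-byte FCS and 4-byte 802.1Q tag):
 *
 *	VXLAN_MAX_MTU = 65535 - 60 - 8 - 8 - 14 - 4 - 4 = 65437
 */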
101#define VXLAN_BASIC_IFCAPS (IFCAP_LINKSTATE | IFCAP_JUMBO_MTU)
102
103#define VXLAN_SO_MC_MAX_GROUPS		32
104
105#define VXLAN_SO_VNI_HASH_SHIFT		6
106#define VXLAN_SO_VNI_HASH_SIZE		(1 << VXLAN_SO_VNI_HASH_SHIFT)
107#define VXLAN_SO_VNI_HASH(_vni)		((_vni) % VXLAN_SO_VNI_HASH_SIZE)
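/*
 * Illustrative example: with the 64-bucket table above, an arbitrary VNI
 * such as 5001 lands in bucket 5001 % 64 = 9.
 */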
108
109struct vxlan_socket {
110	struct socket			*vxlso_sock;
111	struct rmlock			 vxlso_lock;
112	u_int				 vxlso_refcnt;
113	union vxlan_sockaddr		 vxlso_laddr;
114	LIST_ENTRY(vxlan_socket)	 vxlso_entry;
115	struct vxlan_softc_head		 vxlso_vni_hash[VXLAN_SO_VNI_HASH_SIZE];
116	struct vxlan_socket_mc_info	 vxlso_mc[VXLAN_SO_MC_MAX_GROUPS];
117};
118
119#define VXLAN_SO_RLOCK(_vso, _p)	rm_rlock(&(_vso)->vxlso_lock, (_p))
120#define VXLAN_SO_RUNLOCK(_vso, _p)	rm_runlock(&(_vso)->vxlso_lock, (_p))
121#define VXLAN_SO_WLOCK(_vso)		rm_wlock(&(_vso)->vxlso_lock)
122#define VXLAN_SO_WUNLOCK(_vso)		rm_wunlock(&(_vso)->vxlso_lock)
123#define VXLAN_SO_LOCK_ASSERT(_vso) \
124    rm_assert(&(_vso)->vxlso_lock, RA_LOCKED)
125#define VXLAN_SO_LOCK_WASSERT(_vso) \
126    rm_assert(&(_vso)->vxlso_lock, RA_WLOCKED)
127
128#define VXLAN_SO_ACQUIRE(_vso)		refcount_acquire(&(_vso)->vxlso_refcnt)
129#define VXLAN_SO_RELEASE(_vso)		refcount_release(&(_vso)->vxlso_refcnt)
130
131struct vxlan_ftable_entry {
132	LIST_ENTRY(vxlan_ftable_entry)	 vxlfe_hash;
133	uint16_t			 vxlfe_flags;
134	uint8_t				 vxlfe_mac[ETHER_ADDR_LEN];
135	union vxlan_sockaddr		 vxlfe_raddr;
136	time_t				 vxlfe_expire;
137};
138
139#define VXLAN_FE_FLAG_DYNAMIC		0x01
140#define VXLAN_FE_FLAG_STATIC		0x02
141
142#define VXLAN_FE_IS_DYNAMIC(_fe) \
143    ((_fe)->vxlfe_flags & VXLAN_FE_FLAG_DYNAMIC)
144
145#define VXLAN_SC_FTABLE_SHIFT		9
146#define VXLAN_SC_FTABLE_SIZE		(1 << VXLAN_SC_FTABLE_SHIFT)
147#define VXLAN_SC_FTABLE_MASK		(VXLAN_SC_FTABLE_SIZE - 1)
148#define VXLAN_SC_FTABLE_HASH(_sc, _mac)	\
149    (vxlan_mac_hash(_sc, _mac) % VXLAN_SC_FTABLE_SIZE)
150
151LIST_HEAD(vxlan_ftable_head, vxlan_ftable_entry);
152
153struct vxlan_statistics {
154	uint32_t	ftable_nospace;
155	uint32_t	ftable_lock_upgrade_failed;
156	counter_u64_t	txcsum;
157	counter_u64_t	tso;
158	counter_u64_t	rxcsum;
159};
160
161struct vxlan_softc {
162	struct ifnet			*vxl_ifp;
163	int				 vxl_reqcap;
164	u_int				 vxl_fibnum;
165	struct vxlan_socket		*vxl_sock;
166	uint32_t			 vxl_vni;
167	union vxlan_sockaddr		 vxl_src_addr;
168	union vxlan_sockaddr		 vxl_dst_addr;
169	uint32_t			 vxl_flags;
170#define VXLAN_FLAG_INIT		0x0001
171#define VXLAN_FLAG_TEARDOWN	0x0002
172#define VXLAN_FLAG_LEARN	0x0004
173#define VXLAN_FLAG_USER_MTU	0x0008
174
175	uint32_t			 vxl_port_hash_key;
176	uint16_t			 vxl_min_port;
177	uint16_t			 vxl_max_port;
178	uint8_t				 vxl_ttl;
179
180	/* Lookup table from MAC address to forwarding entry. */
181	uint32_t			 vxl_ftable_cnt;
182	uint32_t			 vxl_ftable_max;
183	uint32_t			 vxl_ftable_timeout;
184	uint32_t			 vxl_ftable_hash_key;
185	struct vxlan_ftable_head	*vxl_ftable;
186
187	/* Derived from vxl_dst_addr. */
188	struct vxlan_ftable_entry	 vxl_default_fe;
189
190	struct ip_moptions		*vxl_im4o;
191	struct ip6_moptions		*vxl_im6o;
192
193	struct rmlock			 vxl_lock;
194	volatile u_int			 vxl_refcnt;
195
196	int				 vxl_unit;
197	int				 vxl_vso_mc_index;
198	struct vxlan_statistics		 vxl_stats;
199	struct sysctl_oid		*vxl_sysctl_node;
200	struct sysctl_ctx_list		 vxl_sysctl_ctx;
201	struct callout			 vxl_callout;
202	struct ether_addr		 vxl_hwaddr;
203	int				 vxl_mc_ifindex;
204	struct ifnet			*vxl_mc_ifp;
205	struct ifmedia 			 vxl_media;
206	char				 vxl_mc_ifname[IFNAMSIZ];
207	LIST_ENTRY(vxlan_softc)		 vxl_entry;
208	LIST_ENTRY(vxlan_softc)		 vxl_ifdetach_list;
209
210	/* For rate limiting errors on the tx fast path. */
211	struct timeval err_time;
212	int err_pps;
213};
214
215#define VXLAN_RLOCK(_sc, _p)	rm_rlock(&(_sc)->vxl_lock, (_p))
216#define VXLAN_RUNLOCK(_sc, _p)	rm_runlock(&(_sc)->vxl_lock, (_p))
217#define VXLAN_WLOCK(_sc)	rm_wlock(&(_sc)->vxl_lock)
218#define VXLAN_WUNLOCK(_sc)	rm_wunlock(&(_sc)->vxl_lock)
219#define VXLAN_LOCK_WOWNED(_sc)	rm_wowned(&(_sc)->vxl_lock)
220#define VXLAN_LOCK_ASSERT(_sc)	rm_assert(&(_sc)->vxl_lock, RA_LOCKED)
221#define VXLAN_LOCK_WASSERT(_sc) rm_assert(&(_sc)->vxl_lock, RA_WLOCKED)
222#define VXLAN_UNLOCK(_sc, _p) do {		\
223    if (VXLAN_LOCK_WOWNED(_sc))			\
224	VXLAN_WUNLOCK(_sc);			\
225    else					\
226	VXLAN_RUNLOCK(_sc, _p);			\
227} while (0)
228
229#define VXLAN_ACQUIRE(_sc)	refcount_acquire(&(_sc)->vxl_refcnt)
230#define VXLAN_RELEASE(_sc)	refcount_release(&(_sc)->vxl_refcnt)
231
232#define	satoconstsin(sa)	((const struct sockaddr_in *)(sa))
233#define	satoconstsin6(sa)	((const struct sockaddr_in6 *)(sa))
234
235struct vxlanudphdr {
236	struct udphdr		vxlh_udp;
237	struct vxlan_header	vxlh_hdr;
238} __packed;
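/*
 * For reference, the on-wire encapsulation described by RFC 7348 is:
 *
 *	outer IP header | UDP header | VXLAN header | inner Ethernet frame
 *
 * struct vxlanudphdr above covers the UDP and VXLAN portions that the
 * encapsulation path prepends before the outer IP header is added.
 */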
239
240static int	vxlan_ftable_addr_cmp(const uint8_t *, const uint8_t *);
241static void	vxlan_ftable_init(struct vxlan_softc *);
242static void	vxlan_ftable_fini(struct vxlan_softc *);
243static void	vxlan_ftable_flush(struct vxlan_softc *, int);
244static void	vxlan_ftable_expire(struct vxlan_softc *);
245static int	vxlan_ftable_update_locked(struct vxlan_softc *,
246		    const union vxlan_sockaddr *, const uint8_t *,
247		    struct rm_priotracker *);
248static int	vxlan_ftable_learn(struct vxlan_softc *,
249		    const struct sockaddr *, const uint8_t *);
250static int	vxlan_ftable_sysctl_dump(SYSCTL_HANDLER_ARGS);
251
252static struct vxlan_ftable_entry *
253		vxlan_ftable_entry_alloc(void);
254static void	vxlan_ftable_entry_free(struct vxlan_ftable_entry *);
255static void	vxlan_ftable_entry_init(struct vxlan_softc *,
256		    struct vxlan_ftable_entry *, const uint8_t *,
257		    const struct sockaddr *, uint32_t);
258static void	vxlan_ftable_entry_destroy(struct vxlan_softc *,
259		    struct vxlan_ftable_entry *);
260static int	vxlan_ftable_entry_insert(struct vxlan_softc *,
261		    struct vxlan_ftable_entry *);
262static struct vxlan_ftable_entry *
263		vxlan_ftable_entry_lookup(struct vxlan_softc *,
264		    const uint8_t *);
265static void	vxlan_ftable_entry_dump(struct vxlan_ftable_entry *,
266		    struct sbuf *);
267
268static struct vxlan_socket *
269		vxlan_socket_alloc(const union vxlan_sockaddr *);
270static void	vxlan_socket_destroy(struct vxlan_socket *);
271static void	vxlan_socket_release(struct vxlan_socket *);
272static struct vxlan_socket *
273		vxlan_socket_lookup(union vxlan_sockaddr *vxlsa);
274static void	vxlan_socket_insert(struct vxlan_socket *);
275static int	vxlan_socket_init(struct vxlan_socket *, struct ifnet *);
276static int	vxlan_socket_bind(struct vxlan_socket *, struct ifnet *);
277static int	vxlan_socket_create(struct ifnet *, int,
278		    const union vxlan_sockaddr *, struct vxlan_socket **);
279static void	vxlan_socket_ifdetach(struct vxlan_socket *,
280		    struct ifnet *, struct vxlan_softc_head *);
281
282static struct vxlan_socket *
283		vxlan_socket_mc_lookup(const union vxlan_sockaddr *);
284static int	vxlan_sockaddr_mc_info_match(
285		    const struct vxlan_socket_mc_info *,
286		    const union vxlan_sockaddr *,
287		    const union vxlan_sockaddr *, int);
288static int	vxlan_socket_mc_join_group(struct vxlan_socket *,
289		    const union vxlan_sockaddr *, const union vxlan_sockaddr *,
290		    int *, union vxlan_sockaddr *);
291static int	vxlan_socket_mc_leave_group(struct vxlan_socket *,
292		    const union vxlan_sockaddr *,
293		    const union vxlan_sockaddr *, int);
294static int	vxlan_socket_mc_add_group(struct vxlan_socket *,
295		    const union vxlan_sockaddr *, const union vxlan_sockaddr *,
296		    int, int *);
297static void	vxlan_socket_mc_release_group_by_idx(struct vxlan_socket *,
298		    int);
299
300static struct vxlan_softc *
301		vxlan_socket_lookup_softc_locked(struct vxlan_socket *,
302		    uint32_t);
303static struct vxlan_softc *
304		vxlan_socket_lookup_softc(struct vxlan_socket *, uint32_t);
305static int	vxlan_socket_insert_softc(struct vxlan_socket *,
306		    struct vxlan_softc *);
307static void	vxlan_socket_remove_softc(struct vxlan_socket *,
308		    struct vxlan_softc *);
309
310static struct ifnet *
311		vxlan_multicast_if_ref(struct vxlan_softc *, int);
312static void	vxlan_free_multicast(struct vxlan_softc *);
313static int	vxlan_setup_multicast_interface(struct vxlan_softc *);
314
315static int	vxlan_setup_multicast(struct vxlan_softc *);
316static int	vxlan_setup_socket(struct vxlan_softc *);
317#ifdef INET6
318static void	vxlan_setup_zero_checksum_port(struct vxlan_softc *);
319#endif
320static void	vxlan_setup_interface_hdrlen(struct vxlan_softc *);
321static int	vxlan_valid_init_config(struct vxlan_softc *);
322static void	vxlan_init_wait(struct vxlan_softc *);
323static void	vxlan_init_complete(struct vxlan_softc *);
324static void	vxlan_init(void *);
325static void	vxlan_release(struct vxlan_softc *);
326static void	vxlan_teardown_wait(struct vxlan_softc *);
327static void	vxlan_teardown_complete(struct vxlan_softc *);
328static void	vxlan_teardown_locked(struct vxlan_softc *);
329static void	vxlan_teardown(struct vxlan_softc *);
330static void	vxlan_ifdetach(struct vxlan_softc *, struct ifnet *,
331		    struct vxlan_softc_head *);
332static void	vxlan_timer(void *);
333
334static int	vxlan_ctrl_get_config(struct vxlan_softc *, void *);
335static int	vxlan_ctrl_set_vni(struct vxlan_softc *, void *);
336static int	vxlan_ctrl_set_local_addr(struct vxlan_softc *, void *);
337static int	vxlan_ctrl_set_remote_addr(struct vxlan_softc *, void *);
338static int	vxlan_ctrl_set_local_port(struct vxlan_softc *, void *);
339static int	vxlan_ctrl_set_remote_port(struct vxlan_softc *, void *);
340static int	vxlan_ctrl_set_port_range(struct vxlan_softc *, void *);
341static int	vxlan_ctrl_set_ftable_timeout(struct vxlan_softc *, void *);
342static int	vxlan_ctrl_set_ftable_max(struct vxlan_softc *, void *);
static int	vxlan_ctrl_set_multicast_if(struct vxlan_softc *, void *);
344static int	vxlan_ctrl_set_ttl(struct vxlan_softc *, void *);
345static int	vxlan_ctrl_set_learn(struct vxlan_softc *, void *);
346static int	vxlan_ctrl_ftable_entry_add(struct vxlan_softc *, void *);
347static int	vxlan_ctrl_ftable_entry_rem(struct vxlan_softc *, void *);
348static int	vxlan_ctrl_flush(struct vxlan_softc *, void *);
349static int	vxlan_ioctl_drvspec(struct vxlan_softc *,
350		    struct ifdrv *, int);
351static int	vxlan_ioctl_ifflags(struct vxlan_softc *);
352static int	vxlan_ioctl(struct ifnet *, u_long, caddr_t);
353
354#if defined(INET) || defined(INET6)
355static uint16_t vxlan_pick_source_port(struct vxlan_softc *, struct mbuf *);
356static void	vxlan_encap_header(struct vxlan_softc *, struct mbuf *,
357		    int, uint16_t, uint16_t);
358#endif
359static int	vxlan_encap4(struct vxlan_softc *,
360		    const union vxlan_sockaddr *, struct mbuf *);
361static int	vxlan_encap6(struct vxlan_softc *,
362		    const union vxlan_sockaddr *, struct mbuf *);
363static int	vxlan_transmit(struct ifnet *, struct mbuf *);
364static void	vxlan_qflush(struct ifnet *);
365static bool	vxlan_rcv_udp_packet(struct mbuf *, int, struct inpcb *,
366		    const struct sockaddr *, void *);
367static int	vxlan_input(struct vxlan_socket *, uint32_t, struct mbuf **,
368		    const struct sockaddr *);
369
370static int	vxlan_stats_alloc(struct vxlan_softc *);
371static void	vxlan_stats_free(struct vxlan_softc *);
372static void	vxlan_set_default_config(struct vxlan_softc *);
373static int	vxlan_set_user_config(struct vxlan_softc *,
374		     struct ifvxlanparam *);
375static int	vxlan_set_reqcap(struct vxlan_softc *, struct ifnet *, int);
376static void	vxlan_set_hwcaps(struct vxlan_softc *);
377static int	vxlan_clone_create(struct if_clone *, char *, size_t,
378		    struct ifc_data *, struct ifnet **);
379static int	vxlan_clone_destroy(struct if_clone *, struct ifnet *, uint32_t);
380
381static uint32_t vxlan_mac_hash(struct vxlan_softc *, const uint8_t *);
382static int	vxlan_media_change(struct ifnet *);
383static void	vxlan_media_status(struct ifnet *, struct ifmediareq *);
384
385static int	vxlan_sockaddr_cmp(const union vxlan_sockaddr *,
386		    const struct sockaddr *);
387static void	vxlan_sockaddr_copy(union vxlan_sockaddr *,
388		    const struct sockaddr *);
389static int	vxlan_sockaddr_in_equal(const union vxlan_sockaddr *,
390		    const struct sockaddr *);
391static void	vxlan_sockaddr_in_copy(union vxlan_sockaddr *,
392		    const struct sockaddr *);
393static int	vxlan_sockaddr_supported(const union vxlan_sockaddr *, int);
394static int	vxlan_sockaddr_in_any(const union vxlan_sockaddr *);
395static int	vxlan_sockaddr_in_multicast(const union vxlan_sockaddr *);
396static int	vxlan_sockaddr_in6_embedscope(union vxlan_sockaddr *);
397
398static int	vxlan_can_change_config(struct vxlan_softc *);
399static int	vxlan_check_vni(uint32_t);
400static int	vxlan_check_ttl(int);
401static int	vxlan_check_ftable_timeout(uint32_t);
402static int	vxlan_check_ftable_max(uint32_t);
403
404static void	vxlan_sysctl_setup(struct vxlan_softc *);
405static void	vxlan_sysctl_destroy(struct vxlan_softc *);
406static int	vxlan_tunable_int(struct vxlan_softc *, const char *, int);
407
408static void	vxlan_ifdetach_event(void *, struct ifnet *);
409static void	vxlan_load(void);
410static void	vxlan_unload(void);
411static int	vxlan_modevent(module_t, int, void *);
412
413static const char vxlan_name[] = "vxlan";
414static MALLOC_DEFINE(M_VXLAN, vxlan_name,
415    "Virtual eXtensible LAN Interface");
416static struct if_clone *vxlan_cloner;
417
418static struct mtx vxlan_list_mtx;
419#define VXLAN_LIST_LOCK()	mtx_lock(&vxlan_list_mtx)
420#define VXLAN_LIST_UNLOCK()	mtx_unlock(&vxlan_list_mtx)
421
422static LIST_HEAD(, vxlan_socket) vxlan_socket_list;
423
424static eventhandler_tag vxlan_ifdetach_event_tag;
425
426SYSCTL_DECL(_net_link);
427SYSCTL_NODE(_net_link, OID_AUTO, vxlan, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
428    "Virtual eXtensible Local Area Network");
429
430static int vxlan_legacy_port = 0;
431TUNABLE_INT("net.link.vxlan.legacy_port", &vxlan_legacy_port);
432static int vxlan_reuse_port = 0;
433TUNABLE_INT("net.link.vxlan.reuse_port", &vxlan_reuse_port);
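/*
 * Both knobs are boot-time tunables; a minimal sketch of enabling them is
 * to add the following lines to /boot/loader.conf:
 *
 *	net.link.vxlan.legacy_port="1"
 *	net.link.vxlan.reuse_port="1"
 */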
434
/*
 * This macro controls the default upper limit on the nesting of vxlan
 * tunnels. The default is 3: an IPv6 vxlan tunnel adds 70 bytes of
 * overhead per level, so three levels add at most 210 bytes and leave
 * the innermost tunnel an MTU of 1290, which still satisfies the IPv6
 * minimum MTU of 1280. Configure the tunnels carefully when raising
 * this limit; a large number of nested tunnels can crash the system.
 */
443#ifndef MAX_VXLAN_NEST
444#define MAX_VXLAN_NEST	3
445#endif
446static int max_vxlan_nesting = MAX_VXLAN_NEST;
447SYSCTL_INT(_net_link_vxlan, OID_AUTO, max_nesting, CTLFLAG_RW,
448    &max_vxlan_nesting, 0, "Max nested tunnels");
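/*
 * A sketch of the arithmetic behind the default, not a tuning
 * recommendation: with a 1500-byte physical MTU and 70 bytes of IPv6
 * vxlan overhead per level, 1500 - 3 * 70 = 1290 >= 1280. The limit is
 * adjustable at runtime, e.g.:
 *
 *	sysctl net.link.vxlan.max_nesting=4
 */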
449
450/* Default maximum number of addresses in the forwarding table. */
451#ifndef VXLAN_FTABLE_MAX
452#define VXLAN_FTABLE_MAX	2000
453#endif
454
455/* Timeout (in seconds) of addresses learned in the forwarding table. */
456#ifndef VXLAN_FTABLE_TIMEOUT
457#define VXLAN_FTABLE_TIMEOUT	(20 * 60)
458#endif
459
460/*
461 * Maximum timeout (in seconds) of addresses learned in the forwarding
462 * table.
463 */
464#ifndef VXLAN_FTABLE_MAX_TIMEOUT
465#define VXLAN_FTABLE_MAX_TIMEOUT	(60 * 60 * 24)
466#endif
467
468/* Number of seconds between pruning attempts of the forwarding table. */
469#ifndef VXLAN_FTABLE_PRUNE
470#define VXLAN_FTABLE_PRUNE	(5 * 60)
471#endif
472
473static int vxlan_ftable_prune_period = VXLAN_FTABLE_PRUNE;
474
475struct vxlan_control {
476	int	(*vxlc_func)(struct vxlan_softc *, void *);
477	int	vxlc_argsize;
478	int	vxlc_flags;
479#define VXLAN_CTRL_FLAG_COPYIN	0x01
480#define VXLAN_CTRL_FLAG_COPYOUT	0x02
481#define VXLAN_CTRL_FLAG_SUSER	0x04
482};
483
484static const struct vxlan_control vxlan_control_table[] = {
485	[VXLAN_CMD_GET_CONFIG] =
486	    {	vxlan_ctrl_get_config, sizeof(struct ifvxlancfg),
487		VXLAN_CTRL_FLAG_COPYOUT
488	    },
489
490	[VXLAN_CMD_SET_VNI] =
491	    {   vxlan_ctrl_set_vni, sizeof(struct ifvxlancmd),
492		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
493	    },
494
495	[VXLAN_CMD_SET_LOCAL_ADDR] =
496	    {   vxlan_ctrl_set_local_addr, sizeof(struct ifvxlancmd),
497		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
498	    },
499
500	[VXLAN_CMD_SET_REMOTE_ADDR] =
501	    {   vxlan_ctrl_set_remote_addr, sizeof(struct ifvxlancmd),
502		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
503	    },
504
505	[VXLAN_CMD_SET_LOCAL_PORT] =
506	    {   vxlan_ctrl_set_local_port, sizeof(struct ifvxlancmd),
507		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
508	    },
509
510	[VXLAN_CMD_SET_REMOTE_PORT] =
511	    {   vxlan_ctrl_set_remote_port, sizeof(struct ifvxlancmd),
512		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
513	    },
514
515	[VXLAN_CMD_SET_PORT_RANGE] =
516	    {   vxlan_ctrl_set_port_range, sizeof(struct ifvxlancmd),
517		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
518	    },
519
520	[VXLAN_CMD_SET_FTABLE_TIMEOUT] =
521	    {	vxlan_ctrl_set_ftable_timeout, sizeof(struct ifvxlancmd),
522		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
523	    },
524
525	[VXLAN_CMD_SET_FTABLE_MAX] =
526	    {	vxlan_ctrl_set_ftable_max, sizeof(struct ifvxlancmd),
527		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
528	    },
529
530	[VXLAN_CMD_SET_MULTICAST_IF] =
531	    {	vxlan_ctrl_set_multicast_if, sizeof(struct ifvxlancmd),
532		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
533	    },
534
535	[VXLAN_CMD_SET_TTL] =
536	    {	vxlan_ctrl_set_ttl, sizeof(struct ifvxlancmd),
537		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
538	    },
539
540	[VXLAN_CMD_SET_LEARN] =
541	    {	vxlan_ctrl_set_learn, sizeof(struct ifvxlancmd),
542		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
543	    },
544
545	[VXLAN_CMD_FTABLE_ENTRY_ADD] =
546	    {	vxlan_ctrl_ftable_entry_add, sizeof(struct ifvxlancmd),
547		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
548	    },
549
550	[VXLAN_CMD_FTABLE_ENTRY_REM] =
551	    {	vxlan_ctrl_ftable_entry_rem, sizeof(struct ifvxlancmd),
552		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
553	    },
554
555	[VXLAN_CMD_FLUSH] =
556	    {   vxlan_ctrl_flush, sizeof(struct ifvxlancmd),
557		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
558	    },
559};
560
561static const int vxlan_control_table_size = nitems(vxlan_control_table);
562
563static int
564vxlan_ftable_addr_cmp(const uint8_t *a, const uint8_t *b)
565{
566	int i, d;
567
568	for (i = 0, d = 0; i < ETHER_ADDR_LEN && d == 0; i++)
569		d = ((int)a[i]) - ((int)b[i]);
570
571	return (d);
572}
573
574static void
575vxlan_ftable_init(struct vxlan_softc *sc)
576{
577	int i;
578
579	sc->vxl_ftable = malloc(sizeof(struct vxlan_ftable_head) *
580	    VXLAN_SC_FTABLE_SIZE, M_VXLAN, M_ZERO | M_WAITOK);
581
582	for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++)
583		LIST_INIT(&sc->vxl_ftable[i]);
584	sc->vxl_ftable_hash_key = arc4random();
585}
586
587static void
588vxlan_ftable_fini(struct vxlan_softc *sc)
589{
590	int i;
591
592	for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) {
593		KASSERT(LIST_EMPTY(&sc->vxl_ftable[i]),
594		    ("%s: vxlan %p ftable[%d] not empty", __func__, sc, i));
595	}
596	MPASS(sc->vxl_ftable_cnt == 0);
597
598	free(sc->vxl_ftable, M_VXLAN);
599	sc->vxl_ftable = NULL;
600}
601
602static void
603vxlan_ftable_flush(struct vxlan_softc *sc, int all)
604{
605	struct vxlan_ftable_entry *fe, *tfe;
606	int i;
607
608	for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) {
609		LIST_FOREACH_SAFE(fe, &sc->vxl_ftable[i], vxlfe_hash, tfe) {
610			if (all || VXLAN_FE_IS_DYNAMIC(fe))
611				vxlan_ftable_entry_destroy(sc, fe);
612		}
613	}
614}
615
616static void
617vxlan_ftable_expire(struct vxlan_softc *sc)
618{
619	struct vxlan_ftable_entry *fe, *tfe;
620	int i;
621
622	VXLAN_LOCK_WASSERT(sc);
623
624	for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) {
625		LIST_FOREACH_SAFE(fe, &sc->vxl_ftable[i], vxlfe_hash, tfe) {
626			if (VXLAN_FE_IS_DYNAMIC(fe) &&
627			    time_uptime >= fe->vxlfe_expire)
628				vxlan_ftable_entry_destroy(sc, fe);
629		}
630	}
631}
632
633static int
634vxlan_ftable_update_locked(struct vxlan_softc *sc,
635    const union vxlan_sockaddr *vxlsa, const uint8_t *mac,
636    struct rm_priotracker *tracker)
637{
638	struct vxlan_ftable_entry *fe;
639	int error __unused;
640
641	VXLAN_LOCK_ASSERT(sc);
642
643again:
644	/*
645	 * A forwarding entry for this MAC address might already exist. If
646	 * so, update it, otherwise create a new one. We may have to upgrade
647	 * the lock if we have to change or create an entry.
648	 */
649	fe = vxlan_ftable_entry_lookup(sc, mac);
650	if (fe != NULL) {
651		fe->vxlfe_expire = time_uptime + sc->vxl_ftable_timeout;
652
653		if (!VXLAN_FE_IS_DYNAMIC(fe) ||
654		    vxlan_sockaddr_in_equal(&fe->vxlfe_raddr, &vxlsa->sa))
655			return (0);
656		if (!VXLAN_LOCK_WOWNED(sc)) {
657			VXLAN_RUNLOCK(sc, tracker);
658			VXLAN_WLOCK(sc);
659			sc->vxl_stats.ftable_lock_upgrade_failed++;
660			goto again;
661		}
662		vxlan_sockaddr_in_copy(&fe->vxlfe_raddr, &vxlsa->sa);
663		return (0);
664	}
665
666	if (!VXLAN_LOCK_WOWNED(sc)) {
667		VXLAN_RUNLOCK(sc, tracker);
668		VXLAN_WLOCK(sc);
669		sc->vxl_stats.ftable_lock_upgrade_failed++;
670		goto again;
671	}
672
673	if (sc->vxl_ftable_cnt >= sc->vxl_ftable_max) {
674		sc->vxl_stats.ftable_nospace++;
675		return (ENOSPC);
676	}
677
678	fe = vxlan_ftable_entry_alloc();
679	if (fe == NULL)
680		return (ENOMEM);
681
682	vxlan_ftable_entry_init(sc, fe, mac, &vxlsa->sa, VXLAN_FE_FLAG_DYNAMIC);
683
684	/* The prior lookup failed, so the insert should not. */
685	error = vxlan_ftable_entry_insert(sc, fe);
686	MPASS(error == 0);
687
688	return (0);
689}
690
691static int
692vxlan_ftable_learn(struct vxlan_softc *sc, const struct sockaddr *sa,
693    const uint8_t *mac)
694{
695	struct rm_priotracker tracker;
696	union vxlan_sockaddr vxlsa;
697	int error;
698
699	/*
700	 * The source port may be randomly selected by the remote host, so
701	 * use the port of the default destination address.
702	 */
703	vxlan_sockaddr_copy(&vxlsa, sa);
704	vxlsa.in4.sin_port = sc->vxl_dst_addr.in4.sin_port;
705
706	if (VXLAN_SOCKADDR_IS_IPV6(&vxlsa)) {
707		error = vxlan_sockaddr_in6_embedscope(&vxlsa);
708		if (error)
709			return (error);
710	}
711
712	VXLAN_RLOCK(sc, &tracker);
713	error = vxlan_ftable_update_locked(sc, &vxlsa, mac, &tracker);
714	VXLAN_UNLOCK(sc, &tracker);
715
716	return (error);
717}
718
719static int
720vxlan_ftable_sysctl_dump(SYSCTL_HANDLER_ARGS)
721{
722	struct rm_priotracker tracker;
723	struct sbuf sb;
724	struct vxlan_softc *sc;
725	struct vxlan_ftable_entry *fe;
726	size_t size;
727	int i, error;
728
729	/*
730	 * This is mostly intended for debugging during development. It is
731	 * not practical to dump an entire large table this way.
732	 */
733
734	sc = arg1;
735	size = PAGE_SIZE;	/* Calculate later. */
736
737	sbuf_new(&sb, NULL, size, SBUF_FIXEDLEN);
738	sbuf_putc(&sb, '\n');
739
740	VXLAN_RLOCK(sc, &tracker);
741	for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) {
742		LIST_FOREACH(fe, &sc->vxl_ftable[i], vxlfe_hash) {
743			if (sbuf_error(&sb) != 0)
744				break;
745			vxlan_ftable_entry_dump(fe, &sb);
746		}
747	}
748	VXLAN_RUNLOCK(sc, &tracker);
749
750	if (sbuf_len(&sb) == 1)
751		sbuf_setpos(&sb, 0);
752
753	sbuf_finish(&sb);
754	error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
755	sbuf_delete(&sb);
756
757	return (error);
758}
759
760static struct vxlan_ftable_entry *
761vxlan_ftable_entry_alloc(void)
762{
763	struct vxlan_ftable_entry *fe;
764
765	fe = malloc(sizeof(*fe), M_VXLAN, M_ZERO | M_NOWAIT);
766
767	return (fe);
768}
769
770static void
771vxlan_ftable_entry_free(struct vxlan_ftable_entry *fe)
772{
773
774	free(fe, M_VXLAN);
775}
776
777static void
778vxlan_ftable_entry_init(struct vxlan_softc *sc, struct vxlan_ftable_entry *fe,
779    const uint8_t *mac, const struct sockaddr *sa, uint32_t flags)
780{
781
782	fe->vxlfe_flags = flags;
783	fe->vxlfe_expire = time_uptime + sc->vxl_ftable_timeout;
784	memcpy(fe->vxlfe_mac, mac, ETHER_ADDR_LEN);
785	vxlan_sockaddr_copy(&fe->vxlfe_raddr, sa);
786}
787
788static void
789vxlan_ftable_entry_destroy(struct vxlan_softc *sc,
790    struct vxlan_ftable_entry *fe)
791{
792
793	sc->vxl_ftable_cnt--;
794	LIST_REMOVE(fe, vxlfe_hash);
795	vxlan_ftable_entry_free(fe);
796}
797
798static int
799vxlan_ftable_entry_insert(struct vxlan_softc *sc,
800    struct vxlan_ftable_entry *fe)
801{
802	struct vxlan_ftable_entry *lfe;
803	uint32_t hash;
804	int dir;
805
806	VXLAN_LOCK_WASSERT(sc);
807	hash = VXLAN_SC_FTABLE_HASH(sc, fe->vxlfe_mac);
808
809	lfe = LIST_FIRST(&sc->vxl_ftable[hash]);
810	if (lfe == NULL) {
811		LIST_INSERT_HEAD(&sc->vxl_ftable[hash], fe, vxlfe_hash);
812		goto out;
813	}
814
815	do {
816		dir = vxlan_ftable_addr_cmp(fe->vxlfe_mac, lfe->vxlfe_mac);
817		if (dir == 0)
818			return (EEXIST);
819		if (dir > 0) {
820			LIST_INSERT_BEFORE(lfe, fe, vxlfe_hash);
821			goto out;
822		} else if (LIST_NEXT(lfe, vxlfe_hash) == NULL) {
823			LIST_INSERT_AFTER(lfe, fe, vxlfe_hash);
824			goto out;
825		} else
826			lfe = LIST_NEXT(lfe, vxlfe_hash);
827	} while (lfe != NULL);
828
829out:
830	sc->vxl_ftable_cnt++;
831
832	return (0);
833}
834
835static struct vxlan_ftable_entry *
836vxlan_ftable_entry_lookup(struct vxlan_softc *sc, const uint8_t *mac)
837{
838	struct vxlan_ftable_entry *fe;
839	uint32_t hash;
840	int dir;
841
842	VXLAN_LOCK_ASSERT(sc);
843	hash = VXLAN_SC_FTABLE_HASH(sc, mac);
844
845	LIST_FOREACH(fe, &sc->vxl_ftable[hash], vxlfe_hash) {
846		dir = vxlan_ftable_addr_cmp(mac, fe->vxlfe_mac);
847		if (dir == 0)
848			return (fe);
849		if (dir > 0)
850			break;
851	}
852
853	return (NULL);
854}
855
856static void
857vxlan_ftable_entry_dump(struct vxlan_ftable_entry *fe, struct sbuf *sb)
858{
859	char buf[64];
860	const union vxlan_sockaddr *sa;
861	const void *addr;
862	int i, len, af, width;
863
864	sa = &fe->vxlfe_raddr;
865	af = sa->sa.sa_family;
866	len = sbuf_len(sb);
867
868	sbuf_printf(sb, "%c 0x%02X ", VXLAN_FE_IS_DYNAMIC(fe) ? 'D' : 'S',
869	    fe->vxlfe_flags);
870
871	for (i = 0; i < ETHER_ADDR_LEN - 1; i++)
872		sbuf_printf(sb, "%02X:", fe->vxlfe_mac[i]);
873	sbuf_printf(sb, "%02X ", fe->vxlfe_mac[i]);
874
875	if (af == AF_INET) {
876		addr = &sa->in4.sin_addr;
877		width = INET_ADDRSTRLEN - 1;
878	} else {
879		addr = &sa->in6.sin6_addr;
880		width = INET6_ADDRSTRLEN - 1;
881	}
882	inet_ntop(af, addr, buf, sizeof(buf));
883	sbuf_printf(sb, "%*s ", width, buf);
884
885	sbuf_printf(sb, "%08jd", (intmax_t)fe->vxlfe_expire);
886
887	sbuf_putc(sb, '\n');
888
889	/* Truncate a partial line. */
890	if (sbuf_error(sb) != 0)
891		sbuf_setpos(sb, len);
892}
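/*
 * For illustration, each dumped entry is one line of the form (the values
 * here are made up):
 *
 *	D 0x01 00:11:22:33:44:55      192.0.2.10 00012345
 *
 * i.e. dynamic/static marker, flags, MAC address, remote address and the
 * expiration time in seconds of system uptime.
 */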
893
894static struct vxlan_socket *
895vxlan_socket_alloc(const union vxlan_sockaddr *sa)
896{
897	struct vxlan_socket *vso;
898	int i;
899
900	vso = malloc(sizeof(*vso), M_VXLAN, M_WAITOK | M_ZERO);
901	rm_init(&vso->vxlso_lock, "vxlansorm");
902	refcount_init(&vso->vxlso_refcnt, 0);
903	for (i = 0; i < VXLAN_SO_VNI_HASH_SIZE; i++)
904		LIST_INIT(&vso->vxlso_vni_hash[i]);
905	vso->vxlso_laddr = *sa;
906
907	return (vso);
908}
909
910static void
911vxlan_socket_destroy(struct vxlan_socket *vso)
912{
913	struct socket *so;
914#ifdef INVARIANTS
915	int i;
916	struct vxlan_socket_mc_info *mc;
917
918	for (i = 0; i < VXLAN_SO_MC_MAX_GROUPS; i++) {
919		mc = &vso->vxlso_mc[i];
920		KASSERT(mc->vxlsomc_gaddr.sa.sa_family == AF_UNSPEC,
921		    ("%s: socket %p mc[%d] still has address",
922		     __func__, vso, i));
923	}
924
925	for (i = 0; i < VXLAN_SO_VNI_HASH_SIZE; i++) {
926		KASSERT(LIST_EMPTY(&vso->vxlso_vni_hash[i]),
927		    ("%s: socket %p vni_hash[%d] not empty",
928		     __func__, vso, i));
929	}
930#endif
931	so = vso->vxlso_sock;
932	if (so != NULL) {
933		vso->vxlso_sock = NULL;
934		soclose(so);
935	}
936
937	rm_destroy(&vso->vxlso_lock);
938	free(vso, M_VXLAN);
939}
940
941static void
942vxlan_socket_release(struct vxlan_socket *vso)
943{
944	int destroy;
945
946	VXLAN_LIST_LOCK();
947	destroy = VXLAN_SO_RELEASE(vso);
948	if (destroy != 0)
949		LIST_REMOVE(vso, vxlso_entry);
950	VXLAN_LIST_UNLOCK();
951
952	if (destroy != 0)
953		vxlan_socket_destroy(vso);
954}
955
956static struct vxlan_socket *
957vxlan_socket_lookup(union vxlan_sockaddr *vxlsa)
958{
959	struct vxlan_socket *vso;
960
961	VXLAN_LIST_LOCK();
962	LIST_FOREACH(vso, &vxlan_socket_list, vxlso_entry) {
963		if (vxlan_sockaddr_cmp(&vso->vxlso_laddr, &vxlsa->sa) == 0) {
964			VXLAN_SO_ACQUIRE(vso);
965			break;
966		}
967	}
968	VXLAN_LIST_UNLOCK();
969
970	return (vso);
971}
972
973static void
974vxlan_socket_insert(struct vxlan_socket *vso)
975{
976
977	VXLAN_LIST_LOCK();
978	VXLAN_SO_ACQUIRE(vso);
979	LIST_INSERT_HEAD(&vxlan_socket_list, vso, vxlso_entry);
980	VXLAN_LIST_UNLOCK();
981}
982
983static int
984vxlan_socket_init(struct vxlan_socket *vso, struct ifnet *ifp)
985{
986	struct thread *td;
987	int error;
988
989	td = curthread;
990
991	error = socreate(vso->vxlso_laddr.sa.sa_family, &vso->vxlso_sock,
992	    SOCK_DGRAM, IPPROTO_UDP, td->td_ucred, td);
993	if (error) {
994		if_printf(ifp, "cannot create socket: %d\n", error);
995		return (error);
996	}
997
998	error = udp_set_kernel_tunneling(vso->vxlso_sock,
999	    vxlan_rcv_udp_packet, NULL, vso);
1000	if (error) {
1001		if_printf(ifp, "cannot set tunneling function: %d\n", error);
1002		return (error);
1003	}
1004
1005	if (vxlan_reuse_port != 0) {
1006		struct sockopt sopt;
1007		int val = 1;
1008
1009		bzero(&sopt, sizeof(sopt));
1010		sopt.sopt_dir = SOPT_SET;
1011		sopt.sopt_level = IPPROTO_IP;
1012		sopt.sopt_name = SO_REUSEPORT;
1013		sopt.sopt_val = &val;
1014		sopt.sopt_valsize = sizeof(val);
1015		error = sosetopt(vso->vxlso_sock, &sopt);
1016		if (error) {
1017			if_printf(ifp,
			    "cannot set SO_REUSEPORT socket opt: %d\n", error);
1019			return (error);
1020		}
1021	}
1022
1023	return (0);
1024}
1025
1026static int
1027vxlan_socket_bind(struct vxlan_socket *vso, struct ifnet *ifp)
1028{
1029	union vxlan_sockaddr laddr;
1030	struct thread *td;
1031	int error;
1032
1033	td = curthread;
1034	laddr = vso->vxlso_laddr;
1035
1036	error = sobind(vso->vxlso_sock, &laddr.sa, td);
1037	if (error) {
1038		if (error != EADDRINUSE)
1039			if_printf(ifp, "cannot bind socket: %d\n", error);
1040		return (error);
1041	}
1042
1043	return (0);
1044}
1045
1046static int
1047vxlan_socket_create(struct ifnet *ifp, int multicast,
1048    const union vxlan_sockaddr *saddr, struct vxlan_socket **vsop)
1049{
1050	union vxlan_sockaddr laddr;
1051	struct vxlan_socket *vso;
1052	int error;
1053
1054	laddr = *saddr;
1055
1056	/*
1057	 * If this socket will be multicast, then only the local port
1058	 * must be specified when binding.
1059	 */
1060	if (multicast != 0) {
1061		if (VXLAN_SOCKADDR_IS_IPV4(&laddr))
1062			laddr.in4.sin_addr.s_addr = INADDR_ANY;
1063#ifdef INET6
1064		else
1065			laddr.in6.sin6_addr = in6addr_any;
1066#endif
1067	}
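	/*
	 * Example, with illustrative addresses only: for a multicast
	 * destination of 239.1.1.1:4789 the socket ends up bound to
	 * 0.0.0.0:4789, so it can be shared by interfaces joining
	 * different groups on the same port.
	 */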
1068
1069	vso = vxlan_socket_alloc(&laddr);
1070	if (vso == NULL)
1071		return (ENOMEM);
1072
1073	error = vxlan_socket_init(vso, ifp);
1074	if (error)
1075		goto fail;
1076
1077	error = vxlan_socket_bind(vso, ifp);
1078	if (error)
1079		goto fail;
1080
1081	/*
1082	 * There is a small window between the bind completing and
1083	 * inserting the socket, so that a concurrent create may fail.
1084	 * Let's not worry about that for now.
1085	 */
1086	vxlan_socket_insert(vso);
1087	*vsop = vso;
1088
1089	return (0);
1090
1091fail:
1092	vxlan_socket_destroy(vso);
1093
1094	return (error);
1095}
1096
1097static void
1098vxlan_socket_ifdetach(struct vxlan_socket *vso, struct ifnet *ifp,
1099    struct vxlan_softc_head *list)
1100{
1101	struct rm_priotracker tracker;
1102	struct vxlan_softc *sc;
1103	int i;
1104
1105	VXLAN_SO_RLOCK(vso, &tracker);
1106	for (i = 0; i < VXLAN_SO_VNI_HASH_SIZE; i++) {
1107		LIST_FOREACH(sc, &vso->vxlso_vni_hash[i], vxl_entry)
1108			vxlan_ifdetach(sc, ifp, list);
1109	}
1110	VXLAN_SO_RUNLOCK(vso, &tracker);
1111}
1112
1113static struct vxlan_socket *
1114vxlan_socket_mc_lookup(const union vxlan_sockaddr *vxlsa)
1115{
1116	union vxlan_sockaddr laddr;
1117	struct vxlan_socket *vso;
1118
1119	laddr = *vxlsa;
1120
1121	if (VXLAN_SOCKADDR_IS_IPV4(&laddr))
1122		laddr.in4.sin_addr.s_addr = INADDR_ANY;
1123#ifdef INET6
1124	else
1125		laddr.in6.sin6_addr = in6addr_any;
1126#endif
1127
1128	vso = vxlan_socket_lookup(&laddr);
1129
1130	return (vso);
1131}
1132
1133static int
1134vxlan_sockaddr_mc_info_match(const struct vxlan_socket_mc_info *mc,
1135    const union vxlan_sockaddr *group, const union vxlan_sockaddr *local,
1136    int ifidx)
1137{
1138
1139	if (!vxlan_sockaddr_in_any(local) &&
1140	    !vxlan_sockaddr_in_equal(&mc->vxlsomc_saddr, &local->sa))
1141		return (0);
1142	if (!vxlan_sockaddr_in_equal(&mc->vxlsomc_gaddr, &group->sa))
1143		return (0);
1144	if (ifidx != 0 && ifidx != mc->vxlsomc_ifidx)
1145		return (0);
1146
1147	return (1);
1148}
1149
1150static int
1151vxlan_socket_mc_join_group(struct vxlan_socket *vso,
1152    const union vxlan_sockaddr *group, const union vxlan_sockaddr *local,
1153    int *ifidx, union vxlan_sockaddr *source)
1154{
1155	struct sockopt sopt;
1156	int error;
1157
1158	*source = *local;
1159
1160	if (VXLAN_SOCKADDR_IS_IPV4(group)) {
1161		struct ip_mreq mreq;
1162
1163		mreq.imr_multiaddr = group->in4.sin_addr;
1164		mreq.imr_interface = local->in4.sin_addr;
1165
1166		bzero(&sopt, sizeof(sopt));
1167		sopt.sopt_dir = SOPT_SET;
1168		sopt.sopt_level = IPPROTO_IP;
1169		sopt.sopt_name = IP_ADD_MEMBERSHIP;
1170		sopt.sopt_val = &mreq;
1171		sopt.sopt_valsize = sizeof(mreq);
1172		error = sosetopt(vso->vxlso_sock, &sopt);
1173		if (error)
1174			return (error);
1175
1176		/*
1177		 * BMV: Ideally, there would be a formal way for us to get
1178		 * the local interface that was selected based on the
1179		 * imr_interface address. We could then update *ifidx so
1180		 * vxlan_sockaddr_mc_info_match() would return a match for
1181		 * later creates that explicitly set the multicast interface.
1182		 *
1183		 * If we really need to, we can of course look in the INP's
1184		 * membership list:
1185		 *     sotoinpcb(vso->vxlso_sock)->inp_moptions->
1186		 *         imo_head[]->imf_inm->inm_ifp
1187		 * similarly to imo_match_group().
1188		 */
1189		source->in4.sin_addr = local->in4.sin_addr;
1190
1191	} else if (VXLAN_SOCKADDR_IS_IPV6(group)) {
1192		struct ipv6_mreq mreq;
1193
1194		mreq.ipv6mr_multiaddr = group->in6.sin6_addr;
1195		mreq.ipv6mr_interface = *ifidx;
1196
1197		bzero(&sopt, sizeof(sopt));
1198		sopt.sopt_dir = SOPT_SET;
1199		sopt.sopt_level = IPPROTO_IPV6;
1200		sopt.sopt_name = IPV6_JOIN_GROUP;
1201		sopt.sopt_val = &mreq;
1202		sopt.sopt_valsize = sizeof(mreq);
1203		error = sosetopt(vso->vxlso_sock, &sopt);
1204		if (error)
1205			return (error);
1206
1207		/*
1208		 * BMV: As with IPv4, we would really like to know what
1209		 * interface in6p_lookup_mcast_ifp() selected.
1210		 */
1211	} else
1212		error = EAFNOSUPPORT;
1213
1214	return (error);
1215}
1216
1217static int
1218vxlan_socket_mc_leave_group(struct vxlan_socket *vso,
1219    const union vxlan_sockaddr *group, const union vxlan_sockaddr *source,
1220    int ifidx)
1221{
1222	struct sockopt sopt;
1223	int error;
1224
1225	bzero(&sopt, sizeof(sopt));
1226	sopt.sopt_dir = SOPT_SET;
1227
1228	if (VXLAN_SOCKADDR_IS_IPV4(group)) {
1229		struct ip_mreq mreq;
1230
1231		mreq.imr_multiaddr = group->in4.sin_addr;
1232		mreq.imr_interface = source->in4.sin_addr;
1233
1234		sopt.sopt_level = IPPROTO_IP;
1235		sopt.sopt_name = IP_DROP_MEMBERSHIP;
1236		sopt.sopt_val = &mreq;
1237		sopt.sopt_valsize = sizeof(mreq);
1238		error = sosetopt(vso->vxlso_sock, &sopt);
1239
1240	} else if (VXLAN_SOCKADDR_IS_IPV6(group)) {
1241		struct ipv6_mreq mreq;
1242
1243		mreq.ipv6mr_multiaddr = group->in6.sin6_addr;
1244		mreq.ipv6mr_interface = ifidx;
1245
1246		sopt.sopt_level = IPPROTO_IPV6;
1247		sopt.sopt_name = IPV6_LEAVE_GROUP;
1248		sopt.sopt_val = &mreq;
1249		sopt.sopt_valsize = sizeof(mreq);
1250		error = sosetopt(vso->vxlso_sock, &sopt);
1251
1252	} else
1253		error = EAFNOSUPPORT;
1254
1255	return (error);
1256}
1257
1258static int
1259vxlan_socket_mc_add_group(struct vxlan_socket *vso,
1260    const union vxlan_sockaddr *group, const union vxlan_sockaddr *local,
1261    int ifidx, int *idx)
1262{
1263	union vxlan_sockaddr source;
1264	struct vxlan_socket_mc_info *mc;
1265	int i, empty, error;
1266
1267	/*
1268	 * Within a socket, the same multicast group may be used by multiple
1269	 * interfaces, each with a different network identifier. But a socket
1270	 * may only join a multicast group once, so keep track of the users
1271	 * here.
1272	 */
1273
1274	VXLAN_SO_WLOCK(vso);
1275	for (empty = 0, i = 0; i < VXLAN_SO_MC_MAX_GROUPS; i++) {
1276		mc = &vso->vxlso_mc[i];
1277
1278		if (mc->vxlsomc_gaddr.sa.sa_family == AF_UNSPEC) {
1279			empty++;
1280			continue;
1281		}
1282
1283		if (vxlan_sockaddr_mc_info_match(mc, group, local, ifidx))
1284			goto out;
1285	}
1286	VXLAN_SO_WUNLOCK(vso);
1287
1288	if (empty == 0)
1289		return (ENOSPC);
1290
1291	error = vxlan_socket_mc_join_group(vso, group, local, &ifidx, &source);
1292	if (error)
1293		return (error);
1294
1295	VXLAN_SO_WLOCK(vso);
1296	for (i = 0; i < VXLAN_SO_MC_MAX_GROUPS; i++) {
1297		mc = &vso->vxlso_mc[i];
1298
1299		if (mc->vxlsomc_gaddr.sa.sa_family == AF_UNSPEC) {
1300			vxlan_sockaddr_copy(&mc->vxlsomc_gaddr, &group->sa);
1301			vxlan_sockaddr_copy(&mc->vxlsomc_saddr, &source.sa);
1302			mc->vxlsomc_ifidx = ifidx;
1303			goto out;
1304		}
1305	}
1306	VXLAN_SO_WUNLOCK(vso);
1307
1308	error = vxlan_socket_mc_leave_group(vso, group, &source, ifidx);
1309	MPASS(error == 0);
1310
1311	return (ENOSPC);
1312
1313out:
1314	mc->vxlsomc_users++;
1315	VXLAN_SO_WUNLOCK(vso);
1316
1317	*idx = i;
1318
1319	return (0);
1320}
1321
1322static void
1323vxlan_socket_mc_release_group_by_idx(struct vxlan_socket *vso, int idx)
1324{
1325	union vxlan_sockaddr group, source;
1326	struct vxlan_socket_mc_info *mc;
1327	int ifidx, leave;
1328
1329	KASSERT(idx >= 0 && idx < VXLAN_SO_MC_MAX_GROUPS,
1330	    ("%s: vso %p idx %d out of bounds", __func__, vso, idx));
1331
1332	leave = 0;
1333	mc = &vso->vxlso_mc[idx];
1334
1335	VXLAN_SO_WLOCK(vso);
1336	mc->vxlsomc_users--;
1337	if (mc->vxlsomc_users == 0) {
1338		group = mc->vxlsomc_gaddr;
1339		source = mc->vxlsomc_saddr;
1340		ifidx = mc->vxlsomc_ifidx;
1341		bzero(mc, sizeof(*mc));
1342		leave = 1;
1343	}
1344	VXLAN_SO_WUNLOCK(vso);
1345
1346	if (leave != 0) {
1347		/*
1348		 * Our socket's membership in this group may have already
1349		 * been removed if we joined through an interface that's
1350		 * been detached.
1351		 */
1352		vxlan_socket_mc_leave_group(vso, &group, &source, ifidx);
1353	}
1354}
1355
1356static struct vxlan_softc *
1357vxlan_socket_lookup_softc_locked(struct vxlan_socket *vso, uint32_t vni)
1358{
1359	struct vxlan_softc *sc;
1360	uint32_t hash;
1361
1362	VXLAN_SO_LOCK_ASSERT(vso);
1363	hash = VXLAN_SO_VNI_HASH(vni);
1364
1365	LIST_FOREACH(sc, &vso->vxlso_vni_hash[hash], vxl_entry) {
1366		if (sc->vxl_vni == vni) {
1367			VXLAN_ACQUIRE(sc);
1368			break;
1369		}
1370	}
1371
1372	return (sc);
1373}
1374
1375static struct vxlan_softc *
1376vxlan_socket_lookup_softc(struct vxlan_socket *vso, uint32_t vni)
1377{
1378	struct rm_priotracker tracker;
1379	struct vxlan_softc *sc;
1380
1381	VXLAN_SO_RLOCK(vso, &tracker);
1382	sc = vxlan_socket_lookup_softc_locked(vso, vni);
1383	VXLAN_SO_RUNLOCK(vso, &tracker);
1384
1385	return (sc);
1386}
1387
1388static int
1389vxlan_socket_insert_softc(struct vxlan_socket *vso, struct vxlan_softc *sc)
1390{
1391	struct vxlan_softc *tsc;
1392	uint32_t vni, hash;
1393
1394	vni = sc->vxl_vni;
1395	hash = VXLAN_SO_VNI_HASH(vni);
1396
1397	VXLAN_SO_WLOCK(vso);
1398	tsc = vxlan_socket_lookup_softc_locked(vso, vni);
1399	if (tsc != NULL) {
1400		VXLAN_SO_WUNLOCK(vso);
1401		vxlan_release(tsc);
1402		return (EEXIST);
1403	}
1404
1405	VXLAN_ACQUIRE(sc);
1406	LIST_INSERT_HEAD(&vso->vxlso_vni_hash[hash], sc, vxl_entry);
1407	VXLAN_SO_WUNLOCK(vso);
1408
1409	return (0);
1410}
1411
1412static void
1413vxlan_socket_remove_softc(struct vxlan_socket *vso, struct vxlan_softc *sc)
1414{
1415
1416	VXLAN_SO_WLOCK(vso);
1417	LIST_REMOVE(sc, vxl_entry);
1418	VXLAN_SO_WUNLOCK(vso);
1419
1420	vxlan_release(sc);
1421}
1422
1423static struct ifnet *
1424vxlan_multicast_if_ref(struct vxlan_softc *sc, int ipv4)
1425{
1426	struct ifnet *ifp;
1427
1428	VXLAN_LOCK_ASSERT(sc);
1429
1430	if (ipv4 && sc->vxl_im4o != NULL)
1431		ifp = sc->vxl_im4o->imo_multicast_ifp;
1432	else if (!ipv4 && sc->vxl_im6o != NULL)
1433		ifp = sc->vxl_im6o->im6o_multicast_ifp;
1434	else
1435		ifp = NULL;
1436
1437	if (ifp != NULL)
1438		if_ref(ifp);
1439
1440	return (ifp);
1441}
1442
1443static void
1444vxlan_free_multicast(struct vxlan_softc *sc)
1445{
1446
1447	if (sc->vxl_mc_ifp != NULL) {
1448		if_rele(sc->vxl_mc_ifp);
1449		sc->vxl_mc_ifp = NULL;
1450		sc->vxl_mc_ifindex = 0;
1451	}
1452
1453	if (sc->vxl_im4o != NULL) {
1454		free(sc->vxl_im4o, M_VXLAN);
1455		sc->vxl_im4o = NULL;
1456	}
1457
1458	if (sc->vxl_im6o != NULL) {
1459		free(sc->vxl_im6o, M_VXLAN);
1460		sc->vxl_im6o = NULL;
1461	}
1462}
1463
1464static int
1465vxlan_setup_multicast_interface(struct vxlan_softc *sc)
1466{
1467	struct ifnet *ifp;
1468
1469	ifp = ifunit_ref(sc->vxl_mc_ifname);
1470	if (ifp == NULL) {
1471		if_printf(sc->vxl_ifp, "multicast interface %s does "
1472		    "not exist\n", sc->vxl_mc_ifname);
1473		return (ENOENT);
1474	}
1475
1476	if ((ifp->if_flags & IFF_MULTICAST) == 0) {
1477		if_printf(sc->vxl_ifp, "interface %s does not support "
1478		     "multicast\n", sc->vxl_mc_ifname);
1479		if_rele(ifp);
1480		return (ENOTSUP);
1481	}
1482
1483	sc->vxl_mc_ifp = ifp;
1484	sc->vxl_mc_ifindex = ifp->if_index;
1485
1486	return (0);
1487}
1488
1489static int
1490vxlan_setup_multicast(struct vxlan_softc *sc)
1491{
1492	const union vxlan_sockaddr *group;
1493	int error;
1494
1495	group = &sc->vxl_dst_addr;
1496	error = 0;
1497
1498	if (sc->vxl_mc_ifname[0] != '\0') {
1499		error = vxlan_setup_multicast_interface(sc);
1500		if (error)
1501			return (error);
1502	}
1503
	/*
	 * Initialize a multicast options structure that is sufficiently
	 * populated for use in the respective IP output routine. This
	 * structure is typically stored in the socket, but our sockets
	 * may be shared among multiple interfaces.
	 */
1510	if (VXLAN_SOCKADDR_IS_IPV4(group)) {
1511		sc->vxl_im4o = malloc(sizeof(struct ip_moptions), M_VXLAN,
1512		    M_ZERO | M_WAITOK);
1513		sc->vxl_im4o->imo_multicast_ifp = sc->vxl_mc_ifp;
1514		sc->vxl_im4o->imo_multicast_ttl = sc->vxl_ttl;
1515		sc->vxl_im4o->imo_multicast_vif = -1;
1516	} else if (VXLAN_SOCKADDR_IS_IPV6(group)) {
1517		sc->vxl_im6o = malloc(sizeof(struct ip6_moptions), M_VXLAN,
1518		    M_ZERO | M_WAITOK);
1519		sc->vxl_im6o->im6o_multicast_ifp = sc->vxl_mc_ifp;
1520		sc->vxl_im6o->im6o_multicast_hlim = sc->vxl_ttl;
1521	}
1522
1523	return (error);
1524}
1525
1526static int
1527vxlan_setup_socket(struct vxlan_softc *sc)
1528{
1529	struct vxlan_socket *vso;
1530	struct ifnet *ifp;
1531	union vxlan_sockaddr *saddr, *daddr;
1532	int multicast, error;
1533
1534	vso = NULL;
1535	ifp = sc->vxl_ifp;
1536	saddr = &sc->vxl_src_addr;
1537	daddr = &sc->vxl_dst_addr;
1538
1539	multicast = vxlan_sockaddr_in_multicast(daddr);
1540	MPASS(multicast != -1);
1541	sc->vxl_vso_mc_index = -1;
1542
1543	/*
1544	 * Try to create the socket. If that fails, attempt to use an
1545	 * existing socket.
1546	 */
1547	error = vxlan_socket_create(ifp, multicast, saddr, &vso);
1548	if (error) {
1549		if (multicast != 0)
1550			vso = vxlan_socket_mc_lookup(saddr);
1551		else
1552			vso = vxlan_socket_lookup(saddr);
1553
1554		if (vso == NULL) {
1555			if_printf(ifp, "cannot create socket (error: %d), "
1556			    "and no existing socket found\n", error);
1557			goto out;
1558		}
1559	}
1560
1561	if (multicast != 0) {
1562		error = vxlan_setup_multicast(sc);
1563		if (error)
1564			goto out;
1565
1566		error = vxlan_socket_mc_add_group(vso, daddr, saddr,
1567		    sc->vxl_mc_ifindex, &sc->vxl_vso_mc_index);
1568		if (error)
1569			goto out;
1570	}
1571
1572	sc->vxl_sock = vso;
1573	error = vxlan_socket_insert_softc(vso, sc);
1574	if (error) {
1575		sc->vxl_sock = NULL;
1576		if_printf(ifp, "network identifier %d already exists in "
1577		    "this socket\n", sc->vxl_vni);
1578		goto out;
1579	}
1580
1581	return (0);
1582
1583out:
1584	if (vso != NULL) {
1585		if (sc->vxl_vso_mc_index != -1) {
1586			vxlan_socket_mc_release_group_by_idx(vso,
1587			    sc->vxl_vso_mc_index);
1588			sc->vxl_vso_mc_index = -1;
1589		}
1590		if (multicast != 0)
1591			vxlan_free_multicast(sc);
1592		vxlan_socket_release(vso);
1593	}
1594
1595	return (error);
1596}
1597
1598#ifdef INET6
1599static void
1600vxlan_setup_zero_checksum_port(struct vxlan_softc *sc)
1601{
1602
1603	if (!VXLAN_SOCKADDR_IS_IPV6(&sc->vxl_src_addr))
1604		return;
1605
1606	MPASS(sc->vxl_src_addr.in6.sin6_port != 0);
1607	MPASS(sc->vxl_dst_addr.in6.sin6_port != 0);
1608
1609	if (sc->vxl_src_addr.in6.sin6_port != sc->vxl_dst_addr.in6.sin6_port) {
1610		if_printf(sc->vxl_ifp, "port %d in src address does not match "
1611		    "port %d in dst address, rfc6935_port (%d) not updated.\n",
1612		    ntohs(sc->vxl_src_addr.in6.sin6_port),
1613		    ntohs(sc->vxl_dst_addr.in6.sin6_port),
1614		    V_zero_checksum_port);
1615		return;
1616	}
1617
1618	if (V_zero_checksum_port != 0) {
1619		if (V_zero_checksum_port !=
1620		    ntohs(sc->vxl_src_addr.in6.sin6_port)) {
1621			if_printf(sc->vxl_ifp, "rfc6935_port is already set to "
1622			    "%d, cannot set it to %d.\n", V_zero_checksum_port,
1623			    ntohs(sc->vxl_src_addr.in6.sin6_port));
1624		}
1625		return;
1626	}
1627
1628	V_zero_checksum_port = ntohs(sc->vxl_src_addr.in6.sin6_port);
1629	if_printf(sc->vxl_ifp, "rfc6935_port set to %d\n",
1630	    V_zero_checksum_port);
1631}
1632#endif
1633
1634static void
1635vxlan_setup_interface_hdrlen(struct vxlan_softc *sc)
1636{
1637	struct ifnet *ifp;
1638
1639	VXLAN_LOCK_WASSERT(sc);
1640
1641	ifp = sc->vxl_ifp;
1642	ifp->if_hdrlen = ETHER_HDR_LEN + sizeof(struct vxlanudphdr);
1643
1644	if (VXLAN_SOCKADDR_IS_IPV4(&sc->vxl_dst_addr) != 0)
1645		ifp->if_hdrlen += sizeof(struct ip);
1646	else if (VXLAN_SOCKADDR_IS_IPV6(&sc->vxl_dst_addr) != 0)
1647		ifp->if_hdrlen += sizeof(struct ip6_hdr);
1648
1649	if ((sc->vxl_flags & VXLAN_FLAG_USER_MTU) == 0)
1650		ifp->if_mtu = ETHERMTU - ifp->if_hdrlen;
1651}
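/*
 * Worked example, assuming a standard 1500-byte Ethernet MTU underneath:
 * for an IPv4 destination if_hdrlen is 14 (Ethernet) + 16 (vxlanudphdr) +
 * 20 (IPv4) = 50 bytes, giving a default MTU of 1500 - 50 = 1450; for an
 * IPv6 destination it is 14 + 16 + 40 = 70 bytes and 1500 - 70 = 1430.
 */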
1652
1653static int
1654vxlan_valid_init_config(struct vxlan_softc *sc)
1655{
1656	const char *reason;
1657
1658	if (vxlan_check_vni(sc->vxl_vni) != 0) {
1659		reason = "invalid virtual network identifier specified";
1660		goto fail;
1661	}
1662
1663	if (vxlan_sockaddr_supported(&sc->vxl_src_addr, 1) == 0) {
1664		reason = "source address type is not supported";
1665		goto fail;
1666	}
1667
1668	if (vxlan_sockaddr_supported(&sc->vxl_dst_addr, 0) == 0) {
1669		reason = "destination address type is not supported";
1670		goto fail;
1671	}
1672
1673	if (vxlan_sockaddr_in_any(&sc->vxl_dst_addr) != 0) {
1674		reason = "no valid destination address specified";
1675		goto fail;
1676	}
1677
1678	if (vxlan_sockaddr_in_multicast(&sc->vxl_dst_addr) == 0 &&
1679	    sc->vxl_mc_ifname[0] != '\0') {
1680		reason = "can only specify interface with a group address";
1681		goto fail;
1682	}
1683
1684	if (vxlan_sockaddr_in_any(&sc->vxl_src_addr) == 0) {
1685		if (VXLAN_SOCKADDR_IS_IPV4(&sc->vxl_src_addr) ^
1686		    VXLAN_SOCKADDR_IS_IPV4(&sc->vxl_dst_addr)) {
1687			reason = "source and destination address must both "
1688			    "be either IPv4 or IPv6";
1689			goto fail;
1690		}
1691	}
1692
1693	if (sc->vxl_src_addr.in4.sin_port == 0) {
1694		reason = "local port not specified";
1695		goto fail;
1696	}
1697
1698	if (sc->vxl_dst_addr.in4.sin_port == 0) {
1699		reason = "remote port not specified";
1700		goto fail;
1701	}
1702
1703	return (0);
1704
1705fail:
1706	if_printf(sc->vxl_ifp, "cannot initialize interface: %s\n", reason);
1707	return (EINVAL);
1708}
1709
1710static void
1711vxlan_init_wait(struct vxlan_softc *sc)
1712{
1713
1714	VXLAN_LOCK_WASSERT(sc);
1715	while (sc->vxl_flags & VXLAN_FLAG_INIT)
1716		rm_sleep(sc, &sc->vxl_lock, 0, "vxlint", hz);
1717}
1718
1719static void
1720vxlan_init_complete(struct vxlan_softc *sc)
1721{
1722
1723	VXLAN_WLOCK(sc);
1724	sc->vxl_flags &= ~VXLAN_FLAG_INIT;
1725	wakeup(sc);
1726	VXLAN_WUNLOCK(sc);
1727}
1728
1729static void
1730vxlan_init(void *xsc)
1731{
1732	static const uint8_t empty_mac[ETHER_ADDR_LEN];
1733	struct vxlan_softc *sc;
1734	struct ifnet *ifp;
1735
1736	sc = xsc;
1737	ifp = sc->vxl_ifp;
1738
1739	sx_xlock(&vxlan_sx);
1740	VXLAN_WLOCK(sc);
1741	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1742		VXLAN_WUNLOCK(sc);
1743		sx_xunlock(&vxlan_sx);
1744		return;
1745	}
1746	sc->vxl_flags |= VXLAN_FLAG_INIT;
1747	VXLAN_WUNLOCK(sc);
1748
1749	if (vxlan_valid_init_config(sc) != 0)
1750		goto out;
1751
1752	if (vxlan_setup_socket(sc) != 0)
1753		goto out;
1754
1755#ifdef INET6
1756	vxlan_setup_zero_checksum_port(sc);
1757#endif
1758
1759	/* Initialize the default forwarding entry. */
1760	vxlan_ftable_entry_init(sc, &sc->vxl_default_fe, empty_mac,
1761	    &sc->vxl_dst_addr.sa, VXLAN_FE_FLAG_STATIC);
1762
1763	VXLAN_WLOCK(sc);
1764	ifp->if_drv_flags |= IFF_DRV_RUNNING;
1765	callout_reset(&sc->vxl_callout, vxlan_ftable_prune_period * hz,
1766	    vxlan_timer, sc);
1767	VXLAN_WUNLOCK(sc);
1768
1769	if_link_state_change(ifp, LINK_STATE_UP);
1770
1771	EVENTHANDLER_INVOKE(vxlan_start, ifp, sc->vxl_src_addr.in4.sin_family,
1772	    ntohs(sc->vxl_src_addr.in4.sin_port));
1773out:
1774	vxlan_init_complete(sc);
1775	sx_xunlock(&vxlan_sx);
1776}
1777
1778static void
1779vxlan_release(struct vxlan_softc *sc)
1780{
1781
1782	/*
1783	 * The softc may be destroyed as soon as we release our reference,
1784	 * so we cannot serialize the wakeup with the softc lock. We use a
1785	 * timeout in our sleeps so a missed wakeup is unfortunate but not
1786	 * fatal.
1787	 */
1788	if (VXLAN_RELEASE(sc) != 0)
1789		wakeup(sc);
1790}
1791
1792static void
1793vxlan_teardown_wait(struct vxlan_softc *sc)
1794{
1795
1796	VXLAN_LOCK_WASSERT(sc);
1797	while (sc->vxl_flags & VXLAN_FLAG_TEARDOWN)
1798		rm_sleep(sc, &sc->vxl_lock, 0, "vxltrn", hz);
1799}
1800
1801static void
1802vxlan_teardown_complete(struct vxlan_softc *sc)
1803{
1804
1805	VXLAN_WLOCK(sc);
1806	sc->vxl_flags &= ~VXLAN_FLAG_TEARDOWN;
1807	wakeup(sc);
1808	VXLAN_WUNLOCK(sc);
1809}
1810
1811static void
1812vxlan_teardown_locked(struct vxlan_softc *sc)
1813{
1814	struct ifnet *ifp;
1815	struct vxlan_socket *vso;
1816
1817	sx_assert(&vxlan_sx, SA_XLOCKED);
1818	VXLAN_LOCK_WASSERT(sc);
1819	MPASS(sc->vxl_flags & VXLAN_FLAG_TEARDOWN);
1820
1821	ifp = sc->vxl_ifp;
1822	ifp->if_flags &= ~IFF_UP;
1823	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
1824	callout_stop(&sc->vxl_callout);
1825	vso = sc->vxl_sock;
1826	sc->vxl_sock = NULL;
1827
1828	VXLAN_WUNLOCK(sc);
1829	if_link_state_change(ifp, LINK_STATE_DOWN);
1830	EVENTHANDLER_INVOKE(vxlan_stop, ifp, sc->vxl_src_addr.in4.sin_family,
1831	    ntohs(sc->vxl_src_addr.in4.sin_port));
1832
1833	if (vso != NULL) {
1834		vxlan_socket_remove_softc(vso, sc);
1835
1836		if (sc->vxl_vso_mc_index != -1) {
1837			vxlan_socket_mc_release_group_by_idx(vso,
1838			    sc->vxl_vso_mc_index);
1839			sc->vxl_vso_mc_index = -1;
1840		}
1841	}
1842
1843	VXLAN_WLOCK(sc);
1844	while (sc->vxl_refcnt != 0)
1845		rm_sleep(sc, &sc->vxl_lock, 0, "vxldrn", hz);
1846	VXLAN_WUNLOCK(sc);
1847
1848	callout_drain(&sc->vxl_callout);
1849
1850	vxlan_free_multicast(sc);
1851	if (vso != NULL)
1852		vxlan_socket_release(vso);
1853
1854	vxlan_teardown_complete(sc);
1855}
1856
1857static void
1858vxlan_teardown(struct vxlan_softc *sc)
1859{
1860
1861	sx_xlock(&vxlan_sx);
1862	VXLAN_WLOCK(sc);
1863	if (sc->vxl_flags & VXLAN_FLAG_TEARDOWN) {
1864		vxlan_teardown_wait(sc);
1865		VXLAN_WUNLOCK(sc);
1866		sx_xunlock(&vxlan_sx);
1867		return;
1868	}
1869
1870	sc->vxl_flags |= VXLAN_FLAG_TEARDOWN;
1871	vxlan_teardown_locked(sc);
1872	sx_xunlock(&vxlan_sx);
1873}
1874
1875static void
1876vxlan_ifdetach(struct vxlan_softc *sc, struct ifnet *ifp,
1877    struct vxlan_softc_head *list)
1878{
1879
1880	VXLAN_WLOCK(sc);
1881
1882	if (sc->vxl_mc_ifp != ifp)
1883		goto out;
1884	if (sc->vxl_flags & VXLAN_FLAG_TEARDOWN)
1885		goto out;
1886
1887	sc->vxl_flags |= VXLAN_FLAG_TEARDOWN;
1888	LIST_INSERT_HEAD(list, sc, vxl_ifdetach_list);
1889
1890out:
1891	VXLAN_WUNLOCK(sc);
1892}
1893
1894static void
1895vxlan_timer(void *xsc)
1896{
1897	struct vxlan_softc *sc;
1898
1899	sc = xsc;
1900	VXLAN_LOCK_WASSERT(sc);
1901
1902	vxlan_ftable_expire(sc);
1903	callout_schedule(&sc->vxl_callout, vxlan_ftable_prune_period * hz);
1904}
1905
1906static int
1907vxlan_ioctl_ifflags(struct vxlan_softc *sc)
1908{
1909	struct ifnet *ifp;
1910
1911	ifp = sc->vxl_ifp;
1912
1913	if (ifp->if_flags & IFF_UP) {
1914		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
1915			vxlan_init(sc);
1916	} else {
1917		if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1918			vxlan_teardown(sc);
1919	}
1920
1921	return (0);
1922}
1923
1924static int
1925vxlan_ctrl_get_config(struct vxlan_softc *sc, void *arg)
1926{
1927	struct rm_priotracker tracker;
1928	struct ifvxlancfg *cfg;
1929
1930	cfg = arg;
1931	bzero(cfg, sizeof(*cfg));
1932
1933	VXLAN_RLOCK(sc, &tracker);
1934	cfg->vxlc_vni = sc->vxl_vni;
1935	memcpy(&cfg->vxlc_local_sa, &sc->vxl_src_addr,
1936	    sizeof(union vxlan_sockaddr));
1937	memcpy(&cfg->vxlc_remote_sa, &sc->vxl_dst_addr,
1938	    sizeof(union vxlan_sockaddr));
1939	cfg->vxlc_mc_ifindex = sc->vxl_mc_ifindex;
1940	cfg->vxlc_ftable_cnt = sc->vxl_ftable_cnt;
1941	cfg->vxlc_ftable_max = sc->vxl_ftable_max;
1942	cfg->vxlc_ftable_timeout = sc->vxl_ftable_timeout;
1943	cfg->vxlc_port_min = sc->vxl_min_port;
1944	cfg->vxlc_port_max = sc->vxl_max_port;
1945	cfg->vxlc_learn = (sc->vxl_flags & VXLAN_FLAG_LEARN) != 0;
1946	cfg->vxlc_ttl = sc->vxl_ttl;
1947	VXLAN_RUNLOCK(sc, &tracker);
1948
1949#ifdef INET6
1950	if (VXLAN_SOCKADDR_IS_IPV6(&cfg->vxlc_local_sa))
1951		sa6_recoverscope(&cfg->vxlc_local_sa.in6);
1952	if (VXLAN_SOCKADDR_IS_IPV6(&cfg->vxlc_remote_sa))
1953		sa6_recoverscope(&cfg->vxlc_remote_sa.in6);
1954#endif
1955
1956	return (0);
1957}
1958
1959static int
1960vxlan_ctrl_set_vni(struct vxlan_softc *sc, void *arg)
1961{
1962	struct ifvxlancmd *cmd;
1963	int error;
1964
1965	cmd = arg;
1966
1967	if (vxlan_check_vni(cmd->vxlcmd_vni) != 0)
1968		return (EINVAL);
1969
1970	VXLAN_WLOCK(sc);
1971	if (vxlan_can_change_config(sc)) {
1972		sc->vxl_vni = cmd->vxlcmd_vni;
1973		error = 0;
1974	} else
1975		error = EBUSY;
1976	VXLAN_WUNLOCK(sc);
1977
1978	return (error);
1979}
1980
1981static int
1982vxlan_ctrl_set_local_addr(struct vxlan_softc *sc, void *arg)
1983{
1984	struct ifvxlancmd *cmd;
1985	union vxlan_sockaddr *vxlsa;
1986	int error;
1987
1988	cmd = arg;
1989	vxlsa = &cmd->vxlcmd_sa;
1990
1991	if (!VXLAN_SOCKADDR_IS_IPV46(vxlsa))
1992		return (EINVAL);
1993	if (vxlan_sockaddr_in_multicast(vxlsa) != 0)
1994		return (EINVAL);
1995	if (VXLAN_SOCKADDR_IS_IPV6(vxlsa)) {
1996		error = vxlan_sockaddr_in6_embedscope(vxlsa);
1997		if (error)
1998			return (error);
1999	}
2000
2001	VXLAN_WLOCK(sc);
2002	if (vxlan_can_change_config(sc)) {
2003		vxlan_sockaddr_in_copy(&sc->vxl_src_addr, &vxlsa->sa);
2004		vxlan_set_hwcaps(sc);
2005		error = 0;
2006	} else
2007		error = EBUSY;
2008	VXLAN_WUNLOCK(sc);
2009
2010	return (error);
2011}
2012
2013static int
2014vxlan_ctrl_set_remote_addr(struct vxlan_softc *sc, void *arg)
2015{
2016	struct ifvxlancmd *cmd;
2017	union vxlan_sockaddr *vxlsa;
2018	int error;
2019
2020	cmd = arg;
2021	vxlsa = &cmd->vxlcmd_sa;
2022
2023	if (!VXLAN_SOCKADDR_IS_IPV46(vxlsa))
2024		return (EINVAL);
2025	if (VXLAN_SOCKADDR_IS_IPV6(vxlsa)) {
2026		error = vxlan_sockaddr_in6_embedscope(vxlsa);
2027		if (error)
2028			return (error);
2029	}
2030
2031	VXLAN_WLOCK(sc);
2032	if (vxlan_can_change_config(sc)) {
2033		vxlan_sockaddr_in_copy(&sc->vxl_dst_addr, &vxlsa->sa);
2034		vxlan_setup_interface_hdrlen(sc);
2035		error = 0;
2036	} else
2037		error = EBUSY;
2038	VXLAN_WUNLOCK(sc);
2039
2040	return (error);
2041}
2042
2043static int
2044vxlan_ctrl_set_local_port(struct vxlan_softc *sc, void *arg)
2045{
2046	struct ifvxlancmd *cmd;
2047	int error;
2048
2049	cmd = arg;
2050
2051	if (cmd->vxlcmd_port == 0)
2052		return (EINVAL);
2053
2054	VXLAN_WLOCK(sc);
2055	if (vxlan_can_change_config(sc)) {
2056		sc->vxl_src_addr.in4.sin_port = htons(cmd->vxlcmd_port);
2057		error = 0;
2058	} else
2059		error = EBUSY;
2060	VXLAN_WUNLOCK(sc);
2061
2062	return (error);
2063}
2064
2065static int
2066vxlan_ctrl_set_remote_port(struct vxlan_softc *sc, void *arg)
2067{
2068	struct ifvxlancmd *cmd;
2069	int error;
2070
2071	cmd = arg;
2072
2073	if (cmd->vxlcmd_port == 0)
2074		return (EINVAL);
2075
2076	VXLAN_WLOCK(sc);
2077	if (vxlan_can_change_config(sc)) {
2078		sc->vxl_dst_addr.in4.sin_port = htons(cmd->vxlcmd_port);
2079		error = 0;
2080	} else
2081		error = EBUSY;
2082	VXLAN_WUNLOCK(sc);
2083
2084	return (error);
2085}
2086
2087static int
2088vxlan_ctrl_set_port_range(struct vxlan_softc *sc, void *arg)
2089{
2090	struct ifvxlancmd *cmd;
2091	uint16_t min, max;
2092	int error;
2093
2094	cmd = arg;
2095	min = cmd->vxlcmd_port_min;
2096	max = cmd->vxlcmd_port_max;
2097
2098	if (max < min)
2099		return (EINVAL);
2100
2101	VXLAN_WLOCK(sc);
2102	if (vxlan_can_change_config(sc)) {
2103		sc->vxl_min_port = min;
2104		sc->vxl_max_port = max;
2105		error = 0;
2106	} else
2107		error = EBUSY;
2108	VXLAN_WUNLOCK(sc);
2109
2110	return (error);
2111}
2112
2113static int
2114vxlan_ctrl_set_ftable_timeout(struct vxlan_softc *sc, void *arg)
2115{
2116	struct ifvxlancmd *cmd;
2117	int error;
2118
2119	cmd = arg;
2120
2121	VXLAN_WLOCK(sc);
2122	if (vxlan_check_ftable_timeout(cmd->vxlcmd_ftable_timeout) == 0) {
2123		sc->vxl_ftable_timeout = cmd->vxlcmd_ftable_timeout;
2124		error = 0;
2125	} else
2126		error = EINVAL;
2127	VXLAN_WUNLOCK(sc);
2128
2129	return (error);
2130}
2131
2132static int
2133vxlan_ctrl_set_ftable_max(struct vxlan_softc *sc, void *arg)
2134{
2135	struct ifvxlancmd *cmd;
2136	int error;
2137
2138	cmd = arg;
2139
2140	VXLAN_WLOCK(sc);
2141	if (vxlan_check_ftable_max(cmd->vxlcmd_ftable_max) == 0) {
2142		sc->vxl_ftable_max = cmd->vxlcmd_ftable_max;
2143		error = 0;
2144	} else
2145		error = EINVAL;
2146	VXLAN_WUNLOCK(sc);
2147
2148	return (error);
2149}
2150
2151static int
2152vxlan_ctrl_set_multicast_if(struct vxlan_softc *sc, void *arg)
2153{
2154	struct ifvxlancmd *cmd;
2155	int error;
2156
2157	cmd = arg;
2158
2159	VXLAN_WLOCK(sc);
2160	if (vxlan_can_change_config(sc)) {
2161		strlcpy(sc->vxl_mc_ifname, cmd->vxlcmd_ifname, IFNAMSIZ);
2162		vxlan_set_hwcaps(sc);
2163		error = 0;
2164	} else
2165		error = EBUSY;
2166	VXLAN_WUNLOCK(sc);
2167
2168	return (error);
2169}
2170
2171static int
2172vxlan_ctrl_set_ttl(struct vxlan_softc *sc, void *arg)
2173{
2174	struct ifvxlancmd *cmd;
2175	int error;
2176
2177	cmd = arg;
2178
2179	VXLAN_WLOCK(sc);
2180	if (vxlan_check_ttl(cmd->vxlcmd_ttl) == 0) {
2181		sc->vxl_ttl = cmd->vxlcmd_ttl;
2182		if (sc->vxl_im4o != NULL)
2183			sc->vxl_im4o->imo_multicast_ttl = sc->vxl_ttl;
2184		if (sc->vxl_im6o != NULL)
2185			sc->vxl_im6o->im6o_multicast_hlim = sc->vxl_ttl;
2186		error = 0;
2187	} else
2188		error = EINVAL;
2189	VXLAN_WUNLOCK(sc);
2190
2191	return (error);
2192}
2193
2194static int
2195vxlan_ctrl_set_learn(struct vxlan_softc *sc, void *arg)
2196{
2197	struct ifvxlancmd *cmd;
2198
2199	cmd = arg;
2200
2201	VXLAN_WLOCK(sc);
2202	if (cmd->vxlcmd_flags & VXLAN_CMD_FLAG_LEARN)
2203		sc->vxl_flags |= VXLAN_FLAG_LEARN;
2204	else
2205		sc->vxl_flags &= ~VXLAN_FLAG_LEARN;
2206	VXLAN_WUNLOCK(sc);
2207
2208	return (0);
2209}
2210
2211static int
2212vxlan_ctrl_ftable_entry_add(struct vxlan_softc *sc, void *arg)
2213{
2214	union vxlan_sockaddr vxlsa;
2215	struct ifvxlancmd *cmd;
2216	struct vxlan_ftable_entry *fe;
2217	int error;
2218
2219	cmd = arg;
2220	vxlsa = cmd->vxlcmd_sa;
2221
2222	if (!VXLAN_SOCKADDR_IS_IPV46(&vxlsa))
2223		return (EINVAL);
2224	if (vxlan_sockaddr_in_any(&vxlsa) != 0)
2225		return (EINVAL);
2226	if (vxlan_sockaddr_in_multicast(&vxlsa) != 0)
2227		return (EINVAL);
2228	/* BMV: We could support both IPv4 and IPv6 later. */
2229	if (vxlsa.sa.sa_family != sc->vxl_dst_addr.sa.sa_family)
2230		return (EAFNOSUPPORT);
2231
2232	if (VXLAN_SOCKADDR_IS_IPV6(&vxlsa)) {
2233		error = vxlan_sockaddr_in6_embedscope(&vxlsa);
2234		if (error)
2235			return (error);
2236	}
2237
2238	fe = vxlan_ftable_entry_alloc();
2239	if (fe == NULL)
2240		return (ENOMEM);
2241
2242	if (vxlsa.in4.sin_port == 0)
2243		vxlsa.in4.sin_port = sc->vxl_dst_addr.in4.sin_port;
2244
2245	vxlan_ftable_entry_init(sc, fe, cmd->vxlcmd_mac, &vxlsa.sa,
2246	    VXLAN_FE_FLAG_STATIC);
2247
2248	VXLAN_WLOCK(sc);
2249	error = vxlan_ftable_entry_insert(sc, fe);
2250	VXLAN_WUNLOCK(sc);
2251
2252	if (error)
2253		vxlan_ftable_entry_free(fe);
2254
2255	return (error);
2256}
2257
2258static int
2259vxlan_ctrl_ftable_entry_rem(struct vxlan_softc *sc, void *arg)
2260{
2261	struct ifvxlancmd *cmd;
2262	struct vxlan_ftable_entry *fe;
2263	int error;
2264
2265	cmd = arg;
2266
2267	VXLAN_WLOCK(sc);
2268	fe = vxlan_ftable_entry_lookup(sc, cmd->vxlcmd_mac);
2269	if (fe != NULL) {
2270		vxlan_ftable_entry_destroy(sc, fe);
2271		error = 0;
2272	} else
2273		error = ENOENT;
2274	VXLAN_WUNLOCK(sc);
2275
2276	return (error);
2277}
2278
2279static int
2280vxlan_ctrl_flush(struct vxlan_softc *sc, void *arg)
2281{
2282	struct ifvxlancmd *cmd;
2283	int all;
2284
2285	cmd = arg;
2286	all = cmd->vxlcmd_flags & VXLAN_CMD_FLAG_FLUSH_ALL;
2287
2288	VXLAN_WLOCK(sc);
2289	vxlan_ftable_flush(sc, all);
2290	VXLAN_WUNLOCK(sc);
2291
2292	return (0);
2293}
2294
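/*
 * Dispatch a SIOCGDRVSPEC/SIOCSDRVSPEC request: validate the command index
 * against vxlan_control_table, require PRIV_NET_VXLAN for privileged
 * commands, verify the argument size, copy the argument in and/or out as
 * the table entry's flags dictate, and invoke the command handler.
 */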
2295static int
2296vxlan_ioctl_drvspec(struct vxlan_softc *sc, struct ifdrv *ifd, int get)
2297{
2298	const struct vxlan_control *vc;
2299	union {
2300		struct ifvxlancfg	cfg;
2301		struct ifvxlancmd	cmd;
2302	} args;
2303	int out, error;
2304
2305	if (ifd->ifd_cmd >= vxlan_control_table_size)
2306		return (EINVAL);
2307
2308	bzero(&args, sizeof(args));
2309	vc = &vxlan_control_table[ifd->ifd_cmd];
2310	out = (vc->vxlc_flags & VXLAN_CTRL_FLAG_COPYOUT) != 0;
2311
2312	if ((get != 0 && out == 0) || (get == 0 && out != 0))
2313		return (EINVAL);
2314
2315	if (vc->vxlc_flags & VXLAN_CTRL_FLAG_SUSER) {
2316		error = priv_check(curthread, PRIV_NET_VXLAN);
2317		if (error)
2318			return (error);
2319	}
2320
2321	if (ifd->ifd_len != vc->vxlc_argsize ||
2322	    ifd->ifd_len > sizeof(args))
2323		return (EINVAL);
2324
2325	if (vc->vxlc_flags & VXLAN_CTRL_FLAG_COPYIN) {
2326		error = copyin(ifd->ifd_data, &args, ifd->ifd_len);
2327		if (error)
2328			return (error);
2329	}
2330
2331	error = vc->vxlc_func(sc, &args);
2332	if (error)
2333		return (error);
2334
2335	if (vc->vxlc_flags & VXLAN_CTRL_FLAG_COPYOUT) {
2336		error = copyout(&args, ifd->ifd_data, ifd->ifd_len);
2337		if (error)
2338			return (error);
2339	}
2340
2341	return (0);
2342}
2343
2344static int
2345vxlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2346{
2347	struct rm_priotracker tracker;
2348	struct vxlan_softc *sc;
2349	struct ifreq *ifr;
2350	struct ifdrv *ifd;
2351	int error;
2352
2353	sc = ifp->if_softc;
2354	ifr = (struct ifreq *) data;
2355	ifd = (struct ifdrv *) data;
2356
2357	error = 0;
2358
2359	switch (cmd) {
2360	case SIOCADDMULTI:
2361	case SIOCDELMULTI:
2362		break;
2363
2364	case SIOCGDRVSPEC:
2365	case SIOCSDRVSPEC:
2366		error = vxlan_ioctl_drvspec(sc, ifd, cmd == SIOCGDRVSPEC);
2367		break;
2368
2369	case SIOCSIFFLAGS:
2370		error = vxlan_ioctl_ifflags(sc);
2371		break;
2372
2373	case SIOCSIFMEDIA:
2374	case SIOCGIFMEDIA:
2375		error = ifmedia_ioctl(ifp, ifr, &sc->vxl_media, cmd);
2376		break;
2377
2378	case SIOCSIFMTU:
2379		if (ifr->ifr_mtu < ETHERMIN || ifr->ifr_mtu > VXLAN_MAX_MTU) {
2380			error = EINVAL;
2381		} else {
2382			VXLAN_WLOCK(sc);
2383			ifp->if_mtu = ifr->ifr_mtu;
2384			sc->vxl_flags |= VXLAN_FLAG_USER_MTU;
2385			VXLAN_WUNLOCK(sc);
2386		}
2387		break;
2388
2389	case SIOCSIFCAP:
2390		VXLAN_WLOCK(sc);
2391		error = vxlan_set_reqcap(sc, ifp, ifr->ifr_reqcap);
2392		if (error == 0)
2393			vxlan_set_hwcaps(sc);
2394		VXLAN_WUNLOCK(sc);
2395		break;
2396
2397	case SIOCGTUNFIB:
2398		VXLAN_RLOCK(sc, &tracker);
2399		ifr->ifr_fib = sc->vxl_fibnum;
2400		VXLAN_RUNLOCK(sc, &tracker);
2401		break;
2402
2403	case SIOCSTUNFIB:
2404		if ((error = priv_check(curthread, PRIV_NET_VXLAN)) != 0)
2405			break;
2406
2407		if (ifr->ifr_fib >= rt_numfibs)
2408			error = EINVAL;
2409		else {
2410			VXLAN_WLOCK(sc);
2411			sc->vxl_fibnum = ifr->ifr_fib;
2412			VXLAN_WUNLOCK(sc);
2413		}
2414		break;
2415
2416	default:
2417		error = ether_ioctl(ifp, cmd, data);
2418		break;
2419	}
2420
2421	return (error);
2422}
2423
2424#if defined(INET) || defined(INET6)
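/*
 * Select the outer UDP source port from [vxl_min_port, vxl_max_port].  The
 * port is derived from the inner flow: the mbuf's flowid when available,
 * otherwise a Jenkins hash of the inner Ethernet header.  Hashing the inner
 * flow into the source port (as RFC 7348 suggests) keeps a flow on a single
 * port while spreading distinct flows for ECMP/RSS on the underlay.
 */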
2425static uint16_t
2426vxlan_pick_source_port(struct vxlan_softc *sc, struct mbuf *m)
2427{
2428	int range;
2429	uint32_t hash;
2430
2431	range = sc->vxl_max_port - sc->vxl_min_port + 1;
2432
2433	if (M_HASHTYPE_ISHASH(m))
2434		hash = m->m_pkthdr.flowid;
2435	else
2436		hash = jenkins_hash(m->m_data, ETHER_HDR_LEN,
2437		    sc->vxl_port_hash_key);
2438
2439	return (sc->vxl_min_port + (hash % range));
2440}
2441
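/*
 * Fill in the outer UDP header and the VXLAN header at offset ipoff; the
 * caller has already prepended space for them.  The UDP checksum is left at
 * zero here and, for IPv6, is filled in later by vxlan_encap6() when one is
 * required.
 */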
2442static void
2443vxlan_encap_header(struct vxlan_softc *sc, struct mbuf *m, int ipoff,
2444    uint16_t srcport, uint16_t dstport)
2445{
2446	struct vxlanudphdr *hdr;
2447	struct udphdr *udph;
2448	struct vxlan_header *vxh;
2449	int len;
2450
2451	len = m->m_pkthdr.len - ipoff;
2452	MPASS(len >= sizeof(struct vxlanudphdr));
2453	hdr = mtodo(m, ipoff);
2454
2455	udph = &hdr->vxlh_udp;
2456	udph->uh_sport = srcport;
2457	udph->uh_dport = dstport;
2458	udph->uh_ulen = htons(len);
2459	udph->uh_sum = 0;
2460
2461	vxh = &hdr->vxlh_hdr;
2462	vxh->vxlh_flags = htonl(VXLAN_HDR_FLAGS_VALID_VNI);
2463	vxh->vxlh_vni = htonl(sc->vxl_vni << VXLAN_HDR_VNI_SHIFT);
2464}
2465#endif
2466
2467#if defined(INET6) || defined(INET)
2468/*
2469 * Return the CSUM_INNER_* equivalent of CSUM_* caps.
2470 */
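/*
 * For example, an inner IPv4 TCP segment marked CSUM_IP | CSUM_IP_TCP maps
 * to encap | CSUM_INNER_IP | CSUM_INNER_IP_TCP, with the callers passing
 * CSUM_ENCAP_VXLAN as the encap argument.
 */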
2471static uint32_t
2472csum_flags_to_inner_flags(uint32_t csum_flags_in, const uint32_t encap)
2473{
2474	uint32_t csum_flags = encap;
2475	const uint32_t v4 = CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP;
2476
2477	/*
2478	 * csum_flags can request either v4 or v6 offload, but not both.
2479	 * tcp_output always sets CSUM_TSO (i.e., both CSUM_IP_TSO and
2480	 * CSUM_IP6_TSO), so those bits cannot be used to detect the IP
2481	 * version.  Instead, the other checksum bits, which are always set
2482	 * alongside CSUM_TSO, are used to figure out the IP version.
2483	 */
2484	if (csum_flags_in & v4) {
2485		if (csum_flags_in & CSUM_IP)
2486			csum_flags |= CSUM_INNER_IP;
2487		if (csum_flags_in & CSUM_IP_UDP)
2488			csum_flags |= CSUM_INNER_IP_UDP;
2489		if (csum_flags_in & CSUM_IP_TCP)
2490			csum_flags |= CSUM_INNER_IP_TCP;
2491		if (csum_flags_in & CSUM_IP_TSO)
2492			csum_flags |= CSUM_INNER_IP_TSO;
2493	} else {
2494#ifdef INVARIANTS
2495		const uint32_t v6 = CSUM_IP6_UDP | CSUM_IP6_TCP;
2496
2497		MPASS((csum_flags_in & v6) != 0);
2498#endif
2499		if (csum_flags_in & CSUM_IP6_UDP)
2500			csum_flags |= CSUM_INNER_IP6_UDP;
2501		if (csum_flags_in & CSUM_IP6_TCP)
2502			csum_flags |= CSUM_INNER_IP6_TCP;
2503		if (csum_flags_in & CSUM_IP6_TSO)
2504			csum_flags |= CSUM_INNER_IP6_TSO;
2505	}
2506
2507	return (csum_flags);
2508}
2509#endif
2510
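/*
 * Encapsulate the inner Ethernet frame in IPv4/UDP/VXLAN and send it with
 * ip_output().  If the inner mbuf requests checksum or TSO offload, look up
 * the outbound interface and verify that it advertises the corresponding
 * CSUM_INNER_* capabilities; the packet is dropped rather than sent with an
 * incomplete checksum.
 */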
2511static int
2512vxlan_encap4(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa,
2513    struct mbuf *m)
2514{
2515#ifdef INET
2516	struct ifnet *ifp;
2517	struct ip *ip;
2518	struct in_addr srcaddr, dstaddr;
2519	uint16_t srcport, dstport;
2520	int plen, mcast, error;
2521	struct route route, *ro;
2522	struct sockaddr_in *sin;
2523	uint32_t csum_flags;
2524
2525	NET_EPOCH_ASSERT();
2526
2527	ifp = sc->vxl_ifp;
2528	srcaddr = sc->vxl_src_addr.in4.sin_addr;
2529	srcport = vxlan_pick_source_port(sc, m);
2530	dstaddr = fvxlsa->in4.sin_addr;
2531	dstport = fvxlsa->in4.sin_port;
2532
2533	plen = m->m_pkthdr.len;
2534	M_PREPEND(m, sizeof(struct ip) + sizeof(struct vxlanudphdr),
2535	    M_NOWAIT);
2536	if (m == NULL) {
2537		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2538		return (ENOBUFS);
2539	}
2540
2541	ip = mtod(m, struct ip *);
2542	ip->ip_tos = 0;
2543	ip->ip_len = htons(m->m_pkthdr.len);
2544	ip->ip_off = 0;
2545	ip->ip_ttl = sc->vxl_ttl;
2546	ip->ip_p = IPPROTO_UDP;
2547	ip->ip_sum = 0;
2548	ip->ip_src = srcaddr;
2549	ip->ip_dst = dstaddr;
2550
2551	vxlan_encap_header(sc, m, sizeof(struct ip), srcport, dstport);
2552
2553	mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
2554	m->m_flags &= ~(M_MCAST | M_BCAST);
2555
2556	m->m_pkthdr.csum_flags &= CSUM_FLAGS_TX;
2557	if (m->m_pkthdr.csum_flags != 0) {
2558		/*
2559		 * HW checksum (L3 and/or L4) or TSO has been requested.  Look
2560		 * up the ifnet for the outbound route and verify that the
2561		 * outbound ifnet can perform the requested operation on the
2562		 * inner frame.
2563		 */
2564		bzero(&route, sizeof(route));
2565		ro = &route;
2566		sin = (struct sockaddr_in *)&ro->ro_dst;
2567		sin->sin_family = AF_INET;
2568		sin->sin_len = sizeof(*sin);
2569		sin->sin_addr = ip->ip_dst;
2570		ro->ro_nh = fib4_lookup(M_GETFIB(m), ip->ip_dst, 0, NHR_NONE,
2571		    0);
2572		if (ro->ro_nh == NULL) {
2573			m_freem(m);
2574			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2575			return (EHOSTUNREACH);
2576		}
2577
2578		csum_flags = csum_flags_to_inner_flags(m->m_pkthdr.csum_flags,
2579		    CSUM_ENCAP_VXLAN);
2580		if ((csum_flags & ro->ro_nh->nh_ifp->if_hwassist) !=
2581		    csum_flags) {
2582			if (ppsratecheck(&sc->err_time, &sc->err_pps, 1)) {
2583				const struct ifnet *nh_ifp = ro->ro_nh->nh_ifp;
2584
2585				if_printf(ifp, "interface %s is missing hwcaps "
2586				    "0x%08x, csum_flags 0x%08x -> 0x%08x, "
2587				    "hwassist 0x%08x\n", nh_ifp->if_xname,
2588				    csum_flags & ~(uint32_t)nh_ifp->if_hwassist,
2589				    m->m_pkthdr.csum_flags, csum_flags,
2590				    (uint32_t)nh_ifp->if_hwassist);
2591			}
2592			m_freem(m);
2593			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2594			return (ENXIO);
2595		}
2596		m->m_pkthdr.csum_flags = csum_flags;
2597		if (csum_flags &
2598		    (CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP6_UDP |
2599		    CSUM_INNER_IP_TCP | CSUM_INNER_IP6_TCP)) {
2600			counter_u64_add(sc->vxl_stats.txcsum, 1);
2601			if (csum_flags & CSUM_INNER_TSO)
2602				counter_u64_add(sc->vxl_stats.tso, 1);
2603		}
2604	} else
2605		ro = NULL;
2606	error = ip_output(m, NULL, ro, 0, sc->vxl_im4o, NULL);
2607	if (error == 0) {
2608		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
2609		if_inc_counter(ifp, IFCOUNTER_OBYTES, plen);
2610		if (mcast != 0)
2611			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
2612	} else
2613		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2614
2615	return (error);
2616#else
2617	m_freem(m);
2618	return (ENOTSUP);
2619#endif
2620}
2621
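/*
 * IPv6 counterpart of vxlan_encap4().  Additionally, when no offload is
 * requested, the outer UDP checksum is set up over the IPv6 pseudo-header
 * (IPv6 normally requires a nonzero UDP checksum) unless the destination
 * port matches the configured zero-checksum port.
 */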
2622static int
2623vxlan_encap6(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa,
2624    struct mbuf *m)
2625{
2626#ifdef INET6
2627	struct ifnet *ifp;
2628	struct ip6_hdr *ip6;
2629	const struct in6_addr *srcaddr, *dstaddr;
2630	uint16_t srcport, dstport;
2631	int plen, mcast, error;
2632	struct route_in6 route, *ro;
2633	struct sockaddr_in6 *sin6;
2634	uint32_t csum_flags;
2635
2636	NET_EPOCH_ASSERT();
2637
2638	ifp = sc->vxl_ifp;
2639	srcaddr = &sc->vxl_src_addr.in6.sin6_addr;
2640	srcport = vxlan_pick_source_port(sc, m);
2641	dstaddr = &fvxlsa->in6.sin6_addr;
2642	dstport = fvxlsa->in6.sin6_port;
2643
2644	plen = m->m_pkthdr.len;
2645	M_PREPEND(m, sizeof(struct ip6_hdr) + sizeof(struct vxlanudphdr),
2646	    M_NOWAIT);
2647	if (m == NULL) {
2648		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2649		return (ENOBUFS);
2650	}
2651
2652	ip6 = mtod(m, struct ip6_hdr *);
2653	ip6->ip6_flow = 0;		/* BMV: Keep in forwarding entry? */
2654	ip6->ip6_vfc = IPV6_VERSION;
2655	ip6->ip6_plen = 0;
2656	ip6->ip6_nxt = IPPROTO_UDP;
2657	ip6->ip6_hlim = sc->vxl_ttl;
2658	ip6->ip6_src = *srcaddr;
2659	ip6->ip6_dst = *dstaddr;
2660
2661	vxlan_encap_header(sc, m, sizeof(struct ip6_hdr), srcport, dstport);
2662
2663	mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
2664	m->m_flags &= ~(M_MCAST | M_BCAST);
2665
2666	ro = NULL;
2667	m->m_pkthdr.csum_flags &= CSUM_FLAGS_TX;
2668	if (m->m_pkthdr.csum_flags != 0) {
2669		/*
2670		 * HW checksum (L3 and/or L4) or TSO has been requested.  Look
2671		 * up the ifnet for the outbound route and verify that the
2672		 * outbound ifnet can perform the requested operation on the
2673		 * inner frame.
2674		 */
2675		bzero(&route, sizeof(route));
2676		ro = &route;
2677		sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
2678		sin6->sin6_family = AF_INET6;
2679		sin6->sin6_len = sizeof(*sin6);
2680		sin6->sin6_addr = ip6->ip6_dst;
2681		ro->ro_nh = fib6_lookup(M_GETFIB(m), &ip6->ip6_dst, 0,
2682		    NHR_NONE, 0);
2683		if (ro->ro_nh == NULL) {
2684			m_freem(m);
2685			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2686			return (EHOSTUNREACH);
2687		}
2688
2689		csum_flags = csum_flags_to_inner_flags(m->m_pkthdr.csum_flags,
2690		    CSUM_ENCAP_VXLAN);
2691		if ((csum_flags & ro->ro_nh->nh_ifp->if_hwassist) !=
2692		    csum_flags) {
2693			if (ppsratecheck(&sc->err_time, &sc->err_pps, 1)) {
2694				const struct ifnet *nh_ifp = ro->ro_nh->nh_ifp;
2695
2696				if_printf(ifp, "interface %s is missing hwcaps "
2697				    "0x%08x, csum_flags 0x%08x -> 0x%08x, "
2698				    "hwassist 0x%08x\n", nh_ifp->if_xname,
2699				    csum_flags & ~(uint32_t)nh_ifp->if_hwassist,
2700				    m->m_pkthdr.csum_flags, csum_flags,
2701				    (uint32_t)nh_ifp->if_hwassist);
2702			}
2703			m_freem(m);
2704			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2705			return (ENXIO);
2706		}
2707		m->m_pkthdr.csum_flags = csum_flags;
2708		if (csum_flags &
2709		    (CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP6_UDP |
2710		    CSUM_INNER_IP_TCP | CSUM_INNER_IP6_TCP)) {
2711			counter_u64_add(sc->vxl_stats.txcsum, 1);
2712			if (csum_flags & CSUM_INNER_TSO)
2713				counter_u64_add(sc->vxl_stats.tso, 1);
2714		}
2715	} else if (ntohs(dstport) != V_zero_checksum_port) {
2716		struct udphdr *hdr = mtodo(m, sizeof(struct ip6_hdr));
2717
2718		hdr->uh_sum = in6_cksum_pseudo(ip6,
2719		    m->m_pkthdr.len - sizeof(struct ip6_hdr), IPPROTO_UDP, 0);
2720		m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
2721		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
2722	}
2723	error = ip6_output(m, NULL, ro, 0, sc->vxl_im6o, NULL, NULL);
2724	if (error == 0) {
2725		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
2726		if_inc_counter(ifp, IFCOUNTER_OBYTES, plen);
2727		if (mcast != 0)
2728			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
2729	} else
2730		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2731
2732	return (error);
2733#else
2734	m_freem(m);
2735	return (ENOTSUP);
2736#endif
2737}
2738
2739#define MTAG_VXLAN_LOOP	0x7876706c /* vxlp */
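/*
 * if_transmit method: tap BPF, tag the mbuf to bound encapsulation nesting,
 * look up the destination MAC in the forwarding table (falling back to the
 * default entry for unknown unicast and broadcast/multicast), and
 * encapsulate over IPv4 or IPv6 according to the chosen remote address.
 */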
2740static int
2741vxlan_transmit(struct ifnet *ifp, struct mbuf *m)
2742{
2743	struct rm_priotracker tracker;
2744	union vxlan_sockaddr vxlsa;
2745	struct vxlan_softc *sc;
2746	struct vxlan_ftable_entry *fe;
2747	struct ifnet *mcifp;
2748	struct ether_header *eh;
2749	int ipv4, error;
2750
2751	sc = ifp->if_softc;
2752	eh = mtod(m, struct ether_header *);
2753	fe = NULL;
2754	mcifp = NULL;
2755
2756	ETHER_BPF_MTAP(ifp, m);
2757
2758	VXLAN_RLOCK(sc, &tracker);
2759	M_SETFIB(m, sc->vxl_fibnum);
2760	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
2761		VXLAN_RUNLOCK(sc, &tracker);
2762		m_freem(m);
2763		return (ENETDOWN);
2764	}
2765	if (__predict_false(if_tunnel_check_nesting(ifp, m, MTAG_VXLAN_LOOP,
2766	    max_vxlan_nesting) != 0)) {
2767		VXLAN_RUNLOCK(sc, &tracker);
2768		m_freem(m);
2769		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2770		return (ELOOP);
2771	}
2772
2773	if ((m->m_flags & (M_BCAST | M_MCAST)) == 0)
2774		fe = vxlan_ftable_entry_lookup(sc, eh->ether_dhost);
2775	if (fe == NULL)
2776		fe = &sc->vxl_default_fe;
2777	vxlan_sockaddr_copy(&vxlsa, &fe->vxlfe_raddr.sa);
2778
2779	ipv4 = VXLAN_SOCKADDR_IS_IPV4(&vxlsa) != 0;
2780	if (vxlan_sockaddr_in_multicast(&vxlsa) != 0)
2781		mcifp = vxlan_multicast_if_ref(sc, ipv4);
2782
2783	VXLAN_ACQUIRE(sc);
2784	VXLAN_RUNLOCK(sc, &tracker);
2785
2786	if (ipv4 != 0)
2787		error = vxlan_encap4(sc, &vxlsa, m);
2788	else
2789		error = vxlan_encap6(sc, &vxlsa, m);
2790
2791	vxlan_release(sc);
2792	if (mcifp != NULL)
2793		if_rele(mcifp);
2794
2795	return (error);
2796}
2797
2798static void
2799vxlan_qflush(struct ifnet *ifp __unused)
2800{
2801}
2802
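/*
 * UDP tunneling callback for packets arriving on a vxlan socket.  Validates
 * the VXLAN header, strips the outer headers, and hands the inner frame to
 * vxlan_input() for the matching VNI.  The packet is always consumed here
 * (freed if not delivered), hence the unconditional true return.
 */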
2803static bool
2804vxlan_rcv_udp_packet(struct mbuf *m, int offset, struct inpcb *inpcb,
2805    const struct sockaddr *srcsa, void *xvso)
2806{
2807	struct vxlan_socket *vso;
2808	struct vxlan_header *vxh, vxlanhdr;
2809	uint32_t vni;
2810	int error __unused;
2811
2812	M_ASSERTPKTHDR(m);
2813	vso = xvso;
2814	offset += sizeof(struct udphdr);
2815
2816	if (m->m_pkthdr.len < offset + sizeof(struct vxlan_header))
2817		goto out;
2818
2819	if (__predict_false(m->m_len < offset + sizeof(struct vxlan_header))) {
2820		m_copydata(m, offset, sizeof(struct vxlan_header),
2821		    (caddr_t) &vxlanhdr);
2822		vxh = &vxlanhdr;
2823	} else
2824		vxh = mtodo(m, offset);
2825
2826	/*
2827	 * Drop if there is a reserved bit set in either the flags or VNI
2828	 * fields of the header. This goes against the specification, but
2829	 * a bit set may indicate an unsupported new feature. This matches
2830	 * the behavior of the Linux implementation.
2831	 */
2832	if (vxh->vxlh_flags != htonl(VXLAN_HDR_FLAGS_VALID_VNI) ||
2833	    vxh->vxlh_vni & ~VXLAN_VNI_MASK)
2834		goto out;
2835
2836	vni = ntohl(vxh->vxlh_vni) >> VXLAN_HDR_VNI_SHIFT;
2837
2838	/* Adjust to the start of the inner Ethernet frame. */
2839	m_adj_decap(m, offset + sizeof(struct vxlan_header));
2840
2841	error = vxlan_input(vso, vni, &m, srcsa);
2842	MPASS(error != 0 || m == NULL);
2843
2844out:
2845	if (m != NULL)
2846		m_freem(m);
2847
2848	return (true);
2849}
2850
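/*
 * Deliver a decapsulated frame to the vxlan interface registered for this
 * VNI: learn the source MAC when learning is enabled, translate any inner
 * hardware checksum results to ordinary receive checksum flags when RXCSUM
 * is enabled, and pass the frame to if_input.  On error the mbuf is left in
 * *m0 for the caller to free, unless it was already consumed, in which case
 * *m0 is set to NULL.
 */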
2851static int
2852vxlan_input(struct vxlan_socket *vso, uint32_t vni, struct mbuf **m0,
2853    const struct sockaddr *sa)
2854{
2855	struct vxlan_softc *sc;
2856	struct ifnet *ifp;
2857	struct mbuf *m;
2858	struct ether_header *eh;
2859	int error;
2860
2861	m = *m0;
2862
2863	if (m->m_pkthdr.len < ETHER_HDR_LEN)
2864		return (EINVAL);
2865
2866	sc = vxlan_socket_lookup_softc(vso, vni);
2867	if (sc == NULL)
2868		return (ENOENT);
2869
2870	ifp = sc->vxl_ifp;
2871	if (m->m_len < ETHER_HDR_LEN &&
2872	    (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
2873		*m0 = NULL;
2874		error = ENOBUFS;
2875		goto out;
2876	}
2877	eh = mtod(m, struct ether_header *);
2878
2879	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
2880		error = ENETDOWN;
2881		goto out;
2882	} else if (ifp == m->m_pkthdr.rcvif) {
2883		/* XXX Does not catch more complex loops. */
2884		error = EDEADLK;
2885		goto out;
2886	}
2887
2888	if (sc->vxl_flags & VXLAN_FLAG_LEARN)
2889		vxlan_ftable_learn(sc, sa, eh->ether_shost);
2890
2891	m_clrprotoflags(m);
2892	m->m_pkthdr.rcvif = ifp;
2893	M_SETFIB(m, ifp->if_fib);
2894	if (((ifp->if_capenable & IFCAP_RXCSUM &&
2895	    m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC) ||
2896	    (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
2897	    !(m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC)))) {
2898		uint32_t csum_flags = 0;
2899
2900		if (m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC)
2901			csum_flags |= CSUM_L3_CALC;
2902		if (m->m_pkthdr.csum_flags & CSUM_INNER_L3_VALID)
2903			csum_flags |= CSUM_L3_VALID;
2904		if (m->m_pkthdr.csum_flags & CSUM_INNER_L4_CALC)
2905			csum_flags |= CSUM_L4_CALC;
2906		if (m->m_pkthdr.csum_flags & CSUM_INNER_L4_VALID)
2907			csum_flags |= CSUM_L4_VALID;
2908		m->m_pkthdr.csum_flags = csum_flags;
2909		counter_u64_add(sc->vxl_stats.rxcsum, 1);
2910	} else {
2911		/* clear everything */
2912		m->m_pkthdr.csum_flags = 0;
2913		m->m_pkthdr.csum_data = 0;
2914	}
2915
2916	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
2917	(*ifp->if_input)(ifp, m);
2918	*m0 = NULL;
2919	error = 0;
2920
2921out:
2922	vxlan_release(sc);
2923	return (error);
2924}
2925
2926static int
2927vxlan_stats_alloc(struct vxlan_softc *sc)
2928{
2929	struct vxlan_statistics *stats = &sc->vxl_stats;
2930
2931	stats->txcsum = counter_u64_alloc(M_WAITOK);
2932	if (stats->txcsum == NULL)
2933		goto failed;
2934
2935	stats->tso = counter_u64_alloc(M_WAITOK);
2936	if (stats->tso == NULL)
2937		goto failed;
2938
2939	stats->rxcsum = counter_u64_alloc(M_WAITOK);
2940	if (stats->rxcsum == NULL)
2941		goto failed;
2942
2943	return (0);
2944failed:
2945	vxlan_stats_free(sc);
2946	return (ENOMEM);
2947}
2948
2949static void
2950vxlan_stats_free(struct vxlan_softc *sc)
2951{
2952	struct vxlan_statistics *stats = &sc->vxl_stats;
2953
2954	if (stats->txcsum != NULL) {
2955		counter_u64_free(stats->txcsum);
2956		stats->txcsum = NULL;
2957	}
2958	if (stats->tso != NULL) {
2959		counter_u64_free(stats->tso);
2960		stats->tso = NULL;
2961	}
2962	if (stats->rxcsum != NULL) {
2963		counter_u64_free(stats->rxcsum);
2964		stats->rxcsum = NULL;
2965	}
2966}
2967
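/*
 * Defaults applied to a new interface: learning enabled, VNI left unset
 * (VXLAN_VNI_MAX), default IP TTL, the IANA-assigned VXLAN port (or the
 * legacy port if the per-unit legacy_port tunable is set), the system's
 * ephemeral port range for source-port hashing, and the stock forwarding
 * table limits.
 */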
2968static void
2969vxlan_set_default_config(struct vxlan_softc *sc)
2970{
2971
2972	sc->vxl_flags |= VXLAN_FLAG_LEARN;
2973
2974	sc->vxl_vni = VXLAN_VNI_MAX;
2975	sc->vxl_ttl = IPDEFTTL;
2976
2977	if (!vxlan_tunable_int(sc, "legacy_port", vxlan_legacy_port)) {
2978		sc->vxl_src_addr.in4.sin_port = htons(VXLAN_PORT);
2979		sc->vxl_dst_addr.in4.sin_port = htons(VXLAN_PORT);
2980	} else {
2981		sc->vxl_src_addr.in4.sin_port = htons(VXLAN_LEGACY_PORT);
2982		sc->vxl_dst_addr.in4.sin_port = htons(VXLAN_LEGACY_PORT);
2983	}
2984
2985	sc->vxl_min_port = V_ipport_firstauto;
2986	sc->vxl_max_port = V_ipport_lastauto;
2987
2988	sc->vxl_ftable_max = VXLAN_FTABLE_MAX;
2989	sc->vxl_ftable_timeout = VXLAN_FTABLE_TIMEOUT;
2990}
2991
2992static int
2993vxlan_set_user_config(struct vxlan_softc *sc, struct ifvxlanparam *vxlp)
2994{
2995
2996#ifndef INET
2997	if (vxlp->vxlp_with & (VXLAN_PARAM_WITH_LOCAL_ADDR4 |
2998	    VXLAN_PARAM_WITH_REMOTE_ADDR4))
2999		return (EAFNOSUPPORT);
3000#endif
3001
3002#ifndef INET6
3003	if (vxlp->vxlp_with & (VXLAN_PARAM_WITH_LOCAL_ADDR6 |
3004	    VXLAN_PARAM_WITH_REMOTE_ADDR6))
3005		return (EAFNOSUPPORT);
3006#else
3007	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LOCAL_ADDR6) {
3008		int error = vxlan_sockaddr_in6_embedscope(&vxlp->vxlp_local_sa);
3009		if (error)
3010			return (error);
3011	}
3012	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_REMOTE_ADDR6) {
3013		int error = vxlan_sockaddr_in6_embedscope(
3014		   &vxlp->vxlp_remote_sa);
3015		if (error)
3016			return (error);
3017	}
3018#endif
3019
3020	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_VNI) {
3021		if (vxlan_check_vni(vxlp->vxlp_vni) == 0)
3022			sc->vxl_vni = vxlp->vxlp_vni;
3023	}
3024
3025	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LOCAL_ADDR4) {
3026		sc->vxl_src_addr.in4.sin_len = sizeof(struct sockaddr_in);
3027		sc->vxl_src_addr.in4.sin_family = AF_INET;
3028		sc->vxl_src_addr.in4.sin_addr =
3029		    vxlp->vxlp_local_sa.in4.sin_addr;
3030	} else if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LOCAL_ADDR6) {
3031		sc->vxl_src_addr.in6.sin6_len = sizeof(struct sockaddr_in6);
3032		sc->vxl_src_addr.in6.sin6_family = AF_INET6;
3033		sc->vxl_src_addr.in6.sin6_addr =
3034		    vxlp->vxlp_local_sa.in6.sin6_addr;
3035	}
3036
3037	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_REMOTE_ADDR4) {
3038		sc->vxl_dst_addr.in4.sin_len = sizeof(struct sockaddr_in);
3039		sc->vxl_dst_addr.in4.sin_family = AF_INET;
3040		sc->vxl_dst_addr.in4.sin_addr =
3041		    vxlp->vxlp_remote_sa.in4.sin_addr;
3042	} else if (vxlp->vxlp_with & VXLAN_PARAM_WITH_REMOTE_ADDR6) {
3043		sc->vxl_dst_addr.in6.sin6_len = sizeof(struct sockaddr_in6);
3044		sc->vxl_dst_addr.in6.sin6_family = AF_INET6;
3045		sc->vxl_dst_addr.in6.sin6_addr =
3046		    vxlp->vxlp_remote_sa.in6.sin6_addr;
3047	}
3048
3049	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LOCAL_PORT)
3050		sc->vxl_src_addr.in4.sin_port = htons(vxlp->vxlp_local_port);
3051	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_REMOTE_PORT)
3052		sc->vxl_dst_addr.in4.sin_port = htons(vxlp->vxlp_remote_port);
3053
3054	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_PORT_RANGE) {
3055		if (vxlp->vxlp_min_port <= vxlp->vxlp_max_port) {
3056			sc->vxl_min_port = vxlp->vxlp_min_port;
3057			sc->vxl_max_port = vxlp->vxlp_max_port;
3058		}
3059	}
3060
3061	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_MULTICAST_IF)
3062		strlcpy(sc->vxl_mc_ifname, vxlp->vxlp_mc_ifname, IFNAMSIZ);
3063
3064	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_FTABLE_TIMEOUT) {
3065		if (vxlan_check_ftable_timeout(vxlp->vxlp_ftable_timeout) == 0)
3066			sc->vxl_ftable_timeout = vxlp->vxlp_ftable_timeout;
3067	}
3068
3069	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_FTABLE_MAX) {
3070		if (vxlan_check_ftable_max(vxlp->vxlp_ftable_max) == 0)
3071			sc->vxl_ftable_max = vxlp->vxlp_ftable_max;
3072	}
3073
3074	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_TTL) {
3075		if (vxlan_check_ttl(vxlp->vxlp_ttl) == 0)
3076			sc->vxl_ttl = vxlp->vxlp_ttl;
3077	}
3078
3079	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LEARN) {
3080		if (vxlp->vxlp_learn == 0)
3081			sc->vxl_flags &= ~VXLAN_FLAG_LEARN;
3082	}
3083
3084	return (0);
3085}
3086
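/*
 * Validate capability changes requested through SIOCSIFCAP: TSO cannot be
 * enabled without the matching transmit checksum capability, and disabling
 * transmit checksums also turns the corresponding TSO capability off.
 */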
3087static int
3088vxlan_set_reqcap(struct vxlan_softc *sc, struct ifnet *ifp, int reqcap)
3089{
3090	int mask = reqcap ^ ifp->if_capenable;
3091
3092	/* Disable TSO if tx checksums are disabled. */
3093	if (mask & IFCAP_TXCSUM && !(reqcap & IFCAP_TXCSUM) &&
3094	    reqcap & IFCAP_TSO4) {
3095		reqcap &= ~IFCAP_TSO4;
3096		if_printf(ifp, "tso4 disabled due to -txcsum.\n");
3097	}
3098	if (mask & IFCAP_TXCSUM_IPV6 && !(reqcap & IFCAP_TXCSUM_IPV6) &&
3099	    reqcap & IFCAP_TSO6) {
3100		reqcap &= ~IFCAP_TSO6;
3101		if_printf(ifp, "tso6 disabled due to -txcsum6.\n");
3102	}
3103
3104	/* Do not enable TSO if tx checksums are disabled. */
3105	if (mask & IFCAP_TSO4 && reqcap & IFCAP_TSO4 &&
3106	    !(reqcap & IFCAP_TXCSUM)) {
3107		if_printf(ifp, "enable txcsum first.\n");
3108		return (EAGAIN);
3109	}
3110	if (mask & IFCAP_TSO6 && reqcap & IFCAP_TSO6 &&
3111	    !(reqcap & IFCAP_TXCSUM_IPV6)) {
3112		if_printf(ifp, "enable txcsum6 first.\n");
3113		return (EAGAIN);
3114	}
3115
3116	sc->vxl_reqcap = reqcap;
3117	return (0);
3118}
3119
3120/*
3121 * A VXLAN interface inherits checksum and TSO capabilities from the vxlandev
3122 * interface or, failing that, from the interface hosting the vxlanlocal address.
3123 */
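/*
 * For example (hypothetical unit, addresses, and parent interface), a
 * configuration along the lines of
 *   ifconfig vxlan0 create vxlanid 42 vxlanlocal 192.0.2.1 vxlandev em0
 * allows vxlan0 to pick up em0's inner-frame checksum and TSO support when
 * em0 advertises IFCAP_VXLAN_HWCSUM / IFCAP_VXLAN_HWTSO.
 */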
3124static void
3125vxlan_set_hwcaps(struct vxlan_softc *sc)
3126{
3127	struct epoch_tracker et;
3128	struct ifnet *p;
3129	struct ifaddr *ifa;
3130	u_long hwa;
3131	int cap, ena;
3132	bool rel;
3133	struct ifnet *ifp = sc->vxl_ifp;
3134
3135	/* reset caps */
3136	ifp->if_capabilities &= VXLAN_BASIC_IFCAPS;
3137	ifp->if_capenable &= VXLAN_BASIC_IFCAPS;
3138	ifp->if_hwassist = 0;
3139
3140	NET_EPOCH_ENTER(et);
3141	CURVNET_SET(ifp->if_vnet);
3142
3143	rel = false;
3144	p = NULL;
3145	if (sc->vxl_mc_ifname[0] != '\0') {
3146		rel = true;
3147		p = ifunit_ref(sc->vxl_mc_ifname);
3148	} else if (vxlan_sockaddr_in_any(&sc->vxl_src_addr) == 0) {
3149		if (sc->vxl_src_addr.sa.sa_family == AF_INET) {
3150			struct sockaddr_in in4 = sc->vxl_src_addr.in4;
3151
3152			in4.sin_port = 0;
3153			ifa = ifa_ifwithaddr((struct sockaddr *)&in4);
3154			if (ifa != NULL)
3155				p = ifa->ifa_ifp;
3156		} else if (sc->vxl_src_addr.sa.sa_family == AF_INET6) {
3157			struct sockaddr_in6 in6 = sc->vxl_src_addr.in6;
3158
3159			in6.sin6_port = 0;
3160			ifa = ifa_ifwithaddr((struct sockaddr *)&in6);
3161			if (ifa != NULL)
3162				p = ifa->ifa_ifp;
3163		}
3164	}
3165	if (p == NULL)
3166		goto done;
3167
3168	cap = ena = hwa = 0;
3169
3170	/* checksum offload */
3171	if (p->if_capabilities & IFCAP_VXLAN_HWCSUM)
3172		cap |= p->if_capabilities & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
3173	if (p->if_capenable & IFCAP_VXLAN_HWCSUM) {
3174		ena |= sc->vxl_reqcap & p->if_capenable &
3175		    (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
3176		if (ena & IFCAP_TXCSUM) {
3177			if (p->if_hwassist & CSUM_INNER_IP)
3178				hwa |= CSUM_IP;
3179			if (p->if_hwassist & CSUM_INNER_IP_UDP)
3180				hwa |= CSUM_IP_UDP;
3181			if (p->if_hwassist & CSUM_INNER_IP_TCP)
3182				hwa |= CSUM_IP_TCP;
3183		}
3184		if (ena & IFCAP_TXCSUM_IPV6) {
3185			if (p->if_hwassist & CSUM_INNER_IP6_UDP)
3186				hwa |= CSUM_IP6_UDP;
3187			if (p->if_hwassist & CSUM_INNER_IP6_TCP)
3188				hwa |= CSUM_IP6_TCP;
3189		}
3190	}
3191
3192	/* hardware TSO */
3193	if (p->if_capabilities & IFCAP_VXLAN_HWTSO) {
3194		cap |= p->if_capabilities & IFCAP_TSO;
3195		if (p->if_hw_tsomax > IP_MAXPACKET - ifp->if_hdrlen)
3196			ifp->if_hw_tsomax = IP_MAXPACKET - ifp->if_hdrlen;
3197		else
3198			ifp->if_hw_tsomax = p->if_hw_tsomax;
3199		/* XXX: tsomaxsegcount decrement is cxgbe specific  */
3200		ifp->if_hw_tsomaxsegcount = p->if_hw_tsomaxsegcount - 1;
3201		ifp->if_hw_tsomaxsegsize = p->if_hw_tsomaxsegsize;
3202	}
3203	if (p->if_capenable & IFCAP_VXLAN_HWTSO) {
3204		ena |= sc->vxl_reqcap & p->if_capenable & IFCAP_TSO;
3205		if (ena & IFCAP_TSO) {
3206			if (p->if_hwassist & CSUM_INNER_IP_TSO)
3207				hwa |= CSUM_IP_TSO;
3208			if (p->if_hwassist & CSUM_INNER_IP6_TSO)
3209				hwa |= CSUM_IP6_TSO;
3210		}
3211	}
3212
3213	ifp->if_capabilities |= cap;
3214	ifp->if_capenable |= ena;
3215	ifp->if_hwassist |= hwa;
3216	if (rel)
3217		if_rele(p);
3218done:
3219	CURVNET_RESTORE();
3220	NET_EPOCH_EXIT(et);
3221}
3222
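/*
 * Cloner create method: allocate and initialize the softc, apply defaults
 * and any user parameters supplied via the create request, then attach an
 * Ethernet ifnet with a generated link-layer address and basic ifcaps.
 */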
3223static int
3224vxlan_clone_create(struct if_clone *ifc, char *name, size_t len,
3225    struct ifc_data *ifd, struct ifnet **ifpp)
3226{
3227	struct vxlan_softc *sc;
3228	struct ifnet *ifp;
3229	struct ifvxlanparam vxlp;
3230	int error;
3231
3232	sc = malloc(sizeof(struct vxlan_softc), M_VXLAN, M_WAITOK | M_ZERO);
3233	sc->vxl_unit = ifd->unit;
3234	sc->vxl_fibnum = curthread->td_proc->p_fibnum;
3235	vxlan_set_default_config(sc);
3236	error = vxlan_stats_alloc(sc);
3237	if (error != 0)
3238		goto fail;
3239
3240	if (ifd->params != NULL) {
3241		error = ifc_copyin(ifd, &vxlp, sizeof(vxlp));
3242		if (error)
3243			goto fail;
3244
3245		error = vxlan_set_user_config(sc, &vxlp);
3246		if (error)
3247			goto fail;
3248	}
3249
3250	ifp = if_alloc(IFT_ETHER);
3251	if (ifp == NULL) {
3252		error = ENOSPC;
3253		goto fail;
3254	}
3255
3256	sc->vxl_ifp = ifp;
3257	rm_init(&sc->vxl_lock, "vxlanrm");
3258	callout_init_rw(&sc->vxl_callout, &sc->vxl_lock, 0);
3259	sc->vxl_port_hash_key = arc4random();
3260	vxlan_ftable_init(sc);
3261
3262	vxlan_sysctl_setup(sc);
3263
3264	ifp->if_softc = sc;
3265	if_initname(ifp, vxlan_name, ifd->unit);
3266	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
3267	ifp->if_init = vxlan_init;
3268	ifp->if_ioctl = vxlan_ioctl;
3269	ifp->if_transmit = vxlan_transmit;
3270	ifp->if_qflush = vxlan_qflush;
3271	ifp->if_capabilities = VXLAN_BASIC_IFCAPS;
3272	ifp->if_capenable = VXLAN_BASIC_IFCAPS;
3273	sc->vxl_reqcap = -1;
3274	vxlan_set_hwcaps(sc);
3275
3276	ifmedia_init(&sc->vxl_media, 0, vxlan_media_change, vxlan_media_status);
3277	ifmedia_add(&sc->vxl_media, IFM_ETHER | IFM_AUTO, 0, NULL);
3278	ifmedia_set(&sc->vxl_media, IFM_ETHER | IFM_AUTO);
3279
3280	ether_gen_addr(ifp, &sc->vxl_hwaddr);
3281	ether_ifattach(ifp, sc->vxl_hwaddr.octet);
3282
3283	ifp->if_baudrate = 0;
3284
3285	VXLAN_WLOCK(sc);
3286	vxlan_setup_interface_hdrlen(sc);
3287	VXLAN_WUNLOCK(sc);
3288	*ifpp = ifp;
3289
3290	return (0);
3291
3292fail:
3293	free(sc, M_VXLAN);
3294	return (error);
3295}
3296
3297static int
3298vxlan_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags)
3299{
3300	struct vxlan_softc *sc;
3301
3302	sc = ifp->if_softc;
3303
3304	vxlan_teardown(sc);
3305
3306	vxlan_ftable_flush(sc, 1);
3307
3308	ether_ifdetach(ifp);
3309	if_free(ifp);
3310	ifmedia_removeall(&sc->vxl_media);
3311
3312	vxlan_ftable_fini(sc);
3313
3314	vxlan_sysctl_destroy(sc);
3315	rm_destroy(&sc->vxl_lock);
3316	vxlan_stats_free(sc);
3317	free(sc, M_VXLAN);
3318
3319	return (0);
3320}
3321
3322/* BMV: Taken from if_bridge. */
3323static uint32_t
3324vxlan_mac_hash(struct vxlan_softc *sc, const uint8_t *addr)
3325{
3326	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = sc->vxl_ftable_hash_key;
3327
3328	b += addr[5] << 8;
3329	b += addr[4];
3330	a += addr[3] << 24;
3331	a += addr[2] << 16;
3332	a += addr[1] << 8;
3333	a += addr[0];
3334
3335/*
3336 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
3337 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
3338 */
3339#define	mix(a, b, c)							\
3340do {									\
3341	a -= b; a -= c; a ^= (c >> 13);					\
3342	b -= c; b -= a; b ^= (a << 8);					\
3343	c -= a; c -= b; c ^= (b >> 13);					\
3344	a -= b; a -= c; a ^= (c >> 12);					\
3345	b -= c; b -= a; b ^= (a << 16);					\
3346	c -= a; c -= b; c ^= (b >> 5);					\
3347	a -= b; a -= c; a ^= (c >> 3);					\
3348	b -= c; b -= a; b ^= (a << 10);					\
3349	c -= a; c -= b; c ^= (b >> 15);					\
3350} while (0)
3351
3352	mix(a, b, c);
3353
3354#undef mix
3355
3356	return (c);
3357}
3358
3359static int
3360vxlan_media_change(struct ifnet *ifp)
3361{
3362
3363	/* Ignore. */
3364	return (0);
3365}
3366
3367static void
3368vxlan_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3369{
3370
3371	ifmr->ifm_status = IFM_ACTIVE | IFM_AVALID;
3372	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
3373}
3374
3375static int
3376vxlan_sockaddr_cmp(const union vxlan_sockaddr *vxladdr,
3377    const struct sockaddr *sa)
3378{
3379
3380	return (bcmp(&vxladdr->sa, sa, vxladdr->sa.sa_len));
3381}
3382
3383static void
3384vxlan_sockaddr_copy(union vxlan_sockaddr *vxladdr,
3385    const struct sockaddr *sa)
3386{
3387
3388	MPASS(sa->sa_family == AF_INET || sa->sa_family == AF_INET6);
3389	bzero(vxladdr, sizeof(*vxladdr));
3390
3391	if (sa->sa_family == AF_INET) {
3392		vxladdr->in4 = *satoconstsin(sa);
3393		vxladdr->in4.sin_len = sizeof(struct sockaddr_in);
3394	} else if (sa->sa_family == AF_INET6) {
3395		vxladdr->in6 = *satoconstsin6(sa);
3396		vxladdr->in6.sin6_len = sizeof(struct sockaddr_in6);
3397	}
3398}
3399
3400static int
3401vxlan_sockaddr_in_equal(const union vxlan_sockaddr *vxladdr,
3402    const struct sockaddr *sa)
3403{
3404	int equal;
3405
3406	if (sa->sa_family == AF_INET) {
3407		const struct in_addr *in4 = &satoconstsin(sa)->sin_addr;
3408		equal = in4->s_addr == vxladdr->in4.sin_addr.s_addr;
3409	} else if (sa->sa_family == AF_INET6) {
3410		const struct in6_addr *in6 = &satoconstsin6(sa)->sin6_addr;
3411		equal = IN6_ARE_ADDR_EQUAL(in6, &vxladdr->in6.sin6_addr);
3412	} else
3413		equal = 0;
3414
3415	return (equal);
3416}
3417
3418static void
3419vxlan_sockaddr_in_copy(union vxlan_sockaddr *vxladdr,
3420    const struct sockaddr *sa)
3421{
3422
3423	MPASS(sa->sa_family == AF_INET || sa->sa_family == AF_INET6);
3424
3425	if (sa->sa_family == AF_INET) {
3426		const struct in_addr *in4 = &satoconstsin(sa)->sin_addr;
3427		vxladdr->in4.sin_family = AF_INET;
3428		vxladdr->in4.sin_len = sizeof(struct sockaddr_in);
3429		vxladdr->in4.sin_addr = *in4;
3430	} else if (sa->sa_family == AF_INET6) {
3431		const struct in6_addr *in6 = &satoconstsin6(sa)->sin6_addr;
3432		vxladdr->in6.sin6_family = AF_INET6;
3433		vxladdr->in6.sin6_len = sizeof(struct sockaddr_in6);
3434		vxladdr->in6.sin6_addr = *in6;
3435	}
3436}
3437
3438static int
3439vxlan_sockaddr_supported(const union vxlan_sockaddr *vxladdr, int unspec)
3440{
3441	const struct sockaddr *sa;
3442	int supported;
3443
3444	sa = &vxladdr->sa;
3445	supported = 0;
3446
3447	if (sa->sa_family == AF_UNSPEC && unspec != 0) {
3448		supported = 1;
3449	} else if (sa->sa_family == AF_INET) {
3450#ifdef INET
3451		supported = 1;
3452#endif
3453	} else if (sa->sa_family == AF_INET6) {
3454#ifdef INET6
3455		supported = 1;
3456#endif
3457	}
3458
3459	return (supported);
3460}
3461
3462static int
3463vxlan_sockaddr_in_any(const union vxlan_sockaddr *vxladdr)
3464{
3465	const struct sockaddr *sa;
3466	int any;
3467
3468	sa = &vxladdr->sa;
3469
3470	if (sa->sa_family == AF_INET) {
3471		const struct in_addr *in4 = &satoconstsin(sa)->sin_addr;
3472		any = in4->s_addr == INADDR_ANY;
3473	} else if (sa->sa_family == AF_INET6) {
3474		const struct in6_addr *in6 = &satoconstsin6(sa)->sin6_addr;
3475		any = IN6_IS_ADDR_UNSPECIFIED(in6);
3476	} else
3477		any = -1;
3478
3479	return (any);
3480}
3481
3482static int
3483vxlan_sockaddr_in_multicast(const union vxlan_sockaddr *vxladdr)
3484{
3485	const struct sockaddr *sa;
3486	int mc;
3487
3488	sa = &vxladdr->sa;
3489
3490	if (sa->sa_family == AF_INET) {
3491		const struct in_addr *in4 = &satoconstsin(sa)->sin_addr;
3492		mc = IN_MULTICAST(ntohl(in4->s_addr));
3493	} else if (sa->sa_family == AF_INET6) {
3494		const struct in6_addr *in6 = &satoconstsin6(sa)->sin6_addr;
3495		mc = IN6_IS_ADDR_MULTICAST(in6);
3496	} else
3497		mc = -1;
3498
3499	return (mc);
3500}
3501
3502static int
3503vxlan_sockaddr_in6_embedscope(union vxlan_sockaddr *vxladdr)
3504{
3505	int error;
3506
3507	MPASS(VXLAN_SOCKADDR_IS_IPV6(vxladdr));
3508#ifdef INET6
3509	error = sa6_embedscope(&vxladdr->in6, V_ip6_use_defzone);
3510#else
3511	error = EAFNOSUPPORT;
3512#endif
3513
3514	return (error);
3515}
3516
3517static int
3518vxlan_can_change_config(struct vxlan_softc *sc)
3519{
3520	struct ifnet *ifp;
3521
3522	ifp = sc->vxl_ifp;
3523	VXLAN_LOCK_ASSERT(sc);
3524
3525	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3526		return (0);
3527	if (sc->vxl_flags & (VXLAN_FLAG_INIT | VXLAN_FLAG_TEARDOWN))
3528		return (0);
3529
3530	return (1);
3531}
3532
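/*
 * The vxlan_check_* helpers return zero when the value is acceptable and
 * nonzero when it is out of range.
 */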
3533static int
3534vxlan_check_vni(uint32_t vni)
3535{
3536
3537	return (vni >= VXLAN_VNI_MAX);
3538}
3539
3540static int
3541vxlan_check_ttl(int ttl)
3542{
3543
3544	return (ttl > MAXTTL);
3545}
3546
3547static int
3548vxlan_check_ftable_timeout(uint32_t timeout)
3549{
3550
3551	return (timeout > VXLAN_FTABLE_MAX_TIMEOUT);
3552}
3553
3554static int
3555vxlan_check_ftable_max(uint32_t max)
3556{
3557
3558	return (max > VXLAN_FTABLE_MAX);
3559}
3560
3561static void
3562vxlan_sysctl_setup(struct vxlan_softc *sc)
3563{
3564	struct sysctl_ctx_list *ctx;
3565	struct sysctl_oid *node;
3566	struct vxlan_statistics *stats;
3567	char namebuf[8];
3568
3569	ctx = &sc->vxl_sysctl_ctx;
3570	stats = &sc->vxl_stats;
3571	snprintf(namebuf, sizeof(namebuf), "%d", sc->vxl_unit);
3572
3573	sysctl_ctx_init(ctx);
3574	sc->vxl_sysctl_node = SYSCTL_ADD_NODE(ctx,
3575	    SYSCTL_STATIC_CHILDREN(_net_link_vxlan), OID_AUTO, namebuf,
3576	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
3577
3578	node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->vxl_sysctl_node),
3579	    OID_AUTO, "ftable", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
3580	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "count",
3581	    CTLFLAG_RD, &sc->vxl_ftable_cnt, 0,
3582	    "Number of entries in forwarding table");
3583	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "max",
3584	     CTLFLAG_RD, &sc->vxl_ftable_max, 0,
3585	    "Maximum number of entries allowed in forwarding table");
3586	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "timeout",
3587	    CTLFLAG_RD, &sc->vxl_ftable_timeout, 0,
3588	    "Number of seconds between prunes of the forwarding table");
3589	SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "dump",
3590	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
3591	    sc, 0, vxlan_ftable_sysctl_dump, "A",
3592	    "Dump the forwarding table entries");
3593
3594	node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->vxl_sysctl_node),
3595	    OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
3596	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO,
3597	    "ftable_nospace", CTLFLAG_RD, &stats->ftable_nospace, 0,
3598	    "Forwarding table reached maximum entries");
3599	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO,
3600	    "ftable_lock_upgrade_failed", CTLFLAG_RD,
3601	    &stats->ftable_lock_upgrade_failed, 0,
3602	    "Forwarding table update required lock upgrade");
3603
3604	SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "txcsum",
3605	    CTLFLAG_RD, &stats->txcsum,
3606	    "# of times hardware assisted with tx checksum");
3607	SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "tso",
3608	    CTLFLAG_RD, &stats->tso, "# of times hardware assisted with TSO");
3609	SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "rxcsum",
3610	    CTLFLAG_RD, &stats->rxcsum,
3611	    "# of times hardware assisted with rx checksum");
3612}
3613
3614static void
3615vxlan_sysctl_destroy(struct vxlan_softc *sc)
3616{
3617
3618	sysctl_ctx_free(&sc->vxl_sysctl_ctx);
3619	sc->vxl_sysctl_node = NULL;
3620}
3621
3622static int
3623vxlan_tunable_int(struct vxlan_softc *sc, const char *knob, int def)
3624{
3625	char path[64];
3626
3627	snprintf(path, sizeof(path), "net.link.vxlan.%d.%s",
3628	    sc->vxl_unit, knob);
3629	TUNABLE_INT_FETCH(path, &def);
3630
3631	return (def);
3632}
3633
3634static void
3635vxlan_ifdetach_event(void *arg __unused, struct ifnet *ifp)
3636{
3637	struct vxlan_softc_head list;
3638	struct vxlan_socket *vso;
3639	struct vxlan_softc *sc, *tsc;
3640
3641	LIST_INIT(&list);
3642
3643	if (ifp->if_flags & IFF_RENAMING)
3644		return;
3645	if ((ifp->if_flags & IFF_MULTICAST) == 0)
3646		return;
3647
3648	VXLAN_LIST_LOCK();
3649	LIST_FOREACH(vso, &vxlan_socket_list, vxlso_entry)
3650		vxlan_socket_ifdetach(vso, ifp, &list);
3651	VXLAN_LIST_UNLOCK();
3652
3653	LIST_FOREACH_SAFE(sc, &list, vxl_ifdetach_list, tsc) {
3654		LIST_REMOVE(sc, vxl_ifdetach_list);
3655
3656		sx_xlock(&vxlan_sx);
3657		VXLAN_WLOCK(sc);
3658		if (sc->vxl_flags & VXLAN_FLAG_INIT)
3659			vxlan_init_wait(sc);
3660		vxlan_teardown_locked(sc);
3661		sx_xunlock(&vxlan_sx);
3662	}
3663}
3664
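/*
 * ifnet departure handler: when an interface in use as a vxlan multicast
 * interface disappears (and is not merely being renamed), collect the
 * affected softcs from every vxlan socket and tear them down.
 */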
3665static void
3666vxlan_load(void)
3667{
3668
3669	mtx_init(&vxlan_list_mtx, "vxlan list", NULL, MTX_DEF);
3670	LIST_INIT(&vxlan_socket_list);
3671	vxlan_ifdetach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event,
3672	    vxlan_ifdetach_event, NULL, EVENTHANDLER_PRI_ANY);
3673
3674	struct if_clone_addreq req = {
3675		.create_f = vxlan_clone_create,
3676		.destroy_f = vxlan_clone_destroy,
3677		.flags = IFC_F_AUTOUNIT,
3678	};
3679	vxlan_cloner = ifc_attach_cloner(vxlan_name, &req);
3680}
3681
3682static void
3683vxlan_unload(void)
3684{
3685
3686	EVENTHANDLER_DEREGISTER(ifnet_departure_event,
3687	    vxlan_ifdetach_event_tag);
3688	ifc_detach_cloner(vxlan_cloner);
3689	mtx_destroy(&vxlan_list_mtx);
3690	MPASS(LIST_EMPTY(&vxlan_socket_list));
3691}
3692
3693static int
3694vxlan_modevent(module_t mod, int type, void *unused)
3695{
3696	int error;
3697
3698	error = 0;
3699
3700	switch (type) {
3701	case MOD_LOAD:
3702		vxlan_load();
3703		break;
3704	case MOD_UNLOAD:
3705		vxlan_unload();
3706		break;
3707	default:
3708		error = ENOTSUP;
3709		break;
3710	}
3711
3712	return (error);
3713}
3714
3715static moduledata_t vxlan_mod = {
3716	"if_vxlan",
3717	vxlan_modevent,
3718	0
3719};
3720
3721DECLARE_MODULE(if_vxlan, vxlan_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
3722MODULE_VERSION(if_vxlan, 1);
3723