1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1988, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the University nor the names of its contributors
16 *    may be used to endorse or promote products derived from this software
17 *    without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32/*
33 * Ethernet address resolution protocol.
34 * TODO:
35 *	add "inuse/lock" bit (or ref. count) along with valid bit
36 */
37
38#include <sys/cdefs.h>
39#include "opt_inet.h"
40
41#include <sys/param.h>
42#include <sys/eventhandler.h>
43#include <sys/kernel.h>
44#include <sys/lock.h>
45#include <sys/queue.h>
46#include <sys/sysctl.h>
47#include <sys/systm.h>
48#include <sys/mbuf.h>
49#include <sys/malloc.h>
50#include <sys/proc.h>
51#include <sys/socket.h>
52#include <sys/syslog.h>
53
54#include <net/if.h>
55#include <net/if_var.h>
56#include <net/if_dl.h>
57#include <net/if_private.h>
58#include <net/if_types.h>
59#include <net/netisr.h>
60#include <net/ethernet.h>
61#include <net/route.h>
62#include <net/route/nhop.h>
63#include <net/vnet.h>
64
65#include <netinet/in.h>
66#include <netinet/in_fib.h>
67#include <netinet/in_var.h>
68#include <net/if_llatbl.h>
69#include <netinet/if_ether.h>
70#ifdef INET
71#include <netinet/ip_carp.h>
72#endif
73
74#include <security/mac/mac_framework.h>
75
/* View a generic sockaddr pointer as a read-only IPv4 sockaddr. */
#define SIN(s) ((const struct sockaddr_in *)(s))

/*
 * Rate-limiting state for ARP_LOG(): arp_lastlog and arp_curpps are the
 * bookkeeping arguments handed to ppsratecheck(), arp_maxpps is the
 * per-second limit (exported as the max_log_per_second sysctl).
 */
static struct timeval arp_lastlog;
static int arp_curpps;
static int arp_maxpps = 1;

/*
 * Simple ARP state machine.
 * The current state is kept in the ln_state field of an llentry and is
 * what arptimer() switches on when the entry's timer fires.
 */
enum arp_llinfo_state {
	ARP_LLINFO_INCOMPLETE = 0, /* No LLE data */
	ARP_LLINFO_REACHABLE,	/* LLE is valid */
	ARP_LLINFO_VERIFY,	/* LLE is valid, need refresh */
	ARP_LLINFO_DELETED,	/* LLE is deleted */
};
89
/*
 * sysctl tree: net.link.ether.inet and net.link.ether.arp.  Both nodes are
 * created with an empty description string.
 */
SYSCTL_DECL(_net_link_ether);
static SYSCTL_NODE(_net_link_ether, PF_INET, inet,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "");
static SYSCTL_NODE(_net_link_ether, PF_ARP, arp,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "");

/* timer values */
VNET_DEFINE_STATIC(int, arpt_keep) = (20*60);	/* once resolved, good for 20
						 * minutes */
/* Requests sent before arpresolve_full() reports EHOSTDOWN/EHOSTUNREACH. */
VNET_DEFINE_STATIC(int, arp_maxtries) = 5;
/* Non-zero enables proxy ARP for all suitable requests (see sysctl below). */
VNET_DEFINE_STATIC(int, arp_proxyall) = 0;
VNET_DEFINE_STATIC(int, arpt_down) = 20;	/* keep incomplete entries for
						 * 20 seconds */
VNET_DEFINE_STATIC(int, arpt_rexmit) = 1;	/* retransmit arp entries, sec*/
VNET_PCPUSTAT_DEFINE(struct arpstat, arpstat);  /* ARP statistics, see if_arp.h */
VNET_PCPUSTAT_SYSINIT(arpstat);

#ifdef VIMAGE
VNET_PCPUSTAT_SYSUNINIT(arpstat);
#endif /* VIMAGE */

/* Limit passed to lltable_append_entry_queue() for packets held per entry. */
VNET_DEFINE_STATIC(int, arp_maxhold) = 16;

/* Per-vnet accessors for the variables defined above. */
#define	V_arpt_keep		VNET(arpt_keep)
#define	V_arpt_down		VNET(arpt_down)
#define	V_arpt_rexmit		VNET(arpt_rexmit)
#define	V_arp_maxtries		VNET(arp_maxtries)
#define	V_arp_proxyall		VNET(arp_proxyall)
#define	V_arp_maxhold		VNET(arp_maxhold)

/*
 * Read/write sysctl exports.  All but max_log_per_second are per-vnet
 * (CTLFLAG_VNET); max_log_per_second points at the global arp_maxpps.
 */
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_VNET | CTLFLAG_RW,
	&VNET_NAME(arpt_keep), 0,
	"ARP entry lifetime in seconds");
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_VNET | CTLFLAG_RW,
	&VNET_NAME(arp_maxtries), 0,
	"ARP resolution attempts before returning error");
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_VNET | CTLFLAG_RW,
	&VNET_NAME(arp_proxyall), 0,
	"Enable proxy ARP for all suitable requests");
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, wait, CTLFLAG_VNET | CTLFLAG_RW,
	&VNET_NAME(arpt_down), 0,
	"Incomplete ARP entry lifetime in seconds");
SYSCTL_VNET_PCPUSTAT(_net_link_ether_arp, OID_AUTO, stats, struct arpstat,
    arpstat, "ARP statistics (struct arpstat, net/if_arp.h)");
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_VNET | CTLFLAG_RW,
	&VNET_NAME(arp_maxhold), 0,
	"Number of packets to hold per ARP entry");
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_log_per_second,
	CTLFLAG_RW, &arp_maxpps, 0,
	"Maximum number of remotely triggered ARP messages that can be "
	"logged per second");
143
144/*
145 * Due to the exponential backoff algorithm used for the interval between GARP
146 * retransmissions, the maximum number of retransmissions is limited for
147 * sanity. This limit corresponds to a maximum interval between retransmissions
148 * of 2^16 seconds ~= 18 hours.
149 *
150 * Making this limit more dynamic is more complicated than worthwhile,
151 * especially since sending out GARPs spaced days apart would be of little
152 * use. A maximum dynamic limit would look something like:
153 *
154 * const int max = fls(INT_MAX / hz) - 1;
155 */
/*
 * Upper bound for garp_rexmit_count.
 * NOTE(review): presumably enforced by sysctl_garp_rexmit(), whose body is
 * outside this part of the file -- confirm there.
 */
#define MAX_GARP_RETRANSMITS 16
static int sysctl_garp_rexmit(SYSCTL_HANDLER_ARGS);
static int garp_rexmit_count = 0; /* GARP retransmission setting. */

/* Writes go through the sysctl_garp_rexmit() handler, not straight to the int. */
SYSCTL_PROC(_net_link_ether_inet, OID_AUTO, garp_rexmit_count,
    CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE,
    &garp_rexmit_count, 0, sysctl_garp_rexmit, "I",
    "Number of times to retransmit GARP packets;"
    " 0 to disable, maximum of 16");

VNET_DEFINE_STATIC(int, arp_log_level) = LOG_INFO;	/* Min. log(9) level. */
#define	V_arp_log_level		VNET(arp_log_level)
SYSCTL_INT(_net_link_ether_arp, OID_AUTO, log_level, CTLFLAG_VNET | CTLFLAG_RW,
	&VNET_NAME(arp_log_level), 0,
	"Minimum log(9) level for recording rate limited arp log messages. "
	"The higher will be log more (emerg=0, info=6 (default), debug=7).");
/*
 * Rate-limited logging helper.
 *
 * The message is emitted only when @pri is numerically at or below the
 * per-vnet arp_log_level and ppsratecheck() allows another message within
 * the arp_maxpps budget.  The first variadic argument must be a string
 * literal: it is pasted directly after the "arp: " prefix.
 */
#define	ARP_LOG(pri, ...)	do {					\
	if ((pri) <= V_arp_log_level &&					\
	    ppsratecheck(&arp_lastlog, &arp_curpps, arp_maxpps))	\
		log((pri), "arp: " __VA_ARGS__);			\
} while (0)
177
/* Forward declarations for the file-local input and timer routines. */
static void	arpintr(struct mbuf *);
static void	arptimer(void *);
#ifdef INET
static void	in_arpinput(struct mbuf *);
#endif

static void arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr,
    struct ifnet *ifp, int bridged, struct llentry *la);
static void arp_mark_lle_reachable(struct llentry *la, struct ifnet *ifp);
static void arp_iflladdr(void *arg __unused, struct ifnet *ifp);

/*
 * Event handler registration tag.
 * NOTE(review): presumably the tag for the arp_iflladdr() handler; the
 * registration itself is outside this part of the file -- confirm.
 */
static eventhandler_tag iflladdr_tag;

/* netisr descriptor naming arpintr() as the handler for NETISR_ARP. */
static const struct netisr_handler arp_nh = {
	.nh_name = "arp",
	.nh_handler = arpintr,
	.nh_proto = NETISR_ARP,
	.nh_policy = NETISR_POLICY_SOURCE,
};
197
/*
 * Timeout routine.  Age arp_tab entries periodically.
 *
 * Runs as the callout handler armed on lle->lle_timer; @arg is the
 * struct llentry the callout belongs to.  Depending on lle->ln_state the
 * entry is moved from REACHABLE to VERIFY and re-armed, refreshed with a
 * new ARP request, kept for another interval, or unlinked from its table
 * and freed.
 */
static void
arptimer(void *arg)
{
	struct llentry *lle = (struct llentry *)arg;
	struct ifnet *ifp;

	/* Static entries never age.  Note this check is made unlocked. */
	if (lle->la_flags & LLE_STATIC) {
		return;
	}
	LLE_WLOCK(lle);
	if (callout_pending(&lle->lle_timer)) {
		/*
		 * The treatment of active/pending is a bit odd here.
		 * If the pending bit is set, the callout got
		 * rescheduled before this invocation ran.  The active
		 * bit is ignored: if the callout was stopped in
		 * ll_tablefree() while it was already running, the
		 * stop would have returned 0 and the entry would not
		 * have been deleted there (the callout could not be
		 * stopped), so the delete has to go through here.
		 * If the callout was restarted instead, the pending
		 * bit is back on and we just bail, since
		 * callout_reset() would have returned 1 and our
		 * reference would already have been dropped by
		 * arpresolve_full().
		 */
		LLE_WUNLOCK(lle);
 		return;
 	}
	ifp = lle->lle_tbl->llt_ifp;
	CURVNET_SET(ifp->if_vnet);

	switch (lle->ln_state) {
	case ARP_LLINFO_REACHABLE:

		/*
		 * Expiration time is approaching.
		 * Request usage feedback from the datapath.
		 * Change state and re-schedule ourselves.
		 */
		llentry_request_feedback(lle);
		lle->ln_state = ARP_LLINFO_VERIFY;
		callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
		LLE_WUNLOCK(lle);
		CURVNET_RESTORE();
		return;
	case ARP_LLINFO_VERIFY:
		if (llentry_get_hittime(lle) > 0 && lle->la_preempt > 0) {
			/* Entry was used, issue refresh request */
			struct epoch_tracker et;
			struct in_addr dst;

			/*
			 * Copy the address out while the entry is still
			 * locked; the request is sent after unlocking.
			 */
			dst = lle->r_l3addr.addr4;
			lle->la_preempt--;
			callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
			LLE_WUNLOCK(lle);
			/* arprequest_internal() asserts the net epoch. */
			NET_EPOCH_ENTER(et);
			arprequest(ifp, NULL, &dst, NULL);
			NET_EPOCH_EXIT(et);
			CURVNET_RESTORE();
			return;
		}
		/* Nothing happened. Reschedule if not too late */
		if (lle->la_expire > time_uptime) {
			callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
			LLE_WUNLOCK(lle);
			CURVNET_RESTORE();
			return;
		}
		break;
	case ARP_LLINFO_INCOMPLETE:
	case ARP_LLINFO_DELETED:
		break;
	}

	/*
	 * The entry is going away.  Unless it was already marked deleted,
	 * tell lle_event listeners whether a valid entry expired or an
	 * unresolved one timed out.
	 */
	if ((lle->la_flags & LLE_DELETED) == 0) {
		int evt;

		if (lle->la_flags & LLE_VALID)
			evt = LLENTRY_EXPIRED;
		else
			evt = LLENTRY_TIMEDOUT;
		EVENTHANDLER_INVOKE(lle_event, lle, evt);
	}

	callout_stop(&lle->lle_timer);

	/* XXX: LOR avoidance. We still have ref on lle. */
	LLE_WUNLOCK(lle);
	IF_AFDATA_LOCK(ifp);
	LLE_WLOCK(lle);

	/* Guard against race with other llentry_free(). */
	if (lle->la_flags & LLE_LINKED) {
		LLE_REMREF(lle);
		lltable_unlink_entry(lle->lle_tbl, lle);
	}
	IF_AFDATA_UNLOCK(ifp);

	/* The value returned by llentry_free() is accounted as dropped packets. */
	size_t pkts_dropped = llentry_free(lle);

	ARPSTAT_ADD(dropped, pkts_dropped);
	ARPSTAT_INC(timeouts);

	CURVNET_RESTORE();
}
307
308/*
309 * Stores link-layer header for @ifp in format suitable for if_output()
310 * into buffer @buf. Resulting header length is stored in @bufsize.
311 *
312 * Returns 0 on success.
313 */
314static int
315arp_fillheader(struct ifnet *ifp, struct arphdr *ah, int bcast, u_char *buf,
316    size_t *bufsize)
317{
318	struct if_encap_req ereq;
319	int error;
320
321	bzero(buf, *bufsize);
322	bzero(&ereq, sizeof(ereq));
323	ereq.buf = buf;
324	ereq.bufsize = *bufsize;
325	ereq.rtype = IFENCAP_LL;
326	ereq.family = AF_ARP;
327	ereq.lladdr = ar_tha(ah);
328	ereq.hdata = (u_char *)ah;
329	if (bcast)
330		ereq.flags = IFENCAP_FLAG_BROADCAST;
331	error = ifp->if_requestencap(ifp, &ereq);
332	if (error == 0)
333		*bufsize = ereq.bufsize;
334
335	return (error);
336}
337
338/*
339 * Broadcast an ARP request. Caller specifies:
340 *	- arp header source ip address
341 *	- arp header target ip address
342 *	- arp header source ethernet address
343 */
344static int
345arprequest_internal(struct ifnet *ifp, const struct in_addr *sip,
346    const struct in_addr *tip, u_char *enaddr)
347{
348	struct mbuf *m;
349	struct arphdr *ah;
350	struct sockaddr sa;
351	u_char *carpaddr = NULL;
352	uint8_t linkhdr[LLE_MAX_LINKHDR];
353	size_t linkhdrsize;
354	struct route ro;
355	int error;
356
357	NET_EPOCH_ASSERT();
358
359	if (sip == NULL) {
360		/*
361		 * The caller did not supply a source address, try to find
362		 * a compatible one among those assigned to this interface.
363		 */
364		struct ifaddr *ifa;
365
366		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
367			if (ifa->ifa_addr->sa_family != AF_INET)
368				continue;
369
370			if (ifa->ifa_carp) {
371				if ((*carp_iamatch_p)(ifa, &carpaddr) == 0)
372					continue;
373				sip = &IA_SIN(ifa)->sin_addr;
374			} else {
375				carpaddr = NULL;
376				sip = &IA_SIN(ifa)->sin_addr;
377			}
378
379			if (0 == ((sip->s_addr ^ tip->s_addr) &
380			    IA_MASKSIN(ifa)->sin_addr.s_addr))
381				break;  /* found it. */
382		}
383		if (sip == NULL) {
384			printf("%s: cannot find matching address\n", __func__);
385			return (EADDRNOTAVAIL);
386		}
387	}
388	if (enaddr == NULL)
389		enaddr = carpaddr ? carpaddr : (u_char *)IF_LLADDR(ifp);
390
391	if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
392		return (ENOMEM);
393	m->m_len = sizeof(*ah) + 2 * sizeof(struct in_addr) +
394		2 * ifp->if_addrlen;
395	m->m_pkthdr.len = m->m_len;
396	M_ALIGN(m, m->m_len);
397	ah = mtod(m, struct arphdr *);
398	bzero((caddr_t)ah, m->m_len);
399#ifdef MAC
400	mac_netinet_arp_send(ifp, m);
401#endif
402	ah->ar_pro = htons(ETHERTYPE_IP);
403	ah->ar_hln = ifp->if_addrlen;		/* hardware address length */
404	ah->ar_pln = sizeof(struct in_addr);	/* protocol address length */
405	ah->ar_op = htons(ARPOP_REQUEST);
406	bcopy(enaddr, ar_sha(ah), ah->ar_hln);
407	bcopy(sip, ar_spa(ah), ah->ar_pln);
408	bcopy(tip, ar_tpa(ah), ah->ar_pln);
409	sa.sa_family = AF_ARP;
410	sa.sa_len = 2;
411
412	/* Calculate link header for sending frame */
413	bzero(&ro, sizeof(ro));
414	linkhdrsize = sizeof(linkhdr);
415	error = arp_fillheader(ifp, ah, 1, linkhdr, &linkhdrsize);
416	if (error != 0 && error != EAFNOSUPPORT) {
417		m_freem(m);
418		ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n",
419		    if_name(ifp), error);
420		return (error);
421	}
422
423	ro.ro_prepend = linkhdr;
424	ro.ro_plen = linkhdrsize;
425	ro.ro_flags = 0;
426
427	m->m_flags |= M_BCAST;
428	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
429	error = (*ifp->if_output)(ifp, m, &sa, &ro);
430	ARPSTAT_INC(txrequests);
431	if (error) {
432		ARPSTAT_INC(txerrors);
433		ARP_LOG(LOG_DEBUG, "Failed to send ARP packet on %s: %d\n",
434		    if_name(ifp), error);
435	}
436	return (error);
437}
438
439void
440arprequest(struct ifnet *ifp, const struct in_addr *sip,
441    const struct in_addr *tip, u_char *enaddr)
442{
443
444	(void) arprequest_internal(ifp, sip, tip, enaddr);
445}
446
/*
 * Resolve an IP address into an ethernet address - heavy version.
 * Used internally by arpresolve().
 * We have already checked that we can't use an existing lle without
 * modification so we have to acquire an LLE_EXCLUSIVE lle lock.
 *
 * @flags may carry LLE_CREATE (the caller's lookup found no entry, so the
 * lookup here is skipped and a new entry is allocated) and LLE_ADDRONLY
 * (copy only the link-layer address into @desten instead of the full
 * precomputed link header).
 *
 * On success, desten and pflags are filled in and the function returns 0;
 * if @plle is not NULL it receives a referenced pointer to the entry.
 * If the packet must be held pending resolution, we return EWOULDBLOCK
 * On other errors, we return the corresponding error code.
 * Note that m_freem() handles NULL.
 */
static int
arpresolve_full(struct ifnet *ifp, int is_gw, int flags, struct mbuf *m,
	const struct sockaddr *dst, u_char *desten, uint32_t *pflags,
	struct llentry **plle)
{
	struct llentry *la = NULL, *la_tmp;
	int error, renew;
	char *lladdr;
	int ll_len;

	NET_EPOCH_ASSERT();

	if (pflags != NULL)
		*pflags = 0;
	if (plle != NULL)
		*plle = NULL;

	if ((flags & LLE_CREATE) == 0)
		la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
	/* New entries are never created on NOARP/STATICARP interfaces. */
	if (la == NULL && (ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0) {
		la = lltable_alloc_entry(LLTABLE(ifp), 0, dst);
		if (la == NULL) {
			char addrbuf[INET_ADDRSTRLEN];

			log(LOG_DEBUG,
			    "arpresolve: can't allocate llinfo for %s on %s\n",
			    inet_ntoa_r(SIN(dst)->sin_addr, addrbuf),
			    if_name(ifp));
			m_freem(m);
			return (EINVAL);
		}

		/*
		 * Re-check under the table lock: another thread may have
		 * linked an entry for @dst since the earlier lookup.
		 */
		IF_AFDATA_WLOCK(ifp);
		LLE_WLOCK(la);
		la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
		/* Prefer ANY existing lle over newly-created one */
		if (la_tmp == NULL)
			lltable_link_entry(LLTABLE(ifp), la);
		IF_AFDATA_WUNLOCK(ifp);
		if (la_tmp != NULL) {
			lltable_free_entry(LLTABLE(ifp), la);
			la = la_tmp;
		}
	}
	if (la == NULL) {
		m_freem(m);
		return (EINVAL);
	}

	/* Usable entry: valid, and either static or not yet expired. */
	if ((la->la_flags & LLE_VALID) &&
	    ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) {
		if (flags & LLE_ADDRONLY) {
			lladdr = la->ll_addr;
			ll_len = ifp->if_addrlen;
		} else {
			lladdr = la->r_linkdata;
			ll_len = la->r_hdrlen;
		}
		bcopy(lladdr, desten, ll_len);

		/* Notify LLE code that the entry was used by datapath */
		llentry_provide_feedback(la);
		if (pflags != NULL)
			*pflags = la->la_flags & (LLE_VALID|LLE_IFADDR);
		if (plle) {
			LLE_ADDREF(la);
			*plle = la;
		}
		LLE_WUNLOCK(la);
		return (0);
	}

	/*
	 * Send a request now if none has been sent yet, or if la_expire
	 * (set to time_uptime whenever a request goes out below) is not the
	 * current second.
	 */
	renew = (la->la_asked == 0 || la->la_expire != time_uptime);

	/*
	 * There is an arptab entry, but no ethernet address
	 * response yet.  Add the mbuf to the list, dropping
	 * the oldest packet if we have exceeded the system
	 * setting.
	 */
	if (m != NULL) {
		size_t dropped = lltable_append_entry_queue(la, m, V_arp_maxhold);
		ARPSTAT_ADD(dropped, dropped);
	}

	/*
	 * Return EWOULDBLOCK if we have tried less than arp_maxtries. It
	 * will be masked by ether_output(). Return EHOSTDOWN/EHOSTUNREACH
	 * if we have already sent arp_maxtries ARP requests. Retransmit the
	 * ARP request, but not faster than one request per second.
	 */
	if (la->la_asked < V_arp_maxtries)
		error = EWOULDBLOCK;	/* First request. */
	else
		error = is_gw != 0 ? EHOSTUNREACH : EHOSTDOWN;

	if (renew) {
		int canceled, e;

		/*
		 * Take a reference on behalf of the timer being armed.
		 * NOTE(review): when callout_reset() reports that it
		 * cancelled an already pending callout, that earlier
		 * arming presumably still owns a reference, so the extra
		 * one is dropped again -- confirm against callout(9).
		 */
		LLE_ADDREF(la);
		la->la_expire = time_uptime;
		canceled = callout_reset(&la->lle_timer, hz * V_arpt_down,
		    arptimer, la);
		if (canceled)
			LLE_REMREF(la);
		la->la_asked++;
		/* Drop the entry lock before transmitting. */
		LLE_WUNLOCK(la);
		e = arprequest_internal(ifp, NULL, &SIN(dst)->sin_addr, NULL);
		/*
		 * Only overwrite 'error' in case of error; in case of success
		 * the proper return value was already set above.
		 */
		if (e != 0)
			return (e);
		return (error);
	}

	LLE_WUNLOCK(la);
	return (error);
}
578
579/*
580 * Lookups link header based on an IP address.
581 * On input:
582 *    ifp is the interface we use
583 *    is_gw != 0 if @dst represents gateway to some destination
584 *    m is the mbuf. May be NULL if we don't have a packet.
585 *    dst is the next hop,
586 *    desten is the storage to put LL header.
587 *    flags returns subset of lle flags: LLE_VALID | LLE_IFADDR
588 *
589 * On success, full/partial link header and flags are filled in and
590 * the function returns 0.
591 * If the packet must be held pending resolution, we return EWOULDBLOCK
592 * On other errors, we return the corresponding error code.
593 * Note that m_freem() handles NULL.
594 */
595int
596arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m,
597	const struct sockaddr *dst, u_char *desten, uint32_t *pflags,
598	struct llentry **plle)
599{
600	struct llentry *la = NULL;
601
602	NET_EPOCH_ASSERT();
603
604	if (pflags != NULL)
605		*pflags = 0;
606	if (plle != NULL)
607		*plle = NULL;
608
609	if (m != NULL) {
610		if (m->m_flags & M_BCAST) {
611			/* broadcast */
612			(void)memcpy(desten,
613			    ifp->if_broadcastaddr, ifp->if_addrlen);
614			return (0);
615		}
616		if (m->m_flags & M_MCAST) {
617			/* multicast */
618			ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten);
619			return (0);
620		}
621	}
622
623	la = lla_lookup(LLTABLE(ifp), plle ? LLE_EXCLUSIVE : LLE_UNLOCKED, dst);
624	if (la != NULL && (la->r_flags & RLLE_VALID) != 0) {
625		/* Entry found, let's copy lle info */
626		bcopy(la->r_linkdata, desten, la->r_hdrlen);
627		if (pflags != NULL)
628			*pflags = LLE_VALID | (la->r_flags & RLLE_IFADDR);
629		/* Notify the LLE handling code that the entry was used. */
630		llentry_provide_feedback(la);
631		if (plle) {
632			LLE_ADDREF(la);
633			*plle = la;
634			LLE_WUNLOCK(la);
635		}
636		return (0);
637	}
638	if (plle && la)
639		LLE_WUNLOCK(la);
640
641	return (arpresolve_full(ifp, is_gw, la == NULL ? LLE_CREATE : 0, m, dst,
642	    desten, pflags, plle));
643}
644
645/*
646 * Common length and type checks are done here,
647 * then the protocol-specific routine is called.
648 */
649static void
650arpintr(struct mbuf *m)
651{
652	struct arphdr *ar;
653	struct ifnet *ifp;
654	char *layer;
655	int hlen;
656
657	ifp = m->m_pkthdr.rcvif;
658
659	if (m->m_len < sizeof(struct arphdr) &&
660	    ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) {
661		ARP_LOG(LOG_NOTICE, "packet with short header received on %s\n",
662		    if_name(ifp));
663		return;
664	}
665	ar = mtod(m, struct arphdr *);
666
667	/* Check if length is sufficient */
668	if (m->m_len <  arphdr_len(ar)) {
669		m = m_pullup(m, arphdr_len(ar));
670		if (m == NULL) {
671			ARP_LOG(LOG_NOTICE, "short packet received on %s\n",
672			    if_name(ifp));
673			return;
674		}
675		ar = mtod(m, struct arphdr *);
676	}
677
678	hlen = 0;
679	layer = "";
680	switch (ntohs(ar->ar_hrd)) {
681	case ARPHRD_ETHER:
682		hlen = ETHER_ADDR_LEN; /* RFC 826 */
683		layer = "ethernet";
684		break;
685	case ARPHRD_IEEE802:
686		hlen = ETHER_ADDR_LEN;
687		layer = "ieee802";
688		break;
689	case ARPHRD_INFINIBAND:
690		hlen = 20;	/* RFC 4391, INFINIBAND_ALEN */
691		layer = "infiniband";
692		break;
693	case ARPHRD_IEEE1394:
694		hlen = 0; /* SHALL be 16 */ /* RFC 2734 */
695		layer = "firewire";
696
697		/*
698		 * Restrict too long hardware addresses.
699		 * Currently we are capable of handling 20-byte
700		 * addresses ( sizeof(lle->ll_addr) )
701		 */
702		if (ar->ar_hln >= 20)
703			hlen = 16;
704		break;
705	default:
706		ARP_LOG(LOG_NOTICE,
707		    "packet with unknown hardware format 0x%02d received on "
708		    "%s\n", ntohs(ar->ar_hrd), if_name(ifp));
709		m_freem(m);
710		return;
711	}
712
713	if (hlen != 0 && hlen != ar->ar_hln) {
714		ARP_LOG(LOG_NOTICE,
715		    "packet with invalid %s address length %d received on %s\n",
716		    layer, ar->ar_hln, if_name(ifp));
717		m_freem(m);
718		return;
719	}
720
721	ARPSTAT_INC(received);
722	switch (ntohs(ar->ar_pro)) {
723#ifdef INET
724	case ETHERTYPE_IP:
725		in_arpinput(m);
726		return;
727#endif
728	}
729	m_freem(m);
730}
731
732#ifdef INET
733/*
734 * ARP for Internet protocols on 10 Mb/s Ethernet.
735 * Algorithm is that given in RFC 826.
736 * In addition, a sanity check is performed on the sender
737 * protocol address, to catch impersonators.
738 * We no longer handle negotiations for use of trailer protocol:
739 * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent
740 * along with IP replies if we wanted trailers sent to us,
741 * and also sent them in response to IP replies.
742 * This allowed either end to announce the desire to receive
743 * trailer packets.
744 * We no longer reply to requests for ETHERTYPE_TRAIL protocol either,
745 * but formerly didn't normally send requests.
746 */
/*
 * Tunables for ARP input processing.  These are plain globals (not
 * per-vnet variables), each exported read/write under
 * net.link.ether.inet; the sysctl descriptions below state their purpose.
 * allow_multicast == 0 makes in_arpinput() drop packets whose sender
 * hardware address is multicast.
 */
static int log_arp_wrong_iface = 1;
static int log_arp_movements = 1;
static int log_arp_permanent_modify = 1;
static int allow_multicast = 0;

SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW,
	&log_arp_wrong_iface, 0,
	"log arp packets arriving on the wrong interface");
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW,
	&log_arp_movements, 0,
	"log arp replies from MACs different than the one in the cache");
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_permanent_modify, CTLFLAG_RW,
	&log_arp_permanent_modify, 0,
	"log arp replies from MACs different than the one in the permanent arp entry");
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, allow_multicast, CTLFLAG_RW,
	&allow_multicast, 0, "accept multicast addresses");
763
764static void
765in_arpinput(struct mbuf *m)
766{
767	struct arphdr *ah;
768	struct ifnet *ifp = m->m_pkthdr.rcvif;
769	struct llentry *la = NULL, *la_tmp;
770	struct ifaddr *ifa;
771	struct in_ifaddr *ia;
772	struct sockaddr sa;
773	struct in_addr isaddr, itaddr, myaddr;
774	u_int8_t *enaddr = NULL;
775	int op;
776	int bridged = 0, is_bridge = 0;
777	int carped;
778	struct sockaddr_in sin;
779	struct sockaddr *dst;
780	struct nhop_object *nh;
781	uint8_t linkhdr[LLE_MAX_LINKHDR];
782	struct route ro;
783	size_t linkhdrsize;
784	int lladdr_off;
785	int error;
786	char addrbuf[INET_ADDRSTRLEN];
787
788	NET_EPOCH_ASSERT();
789
790	sin.sin_len = sizeof(struct sockaddr_in);
791	sin.sin_family = AF_INET;
792	sin.sin_addr.s_addr = 0;
793
794	if (ifp->if_bridge)
795		bridged = 1;
796	if (ifp->if_type == IFT_BRIDGE)
797		is_bridge = 1;
798
799	/*
800	 * We already have checked that mbuf contains enough contiguous data
801	 * to hold entire arp message according to the arp header.
802	 */
803	ah = mtod(m, struct arphdr *);
804
805	/*
806	 * ARP is only for IPv4 so we can reject packets with
807	 * a protocol length not equal to an IPv4 address.
808	 */
809	if (ah->ar_pln != sizeof(struct in_addr)) {
810		ARP_LOG(LOG_NOTICE, "requested protocol length != %zu\n",
811		    sizeof(struct in_addr));
812		goto drop;
813	}
814
815	if (allow_multicast == 0 && ETHER_IS_MULTICAST(ar_sha(ah))) {
816		ARP_LOG(LOG_NOTICE, "%*D is multicast\n",
817		    ifp->if_addrlen, (u_char *)ar_sha(ah), ":");
818		goto drop;
819	}
820
821	op = ntohs(ah->ar_op);
822	(void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr));
823	(void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr));
824
825	if (op == ARPOP_REPLY)
826		ARPSTAT_INC(rxreplies);
827
828	/*
829	 * For a bridge, we want to check the address irrespective
830	 * of the receive interface. (This will change slightly
831	 * when we have clusters of interfaces).
832	 */
833	CK_LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
834		if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
835		    ia->ia_ifp == ifp) &&
836		    itaddr.s_addr == ia->ia_addr.sin_addr.s_addr &&
837		    (ia->ia_ifa.ifa_carp == NULL ||
838		    (*carp_iamatch_p)(&ia->ia_ifa, &enaddr))) {
839			ifa_ref(&ia->ia_ifa);
840			goto match;
841		}
842	}
843	CK_LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash)
844		if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
845		    ia->ia_ifp == ifp) &&
846		    isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) {
847			ifa_ref(&ia->ia_ifa);
848			goto match;
849		}
850
851#define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia)				\
852  (ia->ia_ifp->if_bridge == ifp->if_softc &&				\
853  !bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) &&	\
854  addr == ia->ia_addr.sin_addr.s_addr)
855	/*
856	 * Check the case when bridge shares its MAC address with
857	 * some of its children, so packets are claimed by bridge
858	 * itself (bridge_input() does it first), but they are really
859	 * meant to be destined to the bridge member.
860	 */
861	if (is_bridge) {
862		CK_LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
863			if (BDG_MEMBER_MATCHES_ARP(itaddr.s_addr, ifp, ia)) {
864				ifa_ref(&ia->ia_ifa);
865				ifp = ia->ia_ifp;
866				goto match;
867			}
868		}
869	}
870#undef BDG_MEMBER_MATCHES_ARP
871
872	/*
873	 * No match, use the first inet address on the receive interface
874	 * as a dummy address for the rest of the function.
875	 */
876	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
877		if (ifa->ifa_addr->sa_family == AF_INET &&
878		    (ifa->ifa_carp == NULL ||
879		    (*carp_iamatch_p)(ifa, &enaddr))) {
880			ia = ifatoia(ifa);
881			ifa_ref(ifa);
882			goto match;
883		}
884
885	/*
886	 * If bridging, fall back to using any inet address.
887	 */
888	if (!bridged || (ia = CK_STAILQ_FIRST(&V_in_ifaddrhead)) == NULL)
889		goto drop;
890	ifa_ref(&ia->ia_ifa);
891match:
892	if (!enaddr)
893		enaddr = (u_int8_t *)IF_LLADDR(ifp);
894	carped = (ia->ia_ifa.ifa_carp != NULL);
895	myaddr = ia->ia_addr.sin_addr;
896	ifa_free(&ia->ia_ifa);
897	if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen))
898		goto drop;	/* it's from me, ignore it. */
899	if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) {
900		ARP_LOG(LOG_NOTICE, "link address is broadcast for IP address "
901		    "%s!\n", inet_ntoa_r(isaddr, addrbuf));
902		goto drop;
903	}
904
905	if (ifp->if_addrlen != ah->ar_hln) {
906		ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, "
907		    "i/f %d (ignored)\n", ifp->if_addrlen,
908		    (u_char *) ar_sha(ah), ":", ah->ar_hln,
909		    ifp->if_addrlen);
910		goto drop;
911	}
912
913	/*
914	 * Warn if another host is using the same IP address, but only if the
915	 * IP address isn't 0.0.0.0, which is used for DHCP only, in which
916	 * case we suppress the warning to avoid false positive complaints of
917	 * potential misconfiguration.
918	 */
919	if (!bridged && !carped && isaddr.s_addr == myaddr.s_addr &&
920	    myaddr.s_addr != 0) {
921		ARP_LOG(LOG_ERR, "%*D is using my IP address %s on %s!\n",
922		   ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
923		   inet_ntoa_r(isaddr, addrbuf), ifp->if_xname);
924		itaddr = myaddr;
925		ARPSTAT_INC(dupips);
926		goto reply;
927	}
928	if (ifp->if_flags & IFF_STATICARP)
929		goto reply;
930
931	bzero(&sin, sizeof(sin));
932	sin.sin_len = sizeof(struct sockaddr_in);
933	sin.sin_family = AF_INET;
934	sin.sin_addr = isaddr;
935	dst = (struct sockaddr *)&sin;
936	la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
937	if (la != NULL)
938		arp_check_update_lle(ah, isaddr, ifp, bridged, la);
939	else if (itaddr.s_addr == myaddr.s_addr) {
940		/*
941		 * Request/reply to our address, but no lle exists yet.
942		 * Calculate full link prepend to use in lle.
943		 */
944		linkhdrsize = sizeof(linkhdr);
945		if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr,
946		    &linkhdrsize, &lladdr_off) != 0)
947			goto reply;
948
949		/* Allocate new entry */
950		la = lltable_alloc_entry(LLTABLE(ifp), 0, dst);
951		if (la == NULL) {
952			/*
953			 * lle creation may fail if source address belongs
954			 * to non-directly connected subnet. However, we
955			 * will try to answer the request instead of dropping
956			 * frame.
957			 */
958			goto reply;
959		}
960		lltable_set_entry_addr(ifp, la, linkhdr, linkhdrsize,
961		    lladdr_off);
962
963		IF_AFDATA_WLOCK(ifp);
964		LLE_WLOCK(la);
965		la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
966
967		/*
968		 * Check if lle still does not exists.
969		 * If it does, that means that we either
970		 * 1) have configured it explicitly, via
971		 * 1a) 'arp -s' static entry or
972		 * 1b) interface address static record
973		 * or
974		 * 2) it was the result of sending first packet to-host
975		 * or
976		 * 3) it was another arp reply packet we handled in
977		 * different thread.
978		 *
979		 * In all cases except 3) we definitely need to prefer
980		 * existing lle. For the sake of simplicity, prefer any
981		 * existing lle over newly-create one.
982		 */
983		if (la_tmp == NULL)
984			lltable_link_entry(LLTABLE(ifp), la);
985		IF_AFDATA_WUNLOCK(ifp);
986
987		if (la_tmp == NULL) {
988			arp_mark_lle_reachable(la, ifp);
989			LLE_WUNLOCK(la);
990		} else {
991			/* Free newly-create entry and handle packet */
992			lltable_free_entry(LLTABLE(ifp), la);
993			la = la_tmp;
994			la_tmp = NULL;
995			arp_check_update_lle(ah, isaddr, ifp, bridged, la);
996			/* arp_check_update_lle() returns @la unlocked */
997		}
998		la = NULL;
999	}
1000reply:
1001	if (op != ARPOP_REQUEST)
1002		goto drop;
1003	ARPSTAT_INC(rxrequests);
1004
1005	if (itaddr.s_addr == myaddr.s_addr) {
1006		/* Shortcut.. the receiving interface is the target. */
1007		(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
1008		(void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
1009	} else {
1010		/*
1011		 * Destination address is not ours. Check if
1012		 * proxyarp entry exists or proxyarp is turned on globally.
1013		 */
1014		struct llentry *lle;
1015
1016		sin.sin_addr = itaddr;
1017		lle = lla_lookup(LLTABLE(ifp), 0, (struct sockaddr *)&sin);
1018
1019		if ((lle != NULL) && (lle->la_flags & LLE_PUB)) {
1020			(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
1021			(void)memcpy(ar_sha(ah), lle->ll_addr, ah->ar_hln);
1022			LLE_RUNLOCK(lle);
1023		} else {
1024			if (lle != NULL)
1025				LLE_RUNLOCK(lle);
1026
1027			if (!V_arp_proxyall)
1028				goto drop;
1029
1030			NET_EPOCH_ASSERT();
1031			nh = fib4_lookup(ifp->if_fib, itaddr, 0, 0, 0);
1032			if (nh == NULL)
1033				goto drop;
1034
1035			/*
1036			 * Don't send proxies for nodes on the same interface
1037			 * as this one came out of, or we'll get into a fight
1038			 * over who claims what Ether address.
1039			 */
1040			if (nh->nh_ifp == ifp)
1041				goto drop;
1042
1043			(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
1044			(void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
1045
1046			/*
1047			 * Also check that the node which sent the ARP packet
1048			 * is on the interface we expect it to be on. This
1049			 * avoids ARP chaos if an interface is connected to the
1050			 * wrong network.
1051			 */
1052
1053			nh = fib4_lookup(ifp->if_fib, isaddr, 0, 0, 0);
1054			if (nh == NULL)
1055				goto drop;
1056			if (nh->nh_ifp != ifp) {
1057				ARP_LOG(LOG_INFO, "proxy: ignoring request"
1058				    " from %s via %s\n",
1059				    inet_ntoa_r(isaddr, addrbuf),
1060				    ifp->if_xname);
1061				goto drop;
1062			}
1063
1064#ifdef DEBUG_PROXY
1065			printf("arp: proxying for %s\n",
1066			    inet_ntoa_r(itaddr, addrbuf));
1067#endif
1068		}
1069	}
1070
1071	if (itaddr.s_addr == myaddr.s_addr &&
1072	    IN_LINKLOCAL(ntohl(itaddr.s_addr))) {
1073		/* RFC 3927 link-local IPv4; always reply by broadcast. */
1074#ifdef DEBUG_LINKLOCAL
1075		printf("arp: sending reply for link-local addr %s\n",
1076		    inet_ntoa_r(itaddr, addrbuf));
1077#endif
1078		m->m_flags |= M_BCAST;
1079		m->m_flags &= ~M_MCAST;
1080	} else {
1081		/* default behaviour; never reply by broadcast. */
1082		m->m_flags &= ~(M_BCAST|M_MCAST);
1083	}
1084	(void)memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln);
1085	(void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln);
1086	ah->ar_op = htons(ARPOP_REPLY);
1087	ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */
1088	m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln);
1089	m->m_pkthdr.len = m->m_len;
1090	m->m_pkthdr.rcvif = NULL;
1091	sa.sa_family = AF_ARP;
1092	sa.sa_len = 2;
1093
1094	/* Calculate link header for sending frame */
1095	bzero(&ro, sizeof(ro));
1096	linkhdrsize = sizeof(linkhdr);
1097	error = arp_fillheader(ifp, ah, 0, linkhdr, &linkhdrsize);
1098
1099	/*
1100	 * arp_fillheader() may fail due to lack of support inside encap request
1101	 * routing. This is not necessary an error, AF_ARP can/should be handled
1102	 * by if_output().
1103	 */
1104	if (error != 0 && error != EAFNOSUPPORT) {
1105		ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n",
1106		    if_name(ifp), error);
1107		goto drop;
1108	}
1109
1110	ro.ro_prepend = linkhdr;
1111	ro.ro_plen = linkhdrsize;
1112	ro.ro_flags = 0;
1113
1114	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
1115	(*ifp->if_output)(ifp, m, &sa, &ro);
1116	ARPSTAT_INC(txreplies);
1117	return;
1118
1119drop:
1120	m_freem(m);
1121}
1122#endif
1123
1124static struct mbuf *
1125arp_grab_holdchain(struct llentry *la)
1126{
1127	struct mbuf *chain;
1128
1129	LLE_WLOCK_ASSERT(la);
1130
1131	chain = la->la_hold;
1132	la->la_hold = NULL;
1133	la->la_numheld = 0;
1134
1135	return (chain);
1136}
1137
1138static void
1139arp_flush_holdchain(struct ifnet *ifp, struct llentry *la, struct mbuf *chain)
1140{
1141	struct mbuf *m_hold, *m_hold_next;
1142	struct sockaddr_in sin;
1143
1144	NET_EPOCH_ASSERT();
1145
1146	struct route ro = {
1147		.ro_prepend = la->r_linkdata,
1148		.ro_plen = la->r_hdrlen,
1149	};
1150
1151	lltable_fill_sa_entry(la, (struct sockaddr *)&sin);
1152
1153	for (m_hold = chain; m_hold != NULL; m_hold = m_hold_next) {
1154		m_hold_next = m_hold->m_nextpkt;
1155		m_hold->m_nextpkt = NULL;
1156		/* Avoid confusing lower layers. */
1157		m_clrprotoflags(m_hold);
1158		(*ifp->if_output)(ifp, m_hold, (struct sockaddr *)&sin, &ro);
1159	}
1160}
1161
1162/*
1163 * Checks received arp data against existing @la.
1164 * Updates lle state/performs notification if necessary.
1165 */
1166static void
1167arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp,
1168    int bridged, struct llentry *la)
1169{
1170	uint8_t linkhdr[LLE_MAX_LINKHDR];
1171	size_t linkhdrsize;
1172	int lladdr_off;
1173	char addrbuf[INET_ADDRSTRLEN];
1174
1175	LLE_WLOCK_ASSERT(la);
1176
1177	/* the following is not an error when doing bridging */
1178	if (!bridged && la->lle_tbl->llt_ifp != ifp) {
1179		if (log_arp_wrong_iface)
1180			ARP_LOG(LOG_WARNING, "%s is on %s "
1181			    "but got reply from %*D on %s\n",
1182			    inet_ntoa_r(isaddr, addrbuf),
1183			    la->lle_tbl->llt_ifp->if_xname,
1184			    ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
1185			    ifp->if_xname);
1186		LLE_WUNLOCK(la);
1187		return;
1188	}
1189	if ((la->la_flags & LLE_VALID) &&
1190	    bcmp(ar_sha(ah), la->ll_addr, ifp->if_addrlen)) {
1191		if (la->la_flags & LLE_STATIC) {
1192			LLE_WUNLOCK(la);
1193			if (log_arp_permanent_modify)
1194				ARP_LOG(LOG_ERR,
1195				    "%*D attempts to modify "
1196				    "permanent entry for %s on %s\n",
1197				    ifp->if_addrlen,
1198				    (u_char *)ar_sha(ah), ":",
1199				    inet_ntoa_r(isaddr, addrbuf),
1200				    ifp->if_xname);
1201			return;
1202		}
1203		if (log_arp_movements) {
1204			ARP_LOG(LOG_INFO, "%s moved from %*D "
1205			    "to %*D on %s\n",
1206			    inet_ntoa_r(isaddr, addrbuf),
1207			    ifp->if_addrlen,
1208			    (u_char *)la->ll_addr, ":",
1209			    ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
1210			    ifp->if_xname);
1211		}
1212	}
1213
1214	/* Calculate full link prepend to use in lle */
1215	linkhdrsize = sizeof(linkhdr);
1216	if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr,
1217	    &linkhdrsize, &lladdr_off) != 0) {
1218		LLE_WUNLOCK(la);
1219		return;
1220	}
1221
1222	/* Check if something has changed */
1223	if (memcmp(la->r_linkdata, linkhdr, linkhdrsize) != 0 ||
1224	    (la->la_flags & LLE_VALID) == 0) {
1225		/* Try to perform LLE update */
1226		if (lltable_try_set_entry_addr(ifp, la, linkhdr, linkhdrsize,
1227		    lladdr_off) == 0) {
1228			LLE_WUNLOCK(la);
1229			return;
1230		}
1231
1232		/* Clear fast path feedback request if set */
1233		llentry_mark_used(la);
1234	}
1235
1236	arp_mark_lle_reachable(la, ifp);
1237
1238	/*
1239	 * The packets are all freed within the call to the output
1240	 * routine.
1241	 *
1242	 * NB: The lock MUST be released before the call to the
1243	 * output routine.
1244	 */
1245	if (la->la_hold != NULL) {
1246		struct mbuf *chain;
1247
1248		chain = arp_grab_holdchain(la);
1249		LLE_WUNLOCK(la);
1250		arp_flush_holdchain(ifp, la, chain);
1251	} else
1252		LLE_WUNLOCK(la);
1253}
1254
1255static void
1256arp_mark_lle_reachable(struct llentry *la, struct ifnet *ifp)
1257{
1258	int canceled, wtime;
1259
1260	LLE_WLOCK_ASSERT(la);
1261
1262	la->ln_state = ARP_LLINFO_REACHABLE;
1263	EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED);
1264
1265	if ((ifp->if_flags & IFF_STICKYARP) != 0)
1266		la->la_flags |= LLE_STATIC;
1267
1268	if (!(la->la_flags & LLE_STATIC)) {
1269		LLE_ADDREF(la);
1270		la->la_expire = time_uptime + V_arpt_keep;
1271		wtime = V_arpt_keep - V_arp_maxtries * V_arpt_rexmit;
1272		if (wtime < 0)
1273			wtime = V_arpt_keep;
1274		canceled = callout_reset(&la->lle_timer,
1275		    hz * wtime, arptimer, la);
1276		if (canceled)
1277			LLE_REMREF(la);
1278	}
1279	la->la_asked = 0;
1280	la->la_preempt = V_arp_maxtries;
1281}
1282
1283/*
1284 * Add permanent link-layer record for given interface address.
1285 */
1286static __noinline void
1287arp_add_ifa_lle(struct ifnet *ifp, const struct sockaddr *dst)
1288{
1289	struct llentry *lle, *lle_tmp;
1290
1291	/*
1292	 * Interface address LLE record is considered static
1293	 * because kernel code relies on LLE_STATIC flag to check
1294	 * if these entries can be rewriten by arp updates.
1295	 */
1296	lle = lltable_alloc_entry(LLTABLE(ifp), LLE_IFADDR | LLE_STATIC, dst);
1297	if (lle == NULL) {
1298		log(LOG_INFO, "arp_ifinit: cannot create arp "
1299		    "entry for interface address\n");
1300		return;
1301	}
1302
1303	IF_AFDATA_WLOCK(ifp);
1304	LLE_WLOCK(lle);
1305	/* Unlink any entry if exists */
1306	lle_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
1307	if (lle_tmp != NULL)
1308		lltable_unlink_entry(LLTABLE(ifp), lle_tmp);
1309
1310	lltable_link_entry(LLTABLE(ifp), lle);
1311	IF_AFDATA_WUNLOCK(ifp);
1312
1313	if (lle_tmp != NULL)
1314		EVENTHANDLER_INVOKE(lle_event, lle_tmp, LLENTRY_EXPIRED);
1315
1316	EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED);
1317	LLE_WUNLOCK(lle);
1318	if (lle_tmp != NULL)
1319		lltable_free_entry(LLTABLE(ifp), lle_tmp);
1320}
1321
1322/*
1323 * Handle the garp_rexmit_count. Like sysctl_handle_int(), but limits the range
1324 * of valid values.
1325 */
1326static int
1327sysctl_garp_rexmit(SYSCTL_HANDLER_ARGS)
1328{
1329	int error;
1330	int rexmit_count = *(int *)arg1;
1331
1332	error = sysctl_handle_int(oidp, &rexmit_count, 0, req);
1333
1334	/* Enforce limits on any new value that may have been set. */
1335	if (!error && req->newptr) {
1336		/* A new value was set. */
1337		if (rexmit_count < 0) {
1338			rexmit_count = 0;
1339		} else if (rexmit_count > MAX_GARP_RETRANSMITS) {
1340			rexmit_count = MAX_GARP_RETRANSMITS;
1341		}
1342		*(int *)arg1 = rexmit_count;
1343	}
1344
1345	return (error);
1346}
1347
1348/*
1349 * Retransmit a Gratuitous ARP (GARP) and, if necessary, schedule a callout to
1350 * retransmit it again. A pending callout owns a reference to the ifa.
1351 */
1352static void
1353garp_rexmit(void *arg)
1354{
1355	struct in_ifaddr *ia = arg;
1356
1357	if (callout_pending(&ia->ia_garp_timer) ||
1358	    !callout_active(&ia->ia_garp_timer)) {
1359		IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp);
1360		ifa_free(&ia->ia_ifa);
1361		return;
1362	}
1363
1364	CURVNET_SET(ia->ia_ifa.ifa_ifp->if_vnet);
1365
1366	/*
1367	 * Drop lock while the ARP request is generated.
1368	 */
1369	IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp);
1370
1371	arprequest(ia->ia_ifa.ifa_ifp, &IA_SIN(ia)->sin_addr,
1372	    &IA_SIN(ia)->sin_addr, IF_LLADDR(ia->ia_ifa.ifa_ifp));
1373
1374	/*
1375	 * Increment the count of retransmissions. If the count has reached the
1376	 * maximum value, stop sending the GARP packets. Otherwise, schedule
1377	 * the callout to retransmit another GARP packet.
1378	 */
1379	++ia->ia_garp_count;
1380	if (ia->ia_garp_count >= garp_rexmit_count) {
1381		ifa_free(&ia->ia_ifa);
1382	} else {
1383		int rescheduled;
1384		IF_ADDR_WLOCK(ia->ia_ifa.ifa_ifp);
1385		rescheduled = callout_reset(&ia->ia_garp_timer,
1386		    (1 << ia->ia_garp_count) * hz,
1387		    garp_rexmit, ia);
1388		IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp);
1389		if (rescheduled) {
1390			ifa_free(&ia->ia_ifa);
1391		}
1392	}
1393
1394	CURVNET_RESTORE();
1395}
1396
1397/*
1398 * Start the GARP retransmit timer.
1399 *
1400 * A single GARP is always transmitted when an IPv4 address is added
1401 * to an interface and that is usually sufficient. However, in some
1402 * circumstances, such as when a shared address is passed between
1403 * cluster nodes, this single GARP may occasionally be dropped or
1404 * lost. This can lead to neighbors on the network link working with a
1405 * stale ARP cache and sending packets destined for that address to
1406 * the node that previously owned the address, which may not respond.
1407 *
1408 * To avoid this situation, GARP retransmits can be enabled by setting
1409 * the net.link.ether.inet.garp_rexmit_count sysctl to a value greater
1410 * than zero. The setting represents the maximum number of
1411 * retransmissions. The interval between retransmissions is calculated
1412 * using an exponential backoff algorithm, doubling each time, so the
1413 * retransmission intervals are: {1, 2, 4, 8, 16, ...} (seconds).
1414 */
1415static void
1416garp_timer_start(struct ifaddr *ifa)
1417{
1418	struct in_ifaddr *ia = (struct in_ifaddr *) ifa;
1419
1420	IF_ADDR_WLOCK(ia->ia_ifa.ifa_ifp);
1421	ia->ia_garp_count = 0;
1422	if (callout_reset(&ia->ia_garp_timer, (1 << ia->ia_garp_count) * hz,
1423	    garp_rexmit, ia) == 0) {
1424		ifa_ref(ifa);
1425	}
1426	IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp);
1427}
1428
1429void
1430arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa)
1431{
1432	struct epoch_tracker et;
1433	const struct sockaddr_in *dst_in;
1434	const struct sockaddr *dst;
1435
1436	if (ifa->ifa_carp != NULL)
1437		return;
1438
1439	dst = ifa->ifa_addr;
1440	dst_in = (const struct sockaddr_in *)dst;
1441
1442	if (ntohl(dst_in->sin_addr.s_addr) == INADDR_ANY)
1443		return;
1444	NET_EPOCH_ENTER(et);
1445	arp_announce_ifaddr(ifp, dst_in->sin_addr, IF_LLADDR(ifp));
1446	NET_EPOCH_EXIT(et);
1447	if (garp_rexmit_count > 0) {
1448		garp_timer_start(ifa);
1449	}
1450
1451	arp_add_ifa_lle(ifp, dst);
1452}
1453
1454void
1455arp_announce_ifaddr(struct ifnet *ifp, struct in_addr addr, u_char *enaddr)
1456{
1457
1458	if (ntohl(addr.s_addr) != INADDR_ANY)
1459		arprequest(ifp, &addr, &addr, enaddr);
1460}
1461
1462/*
1463 * Sends gratuitous ARPs for each ifaddr to notify other
1464 * nodes about the address change.
1465 */
1466static __noinline void
1467arp_handle_ifllchange(struct ifnet *ifp)
1468{
1469	struct ifaddr *ifa;
1470
1471	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1472		if (ifa->ifa_addr->sa_family == AF_INET)
1473			arp_ifinit(ifp, ifa);
1474	}
1475}
1476
1477/*
1478 * A handler for interface link layer address change event.
1479 */
1480static void
1481arp_iflladdr(void *arg __unused, struct ifnet *ifp)
1482{
1483	/* if_bridge can update its lladdr during if_vmove(), after we've done
1484	 * if_detach_internal()/dom_ifdetach(). */
1485	if (ifp->if_afdata[AF_INET] == NULL)
1486		return;
1487
1488	lltable_update_ifaddr(LLTABLE(ifp));
1489
1490	if ((ifp->if_flags & IFF_UP) != 0)
1491		arp_handle_ifllchange(ifp);
1492}
1493
1494static void
1495vnet_arp_init(void)
1496{
1497
1498	if (IS_DEFAULT_VNET(curvnet)) {
1499		netisr_register(&arp_nh);
1500		iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event,
1501		    arp_iflladdr, NULL, EVENTHANDLER_PRI_ANY);
1502	}
1503#ifdef VIMAGE
1504	else
1505		netisr_register_vnet(&arp_nh);
1506#endif
1507}
1508VNET_SYSINIT(vnet_arp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND,
1509    vnet_arp_init, 0);
1510
#ifdef VIMAGE
/*
 * Per-vnet teardown: undo the per-vnet netisr registration of ARP.
 *
 * ARP has to be unregistered along with IP; otherwise we risk doing
 * INADDR_HASH lookups after the hash has been destroyed.  Ideally this
 * would go on SI_ORDER_3.5.
 */
static void
vnet_arp_destroy(void *arg __unused)
{

	netisr_unregister_vnet(&arp_nh);
}
VNET_SYSUNINIT(vnet_arp_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
    vnet_arp_destroy, NULL);
#endif
1525