1/*-
2 * Copyright (c) 1982, 1986, 1988, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	@(#)if_ether.c	8.1 (Berkeley) 6/10/93
30 */
31
32/*
33 * Ethernet address resolution protocol.
34 * TODO:
35 *	add "inuse/lock" bit (or ref. count) along with valid bit
36 */
37
38#include <sys/cdefs.h>
39__FBSDID("$FreeBSD: stable/10/sys/netinet/if_ether.c 309340 2016-11-30 22:20:23Z vangyzen $");
40
41#include "opt_inet.h"
42
43#include <sys/param.h>
44#include <sys/kernel.h>
45#include <sys/queue.h>
46#include <sys/sysctl.h>
47#include <sys/systm.h>
48#include <sys/mbuf.h>
49#include <sys/malloc.h>
50#include <sys/proc.h>
51#include <sys/socket.h>
52#include <sys/syslog.h>
53
54#include <net/if.h>
55#include <net/if_dl.h>
56#include <net/if_types.h>
57#include <net/netisr.h>
58#include <net/if_llc.h>
59#include <net/ethernet.h>
60#include <net/route.h>
61#include <net/vnet.h>
62
63#include <netinet/in.h>
64#include <netinet/in_var.h>
65#include <net/if_llatbl.h>
66#include <netinet/if_ether.h>
67#ifdef INET
68#include <netinet/ip_carp.h>
69#endif
70
71#include <net/if_arc.h>
72#include <net/iso88025.h>
73
74#include <security/mac/mac_framework.h>
75
/*
 * Convenience casts from a generic sockaddr pointer.  SIN() yields a
 * read-only IPv4 view, SDL() a link-level view.  The argument is
 * parenthesized so that expressions such as SDL(p + 1) cast the whole
 * expression rather than only its first operand.
 */
#define SIN(s) ((const struct sockaddr_in *)(s))
#define SDL(s) ((struct sockaddr_dl *)(s))
78
79SYSCTL_DECL(_net_link_ether);
80static SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, "");
81static SYSCTL_NODE(_net_link_ether, PF_ARP, arp, CTLFLAG_RW, 0, "");
82
83/* timer values */
84static VNET_DEFINE(int, arpt_keep) = (20*60);	/* once resolved, good for 20
85						 * minutes */
86static VNET_DEFINE(int, arp_maxtries) = 5;
87VNET_DEFINE(int, useloopback) = 1;	/* use loopback interface for
88					 * local traffic */
89static VNET_DEFINE(int, arp_proxyall) = 0;
90static VNET_DEFINE(int, arpt_down) = 20;	/* keep incomplete entries for
91						 * 20 seconds */
92VNET_PCPUSTAT_DEFINE(struct arpstat, arpstat);  /* ARP statistics, see if_arp.h */
93VNET_PCPUSTAT_SYSINIT(arpstat);
94
95#ifdef VIMAGE
96VNET_PCPUSTAT_SYSUNINIT(arpstat);
97#endif /* VIMAGE */
98
99static VNET_DEFINE(int, arp_maxhold) = 1;
100
101#define	V_arpt_keep		VNET(arpt_keep)
102#define	V_arpt_down		VNET(arpt_down)
103#define	V_arp_maxtries		VNET(arp_maxtries)
104#define	V_arp_proxyall		VNET(arp_proxyall)
105#define	V_arp_maxhold		VNET(arp_maxhold)
106
107SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW,
108	&VNET_NAME(arpt_keep), 0,
109	"ARP entry lifetime in seconds");
110SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW,
111	&VNET_NAME(arp_maxtries), 0,
112	"ARP resolution attempts before returning error");
113SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, useloopback, CTLFLAG_RW,
114	&VNET_NAME(useloopback), 0,
115	"Use the loopback interface for local traffic");
116SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_RW,
117	&VNET_NAME(arp_proxyall), 0,
118	"Enable proxy ARP for all suitable requests");
119SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, wait, CTLFLAG_RW,
120	&VNET_NAME(arpt_down), 0,
121	"Incomplete ARP entry lifetime in seconds");
122SYSCTL_VNET_PCPUSTAT(_net_link_ether_arp, OID_AUTO, stats, struct arpstat,
123    arpstat, "ARP statistics (struct arpstat, net/if_arp.h)");
124SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_RW,
125	&VNET_NAME(arp_maxhold), 0,
126	"Number of packets to hold per ARP entry");
127
128/*
129 * Due to the exponential backoff algorithm used for the interval between GARP
130 * retransmissions, the maximum number of retransmissions is limited for
131 * sanity. This limit corresponds to a maximum interval between retransmissions
132 * of 2^16 seconds ~= 18 hours.
133 *
134 * Making this limit more dynamic is more complicated than worthwhile,
135 * especially since sending out GARPs spaced days apart would be of little
136 * use. A maximum dynamic limit would look something like:
137 *
138 * const int max = fls(INT_MAX / hz) - 1;
139 */
140#define MAX_GARP_RETRANSMITS 16
141static int sysctl_garp_rexmit(SYSCTL_HANDLER_ARGS);
142static int garp_rexmit_count = 0; /* GARP retransmission setting. */
143
144SYSCTL_PROC(_net_link_ether_inet, OID_AUTO, garp_rexmit_count,
145    CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE,
146    &garp_rexmit_count, 0, sysctl_garp_rexmit, "I",
147    "Number of times to retransmit GARP packets;"
148    " 0 to disable, maximum of 16");
149
150static void	arp_init(void);
151static void	arpintr(struct mbuf *);
152static void	arptimer(void *);
153#ifdef INET
154static void	in_arpinput(struct mbuf *);
155#endif
156
157static const struct netisr_handler arp_nh = {
158	.nh_name = "arp",
159	.nh_handler = arpintr,
160	.nh_proto = NETISR_ARP,
161	.nh_policy = NETISR_POLICY_SOURCE,
162};
163
164#ifdef AF_INET
165/*
166 * called by in_ifscrub to remove entry from the table when
167 * the interface goes away
168 */
169void
170arp_ifscrub(struct ifnet *ifp, uint32_t addr)
171{
172	struct sockaddr_in addr4;
173
174	bzero((void *)&addr4, sizeof(addr4));
175	addr4.sin_len    = sizeof(addr4);
176	addr4.sin_family = AF_INET;
177	addr4.sin_addr.s_addr = addr;
178	IF_AFDATA_WLOCK(ifp);
179	lla_lookup(LLTABLE(ifp), (LLE_DELETE | LLE_IFADDR),
180	    (struct sockaddr *)&addr4);
181	IF_AFDATA_WUNLOCK(ifp);
182}
183#endif
184
185/*
186 * Timeout routine.  Age arp_tab entries periodically.
187 */
188static void
189arptimer(void *arg)
190{
191	struct llentry *lle = (struct llentry *)arg;
192	struct ifnet *ifp;
193
194	if (lle->la_flags & LLE_STATIC) {
195		return;
196	}
197	LLE_WLOCK(lle);
198	if (callout_pending(&lle->la_timer)) {
199		/*
200		 * Here we are a bit odd here in the treatment of
201		 * active/pending. If the pending bit is set, it got
202		 * rescheduled before I ran. The active
203		 * bit we ignore, since if it was stopped
204		 * in ll_tablefree() and was currently running
205		 * it would have return 0 so the code would
206		 * not have deleted it since the callout could
207		 * not be stopped so we want to go through
208		 * with the delete here now. If the callout
209		 * was restarted, the pending bit will be back on and
210		 * we just want to bail since the callout_reset would
211		 * return 1 and our reference would have been removed
212		 * by arpresolve() below.
213		 */
214		LLE_WUNLOCK(lle);
215 		return;
216 	}
217	ifp = lle->lle_tbl->llt_ifp;
218	CURVNET_SET(ifp->if_vnet);
219
220	if ((lle->la_flags & LLE_DELETED) == 0) {
221		int evt;
222
223		if (lle->la_flags & LLE_VALID)
224			evt = LLENTRY_EXPIRED;
225		else
226			evt = LLENTRY_TIMEDOUT;
227		EVENTHANDLER_INVOKE(lle_event, lle, evt);
228	}
229
230	callout_stop(&lle->la_timer);
231
232	/* XXX: LOR avoidance. We still have ref on lle. */
233	LLE_WUNLOCK(lle);
234	IF_AFDATA_LOCK(ifp);
235	LLE_WLOCK(lle);
236
237	/* Guard against race with other llentry_free(). */
238	if (lle->la_flags & LLE_LINKED) {
239		size_t pkts_dropped;
240
241		LLE_REMREF(lle);
242		pkts_dropped = llentry_free(lle);
243		ARPSTAT_ADD(dropped, pkts_dropped);
244	} else
245		LLE_FREE_LOCKED(lle);
246
247	IF_AFDATA_UNLOCK(ifp);
248
249	ARPSTAT_INC(timeouts);
250
251	CURVNET_RESTORE();
252}
253
254/*
255 * Broadcast an ARP request. Caller specifies:
256 *	- arp header source ip address
257 *	- arp header target ip address
258 *	- arp header source ethernet address
259 */
260void
261arprequest(struct ifnet *ifp, const struct in_addr *sip,
262    const struct in_addr *tip, u_char *enaddr)
263{
264	struct mbuf *m;
265	struct arphdr *ah;
266	struct sockaddr sa;
267	u_char *carpaddr = NULL;
268
269	if (sip == NULL) {
270		/*
271		 * The caller did not supply a source address, try to find
272		 * a compatible one among those assigned to this interface.
273		 */
274		struct ifaddr *ifa;
275
276		IF_ADDR_RLOCK(ifp);
277		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
278			if (ifa->ifa_addr->sa_family != AF_INET)
279				continue;
280
281			if (ifa->ifa_carp) {
282				if ((*carp_iamatch_p)(ifa, &carpaddr) == 0)
283					continue;
284				sip = &IA_SIN(ifa)->sin_addr;
285			} else {
286				carpaddr = NULL;
287				sip = &IA_SIN(ifa)->sin_addr;
288			}
289
290			if (0 == ((sip->s_addr ^ tip->s_addr) &
291			    IA_MASKSIN(ifa)->sin_addr.s_addr))
292				break;  /* found it. */
293		}
294		IF_ADDR_RUNLOCK(ifp);
295		if (sip == NULL) {
296			printf("%s: cannot find matching address\n", __func__);
297			return;
298		}
299	}
300	if (enaddr == NULL)
301		enaddr = carpaddr ? carpaddr : (u_char *)IF_LLADDR(ifp);
302
303	if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
304		return;
305	m->m_len = sizeof(*ah) + 2*sizeof(struct in_addr) +
306		2*ifp->if_data.ifi_addrlen;
307	m->m_pkthdr.len = m->m_len;
308	MH_ALIGN(m, m->m_len);
309	ah = mtod(m, struct arphdr *);
310	bzero((caddr_t)ah, m->m_len);
311#ifdef MAC
312	mac_netinet_arp_send(ifp, m);
313#endif
314	ah->ar_pro = htons(ETHERTYPE_IP);
315	ah->ar_hln = ifp->if_addrlen;		/* hardware address length */
316	ah->ar_pln = sizeof(struct in_addr);	/* protocol address length */
317	ah->ar_op = htons(ARPOP_REQUEST);
318	bcopy(enaddr, ar_sha(ah), ah->ar_hln);
319	bcopy(sip, ar_spa(ah), ah->ar_pln);
320	bcopy(tip, ar_tpa(ah), ah->ar_pln);
321	sa.sa_family = AF_ARP;
322	sa.sa_len = 2;
323	m->m_flags |= M_BCAST;
324	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
325	(*ifp->if_output)(ifp, m, &sa, NULL);
326	ARPSTAT_INC(txrequests);
327}
328
329/*
330 * Resolve an IP address into an ethernet address.
331 * On input:
332 *    ifp is the interface we use
333 *    rt0 is the route to the final destination (possibly useless)
334 *    m is the mbuf. May be NULL if we don't have a packet.
335 *    dst is the next hop,
336 *    desten is where we want the address.
337 *
338 * On success, desten is filled in and the function returns 0;
339 * If the packet must be held pending resolution, we return EWOULDBLOCK
340 * On other errors, we return the corresponding error code.
341 * Note that m_freem() handles NULL.
342 */
343int
344arpresolve(struct ifnet *ifp, struct rtentry *rt0, struct mbuf *m,
345	const struct sockaddr *dst, u_char *desten, struct llentry **lle)
346{
347	struct llentry *la = 0;
348	u_int flags = 0;
349	struct mbuf *curr = NULL;
350	struct mbuf *next = NULL;
351	int error, renew;
352
353	*lle = NULL;
354	if (m != NULL) {
355		if (m->m_flags & M_BCAST) {
356			/* broadcast */
357			(void)memcpy(desten,
358			    ifp->if_broadcastaddr, ifp->if_addrlen);
359			return (0);
360		}
361		if (m->m_flags & M_MCAST && ifp->if_type != IFT_ARCNET) {
362			/* multicast */
363			ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten);
364			return (0);
365		}
366	}
367retry:
368	IF_AFDATA_RLOCK(ifp);
369	la = lla_lookup(LLTABLE(ifp), flags, dst);
370	IF_AFDATA_RUNLOCK(ifp);
371	if ((la == NULL) && ((flags & LLE_EXCLUSIVE) == 0)
372	    && ((ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0)) {
373		flags |= (LLE_CREATE | LLE_EXCLUSIVE);
374		IF_AFDATA_WLOCK(ifp);
375		la = lla_lookup(LLTABLE(ifp), flags, dst);
376		IF_AFDATA_WUNLOCK(ifp);
377	}
378	if (la == NULL) {
379		if (flags & LLE_CREATE)
380			log(LOG_DEBUG,
381			    "arpresolve: can't allocate llinfo for %s on %s\n",
382			    inet_ntoa(SIN(dst)->sin_addr), ifp->if_xname);
383		m_freem(m);
384		return (EINVAL);
385	}
386
387	if ((la->la_flags & LLE_VALID) &&
388	    ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) {
389		bcopy(&la->ll_addr, desten, ifp->if_addrlen);
390		renew = 0;
391		/*
392		 * If entry has an expiry time and it is approaching,
393		 * see if we need to send an ARP request within this
394		 * arpt_down interval.
395		 */
396		if (!(la->la_flags & LLE_STATIC) &&
397		    time_uptime + la->la_preempt > la->la_expire) {
398			renew = 1;
399			la->la_preempt--;
400		}
401
402		*lle = la;
403
404		if (flags & LLE_EXCLUSIVE)
405			LLE_WUNLOCK(la);
406		else
407			LLE_RUNLOCK(la);
408
409		if (renew == 1)
410			arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL);
411
412		return (0);
413	}
414
415	if (la->la_flags & LLE_STATIC) {   /* should not happen! */
416		log(LOG_DEBUG, "arpresolve: ouch, empty static llinfo for %s\n",
417		    inet_ntoa(SIN(dst)->sin_addr));
418		m_freem(m);
419		error = EINVAL;
420		goto done;
421	}
422
423	renew = (la->la_asked == 0 || la->la_expire != time_uptime);
424	if ((renew || m != NULL) && (flags & LLE_EXCLUSIVE) == 0) {
425		flags |= LLE_EXCLUSIVE;
426		LLE_RUNLOCK(la);
427		goto retry;
428	}
429	/*
430	 * There is an arptab entry, but no ethernet address
431	 * response yet.  Add the mbuf to the list, dropping
432	 * the oldest packet if we have exceeded the system
433	 * setting.
434	 */
435	if (m != NULL) {
436		if (la->la_numheld >= V_arp_maxhold) {
437			if (la->la_hold != NULL) {
438				next = la->la_hold->m_nextpkt;
439				m_freem(la->la_hold);
440				la->la_hold = next;
441				la->la_numheld--;
442				ARPSTAT_INC(dropped);
443			}
444		}
445		if (la->la_hold != NULL) {
446			curr = la->la_hold;
447			while (curr->m_nextpkt != NULL)
448				curr = curr->m_nextpkt;
449			curr->m_nextpkt = m;
450		} else
451			la->la_hold = m;
452		la->la_numheld++;
453		if (renew == 0 && (flags & LLE_EXCLUSIVE)) {
454			flags &= ~LLE_EXCLUSIVE;
455			LLE_DOWNGRADE(la);
456		}
457
458	}
459	/*
460	 * Return EWOULDBLOCK if we have tried less than arp_maxtries. It
461	 * will be masked by ether_output(). Return EHOSTDOWN/EHOSTUNREACH
462	 * if we have already sent arp_maxtries ARP requests. Retransmit the
463	 * ARP request, but not faster than one request per second.
464	 */
465	if (la->la_asked < V_arp_maxtries)
466		error = EWOULDBLOCK;	/* First request. */
467	else
468		error = rt0 != NULL && (rt0->rt_flags & RTF_GATEWAY) ?
469		    EHOSTUNREACH : EHOSTDOWN;
470
471	if (renew) {
472		int canceled;
473
474		LLE_ADDREF(la);
475		la->la_expire = time_uptime;
476		canceled = callout_reset(&la->la_timer, hz * V_arpt_down,
477		    arptimer, la);
478		if (canceled)
479			LLE_REMREF(la);
480		la->la_asked++;
481		LLE_WUNLOCK(la);
482		arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL);
483		return (error);
484	}
485done:
486	if (flags & LLE_EXCLUSIVE)
487		LLE_WUNLOCK(la);
488	else
489		LLE_RUNLOCK(la);
490	return (error);
491}
492
493/*
494 * Common length and type checks are done here,
495 * then the protocol-specific routine is called.
496 */
497static void
498arpintr(struct mbuf *m)
499{
500	struct arphdr *ar;
501
502	if (m->m_len < sizeof(struct arphdr) &&
503	    ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) {
504		log(LOG_NOTICE, "arp: runt packet -- m_pullup failed\n");
505		return;
506	}
507	ar = mtod(m, struct arphdr *);
508
509	if (ntohs(ar->ar_hrd) != ARPHRD_ETHER &&
510	    ntohs(ar->ar_hrd) != ARPHRD_IEEE802 &&
511	    ntohs(ar->ar_hrd) != ARPHRD_ARCNET &&
512	    ntohs(ar->ar_hrd) != ARPHRD_IEEE1394 &&
513	    ntohs(ar->ar_hrd) != ARPHRD_INFINIBAND) {
514		log(LOG_NOTICE, "arp: unknown hardware address format (0x%2D)"
515		    " (from %*D to %*D)\n", (unsigned char *)&ar->ar_hrd, "",
516		    ETHER_ADDR_LEN, (u_char *)ar_sha(ar), ":",
517		    ETHER_ADDR_LEN, (u_char *)ar_tha(ar), ":");
518		m_freem(m);
519		return;
520	}
521
522	if (m->m_len < arphdr_len(ar)) {
523		if ((m = m_pullup(m, arphdr_len(ar))) == NULL) {
524			log(LOG_NOTICE, "arp: runt packet\n");
525			m_freem(m);
526			return;
527		}
528		ar = mtod(m, struct arphdr *);
529	}
530
531	ARPSTAT_INC(received);
532	switch (ntohs(ar->ar_pro)) {
533#ifdef INET
534	case ETHERTYPE_IP:
535		in_arpinput(m);
536		return;
537#endif
538	}
539	m_freem(m);
540}
541
542#ifdef INET
543/*
544 * ARP for Internet protocols on 10 Mb/s Ethernet.
545 * Algorithm is that given in RFC 826.
546 * In addition, a sanity check is performed on the sender
547 * protocol address, to catch impersonators.
548 * We no longer handle negotiations for use of trailer protocol:
549 * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent
550 * along with IP replies if we wanted trailers sent to us,
551 * and also sent them in response to IP replies.
552 * This allowed either end to announce the desire to receive
553 * trailer packets.
554 * We no longer reply to requests for ETHERTYPE_TRAIL protocol either,
555 * but formerly didn't normally send requests.
556 */
557static int log_arp_wrong_iface = 1;
558static int log_arp_movements = 1;
559static int log_arp_permanent_modify = 1;
560static int allow_multicast = 0;
561static struct timeval arp_lastlog;
562static int arp_curpps;
563static int arp_maxpps = 1;
564
565SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW,
566	&log_arp_wrong_iface, 0,
567	"log arp packets arriving on the wrong interface");
568SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW,
569	&log_arp_movements, 0,
570	"log arp replies from MACs different than the one in the cache");
571SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_permanent_modify, CTLFLAG_RW,
572	&log_arp_permanent_modify, 0,
573	"log arp replies from MACs different than the one in the permanent arp entry");
574SYSCTL_INT(_net_link_ether_inet, OID_AUTO, allow_multicast, CTLFLAG_RW,
575	&allow_multicast, 0, "accept multicast addresses");
576SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_log_per_second,
577	CTLFLAG_RW, &arp_maxpps, 0,
578	"Maximum number of remotely triggered ARP messages that can be "
579	"logged per second");
580
581#define	ARP_LOG(pri, ...)	do {					\
582	if (ppsratecheck(&arp_lastlog, &arp_curpps, arp_maxpps))	\
583		log((pri), "arp: " __VA_ARGS__);			\
584} while (0)
585
586static void
587in_arpinput(struct mbuf *m)
588{
589	struct arphdr *ah;
590	struct ifnet *ifp = m->m_pkthdr.rcvif;
591	struct llentry *la = NULL;
592	struct rtentry *rt;
593	struct ifaddr *ifa;
594	struct in_ifaddr *ia;
595	struct sockaddr sa;
596	struct in_addr isaddr, itaddr, myaddr;
597	u_int8_t *enaddr = NULL;
598	int op, flags;
599	int req_len;
600	int bridged = 0, is_bridge = 0;
601	int carped;
602	struct sockaddr_in sin;
603	sin.sin_len = sizeof(struct sockaddr_in);
604	sin.sin_family = AF_INET;
605	sin.sin_addr.s_addr = 0;
606
607	if (ifp->if_bridge)
608		bridged = 1;
609	if (ifp->if_type == IFT_BRIDGE)
610		is_bridge = 1;
611
612	req_len = arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr));
613	if (m->m_len < req_len && (m = m_pullup(m, req_len)) == NULL) {
614		ARP_LOG(LOG_NOTICE, "runt packet -- m_pullup failed\n");
615		return;
616	}
617
618	ah = mtod(m, struct arphdr *);
619	/*
620	 * ARP is only for IPv4 so we can reject packets with
621	 * a protocol length not equal to an IPv4 address.
622	 */
623	if (ah->ar_pln != sizeof(struct in_addr)) {
624		ARP_LOG(LOG_NOTICE, "requested protocol length != %zu\n",
625		    sizeof(struct in_addr));
626		goto drop;
627	}
628
629	if (allow_multicast == 0 && ETHER_IS_MULTICAST(ar_sha(ah))) {
630		ARP_LOG(LOG_NOTICE, "%*D is multicast\n",
631		    ifp->if_addrlen, (u_char *)ar_sha(ah), ":");
632		goto drop;
633	}
634
635	op = ntohs(ah->ar_op);
636	(void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr));
637	(void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr));
638
639	if (op == ARPOP_REPLY)
640		ARPSTAT_INC(rxreplies);
641
642	/*
643	 * For a bridge, we want to check the address irrespective
644	 * of the receive interface. (This will change slightly
645	 * when we have clusters of interfaces).
646	 */
647	IN_IFADDR_RLOCK();
648	LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
649		if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
650		    ia->ia_ifp == ifp) &&
651		    itaddr.s_addr == ia->ia_addr.sin_addr.s_addr &&
652		    (ia->ia_ifa.ifa_carp == NULL ||
653		    (*carp_iamatch_p)(&ia->ia_ifa, &enaddr))) {
654			ifa_ref(&ia->ia_ifa);
655			IN_IFADDR_RUNLOCK();
656			goto match;
657		}
658	}
659	LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash)
660		if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
661		    ia->ia_ifp == ifp) &&
662		    isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) {
663			ifa_ref(&ia->ia_ifa);
664			IN_IFADDR_RUNLOCK();
665			goto match;
666		}
667
668#define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia)				\
669  (ia->ia_ifp->if_bridge == ifp->if_softc &&				\
670  !bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) &&	\
671  addr == ia->ia_addr.sin_addr.s_addr)
672	/*
673	 * Check the case when bridge shares its MAC address with
674	 * some of its children, so packets are claimed by bridge
675	 * itself (bridge_input() does it first), but they are really
676	 * meant to be destined to the bridge member.
677	 */
678	if (is_bridge) {
679		LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
680			if (BDG_MEMBER_MATCHES_ARP(itaddr.s_addr, ifp, ia)) {
681				ifa_ref(&ia->ia_ifa);
682				ifp = ia->ia_ifp;
683				IN_IFADDR_RUNLOCK();
684				goto match;
685			}
686		}
687	}
688#undef BDG_MEMBER_MATCHES_ARP
689	IN_IFADDR_RUNLOCK();
690
691	/*
692	 * No match, use the first inet address on the receive interface
693	 * as a dummy address for the rest of the function.
694	 */
695	IF_ADDR_RLOCK(ifp);
696	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
697		if (ifa->ifa_addr->sa_family == AF_INET &&
698		    (ifa->ifa_carp == NULL ||
699		    (*carp_iamatch_p)(ifa, &enaddr))) {
700			ia = ifatoia(ifa);
701			ifa_ref(ifa);
702			IF_ADDR_RUNLOCK(ifp);
703			goto match;
704		}
705	IF_ADDR_RUNLOCK(ifp);
706
707	/*
708	 * If bridging, fall back to using any inet address.
709	 */
710	IN_IFADDR_RLOCK();
711	if (!bridged || (ia = TAILQ_FIRST(&V_in_ifaddrhead)) == NULL) {
712		IN_IFADDR_RUNLOCK();
713		goto drop;
714	}
715	ifa_ref(&ia->ia_ifa);
716	IN_IFADDR_RUNLOCK();
717match:
718	if (!enaddr)
719		enaddr = (u_int8_t *)IF_LLADDR(ifp);
720	carped = (ia->ia_ifa.ifa_carp != NULL);
721	myaddr = ia->ia_addr.sin_addr;
722	ifa_free(&ia->ia_ifa);
723	if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen))
724		goto drop;	/* it's from me, ignore it. */
725	if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) {
726		ARP_LOG(LOG_NOTICE, "link address is broadcast for IP address "
727		    "%s!\n", inet_ntoa(isaddr));
728		goto drop;
729	}
730	/*
731	 * Warn if another host is using the same IP address, but only if the
732	 * IP address isn't 0.0.0.0, which is used for DHCP only, in which
733	 * case we suppress the warning to avoid false positive complaints of
734	 * potential misconfiguration.
735	 */
736	if (!bridged && !carped && isaddr.s_addr == myaddr.s_addr &&
737	    myaddr.s_addr != 0) {
738		ARP_LOG(LOG_ERR, "%*D is using my IP address %s on %s!\n",
739		   ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
740		   inet_ntoa(isaddr), ifp->if_xname);
741		itaddr = myaddr;
742		ARPSTAT_INC(dupips);
743		goto reply;
744	}
745	if (ifp->if_flags & IFF_STATICARP)
746		goto reply;
747
748	bzero(&sin, sizeof(sin));
749	sin.sin_len = sizeof(struct sockaddr_in);
750	sin.sin_family = AF_INET;
751	sin.sin_addr = isaddr;
752	flags = (itaddr.s_addr == myaddr.s_addr) ? LLE_CREATE : 0;
753	flags |= LLE_EXCLUSIVE;
754	IF_AFDATA_LOCK(ifp);
755	la = lla_lookup(LLTABLE(ifp), flags, (struct sockaddr *)&sin);
756	IF_AFDATA_UNLOCK(ifp);
757	if (la != NULL) {
758		/* the following is not an error when doing bridging */
759		if (!bridged && la->lle_tbl->llt_ifp != ifp) {
760			if (log_arp_wrong_iface)
761				ARP_LOG(LOG_WARNING, "%s is on %s "
762				    "but got reply from %*D on %s\n",
763				    inet_ntoa(isaddr),
764				    la->lle_tbl->llt_ifp->if_xname,
765				    ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
766				    ifp->if_xname);
767			LLE_WUNLOCK(la);
768			goto reply;
769		}
770		if ((la->la_flags & LLE_VALID) &&
771		    bcmp(ar_sha(ah), &la->ll_addr, ifp->if_addrlen)) {
772			if (la->la_flags & LLE_STATIC) {
773				LLE_WUNLOCK(la);
774				if (log_arp_permanent_modify)
775					ARP_LOG(LOG_ERR,
776					    "%*D attempts to modify "
777					    "permanent entry for %s on %s\n",
778					    ifp->if_addrlen,
779					    (u_char *)ar_sha(ah), ":",
780					    inet_ntoa(isaddr), ifp->if_xname);
781				goto reply;
782			}
783			if (log_arp_movements) {
784				ARP_LOG(LOG_INFO, "%s moved from %*D "
785				    "to %*D on %s\n",
786				    inet_ntoa(isaddr),
787				    ifp->if_addrlen,
788				    (u_char *)&la->ll_addr, ":",
789				    ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
790				    ifp->if_xname);
791			}
792		}
793
794		if (ifp->if_addrlen != ah->ar_hln) {
795			LLE_WUNLOCK(la);
796			ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, "
797			    "i/f %d (ignored)\n", ifp->if_addrlen,
798			    (u_char *) ar_sha(ah), ":", ah->ar_hln,
799			    ifp->if_addrlen);
800			goto drop;
801		}
802		(void)memcpy(&la->ll_addr, ar_sha(ah), ifp->if_addrlen);
803		la->la_flags |= LLE_VALID;
804
805		EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED);
806
807		if (!(la->la_flags & LLE_STATIC)) {
808			int canceled;
809
810			LLE_ADDREF(la);
811			la->la_expire = time_uptime + V_arpt_keep;
812			canceled = callout_reset(&la->la_timer,
813			    hz * V_arpt_keep, arptimer, la);
814			if (canceled)
815				LLE_REMREF(la);
816		}
817		la->la_asked = 0;
818		la->la_preempt = V_arp_maxtries;
819		/*
820		 * The packets are all freed within the call to the output
821		 * routine.
822		 *
823		 * NB: The lock MUST be released before the call to the
824		 * output routine.
825		 */
826		if (la->la_hold != NULL) {
827			struct mbuf *m_hold, *m_hold_next;
828
829			m_hold = la->la_hold;
830			la->la_hold = NULL;
831			la->la_numheld = 0;
832			memcpy(&sa, L3_ADDR(la), sizeof(sa));
833			LLE_WUNLOCK(la);
834			for (; m_hold != NULL; m_hold = m_hold_next) {
835				m_hold_next = m_hold->m_nextpkt;
836				m_hold->m_nextpkt = NULL;
837				/* Avoid confusing lower layers. */
838				m_clrprotoflags(m_hold);
839				(*ifp->if_output)(ifp, m_hold, &sa, NULL);
840			}
841		} else
842			LLE_WUNLOCK(la);
843	}
844reply:
845	if (op != ARPOP_REQUEST)
846		goto drop;
847	ARPSTAT_INC(rxrequests);
848
849	if (itaddr.s_addr == myaddr.s_addr) {
850		/* Shortcut.. the receiving interface is the target. */
851		(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
852		(void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
853	} else {
854		struct llentry *lle = NULL;
855
856		sin.sin_addr = itaddr;
857		IF_AFDATA_RLOCK(ifp);
858		lle = lla_lookup(LLTABLE(ifp), 0, (struct sockaddr *)&sin);
859		IF_AFDATA_RUNLOCK(ifp);
860
861		if ((lle != NULL) && (lle->la_flags & LLE_PUB)) {
862			(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
863			(void)memcpy(ar_sha(ah), &lle->ll_addr, ah->ar_hln);
864			LLE_RUNLOCK(lle);
865		} else {
866
867			if (lle != NULL)
868				LLE_RUNLOCK(lle);
869
870			if (!V_arp_proxyall)
871				goto drop;
872
873			sin.sin_addr = itaddr;
874			/* XXX MRT use table 0 for arp reply  */
875			rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0);
876			if (!rt)
877				goto drop;
878
879			/*
880			 * Don't send proxies for nodes on the same interface
881			 * as this one came out of, or we'll get into a fight
882			 * over who claims what Ether address.
883			 */
884			if (!rt->rt_ifp || rt->rt_ifp == ifp) {
885				RTFREE_LOCKED(rt);
886				goto drop;
887			}
888			RTFREE_LOCKED(rt);
889
890			(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
891			(void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
892
893			/*
894			 * Also check that the node which sent the ARP packet
895			 * is on the interface we expect it to be on. This
896			 * avoids ARP chaos if an interface is connected to the
897			 * wrong network.
898			 */
899			sin.sin_addr = isaddr;
900
901			/* XXX MRT use table 0 for arp checks */
902			rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0);
903			if (!rt)
904				goto drop;
905			if (rt->rt_ifp != ifp) {
906				ARP_LOG(LOG_INFO, "proxy: ignoring request"
907				    " from %s via %s, expecting %s\n",
908				    inet_ntoa(isaddr), ifp->if_xname,
909				    rt->rt_ifp->if_xname);
910				RTFREE_LOCKED(rt);
911				goto drop;
912			}
913			RTFREE_LOCKED(rt);
914
915#ifdef DEBUG_PROXY
916			printf("arp: proxying for %s\n", inet_ntoa(itaddr));
917#endif
918		}
919	}
920
921	if (itaddr.s_addr == myaddr.s_addr &&
922	    IN_LINKLOCAL(ntohl(itaddr.s_addr))) {
923		/* RFC 3927 link-local IPv4; always reply by broadcast. */
924#ifdef DEBUG_LINKLOCAL
925		printf("arp: sending reply for link-local addr %s\n",
926		    inet_ntoa(itaddr));
927#endif
928		m->m_flags |= M_BCAST;
929		m->m_flags &= ~M_MCAST;
930	} else {
931		/* default behaviour; never reply by broadcast. */
932		m->m_flags &= ~(M_BCAST|M_MCAST);
933	}
934	(void)memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln);
935	(void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln);
936	ah->ar_op = htons(ARPOP_REPLY);
937	ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */
938	m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln);
939	m->m_pkthdr.len = m->m_len;
940	m->m_pkthdr.rcvif = NULL;
941	sa.sa_family = AF_ARP;
942	sa.sa_len = 2;
943	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
944	(*ifp->if_output)(ifp, m, &sa, NULL);
945	ARPSTAT_INC(txreplies);
946	return;
947
948drop:
949	m_freem(m);
950}
951#endif
952
953/*
954 * Handle the garp_rexmit_count. Like sysctl_handle_int(), but limits the range
955 * of valid values.
956 */
957static int
958sysctl_garp_rexmit(SYSCTL_HANDLER_ARGS)
959{
960	int error;
961	int rexmit_count = *(int *)arg1;
962
963	error = sysctl_handle_int(oidp, &rexmit_count, 0, req);
964
965	/* Enforce limits on any new value that may have been set. */
966	if (!error && req->newptr) {
967		/* A new value was set. */
968		if (rexmit_count < 0) {
969			rexmit_count = 0;
970		} else if (rexmit_count > MAX_GARP_RETRANSMITS) {
971			rexmit_count = MAX_GARP_RETRANSMITS;
972		}
973		*(int *)arg1 = rexmit_count;
974	}
975
976	return (error);
977}
978
979/*
980 * Retransmit a Gratuitous ARP (GARP) and, if necessary, schedule a callout to
981 * retransmit it again. A pending callout owns a reference to the ifa.
982 */
983static void
984garp_rexmit(void *arg)
985{
986	struct in_ifaddr *ia = arg;
987
988	if (callout_pending(&ia->ia_garp_timer) ||
989	    !callout_active(&ia->ia_garp_timer)) {
990		IFA_UNLOCK(&ia->ia_ifa);
991		ifa_free(&ia->ia_ifa);
992		return;
993	}
994
995	/*
996	 * Drop ifa lock while the ARP request is generated.
997	 */
998	IFA_UNLOCK(&ia->ia_ifa);
999
1000	arprequest(ia->ia_ifa.ifa_ifp, &IA_SIN(ia)->sin_addr,
1001	    &IA_SIN(ia)->sin_addr, IF_LLADDR(ia->ia_ifa.ifa_ifp));
1002
1003	/*
1004	 * Increment the count of retransmissions. If the count has reached the
1005	 * maximum value, stop sending the GARP packets. Otherwise, schedule
1006	 * the callout to retransmit another GARP packet.
1007	 */
1008	++ia->ia_garp_count;
1009	if (ia->ia_garp_count >= garp_rexmit_count) {
1010		ifa_free(&ia->ia_ifa);
1011	} else {
1012		int rescheduled;
1013		IFA_LOCK(&ia->ia_ifa);
1014		rescheduled = callout_reset(&ia->ia_garp_timer,
1015		    (1 << ia->ia_garp_count) * hz,
1016		    garp_rexmit, ia);
1017		IFA_UNLOCK(&ia->ia_ifa);
1018		if (rescheduled) {
1019			ifa_free(&ia->ia_ifa);
1020		}
1021	}
1022}
1023
1024/*
1025 * Start the GARP retransmit timer.
1026 *
1027 * A single GARP is always transmitted when an IPv4 address is added
1028 * to an interface and that is usually sufficient. However, in some
1029 * circumstances, such as when a shared address is passed between
1030 * cluster nodes, this single GARP may occasionally be dropped or
1031 * lost. This can lead to neighbors on the network link working with a
1032 * stale ARP cache and sending packets destined for that address to
1033 * the node that previously owned the address, which may not respond.
1034 *
1035 * To avoid this situation, GARP retransmits can be enabled by setting
1036 * the net.link.ether.inet.garp_rexmit_count sysctl to a value greater
1037 * than zero. The setting represents the maximum number of
1038 * retransmissions. The interval between retransmissions is calculated
1039 * using an exponential backoff algorithm, doubling each time, so the
1040 * retransmission intervals are: {1, 2, 4, 8, 16, ...} (seconds).
1041 */
1042static void
1043garp_timer_start(struct ifaddr *ifa)
1044{
1045	struct in_ifaddr *ia = (struct in_ifaddr *) ifa;
1046
1047	IFA_LOCK(ifa);
1048	ia->ia_garp_count = 0;
1049	if (callout_reset(&ia->ia_garp_timer, (1 << ia->ia_garp_count) * hz,
1050	    garp_rexmit, ia) == 0) {
1051		ifa_ref(ifa);
1052	}
1053	IFA_UNLOCK(ifa);
1054}
1055
1056void
1057arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa)
1058{
1059	struct llentry *lle;
1060
1061	if (ifa->ifa_carp != NULL)
1062		return;
1063
1064	if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) {
1065		arprequest(ifp, &IA_SIN(ifa)->sin_addr,
1066				&IA_SIN(ifa)->sin_addr, IF_LLADDR(ifp));
1067		if (garp_rexmit_count > 0) {
1068			garp_timer_start(ifa);
1069		}
1070
1071		/*
1072		 * interface address is considered static entry
1073		 * because the output of the arp utility shows
1074		 * that L2 entry as permanent
1075		 */
1076		IF_AFDATA_LOCK(ifp);
1077		lle = lla_lookup(LLTABLE(ifp), (LLE_CREATE | LLE_IFADDR | LLE_STATIC),
1078				 (struct sockaddr *)IA_SIN(ifa));
1079		IF_AFDATA_UNLOCK(ifp);
1080		if (lle == NULL)
1081			log(LOG_INFO, "arp_ifinit: cannot create arp "
1082			    "entry for interface address\n");
1083		else
1084			LLE_RUNLOCK(lle);
1085	}
1086	ifa->ifa_rtrequest = NULL;
1087}
1088
1089void
1090arp_ifinit2(struct ifnet *ifp, struct ifaddr *ifa, u_char *enaddr)
1091{
1092	if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY)
1093		arprequest(ifp, &IA_SIN(ifa)->sin_addr,
1094				&IA_SIN(ifa)->sin_addr, enaddr);
1095	ifa->ifa_rtrequest = NULL;
1096}
1097
1098static void
1099arp_init(void)
1100{
1101
1102	netisr_register(&arp_nh);
1103}
1104SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, arp_init, 0);
1105