1/*-
2 * Copyright (c) 2007-2009 Bruce Simpson.
3 * Copyright (c) 1988 Stephen Deering.
4 * Copyright (c) 1992, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Stephen Deering of Stanford University.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)igmp.c	8.1 (Berkeley) 7/19/93
35 */
36
37/*
38 * Internet Group Management Protocol (IGMP) routines.
39 * [RFC1112, RFC2236, RFC3376]
40 *
41 * Written by Steve Deering, Stanford, May 1988.
42 * Modified by Rosen Sharma, Stanford, Aug 1994.
43 * Modified by Bill Fenner, Xerox PARC, Feb 1995.
44 * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995.
45 * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson.
46 *
47 * MULTICAST Revision: 3.5.1.4
48 */
49
50#include <sys/cdefs.h>
51__FBSDID("$FreeBSD$");
52
53#include <sys/param.h>
54#include <sys/systm.h>
55#include <sys/module.h>
56#include <sys/malloc.h>
57#include <sys/mbuf.h>
58#include <sys/socket.h>
59#include <sys/protosw.h>
60#include <sys/kernel.h>
61#include <sys/sysctl.h>
62#include <sys/ktr.h>
63#include <sys/condvar.h>
64
65#include <net/if.h>
66#include <net/netisr.h>
67#include <net/vnet.h>
68
69#include <netinet/in.h>
70#include <netinet/in_var.h>
71#include <netinet/in_systm.h>
72#include <netinet/ip.h>
73#include <netinet/ip_var.h>
74#include <netinet/ip_options.h>
75#include <netinet/igmp.h>
76#include <netinet/igmp_var.h>
77
78#include <machine/in_cksum.h>
79
80#include <security/mac/mac_framework.h>
81
82#ifndef KTR_IGMPV3
83#define KTR_IGMPV3 KTR_INET
84#endif
85
86static struct igmp_ifinfo *
87		igi_alloc_locked(struct ifnet *);
88static void	igi_delete_locked(const struct ifnet *);
89static void	igmp_dispatch_queue(struct ifqueue *, int, const int);
90static void	igmp_fasttimo_vnet(void);
91static void	igmp_final_leave(struct in_multi *, struct igmp_ifinfo *);
92static int	igmp_handle_state_change(struct in_multi *,
93		    struct igmp_ifinfo *);
94static int	igmp_initial_join(struct in_multi *, struct igmp_ifinfo *);
95static int	igmp_input_v1_query(struct ifnet *, const struct ip *,
96		    const struct igmp *);
97static int	igmp_input_v2_query(struct ifnet *, const struct ip *,
98		    const struct igmp *);
99static int	igmp_input_v3_query(struct ifnet *, const struct ip *,
100		    /*const*/ struct igmpv3 *);
101static int	igmp_input_v3_group_query(struct in_multi *,
102		    struct igmp_ifinfo *, int, /*const*/ struct igmpv3 *);
103static int	igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *,
104		    /*const*/ struct igmp *);
105static int	igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *,
106		    /*const*/ struct igmp *);
107static void	igmp_intr(struct mbuf *);
108static int	igmp_isgroupreported(const struct in_addr);
109static struct mbuf *
110		igmp_ra_alloc(void);
111#ifdef KTR
112static char *	igmp_rec_type_to_str(const int);
113#endif
114static void	igmp_set_version(struct igmp_ifinfo *, const int);
115static void	igmp_slowtimo_vnet(void);
116static int	igmp_v1v2_queue_report(struct in_multi *, const int);
117static void	igmp_v1v2_process_group_timer(struct in_multi *, const int);
118static void	igmp_v1v2_process_querier_timers(struct igmp_ifinfo *);
119static void	igmp_v2_update_group(struct in_multi *, const int);
120static void	igmp_v3_cancel_link_timers(struct igmp_ifinfo *);
121static void	igmp_v3_dispatch_general_query(struct igmp_ifinfo *);
122static struct mbuf *
123		igmp_v3_encap_report(struct ifnet *, struct mbuf *);
124static int	igmp_v3_enqueue_group_record(struct ifqueue *,
125		    struct in_multi *, const int, const int, const int);
126static int	igmp_v3_enqueue_filter_change(struct ifqueue *,
127		    struct in_multi *);
128static void	igmp_v3_process_group_timers(struct igmp_ifinfo *,
129		    struct ifqueue *, struct ifqueue *, struct in_multi *,
130		    const int);
131static int	igmp_v3_merge_state_changes(struct in_multi *,
132		    struct ifqueue *);
133static void	igmp_v3_suppress_group_record(struct in_multi *);
134static int	sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS);
135static int	sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS);
136static int	sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS);
137
138static const struct netisr_handler igmp_nh = {
139	.nh_name = "igmp",
140	.nh_handler = igmp_intr,
141	.nh_proto = NETISR_IGMP,
142	.nh_policy = NETISR_POLICY_SOURCE,
143};
144
145/*
146 * System-wide globals.
147 *
148 * Unlocked access to these is OK, except for the global IGMP output
149 * queue. The IGMP subsystem lock ends up being system-wide for the moment,
150 * because all VIMAGEs have to share a global output queue, as netisrs
151 * themselves are not virtualized.
152 *
153 * Locking:
154 *  * The permitted lock order is: IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK.
155 *    Any may be taken independently; if any are held at the same
156 *    time, the above lock order must be followed.
157 *  * All output is delegated to the netisr.
158 *    Now that Giant has been eliminated, the netisr may be inlined.
159 *  * IN_MULTI_LOCK covers in_multi.
160 *  * IGMP_LOCK covers igmp_ifinfo and any global variables in this file,
161 *    including the output queue.
162 *  * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of
163 *    per-link state iterators.
164 *  * igmp_ifinfo is valid as long as PF_INET is attached to the interface,
165 *    therefore it is not refcounted.
166 *    We allow unlocked reads of igmp_ifinfo when accessed via in_multi.
167 *
168 * Reference counting
169 *  * IGMP acquires its own reference every time an in_multi is passed to
170 *    it and the group is being joined for the first time.
171 *  * IGMP releases its reference(s) on in_multi in a deferred way,
172 *    because the operations which process the release run as part of
173 *    a loop whose control variables are directly affected by the release
174 *    (that, and not recursing on the IF_ADDR_LOCK).
175 *
176 * VIMAGE: Each in_multi corresponds to an ifp, and each ifp corresponds
177 * to a vnet in ifp->if_vnet.
178 *
179 * SMPng: XXX We may potentially race operations on ifma_protospec.
180 * The problem is that we currently lack a clean way of taking the
181 * IF_ADDR_LOCK() between the ifnet and in layers w/o recursing,
182 * as anything which modifies ifma needs to be covered by that lock.
183 * So check for ifma_protospec being NULL before proceeding.
184 */
185struct mtx		 igmp_mtx;
186
187struct mbuf		*m_raopt;		 /* Router Alert option */
188static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state");
189
190/*
191 * VIMAGE-wide globals.
192 *
193 * The IGMPv3 timers themselves need to run per-image, however,
194 * protosw timers run globally (see tcp).
195 * An ifnet can only be in one vimage at a time, and the loopback
196 * ifnet, loif, is itself virtualized.
197 * It would otherwise be possible to seriously hose IGMP state,
198 * and create inconsistencies in upstream multicast routing, if you have
199 * multiple VIMAGEs running on the same link joining different multicast
200 * groups, UNLESS the "primary IP address" is different. This is because
201 * IGMP for IPv4 does not force link-local addresses to be used for each
202 * node, unlike MLD for IPv6.
203 * Obviously the IGMPv3 per-interface state has per-vimage granularity
204 * also as a result.
205 *
206 * FUTURE: Stop using IFP_TO_IA/INADDR_ANY, and use source address selection
207 * policy to control the address used by IGMP on the link.
208 */
209static VNET_DEFINE(int, interface_timers_running);	/* IGMPv3 general
210							 * query response */
211static VNET_DEFINE(int, state_change_timers_running);	/* IGMPv3 state-change
212							 * retransmit */
213static VNET_DEFINE(int, current_state_timers_running);	/* IGMPv1/v2 host
214							 * report; IGMPv3 g/sg
215							 * query response */
216
217#define	V_interface_timers_running	VNET(interface_timers_running)
218#define	V_state_change_timers_running	VNET(state_change_timers_running)
219#define	V_current_state_timers_running	VNET(current_state_timers_running)
220
221static VNET_DEFINE(LIST_HEAD(, igmp_ifinfo), igi_head);
222static VNET_DEFINE(struct igmpstat, igmpstat) = {
223	.igps_version = IGPS_VERSION_3,
224	.igps_len = sizeof(struct igmpstat),
225};
226static VNET_DEFINE(struct timeval, igmp_gsrdelay) = {10, 0};
227
228#define	V_igi_head			VNET(igi_head)
229#define	V_igmpstat			VNET(igmpstat)
230#define	V_igmp_gsrdelay			VNET(igmp_gsrdelay)
231
232static VNET_DEFINE(int, igmp_recvifkludge) = 1;
233static VNET_DEFINE(int, igmp_sendra) = 1;
234static VNET_DEFINE(int, igmp_sendlocal) = 1;
235static VNET_DEFINE(int, igmp_v1enable) = 1;
236static VNET_DEFINE(int, igmp_v2enable) = 1;
237static VNET_DEFINE(int, igmp_legacysupp);
238static VNET_DEFINE(int, igmp_default_version) = IGMP_VERSION_3;
239
240#define	V_igmp_recvifkludge		VNET(igmp_recvifkludge)
241#define	V_igmp_sendra			VNET(igmp_sendra)
242#define	V_igmp_sendlocal		VNET(igmp_sendlocal)
243#define	V_igmp_v1enable			VNET(igmp_v1enable)
244#define	V_igmp_v2enable			VNET(igmp_v2enable)
245#define	V_igmp_legacysupp		VNET(igmp_legacysupp)
246#define	V_igmp_default_version		VNET(igmp_default_version)
247
248/*
249 * Virtualized sysctls.
250 */
251SYSCTL_VNET_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RW,
252    &VNET_NAME(igmpstat), igmpstat, "");
253SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_RW,
254    &VNET_NAME(igmp_recvifkludge), 0,
255    "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address");
256SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_RW,
257    &VNET_NAME(igmp_sendra), 0,
258    "Send IP Router Alert option in IGMPv2/v3 messages");
259SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_RW,
260    &VNET_NAME(igmp_sendlocal), 0,
261    "Send IGMP membership reports for 224.0.0.0/24 groups");
262SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_RW,
263    &VNET_NAME(igmp_v1enable), 0,
264    "Enable backwards compatibility with IGMPv1");
265SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_RW,
266    &VNET_NAME(igmp_v2enable), 0,
267    "Enable backwards compatibility with IGMPv2");
268SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_RW,
269    &VNET_NAME(igmp_legacysupp), 0,
270    "Allow v1/v2 reports to suppress v3 group responses");
271SYSCTL_VNET_PROC(_net_inet_igmp, OID_AUTO, default_version,
272    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
273    &VNET_NAME(igmp_default_version), 0, sysctl_igmp_default_version, "I",
274    "Default version of IGMP to run on each interface");
275SYSCTL_VNET_PROC(_net_inet_igmp, OID_AUTO, gsrdelay,
276    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
277    &VNET_NAME(igmp_gsrdelay.tv_sec), 0, sysctl_igmp_gsr, "I",
278    "Rate limit for IGMPv3 Group-and-Source queries in seconds");
279
280/*
281 * Non-virtualized sysctls.
282 */
283static SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo,
284    CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_igmp_ifinfo,
285    "Per-interface IGMPv3 state");
286
287static __inline void
288igmp_save_context(struct mbuf *m, struct ifnet *ifp)
289{
290
291#ifdef VIMAGE
292	m->m_pkthdr.PH_loc.ptr = ifp->if_vnet;
293#endif /* VIMAGE */
294	m->m_pkthdr.flowid = ifp->if_index;
295}
296
297static __inline void
298igmp_scrub_context(struct mbuf *m)
299{
300
301	m->m_pkthdr.PH_loc.ptr = NULL;
302	m->m_pkthdr.flowid = 0;
303}
304
305#ifdef KTR
306static __inline char *
307inet_ntoa_haddr(in_addr_t haddr)
308{
309	struct in_addr ia;
310
311	ia.s_addr = htonl(haddr);
312	return (inet_ntoa(ia));
313}
314#endif
315
316/*
317 * Restore context from a queued IGMP output chain.
318 * Return saved ifindex.
319 *
320 * VIMAGE: The assertion is there to make sure that we
321 * actually called CURVNET_SET() with what's in the mbuf chain.
322 */
323static __inline uint32_t
324igmp_restore_context(struct mbuf *m)
325{
326
327#ifdef notyet
328#if defined(VIMAGE) && defined(INVARIANTS)
329	KASSERT(curvnet == (m->m_pkthdr.PH_loc.ptr),
330	    ("%s: called when curvnet was not restored", __func__));
331#endif
332#endif
333	return (m->m_pkthdr.flowid);
334}
335
336/*
337 * Retrieve or set default IGMP version.
338 *
339 * VIMAGE: Assume curvnet set by caller.
340 * SMPng: NOTE: Serialized by IGMP lock.
341 */
342static int
343sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS)
344{
345	int	 error;
346	int	 new;
347
348	error = sysctl_wire_old_buffer(req, sizeof(int));
349	if (error)
350		return (error);
351
352	IGMP_LOCK();
353
354	new = V_igmp_default_version;
355
356	error = sysctl_handle_int(oidp, &new, 0, req);
357	if (error || !req->newptr)
358		goto out_locked;
359
360	if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) {
361		error = EINVAL;
362		goto out_locked;
363	}
364
365	CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d",
366	     V_igmp_default_version, new);
367
368	V_igmp_default_version = new;
369
370out_locked:
371	IGMP_UNLOCK();
372	return (error);
373}
374
375/*
376 * Retrieve or set threshold between group-source queries in seconds.
377 *
378 * VIMAGE: Assume curvnet set by caller.
379 * SMPng: NOTE: Serialized by IGMP lock.
380 */
381static int
382sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS)
383{
384	int error;
385	int i;
386
387	error = sysctl_wire_old_buffer(req, sizeof(int));
388	if (error)
389		return (error);
390
391	IGMP_LOCK();
392
393	i = V_igmp_gsrdelay.tv_sec;
394
395	error = sysctl_handle_int(oidp, &i, 0, req);
396	if (error || !req->newptr)
397		goto out_locked;
398
399	if (i < -1 || i >= 60) {
400		error = EINVAL;
401		goto out_locked;
402	}
403
404	CTR2(KTR_IGMPV3, "change igmp_gsrdelay from %d to %d",
405	     V_igmp_gsrdelay.tv_sec, i);
406	V_igmp_gsrdelay.tv_sec = i;
407
408out_locked:
409	IGMP_UNLOCK();
410	return (error);
411}
412
413/*
414 * Expose struct igmp_ifinfo to userland, keyed by ifindex.
415 * For use by ifmcstat(8).
416 *
417 * SMPng: NOTE: Does an unlocked ifindex space read.
418 * VIMAGE: Assume curvnet set by caller. The node handler itself
419 * is not directly virtualized.
420 */
421static int
422sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS)
423{
424	int			*name;
425	int			 error;
426	u_int			 namelen;
427	struct ifnet		*ifp;
428	struct igmp_ifinfo	*igi;
429
430	name = (int *)arg1;
431	namelen = arg2;
432
433	if (req->newptr != NULL)
434		return (EPERM);
435
436	if (namelen != 1)
437		return (EINVAL);
438
439	error = sysctl_wire_old_buffer(req, sizeof(struct igmp_ifinfo));
440	if (error)
441		return (error);
442
443	IN_MULTI_LOCK();
444	IGMP_LOCK();
445
446	if (name[0] <= 0 || name[0] > V_if_index) {
447		error = ENOENT;
448		goto out_locked;
449	}
450
451	error = ENOENT;
452
453	ifp = ifnet_byindex(name[0]);
454	if (ifp == NULL)
455		goto out_locked;
456
457	LIST_FOREACH(igi, &V_igi_head, igi_link) {
458		if (ifp == igi->igi_ifp) {
459			error = SYSCTL_OUT(req, igi,
460			    sizeof(struct igmp_ifinfo));
461			break;
462		}
463	}
464
465out_locked:
466	IGMP_UNLOCK();
467	IN_MULTI_UNLOCK();
468	return (error);
469}
470
471/*
472 * Dispatch an entire queue of pending packet chains
473 * using the netisr.
474 * VIMAGE: Assumes the vnet pointer has been set.
475 */
476static void
477igmp_dispatch_queue(struct ifqueue *ifq, int limit, const int loop)
478{
479	struct mbuf *m;
480
481	for (;;) {
482		_IF_DEQUEUE(ifq, m);
483		if (m == NULL)
484			break;
485		CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, ifq, m);
486		if (loop)
487			m->m_flags |= M_IGMP_LOOP;
488		netisr_dispatch(NETISR_IGMP, m);
489		if (--limit == 0)
490			break;
491	}
492}
493
494/*
495 * Filter outgoing IGMP report state by group.
496 *
497 * Reports are ALWAYS suppressed for ALL-HOSTS (224.0.0.1).
498 * If the net.inet.igmp.sendlocal sysctl is 0, then IGMP reports are
499 * disabled for all groups in the 224.0.0.0/24 link-local scope. However,
500 * this may break certain IGMP snooping switches which rely on the old
501 * report behaviour.
502 *
503 * Return zero if the given group is one for which IGMP reports
504 * should be suppressed, or non-zero if reports should be issued.
505 */
506static __inline int
507igmp_isgroupreported(const struct in_addr addr)
508{
509
510	if (in_allhosts(addr) ||
511	    ((!V_igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr)))))
512		return (0);
513
514	return (1);
515}
516
517/*
518 * Construct a Router Alert option to use in outgoing packets.
519 */
520static struct mbuf *
521igmp_ra_alloc(void)
522{
523	struct mbuf	*m;
524	struct ipoption	*p;
525
526	m = m_get(M_WAITOK, MT_DATA);
527	p = mtod(m, struct ipoption *);
528	p->ipopt_dst.s_addr = INADDR_ANY;
529	p->ipopt_list[0] = (char)IPOPT_RA;	/* Router Alert Option */
530	p->ipopt_list[1] = 0x04;		/* 4 bytes long */
531	p->ipopt_list[2] = IPOPT_EOL;		/* End of IP option list */
532	p->ipopt_list[3] = 0x00;		/* pad byte */
533	m->m_len = sizeof(p->ipopt_dst) + p->ipopt_list[1];
534
535	return (m);
536}
537
538/*
539 * Attach IGMP when PF_INET is attached to an interface.
540 */
541struct igmp_ifinfo *
542igmp_domifattach(struct ifnet *ifp)
543{
544	struct igmp_ifinfo *igi;
545
546	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)",
547	    __func__, ifp, ifp->if_xname);
548
549	IGMP_LOCK();
550
551	igi = igi_alloc_locked(ifp);
552	if (!(ifp->if_flags & IFF_MULTICAST))
553		igi->igi_flags |= IGIF_SILENT;
554
555	IGMP_UNLOCK();
556
557	return (igi);
558}
559
560/*
561 * VIMAGE: assume curvnet set by caller.
562 */
563static struct igmp_ifinfo *
564igi_alloc_locked(/*const*/ struct ifnet *ifp)
565{
566	struct igmp_ifinfo *igi;
567
568	IGMP_LOCK_ASSERT();
569
570	igi = malloc(sizeof(struct igmp_ifinfo), M_IGMP, M_NOWAIT|M_ZERO);
571	if (igi == NULL)
572		goto out;
573
574	igi->igi_ifp = ifp;
575	igi->igi_version = V_igmp_default_version;
576	igi->igi_flags = 0;
577	igi->igi_rv = IGMP_RV_INIT;
578	igi->igi_qi = IGMP_QI_INIT;
579	igi->igi_qri = IGMP_QRI_INIT;
580	igi->igi_uri = IGMP_URI_INIT;
581
582	SLIST_INIT(&igi->igi_relinmhead);
583
584	/*
585	 * Responses to general queries are subject to bounds.
586	 */
587	IFQ_SET_MAXLEN(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS);
588
589	LIST_INSERT_HEAD(&V_igi_head, igi, igi_link);
590
591	CTR2(KTR_IGMPV3, "allocate igmp_ifinfo for ifp %p(%s)",
592	     ifp, ifp->if_xname);
593
594out:
595	return (igi);
596}
597
598/*
599 * Hook for ifdetach.
600 *
601 * NOTE: Some finalization tasks need to run before the protocol domain
602 * is detached, but also before the link layer does its cleanup.
603 *
604 * SMPNG: igmp_ifdetach() needs to take IF_ADDR_LOCK().
605 * XXX This is also bitten by unlocked ifma_protospec access.
606 */
607void
608igmp_ifdetach(struct ifnet *ifp)
609{
610	struct igmp_ifinfo	*igi;
611	struct ifmultiaddr	*ifma;
612	struct in_multi		*inm, *tinm;
613
614	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp,
615	    ifp->if_xname);
616
617	IGMP_LOCK();
618
619	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
620	if (igi->igi_version == IGMP_VERSION_3) {
621		IF_ADDR_RLOCK(ifp);
622		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
623			if (ifma->ifma_addr->sa_family != AF_INET ||
624			    ifma->ifma_protospec == NULL)
625				continue;
626#if 0
627			KASSERT(ifma->ifma_protospec != NULL,
628			    ("%s: ifma_protospec is NULL", __func__));
629#endif
630			inm = (struct in_multi *)ifma->ifma_protospec;
631			if (inm->inm_state == IGMP_LEAVING_MEMBER) {
632				SLIST_INSERT_HEAD(&igi->igi_relinmhead,
633				    inm, inm_nrele);
634			}
635			inm_clear_recorded(inm);
636		}
637		IF_ADDR_RUNLOCK(ifp);
638		/*
639		 * Free the in_multi reference(s) for this IGMP lifecycle.
640		 */
641		SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele,
642		    tinm) {
643			SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele);
644			inm_release_locked(inm);
645		}
646	}
647
648	IGMP_UNLOCK();
649}
650
651/*
652 * Hook for domifdetach.
653 */
654void
655igmp_domifdetach(struct ifnet *ifp)
656{
657
658	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)",
659	    __func__, ifp, ifp->if_xname);
660
661	IGMP_LOCK();
662	igi_delete_locked(ifp);
663	IGMP_UNLOCK();
664}
665
666static void
667igi_delete_locked(const struct ifnet *ifp)
668{
669	struct igmp_ifinfo *igi, *tigi;
670
671	CTR3(KTR_IGMPV3, "%s: freeing igmp_ifinfo for ifp %p(%s)",
672	    __func__, ifp, ifp->if_xname);
673
674	IGMP_LOCK_ASSERT();
675
676	LIST_FOREACH_SAFE(igi, &V_igi_head, igi_link, tigi) {
677		if (igi->igi_ifp == ifp) {
678			/*
679			 * Free deferred General Query responses.
680			 */
681			_IF_DRAIN(&igi->igi_gq);
682
683			LIST_REMOVE(igi, igi_link);
684
685			KASSERT(SLIST_EMPTY(&igi->igi_relinmhead),
686			    ("%s: there are dangling in_multi references",
687			    __func__));
688
689			free(igi, M_IGMP);
690			return;
691		}
692	}
693
694#ifdef INVARIANTS
695	panic("%s: igmp_ifinfo not found for ifp %p\n", __func__,  ifp);
696#endif
697}
698
699/*
700 * Process a received IGMPv1 query.
701 * Return non-zero if the message should be dropped.
702 *
703 * VIMAGE: The curvnet pointer is derived from the input ifp.
704 */
705static int
706igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip,
707    const struct igmp *igmp)
708{
709	struct ifmultiaddr	*ifma;
710	struct igmp_ifinfo	*igi;
711	struct in_multi		*inm;
712
713	/*
714	 * IGMPv1 Host Mmembership Queries SHOULD always be addressed to
715	 * 224.0.0.1. They are always treated as General Queries.
716	 * igmp_group is always ignored. Do not drop it as a userland
717	 * daemon may wish to see it.
718	 * XXX SMPng: unlocked increments in igmpstat assumed atomic.
719	 */
720	if (!in_allhosts(ip->ip_dst) || !in_nullhost(igmp->igmp_group)) {
721		IGMPSTAT_INC(igps_rcv_badqueries);
722		return (0);
723	}
724	IGMPSTAT_INC(igps_rcv_gen_queries);
725
726	IN_MULTI_LOCK();
727	IGMP_LOCK();
728
729	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
730	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
731
732	if (igi->igi_flags & IGIF_LOOPBACK) {
733		CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)",
734		    ifp, ifp->if_xname);
735		goto out_locked;
736	}
737
738	/*
739	 * Switch to IGMPv1 host compatibility mode.
740	 */
741	igmp_set_version(igi, IGMP_VERSION_1);
742
743	CTR2(KTR_IGMPV3, "process v1 query on ifp %p(%s)", ifp, ifp->if_xname);
744
745	/*
746	 * Start the timers in all of our group records
747	 * for the interface on which the query arrived,
748	 * except those which are already running.
749	 */
750	IF_ADDR_RLOCK(ifp);
751	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
752		if (ifma->ifma_addr->sa_family != AF_INET ||
753		    ifma->ifma_protospec == NULL)
754			continue;
755		inm = (struct in_multi *)ifma->ifma_protospec;
756		if (inm->inm_timer != 0)
757			continue;
758		switch (inm->inm_state) {
759		case IGMP_NOT_MEMBER:
760		case IGMP_SILENT_MEMBER:
761			break;
762		case IGMP_G_QUERY_PENDING_MEMBER:
763		case IGMP_SG_QUERY_PENDING_MEMBER:
764		case IGMP_REPORTING_MEMBER:
765		case IGMP_IDLE_MEMBER:
766		case IGMP_LAZY_MEMBER:
767		case IGMP_SLEEPING_MEMBER:
768		case IGMP_AWAKENING_MEMBER:
769			inm->inm_state = IGMP_REPORTING_MEMBER;
770			inm->inm_timer = IGMP_RANDOM_DELAY(
771			    IGMP_V1V2_MAX_RI * PR_FASTHZ);
772			V_current_state_timers_running = 1;
773			break;
774		case IGMP_LEAVING_MEMBER:
775			break;
776		}
777	}
778	IF_ADDR_RUNLOCK(ifp);
779
780out_locked:
781	IGMP_UNLOCK();
782	IN_MULTI_UNLOCK();
783
784	return (0);
785}
786
787/*
788 * Process a received IGMPv2 general or group-specific query.
789 */
790static int
791igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip,
792    const struct igmp *igmp)
793{
794	struct ifmultiaddr	*ifma;
795	struct igmp_ifinfo	*igi;
796	struct in_multi		*inm;
797	int			 is_general_query;
798	uint16_t		 timer;
799
800	is_general_query = 0;
801
802	/*
803	 * Validate address fields upfront.
804	 * XXX SMPng: unlocked increments in igmpstat assumed atomic.
805	 */
806	if (in_nullhost(igmp->igmp_group)) {
807		/*
808		 * IGMPv2 General Query.
809		 * If this was not sent to the all-hosts group, ignore it.
810		 */
811		if (!in_allhosts(ip->ip_dst))
812			return (0);
813		IGMPSTAT_INC(igps_rcv_gen_queries);
814		is_general_query = 1;
815	} else {
816		/* IGMPv2 Group-Specific Query. */
817		IGMPSTAT_INC(igps_rcv_group_queries);
818	}
819
820	IN_MULTI_LOCK();
821	IGMP_LOCK();
822
823	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
824	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
825
826	if (igi->igi_flags & IGIF_LOOPBACK) {
827		CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)",
828		    ifp, ifp->if_xname);
829		goto out_locked;
830	}
831
832	/*
833	 * Ignore v2 query if in v1 Compatibility Mode.
834	 */
835	if (igi->igi_version == IGMP_VERSION_1)
836		goto out_locked;
837
838	igmp_set_version(igi, IGMP_VERSION_2);
839
840	timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE;
841	if (timer == 0)
842		timer = 1;
843
844	if (is_general_query) {
845		/*
846		 * For each reporting group joined on this
847		 * interface, kick the report timer.
848		 */
849		CTR2(KTR_IGMPV3, "process v2 general query on ifp %p(%s)",
850		    ifp, ifp->if_xname);
851		IF_ADDR_RLOCK(ifp);
852		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
853			if (ifma->ifma_addr->sa_family != AF_INET ||
854			    ifma->ifma_protospec == NULL)
855				continue;
856			inm = (struct in_multi *)ifma->ifma_protospec;
857			igmp_v2_update_group(inm, timer);
858		}
859		IF_ADDR_RUNLOCK(ifp);
860	} else {
861		/*
862		 * Group-specific IGMPv2 query, we need only
863		 * look up the single group to process it.
864		 */
865		inm = inm_lookup(ifp, igmp->igmp_group);
866		if (inm != NULL) {
867			CTR3(KTR_IGMPV3, "process v2 query %s on ifp %p(%s)",
868			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
869			igmp_v2_update_group(inm, timer);
870		}
871	}
872
873out_locked:
874	IGMP_UNLOCK();
875	IN_MULTI_UNLOCK();
876
877	return (0);
878}
879
880/*
881 * Update the report timer on a group in response to an IGMPv2 query.
882 *
883 * If we are becoming the reporting member for this group, start the timer.
884 * If we already are the reporting member for this group, and timer is
885 * below the threshold, reset it.
886 *
887 * We may be updating the group for the first time since we switched
888 * to IGMPv3. If we are, then we must clear any recorded source lists,
889 * and transition to REPORTING state; the group timer is overloaded
890 * for group and group-source query responses.
891 *
892 * Unlike IGMPv3, the delay per group should be jittered
893 * to avoid bursts of IGMPv2 reports.
894 */
895static void
896igmp_v2_update_group(struct in_multi *inm, const int timer)
897{
898
899	CTR4(KTR_IGMPV3, "%s: %s/%s timer=%d", __func__,
900	    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname, timer);
901
902	IN_MULTI_LOCK_ASSERT();
903
904	switch (inm->inm_state) {
905	case IGMP_NOT_MEMBER:
906	case IGMP_SILENT_MEMBER:
907		break;
908	case IGMP_REPORTING_MEMBER:
909		if (inm->inm_timer != 0 &&
910		    inm->inm_timer <= timer) {
911			CTR1(KTR_IGMPV3, "%s: REPORTING and timer running, "
912			    "skipping.", __func__);
913			break;
914		}
915		/* FALLTHROUGH */
916	case IGMP_SG_QUERY_PENDING_MEMBER:
917	case IGMP_G_QUERY_PENDING_MEMBER:
918	case IGMP_IDLE_MEMBER:
919	case IGMP_LAZY_MEMBER:
920	case IGMP_AWAKENING_MEMBER:
921		CTR1(KTR_IGMPV3, "%s: ->REPORTING", __func__);
922		inm->inm_state = IGMP_REPORTING_MEMBER;
923		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
924		V_current_state_timers_running = 1;
925		break;
926	case IGMP_SLEEPING_MEMBER:
927		CTR1(KTR_IGMPV3, "%s: ->AWAKENING", __func__);
928		inm->inm_state = IGMP_AWAKENING_MEMBER;
929		break;
930	case IGMP_LEAVING_MEMBER:
931		break;
932	}
933}
934
935/*
936 * Process a received IGMPv3 general, group-specific or
937 * group-and-source-specific query.
938 * Assumes m has already been pulled up to the full IGMP message length.
939 * Return 0 if successful, otherwise an appropriate error code is returned.
940 */
941static int
942igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip,
943    /*const*/ struct igmpv3 *igmpv3)
944{
945	struct igmp_ifinfo	*igi;
946	struct in_multi		*inm;
947	int			 is_general_query;
948	uint32_t		 maxresp, nsrc, qqi;
949	uint16_t		 timer;
950	uint8_t			 qrv;
951
952	is_general_query = 0;
953
954	CTR2(KTR_IGMPV3, "process v3 query on ifp %p(%s)", ifp, ifp->if_xname);
955
956	maxresp = igmpv3->igmp_code;	/* in 1/10ths of a second */
957	if (maxresp >= 128) {
958		maxresp = IGMP_MANT(igmpv3->igmp_code) <<
959			  (IGMP_EXP(igmpv3->igmp_code) + 3);
960	}
961
962	/*
963	 * Robustness must never be less than 2 for on-wire IGMPv3.
964	 * FUTURE: Check if ifp has IGIF_LOOPBACK set, as we will make
965	 * an exception for interfaces whose IGMPv3 state changes
966	 * are redirected to loopback (e.g. MANET).
967	 */
968	qrv = IGMP_QRV(igmpv3->igmp_misc);
969	if (qrv < 2) {
970		CTR3(KTR_IGMPV3, "%s: clamping qrv %d to %d", __func__,
971		    qrv, IGMP_RV_INIT);
972		qrv = IGMP_RV_INIT;
973	}
974
975	qqi = igmpv3->igmp_qqi;
976	if (qqi >= 128) {
977		qqi = IGMP_MANT(igmpv3->igmp_qqi) <<
978		     (IGMP_EXP(igmpv3->igmp_qqi) + 3);
979	}
980
981	timer = maxresp * PR_FASTHZ / IGMP_TIMER_SCALE;
982	if (timer == 0)
983		timer = 1;
984
985	nsrc = ntohs(igmpv3->igmp_numsrc);
986
987	/*
988	 * Validate address fields and versions upfront before
989	 * accepting v3 query.
990	 * XXX SMPng: Unlocked access to igmpstat counters here.
991	 */
992	if (in_nullhost(igmpv3->igmp_group)) {
993		/*
994		 * IGMPv3 General Query.
995		 *
996		 * General Queries SHOULD be directed to 224.0.0.1.
997		 * A general query with a source list has undefined
998		 * behaviour; discard it.
999		 */
1000		IGMPSTAT_INC(igps_rcv_gen_queries);
1001		if (!in_allhosts(ip->ip_dst) || nsrc > 0) {
1002			IGMPSTAT_INC(igps_rcv_badqueries);
1003			return (0);
1004		}
1005		is_general_query = 1;
1006	} else {
1007		/* Group or group-source specific query. */
1008		if (nsrc == 0)
1009			IGMPSTAT_INC(igps_rcv_group_queries);
1010		else
1011			IGMPSTAT_INC(igps_rcv_gsr_queries);
1012	}
1013
1014	IN_MULTI_LOCK();
1015	IGMP_LOCK();
1016
1017	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
1018	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
1019
1020	if (igi->igi_flags & IGIF_LOOPBACK) {
1021		CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)",
1022		    ifp, ifp->if_xname);
1023		goto out_locked;
1024	}
1025
1026	/*
1027	 * Discard the v3 query if we're in Compatibility Mode.
1028	 * The RFC is not obviously worded that hosts need to stay in
1029	 * compatibility mode until the Old Version Querier Present
1030	 * timer expires.
1031	 */
1032	if (igi->igi_version != IGMP_VERSION_3) {
1033		CTR3(KTR_IGMPV3, "ignore v3 query in v%d mode on ifp %p(%s)",
1034		    igi->igi_version, ifp, ifp->if_xname);
1035		goto out_locked;
1036	}
1037
1038	igmp_set_version(igi, IGMP_VERSION_3);
1039	igi->igi_rv = qrv;
1040	igi->igi_qi = qqi;
1041	igi->igi_qri = maxresp;
1042
1043	CTR4(KTR_IGMPV3, "%s: qrv %d qi %d qri %d", __func__, qrv, qqi,
1044	    maxresp);
1045
1046	if (is_general_query) {
1047		/*
1048		 * Schedule a current-state report on this ifp for
1049		 * all groups, possibly containing source lists.
1050		 * If there is a pending General Query response
1051		 * scheduled earlier than the selected delay, do
1052		 * not schedule any other reports.
1053		 * Otherwise, reset the interface timer.
1054		 */
1055		CTR2(KTR_IGMPV3, "process v3 general query on ifp %p(%s)",
1056		    ifp, ifp->if_xname);
1057		if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) {
1058			igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer);
1059			V_interface_timers_running = 1;
1060		}
1061	} else {
1062		/*
1063		 * Group-source-specific queries are throttled on
1064		 * a per-group basis to defeat denial-of-service attempts.
1065		 * Queries for groups we are not a member of on this
1066		 * link are simply ignored.
1067		 */
1068		inm = inm_lookup(ifp, igmpv3->igmp_group);
1069		if (inm == NULL)
1070			goto out_locked;
1071		if (nsrc > 0) {
1072			if (!ratecheck(&inm->inm_lastgsrtv,
1073			    &V_igmp_gsrdelay)) {
1074				CTR1(KTR_IGMPV3, "%s: GS query throttled.",
1075				    __func__);
1076				IGMPSTAT_INC(igps_drop_gsr_queries);
1077				goto out_locked;
1078			}
1079		}
1080		CTR3(KTR_IGMPV3, "process v3 %s query on ifp %p(%s)",
1081		     inet_ntoa(igmpv3->igmp_group), ifp, ifp->if_xname);
1082		/*
1083		 * If there is a pending General Query response
1084		 * scheduled sooner than the selected delay, no
1085		 * further report need be scheduled.
1086		 * Otherwise, prepare to respond to the
1087		 * group-specific or group-and-source query.
1088		 */
1089		if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer)
1090			igmp_input_v3_group_query(inm, igi, timer, igmpv3);
1091	}
1092
1093out_locked:
1094	IGMP_UNLOCK();
1095	IN_MULTI_UNLOCK();
1096
1097	return (0);
1098}
1099
1100/*
1101 * Process a recieved IGMPv3 group-specific or group-and-source-specific
1102 * query.
1103 * Return <0 if any error occured. Currently this is ignored.
1104 */
1105static int
1106igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifinfo *igi,
1107    int timer, /*const*/ struct igmpv3 *igmpv3)
1108{
1109	int			 retval;
1110	uint16_t		 nsrc;
1111
1112	IN_MULTI_LOCK_ASSERT();
1113	IGMP_LOCK_ASSERT();
1114
1115	retval = 0;
1116
1117	switch (inm->inm_state) {
1118	case IGMP_NOT_MEMBER:
1119	case IGMP_SILENT_MEMBER:
1120	case IGMP_SLEEPING_MEMBER:
1121	case IGMP_LAZY_MEMBER:
1122	case IGMP_AWAKENING_MEMBER:
1123	case IGMP_IDLE_MEMBER:
1124	case IGMP_LEAVING_MEMBER:
1125		return (retval);
1126		break;
1127	case IGMP_REPORTING_MEMBER:
1128	case IGMP_G_QUERY_PENDING_MEMBER:
1129	case IGMP_SG_QUERY_PENDING_MEMBER:
1130		break;
1131	}
1132
1133	nsrc = ntohs(igmpv3->igmp_numsrc);
1134
1135	/*
1136	 * Deal with group-specific queries upfront.
1137	 * If any group query is already pending, purge any recorded
1138	 * source-list state if it exists, and schedule a query response
1139	 * for this group-specific query.
1140	 */
1141	if (nsrc == 0) {
1142		if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER ||
1143		    inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) {
1144			inm_clear_recorded(inm);
1145			timer = min(inm->inm_timer, timer);
1146		}
1147		inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER;
1148		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
1149		V_current_state_timers_running = 1;
1150		return (retval);
1151	}
1152
1153	/*
1154	 * Deal with the case where a group-and-source-specific query has
1155	 * been received but a group-specific query is already pending.
1156	 */
1157	if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) {
1158		timer = min(inm->inm_timer, timer);
1159		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
1160		V_current_state_timers_running = 1;
1161		return (retval);
1162	}
1163
1164	/*
1165	 * Finally, deal with the case where a group-and-source-specific
1166	 * query has been received, where a response to a previous g-s-r
1167	 * query exists, or none exists.
1168	 * In this case, we need to parse the source-list which the Querier
1169	 * has provided us with and check if we have any source list filter
1170	 * entries at T1 for these sources. If we do not, there is no need
1171	 * schedule a report and the query may be dropped.
1172	 * If we do, we must record them and schedule a current-state
1173	 * report for those sources.
1174	 * FIXME: Handling source lists larger than 1 mbuf requires that
1175	 * we pass the mbuf chain pointer down to this function, and use
1176	 * m_getptr() to walk the chain.
1177	 */
1178	if (inm->inm_nsrc > 0) {
1179		const struct in_addr	*ap;
1180		int			 i, nrecorded;
1181
1182		ap = (const struct in_addr *)(igmpv3 + 1);
1183		nrecorded = 0;
1184		for (i = 0; i < nsrc; i++, ap++) {
1185			retval = inm_record_source(inm, ap->s_addr);
1186			if (retval < 0)
1187				break;
1188			nrecorded += retval;
1189		}
1190		if (nrecorded > 0) {
1191			CTR1(KTR_IGMPV3,
1192			    "%s: schedule response to SG query", __func__);
1193			inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER;
1194			inm->inm_timer = IGMP_RANDOM_DELAY(timer);
1195			V_current_state_timers_running = 1;
1196		}
1197	}
1198
1199	return (retval);
1200}
1201
1202/*
1203 * Process a received IGMPv1 host membership report.
1204 *
1205 * NOTE: 0.0.0.0 workaround breaks const correctness.
1206 */
1207static int
1208igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip,
1209    /*const*/ struct igmp *igmp)
1210{
1211	struct in_ifaddr *ia;
1212	struct in_multi *inm;
1213
1214	IGMPSTAT_INC(igps_rcv_reports);
1215
1216	if (ifp->if_flags & IFF_LOOPBACK)
1217		return (0);
1218
1219	if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) ||
1220	    !in_hosteq(igmp->igmp_group, ip->ip_dst)) {
1221		IGMPSTAT_INC(igps_rcv_badreports);
1222		return (EINVAL);
1223	}
1224
1225	/*
1226	 * RFC 3376, Section 4.2.13, 9.2, 9.3:
1227	 * Booting clients may use the source address 0.0.0.0. Some
1228	 * IGMP daemons may not know how to use IP_RECVIF to determine
1229	 * the interface upon which this message was received.
1230	 * Replace 0.0.0.0 with the subnet address if told to do so.
1231	 */
1232	if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) {
1233		IFP_TO_IA(ifp, ia);
1234		if (ia != NULL) {
1235			ip->ip_src.s_addr = htonl(ia->ia_subnet);
1236			ifa_free(&ia->ia_ifa);
1237		}
1238	}
1239
1240	CTR3(KTR_IGMPV3, "process v1 report %s on ifp %p(%s)",
1241	     inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
1242
1243	/*
1244	 * IGMPv1 report suppression.
1245	 * If we are a member of this group, and our membership should be
1246	 * reported, stop our group timer and transition to the 'lazy' state.
1247	 */
1248	IN_MULTI_LOCK();
1249	inm = inm_lookup(ifp, igmp->igmp_group);
1250	if (inm != NULL) {
1251		struct igmp_ifinfo *igi;
1252
1253		igi = inm->inm_igi;
1254		if (igi == NULL) {
1255			KASSERT(igi != NULL,
1256			    ("%s: no igi for ifp %p", __func__, ifp));
1257			goto out_locked;
1258		}
1259
1260		IGMPSTAT_INC(igps_rcv_ourreports);
1261
1262		/*
1263		 * If we are in IGMPv3 host mode, do not allow the
1264		 * other host's IGMPv1 report to suppress our reports
1265		 * unless explicitly configured to do so.
1266		 */
1267		if (igi->igi_version == IGMP_VERSION_3) {
1268			if (V_igmp_legacysupp)
1269				igmp_v3_suppress_group_record(inm);
1270			goto out_locked;
1271		}
1272
1273		inm->inm_timer = 0;
1274
1275		switch (inm->inm_state) {
1276		case IGMP_NOT_MEMBER:
1277		case IGMP_SILENT_MEMBER:
1278			break;
1279		case IGMP_IDLE_MEMBER:
1280		case IGMP_LAZY_MEMBER:
1281		case IGMP_AWAKENING_MEMBER:
1282			CTR3(KTR_IGMPV3,
1283			    "report suppressed for %s on ifp %p(%s)",
1284			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
1285		case IGMP_SLEEPING_MEMBER:
1286			inm->inm_state = IGMP_SLEEPING_MEMBER;
1287			break;
1288		case IGMP_REPORTING_MEMBER:
1289			CTR3(KTR_IGMPV3,
1290			    "report suppressed for %s on ifp %p(%s)",
1291			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
1292			if (igi->igi_version == IGMP_VERSION_1)
1293				inm->inm_state = IGMP_LAZY_MEMBER;
1294			else if (igi->igi_version == IGMP_VERSION_2)
1295				inm->inm_state = IGMP_SLEEPING_MEMBER;
1296			break;
1297		case IGMP_G_QUERY_PENDING_MEMBER:
1298		case IGMP_SG_QUERY_PENDING_MEMBER:
1299		case IGMP_LEAVING_MEMBER:
1300			break;
1301		}
1302	}
1303
1304out_locked:
1305	IN_MULTI_UNLOCK();
1306
1307	return (0);
1308}
1309
1310/*
1311 * Process a received IGMPv2 host membership report.
1312 *
1313 * NOTE: 0.0.0.0 workaround breaks const correctness.
1314 */
1315static int
1316igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip,
1317    /*const*/ struct igmp *igmp)
1318{
1319	struct in_ifaddr *ia;
1320	struct in_multi *inm;
1321
1322	/*
1323	 * Make sure we don't hear our own membership report.  Fast
1324	 * leave requires knowing that we are the only member of a
1325	 * group.
1326	 */
1327	IFP_TO_IA(ifp, ia);
1328	if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) {
1329		ifa_free(&ia->ia_ifa);
1330		return (0);
1331	}
1332
1333	IGMPSTAT_INC(igps_rcv_reports);
1334
1335	if (ifp->if_flags & IFF_LOOPBACK) {
1336		if (ia != NULL)
1337			ifa_free(&ia->ia_ifa);
1338		return (0);
1339	}
1340
1341	if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) ||
1342	    !in_hosteq(igmp->igmp_group, ip->ip_dst)) {
1343		if (ia != NULL)
1344			ifa_free(&ia->ia_ifa);
1345		IGMPSTAT_INC(igps_rcv_badreports);
1346		return (EINVAL);
1347	}
1348
1349	/*
1350	 * RFC 3376, Section 4.2.13, 9.2, 9.3:
1351	 * Booting clients may use the source address 0.0.0.0. Some
1352	 * IGMP daemons may not know how to use IP_RECVIF to determine
1353	 * the interface upon which this message was received.
1354	 * Replace 0.0.0.0 with the subnet address if told to do so.
1355	 */
1356	if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) {
1357		if (ia != NULL)
1358			ip->ip_src.s_addr = htonl(ia->ia_subnet);
1359	}
1360	if (ia != NULL)
1361		ifa_free(&ia->ia_ifa);
1362
1363	CTR3(KTR_IGMPV3, "process v2 report %s on ifp %p(%s)",
1364	     inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
1365
1366	/*
1367	 * IGMPv2 report suppression.
1368	 * If we are a member of this group, and our membership should be
1369	 * reported, and our group timer is pending or about to be reset,
1370	 * stop our group timer by transitioning to the 'lazy' state.
1371	 */
1372	IN_MULTI_LOCK();
1373	inm = inm_lookup(ifp, igmp->igmp_group);
1374	if (inm != NULL) {
1375		struct igmp_ifinfo *igi;
1376
1377		igi = inm->inm_igi;
1378		KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp));
1379
1380		IGMPSTAT_INC(igps_rcv_ourreports);
1381
1382		/*
1383		 * If we are in IGMPv3 host mode, do not allow the
1384		 * other host's IGMPv1 report to suppress our reports
1385		 * unless explicitly configured to do so.
1386		 */
1387		if (igi->igi_version == IGMP_VERSION_3) {
1388			if (V_igmp_legacysupp)
1389				igmp_v3_suppress_group_record(inm);
1390			goto out_locked;
1391		}
1392
1393		inm->inm_timer = 0;
1394
1395		switch (inm->inm_state) {
1396		case IGMP_NOT_MEMBER:
1397		case IGMP_SILENT_MEMBER:
1398		case IGMP_SLEEPING_MEMBER:
1399			break;
1400		case IGMP_REPORTING_MEMBER:
1401		case IGMP_IDLE_MEMBER:
1402		case IGMP_AWAKENING_MEMBER:
1403			CTR3(KTR_IGMPV3,
1404			    "report suppressed for %s on ifp %p(%s)",
1405			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
1406		case IGMP_LAZY_MEMBER:
1407			inm->inm_state = IGMP_LAZY_MEMBER;
1408			break;
1409		case IGMP_G_QUERY_PENDING_MEMBER:
1410		case IGMP_SG_QUERY_PENDING_MEMBER:
1411		case IGMP_LEAVING_MEMBER:
1412			break;
1413		}
1414	}
1415
1416out_locked:
1417	IN_MULTI_UNLOCK();
1418
1419	return (0);
1420}
1421
1422void
1423igmp_input(struct mbuf *m, int off)
1424{
1425	int iphlen;
1426	struct ifnet *ifp;
1427	struct igmp *igmp;
1428	struct ip *ip;
1429	int igmplen;
1430	int minlen;
1431	int queryver;
1432
1433	CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, m, off);
1434
1435	ifp = m->m_pkthdr.rcvif;
1436
1437	IGMPSTAT_INC(igps_rcv_total);
1438
1439	ip = mtod(m, struct ip *);
1440	iphlen = off;
1441	igmplen = ntohs(ip->ip_len) - off;
1442
1443	/*
1444	 * Validate lengths.
1445	 */
1446	if (igmplen < IGMP_MINLEN) {
1447		IGMPSTAT_INC(igps_rcv_tooshort);
1448		m_freem(m);
1449		return;
1450	}
1451
1452	/*
1453	 * Always pullup to the minimum size for v1/v2 or v3
1454	 * to amortize calls to m_pullup().
1455	 */
1456	minlen = iphlen;
1457	if (igmplen >= IGMP_V3_QUERY_MINLEN)
1458		minlen += IGMP_V3_QUERY_MINLEN;
1459	else
1460		minlen += IGMP_MINLEN;
1461	if ((m->m_flags & M_EXT || m->m_len < minlen) &&
1462	    (m = m_pullup(m, minlen)) == 0) {
1463		IGMPSTAT_INC(igps_rcv_tooshort);
1464		return;
1465	}
1466	ip = mtod(m, struct ip *);
1467
1468	/*
1469	 * Validate checksum.
1470	 */
1471	m->m_data += iphlen;
1472	m->m_len -= iphlen;
1473	igmp = mtod(m, struct igmp *);
1474	if (in_cksum(m, igmplen)) {
1475		IGMPSTAT_INC(igps_rcv_badsum);
1476		m_freem(m);
1477		return;
1478	}
1479	m->m_data -= iphlen;
1480	m->m_len += iphlen;
1481
1482	/*
1483	 * IGMP control traffic is link-scope, and must have a TTL of 1.
1484	 * DVMRP traffic (e.g. mrinfo, mtrace) is an exception;
1485	 * probe packets may come from beyond the LAN.
1486	 */
1487	if (igmp->igmp_type != IGMP_DVMRP && ip->ip_ttl != 1) {
1488		IGMPSTAT_INC(igps_rcv_badttl);
1489		m_freem(m);
1490		return;
1491	}
1492
1493	switch (igmp->igmp_type) {
1494	case IGMP_HOST_MEMBERSHIP_QUERY:
1495		if (igmplen == IGMP_MINLEN) {
1496			if (igmp->igmp_code == 0)
1497				queryver = IGMP_VERSION_1;
1498			else
1499				queryver = IGMP_VERSION_2;
1500		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
1501			queryver = IGMP_VERSION_3;
1502		} else {
1503			IGMPSTAT_INC(igps_rcv_tooshort);
1504			m_freem(m);
1505			return;
1506		}
1507
1508		switch (queryver) {
1509		case IGMP_VERSION_1:
1510			IGMPSTAT_INC(igps_rcv_v1v2_queries);
1511			if (!V_igmp_v1enable)
1512				break;
1513			if (igmp_input_v1_query(ifp, ip, igmp) != 0) {
1514				m_freem(m);
1515				return;
1516			}
1517			break;
1518
1519		case IGMP_VERSION_2:
1520			IGMPSTAT_INC(igps_rcv_v1v2_queries);
1521			if (!V_igmp_v2enable)
1522				break;
1523			if (igmp_input_v2_query(ifp, ip, igmp) != 0) {
1524				m_freem(m);
1525				return;
1526			}
1527			break;
1528
1529		case IGMP_VERSION_3: {
1530				struct igmpv3 *igmpv3;
1531				uint16_t igmpv3len;
1532				uint16_t nsrc;
1533
1534				IGMPSTAT_INC(igps_rcv_v3_queries);
1535				igmpv3 = (struct igmpv3 *)igmp;
1536				/*
1537				 * Validate length based on source count.
1538				 */
1539				nsrc = ntohs(igmpv3->igmp_numsrc);
1540				if (nsrc * sizeof(in_addr_t) >
1541				    UINT16_MAX - iphlen - IGMP_V3_QUERY_MINLEN) {
1542					IGMPSTAT_INC(igps_rcv_tooshort);
1543					return;
1544				}
1545				/*
1546				 * m_pullup() may modify m, so pullup in
1547				 * this scope.
1548				 */
1549				igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN +
1550				    sizeof(struct in_addr) * nsrc;
1551				if ((m->m_flags & M_EXT ||
1552				     m->m_len < igmpv3len) &&
1553				    (m = m_pullup(m, igmpv3len)) == NULL) {
1554					IGMPSTAT_INC(igps_rcv_tooshort);
1555					return;
1556				}
1557				igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *)
1558				    + iphlen);
1559				if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) {
1560					m_freem(m);
1561					return;
1562				}
1563			}
1564			break;
1565		}
1566		break;
1567
1568	case IGMP_v1_HOST_MEMBERSHIP_REPORT:
1569		if (!V_igmp_v1enable)
1570			break;
1571		if (igmp_input_v1_report(ifp, ip, igmp) != 0) {
1572			m_freem(m);
1573			return;
1574		}
1575		break;
1576
1577	case IGMP_v2_HOST_MEMBERSHIP_REPORT:
1578		if (!V_igmp_v2enable)
1579			break;
1580		if (!ip_checkrouteralert(m))
1581			IGMPSTAT_INC(igps_rcv_nora);
1582		if (igmp_input_v2_report(ifp, ip, igmp) != 0) {
1583			m_freem(m);
1584			return;
1585		}
1586		break;
1587
1588	case IGMP_v3_HOST_MEMBERSHIP_REPORT:
1589		/*
1590		 * Hosts do not need to process IGMPv3 membership reports,
1591		 * as report suppression is no longer required.
1592		 */
1593		if (!ip_checkrouteralert(m))
1594			IGMPSTAT_INC(igps_rcv_nora);
1595		break;
1596
1597	default:
1598		break;
1599	}
1600
1601	/*
1602	 * Pass all valid IGMP packets up to any process(es) listening on a
1603	 * raw IGMP socket.
1604	 */
1605	rip_input(m, off);
1606}
1607
1608
1609/*
1610 * Fast timeout handler (global).
1611 * VIMAGE: Timeout handlers are expected to service all vimages.
1612 */
1613void
1614igmp_fasttimo(void)
1615{
1616	VNET_ITERATOR_DECL(vnet_iter);
1617
1618	VNET_LIST_RLOCK_NOSLEEP();
1619	VNET_FOREACH(vnet_iter) {
1620		CURVNET_SET(vnet_iter);
1621		igmp_fasttimo_vnet();
1622		CURVNET_RESTORE();
1623	}
1624	VNET_LIST_RUNLOCK_NOSLEEP();
1625}
1626
1627/*
1628 * Fast timeout handler (per-vnet).
1629 * Sends are shuffled off to a netisr to deal with Giant.
1630 *
1631 * VIMAGE: Assume caller has set up our curvnet.
1632 */
1633static void
1634igmp_fasttimo_vnet(void)
1635{
1636	struct ifqueue		 scq;	/* State-change packets */
1637	struct ifqueue		 qrq;	/* Query response packets */
1638	struct ifnet		*ifp;
1639	struct igmp_ifinfo	*igi;
1640	struct ifmultiaddr	*ifma;
1641	struct in_multi		*inm;
1642	int			 loop, uri_fasthz;
1643
1644	loop = 0;
1645	uri_fasthz = 0;
1646
1647	/*
1648	 * Quick check to see if any work needs to be done, in order to
1649	 * minimize the overhead of fasttimo processing.
1650	 * SMPng: XXX Unlocked reads.
1651	 */
1652	if (!V_current_state_timers_running &&
1653	    !V_interface_timers_running &&
1654	    !V_state_change_timers_running)
1655		return;
1656
1657	IN_MULTI_LOCK();
1658	IGMP_LOCK();
1659
1660	/*
1661	 * IGMPv3 General Query response timer processing.
1662	 */
1663	if (V_interface_timers_running) {
1664		CTR1(KTR_IGMPV3, "%s: interface timers running", __func__);
1665
1666		V_interface_timers_running = 0;
1667		LIST_FOREACH(igi, &V_igi_head, igi_link) {
1668			if (igi->igi_v3_timer == 0) {
1669				/* Do nothing. */
1670			} else if (--igi->igi_v3_timer == 0) {
1671				igmp_v3_dispatch_general_query(igi);
1672			} else {
1673				V_interface_timers_running = 1;
1674			}
1675		}
1676	}
1677
1678	if (!V_current_state_timers_running &&
1679	    !V_state_change_timers_running)
1680		goto out_locked;
1681
1682	V_current_state_timers_running = 0;
1683	V_state_change_timers_running = 0;
1684
1685	CTR1(KTR_IGMPV3, "%s: state change timers running", __func__);
1686
1687	/*
1688	 * IGMPv1/v2/v3 host report and state-change timer processing.
1689	 * Note: Processing a v3 group timer may remove a node.
1690	 */
1691	LIST_FOREACH(igi, &V_igi_head, igi_link) {
1692		ifp = igi->igi_ifp;
1693
1694		if (igi->igi_version == IGMP_VERSION_3) {
1695			loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0;
1696			uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri *
1697			    PR_FASTHZ);
1698
1699			memset(&qrq, 0, sizeof(struct ifqueue));
1700			IFQ_SET_MAXLEN(&qrq, IGMP_MAX_G_GS_PACKETS);
1701
1702			memset(&scq, 0, sizeof(struct ifqueue));
1703			IFQ_SET_MAXLEN(&scq, IGMP_MAX_STATE_CHANGE_PACKETS);
1704		}
1705
1706		IF_ADDR_RLOCK(ifp);
1707		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1708			if (ifma->ifma_addr->sa_family != AF_INET ||
1709			    ifma->ifma_protospec == NULL)
1710				continue;
1711			inm = (struct in_multi *)ifma->ifma_protospec;
1712			switch (igi->igi_version) {
1713			case IGMP_VERSION_1:
1714			case IGMP_VERSION_2:
1715				igmp_v1v2_process_group_timer(inm,
1716				    igi->igi_version);
1717				break;
1718			case IGMP_VERSION_3:
1719				igmp_v3_process_group_timers(igi, &qrq,
1720				    &scq, inm, uri_fasthz);
1721				break;
1722			}
1723		}
1724		IF_ADDR_RUNLOCK(ifp);
1725
1726		if (igi->igi_version == IGMP_VERSION_3) {
1727			struct in_multi		*tinm;
1728
1729			igmp_dispatch_queue(&qrq, 0, loop);
1730			igmp_dispatch_queue(&scq, 0, loop);
1731
1732			/*
1733			 * Free the in_multi reference(s) for this
1734			 * IGMP lifecycle.
1735			 */
1736			SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead,
1737			    inm_nrele, tinm) {
1738				SLIST_REMOVE_HEAD(&igi->igi_relinmhead,
1739				    inm_nrele);
1740				inm_release_locked(inm);
1741			}
1742		}
1743	}
1744
1745out_locked:
1746	IGMP_UNLOCK();
1747	IN_MULTI_UNLOCK();
1748}
1749
1750/*
1751 * Update host report group timer for IGMPv1/v2.
1752 * Will update the global pending timer flags.
1753 */
1754static void
1755igmp_v1v2_process_group_timer(struct in_multi *inm, const int version)
1756{
1757	int report_timer_expired;
1758
1759	IN_MULTI_LOCK_ASSERT();
1760	IGMP_LOCK_ASSERT();
1761
1762	if (inm->inm_timer == 0) {
1763		report_timer_expired = 0;
1764	} else if (--inm->inm_timer == 0) {
1765		report_timer_expired = 1;
1766	} else {
1767		V_current_state_timers_running = 1;
1768		return;
1769	}
1770
1771	switch (inm->inm_state) {
1772	case IGMP_NOT_MEMBER:
1773	case IGMP_SILENT_MEMBER:
1774	case IGMP_IDLE_MEMBER:
1775	case IGMP_LAZY_MEMBER:
1776	case IGMP_SLEEPING_MEMBER:
1777	case IGMP_AWAKENING_MEMBER:
1778		break;
1779	case IGMP_REPORTING_MEMBER:
1780		if (report_timer_expired) {
1781			inm->inm_state = IGMP_IDLE_MEMBER;
1782			(void)igmp_v1v2_queue_report(inm,
1783			    (version == IGMP_VERSION_2) ?
1784			     IGMP_v2_HOST_MEMBERSHIP_REPORT :
1785			     IGMP_v1_HOST_MEMBERSHIP_REPORT);
1786		}
1787		break;
1788	case IGMP_G_QUERY_PENDING_MEMBER:
1789	case IGMP_SG_QUERY_PENDING_MEMBER:
1790	case IGMP_LEAVING_MEMBER:
1791		break;
1792	}
1793}
1794
1795/*
1796 * Update a group's timers for IGMPv3.
1797 * Will update the global pending timer flags.
1798 * Note: Unlocked read from igi.
1799 */
1800static void
1801igmp_v3_process_group_timers(struct igmp_ifinfo *igi,
1802    struct ifqueue *qrq, struct ifqueue *scq,
1803    struct in_multi *inm, const int uri_fasthz)
1804{
1805	int query_response_timer_expired;
1806	int state_change_retransmit_timer_expired;
1807
1808	IN_MULTI_LOCK_ASSERT();
1809	IGMP_LOCK_ASSERT();
1810
1811	query_response_timer_expired = 0;
1812	state_change_retransmit_timer_expired = 0;
1813
1814	/*
1815	 * During a transition from v1/v2 compatibility mode back to v3,
1816	 * a group record in REPORTING state may still have its group
1817	 * timer active. This is a no-op in this function; it is easier
1818	 * to deal with it here than to complicate the slow-timeout path.
1819	 */
1820	if (inm->inm_timer == 0) {
1821		query_response_timer_expired = 0;
1822	} else if (--inm->inm_timer == 0) {
1823		query_response_timer_expired = 1;
1824	} else {
1825		V_current_state_timers_running = 1;
1826	}
1827
1828	if (inm->inm_sctimer == 0) {
1829		state_change_retransmit_timer_expired = 0;
1830	} else if (--inm->inm_sctimer == 0) {
1831		state_change_retransmit_timer_expired = 1;
1832	} else {
1833		V_state_change_timers_running = 1;
1834	}
1835
1836	/* We are in fasttimo, so be quick about it. */
1837	if (!state_change_retransmit_timer_expired &&
1838	    !query_response_timer_expired)
1839		return;
1840
1841	switch (inm->inm_state) {
1842	case IGMP_NOT_MEMBER:
1843	case IGMP_SILENT_MEMBER:
1844	case IGMP_SLEEPING_MEMBER:
1845	case IGMP_LAZY_MEMBER:
1846	case IGMP_AWAKENING_MEMBER:
1847	case IGMP_IDLE_MEMBER:
1848		break;
1849	case IGMP_G_QUERY_PENDING_MEMBER:
1850	case IGMP_SG_QUERY_PENDING_MEMBER:
1851		/*
1852		 * Respond to a previously pending Group-Specific
1853		 * or Group-and-Source-Specific query by enqueueing
1854		 * the appropriate Current-State report for
1855		 * immediate transmission.
1856		 */
1857		if (query_response_timer_expired) {
1858			int retval;
1859
1860			retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1,
1861			    (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER));
1862			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
1863			    __func__, retval);
1864			inm->inm_state = IGMP_REPORTING_MEMBER;
1865			/* XXX Clear recorded sources for next time. */
1866			inm_clear_recorded(inm);
1867		}
1868		/* FALLTHROUGH */
1869	case IGMP_REPORTING_MEMBER:
1870	case IGMP_LEAVING_MEMBER:
1871		if (state_change_retransmit_timer_expired) {
1872			/*
1873			 * State-change retransmission timer fired.
1874			 * If there are any further pending retransmissions,
1875			 * set the global pending state-change flag, and
1876			 * reset the timer.
1877			 */
1878			if (--inm->inm_scrv > 0) {
1879				inm->inm_sctimer = uri_fasthz;
1880				V_state_change_timers_running = 1;
1881			}
1882			/*
1883			 * Retransmit the previously computed state-change
1884			 * report. If there are no further pending
1885			 * retransmissions, the mbuf queue will be consumed.
1886			 * Update T0 state to T1 as we have now sent
1887			 * a state-change.
1888			 */
1889			(void)igmp_v3_merge_state_changes(inm, scq);
1890
1891			inm_commit(inm);
1892			CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
1893			    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
1894
1895			/*
1896			 * If we are leaving the group for good, make sure
1897			 * we release IGMP's reference to it.
1898			 * This release must be deferred using a SLIST,
1899			 * as we are called from a loop which traverses
1900			 * the in_ifmultiaddr TAILQ.
1901			 */
1902			if (inm->inm_state == IGMP_LEAVING_MEMBER &&
1903			    inm->inm_scrv == 0) {
1904				inm->inm_state = IGMP_NOT_MEMBER;
1905				SLIST_INSERT_HEAD(&igi->igi_relinmhead,
1906				    inm, inm_nrele);
1907			}
1908		}
1909		break;
1910	}
1911}
1912
1913
1914/*
1915 * Suppress a group's pending response to a group or source/group query.
1916 *
1917 * Do NOT suppress state changes. This leads to IGMPv3 inconsistency.
1918 * Do NOT update ST1/ST0 as this operation merely suppresses
1919 * the currently pending group record.
1920 * Do NOT suppress the response to a general query. It is possible but
1921 * it would require adding another state or flag.
1922 */
1923static void
1924igmp_v3_suppress_group_record(struct in_multi *inm)
1925{
1926
1927	IN_MULTI_LOCK_ASSERT();
1928
1929	KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3,
1930		("%s: not IGMPv3 mode on link", __func__));
1931
1932	if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER ||
1933	    inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER)
1934		return;
1935
1936	if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)
1937		inm_clear_recorded(inm);
1938
1939	inm->inm_timer = 0;
1940	inm->inm_state = IGMP_REPORTING_MEMBER;
1941}
1942
1943/*
1944 * Switch to a different IGMP version on the given interface,
1945 * as per Section 7.2.1.
1946 */
1947static void
1948igmp_set_version(struct igmp_ifinfo *igi, const int version)
1949{
1950	int old_version_timer;
1951
1952	IGMP_LOCK_ASSERT();
1953
1954	CTR4(KTR_IGMPV3, "%s: switching to v%d on ifp %p(%s)", __func__,
1955	    version, igi->igi_ifp, igi->igi_ifp->if_xname);
1956
1957	if (version == IGMP_VERSION_1 || version == IGMP_VERSION_2) {
1958		/*
1959		 * Compute the "Older Version Querier Present" timer as per
1960		 * Section 8.12.
1961		 */
1962		old_version_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri;
1963		old_version_timer *= PR_SLOWHZ;
1964
1965		if (version == IGMP_VERSION_1) {
1966			igi->igi_v1_timer = old_version_timer;
1967			igi->igi_v2_timer = 0;
1968		} else if (version == IGMP_VERSION_2) {
1969			igi->igi_v1_timer = 0;
1970			igi->igi_v2_timer = old_version_timer;
1971		}
1972	}
1973
1974	if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) {
1975		if (igi->igi_version != IGMP_VERSION_2) {
1976			igi->igi_version = IGMP_VERSION_2;
1977			igmp_v3_cancel_link_timers(igi);
1978		}
1979	} else if (igi->igi_v1_timer > 0) {
1980		if (igi->igi_version != IGMP_VERSION_1) {
1981			igi->igi_version = IGMP_VERSION_1;
1982			igmp_v3_cancel_link_timers(igi);
1983		}
1984	}
1985}
1986
1987/*
1988 * Cancel pending IGMPv3 timers for the given link and all groups
1989 * joined on it; state-change, general-query, and group-query timers.
1990 *
1991 * Only ever called on a transition from v3 to Compatibility mode. Kill
1992 * the timers stone dead (this may be expensive for large N groups), they
1993 * will be restarted if Compatibility Mode deems that they must be due to
1994 * query processing.
1995 */
1996static void
1997igmp_v3_cancel_link_timers(struct igmp_ifinfo *igi)
1998{
1999	struct ifmultiaddr	*ifma;
2000	struct ifnet		*ifp;
2001	struct in_multi		*inm, *tinm;
2002
2003	CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__,
2004	    igi->igi_ifp, igi->igi_ifp->if_xname);
2005
2006	IN_MULTI_LOCK_ASSERT();
2007	IGMP_LOCK_ASSERT();
2008
2009	/*
2010	 * Stop the v3 General Query Response on this link stone dead.
2011	 * If fasttimo is woken up due to V_interface_timers_running,
2012	 * the flag will be cleared if there are no pending link timers.
2013	 */
2014	igi->igi_v3_timer = 0;
2015
2016	/*
2017	 * Now clear the current-state and state-change report timers
2018	 * for all memberships scoped to this link.
2019	 */
2020	ifp = igi->igi_ifp;
2021	IF_ADDR_RLOCK(ifp);
2022	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2023		if (ifma->ifma_addr->sa_family != AF_INET ||
2024		    ifma->ifma_protospec == NULL)
2025			continue;
2026		inm = (struct in_multi *)ifma->ifma_protospec;
2027		switch (inm->inm_state) {
2028		case IGMP_NOT_MEMBER:
2029		case IGMP_SILENT_MEMBER:
2030		case IGMP_IDLE_MEMBER:
2031		case IGMP_LAZY_MEMBER:
2032		case IGMP_SLEEPING_MEMBER:
2033		case IGMP_AWAKENING_MEMBER:
2034			/*
2035			 * These states are either not relevant in v3 mode,
2036			 * or are unreported. Do nothing.
2037			 */
2038			break;
2039		case IGMP_LEAVING_MEMBER:
2040			/*
2041			 * If we are leaving the group and switching to
2042			 * compatibility mode, we need to release the final
2043			 * reference held for issuing the INCLUDE {}, and
2044			 * transition to REPORTING to ensure the host leave
2045			 * message is sent upstream to the old querier --
2046			 * transition to NOT would lose the leave and race.
2047			 */
2048			SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele);
2049			/* FALLTHROUGH */
2050		case IGMP_G_QUERY_PENDING_MEMBER:
2051		case IGMP_SG_QUERY_PENDING_MEMBER:
2052			inm_clear_recorded(inm);
2053			/* FALLTHROUGH */
2054		case IGMP_REPORTING_MEMBER:
2055			inm->inm_state = IGMP_REPORTING_MEMBER;
2056			break;
2057		}
2058		/*
2059		 * Always clear state-change and group report timers.
2060		 * Free any pending IGMPv3 state-change records.
2061		 */
2062		inm->inm_sctimer = 0;
2063		inm->inm_timer = 0;
2064		_IF_DRAIN(&inm->inm_scq);
2065	}
2066	IF_ADDR_RUNLOCK(ifp);
2067	SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) {
2068		SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele);
2069		inm_release_locked(inm);
2070	}
2071}
2072
2073/*
2074 * Update the Older Version Querier Present timers for a link.
2075 * See Section 7.2.1 of RFC 3376.
2076 */
2077static void
2078igmp_v1v2_process_querier_timers(struct igmp_ifinfo *igi)
2079{
2080
2081	IGMP_LOCK_ASSERT();
2082
2083	if (igi->igi_v1_timer == 0 && igi->igi_v2_timer == 0) {
2084		/*
2085		 * IGMPv1 and IGMPv2 Querier Present timers expired.
2086		 *
2087		 * Revert to IGMPv3.
2088		 */
2089		if (igi->igi_version != IGMP_VERSION_3) {
2090			CTR5(KTR_IGMPV3,
2091			    "%s: transition from v%d -> v%d on %p(%s)",
2092			    __func__, igi->igi_version, IGMP_VERSION_3,
2093			    igi->igi_ifp, igi->igi_ifp->if_xname);
2094			igi->igi_version = IGMP_VERSION_3;
2095		}
2096	} else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) {
2097		/*
2098		 * IGMPv1 Querier Present timer expired,
2099		 * IGMPv2 Querier Present timer running.
2100		 * If IGMPv2 was disabled since last timeout,
2101		 * revert to IGMPv3.
2102		 * If IGMPv2 is enabled, revert to IGMPv2.
2103		 */
2104		if (!V_igmp_v2enable) {
2105			CTR5(KTR_IGMPV3,
2106			    "%s: transition from v%d -> v%d on %p(%s)",
2107			    __func__, igi->igi_version, IGMP_VERSION_3,
2108			    igi->igi_ifp, igi->igi_ifp->if_xname);
2109			igi->igi_v2_timer = 0;
2110			igi->igi_version = IGMP_VERSION_3;
2111		} else {
2112			--igi->igi_v2_timer;
2113			if (igi->igi_version != IGMP_VERSION_2) {
2114				CTR5(KTR_IGMPV3,
2115				    "%s: transition from v%d -> v%d on %p(%s)",
2116				    __func__, igi->igi_version, IGMP_VERSION_2,
2117				    igi->igi_ifp, igi->igi_ifp->if_xname);
2118				igi->igi_version = IGMP_VERSION_2;
2119				igmp_v3_cancel_link_timers(igi);
2120			}
2121		}
2122	} else if (igi->igi_v1_timer > 0) {
2123		/*
2124		 * IGMPv1 Querier Present timer running.
2125		 * Stop IGMPv2 timer if running.
2126		 *
2127		 * If IGMPv1 was disabled since last timeout,
2128		 * revert to IGMPv3.
2129		 * If IGMPv1 is enabled, reset IGMPv2 timer if running.
2130		 */
2131		if (!V_igmp_v1enable) {
2132			CTR5(KTR_IGMPV3,
2133			    "%s: transition from v%d -> v%d on %p(%s)",
2134			    __func__, igi->igi_version, IGMP_VERSION_3,
2135			    igi->igi_ifp, igi->igi_ifp->if_xname);
2136			igi->igi_v1_timer = 0;
2137			igi->igi_version = IGMP_VERSION_3;
2138		} else {
2139			--igi->igi_v1_timer;
2140		}
2141		if (igi->igi_v2_timer > 0) {
2142			CTR3(KTR_IGMPV3,
2143			    "%s: cancel v2 timer on %p(%s)",
2144			    __func__, igi->igi_ifp, igi->igi_ifp->if_xname);
2145			igi->igi_v2_timer = 0;
2146		}
2147	}
2148}
2149
2150/*
2151 * Global slowtimo handler.
2152 * VIMAGE: Timeout handlers are expected to service all vimages.
2153 */
2154void
2155igmp_slowtimo(void)
2156{
2157	VNET_ITERATOR_DECL(vnet_iter);
2158
2159	VNET_LIST_RLOCK_NOSLEEP();
2160	VNET_FOREACH(vnet_iter) {
2161		CURVNET_SET(vnet_iter);
2162		igmp_slowtimo_vnet();
2163		CURVNET_RESTORE();
2164	}
2165	VNET_LIST_RUNLOCK_NOSLEEP();
2166}
2167
2168/*
2169 * Per-vnet slowtimo handler.
2170 */
2171static void
2172igmp_slowtimo_vnet(void)
2173{
2174	struct igmp_ifinfo *igi;
2175
2176	IGMP_LOCK();
2177
2178	LIST_FOREACH(igi, &V_igi_head, igi_link) {
2179		igmp_v1v2_process_querier_timers(igi);
2180	}
2181
2182	IGMP_UNLOCK();
2183}
2184
2185/*
2186 * Dispatch an IGMPv1/v2 host report or leave message.
2187 * These are always small enough to fit inside a single mbuf.
2188 */
2189static int
2190igmp_v1v2_queue_report(struct in_multi *inm, const int type)
2191{
2192	struct ifnet		*ifp;
2193	struct igmp		*igmp;
2194	struct ip		*ip;
2195	struct mbuf		*m;
2196
2197	IN_MULTI_LOCK_ASSERT();
2198	IGMP_LOCK_ASSERT();
2199
2200	ifp = inm->inm_ifp;
2201
2202	m = m_gethdr(M_NOWAIT, MT_DATA);
2203	if (m == NULL)
2204		return (ENOMEM);
2205	MH_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp));
2206
2207	m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp);
2208
2209	m->m_data += sizeof(struct ip);
2210	m->m_len = sizeof(struct igmp);
2211
2212	igmp = mtod(m, struct igmp *);
2213	igmp->igmp_type = type;
2214	igmp->igmp_code = 0;
2215	igmp->igmp_group = inm->inm_addr;
2216	igmp->igmp_cksum = 0;
2217	igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp));
2218
2219	m->m_data -= sizeof(struct ip);
2220	m->m_len += sizeof(struct ip);
2221
2222	ip = mtod(m, struct ip *);
2223	ip->ip_tos = 0;
2224	ip->ip_len = htons(sizeof(struct ip) + sizeof(struct igmp));
2225	ip->ip_off = 0;
2226	ip->ip_p = IPPROTO_IGMP;
2227	ip->ip_src.s_addr = INADDR_ANY;
2228
2229	if (type == IGMP_HOST_LEAVE_MESSAGE)
2230		ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP);
2231	else
2232		ip->ip_dst = inm->inm_addr;
2233
2234	igmp_save_context(m, ifp);
2235
2236	m->m_flags |= M_IGMPV2;
2237	if (inm->inm_igi->igi_flags & IGIF_LOOPBACK)
2238		m->m_flags |= M_IGMP_LOOP;
2239
2240	CTR2(KTR_IGMPV3, "%s: netisr_dispatch(NETISR_IGMP, %p)", __func__, m);
2241	netisr_dispatch(NETISR_IGMP, m);
2242
2243	return (0);
2244}
2245
2246/*
2247 * Process a state change from the upper layer for the given IPv4 group.
2248 *
2249 * Each socket holds a reference on the in_multi in its own ip_moptions.
2250 * The socket layer will have made the necessary updates to.the group
2251 * state, it is now up to IGMP to issue a state change report if there
2252 * has been any change between T0 (when the last state-change was issued)
2253 * and T1 (now).
2254 *
2255 * We use the IGMPv3 state machine at group level. The IGMP module
2256 * however makes the decision as to which IGMP protocol version to speak.
2257 * A state change *from* INCLUDE {} always means an initial join.
2258 * A state change *to* INCLUDE {} always means a final leave.
2259 *
2260 * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can
2261 * save ourselves a bunch of work; any exclusive mode groups need not
2262 * compute source filter lists.
2263 *
2264 * VIMAGE: curvnet should have been set by caller, as this routine
2265 * is called from the socket option handlers.
2266 */
2267int
2268igmp_change_state(struct in_multi *inm)
2269{
2270	struct igmp_ifinfo *igi;
2271	struct ifnet *ifp;
2272	int error;
2273
2274	IN_MULTI_LOCK_ASSERT();
2275
2276	error = 0;
2277
2278	/*
2279	 * Try to detect if the upper layer just asked us to change state
2280	 * for an interface which has now gone away.
2281	 */
2282	KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__));
2283	ifp = inm->inm_ifma->ifma_ifp;
2284	/*
2285	 * Sanity check that netinet's notion of ifp is the
2286	 * same as net's.
2287	 */
2288	KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__));
2289
2290	IGMP_LOCK();
2291
2292	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
2293	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
2294
2295	/*
2296	 * If we detect a state transition to or from MCAST_UNDEFINED
2297	 * for this group, then we are starting or finishing an IGMP
2298	 * life cycle for this group.
2299	 */
2300	if (inm->inm_st[1].iss_fmode != inm->inm_st[0].iss_fmode) {
2301		CTR3(KTR_IGMPV3, "%s: inm transition %d -> %d", __func__,
2302		    inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode);
2303		if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) {
2304			CTR1(KTR_IGMPV3, "%s: initial join", __func__);
2305			error = igmp_initial_join(inm, igi);
2306			goto out_locked;
2307		} else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) {
2308			CTR1(KTR_IGMPV3, "%s: final leave", __func__);
2309			igmp_final_leave(inm, igi);
2310			goto out_locked;
2311		}
2312	} else {
2313		CTR1(KTR_IGMPV3, "%s: filter set change", __func__);
2314	}
2315
2316	error = igmp_handle_state_change(inm, igi);
2317
2318out_locked:
2319	IGMP_UNLOCK();
2320	return (error);
2321}
2322
2323/*
2324 * Perform the initial join for an IGMP group.
2325 *
2326 * When joining a group:
2327 *  If the group should have its IGMP traffic suppressed, do nothing.
2328 *  IGMPv1 starts sending IGMPv1 host membership reports.
2329 *  IGMPv2 starts sending IGMPv2 host membership reports.
2330 *  IGMPv3 will schedule an IGMPv3 state-change report containing the
2331 *  initial state of the membership.
2332 */
2333static int
2334igmp_initial_join(struct in_multi *inm, struct igmp_ifinfo *igi)
2335{
2336	struct ifnet		*ifp;
2337	struct ifqueue		*ifq;
2338	int			 error, retval, syncstates;
2339
2340	CTR4(KTR_IGMPV3, "%s: initial join %s on ifp %p(%s)",
2341	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
2342	    inm->inm_ifp->if_xname);
2343
2344	error = 0;
2345	syncstates = 1;
2346
2347	ifp = inm->inm_ifp;
2348
2349	IN_MULTI_LOCK_ASSERT();
2350	IGMP_LOCK_ASSERT();
2351
2352	KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__));
2353
2354	/*
2355	 * Groups joined on loopback or marked as 'not reported',
2356	 * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and
2357	 * are never reported in any IGMP protocol exchanges.
2358	 * All other groups enter the appropriate IGMP state machine
2359	 * for the version in use on this link.
2360	 * A link marked as IGIF_SILENT causes IGMP to be completely
2361	 * disabled for the link.
2362	 */
2363	if ((ifp->if_flags & IFF_LOOPBACK) ||
2364	    (igi->igi_flags & IGIF_SILENT) ||
2365	    !igmp_isgroupreported(inm->inm_addr)) {
2366		CTR1(KTR_IGMPV3,
2367"%s: not kicking state machine for silent group", __func__);
2368		inm->inm_state = IGMP_SILENT_MEMBER;
2369		inm->inm_timer = 0;
2370	} else {
2371		/*
2372		 * Deal with overlapping in_multi lifecycle.
2373		 * If this group was LEAVING, then make sure
2374		 * we drop the reference we picked up to keep the
2375		 * group around for the final INCLUDE {} enqueue.
2376		 */
2377		if (igi->igi_version == IGMP_VERSION_3 &&
2378		    inm->inm_state == IGMP_LEAVING_MEMBER)
2379			inm_release_locked(inm);
2380
2381		inm->inm_state = IGMP_REPORTING_MEMBER;
2382
2383		switch (igi->igi_version) {
2384		case IGMP_VERSION_1:
2385		case IGMP_VERSION_2:
2386			inm->inm_state = IGMP_IDLE_MEMBER;
2387			error = igmp_v1v2_queue_report(inm,
2388			    (igi->igi_version == IGMP_VERSION_2) ?
2389			     IGMP_v2_HOST_MEMBERSHIP_REPORT :
2390			     IGMP_v1_HOST_MEMBERSHIP_REPORT);
2391			if (error == 0) {
2392				inm->inm_timer = IGMP_RANDOM_DELAY(
2393				    IGMP_V1V2_MAX_RI * PR_FASTHZ);
2394				V_current_state_timers_running = 1;
2395			}
2396			break;
2397
2398		case IGMP_VERSION_3:
2399			/*
2400			 * Defer update of T0 to T1, until the first copy
2401			 * of the state change has been transmitted.
2402			 */
2403			syncstates = 0;
2404
2405			/*
2406			 * Immediately enqueue a State-Change Report for
2407			 * this interface, freeing any previous reports.
2408			 * Don't kick the timers if there is nothing to do,
2409			 * or if an error occurred.
2410			 */
2411			ifq = &inm->inm_scq;
2412			_IF_DRAIN(ifq);
2413			retval = igmp_v3_enqueue_group_record(ifq, inm, 1,
2414			    0, 0);
2415			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
2416			    __func__, retval);
2417			if (retval <= 0) {
2418				error = retval * -1;
2419				break;
2420			}
2421
2422			/*
2423			 * Schedule transmission of pending state-change
2424			 * report up to RV times for this link. The timer
2425			 * will fire at the next igmp_fasttimo (~200ms),
2426			 * giving us an opportunity to merge the reports.
2427			 */
2428			if (igi->igi_flags & IGIF_LOOPBACK) {
2429				inm->inm_scrv = 1;
2430			} else {
2431				KASSERT(igi->igi_rv > 1,
2432				   ("%s: invalid robustness %d", __func__,
2433				    igi->igi_rv));
2434				inm->inm_scrv = igi->igi_rv;
2435			}
2436			inm->inm_sctimer = 1;
2437			V_state_change_timers_running = 1;
2438
2439			error = 0;
2440			break;
2441		}
2442	}
2443
2444	/*
2445	 * Only update the T0 state if state change is atomic,
2446	 * i.e. we don't need to wait for a timer to fire before we
2447	 * can consider the state change to have been communicated.
2448	 */
2449	if (syncstates) {
2450		inm_commit(inm);
2451		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
2452		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
2453	}
2454
2455	return (error);
2456}
2457
2458/*
2459 * Issue an intermediate state change during the IGMP life-cycle.
2460 */
2461static int
2462igmp_handle_state_change(struct in_multi *inm, struct igmp_ifinfo *igi)
2463{
2464	struct ifnet		*ifp;
2465	int			 retval;
2466
2467	CTR4(KTR_IGMPV3, "%s: state change for %s on ifp %p(%s)",
2468	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
2469	    inm->inm_ifp->if_xname);
2470
2471	ifp = inm->inm_ifp;
2472
2473	IN_MULTI_LOCK_ASSERT();
2474	IGMP_LOCK_ASSERT();
2475
2476	KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__));
2477
2478	if ((ifp->if_flags & IFF_LOOPBACK) ||
2479	    (igi->igi_flags & IGIF_SILENT) ||
2480	    !igmp_isgroupreported(inm->inm_addr) ||
2481	    (igi->igi_version != IGMP_VERSION_3)) {
2482		if (!igmp_isgroupreported(inm->inm_addr)) {
2483			CTR1(KTR_IGMPV3,
2484"%s: not kicking state machine for silent group", __func__);
2485		}
2486		CTR1(KTR_IGMPV3, "%s: nothing to do", __func__);
2487		inm_commit(inm);
2488		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
2489		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
2490		return (0);
2491	}
2492
2493	_IF_DRAIN(&inm->inm_scq);
2494
2495	retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0);
2496	CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval);
2497	if (retval <= 0)
2498		return (-retval);
2499
2500	/*
2501	 * If record(s) were enqueued, start the state-change
2502	 * report timer for this group.
2503	 */
2504	inm->inm_scrv = ((igi->igi_flags & IGIF_LOOPBACK) ? 1 : igi->igi_rv);
2505	inm->inm_sctimer = 1;
2506	V_state_change_timers_running = 1;
2507
2508	return (0);
2509}
2510
2511/*
2512 * Perform the final leave for an IGMP group.
2513 *
2514 * When leaving a group:
2515 *  IGMPv1 does nothing.
2516 *  IGMPv2 sends a host leave message, if and only if we are the reporter.
2517 *  IGMPv3 enqueues a state-change report containing a transition
2518 *  to INCLUDE {} for immediate transmission.
2519 */
2520static void
2521igmp_final_leave(struct in_multi *inm, struct igmp_ifinfo *igi)
2522{
2523	int syncstates;
2524
2525	syncstates = 1;
2526
2527	CTR4(KTR_IGMPV3, "%s: final leave %s on ifp %p(%s)",
2528	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
2529	    inm->inm_ifp->if_xname);
2530
2531	IN_MULTI_LOCK_ASSERT();
2532	IGMP_LOCK_ASSERT();
2533
2534	switch (inm->inm_state) {
2535	case IGMP_NOT_MEMBER:
2536	case IGMP_SILENT_MEMBER:
2537	case IGMP_LEAVING_MEMBER:
2538		/* Already leaving or left; do nothing. */
2539		CTR1(KTR_IGMPV3,
2540"%s: not kicking state machine for silent group", __func__);
2541		break;
2542	case IGMP_REPORTING_MEMBER:
2543	case IGMP_IDLE_MEMBER:
2544	case IGMP_G_QUERY_PENDING_MEMBER:
2545	case IGMP_SG_QUERY_PENDING_MEMBER:
2546		if (igi->igi_version == IGMP_VERSION_2) {
2547#ifdef INVARIANTS
2548			if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER ||
2549			    inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)
2550			panic("%s: IGMPv3 state reached, not IGMPv3 mode",
2551			     __func__);
2552#endif
2553			igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE);
2554			inm->inm_state = IGMP_NOT_MEMBER;
2555		} else if (igi->igi_version == IGMP_VERSION_3) {
2556			/*
2557			 * Stop group timer and all pending reports.
2558			 * Immediately enqueue a state-change report
2559			 * TO_IN {} to be sent on the next fast timeout,
2560			 * giving us an opportunity to merge reports.
2561			 */
2562			_IF_DRAIN(&inm->inm_scq);
2563			inm->inm_timer = 0;
2564			if (igi->igi_flags & IGIF_LOOPBACK) {
2565				inm->inm_scrv = 1;
2566			} else {
2567				inm->inm_scrv = igi->igi_rv;
2568			}
2569			CTR4(KTR_IGMPV3, "%s: Leaving %s/%s with %d "
2570			    "pending retransmissions.", __func__,
2571			    inet_ntoa(inm->inm_addr),
2572			    inm->inm_ifp->if_xname, inm->inm_scrv);
2573			if (inm->inm_scrv == 0) {
2574				inm->inm_state = IGMP_NOT_MEMBER;
2575				inm->inm_sctimer = 0;
2576			} else {
2577				int retval;
2578
2579				inm_acquire_locked(inm);
2580
2581				retval = igmp_v3_enqueue_group_record(
2582				    &inm->inm_scq, inm, 1, 0, 0);
2583				KASSERT(retval != 0,
2584				    ("%s: enqueue record = %d", __func__,
2585				     retval));
2586
2587				inm->inm_state = IGMP_LEAVING_MEMBER;
2588				inm->inm_sctimer = 1;
2589				V_state_change_timers_running = 1;
2590				syncstates = 0;
2591			}
2592			break;
2593		}
2594		break;
2595	case IGMP_LAZY_MEMBER:
2596	case IGMP_SLEEPING_MEMBER:
2597	case IGMP_AWAKENING_MEMBER:
2598		/* Our reports are suppressed; do nothing. */
2599		break;
2600	}
2601
2602	if (syncstates) {
2603		inm_commit(inm);
2604		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
2605		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
2606		inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
2607		CTR3(KTR_IGMPV3, "%s: T1 now MCAST_UNDEFINED for %s/%s",
2608		    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
2609	}
2610}
2611
2612/*
2613 * Enqueue an IGMPv3 group record to the given output queue.
2614 *
2615 * XXX This function could do with having the allocation code
2616 * split out, and the multiple-tree-walks coalesced into a single
2617 * routine as has been done in igmp_v3_enqueue_filter_change().
2618 *
2619 * If is_state_change is zero, a current-state record is appended.
2620 * If is_state_change is non-zero, a state-change report is appended.
2621 *
2622 * If is_group_query is non-zero, an mbuf packet chain is allocated.
2623 * If is_group_query is zero, and if there is a packet with free space
2624 * at the tail of the queue, it will be appended to providing there
2625 * is enough free space.
2626 * Otherwise a new mbuf packet chain is allocated.
2627 *
2628 * If is_source_query is non-zero, each source is checked to see if
2629 * it was recorded for a Group-Source query, and will be omitted if
2630 * it is not both in-mode and recorded.
2631 *
2632 * The function will attempt to allocate leading space in the packet
2633 * for the IP/IGMP header to be prepended without fragmenting the chain.
2634 *
2635 * If successful the size of all data appended to the queue is returned,
2636 * otherwise an error code less than zero is returned, or zero if
2637 * no record(s) were appended.
2638 */
2639static int
2640igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm,
2641    const int is_state_change, const int is_group_query,
2642    const int is_source_query)
2643{
2644	struct igmp_grouprec	 ig;
2645	struct igmp_grouprec	*pig;
2646	struct ifnet		*ifp;
2647	struct ip_msource	*ims, *nims;
2648	struct mbuf		*m0, *m, *md;
2649	int			 error, is_filter_list_change;
2650	int			 minrec0len, m0srcs, msrcs, nbytes, off;
2651	int			 record_has_sources;
2652	int			 now;
2653	int			 type;
2654	in_addr_t		 naddr;
2655	uint8_t			 mode;
2656
2657	IN_MULTI_LOCK_ASSERT();
2658
2659	error = 0;
2660	ifp = inm->inm_ifp;
2661	is_filter_list_change = 0;
2662	m = NULL;
2663	m0 = NULL;
2664	m0srcs = 0;
2665	msrcs = 0;
2666	nbytes = 0;
2667	nims = NULL;
2668	record_has_sources = 1;
2669	pig = NULL;
2670	type = IGMP_DO_NOTHING;
2671	mode = inm->inm_st[1].iss_fmode;
2672
2673	/*
2674	 * If we did not transition out of ASM mode during t0->t1,
2675	 * and there are no source nodes to process, we can skip
2676	 * the generation of source records.
2677	 */
2678	if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 &&
2679	    inm->inm_nsrc == 0)
2680		record_has_sources = 0;
2681
2682	if (is_state_change) {
2683		/*
2684		 * Queue a state change record.
2685		 * If the mode did not change, and there are non-ASM
2686		 * listeners or source filters present,
2687		 * we potentially need to issue two records for the group.
2688		 * If we are transitioning to MCAST_UNDEFINED, we need
2689		 * not send any sources.
2690		 * If there are ASM listeners, and there was no filter
2691		 * mode transition of any kind, do nothing.
2692		 */
2693		if (mode != inm->inm_st[0].iss_fmode) {
2694			if (mode == MCAST_EXCLUDE) {
2695				CTR1(KTR_IGMPV3, "%s: change to EXCLUDE",
2696				    __func__);
2697				type = IGMP_CHANGE_TO_EXCLUDE_MODE;
2698			} else {
2699				CTR1(KTR_IGMPV3, "%s: change to INCLUDE",
2700				    __func__);
2701				type = IGMP_CHANGE_TO_INCLUDE_MODE;
2702				if (mode == MCAST_UNDEFINED)
2703					record_has_sources = 0;
2704			}
2705		} else {
2706			if (record_has_sources) {
2707				is_filter_list_change = 1;
2708			} else {
2709				type = IGMP_DO_NOTHING;
2710			}
2711		}
2712	} else {
2713		/*
2714		 * Queue a current state record.
2715		 */
2716		if (mode == MCAST_EXCLUDE) {
2717			type = IGMP_MODE_IS_EXCLUDE;
2718		} else if (mode == MCAST_INCLUDE) {
2719			type = IGMP_MODE_IS_INCLUDE;
2720			KASSERT(inm->inm_st[1].iss_asm == 0,
2721			    ("%s: inm %p is INCLUDE but ASM count is %d",
2722			     __func__, inm, inm->inm_st[1].iss_asm));
2723		}
2724	}
2725
2726	/*
2727	 * Generate the filter list changes using a separate function.
2728	 */
2729	if (is_filter_list_change)
2730		return (igmp_v3_enqueue_filter_change(ifq, inm));
2731
2732	if (type == IGMP_DO_NOTHING) {
2733		CTR3(KTR_IGMPV3, "%s: nothing to do for %s/%s",
2734		    __func__, inet_ntoa(inm->inm_addr),
2735		    inm->inm_ifp->if_xname);
2736		return (0);
2737	}
2738
2739	/*
2740	 * If any sources are present, we must be able to fit at least
2741	 * one in the trailing space of the tail packet's mbuf,
2742	 * ideally more.
2743	 */
2744	minrec0len = sizeof(struct igmp_grouprec);
2745	if (record_has_sources)
2746		minrec0len += sizeof(in_addr_t);
2747
2748	CTR4(KTR_IGMPV3, "%s: queueing %s for %s/%s", __func__,
2749	    igmp_rec_type_to_str(type), inet_ntoa(inm->inm_addr),
2750	    inm->inm_ifp->if_xname);
2751
2752	/*
2753	 * Check if we have a packet in the tail of the queue for this
2754	 * group into which the first group record for this group will fit.
2755	 * Otherwise allocate a new packet.
2756	 * Always allocate leading space for IP+RA_OPT+IGMP+REPORT.
2757	 * Note: Group records for G/GSR query responses MUST be sent
2758	 * in their own packet.
2759	 */
2760	m0 = ifq->ifq_tail;
2761	if (!is_group_query &&
2762	    m0 != NULL &&
2763	    (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) &&
2764	    (m0->m_pkthdr.len + minrec0len) <
2765	     (ifp->if_mtu - IGMP_LEADINGSPACE)) {
2766		m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
2767			    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
2768		m = m0;
2769		CTR1(KTR_IGMPV3, "%s: use existing packet", __func__);
2770	} else {
2771		if (_IF_QFULL(ifq)) {
2772			CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__);
2773			return (-ENOMEM);
2774		}
2775		m = NULL;
2776		m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
2777		    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
2778		if (!is_state_change && !is_group_query) {
2779			m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
2780			if (m)
2781				m->m_data += IGMP_LEADINGSPACE;
2782		}
2783		if (m == NULL) {
2784			m = m_gethdr(M_NOWAIT, MT_DATA);
2785			if (m)
2786				MH_ALIGN(m, IGMP_LEADINGSPACE);
2787		}
2788		if (m == NULL)
2789			return (-ENOMEM);
2790
2791		igmp_save_context(m, ifp);
2792
2793		CTR1(KTR_IGMPV3, "%s: allocated first packet", __func__);
2794	}
2795
2796	/*
2797	 * Append group record.
2798	 * If we have sources, we don't know how many yet.
2799	 */
2800	ig.ig_type = type;
2801	ig.ig_datalen = 0;
2802	ig.ig_numsrc = 0;
2803	ig.ig_group = inm->inm_addr;
2804	if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) {
2805		if (m != m0)
2806			m_freem(m);
2807		CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__);
2808		return (-ENOMEM);
2809	}
2810	nbytes += sizeof(struct igmp_grouprec);
2811
2812	/*
2813	 * Append as many sources as will fit in the first packet.
2814	 * If we are appending to a new packet, the chain allocation
2815	 * may potentially use clusters; use m_getptr() in this case.
2816	 * If we are appending to an existing packet, we need to obtain
2817	 * a pointer to the group record after m_append(), in case a new
2818	 * mbuf was allocated.
2819	 * Only append sources which are in-mode at t1. If we are
2820	 * transitioning to MCAST_UNDEFINED state on the group, do not
2821	 * include source entries.
2822	 * Only report recorded sources in our filter set when responding
2823	 * to a group-source query.
2824	 */
2825	if (record_has_sources) {
2826		if (m == m0) {
2827			md = m_last(m);
2828			pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) +
2829			    md->m_len - nbytes);
2830		} else {
2831			md = m_getptr(m, 0, &off);
2832			pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) +
2833			    off);
2834		}
2835		msrcs = 0;
2836		RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) {
2837			CTR2(KTR_IGMPV3, "%s: visit node %s", __func__,
2838			    inet_ntoa_haddr(ims->ims_haddr));
2839			now = ims_get_mode(inm, ims, 1);
2840			CTR2(KTR_IGMPV3, "%s: node is %d", __func__, now);
2841			if ((now != mode) ||
2842			    (now == mode && mode == MCAST_UNDEFINED)) {
2843				CTR1(KTR_IGMPV3, "%s: skip node", __func__);
2844				continue;
2845			}
2846			if (is_source_query && ims->ims_stp == 0) {
2847				CTR1(KTR_IGMPV3, "%s: skip unrecorded node",
2848				    __func__);
2849				continue;
2850			}
2851			CTR1(KTR_IGMPV3, "%s: append node", __func__);
2852			naddr = htonl(ims->ims_haddr);
2853			if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) {
2854				if (m != m0)
2855					m_freem(m);
2856				CTR1(KTR_IGMPV3, "%s: m_append() failed.",
2857				    __func__);
2858				return (-ENOMEM);
2859			}
2860			nbytes += sizeof(in_addr_t);
2861			++msrcs;
2862			if (msrcs == m0srcs)
2863				break;
2864		}
2865		CTR2(KTR_IGMPV3, "%s: msrcs is %d this packet", __func__,
2866		    msrcs);
2867		pig->ig_numsrc = htons(msrcs);
2868		nbytes += (msrcs * sizeof(in_addr_t));
2869	}
2870
2871	if (is_source_query && msrcs == 0) {
2872		CTR1(KTR_IGMPV3, "%s: no recorded sources to report", __func__);
2873		if (m != m0)
2874			m_freem(m);
2875		return (0);
2876	}
2877
2878	/*
2879	 * We are good to go with first packet.
2880	 */
2881	if (m != m0) {
2882		CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__);
2883		m->m_pkthdr.PH_vt.vt_nrecs = 1;
2884		_IF_ENQUEUE(ifq, m);
2885	} else
2886		m->m_pkthdr.PH_vt.vt_nrecs++;
2887
2888	/*
2889	 * No further work needed if no source list in packet(s).
2890	 */
2891	if (!record_has_sources)
2892		return (nbytes);
2893
2894	/*
2895	 * Whilst sources remain to be announced, we need to allocate
2896	 * a new packet and fill out as many sources as will fit.
2897	 * Always try for a cluster first.
2898	 */
2899	while (nims != NULL) {
2900		if (_IF_QFULL(ifq)) {
2901			CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__);
2902			return (-ENOMEM);
2903		}
2904		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
2905		if (m)
2906			m->m_data += IGMP_LEADINGSPACE;
2907		if (m == NULL) {
2908			m = m_gethdr(M_NOWAIT, MT_DATA);
2909			if (m)
2910				MH_ALIGN(m, IGMP_LEADINGSPACE);
2911		}
2912		if (m == NULL)
2913			return (-ENOMEM);
2914		igmp_save_context(m, ifp);
2915		md = m_getptr(m, 0, &off);
2916		pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off);
2917		CTR1(KTR_IGMPV3, "%s: allocated next packet", __func__);
2918
2919		if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) {
2920			if (m != m0)
2921				m_freem(m);
2922			CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__);
2923			return (-ENOMEM);
2924		}
2925		m->m_pkthdr.PH_vt.vt_nrecs = 1;
2926		nbytes += sizeof(struct igmp_grouprec);
2927
2928		m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
2929		    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
2930
2931		msrcs = 0;
2932		RB_FOREACH_FROM(ims, ip_msource_tree, nims) {
2933			CTR2(KTR_IGMPV3, "%s: visit node %s", __func__,
2934			    inet_ntoa_haddr(ims->ims_haddr));
2935			now = ims_get_mode(inm, ims, 1);
2936			if ((now != mode) ||
2937			    (now == mode && mode == MCAST_UNDEFINED)) {
2938				CTR1(KTR_IGMPV3, "%s: skip node", __func__);
2939				continue;
2940			}
2941			if (is_source_query && ims->ims_stp == 0) {
2942				CTR1(KTR_IGMPV3, "%s: skip unrecorded node",
2943				    __func__);
2944				continue;
2945			}
2946			CTR1(KTR_IGMPV3, "%s: append node", __func__);
2947			naddr = htonl(ims->ims_haddr);
2948			if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) {
2949				if (m != m0)
2950					m_freem(m);
2951				CTR1(KTR_IGMPV3, "%s: m_append() failed.",
2952				    __func__);
2953				return (-ENOMEM);
2954			}
2955			++msrcs;
2956			if (msrcs == m0srcs)
2957				break;
2958		}
2959		pig->ig_numsrc = htons(msrcs);
2960		nbytes += (msrcs * sizeof(in_addr_t));
2961
2962		CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__);
2963		_IF_ENQUEUE(ifq, m);
2964	}
2965
2966	return (nbytes);
2967}
2968
2969/*
2970 * Type used to mark record pass completion.
2971 * We exploit the fact we can cast to this easily from the
2972 * current filter modes on each ip_msource node.
2973 */
2974typedef enum {
2975	REC_NONE = 0x00,	/* MCAST_UNDEFINED */
2976	REC_ALLOW = 0x01,	/* MCAST_INCLUDE */
2977	REC_BLOCK = 0x02,	/* MCAST_EXCLUDE */
2978	REC_FULL = REC_ALLOW | REC_BLOCK
2979} rectype_t;
2980
2981/*
2982 * Enqueue an IGMPv3 filter list change to the given output queue.
2983 *
2984 * Source list filter state is held in an RB-tree. When the filter list
2985 * for a group is changed without changing its mode, we need to compute
2986 * the deltas between T0 and T1 for each source in the filter set,
2987 * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records.
2988 *
2989 * As we may potentially queue two record types, and the entire R-B tree
2990 * needs to be walked at once, we break this out into its own function
2991 * so we can generate a tightly packed queue of packets.
2992 *
2993 * XXX This could be written to only use one tree walk, although that makes
2994 * serializing into the mbuf chains a bit harder. For now we do two walks
2995 * which makes things easier on us, and it may or may not be harder on
2996 * the L2 cache.
2997 *
2998 * If successful the size of all data appended to the queue is returned,
2999 * otherwise an error code less than zero is returned, or zero if
3000 * no record(s) were appended.
3001 */
3002static int
3003igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm)
3004{
3005	static const int MINRECLEN =
3006	    sizeof(struct igmp_grouprec) + sizeof(in_addr_t);
3007	struct ifnet		*ifp;
3008	struct igmp_grouprec	 ig;
3009	struct igmp_grouprec	*pig;
3010	struct ip_msource	*ims, *nims;
3011	struct mbuf		*m, *m0, *md;
3012	in_addr_t		 naddr;
3013	int			 m0srcs, nbytes, npbytes, off, rsrcs, schanged;
3014	int			 nallow, nblock;
3015	uint8_t			 mode, now, then;
3016	rectype_t		 crt, drt, nrt;
3017
3018	IN_MULTI_LOCK_ASSERT();
3019
3020	if (inm->inm_nsrc == 0 ||
3021	    (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0))
3022		return (0);
3023
3024	ifp = inm->inm_ifp;			/* interface */
3025	mode = inm->inm_st[1].iss_fmode;	/* filter mode at t1 */
3026	crt = REC_NONE;	/* current group record type */
3027	drt = REC_NONE;	/* mask of completed group record types */
3028	nrt = REC_NONE;	/* record type for current node */
3029	m0srcs = 0;	/* # source which will fit in current mbuf chain */
3030	nbytes = 0;	/* # of bytes appended to group's state-change queue */
3031	npbytes = 0;	/* # of bytes appended this packet */
3032	rsrcs = 0;	/* # sources encoded in current record */
3033	schanged = 0;	/* # nodes encoded in overall filter change */
3034	nallow = 0;	/* # of source entries in ALLOW_NEW */
3035	nblock = 0;	/* # of source entries in BLOCK_OLD */
3036	nims = NULL;	/* next tree node pointer */
3037
3038	/*
3039	 * For each possible filter record mode.
3040	 * The first kind of source we encounter tells us which
3041	 * is the first kind of record we start appending.
3042	 * If a node transitioned to UNDEFINED at t1, its mode is treated
3043	 * as the inverse of the group's filter mode.
3044	 */
3045	while (drt != REC_FULL) {
3046		do {
3047			m0 = ifq->ifq_tail;
3048			if (m0 != NULL &&
3049			    (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <=
3050			     IGMP_V3_REPORT_MAXRECS) &&
3051			    (m0->m_pkthdr.len + MINRECLEN) <
3052			     (ifp->if_mtu - IGMP_LEADINGSPACE)) {
3053				m = m0;
3054				m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
3055					    sizeof(struct igmp_grouprec)) /
3056				    sizeof(in_addr_t);
3057				CTR1(KTR_IGMPV3,
3058				    "%s: use previous packet", __func__);
3059			} else {
3060				m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
3061				if (m)
3062					m->m_data += IGMP_LEADINGSPACE;
3063				if (m == NULL) {
3064					m = m_gethdr(M_NOWAIT, MT_DATA);
3065					if (m)
3066						MH_ALIGN(m, IGMP_LEADINGSPACE);
3067				}
3068				if (m == NULL) {
3069					CTR1(KTR_IGMPV3,
3070					    "%s: m_get*() failed", __func__);
3071					return (-ENOMEM);
3072				}
3073				m->m_pkthdr.PH_vt.vt_nrecs = 0;
3074				igmp_save_context(m, ifp);
3075				m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
3076				    sizeof(struct igmp_grouprec)) /
3077				    sizeof(in_addr_t);
3078				npbytes = 0;
3079				CTR1(KTR_IGMPV3,
3080				    "%s: allocated new packet", __func__);
3081			}
3082			/*
3083			 * Append the IGMP group record header to the
3084			 * current packet's data area.
3085			 * Recalculate pointer to free space for next
3086			 * group record, in case m_append() allocated
3087			 * a new mbuf or cluster.
3088			 */
3089			memset(&ig, 0, sizeof(ig));
3090			ig.ig_group = inm->inm_addr;
3091			if (!m_append(m, sizeof(ig), (void *)&ig)) {
3092				if (m != m0)
3093					m_freem(m);
3094				CTR1(KTR_IGMPV3,
3095				    "%s: m_append() failed", __func__);
3096				return (-ENOMEM);
3097			}
3098			npbytes += sizeof(struct igmp_grouprec);
3099			if (m != m0) {
3100				/* new packet; offset in c hain */
3101				md = m_getptr(m, npbytes -
3102				    sizeof(struct igmp_grouprec), &off);
3103				pig = (struct igmp_grouprec *)(mtod(md,
3104				    uint8_t *) + off);
3105			} else {
3106				/* current packet; offset from last append */
3107				md = m_last(m);
3108				pig = (struct igmp_grouprec *)(mtod(md,
3109				    uint8_t *) + md->m_len -
3110				    sizeof(struct igmp_grouprec));
3111			}
3112			/*
3113			 * Begin walking the tree for this record type
3114			 * pass, or continue from where we left off
3115			 * previously if we had to allocate a new packet.
3116			 * Only report deltas in-mode at t1.
3117			 * We need not report included sources as allowed
3118			 * if we are in inclusive mode on the group,
3119			 * however the converse is not true.
3120			 */
3121			rsrcs = 0;
3122			if (nims == NULL)
3123				nims = RB_MIN(ip_msource_tree, &inm->inm_srcs);
3124			RB_FOREACH_FROM(ims, ip_msource_tree, nims) {
3125				CTR2(KTR_IGMPV3, "%s: visit node %s",
3126				    __func__, inet_ntoa_haddr(ims->ims_haddr));
3127				now = ims_get_mode(inm, ims, 1);
3128				then = ims_get_mode(inm, ims, 0);
3129				CTR3(KTR_IGMPV3, "%s: mode: t0 %d, t1 %d",
3130				    __func__, then, now);
3131				if (now == then) {
3132					CTR1(KTR_IGMPV3,
3133					    "%s: skip unchanged", __func__);
3134					continue;
3135				}
3136				if (mode == MCAST_EXCLUDE &&
3137				    now == MCAST_INCLUDE) {
3138					CTR1(KTR_IGMPV3,
3139					    "%s: skip IN src on EX group",
3140					    __func__);
3141					continue;
3142				}
3143				nrt = (rectype_t)now;
3144				if (nrt == REC_NONE)
3145					nrt = (rectype_t)(~mode & REC_FULL);
3146				if (schanged++ == 0) {
3147					crt = nrt;
3148				} else if (crt != nrt)
3149					continue;
3150				naddr = htonl(ims->ims_haddr);
3151				if (!m_append(m, sizeof(in_addr_t),
3152				    (void *)&naddr)) {
3153					if (m != m0)
3154						m_freem(m);
3155					CTR1(KTR_IGMPV3,
3156					    "%s: m_append() failed", __func__);
3157					return (-ENOMEM);
3158				}
3159				nallow += !!(crt == REC_ALLOW);
3160				nblock += !!(crt == REC_BLOCK);
3161				if (++rsrcs == m0srcs)
3162					break;
3163			}
3164			/*
3165			 * If we did not append any tree nodes on this
3166			 * pass, back out of allocations.
3167			 */
3168			if (rsrcs == 0) {
3169				npbytes -= sizeof(struct igmp_grouprec);
3170				if (m != m0) {
3171					CTR1(KTR_IGMPV3,
3172					    "%s: m_free(m)", __func__);
3173					m_freem(m);
3174				} else {
3175					CTR1(KTR_IGMPV3,
3176					    "%s: m_adj(m, -ig)", __func__);
3177					m_adj(m, -((int)sizeof(
3178					    struct igmp_grouprec)));
3179				}
3180				continue;
3181			}
3182			npbytes += (rsrcs * sizeof(in_addr_t));
3183			if (crt == REC_ALLOW)
3184				pig->ig_type = IGMP_ALLOW_NEW_SOURCES;
3185			else if (crt == REC_BLOCK)
3186				pig->ig_type = IGMP_BLOCK_OLD_SOURCES;
3187			pig->ig_numsrc = htons(rsrcs);
3188			/*
3189			 * Count the new group record, and enqueue this
3190			 * packet if it wasn't already queued.
3191			 */
3192			m->m_pkthdr.PH_vt.vt_nrecs++;
3193			if (m != m0)
3194				_IF_ENQUEUE(ifq, m);
3195			nbytes += npbytes;
3196		} while (nims != NULL);
3197		drt |= crt;
3198		crt = (~crt & REC_FULL);
3199	}
3200
3201	CTR3(KTR_IGMPV3, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__,
3202	    nallow, nblock);
3203
3204	return (nbytes);
3205}
3206
3207static int
3208igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq)
3209{
3210	struct ifqueue	*gq;
3211	struct mbuf	*m;		/* pending state-change */
3212	struct mbuf	*m0;		/* copy of pending state-change */
3213	struct mbuf	*mt;		/* last state-change in packet */
3214	int		 docopy, domerge;
3215	u_int		 recslen;
3216
3217	docopy = 0;
3218	domerge = 0;
3219	recslen = 0;
3220
3221	IN_MULTI_LOCK_ASSERT();
3222	IGMP_LOCK_ASSERT();
3223
3224	/*
3225	 * If there are further pending retransmissions, make a writable
3226	 * copy of each queued state-change message before merging.
3227	 */
3228	if (inm->inm_scrv > 0)
3229		docopy = 1;
3230
3231	gq = &inm->inm_scq;
3232#ifdef KTR
3233	if (gq->ifq_head == NULL) {
3234		CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty",
3235		    __func__, inm);
3236	}
3237#endif
3238
3239	m = gq->ifq_head;
3240	while (m != NULL) {
3241		/*
3242		 * Only merge the report into the current packet if
3243		 * there is sufficient space to do so; an IGMPv3 report
3244		 * packet may only contain 65,535 group records.
3245		 * Always use a simple mbuf chain concatentation to do this,
3246		 * as large state changes for single groups may have
3247		 * allocated clusters.
3248		 */
3249		domerge = 0;
3250		mt = ifscq->ifq_tail;
3251		if (mt != NULL) {
3252			recslen = m_length(m, NULL);
3253
3254			if ((mt->m_pkthdr.PH_vt.vt_nrecs +
3255			    m->m_pkthdr.PH_vt.vt_nrecs <=
3256			    IGMP_V3_REPORT_MAXRECS) &&
3257			    (mt->m_pkthdr.len + recslen <=
3258			    (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE)))
3259				domerge = 1;
3260		}
3261
3262		if (!domerge && _IF_QFULL(gq)) {
3263			CTR2(KTR_IGMPV3,
3264			    "%s: outbound queue full, skipping whole packet %p",
3265			    __func__, m);
3266			mt = m->m_nextpkt;
3267			if (!docopy)
3268				m_freem(m);
3269			m = mt;
3270			continue;
3271		}
3272
3273		if (!docopy) {
3274			CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m);
3275			_IF_DEQUEUE(gq, m0);
3276			m = m0->m_nextpkt;
3277		} else {
3278			CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m);
3279			m0 = m_dup(m, M_NOWAIT);
3280			if (m0 == NULL)
3281				return (ENOMEM);
3282			m0->m_nextpkt = NULL;
3283			m = m->m_nextpkt;
3284		}
3285
3286		if (!domerge) {
3287			CTR3(KTR_IGMPV3, "%s: queueing %p to ifscq %p)",
3288			    __func__, m0, ifscq);
3289			_IF_ENQUEUE(ifscq, m0);
3290		} else {
3291			struct mbuf *mtl;	/* last mbuf of packet mt */
3292
3293			CTR3(KTR_IGMPV3, "%s: merging %p with ifscq tail %p)",
3294			    __func__, m0, mt);
3295
3296			mtl = m_last(mt);
3297			m0->m_flags &= ~M_PKTHDR;
3298			mt->m_pkthdr.len += recslen;
3299			mt->m_pkthdr.PH_vt.vt_nrecs +=
3300			    m0->m_pkthdr.PH_vt.vt_nrecs;
3301
3302			mtl->m_next = m0;
3303		}
3304	}
3305
3306	return (0);
3307}
3308
3309/*
3310 * Respond to a pending IGMPv3 General Query.
3311 */
3312static void
3313igmp_v3_dispatch_general_query(struct igmp_ifinfo *igi)
3314{
3315	struct ifmultiaddr	*ifma;
3316	struct ifnet		*ifp;
3317	struct in_multi		*inm;
3318	int			 retval, loop;
3319
3320	IN_MULTI_LOCK_ASSERT();
3321	IGMP_LOCK_ASSERT();
3322
3323	KASSERT(igi->igi_version == IGMP_VERSION_3,
3324	    ("%s: called when version %d", __func__, igi->igi_version));
3325
3326	/*
3327	 * Check that there are some packets queued. If so, send them first.
3328	 * For large number of groups the reply to general query can take
3329	 * many packets, we should finish sending them before starting of
3330	 * queuing the new reply.
3331	 */
3332	if (igi->igi_gq.ifq_head != NULL)
3333		goto send;
3334
3335	ifp = igi->igi_ifp;
3336
3337	IF_ADDR_RLOCK(ifp);
3338	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
3339		if (ifma->ifma_addr->sa_family != AF_INET ||
3340		    ifma->ifma_protospec == NULL)
3341			continue;
3342
3343		inm = (struct in_multi *)ifma->ifma_protospec;
3344		KASSERT(ifp == inm->inm_ifp,
3345		    ("%s: inconsistent ifp", __func__));
3346
3347		switch (inm->inm_state) {
3348		case IGMP_NOT_MEMBER:
3349		case IGMP_SILENT_MEMBER:
3350			break;
3351		case IGMP_REPORTING_MEMBER:
3352		case IGMP_IDLE_MEMBER:
3353		case IGMP_LAZY_MEMBER:
3354		case IGMP_SLEEPING_MEMBER:
3355		case IGMP_AWAKENING_MEMBER:
3356			inm->inm_state = IGMP_REPORTING_MEMBER;
3357			retval = igmp_v3_enqueue_group_record(&igi->igi_gq,
3358			    inm, 0, 0, 0);
3359			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
3360			    __func__, retval);
3361			break;
3362		case IGMP_G_QUERY_PENDING_MEMBER:
3363		case IGMP_SG_QUERY_PENDING_MEMBER:
3364		case IGMP_LEAVING_MEMBER:
3365			break;
3366		}
3367	}
3368	IF_ADDR_RUNLOCK(ifp);
3369
3370send:
3371	loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0;
3372	igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop);
3373
3374	/*
3375	 * Slew transmission of bursts over 500ms intervals.
3376	 */
3377	if (igi->igi_gq.ifq_head != NULL) {
3378		igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY(
3379		    IGMP_RESPONSE_BURST_INTERVAL);
3380		V_interface_timers_running = 1;
3381	}
3382}
3383
3384/*
3385 * Transmit the next pending IGMP message in the output queue.
3386 *
3387 * We get called from netisr_processqueue(). A mutex private to igmpoq
3388 * will be acquired and released around this routine.
3389 *
3390 * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis.
3391 * MRT: Nothing needs to be done, as IGMP traffic is always local to
3392 * a link and uses a link-scope multicast address.
3393 */
3394static void
3395igmp_intr(struct mbuf *m)
3396{
3397	struct ip_moptions	 imo;
3398	struct ifnet		*ifp;
3399	struct mbuf		*ipopts, *m0;
3400	int			 error;
3401	uint32_t		 ifindex;
3402
3403	CTR2(KTR_IGMPV3, "%s: transmit %p", __func__, m);
3404
3405	/*
3406	 * Set VNET image pointer from enqueued mbuf chain
3407	 * before doing anything else. Whilst we use interface
3408	 * indexes to guard against interface detach, they are
3409	 * unique to each VIMAGE and must be retrieved.
3410	 */
3411	CURVNET_SET((struct vnet *)(m->m_pkthdr.PH_loc.ptr));
3412	ifindex = igmp_restore_context(m);
3413
3414	/*
3415	 * Check if the ifnet still exists. This limits the scope of
3416	 * any race in the absence of a global ifp lock for low cost
3417	 * (an array lookup).
3418	 */
3419	ifp = ifnet_byindex(ifindex);
3420	if (ifp == NULL) {
3421		CTR3(KTR_IGMPV3, "%s: dropped %p as ifindex %u went away.",
3422		    __func__, m, ifindex);
3423		m_freem(m);
3424		IPSTAT_INC(ips_noroute);
3425		goto out;
3426	}
3427
3428	ipopts = V_igmp_sendra ? m_raopt : NULL;
3429
3430	imo.imo_multicast_ttl  = 1;
3431	imo.imo_multicast_vif  = -1;
3432	imo.imo_multicast_loop = (V_ip_mrouter != NULL);
3433
3434	/*
3435	 * If the user requested that IGMP traffic be explicitly
3436	 * redirected to the loopback interface (e.g. they are running a
3437	 * MANET interface and the routing protocol needs to see the
3438	 * updates), handle this now.
3439	 */
3440	if (m->m_flags & M_IGMP_LOOP)
3441		imo.imo_multicast_ifp = V_loif;
3442	else
3443		imo.imo_multicast_ifp = ifp;
3444
3445	if (m->m_flags & M_IGMPV2) {
3446		m0 = m;
3447	} else {
3448		m0 = igmp_v3_encap_report(ifp, m);
3449		if (m0 == NULL) {
3450			CTR2(KTR_IGMPV3, "%s: dropped %p", __func__, m);
3451			m_freem(m);
3452			IPSTAT_INC(ips_odropped);
3453			goto out;
3454		}
3455	}
3456
3457	igmp_scrub_context(m0);
3458	m_clrprotoflags(m);
3459	m0->m_pkthdr.rcvif = V_loif;
3460#ifdef MAC
3461	mac_netinet_igmp_send(ifp, m0);
3462#endif
3463	error = ip_output(m0, ipopts, NULL, 0, &imo, NULL);
3464	if (error) {
3465		CTR3(KTR_IGMPV3, "%s: ip_output(%p) = %d", __func__, m0, error);
3466		goto out;
3467	}
3468
3469	IGMPSTAT_INC(igps_snd_reports);
3470
3471out:
3472	/*
3473	 * We must restore the existing vnet pointer before
3474	 * continuing as we are run from netisr context.
3475	 */
3476	CURVNET_RESTORE();
3477}
3478
3479/*
3480 * Encapsulate an IGMPv3 report.
3481 *
3482 * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf
3483 * chain has already had its IP/IGMPv3 header prepended. In this case
3484 * the function will not attempt to prepend; the lengths and checksums
3485 * will however be re-computed.
3486 *
3487 * Returns a pointer to the new mbuf chain head, or NULL if the
3488 * allocation failed.
3489 */
3490static struct mbuf *
3491igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m)
3492{
3493	struct igmp_report	*igmp;
3494	struct ip		*ip;
3495	int			 hdrlen, igmpreclen;
3496
3497	KASSERT((m->m_flags & M_PKTHDR),
3498	    ("%s: mbuf chain %p is !M_PKTHDR", __func__, m));
3499
3500	igmpreclen = m_length(m, NULL);
3501	hdrlen = sizeof(struct ip) + sizeof(struct igmp_report);
3502
3503	if (m->m_flags & M_IGMPV3_HDR) {
3504		igmpreclen -= hdrlen;
3505	} else {
3506		M_PREPEND(m, hdrlen, M_NOWAIT);
3507		if (m == NULL)
3508			return (NULL);
3509		m->m_flags |= M_IGMPV3_HDR;
3510	}
3511
3512	CTR2(KTR_IGMPV3, "%s: igmpreclen is %d", __func__, igmpreclen);
3513
3514	m->m_data += sizeof(struct ip);
3515	m->m_len -= sizeof(struct ip);
3516
3517	igmp = mtod(m, struct igmp_report *);
3518	igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT;
3519	igmp->ir_rsv1 = 0;
3520	igmp->ir_rsv2 = 0;
3521	igmp->ir_numgrps = htons(m->m_pkthdr.PH_vt.vt_nrecs);
3522	igmp->ir_cksum = 0;
3523	igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen);
3524	m->m_pkthdr.PH_vt.vt_nrecs = 0;
3525
3526	m->m_data -= sizeof(struct ip);
3527	m->m_len += sizeof(struct ip);
3528
3529	ip = mtod(m, struct ip *);
3530	ip->ip_tos = IPTOS_PREC_INTERNETCONTROL;
3531	ip->ip_len = htons(hdrlen + igmpreclen);
3532	ip->ip_off = htons(IP_DF);
3533	ip->ip_p = IPPROTO_IGMP;
3534	ip->ip_sum = 0;
3535
3536	ip->ip_src.s_addr = INADDR_ANY;
3537
3538	if (m->m_flags & M_IGMP_LOOP) {
3539		struct in_ifaddr *ia;
3540
3541		IFP_TO_IA(ifp, ia);
3542		if (ia != NULL) {
3543			ip->ip_src = ia->ia_addr.sin_addr;
3544			ifa_free(&ia->ia_ifa);
3545		}
3546	}
3547
3548	ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP);
3549
3550	return (m);
3551}
3552
3553#ifdef KTR
3554static char *
3555igmp_rec_type_to_str(const int type)
3556{
3557
3558	switch (type) {
3559		case IGMP_CHANGE_TO_EXCLUDE_MODE:
3560			return "TO_EX";
3561			break;
3562		case IGMP_CHANGE_TO_INCLUDE_MODE:
3563			return "TO_IN";
3564			break;
3565		case IGMP_MODE_IS_EXCLUDE:
3566			return "MODE_EX";
3567			break;
3568		case IGMP_MODE_IS_INCLUDE:
3569			return "MODE_IN";
3570			break;
3571		case IGMP_ALLOW_NEW_SOURCES:
3572			return "ALLOW_NEW";
3573			break;
3574		case IGMP_BLOCK_OLD_SOURCES:
3575			return "BLOCK_OLD";
3576			break;
3577		default:
3578			break;
3579	}
3580	return "unknown";
3581}
3582#endif
3583
3584static void
3585igmp_init(void *unused __unused)
3586{
3587
3588	CTR1(KTR_IGMPV3, "%s: initializing", __func__);
3589
3590	IGMP_LOCK_INIT();
3591
3592	m_raopt = igmp_ra_alloc();
3593
3594	netisr_register(&igmp_nh);
3595}
3596SYSINIT(igmp_init, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, igmp_init, NULL);
3597
3598static void
3599igmp_uninit(void *unused __unused)
3600{
3601
3602	CTR1(KTR_IGMPV3, "%s: tearing down", __func__);
3603
3604	netisr_unregister(&igmp_nh);
3605
3606	m_free(m_raopt);
3607	m_raopt = NULL;
3608
3609	IGMP_LOCK_DESTROY();
3610}
3611SYSUNINIT(igmp_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, igmp_uninit, NULL);
3612
3613static void
3614vnet_igmp_init(const void *unused __unused)
3615{
3616
3617	CTR1(KTR_IGMPV3, "%s: initializing", __func__);
3618
3619	LIST_INIT(&V_igi_head);
3620}
3621VNET_SYSINIT(vnet_igmp_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_igmp_init,
3622    NULL);
3623
3624static void
3625vnet_igmp_uninit(const void *unused __unused)
3626{
3627
3628	CTR1(KTR_IGMPV3, "%s: tearing down", __func__);
3629
3630	KASSERT(LIST_EMPTY(&V_igi_head),
3631	    ("%s: igi list not empty; ifnets not detached?", __func__));
3632}
3633VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY,
3634    vnet_igmp_uninit, NULL);
3635
3636static int
3637igmp_modevent(module_t mod, int type, void *unused __unused)
3638{
3639
3640    switch (type) {
3641    case MOD_LOAD:
3642    case MOD_UNLOAD:
3643	break;
3644    default:
3645	return (EOPNOTSUPP);
3646    }
3647    return (0);
3648}
3649
3650static moduledata_t igmp_mod = {
3651    "igmp",
3652    igmp_modevent,
3653    0
3654};
3655DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
3656