ip6_output.c revision 317335
1/*-
2 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the project nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	$KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $
30 */
31
32/*-
33 * Copyright (c) 1982, 1986, 1988, 1990, 1993
34 *	The Regents of the University of California.  All rights reserved.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 *    notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 *    notice, this list of conditions and the following disclaimer in the
43 *    documentation and/or other materials provided with the distribution.
44 * 4. Neither the name of the University nor the names of its contributors
45 *    may be used to endorse or promote products derived from this software
46 *    without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
61 */
62
63#include <sys/cdefs.h>
64__FBSDID("$FreeBSD: stable/10/sys/netinet6/ip6_output.c 317335 2017-04-23 08:59:57Z kp $");
65
66#include "opt_inet.h"
67#include "opt_inet6.h"
68#include "opt_ipfw.h"
69#include "opt_ipsec.h"
70#include "opt_sctp.h"
71#include "opt_route.h"
72
73#include <sys/param.h>
74#include <sys/kernel.h>
75#include <sys/malloc.h>
76#include <sys/mbuf.h>
77#include <sys/errno.h>
78#include <sys/priv.h>
79#include <sys/proc.h>
80#include <sys/protosw.h>
81#include <sys/socket.h>
82#include <sys/socketvar.h>
83#include <sys/syslog.h>
84#include <sys/ucred.h>
85
86#include <machine/in_cksum.h>
87
88#include <net/if.h>
89#include <net/netisr.h>
90#include <net/route.h>
91#include <net/pfil.h>
92#include <net/vnet.h>
93
94#include <netinet/in.h>
95#include <netinet/in_var.h>
96#include <netinet/ip_var.h>
97#include <netinet6/in6_var.h>
98#include <netinet/ip6.h>
99#include <netinet/icmp6.h>
100#include <netinet6/ip6_var.h>
101#include <netinet/in_pcb.h>
102#include <netinet/tcp_var.h>
103#include <netinet6/nd6.h>
104
105#ifdef IPSEC
106#include <netipsec/ipsec.h>
107#include <netipsec/ipsec6.h>
108#include <netipsec/key.h>
109#include <netinet6/ip6_ipsec.h>
110#endif /* IPSEC */
111#ifdef SCTP
112#include <netinet/sctp.h>
113#include <netinet/sctp_crc32.h>
114#endif
115
116#include <netinet6/ip6protosw.h>
117#include <netinet6/scope6_var.h>
118
119#ifdef FLOWTABLE
120#include <net/flowtable.h>
121#endif
122
123extern int in6_mcast_loop;
124
/*
 * Working set of mbufs used by ip6_output() while it assembles a packet:
 * one slot for the IPv6 header itself and one per extension header that
 * may be inserted.  A NULL slot means that header is not present.
 */
struct ip6_exthdrs {
	struct mbuf *ip6e_ip6;		/* IPv6 header, set by ip6_splithdr() */
	struct mbuf *ip6e_hbh;		/* hop-by-hop options header */
	struct mbuf *ip6e_dest1;	/* destination options, 1st part */
	struct mbuf *ip6e_rthdr;	/* routing header */
	struct mbuf *ip6e_dest2;	/* destination options, 2nd part */
};
132
133static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **,
134			   struct ucred *, int);
135static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *,
136	struct socket *, struct sockopt *);
137static int ip6_getpcbopt(struct ip6_pktopts *, int, struct sockopt *);
138static int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *,
139	struct ucred *, int, int, int);
140
141static int ip6_copyexthdr(struct mbuf **, caddr_t, int);
142static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int,
143	struct ip6_frag **);
144static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
145static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *);
146static int ip6_getpmtu(struct route_in6 *, struct route_in6 *,
147	struct ifnet *, struct in6_addr *, u_long *, int *, u_int);
148static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int);
149
150
/*
 * Build an extension header mbuf from option data.
 *
 * hp is the source option data (may be NULL, in which case nothing is done)
 * and mp is the destination handed to ip6_copyexthdr().  The number of bytes
 * copied is derived from the ip6e_len field of the source header:
 * (ip6e_len + 1) * 8.
 *
 * The macro relies on the expanding function providing an "error" variable
 * and a "freehdrs" label; on failure it stores the error and jumps there.
 */
#define MAKE_EXTHDR(hp, mp)						\
    do {								\
	if ((hp) != NULL) {						\
		struct ip6_ext *exth_ = (struct ip6_ext *)(hp);	\
		int exthlen_ = (exth_->ip6e_len + 1) << 3;		\
									\
		error = ip6_copyexthdr((mp), (caddr_t)(hp), exthlen_);	\
		if (error != 0)						\
			goto freehdrs;					\
	}								\
    } while (/*CONSTCOND*/ 0)
165
/*
 * Link one extension header into the chain being built.
 *
 * m  - mbuf holding the extension header to insert (NULL: do nothing)
 * mp - mbuf after which m is inserted; updated to m afterwards
 * p  - pointer to the "next header" byte of the preceding header; updated
 *      to point at the first byte of m afterwards
 * i  - protocol number identifying the header held in m
 *
 * The first byte of m inherits the value previously stored at *p, and *p is
 * overwritten with i.  The macro relies on a "hdrsplit" variable in the
 * expanding function and panics if the header has not been split.
 */
#define MAKE_CHAIN(m, mp, p, i)\
    do {\
	if ((m) != NULL) {\
		if (hdrsplit == 0) \
			panic("assumption failed: hdr not split"); \
		*mtod((m), u_char *) = *(p);\
		*(p) = (i);\
		(p) = mtod((m), u_char *);\
		(m)->m_next = (mp)->m_next;\
		(mp)->m_next = (m);\
		(mp) = (m);\
	}\
    } while (/*CONSTCOND*/ 0)
186
187void
188in6_delayed_cksum(struct mbuf *m, uint32_t plen, u_short offset)
189{
190	u_short csum;
191
192	csum = in_cksum_skip(m, offset + plen, offset);
193	if (m->m_pkthdr.csum_flags & CSUM_UDP_IPV6 && csum == 0)
194		csum = 0xffff;
195	offset += m->m_pkthdr.csum_data;	/* checksum offset */
196
197	if (offset + sizeof(u_short) > m->m_len) {
198		printf("%s: delayed m_pullup, m->len: %d plen %u off %u "
199		    "csum_flags=%b\n", __func__, m->m_len, plen, offset,
200		    (int)m->m_pkthdr.csum_flags, CSUM_BITS);
201		/*
202		 * XXX this should not happen, but if it does, the correct
203		 * behavior may be to insert the checksum in the appropriate
204		 * next mbuf in the chain.
205		 */
206		return;
207	}
208	*(u_short *)(m->m_data + offset) = csum;
209}
210
211int
212ip6_fragment(struct ifnet *ifp, struct mbuf *m0, int hlen, u_char nextproto,
213    int mtu, uint32_t id)
214{
215	struct mbuf *m, **mnext, *m_frgpart;
216	struct ip6_hdr *ip6, *mhip6;
217	struct ip6_frag *ip6f;
218	int off;
219	int error;
220	int tlen = m0->m_pkthdr.len;
221
222	KASSERT(( mtu % 8 == 0), ("Fragment length must be a multiple of 8"));
223
224	m = m0;
225	ip6 = mtod(m, struct ip6_hdr *);
226	mnext = &m->m_nextpkt;
227
228	for (off = hlen; off < tlen; off += mtu) {
229		m = m_gethdr(M_NOWAIT, MT_DATA);
230		if (!m) {
231			IP6STAT_INC(ip6s_odropped);
232			return (ENOBUFS);
233		}
234		m->m_flags = m0->m_flags & M_COPYFLAGS;
235		*mnext = m;
236		mnext = &m->m_nextpkt;
237		m->m_data += max_linkhdr;
238		mhip6 = mtod(m, struct ip6_hdr *);
239		*mhip6 = *ip6;
240		m->m_len = sizeof(*mhip6);
241		error = ip6_insertfraghdr(m0, m, hlen, &ip6f);
242		if (error) {
243			IP6STAT_INC(ip6s_odropped);
244			return (error);
245		}
246		ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7));
247		if (off + mtu >= tlen)
248			mtu = tlen - off;
249		else
250			ip6f->ip6f_offlg |= IP6F_MORE_FRAG;
251		mhip6->ip6_plen = htons((u_short)(mtu + hlen +
252		    sizeof(*ip6f) - sizeof(struct ip6_hdr)));
253		if ((m_frgpart = m_copy(m0, off, mtu)) == 0) {
254			IP6STAT_INC(ip6s_odropped);
255			return (ENOBUFS);
256		}
257		m_cat(m, m_frgpart);
258		m->m_pkthdr.len = mtu + hlen + sizeof(*ip6f);
259		m->m_pkthdr.fibnum = m0->m_pkthdr.fibnum;
260		m->m_pkthdr.rcvif = NULL;
261		ip6f->ip6f_reserved = 0;
262		ip6f->ip6f_ident = id;
263		ip6f->ip6f_nxt = nextproto;
264		IP6STAT_INC(ip6s_ofragments);
265		in6_ifstat_inc(ifp, ifs6_out_fragcreat);
266	}
267
268	return (0);
269}
270
271/*
272 * IP6 output. The packet in mbuf chain m contains a skeletal IP6
273 * header (with pri, len, nxt, hlim, src, dst).
274 * This function may modify ver and hlim only.
275 * The mbuf chain containing the packet will be freed.
276 * The mbuf opt, if present, will not be freed.
277 * If route_in6 ro is present and has ro_rt initialized, route lookup would be
278 * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL,
279 * then result of route lookup is stored in ro->ro_rt.
280 *
281 * type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int, and
282 * nd_ifinfo.linkmtu is u_int32_t.  so we use u_long to hold largest one,
283 * which is rt_mtu.
284 *
285 * ifpp - XXX: just for statistics
286 */
287int
288ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
289    struct route_in6 *ro, int flags, struct ip6_moptions *im6o,
290    struct ifnet **ifpp, struct inpcb *inp)
291{
292	struct ip6_hdr *ip6;
293	struct ifnet *ifp, *origifp;
294	struct mbuf *m = m0;
295	struct mbuf *mprev = NULL;
296	int hlen, tlen, len;
297	struct route_in6 ip6route;
298	struct rtentry *rt = NULL;
299	struct sockaddr_in6 *dst, src_sa, dst_sa;
300	struct in6_addr odst;
301	int error = 0;
302	struct in6_ifaddr *ia = NULL;
303	u_long mtu;
304	int alwaysfrag, dontfrag;
305	u_int32_t optlen = 0, plen = 0, unfragpartlen = 0;
306	struct ip6_exthdrs exthdrs;
307	struct in6_addr finaldst, src0, dst0;
308	u_int32_t zone;
309	struct route_in6 *ro_pmtu = NULL;
310	int hdrsplit = 0;
311	int sw_csum, tso;
312	struct m_tag *fwd_tag = NULL;
313	uint32_t id;
314
315	ip6 = mtod(m, struct ip6_hdr *);
316	if (ip6 == NULL) {
317		printf ("ip6 is NULL");
318		goto bad;
319	}
320
321	if (inp != NULL)
322		M_SETFIB(m, inp->inp_inc.inc_fibnum);
323
324	finaldst = ip6->ip6_dst;
325	bzero(&exthdrs, sizeof(exthdrs));
326	if (opt) {
327		/* Hop-by-Hop options header */
328		MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh);
329		/* Destination options header(1st part) */
330		if (opt->ip6po_rthdr) {
331			/*
332			 * Destination options header(1st part)
333			 * This only makes sense with a routing header.
334			 * See Section 9.2 of RFC 3542.
335			 * Disabling this part just for MIP6 convenience is
336			 * a bad idea.  We need to think carefully about a
337			 * way to make the advanced API coexist with MIP6
338			 * options, which might automatically be inserted in
339			 * the kernel.
340			 */
341			MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1);
342		}
343		/* Routing header */
344		MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr);
345		/* Destination options header(2nd part) */
346		MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2);
347	}
348
349#ifdef IPSEC
350	/*
351	 * IPSec checking which handles several cases.
352	 * FAST IPSEC: We re-injected the packet.
353	 */
354	switch(ip6_ipsec_output(&m, inp, &flags, &error, &ifp))
355	{
356	case 1:                 /* Bad packet */
357		goto freehdrs;
358	case -1:                /* IPSec done */
359		goto done;
360	case 0:                 /* No IPSec */
361	default:
362		break;
363	}
364#endif /* IPSEC */
365
366	/*
367	 * Calculate the total length of the extension header chain.
368	 * Keep the length of the unfragmentable part for fragmentation.
369	 */
370	optlen = 0;
371	if (exthdrs.ip6e_hbh)
372		optlen += exthdrs.ip6e_hbh->m_len;
373	if (exthdrs.ip6e_dest1)
374		optlen += exthdrs.ip6e_dest1->m_len;
375	if (exthdrs.ip6e_rthdr)
376		optlen += exthdrs.ip6e_rthdr->m_len;
377	unfragpartlen = optlen + sizeof(struct ip6_hdr);
378
379	/* NOTE: we don't add AH/ESP length here (done in ip6_ipsec_output) */
380	if (exthdrs.ip6e_dest2)
381		optlen += exthdrs.ip6e_dest2->m_len;
382
383	/*
384	 * If there is at least one extension header,
385	 * separate IP6 header from the payload.
386	 */
387	if (optlen && !hdrsplit) {
388		if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
389			m = NULL;
390			goto freehdrs;
391		}
392		m = exthdrs.ip6e_ip6;
393		hdrsplit++;
394	}
395
396	/* adjust pointer */
397	ip6 = mtod(m, struct ip6_hdr *);
398
399	/* adjust mbuf packet header length */
400	m->m_pkthdr.len += optlen;
401	plen = m->m_pkthdr.len - sizeof(*ip6);
402
403	/* If this is a jumbo payload, insert a jumbo payload option. */
404	if (plen > IPV6_MAXPACKET) {
405		if (!hdrsplit) {
406			if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
407				m = NULL;
408				goto freehdrs;
409			}
410			m = exthdrs.ip6e_ip6;
411			hdrsplit++;
412		}
413		/* adjust pointer */
414		ip6 = mtod(m, struct ip6_hdr *);
415		if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0)
416			goto freehdrs;
417		ip6->ip6_plen = 0;
418	} else
419		ip6->ip6_plen = htons(plen);
420
421	/*
422	 * Concatenate headers and fill in next header fields.
423	 * Here we have, on "m"
424	 *	IPv6 payload
425	 * and we insert headers accordingly.  Finally, we should be getting:
426	 *	IPv6 hbh dest1 rthdr ah* [esp* dest2 payload]
427	 *
428	 * during the header composing process, "m" points to IPv6 header.
429	 * "mprev" points to an extension header prior to esp.
430	 */
431	u_char *nexthdrp = &ip6->ip6_nxt;
432	mprev = m;
433
434	/*
435	 * we treat dest2 specially.  this makes IPsec processing
436	 * much easier.  the goal here is to make mprev point the
437	 * mbuf prior to dest2.
438	 *
439	 * result: IPv6 dest2 payload
440	 * m and mprev will point to IPv6 header.
441	 */
442	if (exthdrs.ip6e_dest2) {
443		if (!hdrsplit)
444			panic("assumption failed: hdr not split");
445		exthdrs.ip6e_dest2->m_next = m->m_next;
446		m->m_next = exthdrs.ip6e_dest2;
447		*mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt;
448		ip6->ip6_nxt = IPPROTO_DSTOPTS;
449	}
450
451	/*
452	 * result: IPv6 hbh dest1 rthdr dest2 payload
453	 * m will point to IPv6 header.  mprev will point to the
454	 * extension header prior to dest2 (rthdr in the above case).
455	 */
456	MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS);
457	MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp,
458		   IPPROTO_DSTOPTS);
459	MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp,
460		   IPPROTO_ROUTING);
461
462	/*
463	 * If there is a routing header, discard the packet.
464	 */
465	if (exthdrs.ip6e_rthdr) {
466		 error = EINVAL;
467		 goto bad;
468	}
469
470	/* Source address validation */
471	if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) &&
472	    (flags & IPV6_UNSPECSRC) == 0) {
473		error = EOPNOTSUPP;
474		IP6STAT_INC(ip6s_badscope);
475		goto bad;
476	}
477	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
478		error = EOPNOTSUPP;
479		IP6STAT_INC(ip6s_badscope);
480		goto bad;
481	}
482
483	IP6STAT_INC(ip6s_localout);
484
485	/*
486	 * Route packet.
487	 */
488	if (ro == 0) {
489		ro = &ip6route;
490		bzero((caddr_t)ro, sizeof(*ro));
491	}
492	ro_pmtu = ro;
493	if (opt && opt->ip6po_rthdr)
494		ro = &opt->ip6po_route;
495	dst = (struct sockaddr_in6 *)&ro->ro_dst;
496#ifdef FLOWTABLE
497	if (ro->ro_rt == NULL)
498		(void )flowtable_lookup(AF_INET6, m, (struct route *)ro);
499#endif
500again:
501	/*
502	 * if specified, try to fill in the traffic class field.
503	 * do not override if a non-zero value is already set.
504	 * we check the diffserv field and the ecn field separately.
505	 */
506	if (opt && opt->ip6po_tclass >= 0) {
507		int mask = 0;
508
509		if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0)
510			mask |= 0xfc;
511		if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0)
512			mask |= 0x03;
513		if (mask != 0)
514			ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20);
515	}
516
517	/* fill in or override the hop limit field, if necessary. */
518	if (opt && opt->ip6po_hlim != -1)
519		ip6->ip6_hlim = opt->ip6po_hlim & 0xff;
520	else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
521		if (im6o != NULL)
522			ip6->ip6_hlim = im6o->im6o_multicast_hlim;
523		else
524			ip6->ip6_hlim = V_ip6_defmcasthlim;
525	}
526
527	/* adjust pointer */
528	ip6 = mtod(m, struct ip6_hdr *);
529
530	if (ro->ro_rt && fwd_tag == NULL) {
531		rt = ro->ro_rt;
532		ifp = ro->ro_rt->rt_ifp;
533	} else {
534		if (fwd_tag == NULL) {
535			bzero(&dst_sa, sizeof(dst_sa));
536			dst_sa.sin6_family = AF_INET6;
537			dst_sa.sin6_len = sizeof(dst_sa);
538			dst_sa.sin6_addr = ip6->ip6_dst;
539		}
540		error = in6_selectroute_fib(&dst_sa, opt, im6o, ro, &ifp,
541		    &rt, inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
542		if (error != 0) {
543			if (ifp != NULL)
544				in6_ifstat_inc(ifp, ifs6_out_discard);
545			goto bad;
546		}
547	}
548	if (rt == NULL) {
549		/*
550		 * If in6_selectroute() does not return a route entry,
551		 * dst may not have been updated.
552		 */
553		*dst = dst_sa;	/* XXX */
554	}
555
556	/*
557	 * then rt (for unicast) and ifp must be non-NULL valid values.
558	 */
559	if ((flags & IPV6_FORWARDING) == 0) {
560		/* XXX: the FORWARDING flag can be set for mrouting. */
561		in6_ifstat_inc(ifp, ifs6_out_request);
562	}
563	if (rt != NULL) {
564		ia = (struct in6_ifaddr *)(rt->rt_ifa);
565		counter_u64_add(rt->rt_pksent, 1);
566	}
567
568
569	/*
570	 * The outgoing interface must be in the zone of source and
571	 * destination addresses.
572	 */
573	origifp = ifp;
574
575	src0 = ip6->ip6_src;
576	if (in6_setscope(&src0, origifp, &zone))
577		goto badscope;
578	bzero(&src_sa, sizeof(src_sa));
579	src_sa.sin6_family = AF_INET6;
580	src_sa.sin6_len = sizeof(src_sa);
581	src_sa.sin6_addr = ip6->ip6_src;
582	if (sa6_recoverscope(&src_sa) || zone != src_sa.sin6_scope_id)
583		goto badscope;
584
585	dst0 = ip6->ip6_dst;
586	if (in6_setscope(&dst0, origifp, &zone))
587		goto badscope;
588	/* re-initialize to be sure */
589	bzero(&dst_sa, sizeof(dst_sa));
590	dst_sa.sin6_family = AF_INET6;
591	dst_sa.sin6_len = sizeof(dst_sa);
592	dst_sa.sin6_addr = ip6->ip6_dst;
593	if (sa6_recoverscope(&dst_sa) || zone != dst_sa.sin6_scope_id) {
594		goto badscope;
595	}
596
597	/* We should use ia_ifp to support the case of
598	 * sending packets to an address of our own.
599	 */
600	if (ia != NULL && ia->ia_ifp)
601		ifp = ia->ia_ifp;
602
603	/* scope check is done. */
604	goto routefound;
605
606  badscope:
607	IP6STAT_INC(ip6s_badscope);
608	in6_ifstat_inc(origifp, ifs6_out_discard);
609	if (error == 0)
610		error = EHOSTUNREACH; /* XXX */
611	goto bad;
612
613  routefound:
614	if (rt && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
615		if (opt && opt->ip6po_nextroute.ro_rt) {
616			/*
617			 * The nexthop is explicitly specified by the
618			 * application.  We assume the next hop is an IPv6
619			 * address.
620			 */
621			dst = (struct sockaddr_in6 *)opt->ip6po_nexthop;
622		}
623		else if ((rt->rt_flags & RTF_GATEWAY))
624			dst = (struct sockaddr_in6 *)rt->rt_gateway;
625	}
626
627	if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
628		m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */
629	} else {
630		m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST;
631		in6_ifstat_inc(ifp, ifs6_out_mcast);
632		/*
633		 * Confirm that the outgoing interface supports multicast.
634		 */
635		if (!(ifp->if_flags & IFF_MULTICAST)) {
636			IP6STAT_INC(ip6s_noroute);
637			in6_ifstat_inc(ifp, ifs6_out_discard);
638			error = ENETUNREACH;
639			goto bad;
640		}
641		if ((im6o == NULL && in6_mcast_loop) ||
642		    (im6o && im6o->im6o_multicast_loop)) {
643			/*
644			 * Loop back multicast datagram if not expressly
645			 * forbidden to do so, even if we have not joined
646			 * the address; protocols will filter it later,
647			 * thus deferring a hash lookup and lock acquisition
648			 * at the expense of an m_copym().
649			 */
650			ip6_mloopback(ifp, m, dst);
651		} else {
652			/*
653			 * If we are acting as a multicast router, perform
654			 * multicast forwarding as if the packet had just
655			 * arrived on the interface to which we are about
656			 * to send.  The multicast forwarding function
657			 * recursively calls this function, using the
658			 * IPV6_FORWARDING flag to prevent infinite recursion.
659			 *
660			 * Multicasts that are looped back by ip6_mloopback(),
661			 * above, will be forwarded by the ip6_input() routine,
662			 * if necessary.
663			 */
664			if (V_ip6_mrouter && (flags & IPV6_FORWARDING) == 0) {
665				/*
666				 * XXX: ip6_mforward expects that rcvif is NULL
667				 * when it is called from the originating path.
668				 * However, it may not always be the case.
669				 */
670				m->m_pkthdr.rcvif = NULL;
671				if (ip6_mforward(ip6, ifp, m) != 0) {
672					m_freem(m);
673					goto done;
674				}
675			}
676		}
677		/*
678		 * Multicasts with a hoplimit of zero may be looped back,
679		 * above, but must not be transmitted on a network.
680		 * Also, multicasts addressed to the loopback interface
681		 * are not sent -- the above call to ip6_mloopback() will
682		 * loop back a copy if this host actually belongs to the
683		 * destination group on the loopback interface.
684		 */
685		if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) ||
686		    IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) {
687			m_freem(m);
688			goto done;
689		}
690	}
691
692	/*
693	 * Fill the outgoing inteface to tell the upper layer
694	 * to increment per-interface statistics.
695	 */
696	if (ifpp)
697		*ifpp = ifp;
698
699	/* Determine path MTU. */
700	if ((error = ip6_getpmtu(ro_pmtu, ro, ifp, &finaldst, &mtu,
701	    &alwaysfrag, inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m))) != 0)
702		goto bad;
703
704	/*
705	 * The caller of this function may specify to use the minimum MTU
706	 * in some cases.
707	 * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU
708	 * setting.  The logic is a bit complicated; by default, unicast
709	 * packets will follow path MTU while multicast packets will be sent at
710	 * the minimum MTU.  If IP6PO_MINMTU_ALL is specified, all packets
711	 * including unicast ones will be sent at the minimum MTU.  Multicast
712	 * packets will always be sent at the minimum MTU unless
713	 * IP6PO_MINMTU_DISABLE is explicitly specified.
714	 * See RFC 3542 for more details.
715	 */
716	if (mtu > IPV6_MMTU) {
717		if ((flags & IPV6_MINMTU))
718			mtu = IPV6_MMTU;
719		else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL)
720			mtu = IPV6_MMTU;
721		else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
722			 (opt == NULL ||
723			  opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) {
724			mtu = IPV6_MMTU;
725		}
726	}
727
728	/*
729	 * clear embedded scope identifiers if necessary.
730	 * in6_clearscope will touch the addresses only when necessary.
731	 */
732	in6_clearscope(&ip6->ip6_src);
733	in6_clearscope(&ip6->ip6_dst);
734
735	/*
736	 * If the outgoing packet contains a hop-by-hop options header,
737	 * it must be examined and processed even by the source node.
738	 * (RFC 2460, section 4.)
739	 */
740	if (exthdrs.ip6e_hbh) {
741		struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *);
742		u_int32_t dummy; /* XXX unused */
743		u_int32_t plen = 0; /* XXX: ip6_process will check the value */
744
745#ifdef DIAGNOSTIC
746		if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len)
747			panic("ip6e_hbh is not contiguous");
748#endif
749		/*
750		 *  XXX: if we have to send an ICMPv6 error to the sender,
751		 *       we need the M_LOOP flag since icmp6_error() expects
752		 *       the IPv6 and the hop-by-hop options header are
753		 *       contiguous unless the flag is set.
754		 */
755		m->m_flags |= M_LOOP;
756		m->m_pkthdr.rcvif = ifp;
757		if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1),
758		    ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh),
759		    &dummy, &plen) < 0) {
760			/* m was already freed at this point */
761			error = EINVAL;/* better error? */
762			goto done;
763		}
764		m->m_flags &= ~M_LOOP; /* XXX */
765		m->m_pkthdr.rcvif = NULL;
766	}
767
768	/* Jump over all PFIL processing if hooks are not active. */
769	if (!PFIL_HOOKED(&V_inet6_pfil_hook))
770		goto passout;
771
772	odst = ip6->ip6_dst;
773	/* Run through list of hooks for output packets. */
774	error = pfil_run_hooks(&V_inet6_pfil_hook, &m, ifp, PFIL_OUT, inp);
775	if (error != 0 || m == NULL)
776		goto done;
777	ip6 = mtod(m, struct ip6_hdr *);
778
779	/* See if destination IP address was changed by packet filter. */
780	if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) {
781		m->m_flags |= M_SKIP_FIREWALL;
782		/* If destination is now ourself drop to ip6_input(). */
783		if (in6_localip(&ip6->ip6_dst)) {
784			m->m_flags |= M_FASTFWD_OURS;
785			if (m->m_pkthdr.rcvif == NULL)
786				m->m_pkthdr.rcvif = V_loif;
787			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
788				m->m_pkthdr.csum_flags |=
789				    CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR;
790				m->m_pkthdr.csum_data = 0xffff;
791			}
792#ifdef SCTP
793			if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6)
794				m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
795#endif
796			error = netisr_queue(NETISR_IPV6, m);
797			goto done;
798		} else
799			goto again;	/* Redo the routing table lookup. */
800	}
801
802	/* See if local, if yes, send it to netisr. */
803	if (m->m_flags & M_FASTFWD_OURS) {
804		if (m->m_pkthdr.rcvif == NULL)
805			m->m_pkthdr.rcvif = V_loif;
806		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
807			m->m_pkthdr.csum_flags |=
808			    CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR;
809			m->m_pkthdr.csum_data = 0xffff;
810		}
811#ifdef SCTP
812		if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6)
813			m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
814#endif
815		error = netisr_queue(NETISR_IPV6, m);
816		goto done;
817	}
818	/* Or forward to some other address? */
819	if ((m->m_flags & M_IP6_NEXTHOP) &&
820	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
821		dst = (struct sockaddr_in6 *)&ro->ro_dst;
822		bcopy((fwd_tag+1), &dst_sa, sizeof(struct sockaddr_in6));
823		m->m_flags |= M_SKIP_FIREWALL;
824		m->m_flags &= ~M_IP6_NEXTHOP;
825		m_tag_delete(m, fwd_tag);
826		goto again;
827	}
828
829passout:
830	/*
831	 * Send the packet to the outgoing interface.
832	 * If necessary, do IPv6 fragmentation before sending.
833	 *
834	 * the logic here is rather complex:
835	 * 1: normal case (dontfrag == 0, alwaysfrag == 0)
836	 * 1-a:	send as is if tlen <= path mtu
837	 * 1-b:	fragment if tlen > path mtu
838	 *
839	 * 2: if user asks us not to fragment (dontfrag == 1)
840	 * 2-a:	send as is if tlen <= interface mtu
841	 * 2-b:	error if tlen > interface mtu
842	 *
843	 * 3: if we always need to attach fragment header (alwaysfrag == 1)
844	 *	always fragment
845	 *
846	 * 4: if dontfrag == 1 && alwaysfrag == 1
847	 *	error, as we cannot handle this conflicting request
848	 */
849	sw_csum = m->m_pkthdr.csum_flags;
850	if (!hdrsplit) {
851		tso = ((sw_csum & ifp->if_hwassist & CSUM_TSO) != 0) ? 1 : 0;
852		sw_csum &= ~ifp->if_hwassist;
853	} else
854		tso = 0;
855	/*
856	 * If we added extension headers, we will not do TSO and calculate the
857	 * checksums ourselves for now.
858	 * XXX-BZ  Need a framework to know when the NIC can handle it, even
859	 * with ext. hdrs.
860	 */
861	if (sw_csum & CSUM_DELAY_DATA_IPV6) {
862		sw_csum &= ~CSUM_DELAY_DATA_IPV6;
863		in6_delayed_cksum(m, plen, sizeof(struct ip6_hdr));
864	}
865#ifdef SCTP
866	if (sw_csum & CSUM_SCTP_IPV6) {
867		sw_csum &= ~CSUM_SCTP_IPV6;
868		sctp_delayed_cksum(m, sizeof(struct ip6_hdr));
869	}
870#endif
871	m->m_pkthdr.csum_flags &= ifp->if_hwassist;
872	tlen = m->m_pkthdr.len;
873
874	if ((opt && (opt->ip6po_flags & IP6PO_DONTFRAG)) || tso)
875		dontfrag = 1;
876	else
877		dontfrag = 0;
878	if (dontfrag && alwaysfrag) {	/* case 4 */
879		/* conflicting request - can't transmit */
880		error = EMSGSIZE;
881		goto bad;
882	}
883	if (dontfrag && tlen > IN6_LINKMTU(ifp) && !tso) {	/* case 2-b */
884		/*
885		 * Even if the DONTFRAG option is specified, we cannot send the
886		 * packet when the data length is larger than the MTU of the
887		 * outgoing interface.
888		 * Notify the error by sending IPV6_PATHMTU ancillary data if
889		 * application wanted to know the MTU value. Also return an
890		 * error code (this is not described in the API spec).
891		 */
892		if (inp != NULL)
893			ip6_notify_pmtu(inp, &dst_sa, (u_int32_t)mtu);
894		error = EMSGSIZE;
895		goto bad;
896	}
897
898	/*
899	 * transmit packet without fragmentation
900	 */
901	if (dontfrag || (!alwaysfrag && tlen <= mtu)) {	/* case 1-a and 2-a */
902		struct in6_ifaddr *ia6;
903
904		ip6 = mtod(m, struct ip6_hdr *);
905		ia6 = in6_ifawithifp(ifp, &ip6->ip6_src);
906		if (ia6) {
907			/* Record statistics for this interface address. */
908			ia6->ia_ifa.if_opackets++;
909			ia6->ia_ifa.if_obytes += m->m_pkthdr.len;
910			ifa_free(&ia6->ia_ifa);
911		}
912		error = nd6_output(ifp, origifp, m, dst, ro->ro_rt);
913		goto done;
914	}
915
916	/*
917	 * try to fragment the packet.  case 1-b and 3
918	 */
919	if (mtu < IPV6_MMTU) {
920		/* path MTU cannot be less than IPV6_MMTU */
921		error = EMSGSIZE;
922		in6_ifstat_inc(ifp, ifs6_out_fragfail);
923		goto bad;
924	} else if (ip6->ip6_plen == 0) {
925		/* jumbo payload cannot be fragmented */
926		error = EMSGSIZE;
927		in6_ifstat_inc(ifp, ifs6_out_fragfail);
928		goto bad;
929	} else {
930		u_char nextproto;
931
932		int qslots = ifp->if_snd.ifq_maxlen - ifp->if_snd.ifq_len;
933
934		/*
935		 * Too large for the destination or interface;
936		 * fragment if possible.
937		 * Must be able to put at least 8 bytes per fragment.
938		 */
939		hlen = unfragpartlen;
940		if (mtu > IPV6_MAXPACKET)
941			mtu = IPV6_MAXPACKET;
942
943		len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7;
944		if (len < 8) {
945			error = EMSGSIZE;
946			in6_ifstat_inc(ifp, ifs6_out_fragfail);
947			goto bad;
948		}
949
950		/*
951		 * Verify that we have any chance at all of being able to queue
952		 *      the packet or packet fragments
953		 */
954		if (qslots <= 0 || ((u_int)qslots * (mtu - hlen)
955		    < tlen  /* - hlen */)) {
956			error = ENOBUFS;
957			IP6STAT_INC(ip6s_odropped);
958			goto bad;
959		}
960
961
962		/*
963		 * If the interface will not calculate checksums on
964		 * fragmented packets, then do it here.
965		 * XXX-BZ handle the hw offloading case.  Need flags.
966		 */
967		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
968			in6_delayed_cksum(m, plen, hlen);
969			m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
970		}
971#ifdef SCTP
972		if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) {
973			sctp_delayed_cksum(m, hlen);
974			m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6;
975		}
976#endif
977		/*
978		 * Change the next header field of the last header in the
979		 * unfragmentable part.
980		 */
981		if (exthdrs.ip6e_rthdr) {
982			nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *);
983			*mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT;
984		} else if (exthdrs.ip6e_dest1) {
985			nextproto = *mtod(exthdrs.ip6e_dest1, u_char *);
986			*mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT;
987		} else if (exthdrs.ip6e_hbh) {
988			nextproto = *mtod(exthdrs.ip6e_hbh, u_char *);
989			*mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT;
990		} else {
991			nextproto = ip6->ip6_nxt;
992			ip6->ip6_nxt = IPPROTO_FRAGMENT;
993		}
994
995		/*
996		 * Loop through length of segment after first fragment,
997		 * make new header and copy data of each part and link onto
998		 * chain.
999		 */
1000		m0 = m;
1001		id = htonl(ip6_randomid());
1002		if ((error = ip6_fragment(ifp, m, hlen, nextproto, len, id)))
1003			goto sendorfree;
1004
1005		in6_ifstat_inc(ifp, ifs6_out_fragok);
1006	}
1007
1008	/*
1009	 * Remove leading garbages.
1010	 */
1011sendorfree:
1012	m = m0->m_nextpkt;
1013	m0->m_nextpkt = 0;
1014	m_freem(m0);
1015	for (m0 = m; m; m = m0) {
1016		m0 = m->m_nextpkt;
1017		m->m_nextpkt = 0;
1018		if (error == 0) {
1019			/* Record statistics for this interface address. */
1020			if (ia) {
1021				ia->ia_ifa.if_opackets++;
1022				ia->ia_ifa.if_obytes += m->m_pkthdr.len;
1023			}
1024			error = nd6_output(ifp, origifp, m, dst, ro->ro_rt);
1025		} else
1026			m_freem(m);
1027	}
1028
1029	if (error == 0)
1030		IP6STAT_INC(ip6s_fragmented);
1031
1032done:
1033	if (ro == &ip6route)
1034		RO_RTFREE(ro);
1035	if (ro_pmtu == &ip6route)
1036		RO_RTFREE(ro_pmtu);
1037	return (error);
1038
1039freehdrs:
1040	m_freem(exthdrs.ip6e_hbh);	/* m_freem will check if mbuf is 0 */
1041	m_freem(exthdrs.ip6e_dest1);
1042	m_freem(exthdrs.ip6e_rthdr);
1043	m_freem(exthdrs.ip6e_dest2);
1044	/* FALLTHROUGH */
1045bad:
1046	if (m)
1047		m_freem(m);
1048	goto done;
1049}
1050
1051static int
1052ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen)
1053{
1054	struct mbuf *m;
1055
1056	if (hlen > MCLBYTES)
1057		return (ENOBUFS); /* XXX */
1058
1059	if (hlen > MLEN)
1060		m = m_getcl(M_NOWAIT, MT_DATA, 0);
1061	else
1062		m = m_get(M_NOWAIT, MT_DATA);
1063	if (m == NULL)
1064		return (ENOBUFS);
1065	m->m_len = hlen;
1066	if (hdr)
1067		bcopy(hdr, mtod(m, caddr_t), hlen);
1068
1069	*mp = m;
1070	return (0);
1071}
1072
1073/*
1074 * Insert jumbo payload option.
1075 */
1076static int
1077ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen)
1078{
1079	struct mbuf *mopt;
1080	u_char *optbuf;
1081	u_int32_t v;
1082
1083#define JUMBOOPTLEN	8	/* length of jumbo payload option and padding */
1084
1085	/*
1086	 * If there is no hop-by-hop options header, allocate new one.
1087	 * If there is one but it doesn't have enough space to store the
1088	 * jumbo payload option, allocate a cluster to store the whole options.
1089	 * Otherwise, use it to store the options.
1090	 */
1091	if (exthdrs->ip6e_hbh == 0) {
1092		mopt = m_get(M_NOWAIT, MT_DATA);
1093		if (mopt == NULL)
1094			return (ENOBUFS);
1095		mopt->m_len = JUMBOOPTLEN;
1096		optbuf = mtod(mopt, u_char *);
1097		optbuf[1] = 0;	/* = ((JUMBOOPTLEN) >> 3) - 1 */
1098		exthdrs->ip6e_hbh = mopt;
1099	} else {
1100		struct ip6_hbh *hbh;
1101
1102		mopt = exthdrs->ip6e_hbh;
1103		if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) {
1104			/*
1105			 * XXX assumption:
1106			 * - exthdrs->ip6e_hbh is not referenced from places
1107			 *   other than exthdrs.
1108			 * - exthdrs->ip6e_hbh is not an mbuf chain.
1109			 */
1110			int oldoptlen = mopt->m_len;
1111			struct mbuf *n;
1112
1113			/*
1114			 * XXX: give up if the whole (new) hbh header does
1115			 * not fit even in an mbuf cluster.
1116			 */
1117			if (oldoptlen + JUMBOOPTLEN > MCLBYTES)
1118				return (ENOBUFS);
1119
1120			/*
1121			 * As a consequence, we must always prepare a cluster
1122			 * at this point.
1123			 */
1124			n = m_getcl(M_NOWAIT, MT_DATA, 0);
1125			if (n == NULL)
1126				return (ENOBUFS);
1127			n->m_len = oldoptlen + JUMBOOPTLEN;
1128			bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t),
1129			    oldoptlen);
1130			optbuf = mtod(n, caddr_t) + oldoptlen;
1131			m_freem(mopt);
1132			mopt = exthdrs->ip6e_hbh = n;
1133		} else {
1134			optbuf = mtod(mopt, u_char *) + mopt->m_len;
1135			mopt->m_len += JUMBOOPTLEN;
1136		}
1137		optbuf[0] = IP6OPT_PADN;
1138		optbuf[1] = 1;
1139
1140		/*
1141		 * Adjust the header length according to the pad and
1142		 * the jumbo payload option.
1143		 */
1144		hbh = mtod(mopt, struct ip6_hbh *);
1145		hbh->ip6h_len += (JUMBOOPTLEN >> 3);
1146	}
1147
1148	/* fill in the option. */
1149	optbuf[2] = IP6OPT_JUMBO;
1150	optbuf[3] = 4;
1151	v = (u_int32_t)htonl(plen + JUMBOOPTLEN);
1152	bcopy(&v, &optbuf[4], sizeof(u_int32_t));
1153
1154	/* finally, adjust the packet header length */
1155	exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN;
1156
1157	return (0);
1158#undef JUMBOOPTLEN
1159}
1160
1161/*
1162 * Insert fragment header and copy unfragmentable header portions.
1163 */
1164static int
1165ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen,
1166    struct ip6_frag **frghdrp)
1167{
1168	struct mbuf *n, *mlast;
1169
1170	if (hlen > sizeof(struct ip6_hdr)) {
1171		n = m_copym(m0, sizeof(struct ip6_hdr),
1172		    hlen - sizeof(struct ip6_hdr), M_NOWAIT);
1173		if (n == 0)
1174			return (ENOBUFS);
1175		m->m_next = n;
1176	} else
1177		n = m;
1178
1179	/* Search for the last mbuf of unfragmentable part. */
1180	for (mlast = n; mlast->m_next; mlast = mlast->m_next)
1181		;
1182
1183	if ((mlast->m_flags & M_EXT) == 0 &&
1184	    M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) {
1185		/* use the trailing space of the last mbuf for the fragment hdr */
1186		*frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) +
1187		    mlast->m_len);
1188		mlast->m_len += sizeof(struct ip6_frag);
1189		m->m_pkthdr.len += sizeof(struct ip6_frag);
1190	} else {
1191		/* allocate a new mbuf for the fragment header */
1192		struct mbuf *mfrg;
1193
1194		mfrg = m_get(M_NOWAIT, MT_DATA);
1195		if (mfrg == NULL)
1196			return (ENOBUFS);
1197		mfrg->m_len = sizeof(struct ip6_frag);
1198		*frghdrp = mtod(mfrg, struct ip6_frag *);
1199		mlast->m_next = mfrg;
1200	}
1201
1202	return (0);
1203}
1204
1205static int
1206ip6_getpmtu(struct route_in6 *ro_pmtu, struct route_in6 *ro,
1207    struct ifnet *ifp, struct in6_addr *dst, u_long *mtup,
1208    int *alwaysfragp, u_int fibnum)
1209{
1210	u_int32_t mtu = 0;
1211	int alwaysfrag = 0;
1212	int error = 0;
1213
1214	if (ro_pmtu != ro) {
1215		/* The first hop and the final destination may differ. */
1216		struct sockaddr_in6 *sa6_dst =
1217		    (struct sockaddr_in6 *)&ro_pmtu->ro_dst;
1218		if (ro_pmtu->ro_rt &&
1219		    ((ro_pmtu->ro_rt->rt_flags & RTF_UP) == 0 ||
1220		     !IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst))) {
1221			RTFREE(ro_pmtu->ro_rt);
1222			ro_pmtu->ro_rt = (struct rtentry *)NULL;
1223		}
1224		if (ro_pmtu->ro_rt == NULL) {
1225			bzero(sa6_dst, sizeof(*sa6_dst));
1226			sa6_dst->sin6_family = AF_INET6;
1227			sa6_dst->sin6_len = sizeof(struct sockaddr_in6);
1228			sa6_dst->sin6_addr = *dst;
1229
1230			in6_rtalloc(ro_pmtu, fibnum);
1231		}
1232	}
1233	if (ro_pmtu->ro_rt) {
1234		u_int32_t ifmtu;
1235		struct in_conninfo inc;
1236
1237		bzero(&inc, sizeof(inc));
1238		inc.inc_flags |= INC_ISIPV6;
1239		inc.inc6_faddr = *dst;
1240
1241		if (ifp == NULL)
1242			ifp = ro_pmtu->ro_rt->rt_ifp;
1243		ifmtu = IN6_LINKMTU(ifp);
1244		mtu = tcp_hc_getmtu(&inc);
1245		if (mtu)
1246			mtu = min(mtu, ro_pmtu->ro_rt->rt_mtu);
1247		else
1248			mtu = ro_pmtu->ro_rt->rt_mtu;
1249		if (mtu == 0)
1250			mtu = ifmtu;
1251		else if (mtu < IPV6_MMTU) {
1252			/*
1253			 * RFC2460 section 5, last paragraph:
1254			 * if we record ICMPv6 too big message with
1255			 * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU
1256			 * or smaller, with framgent header attached.
1257			 * (fragment header is needed regardless from the
1258			 * packet size, for translators to identify packets)
1259			 */
1260			alwaysfrag = 1;
1261			mtu = IPV6_MMTU;
1262		} else if (mtu > ifmtu) {
1263			/*
1264			 * The MTU on the route is larger than the MTU on
1265			 * the interface!  This shouldn't happen, unless the
1266			 * MTU of the interface has been changed after the
1267			 * interface was brought up.  Change the MTU in the
1268			 * route to match the interface MTU (as long as the
1269			 * field isn't locked).
1270			 */
1271			mtu = ifmtu;
1272			ro_pmtu->ro_rt->rt_mtu = mtu;
1273		}
1274	} else if (ifp) {
1275		mtu = IN6_LINKMTU(ifp);
1276	} else
1277		error = EHOSTUNREACH; /* XXX */
1278
1279	*mtup = mtu;
1280	if (alwaysfragp)
1281		*alwaysfragp = alwaysfrag;
1282	return (error);
1283}
1284
/*
 * IP6 socket option processing.
 *
 * Handles the get/set request described by sopt (direction in
 * sopt->sopt_dir) for socket so.
 *
 * For a level other than IPPROTO_IPV6 the result is EINVAL, except that
 * SOL_SOCKET set requests for SO_REUSEADDR, SO_REUSEPORT and SO_SETFIB
 * copy the socket-level setting into the inpcb and return 0.
 * For IPPROTO_IPV6 the request is dispatched on sopt->sopt_name;
 * option names that are not recognized yield ENOPROTOOPT.
 *
 * Returns 0 on success or an errno value.
 */
int
ip6_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int optdatalen, uproto;
	void *optdata;
	struct inpcb *in6p = sotoinpcb(so);
	int error, optval;
	int level, op, optname;
	int optlen;
	struct thread *td;

	level = sopt->sopt_level;
	op = sopt->sopt_dir;
	optname = sopt->sopt_name;
	optlen = sopt->sopt_valsize;
	td = sopt->sopt_td;
	error = 0;
	optval = 0;
	uproto = (int)so->so_proto->pr_protocol;

	if (level != IPPROTO_IPV6) {
		error = EINVAL;

		/*
		 * A few socket-level options are mirrored into the inpcb;
		 * the value is taken from the socket itself, not from sopt.
		 */
		if (sopt->sopt_level == SOL_SOCKET &&
		    sopt->sopt_dir == SOPT_SET) {
			switch (sopt->sopt_name) {
			case SO_REUSEADDR:
				INP_WLOCK(in6p);
				if ((so->so_options & SO_REUSEADDR) != 0)
					in6p->inp_flags2 |= INP_REUSEADDR;
				else
					in6p->inp_flags2 &= ~INP_REUSEADDR;
				INP_WUNLOCK(in6p);
				error = 0;
				break;
			case SO_REUSEPORT:
				INP_WLOCK(in6p);
				if ((so->so_options & SO_REUSEPORT) != 0)
					in6p->inp_flags2 |= INP_REUSEPORT;
				else
					in6p->inp_flags2 &= ~INP_REUSEPORT;
				INP_WUNLOCK(in6p);
				error = 0;
				break;
			case SO_SETFIB:
				INP_WLOCK(in6p);
				in6p->inp_inc.inc_fibnum = so->so_fibnum;
				INP_WUNLOCK(in6p);
				error = 0;
				break;
			default:
				break;
			}
		}
	} else {		/* level == IPPROTO_IPV6 */
		switch (op) {

		case SOPT_SET:
			switch (optname) {
			case IPV6_2292PKTOPTIONS:
#ifdef IPV6_PKTOPTIONS
			case IPV6_PKTOPTIONS:
#endif
			{
				struct mbuf *m;

				error = soopt_getm(sopt, &m); /* XXX */
				if (error != 0)
					break;
				error = soopt_mcopyin(sopt, m); /* XXX */
				if (error != 0)
					break;
				error = ip6_pcbopts(&in6p->in6p_outputopts,
						    m, so, sopt);
				m_freem(m); /* XXX */
				break;
			}

			/*
			 * Use of some Hop-by-Hop options or some
			 * Destination options, might require special
			 * privilege.  That is, normal applications
			 * (without special privilege) might be forbidden
			 * from setting certain options in outgoing packets,
			 * and might never see certain options in received
			 * packets. [RFC 2292 Section 6]
			 * KAME specific note:
			 *  KAME prevents non-privileged users from sending or
			 *  receiving ANY hbh/dst options in order to avoid
			 *  overhead of parsing options in the kernel.
			 */
			case IPV6_RECVHOPOPTS:
			case IPV6_RECVDSTOPTS:
			case IPV6_RECVRTHDRDSTOPTS:
				if (td != NULL) {
					error = priv_check(td,
					    PRIV_NETINET_SETHDROPTS);
					if (error)
						break;
				}
				/* FALLTHROUGH */
			case IPV6_UNICAST_HOPS:
			case IPV6_HOPLIMIT:
			case IPV6_FAITH:

			case IPV6_RECVPKTINFO:
			case IPV6_RECVHOPLIMIT:
			case IPV6_RECVRTHDR:
			case IPV6_RECVPATHMTU:
			case IPV6_RECVTCLASS:
			case IPV6_V6ONLY:
			case IPV6_AUTOFLOWLABEL:
			case IPV6_BINDANY:
				if (optname == IPV6_BINDANY && td != NULL) {
					error = priv_check(td,
					    PRIV_NETINET_BINDANY);
					if (error)
						break;
				}

				/* All options in this group take exactly one int. */
				if (optlen != sizeof(int)) {
					error = EINVAL;
					break;
				}
				error = sooptcopyin(sopt, &optval,
					sizeof optval, sizeof optval);
				if (error)
					break;
				switch (optname) {

				case IPV6_UNICAST_HOPS:
					if (optval < -1 || optval >= 256)
						error = EINVAL;
					else {
						/* -1 = kernel default */
						in6p->in6p_hops = optval;
						if ((in6p->inp_vflag &
						     INP_IPV4) != 0)
							in6p->inp_ip_ttl = optval;
					}
					break;
#define OPTSET(bit) \
do { \
	INP_WLOCK(in6p); \
	if (optval) \
		in6p->inp_flags |= (bit); \
	else \
		in6p->inp_flags &= ~(bit); \
	INP_WUNLOCK(in6p); \
} while (/*CONSTCOND*/ 0)
#define OPTSET2292(bit) \
do { \
	INP_WLOCK(in6p); \
	in6p->inp_flags |= IN6P_RFC2292; \
	if (optval) \
		in6p->inp_flags |= (bit); \
	else \
		in6p->inp_flags &= ~(bit); \
	INP_WUNLOCK(in6p); \
} while (/*CONSTCOND*/ 0)
#define OPTBIT(bit) (in6p->inp_flags & (bit) ? 1 : 0)

				case IPV6_RECVPKTINFO:
					/* cannot mix with RFC2292 */
					if (OPTBIT(IN6P_RFC2292)) {
						error = EINVAL;
						break;
					}
					OPTSET(IN6P_PKTINFO);
					break;

				case IPV6_HOPLIMIT:
				{
					struct ip6_pktopts **optp;

					/* cannot mix with RFC2292 */
					if (OPTBIT(IN6P_RFC2292)) {
						error = EINVAL;
						break;
					}
					optp = &in6p->in6p_outputopts;
					error = ip6_pcbopt(IPV6_HOPLIMIT,
					    (u_char *)&optval, sizeof(optval),
					    optp, (td != NULL) ? td->td_ucred :
					    NULL, uproto);
					break;
				}

				case IPV6_RECVHOPLIMIT:
					/* cannot mix with RFC2292 */
					if (OPTBIT(IN6P_RFC2292)) {
						error = EINVAL;
						break;
					}
					OPTSET(IN6P_HOPLIMIT);
					break;

				case IPV6_RECVHOPOPTS:
					/* cannot mix with RFC2292 */
					if (OPTBIT(IN6P_RFC2292)) {
						error = EINVAL;
						break;
					}
					OPTSET(IN6P_HOPOPTS);
					break;

				case IPV6_RECVDSTOPTS:
					/* cannot mix with RFC2292 */
					if (OPTBIT(IN6P_RFC2292)) {
						error = EINVAL;
						break;
					}
					OPTSET(IN6P_DSTOPTS);
					break;

				case IPV6_RECVRTHDRDSTOPTS:
					/* cannot mix with RFC2292 */
					if (OPTBIT(IN6P_RFC2292)) {
						error = EINVAL;
						break;
					}
					OPTSET(IN6P_RTHDRDSTOPTS);
					break;

				case IPV6_RECVRTHDR:
					/* cannot mix with RFC2292 */
					if (OPTBIT(IN6P_RFC2292)) {
						error = EINVAL;
						break;
					}
					OPTSET(IN6P_RTHDR);
					break;

				case IPV6_FAITH:
					OPTSET(INP_FAITH);
					break;

				case IPV6_RECVPATHMTU:
					/*
					 * We ignore this option for TCP
					 * sockets.
					 * (RFC3542 leaves this case
					 * unspecified.)
					 */
					if (uproto != IPPROTO_TCP)
						OPTSET(IN6P_MTU);
					break;

				case IPV6_V6ONLY:
					/*
					 * make setsockopt(IPV6_V6ONLY)
					 * available only prior to bind(2).
					 * see ipng mailing list, Jun 22 2001.
					 */
					if (in6p->inp_lport ||
					    !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) {
						error = EINVAL;
						break;
					}
					OPTSET(IN6P_IPV6_V6ONLY);
					/*
					 * NOTE(review): inp_vflag is updated
					 * here after OPTSET has already
					 * released the inpcb lock.
					 */
					if (optval)
						in6p->inp_vflag &= ~INP_IPV4;
					else
						in6p->inp_vflag |= INP_IPV4;
					break;
				case IPV6_RECVTCLASS:
					/* cannot mix with RFC2292 XXX */
					if (OPTBIT(IN6P_RFC2292)) {
						error = EINVAL;
						break;
					}
					OPTSET(IN6P_TCLASS);
					break;
				case IPV6_AUTOFLOWLABEL:
					OPTSET(IN6P_AUTOFLOWLABEL);
					break;

				case IPV6_BINDANY:
					OPTSET(INP_BINDANY);
					break;
				}
				break;

			case IPV6_TCLASS:
			case IPV6_DONTFRAG:
			case IPV6_USE_MIN_MTU:
			case IPV6_PREFER_TEMPADDR:
				/* int-valued options stored in in6p_outputopts */
				if (optlen != sizeof(optval)) {
					error = EINVAL;
					break;
				}
				error = sooptcopyin(sopt, &optval,
					sizeof optval, sizeof optval);
				if (error)
					break;
				{
					struct ip6_pktopts **optp;
					optp = &in6p->in6p_outputopts;
					error = ip6_pcbopt(optname,
					    (u_char *)&optval, sizeof(optval),
					    optp, (td != NULL) ? td->td_ucred :
					    NULL, uproto);
					break;
				}

			case IPV6_2292PKTINFO:
			case IPV6_2292HOPLIMIT:
			case IPV6_2292HOPOPTS:
			case IPV6_2292DSTOPTS:
			case IPV6_2292RTHDR:
				/* RFC 2292 */
				if (optlen != sizeof(int)) {
					error = EINVAL;
					break;
				}
				error = sooptcopyin(sopt, &optval,
					sizeof optval, sizeof optval);
				if (error)
					break;
				/*
				 * OPTSET2292 also sets IN6P_RFC2292 on the
				 * pcb, which makes the "cannot mix with
				 * RFC2292" checks in this function fail with
				 * EINVAL from then on.
				 */
				switch (optname) {
				case IPV6_2292PKTINFO:
					OPTSET2292(IN6P_PKTINFO);
					break;
				case IPV6_2292HOPLIMIT:
					OPTSET2292(IN6P_HOPLIMIT);
					break;
				case IPV6_2292HOPOPTS:
					/*
					 * Check super-user privilege.
					 * See comments for IPV6_RECVHOPOPTS.
					 */
					if (td != NULL) {
						error = priv_check(td,
						    PRIV_NETINET_SETHDROPTS);
						if (error)
							return (error);
					}
					OPTSET2292(IN6P_HOPOPTS);
					break;
				case IPV6_2292DSTOPTS:
					if (td != NULL) {
						error = priv_check(td,
						    PRIV_NETINET_SETHDROPTS);
						if (error)
							return (error);
					}
					OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */
					break;
				case IPV6_2292RTHDR:
					OPTSET2292(IN6P_RTHDR);
					break;
				}
				break;
			case IPV6_PKTINFO:
			case IPV6_HOPOPTS:
			case IPV6_RTHDR:
			case IPV6_DSTOPTS:
			case IPV6_RTHDRDSTOPTS:
			case IPV6_NEXTHOP:
			{
				/* new advanced API (RFC3542) */
				u_char *optbuf;
				/* NOTE(review): MCLBYTES-sized buffer on the stack. */
				u_char optbuf_storage[MCLBYTES];
				/* shadows the function-level optlen */
				int optlen;
				struct ip6_pktopts **optp;

				/* cannot mix with RFC2292 */
				if (OPTBIT(IN6P_RFC2292)) {
					error = EINVAL;
					break;
				}

				/*
				 * We only ensure valsize is not too large
				 * here.  Further validation will be done
				 * later.
				 */
				error = sooptcopyin(sopt, optbuf_storage,
				    sizeof(optbuf_storage), 0);
				if (error)
					break;
				optlen = sopt->sopt_valsize;
				optbuf = optbuf_storage;
				optp = &in6p->in6p_outputopts;
				error = ip6_pcbopt(optname, optbuf, optlen,
				    optp, (td != NULL) ? td->td_ucred : NULL,
				    uproto);
				break;
			}
/* OPTBIT stays defined: the SOPT_GET code below still uses it. */
#undef OPTSET

			case IPV6_MULTICAST_IF:
			case IPV6_MULTICAST_HOPS:
			case IPV6_MULTICAST_LOOP:
			case IPV6_JOIN_GROUP:
			case IPV6_LEAVE_GROUP:
			case IPV6_MSFILTER:
			case MCAST_BLOCK_SOURCE:
			case MCAST_UNBLOCK_SOURCE:
			case MCAST_JOIN_GROUP:
			case MCAST_LEAVE_GROUP:
			case MCAST_JOIN_SOURCE_GROUP:
			case MCAST_LEAVE_SOURCE_GROUP:
				error = ip6_setmoptions(in6p, sopt);
				break;

			case IPV6_PORTRANGE:
				error = sooptcopyin(sopt, &optval,
				    sizeof optval, sizeof optval);
				if (error)
					break;

				/* INP_LOWPORT and INP_HIGHPORT are kept mutually exclusive. */
				INP_WLOCK(in6p);
				switch (optval) {
				case IPV6_PORTRANGE_DEFAULT:
					in6p->inp_flags &= ~(INP_LOWPORT);
					in6p->inp_flags &= ~(INP_HIGHPORT);
					break;

				case IPV6_PORTRANGE_HIGH:
					in6p->inp_flags &= ~(INP_LOWPORT);
					in6p->inp_flags |= INP_HIGHPORT;
					break;

				case IPV6_PORTRANGE_LOW:
					in6p->inp_flags &= ~(INP_HIGHPORT);
					in6p->inp_flags |= INP_LOWPORT;
					break;

				default:
					error = EINVAL;
					break;
				}
				INP_WUNLOCK(in6p);
				break;

#ifdef IPSEC
			case IPV6_IPSEC_POLICY:
			{
				caddr_t req;
				struct mbuf *m;

				if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
					break;
				if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
					break;
				req = mtod(m, caddr_t);
				error = ipsec_set_policy(in6p, optname, req,
				    m->m_len, (sopt->sopt_td != NULL) ?
				    sopt->sopt_td->td_ucred : NULL);
				m_freem(m);
				break;
			}
#endif /* IPSEC */

			default:
				error = ENOPROTOOPT;
				break;
			}
			break;

		case SOPT_GET:
			switch (optname) {

			case IPV6_2292PKTOPTIONS:
#ifdef IPV6_PKTOPTIONS
			case IPV6_PKTOPTIONS:
#endif
				/*
				 * RFC3542 (effectively) deprecated the
				 * semantics of the 2292-style pktoptions.
				 * Since it was not reliable in nature (i.e.,
				 * applications had to expect the lack of some
				 * information after all), it would make sense
				 * to simplify this part by always returning
				 * empty data.
				 */
				sopt->sopt_valsize = 0;
				break;

			case IPV6_RECVHOPOPTS:
			case IPV6_RECVDSTOPTS:
			case IPV6_RECVRTHDRDSTOPTS:
			case IPV6_UNICAST_HOPS:
			case IPV6_RECVPKTINFO:
			case IPV6_RECVHOPLIMIT:
			case IPV6_RECVRTHDR:
			case IPV6_RECVPATHMTU:

			case IPV6_FAITH:
			case IPV6_V6ONLY:
			case IPV6_PORTRANGE:
			case IPV6_RECVTCLASS:
			case IPV6_AUTOFLOWLABEL:
			case IPV6_BINDANY:
				/* int-valued options: compute optval, then copy it out */
				switch (optname) {

				case IPV6_RECVHOPOPTS:
					optval = OPTBIT(IN6P_HOPOPTS);
					break;

				case IPV6_RECVDSTOPTS:
					optval = OPTBIT(IN6P_DSTOPTS);
					break;

				case IPV6_RECVRTHDRDSTOPTS:
					optval = OPTBIT(IN6P_RTHDRDSTOPTS);
					break;

				case IPV6_UNICAST_HOPS:
					optval = in6p->in6p_hops;
					break;

				case IPV6_RECVPKTINFO:
					optval = OPTBIT(IN6P_PKTINFO);
					break;

				case IPV6_RECVHOPLIMIT:
					optval = OPTBIT(IN6P_HOPLIMIT);
					break;

				case IPV6_RECVRTHDR:
					optval = OPTBIT(IN6P_RTHDR);
					break;

				case IPV6_RECVPATHMTU:
					optval = OPTBIT(IN6P_MTU);
					break;

				case IPV6_FAITH:
					optval = OPTBIT(INP_FAITH);
					break;

				case IPV6_V6ONLY:
					optval = OPTBIT(IN6P_IPV6_V6ONLY);
					break;

				case IPV6_PORTRANGE:
				    {
					int flags;
					flags = in6p->inp_flags;
					if (flags & INP_HIGHPORT)
						optval = IPV6_PORTRANGE_HIGH;
					else if (flags & INP_LOWPORT)
						optval = IPV6_PORTRANGE_LOW;
					else
						optval = 0;
					break;
				    }
				case IPV6_RECVTCLASS:
					optval = OPTBIT(IN6P_TCLASS);
					break;

				case IPV6_AUTOFLOWLABEL:
					optval = OPTBIT(IN6P_AUTOFLOWLABEL);
					break;

				case IPV6_BINDANY:
					optval = OPTBIT(INP_BINDANY);
					break;
				}
				if (error)
					break;
				error = sooptcopyout(sopt, &optval,
					sizeof optval);
				break;

			case IPV6_PATHMTU:
			{
				u_long pmtu = 0;
				struct ip6_mtuinfo mtuinfo;
				struct route_in6 sro;

				bzero(&sro, sizeof(sro));

				if (!(so->so_state & SS_ISCONNECTED))
					return (ENOTCONN);
				/*
				 * XXX: we do not consider the case of source
				 * routing, or optional information to specify
				 * the outgoing interface.
				 */
				error = ip6_getpmtu(&sro, NULL, NULL,
				    &in6p->in6p_faddr, &pmtu, NULL,
				    so->so_fibnum);
				/* sro is a temporary; release the route it picked up */
				if (sro.ro_rt)
					RTFREE(sro.ro_rt);
				if (error)
					break;
				if (pmtu > IPV6_MAXPACKET)
					pmtu = IPV6_MAXPACKET;

				bzero(&mtuinfo, sizeof(mtuinfo));
				mtuinfo.ip6m_mtu = (u_int32_t)pmtu;
				optdata = (void *)&mtuinfo;
				optdatalen = sizeof(mtuinfo);
				error = sooptcopyout(sopt, optdata,
				    optdatalen);
				break;
			}

			case IPV6_2292PKTINFO:
			case IPV6_2292HOPLIMIT:
			case IPV6_2292HOPOPTS:
			case IPV6_2292RTHDR:
			case IPV6_2292DSTOPTS:
				switch (optname) {
				case IPV6_2292PKTINFO:
					optval = OPTBIT(IN6P_PKTINFO);
					break;
				case IPV6_2292HOPLIMIT:
					optval = OPTBIT(IN6P_HOPLIMIT);
					break;
				case IPV6_2292HOPOPTS:
					optval = OPTBIT(IN6P_HOPOPTS);
					break;
				case IPV6_2292RTHDR:
					optval = OPTBIT(IN6P_RTHDR);
					break;
				case IPV6_2292DSTOPTS:
					optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS);
					break;
				}
				error = sooptcopyout(sopt, &optval,
				    sizeof optval);
				break;
			case IPV6_PKTINFO:
			case IPV6_HOPOPTS:
			case IPV6_RTHDR:
			case IPV6_DSTOPTS:
			case IPV6_RTHDRDSTOPTS:
			case IPV6_NEXTHOP:
			case IPV6_TCLASS:
			case IPV6_DONTFRAG:
			case IPV6_USE_MIN_MTU:
			case IPV6_PREFER_TEMPADDR:
				error = ip6_getpcbopt(in6p->in6p_outputopts,
				    optname, sopt);
				break;

			case IPV6_MULTICAST_IF:
			case IPV6_MULTICAST_HOPS:
			case IPV6_MULTICAST_LOOP:
			case IPV6_MSFILTER:
				error = ip6_getmoptions(in6p, sopt);
				break;

#ifdef IPSEC
			case IPV6_IPSEC_POLICY:
			  {
				caddr_t req = NULL;
				size_t len = 0;
				struct mbuf *m = NULL;
				struct mbuf **mp = &m;
				size_t ovalsize = sopt->sopt_valsize;
				caddr_t oval = (caddr_t)sopt->sopt_val;

				error = soopt_getm(sopt, &m); /* XXX */
				if (error != 0)
					break;
				error = soopt_mcopyin(sopt, m); /* XXX */
				if (error != 0)
					break;
				/* put back the user buffer description saved above */
				sopt->sopt_valsize = ovalsize;
				sopt->sopt_val = oval;
				if (m) {
					req = mtod(m, caddr_t);
					len = m->m_len;
				}
				/*
				 * NOTE(review): mp aliases m, so the callee
				 * can replace the request mbuf pointer; who
				 * frees the request mbuf (also on error)
				 * should be confirmed against
				 * ipsec_get_policy().
				 */
				error = ipsec_get_policy(in6p, req, len, mp);
				if (error == 0)
					error = soopt_mcopyout(sopt, m); /* XXX */
				if (error == 0 && m)
					m_freem(m);
				break;
			  }
#endif /* IPSEC */

			default:
				error = ENOPROTOOPT;
				break;
			}
			break;
		}
	}
	return (error);
}
1975
1976int
1977ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt)
1978{
1979	int error = 0, optval, optlen;
1980	const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum);
1981	struct inpcb *in6p = sotoinpcb(so);
1982	int level, op, optname;
1983
1984	level = sopt->sopt_level;
1985	op = sopt->sopt_dir;
1986	optname = sopt->sopt_name;
1987	optlen = sopt->sopt_valsize;
1988
1989	if (level != IPPROTO_IPV6) {
1990		return (EINVAL);
1991	}
1992
1993	switch (optname) {
1994	case IPV6_CHECKSUM:
1995		/*
1996		 * For ICMPv6 sockets, no modification allowed for checksum
1997		 * offset, permit "no change" values to help existing apps.
1998		 *
1999		 * RFC3542 says: "An attempt to set IPV6_CHECKSUM
2000		 * for an ICMPv6 socket will fail."
2001		 * The current behavior does not meet RFC3542.
2002		 */
2003		switch (op) {
2004		case SOPT_SET:
2005			if (optlen != sizeof(int)) {
2006				error = EINVAL;
2007				break;
2008			}
2009			error = sooptcopyin(sopt, &optval, sizeof(optval),
2010					    sizeof(optval));
2011			if (error)
2012				break;
2013			if ((optval % 2) != 0) {
2014				/* the API assumes even offset values */
2015				error = EINVAL;
2016			} else if (so->so_proto->pr_protocol ==
2017			    IPPROTO_ICMPV6) {
2018				if (optval != icmp6off)
2019					error = EINVAL;
2020			} else
2021				in6p->in6p_cksum = optval;
2022			break;
2023
2024		case SOPT_GET:
2025			if (so->so_proto->pr_protocol == IPPROTO_ICMPV6)
2026				optval = icmp6off;
2027			else
2028				optval = in6p->in6p_cksum;
2029
2030			error = sooptcopyout(sopt, &optval, sizeof(optval));
2031			break;
2032
2033		default:
2034			error = EINVAL;
2035			break;
2036		}
2037		break;
2038
2039	default:
2040		error = ENOPROTOOPT;
2041		break;
2042	}
2043
2044	return (error);
2045}
2046
2047/*
2048 * Set up IP6 options in pcb for insertion in output packets or
2049 * specifying behavior of outgoing packets.
2050 */
2051static int
2052ip6_pcbopts(struct ip6_pktopts **pktopt, struct mbuf *m,
2053    struct socket *so, struct sockopt *sopt)
2054{
2055	struct ip6_pktopts *opt = *pktopt;
2056	int error = 0;
2057	struct thread *td = sopt->sopt_td;
2058
2059	/* turn off any old options. */
2060	if (opt) {
2061#ifdef DIAGNOSTIC
2062		if (opt->ip6po_pktinfo || opt->ip6po_nexthop ||
2063		    opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 ||
2064		    opt->ip6po_rhinfo.ip6po_rhi_rthdr)
2065			printf("ip6_pcbopts: all specified options are cleared.\n");
2066#endif
2067		ip6_clearpktopts(opt, -1);
2068	} else
2069		opt = malloc(sizeof(*opt), M_IP6OPT, M_WAITOK);
2070	*pktopt = NULL;
2071
2072	if (!m || m->m_len == 0) {
2073		/*
2074		 * Only turning off any previous options, regardless of
2075		 * whether the opt is just created or given.
2076		 */
2077		free(opt, M_IP6OPT);
2078		return (0);
2079	}
2080
2081	/*  set options specified by user. */
2082	if ((error = ip6_setpktopts(m, opt, NULL, (td != NULL) ?
2083	    td->td_ucred : NULL, so->so_proto->pr_protocol)) != 0) {
2084		ip6_clearpktopts(opt, -1); /* XXX: discard all options */
2085		free(opt, M_IP6OPT);
2086		return (error);
2087	}
2088	*pktopt = opt;
2089	return (0);
2090}
2091
2092/*
2093 * initialize ip6_pktopts.  beware that there are non-zero default values in
2094 * the struct.
2095 */
2096void
2097ip6_initpktopts(struct ip6_pktopts *opt)
2098{
2099
2100	bzero(opt, sizeof(*opt));
2101	opt->ip6po_hlim = -1;	/* -1 means default hop limit */
2102	opt->ip6po_tclass = -1;	/* -1 means default traffic class */
2103	opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY;
2104	opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM;
2105}
2106
2107static int
2108ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt,
2109    struct ucred *cred, int uproto)
2110{
2111	struct ip6_pktopts *opt;
2112
2113	if (*pktopt == NULL) {
2114		*pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT,
2115		    M_WAITOK);
2116		ip6_initpktopts(*pktopt);
2117	}
2118	opt = *pktopt;
2119
2120	return (ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto));
2121}
2122
2123static int
2124ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt)
2125{
2126	void *optdata = NULL;
2127	int optdatalen = 0;
2128	struct ip6_ext *ip6e;
2129	int error = 0;
2130	struct in6_pktinfo null_pktinfo;
2131	int deftclass = 0, on;
2132	int defminmtu = IP6PO_MINMTU_MCASTONLY;
2133	int defpreftemp = IP6PO_TEMPADDR_SYSTEM;
2134
2135	switch (optname) {
2136	case IPV6_PKTINFO:
2137		if (pktopt && pktopt->ip6po_pktinfo)
2138			optdata = (void *)pktopt->ip6po_pktinfo;
2139		else {
2140			/* XXX: we don't have to do this every time... */
2141			bzero(&null_pktinfo, sizeof(null_pktinfo));
2142			optdata = (void *)&null_pktinfo;
2143		}
2144		optdatalen = sizeof(struct in6_pktinfo);
2145		break;
2146	case IPV6_TCLASS:
2147		if (pktopt && pktopt->ip6po_tclass >= 0)
2148			optdata = (void *)&pktopt->ip6po_tclass;
2149		else
2150			optdata = (void *)&deftclass;
2151		optdatalen = sizeof(int);
2152		break;
2153	case IPV6_HOPOPTS:
2154		if (pktopt && pktopt->ip6po_hbh) {
2155			optdata = (void *)pktopt->ip6po_hbh;
2156			ip6e = (struct ip6_ext *)pktopt->ip6po_hbh;
2157			optdatalen = (ip6e->ip6e_len + 1) << 3;
2158		}
2159		break;
2160	case IPV6_RTHDR:
2161		if (pktopt && pktopt->ip6po_rthdr) {
2162			optdata = (void *)pktopt->ip6po_rthdr;
2163			ip6e = (struct ip6_ext *)pktopt->ip6po_rthdr;
2164			optdatalen = (ip6e->ip6e_len + 1) << 3;
2165		}
2166		break;
2167	case IPV6_RTHDRDSTOPTS:
2168		if (pktopt && pktopt->ip6po_dest1) {
2169			optdata = (void *)pktopt->ip6po_dest1;
2170			ip6e = (struct ip6_ext *)pktopt->ip6po_dest1;
2171			optdatalen = (ip6e->ip6e_len + 1) << 3;
2172		}
2173		break;
2174	case IPV6_DSTOPTS:
2175		if (pktopt && pktopt->ip6po_dest2) {
2176			optdata = (void *)pktopt->ip6po_dest2;
2177			ip6e = (struct ip6_ext *)pktopt->ip6po_dest2;
2178			optdatalen = (ip6e->ip6e_len + 1) << 3;
2179		}
2180		break;
2181	case IPV6_NEXTHOP:
2182		if (pktopt && pktopt->ip6po_nexthop) {
2183			optdata = (void *)pktopt->ip6po_nexthop;
2184			optdatalen = pktopt->ip6po_nexthop->sa_len;
2185		}
2186		break;
2187	case IPV6_USE_MIN_MTU:
2188		if (pktopt)
2189			optdata = (void *)&pktopt->ip6po_minmtu;
2190		else
2191			optdata = (void *)&defminmtu;
2192		optdatalen = sizeof(int);
2193		break;
2194	case IPV6_DONTFRAG:
2195		if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG))
2196			on = 1;
2197		else
2198			on = 0;
2199		optdata = (void *)&on;
2200		optdatalen = sizeof(on);
2201		break;
2202	case IPV6_PREFER_TEMPADDR:
2203		if (pktopt)
2204			optdata = (void *)&pktopt->ip6po_prefer_tempaddr;
2205		else
2206			optdata = (void *)&defpreftemp;
2207		optdatalen = sizeof(int);
2208		break;
2209	default:		/* should not happen */
2210#ifdef DIAGNOSTIC
2211		panic("ip6_getpcbopt: unexpected option\n");
2212#endif
2213		return (ENOPROTOOPT);
2214	}
2215
2216	error = sooptcopyout(sopt, optdata, optdatalen);
2217
2218	return (error);
2219}
2220
2221void
2222ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname)
2223{
2224	if (pktopt == NULL)
2225		return;
2226
2227	if (optname == -1 || optname == IPV6_PKTINFO) {
2228		if (pktopt->ip6po_pktinfo)
2229			free(pktopt->ip6po_pktinfo, M_IP6OPT);
2230		pktopt->ip6po_pktinfo = NULL;
2231	}
2232	if (optname == -1 || optname == IPV6_HOPLIMIT)
2233		pktopt->ip6po_hlim = -1;
2234	if (optname == -1 || optname == IPV6_TCLASS)
2235		pktopt->ip6po_tclass = -1;
2236	if (optname == -1 || optname == IPV6_NEXTHOP) {
2237		if (pktopt->ip6po_nextroute.ro_rt) {
2238			RTFREE(pktopt->ip6po_nextroute.ro_rt);
2239			pktopt->ip6po_nextroute.ro_rt = NULL;
2240		}
2241		if (pktopt->ip6po_nexthop)
2242			free(pktopt->ip6po_nexthop, M_IP6OPT);
2243		pktopt->ip6po_nexthop = NULL;
2244	}
2245	if (optname == -1 || optname == IPV6_HOPOPTS) {
2246		if (pktopt->ip6po_hbh)
2247			free(pktopt->ip6po_hbh, M_IP6OPT);
2248		pktopt->ip6po_hbh = NULL;
2249	}
2250	if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) {
2251		if (pktopt->ip6po_dest1)
2252			free(pktopt->ip6po_dest1, M_IP6OPT);
2253		pktopt->ip6po_dest1 = NULL;
2254	}
2255	if (optname == -1 || optname == IPV6_RTHDR) {
2256		if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr)
2257			free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT);
2258		pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL;
2259		if (pktopt->ip6po_route.ro_rt) {
2260			RTFREE(pktopt->ip6po_route.ro_rt);
2261			pktopt->ip6po_route.ro_rt = NULL;
2262		}
2263	}
2264	if (optname == -1 || optname == IPV6_DSTOPTS) {
2265		if (pktopt->ip6po_dest2)
2266			free(pktopt->ip6po_dest2, M_IP6OPT);
2267		pktopt->ip6po_dest2 = NULL;
2268	}
2269}
2270
2271#define PKTOPT_EXTHDRCPY(type) \
2272do {\
2273	if (src->type) {\
2274		int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\
2275		dst->type = malloc(hlen, M_IP6OPT, canwait);\
2276		if (dst->type == NULL && canwait == M_NOWAIT)\
2277			goto bad;\
2278		bcopy(src->type, dst->type, hlen);\
2279	}\
2280} while (/*CONSTCOND*/ 0)
2281
2282static int
2283copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait)
2284{
2285	if (dst == NULL || src == NULL)  {
2286		printf("ip6_clearpktopts: invalid argument\n");
2287		return (EINVAL);
2288	}
2289
2290	dst->ip6po_hlim = src->ip6po_hlim;
2291	dst->ip6po_tclass = src->ip6po_tclass;
2292	dst->ip6po_flags = src->ip6po_flags;
2293	dst->ip6po_minmtu = src->ip6po_minmtu;
2294	dst->ip6po_prefer_tempaddr = src->ip6po_prefer_tempaddr;
2295	if (src->ip6po_pktinfo) {
2296		dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo),
2297		    M_IP6OPT, canwait);
2298		if (dst->ip6po_pktinfo == NULL)
2299			goto bad;
2300		*dst->ip6po_pktinfo = *src->ip6po_pktinfo;
2301	}
2302	if (src->ip6po_nexthop) {
2303		dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len,
2304		    M_IP6OPT, canwait);
2305		if (dst->ip6po_nexthop == NULL)
2306			goto bad;
2307		bcopy(src->ip6po_nexthop, dst->ip6po_nexthop,
2308		    src->ip6po_nexthop->sa_len);
2309	}
2310	PKTOPT_EXTHDRCPY(ip6po_hbh);
2311	PKTOPT_EXTHDRCPY(ip6po_dest1);
2312	PKTOPT_EXTHDRCPY(ip6po_dest2);
2313	PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route */
2314	return (0);
2315
2316  bad:
2317	ip6_clearpktopts(dst, -1);
2318	return (ENOBUFS);
2319}
2320#undef PKTOPT_EXTHDRCPY
2321
2322struct ip6_pktopts *
2323ip6_copypktopts(struct ip6_pktopts *src, int canwait)
2324{
2325	int error;
2326	struct ip6_pktopts *dst;
2327
2328	dst = malloc(sizeof(*dst), M_IP6OPT, canwait);
2329	if (dst == NULL)
2330		return (NULL);
2331	ip6_initpktopts(dst);
2332
2333	if ((error = copypktopts(dst, src, canwait)) != 0) {
2334		free(dst, M_IP6OPT);
2335		return (NULL);
2336	}
2337
2338	return (dst);
2339}
2340
2341void
2342ip6_freepcbopts(struct ip6_pktopts *pktopt)
2343{
2344	if (pktopt == NULL)
2345		return;
2346
2347	ip6_clearpktopts(pktopt, -1);
2348
2349	free(pktopt, M_IP6OPT);
2350}
2351
2352/*
2353 * Set IPv6 outgoing packet options based on advanced API.
2354 */
2355int
2356ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt,
2357    struct ip6_pktopts *stickyopt, struct ucred *cred, int uproto)
2358{
2359	struct cmsghdr *cm = 0;
2360
2361	if (control == NULL || opt == NULL)
2362		return (EINVAL);
2363
2364	ip6_initpktopts(opt);
2365	if (stickyopt) {
2366		int error;
2367
2368		/*
2369		 * If stickyopt is provided, make a local copy of the options
2370		 * for this particular packet, then override them by ancillary
2371		 * objects.
2372		 * XXX: copypktopts() does not copy the cached route to a next
2373		 * hop (if any).  This is not very good in terms of efficiency,
2374		 * but we can allow this since this option should be rarely
2375		 * used.
2376		 */
2377		if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0)
2378			return (error);
2379	}
2380
2381	/*
2382	 * XXX: Currently, we assume all the optional information is stored
2383	 * in a single mbuf.
2384	 */
2385	if (control->m_next)
2386		return (EINVAL);
2387
2388	for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len),
2389	    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
2390		int error;
2391
2392		if (control->m_len < CMSG_LEN(0))
2393			return (EINVAL);
2394
2395		cm = mtod(control, struct cmsghdr *);
2396		if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len)
2397			return (EINVAL);
2398		if (cm->cmsg_level != IPPROTO_IPV6)
2399			continue;
2400
2401		error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm),
2402		    cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto);
2403		if (error)
2404			return (error);
2405	}
2406
2407	return (0);
2408}
2409
2410/*
2411 * Set a particular packet option, as a sticky option or an ancillary data
2412 * item.  "len" can be 0 only when it's a sticky option.
2413 * We have 4 cases of combination of "sticky" and "cmsg":
2414 * "sticky=0, cmsg=0": impossible
2415 * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data
2416 * "sticky=1, cmsg=0": RFC3542 socket option
2417 * "sticky=1, cmsg=1": RFC2292 socket option
2418 */
2419static int
2420ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
2421    struct ucred *cred, int sticky, int cmsg, int uproto)
2422{
2423	int minmtupolicy, preftemp;
2424	int error;
2425
2426	if (!sticky && !cmsg) {
2427#ifdef DIAGNOSTIC
2428		printf("ip6_setpktopt: impossible case\n");
2429#endif
2430		return (EINVAL);
2431	}
2432
2433	/*
2434	 * IPV6_2292xxx is for backward compatibility to RFC2292, and should
2435	 * not be specified in the context of RFC3542.  Conversely,
2436	 * RFC3542 types should not be specified in the context of RFC2292.
2437	 */
2438	if (!cmsg) {
2439		switch (optname) {
2440		case IPV6_2292PKTINFO:
2441		case IPV6_2292HOPLIMIT:
2442		case IPV6_2292NEXTHOP:
2443		case IPV6_2292HOPOPTS:
2444		case IPV6_2292DSTOPTS:
2445		case IPV6_2292RTHDR:
2446		case IPV6_2292PKTOPTIONS:
2447			return (ENOPROTOOPT);
2448		}
2449	}
2450	if (sticky && cmsg) {
2451		switch (optname) {
2452		case IPV6_PKTINFO:
2453		case IPV6_HOPLIMIT:
2454		case IPV6_NEXTHOP:
2455		case IPV6_HOPOPTS:
2456		case IPV6_DSTOPTS:
2457		case IPV6_RTHDRDSTOPTS:
2458		case IPV6_RTHDR:
2459		case IPV6_USE_MIN_MTU:
2460		case IPV6_DONTFRAG:
2461		case IPV6_TCLASS:
2462		case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */
2463			return (ENOPROTOOPT);
2464		}
2465	}
2466
2467	switch (optname) {
2468	case IPV6_2292PKTINFO:
2469	case IPV6_PKTINFO:
2470	{
2471		struct ifnet *ifp = NULL;
2472		struct in6_pktinfo *pktinfo;
2473
2474		if (len != sizeof(struct in6_pktinfo))
2475			return (EINVAL);
2476
2477		pktinfo = (struct in6_pktinfo *)buf;
2478
2479		/*
2480		 * An application can clear any sticky IPV6_PKTINFO option by
2481		 * doing a "regular" setsockopt with ipi6_addr being
2482		 * in6addr_any and ipi6_ifindex being zero.
2483		 * [RFC 3542, Section 6]
2484		 */
2485		if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo &&
2486		    pktinfo->ipi6_ifindex == 0 &&
2487		    IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
2488			ip6_clearpktopts(opt, optname);
2489			break;
2490		}
2491
2492		if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO &&
2493		    sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
2494			return (EINVAL);
2495		}
2496
2497		/* validate the interface index if specified. */
2498		if (pktinfo->ipi6_ifindex > V_if_index ||
2499		    pktinfo->ipi6_ifindex < 0) {
2500			 return (ENXIO);
2501		}
2502		if (pktinfo->ipi6_ifindex) {
2503			ifp = ifnet_byindex(pktinfo->ipi6_ifindex);
2504			if (ifp == NULL)
2505				return (ENXIO);
2506		}
2507
2508		/*
2509		 * We store the address anyway, and let in6_selectsrc()
2510		 * validate the specified address.  This is because ipi6_addr
2511		 * may not have enough information about its scope zone, and
2512		 * we may need additional information (such as outgoing
2513		 * interface or the scope zone of a destination address) to
2514		 * disambiguate the scope.
2515		 * XXX: the delay of the validation may confuse the
2516		 * application when it is used as a sticky option.
2517		 */
2518		if (opt->ip6po_pktinfo == NULL) {
2519			opt->ip6po_pktinfo = malloc(sizeof(*pktinfo),
2520			    M_IP6OPT, M_NOWAIT);
2521			if (opt->ip6po_pktinfo == NULL)
2522				return (ENOBUFS);
2523		}
2524		bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo));
2525		break;
2526	}
2527
2528	case IPV6_2292HOPLIMIT:
2529	case IPV6_HOPLIMIT:
2530	{
2531		int *hlimp;
2532
2533		/*
2534		 * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT
2535		 * to simplify the ordering among hoplimit options.
2536		 */
2537		if (optname == IPV6_HOPLIMIT && sticky)
2538			return (ENOPROTOOPT);
2539
2540		if (len != sizeof(int))
2541			return (EINVAL);
2542		hlimp = (int *)buf;
2543		if (*hlimp < -1 || *hlimp > 255)
2544			return (EINVAL);
2545
2546		opt->ip6po_hlim = *hlimp;
2547		break;
2548	}
2549
2550	case IPV6_TCLASS:
2551	{
2552		int tclass;
2553
2554		if (len != sizeof(int))
2555			return (EINVAL);
2556		tclass = *(int *)buf;
2557		if (tclass < -1 || tclass > 255)
2558			return (EINVAL);
2559
2560		opt->ip6po_tclass = tclass;
2561		break;
2562	}
2563
2564	case IPV6_2292NEXTHOP:
2565	case IPV6_NEXTHOP:
2566		if (cred != NULL) {
2567			error = priv_check_cred(cred,
2568			    PRIV_NETINET_SETHDROPTS, 0);
2569			if (error)
2570				return (error);
2571		}
2572
2573		if (len == 0) {	/* just remove the option */
2574			ip6_clearpktopts(opt, IPV6_NEXTHOP);
2575			break;
2576		}
2577
2578		/* check if cmsg_len is large enough for sa_len */
2579		if (len < sizeof(struct sockaddr) || len < *buf)
2580			return (EINVAL);
2581
2582		switch (((struct sockaddr *)buf)->sa_family) {
2583		case AF_INET6:
2584		{
2585			struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf;
2586			int error;
2587
2588			if (sa6->sin6_len != sizeof(struct sockaddr_in6))
2589				return (EINVAL);
2590
2591			if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) ||
2592			    IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) {
2593				return (EINVAL);
2594			}
2595			if ((error = sa6_embedscope(sa6, V_ip6_use_defzone))
2596			    != 0) {
2597				return (error);
2598			}
2599			break;
2600		}
2601		case AF_LINK:	/* should eventually be supported */
2602		default:
2603			return (EAFNOSUPPORT);
2604		}
2605
2606		/* turn off the previous option, then set the new option. */
2607		ip6_clearpktopts(opt, IPV6_NEXTHOP);
2608		opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT);
2609		if (opt->ip6po_nexthop == NULL)
2610			return (ENOBUFS);
2611		bcopy(buf, opt->ip6po_nexthop, *buf);
2612		break;
2613
2614	case IPV6_2292HOPOPTS:
2615	case IPV6_HOPOPTS:
2616	{
2617		struct ip6_hbh *hbh;
2618		int hbhlen;
2619
2620		/*
2621		 * XXX: We don't allow a non-privileged user to set ANY HbH
2622		 * options, since per-option restriction has too much
2623		 * overhead.
2624		 */
2625		if (cred != NULL) {
2626			error = priv_check_cred(cred,
2627			    PRIV_NETINET_SETHDROPTS, 0);
2628			if (error)
2629				return (error);
2630		}
2631
2632		if (len == 0) {
2633			ip6_clearpktopts(opt, IPV6_HOPOPTS);
2634			break;	/* just remove the option */
2635		}
2636
2637		/* message length validation */
2638		if (len < sizeof(struct ip6_hbh))
2639			return (EINVAL);
2640		hbh = (struct ip6_hbh *)buf;
2641		hbhlen = (hbh->ip6h_len + 1) << 3;
2642		if (len != hbhlen)
2643			return (EINVAL);
2644
2645		/* turn off the previous option, then set the new option. */
2646		ip6_clearpktopts(opt, IPV6_HOPOPTS);
2647		opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT);
2648		if (opt->ip6po_hbh == NULL)
2649			return (ENOBUFS);
2650		bcopy(hbh, opt->ip6po_hbh, hbhlen);
2651
2652		break;
2653	}
2654
2655	case IPV6_2292DSTOPTS:
2656	case IPV6_DSTOPTS:
2657	case IPV6_RTHDRDSTOPTS:
2658	{
2659		struct ip6_dest *dest, **newdest = NULL;
2660		int destlen;
2661
2662		if (cred != NULL) { /* XXX: see the comment for IPV6_HOPOPTS */
2663			error = priv_check_cred(cred,
2664			    PRIV_NETINET_SETHDROPTS, 0);
2665			if (error)
2666				return (error);
2667		}
2668
2669		if (len == 0) {
2670			ip6_clearpktopts(opt, optname);
2671			break;	/* just remove the option */
2672		}
2673
2674		/* message length validation */
2675		if (len < sizeof(struct ip6_dest))
2676			return (EINVAL);
2677		dest = (struct ip6_dest *)buf;
2678		destlen = (dest->ip6d_len + 1) << 3;
2679		if (len != destlen)
2680			return (EINVAL);
2681
2682		/*
2683		 * Determine the position that the destination options header
2684		 * should be inserted; before or after the routing header.
2685		 */
2686		switch (optname) {
2687		case IPV6_2292DSTOPTS:
2688			/*
2689			 * The old advacned API is ambiguous on this point.
2690			 * Our approach is to determine the position based
2691			 * according to the existence of a routing header.
2692			 * Note, however, that this depends on the order of the
2693			 * extension headers in the ancillary data; the 1st
2694			 * part of the destination options header must appear
2695			 * before the routing header in the ancillary data,
2696			 * too.
2697			 * RFC3542 solved the ambiguity by introducing
2698			 * separate ancillary data or option types.
2699			 */
2700			if (opt->ip6po_rthdr == NULL)
2701				newdest = &opt->ip6po_dest1;
2702			else
2703				newdest = &opt->ip6po_dest2;
2704			break;
2705		case IPV6_RTHDRDSTOPTS:
2706			newdest = &opt->ip6po_dest1;
2707			break;
2708		case IPV6_DSTOPTS:
2709			newdest = &opt->ip6po_dest2;
2710			break;
2711		}
2712
2713		/* turn off the previous option, then set the new option. */
2714		ip6_clearpktopts(opt, optname);
2715		*newdest = malloc(destlen, M_IP6OPT, M_NOWAIT);
2716		if (*newdest == NULL)
2717			return (ENOBUFS);
2718		bcopy(dest, *newdest, destlen);
2719
2720		break;
2721	}
2722
2723	case IPV6_2292RTHDR:
2724	case IPV6_RTHDR:
2725	{
2726		struct ip6_rthdr *rth;
2727		int rthlen;
2728
2729		if (len == 0) {
2730			ip6_clearpktopts(opt, IPV6_RTHDR);
2731			break;	/* just remove the option */
2732		}
2733
2734		/* message length validation */
2735		if (len < sizeof(struct ip6_rthdr))
2736			return (EINVAL);
2737		rth = (struct ip6_rthdr *)buf;
2738		rthlen = (rth->ip6r_len + 1) << 3;
2739		if (len != rthlen)
2740			return (EINVAL);
2741
2742		switch (rth->ip6r_type) {
2743		case IPV6_RTHDR_TYPE_0:
2744			if (rth->ip6r_len == 0)	/* must contain one addr */
2745				return (EINVAL);
2746			if (rth->ip6r_len % 2) /* length must be even */
2747				return (EINVAL);
2748			if (rth->ip6r_len / 2 != rth->ip6r_segleft)
2749				return (EINVAL);
2750			break;
2751		default:
2752			return (EINVAL);	/* not supported */
2753		}
2754
2755		/* turn off the previous option */
2756		ip6_clearpktopts(opt, IPV6_RTHDR);
2757		opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT);
2758		if (opt->ip6po_rthdr == NULL)
2759			return (ENOBUFS);
2760		bcopy(rth, opt->ip6po_rthdr, rthlen);
2761
2762		break;
2763	}
2764
2765	case IPV6_USE_MIN_MTU:
2766		if (len != sizeof(int))
2767			return (EINVAL);
2768		minmtupolicy = *(int *)buf;
2769		if (minmtupolicy != IP6PO_MINMTU_MCASTONLY &&
2770		    minmtupolicy != IP6PO_MINMTU_DISABLE &&
2771		    minmtupolicy != IP6PO_MINMTU_ALL) {
2772			return (EINVAL);
2773		}
2774		opt->ip6po_minmtu = minmtupolicy;
2775		break;
2776
2777	case IPV6_DONTFRAG:
2778		if (len != sizeof(int))
2779			return (EINVAL);
2780
2781		if (uproto == IPPROTO_TCP || *(int *)buf == 0) {
2782			/*
2783			 * we ignore this option for TCP sockets.
2784			 * (RFC3542 leaves this case unspecified.)
2785			 */
2786			opt->ip6po_flags &= ~IP6PO_DONTFRAG;
2787		} else
2788			opt->ip6po_flags |= IP6PO_DONTFRAG;
2789		break;
2790
2791	case IPV6_PREFER_TEMPADDR:
2792		if (len != sizeof(int))
2793			return (EINVAL);
2794		preftemp = *(int *)buf;
2795		if (preftemp != IP6PO_TEMPADDR_SYSTEM &&
2796		    preftemp != IP6PO_TEMPADDR_NOTPREFER &&
2797		    preftemp != IP6PO_TEMPADDR_PREFER) {
2798			return (EINVAL);
2799		}
2800		opt->ip6po_prefer_tempaddr = preftemp;
2801		break;
2802
2803	default:
2804		return (ENOPROTOOPT);
2805	} /* end of switch */
2806
2807	return (0);
2808}
2809
2810/*
2811 * Routine called from ip6_output() to loop back a copy of an IP6 multicast
2812 * packet to the input queue of a specified interface.  Note that this
2813 * calls the output routine of the loopback "driver", but with an interface
2814 * pointer that might NOT be &loif -- easier than replicating that code here.
2815 */
2816void
2817ip6_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in6 *dst)
2818{
2819	struct mbuf *copym;
2820	struct ip6_hdr *ip6;
2821
2822	copym = m_copy(m, 0, M_COPYALL);
2823	if (copym == NULL)
2824		return;
2825
2826	/*
2827	 * Make sure to deep-copy IPv6 header portion in case the data
2828	 * is in an mbuf cluster, so that we can safely override the IPv6
2829	 * header portion later.
2830	 */
2831	if ((copym->m_flags & M_EXT) != 0 ||
2832	    copym->m_len < sizeof(struct ip6_hdr)) {
2833		copym = m_pullup(copym, sizeof(struct ip6_hdr));
2834		if (copym == NULL)
2835			return;
2836	}
2837	ip6 = mtod(copym, struct ip6_hdr *);
2838	/*
2839	 * clear embedded scope identifiers if necessary.
2840	 * in6_clearscope will touch the addresses only when necessary.
2841	 */
2842	in6_clearscope(&ip6->ip6_src);
2843	in6_clearscope(&ip6->ip6_dst);
2844	if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
2845		copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 |
2846		    CSUM_PSEUDO_HDR;
2847		copym->m_pkthdr.csum_data = 0xffff;
2848	}
2849	(void)if_simloop(ifp, copym, dst->sin6_family, 0);
2850}
2851
2852/*
2853 * Chop IPv6 header off from the payload.
2854 */
2855static int
2856ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs)
2857{
2858	struct mbuf *mh;
2859	struct ip6_hdr *ip6;
2860
2861	ip6 = mtod(m, struct ip6_hdr *);
2862	if (m->m_len > sizeof(*ip6)) {
2863		mh = m_gethdr(M_NOWAIT, MT_DATA);
2864		if (mh == NULL) {
2865			m_freem(m);
2866			return ENOBUFS;
2867		}
2868		m_move_pkthdr(mh, m);
2869		MH_ALIGN(mh, sizeof(*ip6));
2870		m->m_len -= sizeof(*ip6);
2871		m->m_data += sizeof(*ip6);
2872		mh->m_next = m;
2873		m = mh;
2874		m->m_len = sizeof(*ip6);
2875		bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6));
2876	}
2877	exthdrs->ip6e_ip6 = m;
2878	return 0;
2879}
2880
2881/*
2882 * Compute IPv6 extension header length.
2883 */
2884int
2885ip6_optlen(struct inpcb *in6p)
2886{
2887	int len;
2888
2889	if (!in6p->in6p_outputopts)
2890		return 0;
2891
2892	len = 0;
2893#define elen(x) \
2894    (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0)
2895
2896	len += elen(in6p->in6p_outputopts->ip6po_hbh);
2897	if (in6p->in6p_outputopts->ip6po_rthdr)
2898		/* dest1 is valid with rthdr only */
2899		len += elen(in6p->in6p_outputopts->ip6po_dest1);
2900	len += elen(in6p->in6p_outputopts->ip6po_rthdr);
2901	len += elen(in6p->in6p_outputopts->ip6po_dest2);
2902	return len;
2903#undef elen
2904}
2905