1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the project nor the names of its contributors
16 *    may be used to endorse or promote products derived from this software
17 *    without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 *
31 *	$KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $
32 */
33
34/*-
35 * Copyright (c) 1982, 1986, 1988, 1990, 1993
36 *	The Regents of the University of California.  All rights reserved.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 *    notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 *    notice, this list of conditions and the following disclaimer in the
45 *    documentation and/or other materials provided with the distribution.
46 * 3. Neither the name of the University nor the names of its contributors
47 *    may be used to endorse or promote products derived from this software
48 *    without specific prior written permission.
49 *
50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60 * SUCH DAMAGE.
61 */
62
63#include <sys/cdefs.h>
64#include "opt_inet.h"
65#include "opt_inet6.h"
66#include "opt_ipsec.h"
67#include "opt_kern_tls.h"
68#include "opt_ratelimit.h"
69#include "opt_route.h"
70#include "opt_rss.h"
71#include "opt_sctp.h"
72
73#include <sys/param.h>
74#include <sys/kernel.h>
75#include <sys/ktls.h>
76#include <sys/malloc.h>
77#include <sys/mbuf.h>
78#include <sys/errno.h>
79#include <sys/priv.h>
80#include <sys/proc.h>
81#include <sys/protosw.h>
82#include <sys/socket.h>
83#include <sys/socketvar.h>
84#include <sys/syslog.h>
85#include <sys/ucred.h>
86
87#include <machine/in_cksum.h>
88
89#include <net/if.h>
90#include <net/if_var.h>
91#include <net/if_private.h>
92#include <net/if_vlan_var.h>
93#include <net/if_llatbl.h>
94#include <net/ethernet.h>
95#include <net/netisr.h>
96#include <net/route.h>
97#include <net/route/nhop.h>
98#include <net/pfil.h>
99#include <net/rss_config.h>
100#include <net/vnet.h>
101
102#include <netinet/in.h>
103#include <netinet/in_var.h>
104#include <netinet/ip_var.h>
105#include <netinet6/in6_fib.h>
106#include <netinet6/in6_var.h>
107#include <netinet/ip6.h>
108#include <netinet/icmp6.h>
109#include <netinet6/ip6_var.h>
110#include <netinet/in_pcb.h>
111#include <netinet/tcp_var.h>
112#include <netinet6/nd6.h>
113#include <netinet6/in6_rss.h>
114
115#include <netipsec/ipsec_support.h>
116#if defined(SCTP) || defined(SCTP_SUPPORT)
117#include <netinet/sctp.h>
118#include <netinet/sctp_crc32.h>
119#endif
120
121#include <netinet6/scope6_var.h>
122
/*
 * Defined outside this file.  ip6_output() consults it to decide whether a
 * multicast datagram is looped back when the caller passes no multicast
 * options (im6o == NULL).
 */
extern int in6_mcast_loop;
124
/*
 * The mbufs that make up the header chain of an outgoing packet while it is
 * being assembled.  ip6_output() zeroes the whole structure and then fills
 * in only the members it needs, so an unused member is NULL.
 */
struct ip6_exthdrs {
	struct mbuf *ip6e_ip6;		/* IPv6 header; read back after ip6_splithdr() */
	struct mbuf *ip6e_hbh;		/* Hop-by-Hop options header */
	struct mbuf *ip6e_dest1;	/* Destination options header (1st part) */
	struct mbuf *ip6e_rthdr;	/* Routing header */
	struct mbuf *ip6e_dest2;	/* Destination options header (2nd part) */
};
132
/* malloc(9) type M_IP6OPT, short name "ip6opt", described as "IPv6 options". */
static MALLOC_DEFINE(M_IP6OPT, "ip6opt", "IPv6 options");

/*
 * Forward declarations of helpers that are private to this file.
 */

/* Helpers that operate on struct ip6_pktopts / struct sockopt. */
static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **,
			   struct ucred *, int);
static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *,
	struct socket *, struct sockopt *);
static int ip6_getpcbopt(struct inpcb *, int, struct sockopt *);
static int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *,
	struct ucred *, int, int, int);

/* Helpers that build or rearrange headers held in mbufs. */
static int ip6_copyexthdr(struct mbuf **, caddr_t, int);
static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int,
	struct ip6_frag **);
static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *);

/* MTU helpers; ip6_getpmtu() is what ip6_output() uses to pick the MTU. */
static int ip6_getpmtu(struct route_in6 *, int,
	struct ifnet *, const struct in6_addr *, u_long *, int *, u_int,
	u_int);
static int ip6_calcmtu(struct ifnet *, const struct in6_addr *, u_long,
	u_long *, int *, u_int);
static int ip6_getpmtu_ctl(u_int, const struct in6_addr *, u_long *);

/* Copies one struct ip6_pktopts into another. */
static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int);
155
/*
 * Make an extension header from option data.
 *
 *   hp  - source: pointer to the option data, read as a struct ip6_ext.
 *   mp  - destination: address of the mbuf pointer that is handed to
 *         ip6_copyexthdr().
 *   _ol - running option length; on success the m_len of the mbuf now
 *         found in *mp is added to it.
 *
 * The number of bytes to copy is taken from the header's own length field
 * and computed as (ip6e_len + 1) << 3.
 *
 * The macro is not self-contained: it assigns to a variable named "error"
 * and, when ip6_copyexthdr() returns non-zero, jumps to a label named
 * "freehdrs".  Both must exist in the function that uses it.
 */
#define	MAKE_EXTHDR(hp, mp, _ol)					\
    do {								\
	struct ip6_ext *eh = (struct ip6_ext *)(hp);		\
	error = ip6_copyexthdr((mp), (caddr_t)(hp),		\
	    ((eh)->ip6e_len + 1) << 3);				\
	if (error)						\
		goto freehdrs;					\
	(_ol) += (*(mp))->m_len;				\
    } while (/*CONSTCOND*/ 0)
169
/*
 * Form a chain of extension headers.
 *
 *   m  - the extension header mbuf to insert; the macro does nothing when
 *        it is NULL.
 *   mp - the previous mbuf in the chain.  m is linked in directly after it
 *        and mp is then advanced to m, so mp must be a modifiable lvalue.
 *   p  - pointer to the "next header" byte of the preceding header.  The
 *        value it points to is copied into the first byte of m, *p is
 *        overwritten with i, and p is re-pointed at the first byte of m.
 *        p is assigned to, so it must be a modifiable lvalue as well.
 *   i  - the protocol number that identifies the header carried by m
 *        (e.g. IPPROTO_HOPOPTS).
 *
 * The macro uses the variables "hdrsplit" and "exthdrs" of the calling
 * function and panics if hdrsplit is false while m is non-NULL.
 */
#define MAKE_CHAIN(m, mp, p, i)\
    do {\
	if (m) {\
		if (!hdrsplit) \
			panic("%s:%d: assumption failed: "\
			    "hdr not split: hdrsplit %d exthdrs %p",\
			    __func__, __LINE__, hdrsplit, &exthdrs);\
		*mtod((m), u_char *) = *(p);\
		*(p) = (i);\
		p = mtod((m), u_char *);\
		(m)->m_next = (mp)->m_next;\
		(mp)->m_next = (m);\
		(mp) = (m);\
	}\
    } while (/*CONSTCOND*/ 0)
192
193void
194in6_delayed_cksum(struct mbuf *m, uint32_t plen, u_short offset)
195{
196	u_short csum;
197
198	csum = in_cksum_skip(m, offset + plen, offset);
199	if (m->m_pkthdr.csum_flags & CSUM_UDP_IPV6 && csum == 0)
200		csum = 0xffff;
201	offset += m->m_pkthdr.csum_data;	/* checksum offset */
202
203	if (offset + sizeof(csum) > m->m_len)
204		m_copyback(m, offset, sizeof(csum), (caddr_t)&csum);
205	else
206		*(u_short *)mtodo(m, offset) = csum;
207}
208
209static void
210ip6_output_delayed_csum(struct mbuf *m, struct ifnet *ifp, int csum_flags,
211    int plen, int optlen)
212{
213
214	KASSERT((plen >= optlen), ("%s:%d: plen %d < optlen %d, m %p, ifp %p "
215	    "csum_flags %#x",
216	    __func__, __LINE__, plen, optlen, m, ifp, csum_flags));
217
218	if (csum_flags & CSUM_DELAY_DATA_IPV6) {
219		in6_delayed_cksum(m, plen - optlen,
220		    sizeof(struct ip6_hdr) + optlen);
221		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
222	}
223#if defined(SCTP) || defined(SCTP_SUPPORT)
224	if (csum_flags & CSUM_SCTP_IPV6) {
225		sctp_delayed_cksum(m, sizeof(struct ip6_hdr) + optlen);
226		m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6;
227	}
228#endif
229}
230
231int
232ip6_fragment(struct ifnet *ifp, struct mbuf *m0, int hlen, u_char nextproto,
233    int fraglen , uint32_t id)
234{
235	struct mbuf *m, **mnext, *m_frgpart;
236	struct ip6_hdr *ip6, *mhip6;
237	struct ip6_frag *ip6f;
238	int off;
239	int error;
240	int tlen = m0->m_pkthdr.len;
241
242	KASSERT((fraglen % 8 == 0), ("Fragment length must be a multiple of 8"));
243
244	m = m0;
245	ip6 = mtod(m, struct ip6_hdr *);
246	mnext = &m->m_nextpkt;
247
248	for (off = hlen; off < tlen; off += fraglen) {
249		m = m_gethdr(M_NOWAIT, MT_DATA);
250		if (!m) {
251			IP6STAT_INC(ip6s_odropped);
252			return (ENOBUFS);
253		}
254
255		/*
256		 * Make sure the complete packet header gets copied
257		 * from the originating mbuf to the newly created
258		 * mbuf. This also ensures that existing firewall
259		 * classification(s), VLAN tags and so on get copied
260		 * to the resulting fragmented packet(s):
261		 */
262		if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) {
263			m_free(m);
264			IP6STAT_INC(ip6s_odropped);
265			return (ENOBUFS);
266		}
267
268		*mnext = m;
269		mnext = &m->m_nextpkt;
270		m->m_data += max_linkhdr;
271		mhip6 = mtod(m, struct ip6_hdr *);
272		*mhip6 = *ip6;
273		m->m_len = sizeof(*mhip6);
274		error = ip6_insertfraghdr(m0, m, hlen, &ip6f);
275		if (error) {
276			IP6STAT_INC(ip6s_odropped);
277			return (error);
278		}
279		ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7));
280		if (off + fraglen >= tlen)
281			fraglen = tlen - off;
282		else
283			ip6f->ip6f_offlg |= IP6F_MORE_FRAG;
284		mhip6->ip6_plen = htons((u_short)(fraglen + hlen +
285		    sizeof(*ip6f) - sizeof(struct ip6_hdr)));
286		if ((m_frgpart = m_copym(m0, off, fraglen, M_NOWAIT)) == NULL) {
287			IP6STAT_INC(ip6s_odropped);
288			return (ENOBUFS);
289		}
290		m_cat(m, m_frgpart);
291		m->m_pkthdr.len = fraglen + hlen + sizeof(*ip6f);
292		ip6f->ip6f_reserved = 0;
293		ip6f->ip6f_ident = id;
294		ip6f->ip6f_nxt = nextproto;
295		IP6STAT_INC(ip6s_ofragments);
296		in6_ifstat_inc(ifp, ifs6_out_fragcreat);
297	}
298
299	return (0);
300}
301
302static int
303ip6_output_send(struct inpcb *inp, struct ifnet *ifp, struct ifnet *origifp,
304    struct mbuf *m, struct sockaddr_in6 *dst, struct route_in6 *ro,
305    bool stamp_tag)
306{
307#ifdef KERN_TLS
308	struct ktls_session *tls = NULL;
309#endif
310	struct m_snd_tag *mst;
311	int error;
312
313	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
314	mst = NULL;
315
316#ifdef KERN_TLS
317	/*
318	 * If this is an unencrypted TLS record, save a reference to
319	 * the record.  This local reference is used to call
320	 * ktls_output_eagain after the mbuf has been freed (thus
321	 * dropping the mbuf's reference) in if_output.
322	 */
323	if (m->m_next != NULL && mbuf_has_tls_session(m->m_next)) {
324		tls = ktls_hold(m->m_next->m_epg_tls);
325		mst = tls->snd_tag;
326
327		/*
328		 * If a TLS session doesn't have a valid tag, it must
329		 * have had an earlier ifp mismatch, so drop this
330		 * packet.
331		 */
332		if (mst == NULL) {
333			m_freem(m);
334			error = EAGAIN;
335			goto done;
336		}
337		/*
338		 * Always stamp tags that include NIC ktls.
339		 */
340		stamp_tag = true;
341	}
342#endif
343#ifdef RATELIMIT
344	if (inp != NULL && mst == NULL) {
345		if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 ||
346		    (inp->inp_snd_tag != NULL &&
347		    inp->inp_snd_tag->ifp != ifp))
348			in_pcboutput_txrtlmt(inp, ifp, m);
349
350		if (inp->inp_snd_tag != NULL)
351			mst = inp->inp_snd_tag;
352	}
353#endif
354	if (stamp_tag && mst != NULL) {
355		KASSERT(m->m_pkthdr.rcvif == NULL,
356		    ("trying to add a send tag to a forwarded packet"));
357		if (mst->ifp != ifp) {
358			m_freem(m);
359			error = EAGAIN;
360			goto done;
361		}
362
363		/* stamp send tag on mbuf */
364		m->m_pkthdr.snd_tag = m_snd_tag_ref(mst);
365		m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
366	}
367
368	error = nd6_output_ifp(ifp, origifp, m, dst, (struct route *)ro);
369
370done:
371	/* Check for route change invalidating send tags. */
372#ifdef KERN_TLS
373	if (tls != NULL) {
374		if (error == EAGAIN)
375			error = ktls_output_eagain(inp, tls);
376		ktls_free(tls);
377	}
378#endif
379#ifdef RATELIMIT
380	if (error == EAGAIN)
381		in_pcboutput_eagain(inp);
382#endif
383	return (error);
384}
385
386/*
387 * IP6 output.
388 * The packet in mbuf chain m contains a skeletal IP6 header (with pri, len,
389 * nxt, hlim, src, dst).
390 * This function may modify ver and hlim only.
391 * The mbuf chain containing the packet will be freed.
392 * The mbuf opt, if present, will not be freed.
393 * If route_in6 ro is present and has ro_nh initialized, route lookup would be
394 * skipped and ro->ro_nh would be used. If ro is present but ro->ro_nh is NULL,
395 * then result of route lookup is stored in ro->ro_nh.
396 *
397 * Type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int, and nd_ifinfo.linkmtu
398 * is uint32_t.  So we use u_long to hold largest one, which is rt_mtu.
399 *
400 * ifpp - XXX: just for statistics
401 */
402int
403ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
404    struct route_in6 *ro, int flags, struct ip6_moptions *im6o,
405    struct ifnet **ifpp, struct inpcb *inp)
406{
407	struct ip6_hdr *ip6;
408	struct ifnet *ifp, *origifp;
409	struct mbuf *m = m0;
410	struct mbuf *mprev;
411	struct route_in6 *ro_pmtu;
412	struct nhop_object *nh;
413	struct sockaddr_in6 *dst, sin6, src_sa, dst_sa;
414	struct in6_addr odst;
415	u_char *nexthdrp;
416	int tlen, len;
417	int error = 0;
418	int vlan_pcp = -1;
419	struct in6_ifaddr *ia = NULL;
420	u_long mtu;
421	int alwaysfrag, dontfrag;
422	u_int32_t optlen, plen = 0, unfragpartlen;
423	struct ip6_exthdrs exthdrs;
424	struct in6_addr src0, dst0;
425	u_int32_t zone;
426	bool hdrsplit;
427	int sw_csum, tso;
428	int needfiblookup;
429	uint32_t fibnum;
430	struct m_tag *fwd_tag = NULL;
431	uint32_t id;
432	uint32_t optvalid;
433
434	NET_EPOCH_ASSERT();
435
436	if (inp != NULL) {
437		INP_LOCK_ASSERT(inp);
438		M_SETFIB(m, inp->inp_inc.inc_fibnum);
439		if ((flags & IP_NODEFAULTFLOWID) == 0) {
440			/* Unconditionally set flowid. */
441			m->m_pkthdr.flowid = inp->inp_flowid;
442			M_HASHTYPE_SET(m, inp->inp_flowtype);
443		}
444		if ((inp->inp_flags2 & INP_2PCP_SET) != 0)
445			vlan_pcp = (inp->inp_flags2 & INP_2PCP_MASK) >>
446			    INP_2PCP_SHIFT;
447#ifdef NUMA
448		m->m_pkthdr.numa_domain = inp->inp_numa_domain;
449#endif
450	}
451
452#if defined(IPSEC) || defined(IPSEC_SUPPORT)
453	/*
454	 * IPSec checking which handles several cases.
455	 * FAST IPSEC: We re-injected the packet.
456	 * XXX: need scope argument.
457	 */
458	if (IPSEC_ENABLED(ipv6)) {
459		m = mb_unmapped_to_ext(m);
460		if (m == NULL) {
461			IP6STAT_INC(ip6s_odropped);
462			error = ENOBUFS;
463			goto bad;
464		}
465		if ((error = IPSEC_OUTPUT(ipv6, m, inp)) != 0) {
466			if (error == EINPROGRESS)
467				error = 0;
468			goto done;
469		}
470	}
471#endif /* IPSEC */
472
473	/* Source address validation. */
474	ip6 = mtod(m, struct ip6_hdr *);
475	if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) &&
476	    (flags & IPV6_UNSPECSRC) == 0) {
477		error = EOPNOTSUPP;
478		IP6STAT_INC(ip6s_badscope);
479		goto bad;
480	}
481	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
482		error = EOPNOTSUPP;
483		IP6STAT_INC(ip6s_badscope);
484		goto bad;
485	}
486
487	/*
488	 * If we are given packet options to add extension headers prepare them.
489	 * Calculate the total length of the extension header chain.
490	 * Keep the length of the unfragmentable part for fragmentation.
491	 */
492	bzero(&exthdrs, sizeof(exthdrs));
493	optlen = optvalid = 0;
494	unfragpartlen = sizeof(struct ip6_hdr);
495	if (opt) {
496		optvalid = opt->ip6po_valid;
497
498		/* Hop-by-Hop options header. */
499		if ((optvalid & IP6PO_VALID_HBH) != 0)
500			MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh, optlen);
501
502		/* Destination options header (1st part). */
503		if ((optvalid & IP6PO_VALID_RHINFO) != 0) {
504#ifndef RTHDR_SUPPORT_IMPLEMENTED
505			/*
506			 * If there is a routing header, discard the packet
507			 * right away here. RH0/1 are obsolete and we do not
508			 * currently support RH2/3/4.
509			 * People trying to use RH253/254 may want to disable
510			 * this check.
511			 * The moment we do support any routing header (again)
512			 * this block should check the routing type more
513			 * selectively.
514			 */
515			error = EINVAL;
516			goto bad;
517#endif
518
519			/*
520			 * Destination options header (1st part).
521			 * This only makes sense with a routing header.
522			 * See Section 9.2 of RFC 3542.
523			 * Disabling this part just for MIP6 convenience is
524			 * a bad idea.  We need to think carefully about a
525			 * way to make the advanced API coexist with MIP6
526			 * options, which might automatically be inserted in
527			 * the kernel.
528			 */
529			if ((optvalid & IP6PO_VALID_DEST1) != 0)
530				MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1,
531				    optlen);
532		}
533		/* Routing header. */
534		if ((optvalid & IP6PO_VALID_RHINFO) != 0)
535			MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr, optlen);
536
537		unfragpartlen += optlen;
538
539		/*
540		 * NOTE: we don't add AH/ESP length here (done in
541		 * ip6_ipsec_output()).
542		 */
543
544		/* Destination options header (2nd part). */
545		if ((optvalid & IP6PO_VALID_DEST2) != 0)
546			MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2, optlen);
547	}
548
549	/*
550	 * If there is at least one extension header,
551	 * separate IP6 header from the payload.
552	 */
553	hdrsplit = false;
554	if (optlen) {
555		if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
556			m = NULL;
557			goto freehdrs;
558		}
559		m = exthdrs.ip6e_ip6;
560		ip6 = mtod(m, struct ip6_hdr *);
561		hdrsplit = true;
562	}
563
564	/* Adjust mbuf packet header length. */
565	m->m_pkthdr.len += optlen;
566	plen = m->m_pkthdr.len - sizeof(*ip6);
567
568	/* If this is a jumbo payload, insert a jumbo payload option. */
569	if (plen > IPV6_MAXPACKET) {
570		if (!hdrsplit) {
571			if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
572				m = NULL;
573				goto freehdrs;
574			}
575			m = exthdrs.ip6e_ip6;
576			ip6 = mtod(m, struct ip6_hdr *);
577			hdrsplit = true;
578		}
579		if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0)
580			goto freehdrs;
581		ip6->ip6_plen = 0;
582	} else
583		ip6->ip6_plen = htons(plen);
584	nexthdrp = &ip6->ip6_nxt;
585
586	if (optlen) {
587		/*
588		 * Concatenate headers and fill in next header fields.
589		 * Here we have, on "m"
590		 *	IPv6 payload
591		 * and we insert headers accordingly.
592		 * Finally, we should be getting:
593		 *	IPv6 hbh dest1 rthdr ah* [esp* dest2 payload].
594		 *
595		 * During the header composing process "m" points to IPv6
596		 * header.  "mprev" points to an extension header prior to esp.
597		 */
598		mprev = m;
599
600		/*
601		 * We treat dest2 specially.  This makes IPsec processing
602		 * much easier.  The goal here is to make mprev point the
603		 * mbuf prior to dest2.
604		 *
605		 * Result: IPv6 dest2 payload.
606		 * m and mprev will point to IPv6 header.
607		 */
608		if (exthdrs.ip6e_dest2) {
609			if (!hdrsplit)
610				panic("%s:%d: assumption failed: "
611				    "hdr not split: hdrsplit %d exthdrs %p",
612				    __func__, __LINE__, hdrsplit, &exthdrs);
613			exthdrs.ip6e_dest2->m_next = m->m_next;
614			m->m_next = exthdrs.ip6e_dest2;
615			*mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt;
616			ip6->ip6_nxt = IPPROTO_DSTOPTS;
617		}
618
619		/*
620		 * Result: IPv6 hbh dest1 rthdr dest2 payload.
621		 * m will point to IPv6 header.  mprev will point to the
622		 * extension header prior to dest2 (rthdr in the above case).
623		 */
624		MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS);
625		MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp,
626			   IPPROTO_DSTOPTS);
627		MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp,
628			   IPPROTO_ROUTING);
629	}
630
631	IP6STAT_INC(ip6s_localout);
632
633	/* Route packet. */
634	ro_pmtu = ro;
635	if ((optvalid & IP6PO_VALID_RHINFO) != 0)
636		ro = &opt->ip6po_route;
637	if (ro != NULL)
638		dst = (struct sockaddr_in6 *)&ro->ro_dst;
639	else
640		dst = &sin6;
641	fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m);
642
643again:
644	/*
645	 * If specified, try to fill in the traffic class field.
646	 * Do not override if a non-zero value is already set.
647	 * We check the diffserv field and the ECN field separately.
648	 */
649	if ((optvalid & IP6PO_VALID_TC) != 0){
650		int mask = 0;
651
652		if (IPV6_DSCP(ip6) == 0)
653			mask |= 0xfc;
654		if (IPV6_ECN(ip6) == 0)
655			mask |= 0x03;
656		if (mask != 0)
657			ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20);
658	}
659
660	/* Fill in or override the hop limit field, if necessary. */
661	if ((optvalid & IP6PO_VALID_HLIM) != 0)
662		ip6->ip6_hlim = opt->ip6po_hlim & 0xff;
663	else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
664		if (im6o != NULL)
665			ip6->ip6_hlim = im6o->im6o_multicast_hlim;
666		else
667			ip6->ip6_hlim = V_ip6_defmcasthlim;
668	}
669
670	if (ro == NULL || ro->ro_nh == NULL) {
671		bzero(dst, sizeof(*dst));
672		dst->sin6_family = AF_INET6;
673		dst->sin6_len = sizeof(*dst);
674		dst->sin6_addr = ip6->ip6_dst;
675	}
676	/*
677	 * Validate route against routing table changes.
678	 * Make sure that the address family is set in route.
679	 */
680	nh = NULL;
681	ifp = NULL;
682	mtu = 0;
683	if (ro != NULL) {
684		if (ro->ro_nh != NULL && inp != NULL) {
685			ro->ro_dst.sin6_family = AF_INET6; /* XXX KASSERT? */
686			NH_VALIDATE((struct route *)ro, &inp->inp_rt_cookie,
687			    fibnum);
688		}
689		if (ro->ro_nh != NULL && fwd_tag == NULL &&
690		    (!NH_IS_VALID(ro->ro_nh) ||
691		    ro->ro_dst.sin6_family != AF_INET6 ||
692		    !IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst)))
693			RO_INVALIDATE_CACHE(ro);
694
695		if (ro->ro_nh != NULL && fwd_tag == NULL &&
696		    ro->ro_dst.sin6_family == AF_INET6 &&
697		    IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst)) {
698			/* Nexthop is valid and contains valid ifp */
699			nh = ro->ro_nh;
700		} else {
701			if (ro->ro_lle)
702				LLE_FREE(ro->ro_lle);	/* zeros ro_lle */
703			ro->ro_lle = NULL;
704			if (fwd_tag == NULL) {
705				bzero(&dst_sa, sizeof(dst_sa));
706				dst_sa.sin6_family = AF_INET6;
707				dst_sa.sin6_len = sizeof(dst_sa);
708				dst_sa.sin6_addr = ip6->ip6_dst;
709			}
710			error = in6_selectroute(&dst_sa, opt, im6o, ro, &ifp,
711			    &nh, fibnum, m->m_pkthdr.flowid);
712			if (error != 0) {
713				IP6STAT_INC(ip6s_noroute);
714				if (ifp != NULL)
715					in6_ifstat_inc(ifp, ifs6_out_discard);
716				goto bad;
717			}
718			/*
719			 * At this point at least @ifp is not NULL
720			 * Can be the case when dst is multicast, link-local or
			 * interface is explicitly specified by the caller.
722			 */
723		}
724		if (nh == NULL) {
725			/*
726			 * If in6_selectroute() does not return a nexthop
727			 * dst may not have been updated.
728			 */
729			*dst = dst_sa;	/* XXX */
730			origifp = ifp;
731			mtu = ifp->if_mtu;
732		} else {
733			ifp = nh->nh_ifp;
734			origifp = nh->nh_aifp;
735			ia = (struct in6_ifaddr *)(nh->nh_ifa);
736			counter_u64_add(nh->nh_pksent, 1);
737		}
738	} else {
739		struct nhop_object *nh;
740		struct in6_addr kdst;
741		uint32_t scopeid;
742
743		if (fwd_tag == NULL) {
744			bzero(&dst_sa, sizeof(dst_sa));
745			dst_sa.sin6_family = AF_INET6;
746			dst_sa.sin6_len = sizeof(dst_sa);
747			dst_sa.sin6_addr = ip6->ip6_dst;
748		}
749
750		if (IN6_IS_ADDR_MULTICAST(&dst_sa.sin6_addr) &&
751		    im6o != NULL &&
752		    (ifp = im6o->im6o_multicast_ifp) != NULL) {
753			/* We do not need a route lookup. */
754			*dst = dst_sa;	/* XXX */
755			origifp = ifp;
756			goto nonh6lookup;
757		}
758
759		in6_splitscope(&dst_sa.sin6_addr, &kdst, &scopeid);
760
761		if (IN6_IS_ADDR_MC_LINKLOCAL(&dst_sa.sin6_addr) ||
762		    IN6_IS_ADDR_MC_NODELOCAL(&dst_sa.sin6_addr)) {
763			if (scopeid > 0) {
764				ifp = in6_getlinkifnet(scopeid);
765				if (ifp == NULL) {
766					error = EHOSTUNREACH;
767					goto bad;
768				}
769				*dst = dst_sa;	/* XXX */
770				origifp = ifp;
771				goto nonh6lookup;
772			}
773		}
774
775		nh = fib6_lookup(fibnum, &kdst, scopeid, NHR_NONE,
776		    m->m_pkthdr.flowid);
777		if (nh == NULL) {
778			IP6STAT_INC(ip6s_noroute);
779			/* No ifp in6_ifstat_inc(ifp, ifs6_out_discard); */
780			error = EHOSTUNREACH;
781			goto bad;
782		}
783
784		ifp = nh->nh_ifp;
785		origifp = nh->nh_aifp;
786		ia = ifatoia6(nh->nh_ifa);
787		if (nh->nh_flags & NHF_GATEWAY)
788			dst->sin6_addr = nh->gw6_sa.sin6_addr;
789		else if (fwd_tag != NULL)
790			dst->sin6_addr = dst_sa.sin6_addr;
791nonh6lookup:
792		;
793	}
794	/*
795	 * At this point ifp MUST be pointing to the valid transmit ifp.
796	 * origifp MUST be valid and pointing to either the same ifp or,
797	 * in case of loopback output, to the interface which ip6_src
798	 * belongs to.
799	 * Examples:
800	 *  fe80::1%em0 -> fe80::2%em0 -> ifp=em0, origifp=em0
801	 *  fe80::1%em0 -> fe80::1%em0 -> ifp=lo0, origifp=em0
802	 *  ::1 -> ::1 -> ifp=lo0, origifp=lo0
803	 *
804	 * mtu can be 0 and will be refined later.
805	 */
806	KASSERT((ifp != NULL), ("output interface must not be NULL"));
807	KASSERT((origifp != NULL), ("output address interface must not be NULL"));
808
809	if ((flags & IPV6_FORWARDING) == 0) {
810		/* XXX: the FORWARDING flag can be set for mrouting. */
811		in6_ifstat_inc(ifp, ifs6_out_request);
812	}
813
814	/* Setup data structures for scope ID checks. */
815	src0 = ip6->ip6_src;
816	bzero(&src_sa, sizeof(src_sa));
817	src_sa.sin6_family = AF_INET6;
818	src_sa.sin6_len = sizeof(src_sa);
819	src_sa.sin6_addr = ip6->ip6_src;
820
821	dst0 = ip6->ip6_dst;
822	/* Re-initialize to be sure. */
823	bzero(&dst_sa, sizeof(dst_sa));
824	dst_sa.sin6_family = AF_INET6;
825	dst_sa.sin6_len = sizeof(dst_sa);
826	dst_sa.sin6_addr = ip6->ip6_dst;
827
828	/* Check for valid scope ID. */
829	if (in6_setscope(&src0, origifp, &zone) == 0 &&
830	    sa6_recoverscope(&src_sa) == 0 && zone == src_sa.sin6_scope_id &&
831	    in6_setscope(&dst0, origifp, &zone) == 0 &&
832	    sa6_recoverscope(&dst_sa) == 0 && zone == dst_sa.sin6_scope_id) {
833		/*
834		 * The outgoing interface is in the zone of the source
835		 * and destination addresses.
836		 *
837		 */
838	} else if ((origifp->if_flags & IFF_LOOPBACK) == 0 ||
839	    sa6_recoverscope(&src_sa) != 0 ||
840	    sa6_recoverscope(&dst_sa) != 0 ||
841	    dst_sa.sin6_scope_id == 0 ||
842	    (src_sa.sin6_scope_id != 0 &&
843	    src_sa.sin6_scope_id != dst_sa.sin6_scope_id) ||
844	    ifnet_byindex(dst_sa.sin6_scope_id) == NULL) {
845		/*
846		 * If the destination network interface is not a
847		 * loopback interface, or the destination network
848		 * address has no scope ID, or the source address has
849		 * a scope ID set which is different from the
850		 * destination address one, or there is no network
851		 * interface representing this scope ID, the address
852		 * pair is considered invalid.
853		 */
854		IP6STAT_INC(ip6s_badscope);
855		in6_ifstat_inc(origifp, ifs6_out_discard);
856		if (error == 0)
857			error = EHOSTUNREACH; /* XXX */
858		goto bad;
859	}
860	/* All scope ID checks are successful. */
861
862	if (nh && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
863		if ((optvalid & IP6PO_VALID_NHINFO) != 0) {
864			/*
865			 * The nexthop is explicitly specified by the
866			 * application.  We assume the next hop is an IPv6
867			 * address.
868			 */
869			dst = (struct sockaddr_in6 *)opt->ip6po_nexthop;
870		}
871		else if ((nh->nh_flags & NHF_GATEWAY))
872			dst = &nh->gw6_sa;
873	}
874
875	if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
876		m->m_flags &= ~(M_BCAST | M_MCAST); /* Just in case. */
877	} else {
878		m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST;
879		in6_ifstat_inc(ifp, ifs6_out_mcast);
880
881		/* Confirm that the outgoing interface supports multicast. */
882		if (!(ifp->if_flags & IFF_MULTICAST)) {
883			IP6STAT_INC(ip6s_noroute);
884			in6_ifstat_inc(ifp, ifs6_out_discard);
885			error = ENETUNREACH;
886			goto bad;
887		}
888		if ((im6o == NULL && in6_mcast_loop) ||
889		    (im6o && im6o->im6o_multicast_loop)) {
890			/*
891			 * Loop back multicast datagram if not expressly
892			 * forbidden to do so, even if we have not joined
893			 * the address; protocols will filter it later,
894			 * thus deferring a hash lookup and lock acquisition
895			 * at the expense of an m_copym().
896			 */
897			ip6_mloopback(ifp, m);
898		} else {
899			/*
900			 * If we are acting as a multicast router, perform
901			 * multicast forwarding as if the packet had just
902			 * arrived on the interface to which we are about
903			 * to send.  The multicast forwarding function
904			 * recursively calls this function, using the
905			 * IPV6_FORWARDING flag to prevent infinite recursion.
906			 *
907			 * Multicasts that are looped back by ip6_mloopback(),
908			 * above, will be forwarded by the ip6_input() routine,
909			 * if necessary.
910			 */
911			if (V_ip6_mrouter && (flags & IPV6_FORWARDING) == 0) {
912				/*
913				 * XXX: ip6_mforward expects that rcvif is NULL
914				 * when it is called from the originating path.
915				 * However, it may not always be the case.
916				 */
917				m->m_pkthdr.rcvif = NULL;
918				if (ip6_mforward(ip6, ifp, m) != 0) {
919					m_freem(m);
920					goto done;
921				}
922			}
923		}
924		/*
925		 * Multicasts with a hoplimit of zero may be looped back,
926		 * above, but must not be transmitted on a network.
927		 * Also, multicasts addressed to the loopback interface
928		 * are not sent -- the above call to ip6_mloopback() will
929		 * loop back a copy if this host actually belongs to the
930		 * destination group on the loopback interface.
931		 */
932		if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) ||
933		    IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) {
934			m_freem(m);
935			goto done;
936		}
937	}
938
939	/*
	 * Fill the outgoing interface to tell the upper layer
941	 * to increment per-interface statistics.
942	 */
943	if (ifpp)
944		*ifpp = ifp;
945
946	/* Determine path MTU. */
947	if ((error = ip6_getpmtu(ro_pmtu, ro != ro_pmtu, ifp, &ip6->ip6_dst,
948		    &mtu, &alwaysfrag, fibnum, *nexthdrp)) != 0)
949		goto bad;
950	KASSERT(mtu > 0, ("%s:%d: mtu %ld, ro_pmtu %p ro %p ifp %p "
951	    "alwaysfrag %d fibnum %u\n", __func__, __LINE__, mtu, ro_pmtu, ro,
952	    ifp, alwaysfrag, fibnum));
953
954	/*
955	 * The caller of this function may specify to use the minimum MTU
956	 * in some cases.
957	 * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU
958	 * setting.  The logic is a bit complicated; by default, unicast
959	 * packets will follow path MTU while multicast packets will be sent at
960	 * the minimum MTU.  If IP6PO_MINMTU_ALL is specified, all packets
961	 * including unicast ones will be sent at the minimum MTU.  Multicast
962	 * packets will always be sent at the minimum MTU unless
963	 * IP6PO_MINMTU_DISABLE is explicitly specified.
964	 * See RFC 3542 for more details.
965	 */
966	if (mtu > IPV6_MMTU) {
967		if ((flags & IPV6_MINMTU))
968			mtu = IPV6_MMTU;
969		else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL)
970			mtu = IPV6_MMTU;
971		else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
972			 (opt == NULL ||
973			  opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) {
974			mtu = IPV6_MMTU;
975		}
976	}
977
978	/*
979	 * Clear embedded scope identifiers if necessary.
980	 * in6_clearscope() will touch the addresses only when necessary.
981	 */
982	in6_clearscope(&ip6->ip6_src);
983	in6_clearscope(&ip6->ip6_dst);
984
985	/*
986	 * If the outgoing packet contains a hop-by-hop options header,
987	 * it must be examined and processed even by the source node.
988	 * (RFC 2460, section 4.)
989	 */
990	if (exthdrs.ip6e_hbh) {
991		struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *);
992		u_int32_t dummy; /* XXX unused */
993		u_int32_t plen = 0; /* XXX: ip6_process will check the value */
994
995#ifdef DIAGNOSTIC
996		if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len)
997			panic("ip6e_hbh is not contiguous");
998#endif
999		/*
1000		 *  XXX: if we have to send an ICMPv6 error to the sender,
1001		 *       we need the M_LOOP flag since icmp6_error() expects
1002		 *       the IPv6 and the hop-by-hop options header are
1003		 *       contiguous unless the flag is set.
1004		 */
1005		m->m_flags |= M_LOOP;
1006		m->m_pkthdr.rcvif = ifp;
1007		if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1),
1008		    ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh),
1009		    &dummy, &plen) < 0) {
1010			/* m was already freed at this point. */
1011			error = EINVAL;/* better error? */
1012			goto done;
1013		}
1014		m->m_flags &= ~M_LOOP; /* XXX */
1015		m->m_pkthdr.rcvif = NULL;
1016	}
1017
1018	/* Jump over all PFIL processing if hooks are not active. */
1019	if (!PFIL_HOOKED_OUT(V_inet6_pfil_head))
1020		goto passout;
1021
1022	odst = ip6->ip6_dst;
1023	/* Run through list of hooks for output packets. */
1024	switch (pfil_mbuf_out(V_inet6_pfil_head, &m, ifp, inp)) {
1025	case PFIL_PASS:
1026		ip6 = mtod(m, struct ip6_hdr *);
1027		break;
1028	case PFIL_DROPPED:
1029		error = EACCES;
1030		/* FALLTHROUGH */
1031	case PFIL_CONSUMED:
1032		goto done;
1033	}
1034
1035	needfiblookup = 0;
1036	/* See if destination IP address was changed by packet filter. */
1037	if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) {
1038		m->m_flags |= M_SKIP_FIREWALL;
1039		/* If destination is now ourself drop to ip6_input(). */
1040		if (in6_localip(&ip6->ip6_dst)) {
1041			m->m_flags |= M_FASTFWD_OURS;
1042			if (m->m_pkthdr.rcvif == NULL)
1043				m->m_pkthdr.rcvif = V_loif;
1044			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
1045				m->m_pkthdr.csum_flags |=
1046				    CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR;
1047				m->m_pkthdr.csum_data = 0xffff;
1048			}
1049#if defined(SCTP) || defined(SCTP_SUPPORT)
1050			if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6)
1051				m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
1052#endif
1053			error = netisr_queue(NETISR_IPV6, m);
1054			goto done;
1055		} else {
1056			if (ro != NULL)
1057				RO_INVALIDATE_CACHE(ro);
1058			needfiblookup = 1; /* Redo the routing table lookup. */
1059		}
1060	}
1061	/* See if fib was changed by packet filter. */
1062	if (fibnum != M_GETFIB(m)) {
1063		m->m_flags |= M_SKIP_FIREWALL;
1064		fibnum = M_GETFIB(m);
1065		if (ro != NULL)
1066			RO_INVALIDATE_CACHE(ro);
1067		needfiblookup = 1;
1068	}
1069	if (needfiblookup)
1070		goto again;
1071
1072	/* See if local, if yes, send it to netisr. */
1073	if (m->m_flags & M_FASTFWD_OURS) {
1074		if (m->m_pkthdr.rcvif == NULL)
1075			m->m_pkthdr.rcvif = V_loif;
1076		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
1077			m->m_pkthdr.csum_flags |=
1078			    CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR;
1079			m->m_pkthdr.csum_data = 0xffff;
1080		}
1081#if defined(SCTP) || defined(SCTP_SUPPORT)
1082		if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6)
1083			m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
1084#endif
1085		error = netisr_queue(NETISR_IPV6, m);
1086		goto done;
1087	}
1088	/* Or forward to some other address? */
1089	if ((m->m_flags & M_IP6_NEXTHOP) &&
1090	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
1091		if (ro != NULL)
1092			dst = (struct sockaddr_in6 *)&ro->ro_dst;
1093		else
1094			dst = &sin6;
1095		bcopy((fwd_tag+1), &dst_sa, sizeof(struct sockaddr_in6));
1096		m->m_flags |= M_SKIP_FIREWALL;
1097		m->m_flags &= ~M_IP6_NEXTHOP;
1098		m_tag_delete(m, fwd_tag);
1099		goto again;
1100	}
1101
1102passout:
1103	if (vlan_pcp > -1)
1104		EVL_APPLY_PRI(m, vlan_pcp);
1105
1106	/* Ensure the packet data is mapped if the interface requires it. */
1107	if ((ifp->if_capenable & IFCAP_MEXTPG) == 0) {
1108		m = mb_unmapped_to_ext(m);
1109		if (m == NULL) {
1110			IP6STAT_INC(ip6s_odropped);
1111			return (ENOBUFS);
1112		}
1113	}
1114
1115	/*
1116	 * Send the packet to the outgoing interface.
1117	 * If necessary, do IPv6 fragmentation before sending.
1118	 *
1119	 * The logic here is rather complex:
1120	 * 1: normal case (dontfrag == 0, alwaysfrag == 0)
1121	 * 1-a:	send as is if tlen <= path mtu
1122	 * 1-b:	fragment if tlen > path mtu
1123	 *
1124	 * 2: if user asks us not to fragment (dontfrag == 1)
1125	 * 2-a:	send as is if tlen <= interface mtu
1126	 * 2-b:	error if tlen > interface mtu
1127	 *
1128	 * 3: if we always need to attach fragment header (alwaysfrag == 1)
1129	 *	always fragment
1130	 *
1131	 * 4: if dontfrag == 1 && alwaysfrag == 1
1132	 *	error, as we cannot handle this conflicting request.
1133	 */
1134	sw_csum = m->m_pkthdr.csum_flags;
1135	if (!hdrsplit) {
1136		tso = ((sw_csum & ifp->if_hwassist &
1137		    (CSUM_TSO | CSUM_INNER_TSO)) != 0) ? 1 : 0;
1138		sw_csum &= ~ifp->if_hwassist;
1139	} else
1140		tso = 0;
1141	/*
1142	 * If we added extension headers, we will not do TSO and calculate the
1143	 * checksums ourselves for now.
1144	 * XXX-BZ  Need a framework to know when the NIC can handle it, even
1145	 * with ext. hdrs.
1146	 */
1147	ip6_output_delayed_csum(m, ifp, sw_csum, plen, optlen);
1148	/* XXX-BZ m->m_pkthdr.csum_flags &= ~ifp->if_hwassist; */
1149	tlen = m->m_pkthdr.len;
1150
1151	if ((opt && (opt->ip6po_flags & IP6PO_DONTFRAG)) || tso)
1152		dontfrag = 1;
1153	else
1154		dontfrag = 0;
1155	if (dontfrag && alwaysfrag) {	/* Case 4. */
1156		/* Conflicting request - can't transmit. */
1157		error = EMSGSIZE;
1158		goto bad;
1159	}
1160	if (dontfrag && tlen > IN6_LINKMTU(ifp) && !tso) {	/* Case 2-b. */
1161		/*
1162		 * Even if the DONTFRAG option is specified, we cannot send the
1163		 * packet when the data length is larger than the MTU of the
1164		 * outgoing interface.
1165		 * Notify the error by sending IPV6_PATHMTU ancillary data if
1166		 * application wanted to know the MTU value. Also return an
1167		 * error code (this is not described in the API spec).
1168		 */
1169		if (inp != NULL)
1170			ip6_notify_pmtu(inp, &dst_sa, (u_int32_t)mtu);
1171		error = EMSGSIZE;
1172		goto bad;
1173	}
1174
1175	/* Transmit packet without fragmentation. */
1176	if (dontfrag || (!alwaysfrag && tlen <= mtu)) {	/* Cases 1-a and 2-a. */
1177		struct in6_ifaddr *ia6;
1178
1179		ip6 = mtod(m, struct ip6_hdr *);
1180		ia6 = in6_ifawithifp(ifp, &ip6->ip6_src);
1181		if (ia6) {
1182			/* Record statistics for this interface address. */
1183			counter_u64_add(ia6->ia_ifa.ifa_opackets, 1);
1184			counter_u64_add(ia6->ia_ifa.ifa_obytes,
1185			    m->m_pkthdr.len);
1186		}
1187		error = ip6_output_send(inp, ifp, origifp, m, dst, ro,
1188		    (flags & IP_NO_SND_TAG_RL) ? false : true);
1189		goto done;
1190	}
1191
1192	/* Try to fragment the packet.  Cases 1-b and 3. */
1193	if (mtu < IPV6_MMTU) {
1194		/* Path MTU cannot be less than IPV6_MMTU. */
1195		error = EMSGSIZE;
1196		in6_ifstat_inc(ifp, ifs6_out_fragfail);
1197		goto bad;
1198	} else if (ip6->ip6_plen == 0) {
1199		/* Jumbo payload cannot be fragmented. */
1200		error = EMSGSIZE;
1201		in6_ifstat_inc(ifp, ifs6_out_fragfail);
1202		goto bad;
1203	} else {
1204		u_char nextproto;
1205
1206		/*
1207		 * Too large for the destination or interface;
1208		 * fragment if possible.
1209		 * Must be able to put at least 8 bytes per fragment.
1210		 */
1211		if (mtu > IPV6_MAXPACKET)
1212			mtu = IPV6_MAXPACKET;
1213
1214		len = (mtu - unfragpartlen - sizeof(struct ip6_frag)) & ~7;
1215		if (len < 8) {
1216			error = EMSGSIZE;
1217			in6_ifstat_inc(ifp, ifs6_out_fragfail);
1218			goto bad;
1219		}
1220
1221		/*
1222		 * If the interface will not calculate checksums on
1223		 * fragmented packets, then do it here.
1224		 * XXX-BZ handle the hw offloading case.  Need flags.
1225		 */
1226		ip6_output_delayed_csum(m, ifp, m->m_pkthdr.csum_flags, plen,
1227		    optlen);
1228
1229		/*
1230		 * Change the next header field of the last header in the
1231		 * unfragmentable part.
1232		 */
1233		if (exthdrs.ip6e_rthdr) {
1234			nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *);
1235			*mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT;
1236		} else if (exthdrs.ip6e_dest1) {
1237			nextproto = *mtod(exthdrs.ip6e_dest1, u_char *);
1238			*mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT;
1239		} else if (exthdrs.ip6e_hbh) {
1240			nextproto = *mtod(exthdrs.ip6e_hbh, u_char *);
1241			*mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT;
1242		} else {
1243			ip6 = mtod(m, struct ip6_hdr *);
1244			nextproto = ip6->ip6_nxt;
1245			ip6->ip6_nxt = IPPROTO_FRAGMENT;
1246		}
1247
1248		/*
1249		 * Loop through length of segment after first fragment,
1250		 * make new header and copy data of each part and link onto
1251		 * chain.
1252		 */
1253		m0 = m;
1254		id = htonl(ip6_randomid());
1255		error = ip6_fragment(ifp, m, unfragpartlen, nextproto,len, id);
1256		if (error != 0)
1257			goto sendorfree;
1258
1259		in6_ifstat_inc(ifp, ifs6_out_fragok);
1260	}
1261
1262	/* Remove leading garbage. */
1263sendorfree:
1264	m = m0->m_nextpkt;
1265	m0->m_nextpkt = 0;
1266	m_freem(m0);
1267	for (; m; m = m0) {
1268		m0 = m->m_nextpkt;
1269		m->m_nextpkt = 0;
1270		if (error == 0) {
1271			/* Record statistics for this interface address. */
1272			if (ia) {
1273				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
1274				counter_u64_add(ia->ia_ifa.ifa_obytes,
1275				    m->m_pkthdr.len);
1276			}
1277			if (vlan_pcp > -1)
1278				EVL_APPLY_PRI(m, vlan_pcp);
1279			error = ip6_output_send(inp, ifp, origifp, m, dst, ro,
1280			    true);
1281		} else
1282			m_freem(m);
1283	}
1284
1285	if (error == 0)
1286		IP6STAT_INC(ip6s_fragmented);
1287
1288done:
1289	return (error);
1290
1291freehdrs:
1292	m_freem(exthdrs.ip6e_hbh);	/* m_freem() checks if mbuf is NULL. */
1293	m_freem(exthdrs.ip6e_dest1);
1294	m_freem(exthdrs.ip6e_rthdr);
1295	m_freem(exthdrs.ip6e_dest2);
1296	/* FALLTHROUGH */
1297bad:
1298	if (m)
1299		m_freem(m);
1300	goto done;
1301}
1302
1303static int
1304ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen)
1305{
1306	struct mbuf *m;
1307
1308	if (hlen > MCLBYTES)
1309		return (ENOBUFS); /* XXX */
1310
1311	if (hlen > MLEN)
1312		m = m_getcl(M_NOWAIT, MT_DATA, 0);
1313	else
1314		m = m_get(M_NOWAIT, MT_DATA);
1315	if (m == NULL)
1316		return (ENOBUFS);
1317	m->m_len = hlen;
1318	if (hdr)
1319		bcopy(hdr, mtod(m, caddr_t), hlen);
1320
1321	*mp = m;
1322	return (0);
1323}
1324
1325/*
1326 * Insert jumbo payload option.
1327 */
1328static int
1329ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen)
1330{
1331	struct mbuf *mopt;
1332	u_char *optbuf;
1333	u_int32_t v;
1334
1335#define JUMBOOPTLEN	8	/* length of jumbo payload option and padding */
1336
1337	/*
1338	 * If there is no hop-by-hop options header, allocate new one.
1339	 * If there is one but it doesn't have enough space to store the
1340	 * jumbo payload option, allocate a cluster to store the whole options.
1341	 * Otherwise, use it to store the options.
1342	 */
1343	if (exthdrs->ip6e_hbh == NULL) {
1344		mopt = m_get(M_NOWAIT, MT_DATA);
1345		if (mopt == NULL)
1346			return (ENOBUFS);
1347		mopt->m_len = JUMBOOPTLEN;
1348		optbuf = mtod(mopt, u_char *);
1349		optbuf[1] = 0;	/* = ((JUMBOOPTLEN) >> 3) - 1 */
1350		exthdrs->ip6e_hbh = mopt;
1351	} else {
1352		struct ip6_hbh *hbh;
1353
1354		mopt = exthdrs->ip6e_hbh;
1355		if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) {
1356			/*
1357			 * XXX assumption:
1358			 * - exthdrs->ip6e_hbh is not referenced from places
1359			 *   other than exthdrs.
1360			 * - exthdrs->ip6e_hbh is not an mbuf chain.
1361			 */
1362			int oldoptlen = mopt->m_len;
1363			struct mbuf *n;
1364
1365			/*
1366			 * XXX: give up if the whole (new) hbh header does
1367			 * not fit even in an mbuf cluster.
1368			 */
1369			if (oldoptlen + JUMBOOPTLEN > MCLBYTES)
1370				return (ENOBUFS);
1371
1372			/*
1373			 * As a consequence, we must always prepare a cluster
1374			 * at this point.
1375			 */
1376			n = m_getcl(M_NOWAIT, MT_DATA, 0);
1377			if (n == NULL)
1378				return (ENOBUFS);
1379			n->m_len = oldoptlen + JUMBOOPTLEN;
1380			bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t),
1381			    oldoptlen);
1382			optbuf = mtod(n, caddr_t) + oldoptlen;
1383			m_freem(mopt);
1384			mopt = exthdrs->ip6e_hbh = n;
1385		} else {
1386			optbuf = mtod(mopt, u_char *) + mopt->m_len;
1387			mopt->m_len += JUMBOOPTLEN;
1388		}
1389		optbuf[0] = IP6OPT_PADN;
1390		optbuf[1] = 1;
1391
1392		/*
1393		 * Adjust the header length according to the pad and
1394		 * the jumbo payload option.
1395		 */
1396		hbh = mtod(mopt, struct ip6_hbh *);
1397		hbh->ip6h_len += (JUMBOOPTLEN >> 3);
1398	}
1399
1400	/* fill in the option. */
1401	optbuf[2] = IP6OPT_JUMBO;
1402	optbuf[3] = 4;
1403	v = (u_int32_t)htonl(plen + JUMBOOPTLEN);
1404	bcopy(&v, &optbuf[4], sizeof(u_int32_t));
1405
1406	/* finally, adjust the packet header length */
1407	exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN;
1408
1409	return (0);
1410#undef JUMBOOPTLEN
1411}
1412
1413/*
1414 * Insert fragment header and copy unfragmentable header portions.
1415 */
1416static int
1417ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen,
1418    struct ip6_frag **frghdrp)
1419{
1420	struct mbuf *n, *mlast;
1421
1422	if (hlen > sizeof(struct ip6_hdr)) {
1423		n = m_copym(m0, sizeof(struct ip6_hdr),
1424		    hlen - sizeof(struct ip6_hdr), M_NOWAIT);
1425		if (n == NULL)
1426			return (ENOBUFS);
1427		m->m_next = n;
1428	} else
1429		n = m;
1430
1431	/* Search for the last mbuf of unfragmentable part. */
1432	for (mlast = n; mlast->m_next; mlast = mlast->m_next)
1433		;
1434
1435	if (M_WRITABLE(mlast) &&
1436	    M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) {
1437		/* use the trailing space of the last mbuf for the fragment hdr */
1438		*frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) +
1439		    mlast->m_len);
1440		mlast->m_len += sizeof(struct ip6_frag);
1441		m->m_pkthdr.len += sizeof(struct ip6_frag);
1442	} else {
1443		/* allocate a new mbuf for the fragment header */
1444		struct mbuf *mfrg;
1445
1446		mfrg = m_get(M_NOWAIT, MT_DATA);
1447		if (mfrg == NULL)
1448			return (ENOBUFS);
1449		mfrg->m_len = sizeof(struct ip6_frag);
1450		*frghdrp = mtod(mfrg, struct ip6_frag *);
1451		mlast->m_next = mfrg;
1452	}
1453
1454	return (0);
1455}
1456
1457/*
1458 * Calculates IPv6 path mtu for destination @dst.
1459 * Resulting MTU is stored in @mtup.
1460 *
1461 * Returns 0 on success.
1462 */
1463static int
1464ip6_getpmtu_ctl(u_int fibnum, const struct in6_addr *dst, u_long *mtup)
1465{
1466	struct epoch_tracker et;
1467	struct nhop_object *nh;
1468	struct in6_addr kdst;
1469	uint32_t scopeid;
1470	int error;
1471
1472	in6_splitscope(dst, &kdst, &scopeid);
1473
1474	NET_EPOCH_ENTER(et);
1475	nh = fib6_lookup(fibnum, &kdst, scopeid, NHR_NONE, 0);
1476	if (nh != NULL)
1477		error = ip6_calcmtu(nh->nh_ifp, dst, nh->nh_mtu, mtup, NULL, 0);
1478	else
1479		error = EHOSTUNREACH;
1480	NET_EPOCH_EXIT(et);
1481
1482	return (error);
1483}
1484
1485/*
1486 * Calculates IPv6 path MTU for @dst based on transmit @ifp,
1487 * and cached data in @ro_pmtu.
1488 * MTU from (successful) route lookup is saved (along with dst)
1489 * inside @ro_pmtu to avoid subsequent route lookups after packet
1490 * filter processing.
1491 *
1492 * Stores mtu and always-frag value into @mtup and @alwaysfragp.
1493 * Returns 0 on success.
1494 */
1495static int
1496ip6_getpmtu(struct route_in6 *ro_pmtu, int do_lookup,
1497    struct ifnet *ifp, const struct in6_addr *dst, u_long *mtup,
1498    int *alwaysfragp, u_int fibnum, u_int proto)
1499{
1500	struct nhop_object *nh;
1501	struct in6_addr kdst;
1502	uint32_t scopeid;
1503	struct sockaddr_in6 *sa6_dst, sin6;
1504	u_long mtu;
1505
1506	NET_EPOCH_ASSERT();
1507
1508	mtu = 0;
1509	if (ro_pmtu == NULL || do_lookup) {
1510		/*
1511		 * Here ro_pmtu has final destination address, while
1512		 * ro might represent immediate destination.
1513		 * Use ro_pmtu destination since mtu might differ.
1514		 */
1515		if (ro_pmtu != NULL) {
1516			sa6_dst = (struct sockaddr_in6 *)&ro_pmtu->ro_dst;
1517			if (!IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst))
1518				ro_pmtu->ro_mtu = 0;
1519		} else
1520			sa6_dst = &sin6;
1521
1522		if (ro_pmtu == NULL || ro_pmtu->ro_mtu == 0) {
1523			bzero(sa6_dst, sizeof(*sa6_dst));
1524			sa6_dst->sin6_family = AF_INET6;
1525			sa6_dst->sin6_len = sizeof(struct sockaddr_in6);
1526			sa6_dst->sin6_addr = *dst;
1527
1528			in6_splitscope(dst, &kdst, &scopeid);
1529			nh = fib6_lookup(fibnum, &kdst, scopeid, NHR_NONE, 0);
1530			if (nh != NULL) {
1531				mtu = nh->nh_mtu;
1532				if (ro_pmtu != NULL)
1533					ro_pmtu->ro_mtu = mtu;
1534			}
1535		} else
1536			mtu = ro_pmtu->ro_mtu;
1537	}
1538
1539	if (ro_pmtu != NULL && ro_pmtu->ro_nh != NULL)
1540		mtu = ro_pmtu->ro_nh->nh_mtu;
1541
1542	return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp, proto));
1543}
1544
1545/*
1546 * Calculate MTU based on transmit @ifp, route mtu @rt_mtu and
1547 * hostcache data for @dst.
1548 * Stores mtu and always-frag value into @mtup and @alwaysfragp.
1549 *
1550 * Returns 0 on success.
1551 */
1552static int
1553ip6_calcmtu(struct ifnet *ifp, const struct in6_addr *dst, u_long rt_mtu,
1554    u_long *mtup, int *alwaysfragp, u_int proto)
1555{
1556	u_long mtu = 0;
1557	int alwaysfrag = 0;
1558	int error = 0;
1559
1560	if (rt_mtu > 0) {
1561		u_int32_t ifmtu;
1562		struct in_conninfo inc;
1563
1564		bzero(&inc, sizeof(inc));
1565		inc.inc_flags |= INC_ISIPV6;
1566		inc.inc6_faddr = *dst;
1567
1568		ifmtu = IN6_LINKMTU(ifp);
1569
1570		/* TCP is known to react to pmtu changes so skip hc */
1571		if (proto != IPPROTO_TCP)
1572			mtu = tcp_hc_getmtu(&inc);
1573
1574		if (mtu)
1575			mtu = min(mtu, rt_mtu);
1576		else
1577			mtu = rt_mtu;
1578		if (mtu == 0)
1579			mtu = ifmtu;
1580		else if (mtu < IPV6_MMTU) {
1581			/*
1582			 * RFC2460 section 5, last paragraph:
1583			 * if we record ICMPv6 too big message with
1584			 * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU
1585			 * or smaller, with framgent header attached.
1586			 * (fragment header is needed regardless from the
1587			 * packet size, for translators to identify packets)
1588			 */
1589			alwaysfrag = 1;
1590			mtu = IPV6_MMTU;
1591		}
1592	} else if (ifp) {
1593		mtu = IN6_LINKMTU(ifp);
1594	} else
1595		error = EHOSTUNREACH; /* XXX */
1596
1597	*mtup = mtu;
1598	if (alwaysfragp)
1599		*alwaysfragp = alwaysfrag;
1600	return (error);
1601}
1602
1603/*
1604 * IP6 socket option processing.
1605 */
1606int
1607ip6_ctloutput(struct socket *so, struct sockopt *sopt)
1608{
1609	int optdatalen, uproto;
1610	void *optdata;
1611	struct inpcb *inp = sotoinpcb(so);
1612	int error, optval;
1613	int level, op, optname;
1614	int optlen;
1615	struct thread *td;
1616#ifdef	RSS
1617	uint32_t rss_bucket;
1618	int retval;
1619#endif
1620
1621/*
1622 * Don't use more than a quarter of mbuf clusters.  N.B.:
1623 * nmbclusters is an int, but nmbclusters * MCLBYTES may overflow
1624 * on LP64 architectures, so cast to u_long to avoid undefined
1625 * behavior.  ILP32 architectures cannot have nmbclusters
1626 * large enough to overflow for other reasons.
1627 */
1628#define IPV6_PKTOPTIONS_MBUF_LIMIT	((u_long)nmbclusters * MCLBYTES / 4)
1629
1630	level = sopt->sopt_level;
1631	op = sopt->sopt_dir;
1632	optname = sopt->sopt_name;
1633	optlen = sopt->sopt_valsize;
1634	td = sopt->sopt_td;
1635	error = 0;
1636	optval = 0;
1637	uproto = (int)so->so_proto->pr_protocol;
1638
1639	if (level != IPPROTO_IPV6) {
1640		error = EINVAL;
1641
1642		if (sopt->sopt_level == SOL_SOCKET &&
1643		    sopt->sopt_dir == SOPT_SET) {
1644			switch (sopt->sopt_name) {
1645			case SO_SETFIB:
1646				INP_WLOCK(inp);
1647				inp->inp_inc.inc_fibnum = so->so_fibnum;
1648				INP_WUNLOCK(inp);
1649				error = 0;
1650				break;
1651			case SO_MAX_PACING_RATE:
1652#ifdef RATELIMIT
1653				INP_WLOCK(inp);
1654				inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
1655				INP_WUNLOCK(inp);
1656				error = 0;
1657#else
1658				error = EOPNOTSUPP;
1659#endif
1660				break;
1661			default:
1662				break;
1663			}
1664		}
1665	} else {		/* level == IPPROTO_IPV6 */
1666		switch (op) {
1667		case SOPT_SET:
1668			switch (optname) {
1669			case IPV6_2292PKTOPTIONS:
1670#ifdef IPV6_PKTOPTIONS
1671			case IPV6_PKTOPTIONS:
1672#endif
1673			{
1674				struct mbuf *m;
1675
1676				if (optlen > IPV6_PKTOPTIONS_MBUF_LIMIT) {
1677					printf("ip6_ctloutput: mbuf limit hit\n");
1678					error = ENOBUFS;
1679					break;
1680				}
1681
1682				error = soopt_getm(sopt, &m); /* XXX */
1683				if (error != 0)
1684					break;
1685				error = soopt_mcopyin(sopt, m); /* XXX */
1686				if (error != 0)
1687					break;
1688				INP_WLOCK(inp);
1689				error = ip6_pcbopts(&inp->in6p_outputopts, m,
1690				    so, sopt);
1691				INP_WUNLOCK(inp);
1692				m_freem(m); /* XXX */
1693				break;
1694			}
1695
1696			/*
1697			 * Use of some Hop-by-Hop options or some
1698			 * Destination options, might require special
1699			 * privilege.  That is, normal applications
1700			 * (without special privilege) might be forbidden
1701			 * from setting certain options in outgoing packets,
1702			 * and might never see certain options in received
1703			 * packets. [RFC 2292 Section 6]
1704			 * KAME specific note:
1705			 *  KAME prevents non-privileged users from sending or
1706			 *  receiving ANY hbh/dst options in order to avoid
1707			 *  overhead of parsing options in the kernel.
1708			 */
1709			case IPV6_RECVHOPOPTS:
1710			case IPV6_RECVDSTOPTS:
1711			case IPV6_RECVRTHDRDSTOPTS:
1712				if (td != NULL) {
1713					error = priv_check(td,
1714					    PRIV_NETINET_SETHDROPTS);
1715					if (error)
1716						break;
1717				}
1718				/* FALLTHROUGH */
1719			case IPV6_UNICAST_HOPS:
1720			case IPV6_HOPLIMIT:
1721
1722			case IPV6_RECVPKTINFO:
1723			case IPV6_RECVHOPLIMIT:
1724			case IPV6_RECVRTHDR:
1725			case IPV6_RECVPATHMTU:
1726			case IPV6_RECVTCLASS:
1727			case IPV6_RECVFLOWID:
1728#ifdef	RSS
1729			case IPV6_RECVRSSBUCKETID:
1730#endif
1731			case IPV6_V6ONLY:
1732			case IPV6_AUTOFLOWLABEL:
1733			case IPV6_ORIGDSTADDR:
1734			case IPV6_BINDANY:
1735			case IPV6_VLAN_PCP:
1736				if (optname == IPV6_BINDANY && td != NULL) {
1737					error = priv_check(td,
1738					    PRIV_NETINET_BINDANY);
1739					if (error)
1740						break;
1741				}
1742
1743				if (optlen != sizeof(int)) {
1744					error = EINVAL;
1745					break;
1746				}
1747				error = sooptcopyin(sopt, &optval,
1748					sizeof optval, sizeof optval);
1749				if (error)
1750					break;
1751				switch (optname) {
1752				case IPV6_UNICAST_HOPS:
1753					if (optval < -1 || optval >= 256)
1754						error = EINVAL;
1755					else {
1756						/* -1 = kernel default */
1757						inp->in6p_hops = optval;
1758						if ((inp->inp_vflag &
1759						     INP_IPV4) != 0)
1760							inp->inp_ip_ttl = optval;
1761					}
1762					break;
1763#define OPTSET(bit) \
1764do { \
1765	INP_WLOCK(inp); \
1766	if (optval) \
1767		inp->inp_flags |= (bit); \
1768	else \
1769		inp->inp_flags &= ~(bit); \
1770	INP_WUNLOCK(inp); \
1771} while (/*CONSTCOND*/ 0)
1772#define OPTSET2292(bit) \
1773do { \
1774	INP_WLOCK(inp); \
1775	inp->inp_flags |= IN6P_RFC2292; \
1776	if (optval) \
1777		inp->inp_flags |= (bit); \
1778	else \
1779		inp->inp_flags &= ~(bit); \
1780	INP_WUNLOCK(inp); \
1781} while (/*CONSTCOND*/ 0)
1782#define OPTBIT(bit) (inp->inp_flags & (bit) ? 1 : 0)
1783
1784#define OPTSET2_N(bit, val) do {					\
1785	if (val)							\
1786		inp->inp_flags2 |= bit;					\
1787	else								\
1788		inp->inp_flags2 &= ~bit;				\
1789} while (0)
1790#define OPTSET2(bit, val) do {						\
1791	INP_WLOCK(inp);							\
1792	OPTSET2_N(bit, val);						\
1793	INP_WUNLOCK(inp);						\
1794} while (0)
1795#define OPTBIT2(bit) (inp->inp_flags2 & (bit) ? 1 : 0)
1796#define OPTSET2292_EXCLUSIVE(bit)					\
1797do {									\
1798	INP_WLOCK(inp);							\
1799	if (OPTBIT(IN6P_RFC2292)) {					\
1800		error = EINVAL;						\
1801	} else {							\
1802		if (optval)						\
1803			inp->inp_flags |= (bit);			\
1804		else							\
1805			inp->inp_flags &= ~(bit);			\
1806	}								\
1807	INP_WUNLOCK(inp);						\
1808} while (/*CONSTCOND*/ 0)
1809
1810				case IPV6_RECVPKTINFO:
1811					OPTSET2292_EXCLUSIVE(IN6P_PKTINFO);
1812					break;
1813
1814				case IPV6_HOPLIMIT:
1815				{
1816					struct ip6_pktopts **optp;
1817
1818					/* cannot mix with RFC2292 */
1819					if (OPTBIT(IN6P_RFC2292)) {
1820						error = EINVAL;
1821						break;
1822					}
1823					INP_WLOCK(inp);
1824					if (inp->inp_flags & INP_DROPPED) {
1825						INP_WUNLOCK(inp);
1826						return (ECONNRESET);
1827					}
1828					optp = &inp->in6p_outputopts;
1829					error = ip6_pcbopt(IPV6_HOPLIMIT,
1830					    (u_char *)&optval, sizeof(optval),
1831					    optp, (td != NULL) ? td->td_ucred :
1832					    NULL, uproto);
1833					INP_WUNLOCK(inp);
1834					break;
1835				}
1836
1837				case IPV6_RECVHOPLIMIT:
1838					OPTSET2292_EXCLUSIVE(IN6P_HOPLIMIT);
1839					break;
1840
1841				case IPV6_RECVHOPOPTS:
1842					OPTSET2292_EXCLUSIVE(IN6P_HOPOPTS);
1843					break;
1844
1845				case IPV6_RECVDSTOPTS:
1846					OPTSET2292_EXCLUSIVE(IN6P_DSTOPTS);
1847					break;
1848
1849				case IPV6_RECVRTHDRDSTOPTS:
1850					OPTSET2292_EXCLUSIVE(IN6P_RTHDRDSTOPTS);
1851					break;
1852
1853				case IPV6_RECVRTHDR:
1854					OPTSET2292_EXCLUSIVE(IN6P_RTHDR);
1855					break;
1856
1857				case IPV6_RECVPATHMTU:
1858					/*
1859					 * We ignore this option for TCP
1860					 * sockets.
1861					 * (RFC3542 leaves this case
1862					 * unspecified.)
1863					 */
1864					if (uproto != IPPROTO_TCP)
1865						OPTSET(IN6P_MTU);
1866					break;
1867
1868				case IPV6_RECVFLOWID:
1869					OPTSET2(INP_RECVFLOWID, optval);
1870					break;
1871
1872#ifdef	RSS
1873				case IPV6_RECVRSSBUCKETID:
1874					OPTSET2(INP_RECVRSSBUCKETID, optval);
1875					break;
1876#endif
1877
1878				case IPV6_V6ONLY:
1879					INP_WLOCK(inp);
1880					if (inp->inp_lport ||
1881					    !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
1882						/*
1883						 * The socket is already bound.
1884						 */
1885						INP_WUNLOCK(inp);
1886						error = EINVAL;
1887						break;
1888					}
1889					if (optval) {
1890						inp->inp_flags |= IN6P_IPV6_V6ONLY;
1891						inp->inp_vflag &= ~INP_IPV4;
1892					} else {
1893						inp->inp_flags &= ~IN6P_IPV6_V6ONLY;
1894						inp->inp_vflag |= INP_IPV4;
1895					}
1896					INP_WUNLOCK(inp);
1897					break;
1898				case IPV6_RECVTCLASS:
1899					/* cannot mix with RFC2292 XXX */
1900					OPTSET2292_EXCLUSIVE(IN6P_TCLASS);
1901					break;
1902				case IPV6_AUTOFLOWLABEL:
1903					OPTSET(IN6P_AUTOFLOWLABEL);
1904					break;
1905
1906				case IPV6_ORIGDSTADDR:
1907					OPTSET2(INP_ORIGDSTADDR, optval);
1908					break;
1909				case IPV6_BINDANY:
1910					OPTSET(INP_BINDANY);
1911					break;
1912				case IPV6_VLAN_PCP:
1913					if ((optval >= -1) && (optval <=
1914					    (INP_2PCP_MASK >> INP_2PCP_SHIFT))) {
1915						if (optval == -1) {
1916							INP_WLOCK(inp);
1917							inp->inp_flags2 &=
1918							    ~(INP_2PCP_SET |
1919							    INP_2PCP_MASK);
1920							INP_WUNLOCK(inp);
1921						} else {
1922							INP_WLOCK(inp);
1923							inp->inp_flags2 |=
1924							    INP_2PCP_SET;
1925							inp->inp_flags2 &=
1926							    ~INP_2PCP_MASK;
1927							inp->inp_flags2 |=
1928							    optval <<
1929							    INP_2PCP_SHIFT;
1930							INP_WUNLOCK(inp);
1931						}
1932					} else
1933						error = EINVAL;
1934					break;
1935				}
1936				break;
1937
1938			case IPV6_TCLASS:
1939			case IPV6_DONTFRAG:
1940			case IPV6_USE_MIN_MTU:
1941			case IPV6_PREFER_TEMPADDR:
1942				if (optlen != sizeof(optval)) {
1943					error = EINVAL;
1944					break;
1945				}
1946				error = sooptcopyin(sopt, &optval,
1947					sizeof optval, sizeof optval);
1948				if (error)
1949					break;
1950				{
1951					struct ip6_pktopts **optp;
1952					INP_WLOCK(inp);
1953					if (inp->inp_flags & INP_DROPPED) {
1954						INP_WUNLOCK(inp);
1955						return (ECONNRESET);
1956					}
1957					optp = &inp->in6p_outputopts;
1958					error = ip6_pcbopt(optname,
1959					    (u_char *)&optval, sizeof(optval),
1960					    optp, (td != NULL) ? td->td_ucred :
1961					    NULL, uproto);
1962					INP_WUNLOCK(inp);
1963					break;
1964				}
1965
1966			case IPV6_2292PKTINFO:
1967			case IPV6_2292HOPLIMIT:
1968			case IPV6_2292HOPOPTS:
1969			case IPV6_2292DSTOPTS:
1970			case IPV6_2292RTHDR:
1971				/* RFC 2292 */
1972				if (optlen != sizeof(int)) {
1973					error = EINVAL;
1974					break;
1975				}
1976				error = sooptcopyin(sopt, &optval,
1977					sizeof optval, sizeof optval);
1978				if (error)
1979					break;
1980				switch (optname) {
1981				case IPV6_2292PKTINFO:
1982					OPTSET2292(IN6P_PKTINFO);
1983					break;
1984				case IPV6_2292HOPLIMIT:
1985					OPTSET2292(IN6P_HOPLIMIT);
1986					break;
1987				case IPV6_2292HOPOPTS:
1988					/*
1989					 * Check super-user privilege.
1990					 * See comments for IPV6_RECVHOPOPTS.
1991					 */
1992					if (td != NULL) {
1993						error = priv_check(td,
1994						    PRIV_NETINET_SETHDROPTS);
1995						if (error)
1996							return (error);
1997					}
1998					OPTSET2292(IN6P_HOPOPTS);
1999					break;
2000				case IPV6_2292DSTOPTS:
2001					if (td != NULL) {
2002						error = priv_check(td,
2003						    PRIV_NETINET_SETHDROPTS);
2004						if (error)
2005							return (error);
2006					}
2007					OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */
2008					break;
2009				case IPV6_2292RTHDR:
2010					OPTSET2292(IN6P_RTHDR);
2011					break;
2012				}
2013				break;
2014			case IPV6_PKTINFO:
2015			case IPV6_HOPOPTS:
2016			case IPV6_RTHDR:
2017			case IPV6_DSTOPTS:
2018			case IPV6_RTHDRDSTOPTS:
2019			case IPV6_NEXTHOP:
2020			{
2021				/* new advanced API (RFC3542) */
2022				u_char *optbuf;
2023				u_char optbuf_storage[MCLBYTES];
2024				int optlen;
2025				struct ip6_pktopts **optp;
2026
2027				/* cannot mix with RFC2292 */
2028				if (OPTBIT(IN6P_RFC2292)) {
2029					error = EINVAL;
2030					break;
2031				}
2032
2033				/*
2034				 * We only ensure valsize is not too large
2035				 * here.  Further validation will be done
2036				 * later.
2037				 */
2038				error = sooptcopyin(sopt, optbuf_storage,
2039				    sizeof(optbuf_storage), 0);
2040				if (error)
2041					break;
2042				optlen = sopt->sopt_valsize;
2043				optbuf = optbuf_storage;
2044				INP_WLOCK(inp);
2045				if (inp->inp_flags & INP_DROPPED) {
2046					INP_WUNLOCK(inp);
2047					return (ECONNRESET);
2048				}
2049				optp = &inp->in6p_outputopts;
2050				error = ip6_pcbopt(optname, optbuf, optlen,
2051				    optp, (td != NULL) ? td->td_ucred : NULL,
2052				    uproto);
2053				INP_WUNLOCK(inp);
2054				break;
2055			}
2056#undef OPTSET
2057
2058			case IPV6_MULTICAST_IF:
2059			case IPV6_MULTICAST_HOPS:
2060			case IPV6_MULTICAST_LOOP:
2061			case IPV6_JOIN_GROUP:
2062			case IPV6_LEAVE_GROUP:
2063			case IPV6_MSFILTER:
2064			case MCAST_BLOCK_SOURCE:
2065			case MCAST_UNBLOCK_SOURCE:
2066			case MCAST_JOIN_GROUP:
2067			case MCAST_LEAVE_GROUP:
2068			case MCAST_JOIN_SOURCE_GROUP:
2069			case MCAST_LEAVE_SOURCE_GROUP:
2070				error = ip6_setmoptions(inp, sopt);
2071				break;
2072
2073			case IPV6_PORTRANGE:
2074				error = sooptcopyin(sopt, &optval,
2075				    sizeof optval, sizeof optval);
2076				if (error)
2077					break;
2078
2079				INP_WLOCK(inp);
2080				switch (optval) {
2081				case IPV6_PORTRANGE_DEFAULT:
2082					inp->inp_flags &= ~(INP_LOWPORT);
2083					inp->inp_flags &= ~(INP_HIGHPORT);
2084					break;
2085
2086				case IPV6_PORTRANGE_HIGH:
2087					inp->inp_flags &= ~(INP_LOWPORT);
2088					inp->inp_flags |= INP_HIGHPORT;
2089					break;
2090
2091				case IPV6_PORTRANGE_LOW:
2092					inp->inp_flags &= ~(INP_HIGHPORT);
2093					inp->inp_flags |= INP_LOWPORT;
2094					break;
2095
2096				default:
2097					error = EINVAL;
2098					break;
2099				}
2100				INP_WUNLOCK(inp);
2101				break;
2102
2103#if defined(IPSEC) || defined(IPSEC_SUPPORT)
2104			case IPV6_IPSEC_POLICY:
2105				if (IPSEC_ENABLED(ipv6)) {
2106					error = IPSEC_PCBCTL(ipv6, inp, sopt);
2107					break;
2108				}
2109				/* FALLTHROUGH */
2110#endif /* IPSEC */
2111
2112			default:
2113				error = ENOPROTOOPT;
2114				break;
2115			}
2116			break;
2117
2118		case SOPT_GET:
2119			switch (optname) {
2120			case IPV6_2292PKTOPTIONS:
2121#ifdef IPV6_PKTOPTIONS
2122			case IPV6_PKTOPTIONS:
2123#endif
2124				/*
2125				 * RFC3542 (effectively) deprecated the
2126				 * semantics of the 2292-style pktoptions.
2127				 * Since it was not reliable in nature (i.e.,
2128				 * applications had to expect the lack of some
2129				 * information after all), it would make sense
2130				 * to simplify this part by always returning
2131				 * empty data.
2132				 */
2133				sopt->sopt_valsize = 0;
2134				break;
2135
2136			case IPV6_RECVHOPOPTS:
2137			case IPV6_RECVDSTOPTS:
2138			case IPV6_RECVRTHDRDSTOPTS:
2139			case IPV6_UNICAST_HOPS:
2140			case IPV6_RECVPKTINFO:
2141			case IPV6_RECVHOPLIMIT:
2142			case IPV6_RECVRTHDR:
2143			case IPV6_RECVPATHMTU:
2144
2145			case IPV6_V6ONLY:
2146			case IPV6_PORTRANGE:
2147			case IPV6_RECVTCLASS:
2148			case IPV6_AUTOFLOWLABEL:
2149			case IPV6_BINDANY:
2150			case IPV6_FLOWID:
2151			case IPV6_FLOWTYPE:
2152			case IPV6_RECVFLOWID:
2153#ifdef	RSS
2154			case IPV6_RSSBUCKETID:
2155			case IPV6_RECVRSSBUCKETID:
2156#endif
2157			case IPV6_VLAN_PCP:
2158				switch (optname) {
2159				case IPV6_RECVHOPOPTS:
2160					optval = OPTBIT(IN6P_HOPOPTS);
2161					break;
2162
2163				case IPV6_RECVDSTOPTS:
2164					optval = OPTBIT(IN6P_DSTOPTS);
2165					break;
2166
2167				case IPV6_RECVRTHDRDSTOPTS:
2168					optval = OPTBIT(IN6P_RTHDRDSTOPTS);
2169					break;
2170
2171				case IPV6_UNICAST_HOPS:
2172					optval = inp->in6p_hops;
2173					break;
2174
2175				case IPV6_RECVPKTINFO:
2176					optval = OPTBIT(IN6P_PKTINFO);
2177					break;
2178
2179				case IPV6_RECVHOPLIMIT:
2180					optval = OPTBIT(IN6P_HOPLIMIT);
2181					break;
2182
2183				case IPV6_RECVRTHDR:
2184					optval = OPTBIT(IN6P_RTHDR);
2185					break;
2186
2187				case IPV6_RECVPATHMTU:
2188					optval = OPTBIT(IN6P_MTU);
2189					break;
2190
2191				case IPV6_V6ONLY:
2192					optval = OPTBIT(IN6P_IPV6_V6ONLY);
2193					break;
2194
2195				case IPV6_PORTRANGE:
2196				    {
2197					int flags;
2198					flags = inp->inp_flags;
2199					if (flags & INP_HIGHPORT)
2200						optval = IPV6_PORTRANGE_HIGH;
2201					else if (flags & INP_LOWPORT)
2202						optval = IPV6_PORTRANGE_LOW;
2203					else
2204						optval = 0;
2205					break;
2206				    }
2207				case IPV6_RECVTCLASS:
2208					optval = OPTBIT(IN6P_TCLASS);
2209					break;
2210
2211				case IPV6_AUTOFLOWLABEL:
2212					optval = OPTBIT(IN6P_AUTOFLOWLABEL);
2213					break;
2214
2215				case IPV6_ORIGDSTADDR:
2216					optval = OPTBIT2(INP_ORIGDSTADDR);
2217					break;
2218
2219				case IPV6_BINDANY:
2220					optval = OPTBIT(INP_BINDANY);
2221					break;
2222
2223				case IPV6_FLOWID:
2224					optval = inp->inp_flowid;
2225					break;
2226
2227				case IPV6_FLOWTYPE:
2228					optval = inp->inp_flowtype;
2229					break;
2230
2231				case IPV6_RECVFLOWID:
2232					optval = OPTBIT2(INP_RECVFLOWID);
2233					break;
2234#ifdef	RSS
2235				case IPV6_RSSBUCKETID:
2236					retval =
2237					    rss_hash2bucket(inp->inp_flowid,
2238					    inp->inp_flowtype,
2239					    &rss_bucket);
2240					if (retval == 0)
2241						optval = rss_bucket;
2242					else
2243						error = EINVAL;
2244					break;
2245
2246				case IPV6_RECVRSSBUCKETID:
2247					optval = OPTBIT2(INP_RECVRSSBUCKETID);
2248					break;
2249#endif
2250
2251
2252				case IPV6_VLAN_PCP:
2253					if (OPTBIT2(INP_2PCP_SET)) {
2254						optval = (inp->inp_flags2 &
2255							    INP_2PCP_MASK) >>
2256							    INP_2PCP_SHIFT;
2257					} else {
2258						optval = -1;
2259					}
2260					break;
2261				}
2262
2263				if (error)
2264					break;
2265				error = sooptcopyout(sopt, &optval,
2266					sizeof optval);
2267				break;
2268
2269			case IPV6_PATHMTU:
2270			{
2271				u_long pmtu = 0;
2272				struct ip6_mtuinfo mtuinfo;
2273				struct in6_addr addr;
2274
2275				if (!(so->so_state & SS_ISCONNECTED))
2276					return (ENOTCONN);
2277				/*
				 * XXX: we do not consider the case of source
2279				 * routing, or optional information to specify
2280				 * the outgoing interface.
2281				 * Copy faddr out of inp to avoid holding lock
2282				 * on inp during route lookup.
2283				 */
2284				INP_RLOCK(inp);
2285				bcopy(&inp->in6p_faddr, &addr, sizeof(addr));
2286				INP_RUNLOCK(inp);
2287				error = ip6_getpmtu_ctl(so->so_fibnum,
2288				    &addr, &pmtu);
2289				if (error)
2290					break;
2291				if (pmtu > IPV6_MAXPACKET)
2292					pmtu = IPV6_MAXPACKET;
2293
2294				bzero(&mtuinfo, sizeof(mtuinfo));
2295				mtuinfo.ip6m_mtu = (u_int32_t)pmtu;
2296				optdata = (void *)&mtuinfo;
2297				optdatalen = sizeof(mtuinfo);
2298				error = sooptcopyout(sopt, optdata,
2299				    optdatalen);
2300				break;
2301			}
2302
2303			case IPV6_2292PKTINFO:
2304			case IPV6_2292HOPLIMIT:
2305			case IPV6_2292HOPOPTS:
2306			case IPV6_2292RTHDR:
2307			case IPV6_2292DSTOPTS:
2308				switch (optname) {
2309				case IPV6_2292PKTINFO:
2310					optval = OPTBIT(IN6P_PKTINFO);
2311					break;
2312				case IPV6_2292HOPLIMIT:
2313					optval = OPTBIT(IN6P_HOPLIMIT);
2314					break;
2315				case IPV6_2292HOPOPTS:
2316					optval = OPTBIT(IN6P_HOPOPTS);
2317					break;
2318				case IPV6_2292RTHDR:
2319					optval = OPTBIT(IN6P_RTHDR);
2320					break;
2321				case IPV6_2292DSTOPTS:
2322					optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS);
2323					break;
2324				}
2325				error = sooptcopyout(sopt, &optval,
2326				    sizeof optval);
2327				break;
2328			case IPV6_PKTINFO:
2329			case IPV6_HOPOPTS:
2330			case IPV6_RTHDR:
2331			case IPV6_DSTOPTS:
2332			case IPV6_RTHDRDSTOPTS:
2333			case IPV6_NEXTHOP:
2334			case IPV6_TCLASS:
2335			case IPV6_DONTFRAG:
2336			case IPV6_USE_MIN_MTU:
2337			case IPV6_PREFER_TEMPADDR:
2338				error = ip6_getpcbopt(inp, optname, sopt);
2339				break;
2340
2341			case IPV6_MULTICAST_IF:
2342			case IPV6_MULTICAST_HOPS:
2343			case IPV6_MULTICAST_LOOP:
2344			case IPV6_MSFILTER:
2345				error = ip6_getmoptions(inp, sopt);
2346				break;
2347
2348#if defined(IPSEC) || defined(IPSEC_SUPPORT)
2349			case IPV6_IPSEC_POLICY:
2350				if (IPSEC_ENABLED(ipv6)) {
2351					error = IPSEC_PCBCTL(ipv6, inp, sopt);
2352					break;
2353				}
2354				/* FALLTHROUGH */
2355#endif /* IPSEC */
2356			default:
2357				error = ENOPROTOOPT;
2358				break;
2359			}
2360			break;
2361		}
2362	}
2363	return (error);
2364}
2365
2366int
2367ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt)
2368{
2369	int error = 0, optval, optlen;
2370	const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum);
2371	struct inpcb *inp = sotoinpcb(so);
2372	int level, op, optname;
2373
2374	level = sopt->sopt_level;
2375	op = sopt->sopt_dir;
2376	optname = sopt->sopt_name;
2377	optlen = sopt->sopt_valsize;
2378
2379	if (level != IPPROTO_IPV6) {
2380		return (EINVAL);
2381	}
2382
2383	switch (optname) {
2384	case IPV6_CHECKSUM:
2385		/*
2386		 * For ICMPv6 sockets, no modification allowed for checksum
2387		 * offset, permit "no change" values to help existing apps.
2388		 *
2389		 * RFC3542 says: "An attempt to set IPV6_CHECKSUM
2390		 * for an ICMPv6 socket will fail."
2391		 * The current behavior does not meet RFC3542.
2392		 */
2393		switch (op) {
2394		case SOPT_SET:
2395			if (optlen != sizeof(int)) {
2396				error = EINVAL;
2397				break;
2398			}
2399			error = sooptcopyin(sopt, &optval, sizeof(optval),
2400					    sizeof(optval));
2401			if (error)
2402				break;
2403			if (optval < -1 || (optval % 2) != 0) {
2404				/*
2405				 * The API assumes non-negative even offset
2406				 * values or -1 as a special value.
2407				 */
2408				error = EINVAL;
2409			} else if (inp->inp_ip_p == IPPROTO_ICMPV6) {
2410				if (optval != icmp6off)
2411					error = EINVAL;
2412			} else
2413				inp->in6p_cksum = optval;
2414			break;
2415
2416		case SOPT_GET:
2417			if (inp->inp_ip_p == IPPROTO_ICMPV6)
2418				optval = icmp6off;
2419			else
2420				optval = inp->in6p_cksum;
2421
2422			error = sooptcopyout(sopt, &optval, sizeof(optval));
2423			break;
2424
2425		default:
2426			error = EINVAL;
2427			break;
2428		}
2429		break;
2430
2431	default:
2432		error = ENOPROTOOPT;
2433		break;
2434	}
2435
2436	return (error);
2437}
2438
2439/*
2440 * Set up IP6 options in pcb for insertion in output packets or
2441 * specifying behavior of outgoing packets.
2442 */
2443static int
2444ip6_pcbopts(struct ip6_pktopts **pktopt, struct mbuf *m,
2445    struct socket *so, struct sockopt *sopt)
2446{
2447	struct ip6_pktopts *opt = *pktopt;
2448	int error = 0;
2449	struct thread *td = sopt->sopt_td;
2450	struct epoch_tracker et;
2451
2452	/* turn off any old options. */
2453	if (opt) {
2454#ifdef DIAGNOSTIC
2455		if (opt->ip6po_pktinfo || opt->ip6po_nexthop ||
2456		    opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 ||
2457		    opt->ip6po_rhinfo.ip6po_rhi_rthdr)
2458			printf("ip6_pcbopts: all specified options are cleared.\n");
2459#endif
2460		ip6_clearpktopts(opt, -1);
2461	} else {
2462		opt = malloc(sizeof(*opt), M_IP6OPT, M_NOWAIT);
2463		if (opt == NULL)
2464			return (ENOMEM);
2465	}
2466	*pktopt = NULL;
2467
2468	if (!m || m->m_len == 0) {
2469		/*
2470		 * Only turning off any previous options, regardless of
2471		 * whether the opt is just created or given.
2472		 */
2473		free(opt, M_IP6OPT);
2474		return (0);
2475	}
2476
2477	/*  set options specified by user. */
2478	NET_EPOCH_ENTER(et);
2479	if ((error = ip6_setpktopts(m, opt, NULL, (td != NULL) ?
2480	    td->td_ucred : NULL, so->so_proto->pr_protocol)) != 0) {
2481		ip6_clearpktopts(opt, -1); /* XXX: discard all options */
2482		free(opt, M_IP6OPT);
2483		NET_EPOCH_EXIT(et);
2484		return (error);
2485	}
2486	NET_EPOCH_EXIT(et);
2487	*pktopt = opt;
2488	return (0);
2489}
2490
2491/*
2492 * initialize ip6_pktopts.  beware that there are non-zero default values in
2493 * the struct.
2494 */
2495void
2496ip6_initpktopts(struct ip6_pktopts *opt)
2497{
2498
2499	bzero(opt, sizeof(*opt));
2500	opt->ip6po_hlim = -1;	/* -1 means default hop limit */
2501	opt->ip6po_tclass = -1;	/* -1 means default traffic class */
2502	opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY;
2503	opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM;
2504}
2505
2506static int
2507ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt,
2508    struct ucred *cred, int uproto)
2509{
2510	struct epoch_tracker et;
2511	struct ip6_pktopts *opt;
2512	int ret;
2513
2514	if (*pktopt == NULL) {
2515		*pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT,
2516		    M_NOWAIT);
2517		if (*pktopt == NULL)
2518			return (ENOBUFS);
2519		ip6_initpktopts(*pktopt);
2520	}
2521	opt = *pktopt;
2522
2523	NET_EPOCH_ENTER(et);
2524	ret = ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto);
2525	NET_EPOCH_EXIT(et);
2526
2527	return (ret);
2528}
2529
2530#define GET_PKTOPT_VAR(field, lenexpr) do {				\
2531	if (pktopt && pktopt->field) {					\
2532		INP_RUNLOCK(inp);					\
2533		optdata = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK);	\
2534		malloc_optdata = true;					\
2535		INP_RLOCK(inp);						\
2536		if (inp->inp_flags & INP_DROPPED) {			\
2537			INP_RUNLOCK(inp);				\
2538			free(optdata, M_TEMP);				\
2539			return (ECONNRESET);				\
2540		}							\
2541		pktopt = inp->in6p_outputopts;				\
2542		if (pktopt && pktopt->field) {				\
2543			optdatalen = min(lenexpr, sopt->sopt_valsize);	\
2544			bcopy(pktopt->field, optdata, optdatalen);	\
2545		} else {						\
2546			free(optdata, M_TEMP);				\
2547			optdata = NULL;					\
2548			malloc_optdata = false;				\
2549		}							\
2550	}								\
2551} while(0)
2552
2553#define GET_PKTOPT_EXT_HDR(field) GET_PKTOPT_VAR(field,			\
2554	(((struct ip6_ext *)pktopt->field)->ip6e_len + 1) << 3)
2555
2556#define GET_PKTOPT_SOCKADDR(field) GET_PKTOPT_VAR(field,		\
2557	pktopt->field->sa_len)
2558
2559static int
2560ip6_getpcbopt(struct inpcb *inp, int optname, struct sockopt *sopt)
2561{
2562	void *optdata = NULL;
2563	bool malloc_optdata = false;
2564	int optdatalen = 0;
2565	int error = 0;
2566	struct in6_pktinfo null_pktinfo;
2567	int deftclass = 0, on;
2568	int defminmtu = IP6PO_MINMTU_MCASTONLY;
2569	int defpreftemp = IP6PO_TEMPADDR_SYSTEM;
2570	struct ip6_pktopts *pktopt;
2571
2572	INP_RLOCK(inp);
2573	pktopt = inp->in6p_outputopts;
2574
2575	switch (optname) {
2576	case IPV6_PKTINFO:
2577		optdata = (void *)&null_pktinfo;
2578		if (pktopt && pktopt->ip6po_pktinfo) {
2579			bcopy(pktopt->ip6po_pktinfo, &null_pktinfo,
2580			    sizeof(null_pktinfo));
2581			in6_clearscope(&null_pktinfo.ipi6_addr);
2582		} else {
2583			/* XXX: we don't have to do this every time... */
2584			bzero(&null_pktinfo, sizeof(null_pktinfo));
2585		}
2586		optdatalen = sizeof(struct in6_pktinfo);
2587		break;
2588	case IPV6_TCLASS:
2589		if (pktopt && pktopt->ip6po_tclass >= 0)
2590			deftclass = pktopt->ip6po_tclass;
2591		optdata = (void *)&deftclass;
2592		optdatalen = sizeof(int);
2593		break;
2594	case IPV6_HOPOPTS:
2595		GET_PKTOPT_EXT_HDR(ip6po_hbh);
2596		break;
2597	case IPV6_RTHDR:
2598		GET_PKTOPT_EXT_HDR(ip6po_rthdr);
2599		break;
2600	case IPV6_RTHDRDSTOPTS:
2601		GET_PKTOPT_EXT_HDR(ip6po_dest1);
2602		break;
2603	case IPV6_DSTOPTS:
2604		GET_PKTOPT_EXT_HDR(ip6po_dest2);
2605		break;
2606	case IPV6_NEXTHOP:
2607		GET_PKTOPT_SOCKADDR(ip6po_nexthop);
2608		break;
2609	case IPV6_USE_MIN_MTU:
2610		if (pktopt)
2611			defminmtu = pktopt->ip6po_minmtu;
2612		optdata = (void *)&defminmtu;
2613		optdatalen = sizeof(int);
2614		break;
2615	case IPV6_DONTFRAG:
2616		if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG))
2617			on = 1;
2618		else
2619			on = 0;
2620		optdata = (void *)&on;
2621		optdatalen = sizeof(on);
2622		break;
2623	case IPV6_PREFER_TEMPADDR:
2624		if (pktopt)
2625			defpreftemp = pktopt->ip6po_prefer_tempaddr;
2626		optdata = (void *)&defpreftemp;
2627		optdatalen = sizeof(int);
2628		break;
2629	default:		/* should not happen */
2630#ifdef DIAGNOSTIC
2631		panic("ip6_getpcbopt: unexpected option\n");
2632#endif
2633		INP_RUNLOCK(inp);
2634		return (ENOPROTOOPT);
2635	}
2636	INP_RUNLOCK(inp);
2637
2638	error = sooptcopyout(sopt, optdata, optdatalen);
2639	if (malloc_optdata)
2640		free(optdata, M_TEMP);
2641
2642	return (error);
2643}
2644
2645void
2646ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname)
2647{
2648	if (pktopt == NULL)
2649		return;
2650
2651	if (optname == -1 || optname == IPV6_PKTINFO) {
2652		if (pktopt->ip6po_pktinfo)
2653			free(pktopt->ip6po_pktinfo, M_IP6OPT);
2654		pktopt->ip6po_pktinfo = NULL;
2655	}
2656	if (optname == -1 || optname == IPV6_HOPLIMIT) {
2657		pktopt->ip6po_hlim = -1;
2658		pktopt->ip6po_valid &= ~IP6PO_VALID_HLIM;
2659	}
2660	if (optname == -1 || optname == IPV6_TCLASS) {
2661		pktopt->ip6po_tclass = -1;
2662		pktopt->ip6po_valid &= ~IP6PO_VALID_TC;
2663	}
2664	if (optname == -1 || optname == IPV6_NEXTHOP) {
2665		if (pktopt->ip6po_nextroute.ro_nh) {
2666			NH_FREE(pktopt->ip6po_nextroute.ro_nh);
2667			pktopt->ip6po_nextroute.ro_nh = NULL;
2668		}
2669		if (pktopt->ip6po_nexthop)
2670			free(pktopt->ip6po_nexthop, M_IP6OPT);
2671		pktopt->ip6po_nexthop = NULL;
2672		pktopt->ip6po_valid &= ~IP6PO_VALID_NHINFO;
2673	}
2674	if (optname == -1 || optname == IPV6_HOPOPTS) {
2675		if (pktopt->ip6po_hbh)
2676			free(pktopt->ip6po_hbh, M_IP6OPT);
2677		pktopt->ip6po_hbh = NULL;
2678		pktopt->ip6po_valid &= ~IP6PO_VALID_HBH;
2679	}
2680	if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) {
2681		if (pktopt->ip6po_dest1)
2682			free(pktopt->ip6po_dest1, M_IP6OPT);
2683		pktopt->ip6po_dest1 = NULL;
2684		pktopt->ip6po_valid &= ~IP6PO_VALID_DEST1;
2685	}
2686	if (optname == -1 || optname == IPV6_RTHDR) {
2687		if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr)
2688			free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT);
2689		pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL;
2690		if (pktopt->ip6po_route.ro_nh) {
2691			NH_FREE(pktopt->ip6po_route.ro_nh);
2692			pktopt->ip6po_route.ro_nh = NULL;
2693		}
2694		pktopt->ip6po_valid &= ~IP6PO_VALID_RHINFO;
2695	}
2696	if (optname == -1 || optname == IPV6_DSTOPTS) {
2697		if (pktopt->ip6po_dest2)
2698			free(pktopt->ip6po_dest2, M_IP6OPT);
2699		pktopt->ip6po_dest2 = NULL;
2700		pktopt->ip6po_valid &= ~IP6PO_VALID_DEST2;
2701	}
2702}
2703
2704#define PKTOPT_EXTHDRCPY(type) \
2705do {\
2706	if (src->type) {\
2707		int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\
2708		dst->type = malloc(hlen, M_IP6OPT, canwait);\
2709		if (dst->type == NULL)\
2710			goto bad;\
2711		bcopy(src->type, dst->type, hlen);\
2712	}\
2713} while (/*CONSTCOND*/ 0)
2714
2715static int
2716copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait)
2717{
2718	if (dst == NULL || src == NULL)  {
2719		printf("ip6_clearpktopts: invalid argument\n");
2720		return (EINVAL);
2721	}
2722
2723	dst->ip6po_hlim = src->ip6po_hlim;
2724	dst->ip6po_tclass = src->ip6po_tclass;
2725	dst->ip6po_flags = src->ip6po_flags;
2726	dst->ip6po_minmtu = src->ip6po_minmtu;
2727	dst->ip6po_prefer_tempaddr = src->ip6po_prefer_tempaddr;
2728	if (src->ip6po_pktinfo) {
2729		dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo),
2730		    M_IP6OPT, canwait);
2731		if (dst->ip6po_pktinfo == NULL)
2732			goto bad;
2733		*dst->ip6po_pktinfo = *src->ip6po_pktinfo;
2734	}
2735	if (src->ip6po_nexthop) {
2736		dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len,
2737		    M_IP6OPT, canwait);
2738		if (dst->ip6po_nexthop == NULL)
2739			goto bad;
2740		bcopy(src->ip6po_nexthop, dst->ip6po_nexthop,
2741		    src->ip6po_nexthop->sa_len);
2742	}
2743	PKTOPT_EXTHDRCPY(ip6po_hbh);
2744	PKTOPT_EXTHDRCPY(ip6po_dest1);
2745	PKTOPT_EXTHDRCPY(ip6po_dest2);
2746	PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route */
2747	dst->ip6po_valid = src->ip6po_valid;
2748	return (0);
2749
2750  bad:
2751	ip6_clearpktopts(dst, -1);
2752	return (ENOBUFS);
2753}
2754#undef PKTOPT_EXTHDRCPY
2755
2756struct ip6_pktopts *
2757ip6_copypktopts(struct ip6_pktopts *src, int canwait)
2758{
2759	int error;
2760	struct ip6_pktopts *dst;
2761
2762	dst = malloc(sizeof(*dst), M_IP6OPT, canwait);
2763	if (dst == NULL)
2764		return (NULL);
2765	ip6_initpktopts(dst);
2766
2767	if ((error = copypktopts(dst, src, canwait)) != 0) {
2768		free(dst, M_IP6OPT);
2769		return (NULL);
2770	}
2771
2772	return (dst);
2773}
2774
2775void
2776ip6_freepcbopts(struct ip6_pktopts *pktopt)
2777{
2778	if (pktopt == NULL)
2779		return;
2780
2781	ip6_clearpktopts(pktopt, -1);
2782
2783	free(pktopt, M_IP6OPT);
2784}
2785
2786/*
2787 * Set IPv6 outgoing packet options based on advanced API.
2788 */
2789int
2790ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt,
2791    struct ip6_pktopts *stickyopt, struct ucred *cred, int uproto)
2792{
2793	struct cmsghdr *cm = NULL;
2794
2795	if (control == NULL || opt == NULL)
2796		return (EINVAL);
2797
2798	/*
2799	 * ip6_setpktopt can call ifnet_byindex(), so it's imperative that we
2800	 * are in the network epoch here.
2801	 */
2802	NET_EPOCH_ASSERT();
2803
2804	ip6_initpktopts(opt);
2805	if (stickyopt) {
2806		int error;
2807
2808		/*
2809		 * If stickyopt is provided, make a local copy of the options
2810		 * for this particular packet, then override them by ancillary
2811		 * objects.
2812		 * XXX: copypktopts() does not copy the cached route to a next
2813		 * hop (if any).  This is not very good in terms of efficiency,
2814		 * but we can allow this since this option should be rarely
2815		 * used.
2816		 */
2817		if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0)
2818			return (error);
2819	}
2820
2821	/*
2822	 * XXX: Currently, we assume all the optional information is stored
2823	 * in a single mbuf.
2824	 */
2825	if (control->m_next)
2826		return (EINVAL);
2827
2828	for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len),
2829	    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
2830		int error;
2831
2832		if (control->m_len < CMSG_LEN(0))
2833			return (EINVAL);
2834
2835		cm = mtod(control, struct cmsghdr *);
2836		if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len)
2837			return (EINVAL);
2838		if (cm->cmsg_level != IPPROTO_IPV6)
2839			continue;
2840
2841		error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm),
2842		    cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto);
2843		if (error)
2844			return (error);
2845	}
2846
2847	return (0);
2848}
2849
2850/*
2851 * Set a particular packet option, as a sticky option or an ancillary data
2852 * item.  "len" can be 0 only when it's a sticky option.
2853 * We have 4 cases of combination of "sticky" and "cmsg":
2854 * "sticky=0, cmsg=0": impossible
2855 * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data
2856 * "sticky=1, cmsg=0": RFC3542 socket option
2857 * "sticky=1, cmsg=1": RFC2292 socket option
2858 */
2859static int
2860ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
2861    struct ucred *cred, int sticky, int cmsg, int uproto)
2862{
2863	int minmtupolicy, preftemp;
2864	int error;
2865
2866	NET_EPOCH_ASSERT();
2867
2868	if (!sticky && !cmsg) {
2869#ifdef DIAGNOSTIC
2870		printf("ip6_setpktopt: impossible case\n");
2871#endif
2872		return (EINVAL);
2873	}
2874
2875	/*
2876	 * IPV6_2292xxx is for backward compatibility to RFC2292, and should
2877	 * not be specified in the context of RFC3542.  Conversely,
2878	 * RFC3542 types should not be specified in the context of RFC2292.
2879	 */
2880	if (!cmsg) {
2881		switch (optname) {
2882		case IPV6_2292PKTINFO:
2883		case IPV6_2292HOPLIMIT:
2884		case IPV6_2292NEXTHOP:
2885		case IPV6_2292HOPOPTS:
2886		case IPV6_2292DSTOPTS:
2887		case IPV6_2292RTHDR:
2888		case IPV6_2292PKTOPTIONS:
2889			return (ENOPROTOOPT);
2890		}
2891	}
2892	if (sticky && cmsg) {
2893		switch (optname) {
2894		case IPV6_PKTINFO:
2895		case IPV6_HOPLIMIT:
2896		case IPV6_NEXTHOP:
2897		case IPV6_HOPOPTS:
2898		case IPV6_DSTOPTS:
2899		case IPV6_RTHDRDSTOPTS:
2900		case IPV6_RTHDR:
2901		case IPV6_USE_MIN_MTU:
2902		case IPV6_DONTFRAG:
2903		case IPV6_TCLASS:
2904		case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */
2905			return (ENOPROTOOPT);
2906		}
2907	}
2908
2909	switch (optname) {
2910	case IPV6_2292PKTINFO:
2911	case IPV6_PKTINFO:
2912	{
2913		struct ifnet *ifp = NULL;
2914		struct in6_pktinfo *pktinfo;
2915
2916		if (len != sizeof(struct in6_pktinfo))
2917			return (EINVAL);
2918
2919		pktinfo = (struct in6_pktinfo *)buf;
2920
2921		/*
2922		 * An application can clear any sticky IPV6_PKTINFO option by
2923		 * doing a "regular" setsockopt with ipi6_addr being
2924		 * in6addr_any and ipi6_ifindex being zero.
2925		 * [RFC 3542, Section 6]
2926		 */
2927		if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo &&
2928		    pktinfo->ipi6_ifindex == 0 &&
2929		    IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
2930			ip6_clearpktopts(opt, optname);
2931			break;
2932		}
2933
2934		if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO &&
2935		    sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
2936			return (EINVAL);
2937		}
2938		if (IN6_IS_ADDR_MULTICAST(&pktinfo->ipi6_addr))
2939			return (EINVAL);
2940		/* validate the interface index if specified. */
2941		if (pktinfo->ipi6_ifindex) {
2942			ifp = ifnet_byindex(pktinfo->ipi6_ifindex);
2943			if (ifp == NULL)
2944				return (ENXIO);
2945		}
2946		if (ifp != NULL && (ifp->if_afdata[AF_INET6] == NULL ||
2947		    (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) != 0))
2948			return (ENETDOWN);
2949
2950		if (ifp != NULL &&
2951		    !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
2952			struct in6_ifaddr *ia;
2953
2954			in6_setscope(&pktinfo->ipi6_addr, ifp, NULL);
2955			ia = in6ifa_ifpwithaddr(ifp, &pktinfo->ipi6_addr);
2956			if (ia == NULL)
2957				return (EADDRNOTAVAIL);
2958			ifa_free(&ia->ia_ifa);
2959		}
2960		/*
2961		 * We store the address anyway, and let in6_selectsrc()
2962		 * validate the specified address.  This is because ipi6_addr
2963		 * may not have enough information about its scope zone, and
2964		 * we may need additional information (such as outgoing
2965		 * interface or the scope zone of a destination address) to
2966		 * disambiguate the scope.
2967		 * XXX: the delay of the validation may confuse the
2968		 * application when it is used as a sticky option.
2969		 */
2970		if (opt->ip6po_pktinfo == NULL) {
2971			opt->ip6po_pktinfo = malloc(sizeof(*pktinfo),
2972			    M_IP6OPT, M_NOWAIT);
2973			if (opt->ip6po_pktinfo == NULL)
2974				return (ENOBUFS);
2975		}
2976		bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo));
2977		opt->ip6po_valid |= IP6PO_VALID_PKTINFO;
2978		break;
2979	}
2980
2981	case IPV6_2292HOPLIMIT:
2982	case IPV6_HOPLIMIT:
2983	{
2984		int *hlimp;
2985
2986		/*
2987		 * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT
2988		 * to simplify the ordering among hoplimit options.
2989		 */
2990		if (optname == IPV6_HOPLIMIT && sticky)
2991			return (ENOPROTOOPT);
2992
2993		if (len != sizeof(int))
2994			return (EINVAL);
2995		hlimp = (int *)buf;
2996		if (*hlimp < -1 || *hlimp > 255)
2997			return (EINVAL);
2998
2999		opt->ip6po_hlim = *hlimp;
3000		opt->ip6po_valid |= IP6PO_VALID_HLIM;
3001		break;
3002	}
3003
3004	case IPV6_TCLASS:
3005	{
3006		int tclass;
3007
3008		if (len != sizeof(int))
3009			return (EINVAL);
3010		tclass = *(int *)buf;
3011		if (tclass < -1 || tclass > 255)
3012			return (EINVAL);
3013
3014		opt->ip6po_tclass = tclass;
3015		opt->ip6po_valid |= IP6PO_VALID_TC;
3016		break;
3017	}
3018
3019	case IPV6_2292NEXTHOP:
3020	case IPV6_NEXTHOP:
3021		if (cred != NULL) {
3022			error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS);
3023			if (error)
3024				return (error);
3025		}
3026
3027		if (len == 0) {	/* just remove the option */
3028			ip6_clearpktopts(opt, IPV6_NEXTHOP);
3029			break;
3030		}
3031
3032		/* check if cmsg_len is large enough for sa_len */
3033		if (len < sizeof(struct sockaddr) || len < *buf)
3034			return (EINVAL);
3035
3036		switch (((struct sockaddr *)buf)->sa_family) {
3037		case AF_INET6:
3038		{
3039			struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf;
3040			int error;
3041
3042			if (sa6->sin6_len != sizeof(struct sockaddr_in6))
3043				return (EINVAL);
3044
3045			if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) ||
3046			    IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) {
3047				return (EINVAL);
3048			}
3049			if ((error = sa6_embedscope(sa6, V_ip6_use_defzone))
3050			    != 0) {
3051				return (error);
3052			}
3053			break;
3054		}
3055		case AF_LINK:	/* should eventually be supported */
3056		default:
3057			return (EAFNOSUPPORT);
3058		}
3059
3060		/* turn off the previous option, then set the new option. */
3061		ip6_clearpktopts(opt, IPV6_NEXTHOP);
3062		opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT);
3063		if (opt->ip6po_nexthop == NULL)
3064			return (ENOBUFS);
3065		bcopy(buf, opt->ip6po_nexthop, *buf);
3066		opt->ip6po_valid |= IP6PO_VALID_NHINFO;
3067		break;
3068
3069	case IPV6_2292HOPOPTS:
3070	case IPV6_HOPOPTS:
3071	{
3072		struct ip6_hbh *hbh;
3073		int hbhlen;
3074
3075		/*
3076		 * XXX: We don't allow a non-privileged user to set ANY HbH
3077		 * options, since per-option restriction has too much
3078		 * overhead.
3079		 */
3080		if (cred != NULL) {
3081			error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS);
3082			if (error)
3083				return (error);
3084		}
3085
3086		if (len == 0) {
3087			ip6_clearpktopts(opt, IPV6_HOPOPTS);
3088			break;	/* just remove the option */
3089		}
3090
3091		/* message length validation */
3092		if (len < sizeof(struct ip6_hbh))
3093			return (EINVAL);
3094		hbh = (struct ip6_hbh *)buf;
3095		hbhlen = (hbh->ip6h_len + 1) << 3;
3096		if (len != hbhlen)
3097			return (EINVAL);
3098
3099		/* turn off the previous option, then set the new option. */
3100		ip6_clearpktopts(opt, IPV6_HOPOPTS);
3101		opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT);
3102		if (opt->ip6po_hbh == NULL)
3103			return (ENOBUFS);
3104		bcopy(hbh, opt->ip6po_hbh, hbhlen);
3105		opt->ip6po_valid |= IP6PO_VALID_HBH;
3106
3107		break;
3108	}
3109
3110	case IPV6_2292DSTOPTS:
3111	case IPV6_DSTOPTS:
3112	case IPV6_RTHDRDSTOPTS:
3113	{
3114		struct ip6_dest *dest, **newdest = NULL;
3115		int destlen;
3116
3117		if (cred != NULL) { /* XXX: see the comment for IPV6_HOPOPTS */
3118			error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS);
3119			if (error)
3120				return (error);
3121		}
3122
3123		if (len == 0) {
3124			ip6_clearpktopts(opt, optname);
3125			break;	/* just remove the option */
3126		}
3127
3128		/* message length validation */
3129		if (len < sizeof(struct ip6_dest))
3130			return (EINVAL);
3131		dest = (struct ip6_dest *)buf;
3132		destlen = (dest->ip6d_len + 1) << 3;
3133		if (len != destlen)
3134			return (EINVAL);
3135
3136		/*
3137		 * Determine the position that the destination options header
3138		 * should be inserted; before or after the routing header.
3139		 */
3140		switch (optname) {
3141		case IPV6_2292DSTOPTS:
3142			/*
3143			 * The old advacned API is ambiguous on this point.
3144			 * Our approach is to determine the position based
3145			 * according to the existence of a routing header.
3146			 * Note, however, that this depends on the order of the
3147			 * extension headers in the ancillary data; the 1st
3148			 * part of the destination options header must appear
3149			 * before the routing header in the ancillary data,
3150			 * too.
3151			 * RFC3542 solved the ambiguity by introducing
3152			 * separate ancillary data or option types.
3153			 */
3154			if (opt->ip6po_rthdr == NULL)
3155				newdest = &opt->ip6po_dest1;
3156			else
3157				newdest = &opt->ip6po_dest2;
3158			break;
3159		case IPV6_RTHDRDSTOPTS:
3160			newdest = &opt->ip6po_dest1;
3161			break;
3162		case IPV6_DSTOPTS:
3163			newdest = &opt->ip6po_dest2;
3164			break;
3165		}
3166
3167		/* turn off the previous option, then set the new option. */
3168		ip6_clearpktopts(opt, optname);
3169		*newdest = malloc(destlen, M_IP6OPT, M_NOWAIT);
3170		if (*newdest == NULL)
3171			return (ENOBUFS);
3172		bcopy(dest, *newdest, destlen);
3173		if (newdest == &opt->ip6po_dest1)
3174			opt->ip6po_valid |= IP6PO_VALID_DEST1;
3175		else
3176			opt->ip6po_valid |= IP6PO_VALID_DEST2;
3177
3178		break;
3179	}
3180
3181	case IPV6_2292RTHDR:
3182	case IPV6_RTHDR:
3183	{
3184		struct ip6_rthdr *rth;
3185		int rthlen;
3186
3187		if (len == 0) {
3188			ip6_clearpktopts(opt, IPV6_RTHDR);
3189			break;	/* just remove the option */
3190		}
3191
3192		/* message length validation */
3193		if (len < sizeof(struct ip6_rthdr))
3194			return (EINVAL);
3195		rth = (struct ip6_rthdr *)buf;
3196		rthlen = (rth->ip6r_len + 1) << 3;
3197		if (len != rthlen)
3198			return (EINVAL);
3199
3200		switch (rth->ip6r_type) {
3201		case IPV6_RTHDR_TYPE_0:
3202			if (rth->ip6r_len == 0)	/* must contain one addr */
3203				return (EINVAL);
3204			if (rth->ip6r_len % 2) /* length must be even */
3205				return (EINVAL);
3206			if (rth->ip6r_len / 2 != rth->ip6r_segleft)
3207				return (EINVAL);
3208			break;
3209		default:
3210			return (EINVAL);	/* not supported */
3211		}
3212
3213		/* turn off the previous option */
3214		ip6_clearpktopts(opt, IPV6_RTHDR);
3215		opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT);
3216		if (opt->ip6po_rthdr == NULL)
3217			return (ENOBUFS);
3218		bcopy(rth, opt->ip6po_rthdr, rthlen);
3219		opt->ip6po_valid |= IP6PO_VALID_RHINFO;
3220
3221		break;
3222	}
3223
3224	case IPV6_USE_MIN_MTU:
3225		if (len != sizeof(int))
3226			return (EINVAL);
3227		minmtupolicy = *(int *)buf;
3228		if (minmtupolicy != IP6PO_MINMTU_MCASTONLY &&
3229		    minmtupolicy != IP6PO_MINMTU_DISABLE &&
3230		    minmtupolicy != IP6PO_MINMTU_ALL) {
3231			return (EINVAL);
3232		}
3233		opt->ip6po_minmtu = minmtupolicy;
3234		break;
3235
3236	case IPV6_DONTFRAG:
3237		if (len != sizeof(int))
3238			return (EINVAL);
3239
3240		if (uproto == IPPROTO_TCP || *(int *)buf == 0) {
3241			/*
3242			 * we ignore this option for TCP sockets.
3243			 * (RFC3542 leaves this case unspecified.)
3244			 */
3245			opt->ip6po_flags &= ~IP6PO_DONTFRAG;
3246		} else
3247			opt->ip6po_flags |= IP6PO_DONTFRAG;
3248		break;
3249
3250	case IPV6_PREFER_TEMPADDR:
3251		if (len != sizeof(int))
3252			return (EINVAL);
3253		preftemp = *(int *)buf;
3254		if (preftemp != IP6PO_TEMPADDR_SYSTEM &&
3255		    preftemp != IP6PO_TEMPADDR_NOTPREFER &&
3256		    preftemp != IP6PO_TEMPADDR_PREFER) {
3257			return (EINVAL);
3258		}
3259		opt->ip6po_prefer_tempaddr = preftemp;
3260		break;
3261
3262	default:
3263		return (ENOPROTOOPT);
3264	} /* end of switch */
3265
3266	return (0);
3267}
3268
3269/*
3270 * Routine called from ip6_output() to loop back a copy of an IP6 multicast
3271 * packet to the input queue of a specified interface.  Note that this
3272 * calls the output routine of the loopback "driver", but with an interface
3273 * pointer that might NOT be &loif -- easier than replicating that code here.
3274 */
3275void
3276ip6_mloopback(struct ifnet *ifp, struct mbuf *m)
3277{
3278	struct mbuf *copym;
3279	struct ip6_hdr *ip6;
3280
3281	copym = m_copym(m, 0, M_COPYALL, M_NOWAIT);
3282	if (copym == NULL)
3283		return;
3284
3285	/*
3286	 * Make sure to deep-copy IPv6 header portion in case the data
3287	 * is in an mbuf cluster, so that we can safely override the IPv6
3288	 * header portion later.
3289	 */
3290	if (!M_WRITABLE(copym) ||
3291	    copym->m_len < sizeof(struct ip6_hdr)) {
3292		copym = m_pullup(copym, sizeof(struct ip6_hdr));
3293		if (copym == NULL)
3294			return;
3295	}
3296	ip6 = mtod(copym, struct ip6_hdr *);
3297	/*
3298	 * clear embedded scope identifiers if necessary.
3299	 * in6_clearscope will touch the addresses only when necessary.
3300	 */
3301	in6_clearscope(&ip6->ip6_src);
3302	in6_clearscope(&ip6->ip6_dst);
3303	if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
3304		copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 |
3305		    CSUM_PSEUDO_HDR;
3306		copym->m_pkthdr.csum_data = 0xffff;
3307	}
3308	if_simloop(ifp, copym, AF_INET6, 0);
3309}
3310
3311/*
3312 * Chop IPv6 header off from the payload.
3313 */
3314static int
3315ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs)
3316{
3317	struct mbuf *mh;
3318	struct ip6_hdr *ip6;
3319
3320	ip6 = mtod(m, struct ip6_hdr *);
3321	if (m->m_len > sizeof(*ip6)) {
3322		mh = m_gethdr(M_NOWAIT, MT_DATA);
3323		if (mh == NULL) {
3324			m_freem(m);
3325			return ENOBUFS;
3326		}
3327		m_move_pkthdr(mh, m);
3328		M_ALIGN(mh, sizeof(*ip6));
3329		m->m_len -= sizeof(*ip6);
3330		m->m_data += sizeof(*ip6);
3331		mh->m_next = m;
3332		m = mh;
3333		m->m_len = sizeof(*ip6);
3334		bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6));
3335	}
3336	exthdrs->ip6e_ip6 = m;
3337	return 0;
3338}
3339
3340/*
3341 * Compute IPv6 extension header length.
3342 */
3343int
3344ip6_optlen(struct inpcb *inp)
3345{
3346	int len;
3347
3348	if (!inp->in6p_outputopts)
3349		return 0;
3350
3351	len = 0;
3352#define elen(x) \
3353    (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0)
3354
3355	len += elen(inp->in6p_outputopts->ip6po_hbh);
3356	if (inp->in6p_outputopts->ip6po_rthdr)
3357		/* dest1 is valid with rthdr only */
3358		len += elen(inp->in6p_outputopts->ip6po_dest1);
3359	len += elen(inp->in6p_outputopts->ip6po_rthdr);
3360	len += elen(inp->in6p_outputopts->ip6po_dest2);
3361	return len;
3362#undef elen
3363}
3364