1/*-
2 * Copyright (c) 2020 Mellanox Technologies. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 */
25
26#include "opt_inet.h"
27#include "opt_inet6.h"
28#include "opt_kbd.h"
29
30#include <sys/param.h>
31#include <sys/systm.h>
32#include <sys/devctl.h>
33#include <sys/eventhandler.h>
34#include <sys/kernel.h>
35#include <sys/mbuf.h>
36#include <sys/module.h>
37#include <sys/socket.h>
38#include <sys/sysctl.h>
39#ifdef KDB
40#include <sys/kdb.h>
41#endif
42
43#include <net/bpf.h>
44#include <net/ethernet.h>
45#include <net/infiniband.h>
46#include <net/if.h>
47#include <net/if_var.h>
48#include <net/if_private.h>
49#include <net/if_dl.h>
50#include <net/if_media.h>
51#include <net/if_lagg.h>
52#include <net/if_llatbl.h>
53#include <net/if_types.h>
54#include <net/netisr.h>
55#include <net/route.h>
56#include <netinet/if_ether.h>
57#include <netinet/in.h>
58#include <netinet/ip6.h>
59#include <netinet6/in6_var.h>
60#include <netinet6/nd6.h>
61
62#include <security/mac/mac_framework.h>
63
/*
 * if_lagg(4) support.
 *
 * Hook called by infiniband_input() for frames received on an interface of
 * type IFT_INFINIBANDLAG.  It takes the receiving interface and the frame;
 * a NULL return means there is nothing left to dispatch.
 * infiniband_input() asserts that the hook is non-NULL before calling it.
 */
struct mbuf *(*lagg_input_infiniband_p)(struct ifnet *ifp, struct mbuf *m);
66
67#ifdef INET
/*
 * Map an IPv4 multicast group to a 20-byte IPoIB link-layer address.
 *
 * addr		group address in network byte order
 * broadcast	link-layer broadcast address of the interface; byte 5
 *		supplies the scope nibble and bytes 8-9 are copied verbatim
 *		(the P_Key in the RFC 4391 layout)
 * buf		output buffer, 20 bytes are written
 *
 * Only the low 28 bits of the group address are placed in the MGID
 * (RFC 4391, section 4).  The constant class D prefix in the top nibble
 * must be masked off, otherwise the resulting MGID does not match the one
 * derived by other IPoIB implementations for the same group.
 */
static inline void
infiniband_ipv4_multicast_map(uint32_t addr,
    const uint8_t *broadcast, uint8_t *buf)
{
	uint8_t scope;

	addr = ntohl(addr);
	scope = broadcast[5] & 0xF;

	buf[0] = 0;			/* reserved */
	buf[1] = 0xff;			/* multicast QPN */
	buf[2] = 0xff;
	buf[3] = 0xff;
	buf[4] = 0xff;			/* MGID: multicast prefix */
	buf[5] = 0x10 | scope;		/* flags | scope from broadcast */
	buf[6] = 0x40;			/* IPv4 signature 0x401b */
	buf[7] = 0x1b;
	buf[8] = broadcast[8];
	buf[9] = broadcast[9];
	buf[10] = 0;
	buf[11] = 0;
	buf[12] = 0;
	buf[13] = 0;
	buf[14] = 0;
	buf[15] = 0;
	buf[16] = (addr >> 24) & 0x0f;	/* low 28 bits of the group only */
	buf[17] = (addr >> 16) & 0xff;
	buf[18] = (addr >> 8) & 0xff;
	buf[19] = addr & 0xff;
}
98#endif
99
100#ifdef INET6
/*
 * Map an IPv6 multicast group to a 20-byte IPoIB link-layer address.
 *
 * The scope nibble comes from byte 5 of the interface broadcast address,
 * bytes 8-9 of the broadcast address are copied verbatim, and the last
 * ten bytes of the IPv6 address (s6_addr[6..15]) fill bytes 10-19.
 */
static inline void
infiniband_ipv6_multicast_map(const struct in6_addr *addr,
    const uint8_t *broadcast, uint8_t *buf)
{
	const uint8_t scope_bits = broadcast[5] & 0xF;

	/* One zero byte followed by four 0xff bytes. */
	buf[0] = 0;
	memset(&buf[1], 0xff, 4);

	buf[5] = 0x10 | scope_bits;
	buf[6] = 0x60;
	buf[7] = 0x1b;
	memcpy(&buf[8], &broadcast[8], 2);
	memcpy(&buf[10], &addr->s6_addr[6], 10);
}
121#endif
122
123/*
124 * This is for clients that have an infiniband_header in the mbuf.
125 */
126void
127infiniband_bpf_mtap(struct ifnet *ifp, struct mbuf *mb)
128{
129	struct infiniband_header *ibh;
130	struct ether_header eh;
131
132	if (!bpf_peers_present(ifp->if_bpf))
133		return;
134
135	M_ASSERTVALID(mb);
136	if (mb->m_len < sizeof(*ibh))
137		return;
138
139	ibh = mtod(mb, struct infiniband_header *);
140	eh.ether_type = ibh->ib_protocol;
141	memset(eh.ether_shost, 0, ETHER_ADDR_LEN);
142	memcpy(eh.ether_dhost, ibh->ib_hwaddr + 4, ETHER_ADDR_LEN);
143	mb->m_data += sizeof(*ibh);
144	mb->m_len -= sizeof(*ibh);
145	mb->m_pkthdr.len -= sizeof(*ibh);
146	bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb);
147	mb->m_data -= sizeof(*ibh);
148	mb->m_len += sizeof(*ibh);
149	mb->m_pkthdr.len += sizeof(*ibh);
150}
151
152static void
153update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst)
154{
155	int csum_flags = 0;
156
157	if (src->m_pkthdr.csum_flags & CSUM_IP)
158		csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID);
159	if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
160		csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
161	if (src->m_pkthdr.csum_flags & CSUM_SCTP)
162		csum_flags |= CSUM_SCTP_VALID;
163	dst->m_pkthdr.csum_flags |= csum_flags;
164	if (csum_flags & CSUM_DATA_VALID)
165		dst->m_pkthdr.csum_data = 0xffff;
166}
167
168/*
169 * Handle link-layer encapsulation requests.
170 */
171static int
172infiniband_requestencap(struct ifnet *ifp, struct if_encap_req *req)
173{
174	struct infiniband_header *ih;
175	struct arphdr *ah;
176	uint16_t etype;
177	const uint8_t *lladdr;
178
179	if (req->rtype != IFENCAP_LL)
180		return (EOPNOTSUPP);
181
182	if (req->bufsize < INFINIBAND_HDR_LEN)
183		return (ENOMEM);
184
185	ih = (struct infiniband_header *)req->buf;
186	lladdr = req->lladdr;
187	req->lladdr_off = 0;
188
189	switch (req->family) {
190	case AF_INET:
191		etype = htons(ETHERTYPE_IP);
192		break;
193	case AF_INET6:
194		etype = htons(ETHERTYPE_IPV6);
195		break;
196	case AF_ARP:
197		ah = (struct arphdr *)req->hdata;
198		ah->ar_hrd = htons(ARPHRD_INFINIBAND);
199
200		switch (ntohs(ah->ar_op)) {
201		case ARPOP_REVREQUEST:
202		case ARPOP_REVREPLY:
203			etype = htons(ETHERTYPE_REVARP);
204			break;
205		case ARPOP_REQUEST:
206		case ARPOP_REPLY:
207		default:
208			etype = htons(ETHERTYPE_ARP);
209			break;
210		}
211
212		if (req->flags & IFENCAP_FLAG_BROADCAST)
213			lladdr = ifp->if_broadcastaddr;
214		break;
215	default:
216		return (EAFNOSUPPORT);
217	}
218
219	ih->ib_protocol = etype;
220	ih->ib_reserved = 0;
221	memcpy(ih->ib_hwaddr, lladdr, INFINIBAND_ADDR_LEN);
222	req->bufsize = sizeof(struct infiniband_header);
223
224	return (0);
225}
226
227static int
228infiniband_resolve_addr(struct ifnet *ifp, struct mbuf *m,
229    const struct sockaddr *dst, struct route *ro, uint8_t *phdr,
230    uint32_t *pflags, struct llentry **plle)
231{
232#if defined(INET) || defined(INET6)
233	struct infiniband_header *ih = (struct infiniband_header *)phdr;
234#endif
235	uint32_t lleflags = 0;
236	int error = 0;
237
238	if (plle)
239		*plle = NULL;
240
241	switch (dst->sa_family) {
242#ifdef INET
243	case AF_INET:
244		if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) {
245			error = arpresolve(ifp, 0, m, dst, phdr, &lleflags, plle);
246		} else {
247			if (m->m_flags & M_BCAST) {
248				memcpy(ih->ib_hwaddr, ifp->if_broadcastaddr,
249				    INFINIBAND_ADDR_LEN);
250			} else {
251				infiniband_ipv4_multicast_map(
252				    ((const struct sockaddr_in *)dst)->sin_addr.s_addr,
253				    ifp->if_broadcastaddr, ih->ib_hwaddr);
254			}
255			ih->ib_protocol = htons(ETHERTYPE_IP);
256			ih->ib_reserved = 0;
257		}
258		break;
259#endif
260#ifdef INET6
261	case AF_INET6:
262		if ((m->m_flags & M_MCAST) == 0) {
263			int af = RO_GET_FAMILY(ro, dst);
264			error = nd6_resolve(ifp, LLE_SF(af, 0), m, dst, phdr,
265			    &lleflags, plle);
266		} else {
267			infiniband_ipv6_multicast_map(
268			    &((const struct sockaddr_in6 *)dst)->sin6_addr,
269			    ifp->if_broadcastaddr, ih->ib_hwaddr);
270			ih->ib_protocol = htons(ETHERTYPE_IPV6);
271			ih->ib_reserved = 0;
272		}
273		break;
274#endif
275	default:
276		if_printf(ifp, "can't handle af%d\n", dst->sa_family);
277		if (m != NULL)
278			m_freem(m);
279		return (EAFNOSUPPORT);
280	}
281
282	if (error == EHOSTDOWN) {
283		if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0)
284			error = EHOSTUNREACH;
285	}
286
287	if (error != 0)
288		return (error);
289
290	*pflags = RT_MAY_LOOP;
291	if (lleflags & LLE_IFADDR)
292		*pflags |= RT_L2_ME;
293
294	return (0);
295}
296
297/*
298 * Infiniband output routine.
299 */
300static int
301infiniband_output(struct ifnet *ifp, struct mbuf *m,
302    const struct sockaddr *dst, struct route *ro)
303{
304	uint8_t linkhdr[INFINIBAND_HDR_LEN];
305	uint8_t *phdr;
306	struct llentry *lle = NULL;
307	struct infiniband_header *ih;
308	int error = 0;
309	int hlen;	/* link layer header length */
310	uint32_t pflags;
311	bool addref;
312
313	NET_EPOCH_ASSERT();
314
315	addref = false;
316	phdr = NULL;
317	pflags = 0;
318	if (ro != NULL) {
319		/* XXX BPF uses ro_prepend */
320		if (ro->ro_prepend != NULL) {
321			phdr = ro->ro_prepend;
322			hlen = ro->ro_plen;
323		} else if (!(m->m_flags & (M_BCAST | M_MCAST))) {
324			if ((ro->ro_flags & RT_LLE_CACHE) != 0) {
325				lle = ro->ro_lle;
326				if (lle != NULL &&
327				    (lle->la_flags & LLE_VALID) == 0) {
328					LLE_FREE(lle);
329					lle = NULL;	/* redundant */
330					ro->ro_lle = NULL;
331				}
332				if (lle == NULL) {
333					/* if we lookup, keep cache */
334					addref = 1;
335				} else
336					/*
337					 * Notify LLE code that
338					 * the entry was used
339					 * by datapath.
340					 */
341					llentry_provide_feedback(lle);
342			}
343			if (lle != NULL) {
344				phdr = lle->r_linkdata;
345				hlen = lle->r_hdrlen;
346				pflags = lle->r_flags;
347			}
348		}
349	}
350
351#ifdef MAC
352	error = mac_ifnet_check_transmit(ifp, m);
353	if (error)
354		goto bad;
355#endif
356
357	M_PROFILE(m);
358	if (ifp->if_flags & IFF_MONITOR) {
359		error = ENETDOWN;
360		goto bad;
361	}
362	if (!((ifp->if_flags & IFF_UP) &&
363	    (ifp->if_drv_flags & IFF_DRV_RUNNING))) {
364		error = ENETDOWN;
365		goto bad;
366	}
367
368	if (phdr == NULL) {
369		/* No prepend data supplied. Try to calculate ourselves. */
370		phdr = linkhdr;
371		hlen = INFINIBAND_HDR_LEN;
372		error = infiniband_resolve_addr(ifp, m, dst, ro, phdr, &pflags,
373		    addref ? &lle : NULL);
374		if (addref && lle != NULL)
375			ro->ro_lle = lle;
376		if (error != 0)
377			return (error == EWOULDBLOCK ? 0 : error);
378	}
379
380	if ((pflags & RT_L2_ME) != 0) {
381		update_mbuf_csumflags(m, m);
382		return (if_simloop(ifp, m, RO_GET_FAMILY(ro, dst), 0));
383	}
384
385	/*
386	 * Add local infiniband header. If no space in first mbuf,
387	 * allocate another.
388	 */
389	M_PREPEND(m, INFINIBAND_HDR_LEN, M_NOWAIT);
390	if (m == NULL) {
391		error = ENOBUFS;
392		goto bad;
393	}
394	if ((pflags & RT_HAS_HEADER) == 0) {
395		ih = mtod(m, struct infiniband_header *);
396		memcpy(ih, phdr, hlen);
397	}
398
399	/*
400	 * Queue message on interface, update output statistics if
401	 * successful, and start output if interface not yet active.
402	 */
403	return (ifp->if_transmit(ifp, m));
404bad:
405	if (m != NULL)
406		m_freem(m);
407	return (error);
408}
409
410/*
411 * Process a received Infiniband packet.
412 */
413static void
414infiniband_input(struct ifnet *ifp, struct mbuf *m)
415{
416	struct infiniband_header *ibh;
417	struct epoch_tracker et;
418	int isr;
419	bool needs_epoch;
420
421	needs_epoch = (ifp->if_flags & IFF_NEEDSEPOCH);
422#ifdef INVARIANTS
423	/*
424	 * This temporary code is here to prevent epoch unaware and unmarked
425	 * drivers to panic the system.  Once all drivers are taken care of,
426	 * the whole INVARIANTS block should go away.
427	 */
428	if (!needs_epoch && !in_epoch(net_epoch_preempt)) {
429		static bool printedonce;
430
431		needs_epoch = true;
432		if (!printedonce) {
433			printedonce = true;
434			if_printf(ifp, "called %s w/o net epoch! "
435			    "PLEASE file a bug report.", __func__);
436#ifdef KDB
437			kdb_backtrace();
438#endif
439		}
440	}
441#endif
442
443	CURVNET_SET_QUIET(ifp->if_vnet);
444	if (__predict_false(needs_epoch))
445		NET_EPOCH_ENTER(et);
446
447	if ((ifp->if_flags & IFF_UP) == 0) {
448		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
449		m_freem(m);
450		goto done;
451	}
452
453	ibh = mtod(m, struct infiniband_header *);
454
455	/*
456	 * Reset layer specific mbuf flags to avoid confusing upper
457	 * layers:
458	 */
459	m->m_flags &= ~M_VLANTAG;
460	m_clrprotoflags(m);
461
462	if (INFINIBAND_IS_MULTICAST(ibh->ib_hwaddr)) {
463		if (memcmp(ibh->ib_hwaddr, ifp->if_broadcastaddr,
464		    ifp->if_addrlen) == 0)
465			m->m_flags |= M_BCAST;
466		else
467			m->m_flags |= M_MCAST;
468		if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
469	}
470
471	/* Let BPF have it before we strip the header. */
472	infiniband_bpf_mtap(ifp, m);
473
474	/* Allow monitor mode to claim this frame, after stats are updated. */
475	if (ifp->if_flags & IFF_MONITOR) {
476		m_freem(m);
477		goto done;
478	}
479
480	/* Direct packet to correct FIB based on interface config. */
481	M_SETFIB(m, ifp->if_fib);
482
483	/* Handle input from a lagg<N> port */
484	if (ifp->if_type == IFT_INFINIBANDLAG) {
485		KASSERT(lagg_input_infiniband_p != NULL,
486		    ("%s: if_lagg not loaded!", __func__));
487		m = (*lagg_input_infiniband_p)(ifp, m);
488		if (__predict_false(m == NULL))
489			goto done;
490		ifp = m->m_pkthdr.rcvif;
491	}
492
493	/*
494	 * Dispatch frame to upper layer.
495	 */
496	switch (ibh->ib_protocol) {
497#ifdef INET
498	case htons(ETHERTYPE_IP):
499		isr = NETISR_IP;
500		break;
501
502	case htons(ETHERTYPE_ARP):
503		if (ifp->if_flags & IFF_NOARP) {
504			/* Discard packet if ARP is disabled on interface */
505			m_freem(m);
506			goto done;
507		}
508		isr = NETISR_ARP;
509		break;
510#endif
511#ifdef INET6
512	case htons(ETHERTYPE_IPV6):
513		isr = NETISR_IPV6;
514		break;
515#endif
516	default:
517		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
518		m_freem(m);
519		goto done;
520	}
521
522	/* Strip off the Infiniband header. */
523	m_adj(m, INFINIBAND_HDR_LEN);
524
525#ifdef MAC
526	/*
527	 * Tag the mbuf with an appropriate MAC label before any other
528	 * consumers can get to it.
529	 */
530	mac_ifnet_create_mbuf(ifp, m);
531#endif
532	/* Allow monitor mode to claim this frame, after stats are updated. */
533	netisr_dispatch(isr, m);
534done:
535	if (__predict_false(needs_epoch))
536		NET_EPOCH_EXIT(et);
537	CURVNET_RESTORE();
538}
539
540static int
541infiniband_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
542    struct sockaddr *sa)
543{
544	struct sockaddr_dl *sdl;
545#ifdef INET
546	struct sockaddr_in *sin;
547#endif
548#ifdef INET6
549	struct sockaddr_in6 *sin6;
550#endif
551	uint8_t *e_addr;
552
553	switch (sa->sa_family) {
554	case AF_LINK:
555		/*
556		 * No mapping needed. Just check that it's a valid MC address.
557		 */
558		sdl = (struct sockaddr_dl *)sa;
559		e_addr = LLADDR(sdl);
560		if (!INFINIBAND_IS_MULTICAST(e_addr))
561			return (EADDRNOTAVAIL);
562		*llsa = NULL;
563		return 0;
564
565#ifdef INET
566	case AF_INET:
567		sin = (struct sockaddr_in *)sa;
568		if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
569			return (EADDRNOTAVAIL);
570		sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
571		sdl->sdl_alen = INFINIBAND_ADDR_LEN;
572		e_addr = LLADDR(sdl);
573		infiniband_ipv4_multicast_map(
574		    sin->sin_addr.s_addr, ifp->if_broadcastaddr, e_addr);
575		*llsa = (struct sockaddr *)sdl;
576		return (0);
577#endif
578#ifdef INET6
579	case AF_INET6:
580		sin6 = (struct sockaddr_in6 *)sa;
581		/*
582		 * An IP6 address of 0 means listen to all of the
583		 * multicast address used for IP6. This has no meaning
584		 * in infiniband.
585		 */
586		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
587			return (EADDRNOTAVAIL);
588		if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
589			return (EADDRNOTAVAIL);
590		sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
591		sdl->sdl_alen = INFINIBAND_ADDR_LEN;
592		e_addr = LLADDR(sdl);
593		infiniband_ipv6_multicast_map(
594		    &sin6->sin6_addr, ifp->if_broadcastaddr, e_addr);
595		*llsa = (struct sockaddr *)sdl;
596		return (0);
597#endif
598	default:
599		return (EAFNOSUPPORT);
600	}
601}
602
603void
604infiniband_ifattach(struct ifnet *ifp, const uint8_t *lla, const uint8_t *llb)
605{
606	struct sockaddr_dl *sdl;
607	struct ifaddr *ifa;
608	int i;
609
610	ifp->if_addrlen = INFINIBAND_ADDR_LEN;
611	ifp->if_hdrlen = INFINIBAND_HDR_LEN;
612	ifp->if_mtu = INFINIBAND_MTU;
613	if_attach(ifp);
614	ifp->if_output = infiniband_output;
615	ifp->if_input = infiniband_input;
616	ifp->if_resolvemulti = infiniband_resolvemulti;
617	ifp->if_requestencap = infiniband_requestencap;
618
619	if (ifp->if_baudrate == 0)
620		ifp->if_baudrate = IF_Gbps(10); /* default value */
621	if (llb != NULL)
622		ifp->if_broadcastaddr = llb;
623
624	ifa = ifp->if_addr;
625	KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__));
626	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
627	sdl->sdl_type = IFT_INFINIBAND;
628	sdl->sdl_alen = ifp->if_addrlen;
629
630	if (lla != NULL) {
631		memcpy(LLADDR(sdl), lla, ifp->if_addrlen);
632
633		if (ifp->if_hw_addr != NULL)
634			memcpy(ifp->if_hw_addr, lla, ifp->if_addrlen);
635	} else {
636		lla = LLADDR(sdl);
637	}
638
639	/* Attach ethernet compatible network device */
640	bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN);
641
642	/* Announce Infiniband MAC address if non-zero. */
643	for (i = 0; i < ifp->if_addrlen; i++)
644		if (lla[i] != 0)
645			break;
646	if (i != ifp->if_addrlen)
647		if_printf(ifp, "Infiniband address: %20D\n", lla, ":");
648
649	/* Add necessary bits are setup; announce it now. */
650	EVENTHANDLER_INVOKE(infiniband_ifattach_event, ifp);
651
652	if (IS_DEFAULT_VNET(curvnet))
653		devctl_notify("INFINIBAND", ifp->if_xname, "IFATTACH", NULL);
654}
655
/*
 * Perform common duties while detaching an Infiniband interface:
 * the BPF tap is detached first, then the interface itself.
 */
void
infiniband_ifdetach(struct ifnet *ifp)
{

	bpfdetach(ifp);
	if_detach(ifp);
}
665
666static int
667infiniband_modevent(module_t mod, int type, void *data)
668{
669	switch (type) {
670	case MOD_LOAD:
671	case MOD_UNLOAD:
672		return (0);
673	default:
674		return (EOPNOTSUPP);
675	}
676}
677
678static moduledata_t infiniband_mod = {
679	.name = "if_infiniband",
680	.evhand = &infiniband_modevent,
681};
682
683DECLARE_MODULE(if_infiniband, infiniband_mod, SI_SUB_INIT_IF, SI_ORDER_ANY);
684MODULE_VERSION(if_infiniband, 1);
685