1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2019 Isilon Systems, LLC.
5 * Copyright (c) 2005-2014 Sandvine Incorporated. All rights reserved.
6 * Copyright (c) 2000 Darrell Anderson
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31#include <sys/cdefs.h>
32__FBSDID("$FreeBSD$");
33
34#include "opt_ddb.h"
35#include "opt_inet.h"
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/endian.h>
40#include <sys/errno.h>
41#include <sys/eventhandler.h>
42#include <sys/socket.h>
43#include <sys/sysctl.h>
44
45#ifdef DDB
46#include <ddb/ddb.h>
47#include <ddb/db_lex.h>
48#endif
49
50#include <net/ethernet.h>
51#include <net/if.h>
52#include <net/if_arp.h>
53#include <net/if_dl.h>
54#include <net/if_types.h>
55#include <net/if_var.h>
56#include <net/route.h>
57#include <net/route/nhop.h>
58
59#include <netinet/in.h>
60#include <netinet/in_fib.h>
61#include <netinet/in_systm.h>
62#include <netinet/in_var.h>
63#include <netinet/ip.h>
64#include <netinet/ip_var.h>
65#include <netinet/ip_options.h>
66#include <netinet/udp.h>
67#include <netinet/udp_var.h>
68
69#include <machine/in_cksum.h>
70#include <machine/pcb.h>
71
72#include <net/debugnet.h>
73#define	DEBUGNET_INTERNAL
74#include <net/debugnet_int.h>
75
76FEATURE(debugnet, "Debugnet support");
77
78SYSCTL_NODE(_net, OID_AUTO, debugnet, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
79    "debugnet parameters");
80
81unsigned debugnet_debug;
82SYSCTL_UINT(_net_debugnet, OID_AUTO, debug, CTLFLAG_RWTUN,
83    &debugnet_debug, 0,
84    "Debug message verbosity (0: off; 1: on; 2: verbose)");
85
86int debugnet_npolls = 2000;
87SYSCTL_INT(_net_debugnet, OID_AUTO, npolls, CTLFLAG_RWTUN,
88    &debugnet_npolls, 0,
89    "Number of times to poll before assuming packet loss (0.5ms per poll)");
90int debugnet_nretries = 10;
91SYSCTL_INT(_net_debugnet, OID_AUTO, nretries, CTLFLAG_RWTUN,
92    &debugnet_nretries, 0,
93    "Number of retransmit attempts before giving up");
94int debugnet_fib = RT_DEFAULT_FIB;
95SYSCTL_INT(_net_debugnet, OID_AUTO, fib, CTLFLAG_RWTUN,
96    &debugnet_fib, 0,
97    "Fib to use when sending dump");
98
99static bool g_debugnet_pcb_inuse;
100static struct debugnet_pcb g_dnet_pcb;
101
102/*
103 * Simple accessors for opaque PCB.
104 */
105const unsigned char *
106debugnet_get_gw_mac(const struct debugnet_pcb *pcb)
107{
108	MPASS(g_debugnet_pcb_inuse && pcb == &g_dnet_pcb &&
109	    pcb->dp_state >= DN_STATE_HAVE_GW_MAC);
110	return (pcb->dp_gw_mac.octet);
111}
112
113/*
114 * Start of network primitives, beginning with output primitives.
115 */
116
117/*
118 * Handles creation of the ethernet header, then places outgoing packets into
119 * the tx buffer for the NIC
120 *
121 * Parameters:
122 *	m	The mbuf containing the packet to be sent (will be freed by
123 *		this function or the NIC driver)
124 *	ifp	The interface to send on
125 *	dst	The destination ethernet address (source address will be looked
126 *		up using ifp)
127 *	etype	The ETHERTYPE_* value for the protocol that is being sent
128 *
129 * Returns:
130 *	int	see errno.h, 0 for success
131 */
132int
133debugnet_ether_output(struct mbuf *m, struct ifnet *ifp, struct ether_addr dst,
134    u_short etype)
135{
136	struct ether_header *eh;
137
138	if (((ifp->if_flags & (IFF_MONITOR | IFF_UP)) != IFF_UP) ||
139	    (ifp->if_drv_flags & IFF_DRV_RUNNING) != IFF_DRV_RUNNING) {
140		if_printf(ifp, "%s: interface isn't up\n", __func__);
141		m_freem(m);
142		return (ENETDOWN);
143	}
144
145	/* Fill in the ethernet header. */
146	M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT);
147	if (m == NULL) {
148		printf("%s: out of mbufs\n", __func__);
149		return (ENOBUFS);
150	}
151	eh = mtod(m, struct ether_header *);
152	memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
153	memcpy(eh->ether_dhost, dst.octet, ETHER_ADDR_LEN);
154	eh->ether_type = htons(etype);
155	return (ifp->if_debugnet_methods->dn_transmit(ifp, m));
156}
157
158/*
159 * Unreliable transmission of an mbuf chain to the debugnet server
160 * Note: can't handle fragmentation; fails if the packet is larger than
161 *	 ifp->if_mtu after adding the UDP/IP headers
162 *
163 * Parameters:
164 *	pcb	The debugnet context block
165 *	m	mbuf chain
166 *
167 * Returns:
168 *	int	see errno.h, 0 for success
169 */
170static int
171debugnet_udp_output(struct debugnet_pcb *pcb, struct mbuf *m)
172{
173	struct udphdr *udp;
174
175	MPASS(pcb->dp_state >= DN_STATE_HAVE_GW_MAC);
176
177	M_PREPEND(m, sizeof(*udp), M_NOWAIT);
178	if (m == NULL) {
179		printf("%s: out of mbufs\n", __func__);
180		return (ENOBUFS);
181	}
182
183	udp = mtod(m, void *);
184	udp->uh_ulen = htons(m->m_pkthdr.len);
185	/* Use this src port so that the server can connect() the socket */
186	udp->uh_sport = htons(pcb->dp_client_port);
187	udp->uh_dport = htons(pcb->dp_server_port);
188	/* Computed later (protocol-dependent). */
189	udp->uh_sum = 0;
190
191	return (debugnet_ip_output(pcb, m));
192}
193
194int
195debugnet_ack_output(struct debugnet_pcb *pcb, uint32_t seqno /* net endian */)
196{
197	struct debugnet_ack *dn_ack;
198	struct mbuf *m;
199
200	DNETDEBUG("Acking with seqno %u\n", ntohl(seqno));
201
202	m = m_gethdr(M_NOWAIT, MT_DATA);
203	if (m == NULL) {
204		printf("%s: Out of mbufs\n", __func__);
205		return (ENOBUFS);
206	}
207	m->m_len = sizeof(*dn_ack);
208	m->m_pkthdr.len = sizeof(*dn_ack);
209	MH_ALIGN(m, sizeof(*dn_ack));
210	dn_ack = mtod(m, void *);
211	dn_ack->da_seqno = seqno;
212
213	return (debugnet_udp_output(pcb, m));
214}
215
216/*
217 * Dummy free function for debugnet clusters.
218 */
219static void
220debugnet_mbuf_free(struct mbuf *m __unused)
221{
222}
223
224/*
225 * Construct and reliably send a debugnet packet.  May fail from a resource
226 * shortage or extreme number of unacknowledged retransmissions.  Wait for
227 * an acknowledgement before returning.  Splits packets into chunks small
228 * enough to be sent without fragmentation (looks up the interface MTU)
229 *
230 * Parameters:
231 *	type	debugnet packet type (HERALD, FINISHED, ...)
232 *	data	data
233 *	datalen	data size (bytes)
234 *	auxdata	optional auxiliary information
235 *
236 * Returns:
237 *	int see errno.h, 0 for success
238 */
239int
240debugnet_send(struct debugnet_pcb *pcb, uint32_t type, const void *data,
241    uint32_t datalen, const struct debugnet_proto_aux *auxdata)
242{
243	struct debugnet_msg_hdr *dn_msg_hdr;
244	struct mbuf *m, *m2;
245	uint64_t want_acks;
246	uint32_t i, pktlen, sent_so_far;
247	int retries, polls, error;
248
249	if (pcb->dp_state == DN_STATE_REMOTE_CLOSED)
250		return (ECONNRESET);
251
252	want_acks = 0;
253	pcb->dp_rcvd_acks = 0;
254	retries = 0;
255
256retransmit:
257	/* Chunks can be too big to fit in packets. */
258	for (i = sent_so_far = 0; sent_so_far < datalen ||
259	    (i == 0 && datalen == 0); i++) {
260		pktlen = datalen - sent_so_far;
261
262		/* Bound: the interface MTU (assume no IP options). */
263		pktlen = min(pktlen, pcb->dp_ifp->if_mtu -
264		    sizeof(struct udpiphdr) - sizeof(struct debugnet_msg_hdr));
265
266		/*
267		 * Check if it is retransmitting and this has been ACKed
268		 * already.
269		 */
270		if ((pcb->dp_rcvd_acks & (1 << i)) != 0) {
271			sent_so_far += pktlen;
272			continue;
273		}
274
275		/*
276		 * Get and fill a header mbuf, then chain data as an extended
277		 * mbuf.
278		 */
279		m = m_gethdr(M_NOWAIT, MT_DATA);
280		if (m == NULL) {
281			printf("%s: Out of mbufs\n", __func__);
282			return (ENOBUFS);
283		}
284		m->m_len = sizeof(struct debugnet_msg_hdr);
285		m->m_pkthdr.len = sizeof(struct debugnet_msg_hdr);
286		MH_ALIGN(m, sizeof(struct debugnet_msg_hdr));
287		dn_msg_hdr = mtod(m, struct debugnet_msg_hdr *);
288		dn_msg_hdr->mh_seqno = htonl(pcb->dp_seqno + i);
289		dn_msg_hdr->mh_type = htonl(type);
290		dn_msg_hdr->mh_len = htonl(pktlen);
291
292		if (auxdata != NULL) {
293			dn_msg_hdr->mh_offset =
294			    htobe64(auxdata->dp_offset_start + sent_so_far);
295			dn_msg_hdr->mh_aux2 = htobe32(auxdata->dp_aux2);
296		} else {
297			dn_msg_hdr->mh_offset = htobe64(sent_so_far);
298			dn_msg_hdr->mh_aux2 = 0;
299		}
300
301		if (pktlen != 0) {
302			m2 = m_get(M_NOWAIT, MT_DATA);
303			if (m2 == NULL) {
304				m_freem(m);
305				printf("%s: Out of mbufs\n", __func__);
306				return (ENOBUFS);
307			}
308			MEXTADD(m2, __DECONST(char *, data) + sent_so_far,
309			    pktlen, debugnet_mbuf_free, NULL, NULL, 0,
310			    EXT_DISPOSABLE);
311			m2->m_len = pktlen;
312
313			m_cat(m, m2);
314			m->m_pkthdr.len += pktlen;
315		}
316		error = debugnet_udp_output(pcb, m);
317		if (error != 0)
318			return (error);
319
320		/* Note that we're waiting for this packet in the bitfield. */
321		want_acks |= (1 << i);
322		sent_so_far += pktlen;
323	}
324	if (i >= DEBUGNET_MAX_IN_FLIGHT)
325		printf("Warning: Sent more than %d packets (%d). "
326		    "Acknowledgements will fail unless the size of "
327		    "rcvd_acks/want_acks is increased.\n",
328		    DEBUGNET_MAX_IN_FLIGHT, i);
329
330	/*
331	 * Wait for acks.  A *real* window would speed things up considerably.
332	 */
333	polls = 0;
334	while (pcb->dp_rcvd_acks != want_acks) {
335		if (polls++ > debugnet_npolls) {
336			if (retries++ > debugnet_nretries)
337				return (ETIMEDOUT);
338			printf(". ");
339			goto retransmit;
340		}
341		debugnet_network_poll(pcb);
342		DELAY(500);
343		if (pcb->dp_state == DN_STATE_REMOTE_CLOSED)
344			return (ECONNRESET);
345	}
346	pcb->dp_seqno += i;
347	return (0);
348}
349
350/*
351 * Network input primitives.
352 */
353
354/*
355 * Just introspect the header enough to fire off a seqno ack and validate
356 * length fits.
357 */
358static void
359debugnet_handle_rx_msg(struct debugnet_pcb *pcb, struct mbuf **mb)
360{
361	const struct debugnet_msg_hdr *dnh;
362	struct mbuf *m;
363	int error;
364
365	m = *mb;
366
367	if (m->m_pkthdr.len < sizeof(*dnh)) {
368		DNETDEBUG("ignoring small debugnet_msg packet\n");
369		return;
370	}
371
372	/* Get ND header. */
373	if (m->m_len < sizeof(*dnh)) {
374		m = m_pullup(m, sizeof(*dnh));
375		*mb = m;
376		if (m == NULL) {
377			DNETDEBUG("m_pullup failed\n");
378			return;
379		}
380	}
381	dnh = mtod(m, const void *);
382
383	if (ntohl(dnh->mh_len) + sizeof(*dnh) > m->m_pkthdr.len) {
384		DNETDEBUG("Dropping short packet.\n");
385		return;
386	}
387
388	/*
389	 * If the issue is transient (ENOBUFS), sender should resend.  If
390	 * non-transient (like driver objecting to rx -> tx from the same
391	 * thread), not much else we can do.
392	 */
393	error = debugnet_ack_output(pcb, dnh->mh_seqno);
394	if (error != 0)
395		return;
396
397	if (ntohl(dnh->mh_type) == DEBUGNET_FINISHED) {
398		printf("Remote shut down the connection on us!\n");
399		pcb->dp_state = DN_STATE_REMOTE_CLOSED;
400
401		/*
402		 * Continue through to the user handler so they are signalled
403		 * not to wait for further rx.
404		 */
405	}
406
407	pcb->dp_rx_handler(pcb, mb);
408}
409
410static void
411debugnet_handle_ack(struct debugnet_pcb *pcb, struct mbuf **mb, uint16_t sport)
412{
413	const struct debugnet_ack *dn_ack;
414	struct mbuf *m;
415	uint32_t rcv_ackno;
416
417	m = *mb;
418
419	/* Get Ack. */
420	if (m->m_len < sizeof(*dn_ack)) {
421		m = m_pullup(m, sizeof(*dn_ack));
422		*mb = m;
423		if (m == NULL) {
424			DNETDEBUG("m_pullup failed\n");
425			return;
426		}
427	}
428	dn_ack = mtod(m, const void *);
429
430	/* Debugnet processing. */
431	/*
432	 * Packet is meant for us.  Extract the ack sequence number and the
433	 * port number if necessary.
434	 */
435	rcv_ackno = ntohl(dn_ack->da_seqno);
436	if (pcb->dp_state < DN_STATE_GOT_HERALD_PORT) {
437		pcb->dp_server_port = sport;
438		pcb->dp_state = DN_STATE_GOT_HERALD_PORT;
439	}
440	if (rcv_ackno >= pcb->dp_seqno + DEBUGNET_MAX_IN_FLIGHT)
441		printf("%s: ACK %u too far in future!\n", __func__, rcv_ackno);
442	else if (rcv_ackno >= pcb->dp_seqno) {
443		/* We're interested in this ack. Record it. */
444		pcb->dp_rcvd_acks |= 1 << (rcv_ackno - pcb->dp_seqno);
445	}
446}
447
448void
449debugnet_handle_udp(struct debugnet_pcb *pcb, struct mbuf **mb)
450{
451	const struct udphdr *udp;
452	struct mbuf *m;
453	uint16_t sport, ulen;
454
455	/* UDP processing. */
456
457	m = *mb;
458	if (m->m_pkthdr.len < sizeof(*udp)) {
459		DNETDEBUG("ignoring small UDP packet\n");
460		return;
461	}
462
463	/* Get UDP headers. */
464	if (m->m_len < sizeof(*udp)) {
465		m = m_pullup(m, sizeof(*udp));
466		*mb = m;
467		if (m == NULL) {
468			DNETDEBUG("m_pullup failed\n");
469			return;
470		}
471	}
472	udp = mtod(m, const void *);
473
474	/* We expect to receive UDP packets on the configured client port. */
475	if (ntohs(udp->uh_dport) != pcb->dp_client_port) {
476		DNETDEBUG("not on the expected port.\n");
477		return;
478	}
479
480	/* Check that ulen does not exceed actual size of data. */
481	ulen = ntohs(udp->uh_ulen);
482	if (m->m_pkthdr.len < ulen) {
483		DNETDEBUG("ignoring runt UDP packet\n");
484		return;
485	}
486
487	sport = ntohs(udp->uh_sport);
488
489	m_adj(m, sizeof(*udp));
490	ulen -= sizeof(*udp);
491
492	if (ulen == sizeof(struct debugnet_ack)) {
493		debugnet_handle_ack(pcb, mb, sport);
494		return;
495	}
496
497	if (pcb->dp_rx_handler == NULL) {
498		if (ulen < sizeof(struct debugnet_ack))
499			DNETDEBUG("ignoring small ACK packet\n");
500		else
501			DNETDEBUG("ignoring unexpected non-ACK packet on "
502			    "half-duplex connection.\n");
503		return;
504	}
505
506	debugnet_handle_rx_msg(pcb, mb);
507}
508
509/*
510 * Handler for incoming packets directly from the network adapter
511 * Identifies the packet type (IP or ARP) and passes it along to one of the
512 * helper functions debugnet_handle_ip or debugnet_handle_arp.
513 *
514 * It needs to partially replicate the behaviour of ether_input() and
515 * ether_demux().
516 *
517 * Parameters:
518 *	ifp	the interface the packet came from
519 *	m	an mbuf containing the packet received
520 */
521static void
522debugnet_pkt_in(struct ifnet *ifp, struct mbuf *m)
523{
524	struct ifreq ifr;
525	struct ether_header *eh;
526	u_short etype;
527
528	/* Ethernet processing. */
529	if ((m->m_flags & M_PKTHDR) == 0) {
530		DNETDEBUG_IF(ifp, "discard frame without packet header\n");
531		goto done;
532	}
533	if (m->m_len < ETHER_HDR_LEN) {
534		DNETDEBUG_IF(ifp,
535	    "discard frame without leading eth header (len %u pktlen %u)\n",
536		    m->m_len, m->m_pkthdr.len);
537		goto done;
538	}
539	if ((m->m_flags & M_HASFCS) != 0) {
540		m_adj(m, -ETHER_CRC_LEN);
541		m->m_flags &= ~M_HASFCS;
542	}
543	eh = mtod(m, struct ether_header *);
544	etype = ntohs(eh->ether_type);
545	if ((m->m_flags & M_VLANTAG) != 0 || etype == ETHERTYPE_VLAN) {
546		DNETDEBUG_IF(ifp, "ignoring vlan packets\n");
547		goto done;
548	}
549	if (if_gethwaddr(ifp, &ifr) != 0) {
550		DNETDEBUG_IF(ifp, "failed to get hw addr for interface\n");
551		goto done;
552	}
553	if (memcmp(ifr.ifr_addr.sa_data, eh->ether_dhost,
554	    ETHER_ADDR_LEN) != 0 &&
555	    (etype != ETHERTYPE_ARP || !ETHER_IS_BROADCAST(eh->ether_dhost))) {
556		DNETDEBUG_IF(ifp,
557		    "discard frame with incorrect destination addr\n");
558		goto done;
559	}
560
561	MPASS(g_debugnet_pcb_inuse);
562
563	/* Done ethernet processing. Strip off the ethernet header. */
564	m_adj(m, ETHER_HDR_LEN);
565	switch (etype) {
566	case ETHERTYPE_ARP:
567		debugnet_handle_arp(&g_dnet_pcb, &m);
568		break;
569	case ETHERTYPE_IP:
570		debugnet_handle_ip(&g_dnet_pcb, &m);
571		break;
572	default:
573		DNETDEBUG_IF(ifp, "dropping unknown ethertype %hu\n", etype);
574		break;
575	}
576done:
577	if (m != NULL)
578		m_freem(m);
579}
580
581/*
582 * Network polling primitive.
583 *
584 * Instead of assuming that most of the network stack is sane, we just poll the
585 * driver directly for packets.
586 */
587void
588debugnet_network_poll(struct debugnet_pcb *pcb)
589{
590	struct ifnet *ifp;
591
592	ifp = pcb->dp_ifp;
593	ifp->if_debugnet_methods->dn_poll(ifp, 1000);
594}
595
596/*
597 * Start of consumer API surface.
598 */
599void
600debugnet_free(struct debugnet_pcb *pcb)
601{
602	struct ifnet *ifp;
603
604	MPASS(g_debugnet_pcb_inuse);
605	MPASS(pcb == &g_dnet_pcb);
606
607	ifp = pcb->dp_ifp;
608	if (ifp != NULL) {
609		if (pcb->dp_drv_input != NULL)
610			ifp->if_input = pcb->dp_drv_input;
611		if (pcb->dp_event_started)
612			ifp->if_debugnet_methods->dn_event(ifp, DEBUGNET_END);
613	}
614	debugnet_mbuf_finish();
615
616	g_debugnet_pcb_inuse = false;
617	memset(&g_dnet_pcb, 0xfd, sizeof(g_dnet_pcb));
618}
619
620int
621debugnet_connect(const struct debugnet_conn_params *dcp,
622    struct debugnet_pcb **pcb_out)
623{
624	struct debugnet_proto_aux herald_auxdata;
625	struct debugnet_pcb *pcb;
626	struct ifnet *ifp;
627	int error;
628
629	if (g_debugnet_pcb_inuse) {
630		printf("%s: Only one connection at a time.\n", __func__);
631		return (EBUSY);
632	}
633
634	pcb = &g_dnet_pcb;
635	*pcb = (struct debugnet_pcb) {
636		.dp_state = DN_STATE_INIT,
637		.dp_client = dcp->dc_client,
638		.dp_server = dcp->dc_server,
639		.dp_gateway = dcp->dc_gateway,
640		.dp_server_port = dcp->dc_herald_port,	/* Initially */
641		.dp_client_port = dcp->dc_client_port,
642		.dp_seqno = 1,
643		.dp_ifp = dcp->dc_ifp,
644		.dp_rx_handler = dcp->dc_rx_handler,
645	};
646
647	/* Switch to the debugnet mbuf zones. */
648	debugnet_mbuf_start();
649
650	/* At least one needed parameter is missing; infer it. */
651	if (pcb->dp_client == INADDR_ANY || pcb->dp_gateway == INADDR_ANY ||
652	    pcb->dp_ifp == NULL) {
653		struct sockaddr_in dest_sin, *gw_sin, *local_sin;
654		struct ifnet *rt_ifp;
655		struct nhop_object *nh;
656
657		memset(&dest_sin, 0, sizeof(dest_sin));
658		dest_sin = (struct sockaddr_in) {
659			.sin_len = sizeof(dest_sin),
660			.sin_family = AF_INET,
661			.sin_addr.s_addr = pcb->dp_server,
662		};
663
664		CURVNET_SET(vnet0);
665		nh = fib4_lookup_debugnet(debugnet_fib, dest_sin.sin_addr, 0,
666		    NHR_NONE);
667		CURVNET_RESTORE();
668
669		if (nh == NULL) {
670			printf("%s: Could not get route for that server.\n",
671			    __func__);
672			error = ENOENT;
673			goto cleanup;
674		}
675
676		if (nh->gw_sa.sa_family == AF_INET)
677			gw_sin = &nh->gw4_sa;
678		else {
679			if (nh->gw_sa.sa_family == AF_LINK)
680				DNETDEBUG("Destination address is on link.\n");
681			gw_sin = NULL;
682		}
683
684		MPASS(nh->nh_ifa->ifa_addr->sa_family == AF_INET);
685		local_sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
686
687		rt_ifp = nh->nh_ifp;
688
689		if (pcb->dp_client == INADDR_ANY)
690			pcb->dp_client = local_sin->sin_addr.s_addr;
691		if (pcb->dp_gateway == INADDR_ANY && gw_sin != NULL)
692			pcb->dp_gateway = gw_sin->sin_addr.s_addr;
693		if (pcb->dp_ifp == NULL)
694			pcb->dp_ifp = rt_ifp;
695	}
696
697	ifp = pcb->dp_ifp;
698
699	if (debugnet_debug > 0) {
700		char serbuf[INET_ADDRSTRLEN], clibuf[INET_ADDRSTRLEN],
701		    gwbuf[INET_ADDRSTRLEN];
702		inet_ntop(AF_INET, &pcb->dp_server, serbuf, sizeof(serbuf));
703		inet_ntop(AF_INET, &pcb->dp_client, clibuf, sizeof(clibuf));
704		if (pcb->dp_gateway != INADDR_ANY)
705			inet_ntop(AF_INET, &pcb->dp_gateway, gwbuf, sizeof(gwbuf));
706		DNETDEBUG("Connecting to %s:%d%s%s from %s:%d on %s\n",
707		    serbuf, pcb->dp_server_port,
708		    (pcb->dp_gateway == INADDR_ANY) ? "" : " via ",
709		    (pcb->dp_gateway == INADDR_ANY) ? "" : gwbuf,
710		    clibuf, pcb->dp_client_port, if_name(ifp));
711	}
712
713	/* Validate iface is online and supported. */
714	if (!DEBUGNET_SUPPORTED_NIC(ifp)) {
715		printf("%s: interface '%s' does not support debugnet\n",
716		    __func__, if_name(ifp));
717		error = ENODEV;
718		goto cleanup;
719	}
720	if ((if_getflags(ifp) & IFF_UP) == 0) {
721		printf("%s: interface '%s' link is down\n", __func__,
722		    if_name(ifp));
723		error = ENXIO;
724		goto cleanup;
725	}
726
727	ifp->if_debugnet_methods->dn_event(ifp, DEBUGNET_START);
728	pcb->dp_event_started = true;
729
730	/*
731	 * We maintain the invariant that g_debugnet_pcb_inuse is always true
732	 * while the debugnet ifp's if_input is overridden with
733	 * debugnet_pkt_in.
734	 */
735	g_debugnet_pcb_inuse = true;
736
737	/* Make the card use *our* receive callback. */
738	pcb->dp_drv_input = ifp->if_input;
739	ifp->if_input = debugnet_pkt_in;
740
741	printf("%s: searching for %s MAC...\n", __func__,
742	    (dcp->dc_gateway == INADDR_ANY) ? "server" : "gateway");
743
744	error = debugnet_arp_gw(pcb);
745	if (error != 0) {
746		printf("%s: failed to locate MAC address\n", __func__);
747		goto cleanup;
748	}
749	MPASS(pcb->dp_state == DN_STATE_HAVE_GW_MAC);
750
751	herald_auxdata = (struct debugnet_proto_aux) {
752		.dp_offset_start = dcp->dc_herald_offset,
753		.dp_aux2 = dcp->dc_herald_aux2,
754	};
755	error = debugnet_send(pcb, DEBUGNET_HERALD, dcp->dc_herald_data,
756	    dcp->dc_herald_datalen, &herald_auxdata);
757	if (error != 0) {
758		printf("%s: failed to herald debugnet server\n", __func__);
759		goto cleanup;
760	}
761
762	*pcb_out = pcb;
763	return (0);
764
765cleanup:
766	debugnet_free(pcb);
767	return (error);
768}
769
770/*
771 * Pre-allocated dump-time mbuf tracking.
772 *
773 * We just track the high water mark we've ever seen and allocate appropriately
774 * for that iface/mtu combo.
775 */
776static struct {
777	int nmbuf;
778	int ncl;
779	int clsize;
780} dn_hwm;
781static struct mtx dn_hwm_lk;
782MTX_SYSINIT(debugnet_hwm_lock, &dn_hwm_lk, "Debugnet HWM lock", MTX_DEF);
783
784static void
785dn_maybe_reinit_mbufs(int nmbuf, int ncl, int clsize)
786{
787	bool any;
788
789	any = false;
790	mtx_lock(&dn_hwm_lk);
791
792	if (nmbuf > dn_hwm.nmbuf) {
793		any = true;
794		dn_hwm.nmbuf = nmbuf;
795	} else
796		nmbuf = dn_hwm.nmbuf;
797
798	if (ncl > dn_hwm.ncl) {
799		any = true;
800		dn_hwm.ncl = ncl;
801	} else
802		ncl = dn_hwm.ncl;
803
804	if (clsize > dn_hwm.clsize) {
805		any = true;
806		dn_hwm.clsize = clsize;
807	} else
808		clsize = dn_hwm.clsize;
809
810	mtx_unlock(&dn_hwm_lk);
811
812	if (any)
813		debugnet_mbuf_reinit(nmbuf, ncl, clsize);
814}
815
816void
817debugnet_any_ifnet_update(struct ifnet *ifp)
818{
819	int clsize, nmbuf, ncl, nrxr;
820
821	if (!DEBUGNET_SUPPORTED_NIC(ifp))
822		return;
823
824	ifp->if_debugnet_methods->dn_init(ifp, &nrxr, &ncl, &clsize);
825	KASSERT(nrxr > 0, ("invalid receive ring count %d", nrxr));
826
827	/*
828	 * We need two headers per message on the transmit side. Multiply by
829	 * four to give us some breathing room.
830	 */
831	nmbuf = ncl * (4 + nrxr);
832	ncl *= nrxr;
833
834	/*
835	 * Bandaid for drivers that (incorrectly) advertise LinkUp before their
836	 * dn_init method is available.
837	 */
838	if (nmbuf == 0 || ncl == 0 || clsize == 0) {
839		printf("%s: Bad dn_init result from %s (ifp %p), ignoring.\n",
840		    __func__, if_name(ifp), ifp);
841		return;
842	}
843	dn_maybe_reinit_mbufs(nmbuf, ncl, clsize);
844}
845
846/*
847 * Unfortunately, the ifnet_arrival_event eventhandler hook is mostly useless
848 * for us because drivers tend to if_attach before invoking DEBUGNET_SET().
849 *
850 * On the other hand, hooking DEBUGNET_SET() itself may still be too early,
851 * because the driver is still in attach.  Since we cannot use down interfaces,
852 * maybe hooking ifnet_event:IFNET_EVENT_UP is sufficient?  ... Nope, at least
853 * with vtnet and dhcpclient that event just never occurs.
854 *
855 * So that's how I've landed on the lower level ifnet_link_event.
856 */
857
858static void
859dn_ifnet_event(void *arg __unused, struct ifnet *ifp, int link_state)
860{
861	if (link_state == LINK_STATE_UP)
862		debugnet_any_ifnet_update(ifp);
863}
864
865static eventhandler_tag dn_attach_cookie;
866static void
867dn_evh_init(void *ctx __unused)
868{
869	dn_attach_cookie = EVENTHANDLER_REGISTER(ifnet_link_event,
870	    dn_ifnet_event, NULL, EVENTHANDLER_PRI_ANY);
871}
872SYSINIT(dn_evh_init, SI_SUB_EVENTHANDLER + 1, SI_ORDER_ANY, dn_evh_init, NULL);
873
874/*
875 * DDB parsing helpers for debugnet(4) consumers.
876 */
877#ifdef DDB
878struct my_inet_opt {
879	bool has_opt;
880	const char *printname;
881	in_addr_t *result;
882};
883
884static int
885dn_parse_optarg_ipv4(struct my_inet_opt *opt)
886{
887	in_addr_t tmp;
888	unsigned octet;
889	int t;
890
891	tmp = 0;
892	for (octet = 0; octet < 4; octet++) {
893		t = db_read_token_flags(DRT_WSPACE | DRT_DECIMAL);
894		if (t != tNUMBER) {
895			db_printf("%s:%s: octet %u expected number; found %d\n",
896			    __func__, opt->printname, octet, t);
897			return (EINVAL);
898		}
899		/*
900		 * db_lex lexes '-' distinctly from the number itself, but
901		 * let's document that invariant.
902		 */
903		MPASS(db_tok_number >= 0);
904
905		if (db_tok_number > UINT8_MAX) {
906			db_printf("%s:%s: octet %u out of range: %jd\n", __func__,
907			    opt->printname, octet, (intmax_t)db_tok_number);
908			return (EDOM);
909		}
910
911		/* Constructed host-endian and converted to network later. */
912		tmp = (tmp << 8) | db_tok_number;
913
914		if (octet < 3) {
915			t = db_read_token_flags(DRT_WSPACE);
916			if (t != tDOT) {
917				db_printf("%s:%s: octet %u expected '.'; found"
918				    " %d\n", __func__, opt->printname, octet,
919				    t);
920				return (EINVAL);
921			}
922		}
923	}
924
925	*opt->result = htonl(tmp);
926	opt->has_opt = true;
927	return (0);
928}
929
930int
931debugnet_parse_ddb_cmd(const char *cmd, struct debugnet_ddb_config *result)
932{
933	struct ifnet *ifp;
934	int t, error;
935	bool want_ifp;
936	char ch;
937
938	struct my_inet_opt opt_client = {
939		.printname = "client",
940		.result = &result->dd_client,
941	},
942	opt_server = {
943		.printname = "server",
944		.result = &result->dd_server,
945	},
946	opt_gateway = {
947		.printname = "gateway",
948		.result = &result->dd_gateway,
949	},
950	*cur_inet_opt;
951
952	ifp = NULL;
953	memset(result, 0, sizeof(*result));
954
955	/*
956	 * command [space] [-] [opt] [[space] [optarg]] ...
957	 *
958	 * db_command has already lexed 'command' for us.
959	 */
960	t = db_read_token_flags(DRT_WSPACE);
961	if (t == tWSPACE)
962		t = db_read_token_flags(DRT_WSPACE);
963
964	while (t != tEOL) {
965		if (t != tMINUS) {
966			db_printf("%s: Bad syntax; expected '-', got %d\n",
967			    cmd, t);
968			goto usage;
969		}
970
971		t = db_read_token_flags(DRT_WSPACE);
972		if (t != tIDENT) {
973			db_printf("%s: Bad syntax; expected tIDENT, got %d\n",
974			    cmd, t);
975			goto usage;
976		}
977
978		if (strlen(db_tok_string) > 1) {
979			db_printf("%s: Bad syntax; expected single option "
980			    "flag, got '%s'\n", cmd, db_tok_string);
981			goto usage;
982		}
983
984		want_ifp = false;
985		cur_inet_opt = NULL;
986		switch ((ch = db_tok_string[0])) {
987		default:
988			DNETDEBUG("Unexpected: '%c'\n", ch);
989			/* FALLTHROUGH */
990		case 'h':
991			goto usage;
992		case 'c':
993			cur_inet_opt = &opt_client;
994			break;
995		case 'g':
996			cur_inet_opt = &opt_gateway;
997			break;
998		case 's':
999			cur_inet_opt = &opt_server;
1000			break;
1001		case 'i':
1002			want_ifp = true;
1003			break;
1004		}
1005
1006		t = db_read_token_flags(DRT_WSPACE);
1007		if (t != tWSPACE) {
1008			db_printf("%s: Bad syntax; expected space after "
1009			    "flag %c, got %d\n", cmd, ch, t);
1010			goto usage;
1011		}
1012
1013		if (want_ifp) {
1014			t = db_read_token_flags(DRT_WSPACE);
1015			if (t != tIDENT) {
1016				db_printf("%s: Expected interface but got %d\n",
1017				    cmd, t);
1018				goto usage;
1019			}
1020
1021			CURVNET_SET(vnet0);
1022			/*
1023			 * We *don't* take a ref here because the only current
1024			 * consumer, db_netdump_cmd, does not need it.  It
1025			 * (somewhat redundantly) extracts the if_name(),
1026			 * re-lookups the ifp, and takes its own reference.
1027			 */
1028			ifp = ifunit(db_tok_string);
1029			CURVNET_RESTORE();
1030			if (ifp == NULL) {
1031				db_printf("Could not locate interface %s\n",
1032				    db_tok_string);
1033				goto cleanup;
1034			}
1035		} else {
1036			MPASS(cur_inet_opt != NULL);
1037			/* Assume IPv4 for now. */
1038			error = dn_parse_optarg_ipv4(cur_inet_opt);
1039			if (error != 0)
1040				goto cleanup;
1041		}
1042
1043		/* Skip (mandatory) whitespace after option, if not EOL. */
1044		t = db_read_token_flags(DRT_WSPACE);
1045		if (t == tEOL)
1046			break;
1047		if (t != tWSPACE) {
1048			db_printf("%s: Bad syntax; expected space after "
1049			    "flag %c option; got %d\n", cmd, ch, t);
1050			goto usage;
1051		}
1052		t = db_read_token_flags(DRT_WSPACE);
1053	}
1054
1055	if (!opt_server.has_opt) {
1056		db_printf("%s: need a destination server address\n", cmd);
1057		goto usage;
1058	}
1059
1060	result->dd_has_client = opt_client.has_opt;
1061	result->dd_has_gateway = opt_gateway.has_opt;
1062	result->dd_ifp = ifp;
1063
1064	/* We parsed the full line to tEOL already, or bailed with an error. */
1065	return (0);
1066
1067usage:
1068	db_printf("Usage: %s -s <server> [-g <gateway> -c <localip> "
1069	    "-i <interface>]\n", cmd);
1070	error = EINVAL;
1071	/* FALLTHROUGH */
1072cleanup:
1073	db_skip_to_eol();
1074	return (error);
1075}
1076#endif /* DDB */
1077