1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2021 Ng Peng Nam Sean
5 * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <sys/param.h>
30#include <sys/ck.h>
31#include <sys/lock.h>
32#include <sys/malloc.h>
33#include <sys/mbuf.h>
34#include <sys/mutex.h>
35#include <sys/socket.h>
36#include <sys/socketvar.h>
37#include <sys/syslog.h>
38
39#include <netlink/netlink.h>
40#include <netlink/netlink_ctl.h>
41#include <netlink/netlink_linux.h>
42#include <netlink/netlink_var.h>
43
44#define	DEBUG_MOD_NAME	nl_io
45#define	DEBUG_MAX_LEVEL	LOG_DEBUG3
46#include <netlink/netlink_debug.h>
47_DECLARE_DEBUG(LOG_INFO);
48
49/*
 * The logic below provides a p2p interface for receiving and
 * sending netlink data between the kernel and userland.
52 */
53
54static bool nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp);
55
56struct nl_buf *
57nl_buf_alloc(size_t len, int mflag)
58{
59	struct nl_buf *nb;
60
61	nb = malloc(sizeof(struct nl_buf) + len, M_NETLINK, mflag);
62	if (__predict_true(nb != NULL)) {
63		nb->buflen = len;
64		nb->datalen = nb->offset = 0;
65	}
66
67	return (nb);
68}
69
70void
71nl_buf_free(struct nl_buf *nb)
72{
73
74	free(nb, M_NETLINK);
75}
76
77void
78nl_schedule_taskqueue(struct nlpcb *nlp)
79{
80	if (!nlp->nl_task_pending) {
81		nlp->nl_task_pending = true;
82		taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task);
83		NL_LOG(LOG_DEBUG3, "taskqueue scheduled");
84	} else {
85		NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped");
86	}
87}
88
89static bool
90nl_process_received_one(struct nlpcb *nlp)
91{
92	struct socket *so = nlp->nl_socket;
93	struct sockbuf *sb;
94	struct nl_buf *nb;
95	bool reschedule = false;
96
97	NLP_LOCK(nlp);
98	nlp->nl_task_pending = false;
99	NLP_UNLOCK(nlp);
100
101	/*
102	 * Do not process queued up requests if there is no space to queue
103	 * replies.
104	 */
105	sb = &so->so_rcv;
106	SOCK_RECVBUF_LOCK(so);
107	if (sb->sb_hiwat <= sb->sb_ccc) {
108		SOCK_RECVBUF_UNLOCK(so);
109		return (false);
110	}
111	SOCK_RECVBUF_UNLOCK(so);
112
113	sb = &so->so_snd;
114	SOCK_SENDBUF_LOCK(so);
115	while ((nb = TAILQ_FIRST(&sb->nl_queue)) != NULL) {
116		TAILQ_REMOVE(&sb->nl_queue, nb, tailq);
117		SOCK_SENDBUF_UNLOCK(so);
118		reschedule = nl_process_nbuf(nb, nlp);
119		SOCK_SENDBUF_LOCK(so);
120		if (reschedule) {
121			sb->sb_acc -= nb->datalen;
122			sb->sb_ccc -= nb->datalen;
123			/* XXXGL: potentially can reduce lock&unlock count. */
124			sowwakeup_locked(so);
125			nl_buf_free(nb);
126			SOCK_SENDBUF_LOCK(so);
127		} else {
128			TAILQ_INSERT_HEAD(&sb->nl_queue, nb, tailq);
129			break;
130		}
131	}
132	SOCK_SENDBUF_UNLOCK(so);
133
134	return (reschedule);
135}
136
137static void
138nl_process_received(struct nlpcb *nlp)
139{
140	NL_LOG(LOG_DEBUG3, "taskqueue called");
141
142	if (__predict_false(nlp->nl_need_thread_setup)) {
143		nl_set_thread_nlp(curthread, nlp);
144		NLP_LOCK(nlp);
145		nlp->nl_need_thread_setup = false;
146		NLP_UNLOCK(nlp);
147	}
148
149	while (nl_process_received_one(nlp))
150		;
151}
152
153/*
154 * Called after some data have been read from the socket.
155 */
156void
157nl_on_transmit(struct nlpcb *nlp)
158{
159	NLP_LOCK(nlp);
160
161	struct socket *so = nlp->nl_socket;
162	if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) {
163		unsigned long dropped_bytes = nlp->nl_dropped_bytes;
164		unsigned long dropped_messages = nlp->nl_dropped_messages;
165		nlp->nl_dropped_bytes = 0;
166		nlp->nl_dropped_messages = 0;
167
168		struct sockbuf *sb = &so->so_rcv;
169		NLP_LOG(LOG_DEBUG, nlp,
170		    "socket RX overflowed, %lu messages (%lu bytes) dropped. "
171		    "bytes: [%u/%u]", dropped_messages, dropped_bytes,
172		    sb->sb_ccc, sb->sb_hiwat);
173		/* TODO: send netlink message */
174	}
175
176	nl_schedule_taskqueue(nlp);
177	NLP_UNLOCK(nlp);
178}
179
180void
181nl_taskqueue_handler(void *_arg, int pending)
182{
183	struct nlpcb *nlp = (struct nlpcb *)_arg;
184
185	CURVNET_SET(nlp->nl_socket->so_vnet);
186	nl_process_received(nlp);
187	CURVNET_RESTORE();
188}
189
190/*
191 * Tries to send current data buffer from writer.
192 *
193 * Returns true on success.
 * If no queue overrun happened, wakes up the socket owner.
195 */
196bool
197nl_send(struct nl_writer *nw, struct nlpcb *nlp)
198{
199	struct socket *so = nlp->nl_socket;
200	struct sockbuf *sb = &so->so_rcv;
201	struct nl_buf *nb;
202
203	MPASS(nw->hdr == NULL);
204	MPASS(nw->buf != NULL);
205	MPASS(nw->buf->datalen > 0);
206
207	IF_DEBUG_LEVEL(LOG_DEBUG2) {
208		struct nlmsghdr *hdr = (struct nlmsghdr *)nw->buf->data;
209		NLP_LOG(LOG_DEBUG2, nlp,
210		    "TX len %u msgs %u msg type %d first hdrlen %u",
211		    nw->buf->datalen, nw->num_messages, hdr->nlmsg_type,
212		    hdr->nlmsg_len);
213	}
214
215	if (nlp->nl_linux && linux_netlink_p != NULL &&
216	    __predict_false(!linux_netlink_p->msgs_to_linux(nw, nlp))) {
217		nl_buf_free(nw->buf);
218		nw->buf = NULL;
219		return (false);
220	}
221
222	nb = nw->buf;
223	nw->buf = NULL;
224
225	SOCK_RECVBUF_LOCK(so);
226	if (!nw->ignore_limit && __predict_false(sb->sb_hiwat <= sb->sb_ccc)) {
227		SOCK_RECVBUF_UNLOCK(so);
228		NLP_LOCK(nlp);
229		nlp->nl_dropped_bytes += nb->datalen;
230		nlp->nl_dropped_messages += nw->num_messages;
231		NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)",
232		    (unsigned long)nlp->nl_dropped_messages, nw->num_messages,
233		    (unsigned long)nlp->nl_dropped_bytes, nb->datalen);
234		NLP_UNLOCK(nlp);
235		nl_buf_free(nb);
236		return (false);
237	} else {
238		bool full;
239
240		TAILQ_INSERT_TAIL(&sb->nl_queue, nb, tailq);
241		sb->sb_acc += nb->datalen;
242		sb->sb_ccc += nb->datalen;
243		full = sb->sb_hiwat <= sb->sb_ccc;
244		sorwakeup_locked(so);
245		if (full) {
246			NLP_LOCK(nlp);
247			nlp->nl_tx_blocked = true;
248			NLP_UNLOCK(nlp);
249		}
250		return (true);
251	}
252}
253
254static int
255nl_receive_message(struct nlmsghdr *hdr, int remaining_length,
256    struct nlpcb *nlp, struct nl_pstate *npt)
257{
258	nl_handler_f handler = nl_handlers[nlp->nl_proto].cb;
259	int error = 0;
260
261	NLP_LOG(LOG_DEBUG2, nlp, "msg len: %u type: %d: flags: 0x%X seq: %u pid: %u",
262	    hdr->nlmsg_len, hdr->nlmsg_type, hdr->nlmsg_flags, hdr->nlmsg_seq,
263	    hdr->nlmsg_pid);
264
265	if (__predict_false(hdr->nlmsg_len > remaining_length)) {
266		NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d",
267		    hdr->nlmsg_len, remaining_length);
268		return (EINVAL);
269	} else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) {
270		NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len);
271		return (EINVAL);
272	}
273	/* Stamp each message with sender pid */
274	hdr->nlmsg_pid = nlp->nl_port;
275
276	npt->hdr = hdr;
277
278	if (hdr->nlmsg_flags & NLM_F_REQUEST &&
279	    hdr->nlmsg_type >= NLMSG_MIN_TYPE) {
280		NL_LOG(LOG_DEBUG2, "handling message with msg type: %d",
281		   hdr->nlmsg_type);
282		if (nlp->nl_linux) {
283			MPASS(linux_netlink_p != NULL);
284			error = linux_netlink_p->msg_from_linux(nlp->nl_proto,
285			    &hdr, npt);
286			if (error)
287				goto ack;
288		}
289		error = handler(hdr, npt);
290		NL_LOG(LOG_DEBUG2, "retcode: %d", error);
291	}
292ack:
293	if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) {
294		if (!npt->nw->suppress_ack) {
295			NL_LOG(LOG_DEBUG3, "ack");
296			nlmsg_ack(nlp, error, hdr, npt);
297		}
298	}
299
300	return (0);
301}
302
303static void
304npt_clear(struct nl_pstate *npt)
305{
306	lb_clear(&npt->lb);
307	npt->error = 0;
308	npt->err_msg = NULL;
309	npt->err_off = 0;
310	npt->hdr = NULL;
311	npt->nw->suppress_ack = false;
312}
313
314/*
315 * Processes an incoming packet, which can contain multiple netlink messages
316 */
317static bool
318nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp)
319{
320	struct nlmsghdr *hdr;
321	int error;
322
323	NL_LOG(LOG_DEBUG3, "RX netlink buf %p on %p", nb, nlp->nl_socket);
324
325	struct nl_writer nw = {};
326	if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp)) {
327		NL_LOG(LOG_DEBUG, "error allocating socket writer");
328		return (true);
329	}
330
331	nlmsg_ignore_limit(&nw);
332
333	struct nl_pstate npt = {
334		.nlp = nlp,
335		.lb.base = &nb->data[roundup2(nb->datalen, 8)],
336		.lb.size = nb->buflen - roundup2(nb->datalen, 8),
337		.nw = &nw,
338		.strict = nlp->nl_flags & NLF_STRICT,
339	};
340
341	for (; nb->offset + sizeof(struct nlmsghdr) <= nb->datalen;) {
342		hdr = (struct nlmsghdr *)&nb->data[nb->offset];
343		/* Save length prior to calling handler */
344		int msglen = NLMSG_ALIGN(hdr->nlmsg_len);
345		NL_LOG(LOG_DEBUG3, "parsing offset %d/%d",
346		    nb->offset, nb->datalen);
347		npt_clear(&npt);
348		error = nl_receive_message(hdr, nb->datalen - nb->offset, nlp,
349		    &npt);
350		nb->offset += msglen;
351		if (__predict_false(error != 0 || nlp->nl_tx_blocked))
352			break;
353	}
354	NL_LOG(LOG_DEBUG3, "packet parsing done");
355	nlmsg_flush(&nw);
356
357	if (nlp->nl_tx_blocked) {
358		NLP_LOCK(nlp);
359		nlp->nl_tx_blocked = false;
360		NLP_UNLOCK(nlp);
361		return (false);
362	} else
363		return (true);
364}
365