icl.c revision 265501
1/*-
2 * Copyright (c) 2012 The FreeBSD Foundation
3 * All rights reserved.
4 *
5 * This software was developed by Edward Tomasz Napierala under sponsorship
6 * from the FreeBSD Foundation.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * $FreeBSD: stable/10/sys/dev/iscsi/icl.c 265501 2014-05-07 06:46:59Z trasz $
30 */
31
32/*
33 * iSCSI Common Layer.  It's used by both the initiator and target to send
34 * and receive iSCSI PDUs.
35 */
36
37#include <sys/param.h>
38#include <sys/capability.h>
39#include <sys/condvar.h>
40#include <sys/conf.h>
41#include <sys/file.h>
42#include <sys/kernel.h>
43#include <sys/kthread.h>
44#include <sys/lock.h>
45#include <sys/mbuf.h>
46#include <sys/mutex.h>
47#include <sys/module.h>
48#include <sys/socket.h>
49#include <sys/socketvar.h>
50#include <sys/sysctl.h>
51#include <sys/systm.h>
52#include <sys/sx.h>
53#include <sys/uio.h>
54#include <vm/uma.h>
55#include <netinet/in.h>
56#include <netinet/tcp.h>
57
58#include "icl.h"
59#include "iscsi_proto.h"
60
61SYSCTL_NODE(_kern, OID_AUTO, icl, CTLFLAG_RD, 0, "iSCSI Common Layer");
62static int debug = 1;
63TUNABLE_INT("kern.icl.debug", &debug);
64SYSCTL_INT(_kern_icl, OID_AUTO, debug, CTLFLAG_RWTUN,
65    &debug, 1, "Enable debug messages");
66static int partial_receive_len = 1 * 1024; /* XXX: More? */
67TUNABLE_INT("kern.icl.partial_receive_len", &partial_receive_len);
68SYSCTL_INT(_kern_icl, OID_AUTO, partial_receive_len, CTLFLAG_RWTUN,
69    &partial_receive_len, 1 * 1024, "Minimum read size for partially received "
70    "data segment");
71static int sendspace = 1048576;
72TUNABLE_INT("kern.icl.sendspace", &sendspace);
73SYSCTL_INT(_kern_icl, OID_AUTO, sendspace, CTLFLAG_RWTUN,
74    &sendspace, 1048576, "Default send socket buffer size");
75static int recvspace = 1048576;
76TUNABLE_INT("kern.icl.recvspace", &recvspace);
77SYSCTL_INT(_kern_icl, OID_AUTO, recvspace, CTLFLAG_RWTUN,
78    &recvspace, 1048576, "Default receive socket buffer size");
79
80static uma_zone_t icl_conn_zone;
81static uma_zone_t icl_pdu_zone;
82
83static volatile u_int	icl_ncons;
84
/*
 * Logging helpers.  Both prefix the message with the calling function's
 * name and append a newline; X must be a string literal format.
 * ICL_DEBUG() prints when debug > 1, ICL_WARN() when debug > 0.
 *
 * The bodies are wrapped in do { } while (0) so that each macro expands
 * to exactly one statement and is safe as the unbraced body of an
 * if/else.
 */
#define	ICL_DEBUG(X, ...)						\
	do {								\
		if (debug > 1) {					\
			printf("%s: " X "\n",				\
			    __func__, ## __VA_ARGS__);			\
		}							\
	} while (0)

#define	ICL_WARN(X, ...)						\
	do {								\
		if (debug > 0) {					\
			printf("WARNING: %s: " X "\n",			\
			    __func__, ## __VA_ARGS__);			\
		}							\
	} while (0)

/*
 * Wrappers around the connection mutex (ic_lock).  The argument is
 * parenthesized so that any pointer-valued expression can be passed.
 */
#define ICL_CONN_LOCK(X)		mtx_lock((X)->ic_lock)
#define ICL_CONN_UNLOCK(X)		mtx_unlock((X)->ic_lock)
#define ICL_CONN_LOCK_ASSERT(X)		mtx_assert((X)->ic_lock, MA_OWNED)
#define ICL_CONN_LOCK_ASSERT_NOT(X)	mtx_assert((X)->ic_lock, MA_NOTOWNED)
100
101static void
102icl_conn_fail(struct icl_conn *ic)
103{
104	if (ic->ic_socket == NULL)
105		return;
106
107	/*
108	 * XXX
109	 */
110	ic->ic_socket->so_error = EDOOFUS;
111	(ic->ic_error)(ic);
112}
113
114static struct mbuf *
115icl_conn_receive(struct icl_conn *ic, size_t len)
116{
117	struct uio uio;
118	struct socket *so;
119	struct mbuf *m;
120	int error, flags;
121
122	so = ic->ic_socket;
123
124	memset(&uio, 0, sizeof(uio));
125	uio.uio_resid = len;
126
127	flags = MSG_DONTWAIT;
128	error = soreceive(so, NULL, &uio, &m, NULL, &flags);
129	if (error != 0) {
130		ICL_DEBUG("soreceive error %d", error);
131		return (NULL);
132	}
133	if (uio.uio_resid != 0) {
134		m_freem(m);
135		ICL_DEBUG("short read");
136		return (NULL);
137	}
138
139	return (m);
140}
141
142static struct icl_pdu *
143icl_pdu_new(struct icl_conn *ic, int flags)
144{
145	struct icl_pdu *ip;
146
147#ifdef DIAGNOSTIC
148	refcount_acquire(&ic->ic_outstanding_pdus);
149#endif
150	ip = uma_zalloc(icl_pdu_zone, flags | M_ZERO);
151	if (ip == NULL) {
152		ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
153#ifdef DIAGNOSTIC
154		refcount_release(&ic->ic_outstanding_pdus);
155#endif
156		return (NULL);
157	}
158
159	ip->ip_conn = ic;
160
161	return (ip);
162}
163
164void
165icl_pdu_free(struct icl_pdu *ip)
166{
167	struct icl_conn *ic;
168
169	ic = ip->ip_conn;
170
171	m_freem(ip->ip_bhs_mbuf);
172	m_freem(ip->ip_ahs_mbuf);
173	m_freem(ip->ip_data_mbuf);
174	uma_zfree(icl_pdu_zone, ip);
175#ifdef DIAGNOSTIC
176	refcount_release(&ic->ic_outstanding_pdus);
177#endif
178}
179
180/*
181 * Allocate icl_pdu with empty BHS to fill up by the caller.
182 */
183struct icl_pdu *
184icl_pdu_new_bhs(struct icl_conn *ic, int flags)
185{
186	struct icl_pdu *ip;
187
188	ip = icl_pdu_new(ic, flags);
189	if (ip == NULL)
190		return (NULL);
191
192	ip->ip_bhs_mbuf = m_getm2(NULL, sizeof(struct iscsi_bhs),
193	    flags, MT_DATA, M_PKTHDR);
194	if (ip->ip_bhs_mbuf == NULL) {
195		ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
196		icl_pdu_free(ip);
197		return (NULL);
198	}
199	ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *);
200	memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs));
201	ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs);
202
203	return (ip);
204}
205
206static int
207icl_pdu_ahs_length(const struct icl_pdu *request)
208{
209
210	return (request->ip_bhs->bhs_total_ahs_len * 4);
211}
212
213size_t
214icl_pdu_data_segment_length(const struct icl_pdu *request)
215{
216	uint32_t len = 0;
217
218	len += request->ip_bhs->bhs_data_segment_len[0];
219	len <<= 8;
220	len += request->ip_bhs->bhs_data_segment_len[1];
221	len <<= 8;
222	len += request->ip_bhs->bhs_data_segment_len[2];
223
224	return (len);
225}
226
227static void
228icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len)
229{
230
231	response->ip_bhs->bhs_data_segment_len[2] = len;
232	response->ip_bhs->bhs_data_segment_len[1] = len >> 8;
233	response->ip_bhs->bhs_data_segment_len[0] = len >> 16;
234}
235
236static size_t
237icl_pdu_padding(const struct icl_pdu *ip)
238{
239
240	if ((ip->ip_data_len % 4) != 0)
241		return (4 - (ip->ip_data_len % 4));
242
243	return (0);
244}
245
246static size_t
247icl_pdu_size(const struct icl_pdu *response)
248{
249	size_t len;
250
251	KASSERT(response->ip_ahs_len == 0, ("responding with AHS"));
252
253	len = sizeof(struct iscsi_bhs) + response->ip_data_len +
254	    icl_pdu_padding(response);
255	if (response->ip_conn->ic_header_crc32c)
256		len += ISCSI_HEADER_DIGEST_SIZE;
257	if (response->ip_data_len != 0 && response->ip_conn->ic_data_crc32c)
258		len += ISCSI_DATA_DIGEST_SIZE;
259
260	return (len);
261}
262
263static int
264icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep)
265{
266	struct mbuf *m;
267
268	m = icl_conn_receive(request->ip_conn, sizeof(struct iscsi_bhs));
269	if (m == NULL) {
270		ICL_DEBUG("failed to receive BHS");
271		return (-1);
272	}
273
274	request->ip_bhs_mbuf = m_pullup(m, sizeof(struct iscsi_bhs));
275	if (request->ip_bhs_mbuf == NULL) {
276		ICL_WARN("m_pullup failed");
277		return (-1);
278	}
279	request->ip_bhs = mtod(request->ip_bhs_mbuf, struct iscsi_bhs *);
280
281	/*
282	 * XXX: For architectures with strict alignment requirements
283	 * 	we may need to allocate ip_bhs and copy the data into it.
284	 * 	For some reason, though, not doing this doesn't seem
285	 * 	to cause problems; tested on sparc64.
286	 */
287
288	*availablep -= sizeof(struct iscsi_bhs);
289	return (0);
290}
291
292static int
293icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep)
294{
295
296	request->ip_ahs_len = icl_pdu_ahs_length(request);
297	if (request->ip_ahs_len == 0)
298		return (0);
299
300	request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn,
301	    request->ip_ahs_len);
302	if (request->ip_ahs_mbuf == NULL) {
303		ICL_DEBUG("failed to receive AHS");
304		return (-1);
305	}
306
307	*availablep -= request->ip_ahs_len;
308	return (0);
309}
310
311static uint32_t
312icl_mbuf_to_crc32c(const struct mbuf *m0)
313{
314	uint32_t digest = 0xffffffff;
315	const struct mbuf *m;
316
317	for (m = m0; m != NULL; m = m->m_next)
318		digest = calculate_crc32c(digest,
319		    mtod(m, const void *), m->m_len);
320
321	digest = digest ^ 0xffffffff;
322
323	return (digest);
324}
325
326static int
327icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep)
328{
329	struct mbuf *m;
330	uint32_t received_digest, valid_digest;
331
332	if (request->ip_conn->ic_header_crc32c == false)
333		return (0);
334
335	m = icl_conn_receive(request->ip_conn, ISCSI_HEADER_DIGEST_SIZE);
336	if (m == NULL) {
337		ICL_DEBUG("failed to receive header digest");
338		return (-1);
339	}
340
341	CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE);
342	m_copydata(m, 0, ISCSI_HEADER_DIGEST_SIZE, (void *)&received_digest);
343	m_freem(m);
344
345	*availablep -= ISCSI_HEADER_DIGEST_SIZE;
346
347	/*
348	 * XXX: Handle AHS.
349	 */
350	valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
351	if (received_digest != valid_digest) {
352		ICL_WARN("header digest check failed; got 0x%x, "
353		    "should be 0x%x", received_digest, valid_digest);
354		return (-1);
355	}
356
357	return (0);
358}
359
360/*
361 * Return the number of bytes that should be waiting in the receive socket
362 * before icl_pdu_receive_data_segment() gets called.
363 */
364static size_t
365icl_pdu_data_segment_receive_len(const struct icl_pdu *request)
366{
367	size_t len;
368
369	len = icl_pdu_data_segment_length(request);
370	if (len == 0)
371		return (0);
372
373	/*
374	 * Account for the parts of data segment already read from
375	 * the socket buffer.
376	 */
377	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
378	len -= request->ip_data_len;
379
380	/*
381	 * Don't always wait for the full data segment to be delivered
382	 * to the socket; this might badly affect performance due to
383	 * TCP window scaling.
384	 */
385	if (len > partial_receive_len) {
386#if 0
387		ICL_DEBUG("need %zd bytes of data, limiting to %zd",
388		    len, partial_receive_len));
389#endif
390		len = partial_receive_len;
391
392		return (len);
393	}
394
395	/*
396	 * Account for padding.  Note that due to the way code is written,
397	 * the icl_pdu_receive_data_segment() must always receive padding
398	 * along with the last part of data segment, because it would be
399	 * impossible to tell whether we've already received the full data
400	 * segment including padding, or without it.
401	 */
402	if ((len % 4) != 0)
403		len += 4 - (len % 4);
404
405#if 0
406	ICL_DEBUG("need %zd bytes of data", len));
407#endif
408
409	return (len);
410}
411
412static int
413icl_pdu_receive_data_segment(struct icl_pdu *request,
414    size_t *availablep, bool *more_neededp)
415{
416	struct icl_conn *ic;
417	size_t len, padding = 0;
418	struct mbuf *m;
419
420	ic = request->ip_conn;
421
422	*more_neededp = false;
423	ic->ic_receive_len = 0;
424
425	len = icl_pdu_data_segment_length(request);
426	if (len == 0)
427		return (0);
428
429	if ((len % 4) != 0)
430		padding = 4 - (len % 4);
431
432	/*
433	 * Account for already received parts of data segment.
434	 */
435	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
436	len -= request->ip_data_len;
437
438	if (len + padding > *availablep) {
439		/*
440		 * Not enough data in the socket buffer.  Receive as much
441		 * as we can.  Don't receive padding, since, obviously, it's
442		 * not the end of data segment yet.
443		 */
444#if 0
445		ICL_DEBUG("limited from %zd to %zd",
446		    len + padding, *availablep - padding));
447#endif
448		len = *availablep - padding;
449		*more_neededp = true;
450		padding = 0;
451	}
452
453	/*
454	 * Must not try to receive padding without at least one byte
455	 * of actual data segment.
456	 */
457	if (len > 0) {
458		m = icl_conn_receive(request->ip_conn, len + padding);
459		if (m == NULL) {
460			ICL_DEBUG("failed to receive data segment");
461			return (-1);
462		}
463
464		if (request->ip_data_mbuf == NULL)
465			request->ip_data_mbuf = m;
466		else
467			m_cat(request->ip_data_mbuf, m);
468
469		request->ip_data_len += len;
470		*availablep -= len + padding;
471	} else
472		ICL_DEBUG("len 0");
473
474	if (*more_neededp)
475		ic->ic_receive_len =
476		    icl_pdu_data_segment_receive_len(request);
477
478	return (0);
479}
480
481static int
482icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep)
483{
484	struct mbuf *m;
485	uint32_t received_digest, valid_digest;
486
487	if (request->ip_conn->ic_data_crc32c == false)
488		return (0);
489
490	if (request->ip_data_len == 0)
491		return (0);
492
493	m = icl_conn_receive(request->ip_conn, ISCSI_DATA_DIGEST_SIZE);
494	if (m == NULL) {
495		ICL_DEBUG("failed to receive data digest");
496		return (-1);
497	}
498
499	CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE);
500	m_copydata(m, 0, ISCSI_DATA_DIGEST_SIZE, (void *)&received_digest);
501	m_freem(m);
502
503	*availablep -= ISCSI_DATA_DIGEST_SIZE;
504
505	/*
506	 * Note that ip_data_mbuf also contains padding; since digest
507	 * calculation is supposed to include that, we iterate over
508	 * the entire ip_data_mbuf chain, not just ip_data_len bytes of it.
509	 */
510	valid_digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
511	if (received_digest != valid_digest) {
512		ICL_WARN("data digest check failed; got 0x%x, "
513		    "should be 0x%x", received_digest, valid_digest);
514		return (-1);
515	}
516
517	return (0);
518}
519
520/*
521 * Somewhat contrary to the name, this attempts to receive only one
522 * "part" of PDU at a time; call it repeatedly until it returns non-NULL.
523 */
524static struct icl_pdu *
525icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
526{
527	struct icl_pdu *request;
528	struct socket *so;
529	size_t len;
530	int error;
531	bool more_needed;
532
533	so = ic->ic_socket;
534
535	if (ic->ic_receive_state == ICL_CONN_STATE_BHS) {
536		KASSERT(ic->ic_receive_pdu == NULL,
537		    ("ic->ic_receive_pdu != NULL"));
538		request = icl_pdu_new(ic, M_NOWAIT);
539		if (request == NULL) {
540			ICL_DEBUG("failed to allocate PDU; "
541			    "dropping connection");
542			icl_conn_fail(ic);
543			return (NULL);
544		}
545		ic->ic_receive_pdu = request;
546	} else {
547		KASSERT(ic->ic_receive_pdu != NULL,
548		    ("ic->ic_receive_pdu == NULL"));
549		request = ic->ic_receive_pdu;
550	}
551
552	if (*availablep < ic->ic_receive_len) {
553#if 0
554		ICL_DEBUG("not enough data; need %zd, "
555		    "have %zd", ic->ic_receive_len, *availablep);
556#endif
557		return (NULL);
558	}
559
560	switch (ic->ic_receive_state) {
561	case ICL_CONN_STATE_BHS:
562		//ICL_DEBUG("receiving BHS");
563		error = icl_pdu_receive_bhs(request, availablep);
564		if (error != 0) {
565			ICL_DEBUG("failed to receive BHS; "
566			    "dropping connection");
567			break;
568		}
569
570		/*
571		 * We don't enforce any limit for AHS length;
572		 * its length is stored in 8 bit field.
573		 */
574
575		len = icl_pdu_data_segment_length(request);
576		if (len > ic->ic_max_data_segment_length) {
577			ICL_WARN("received data segment "
578			    "length %zd is larger than negotiated "
579			    "MaxDataSegmentLength %zd; "
580			    "dropping connection",
581			    len, ic->ic_max_data_segment_length);
582			error = EINVAL;
583			break;
584		}
585
586		ic->ic_receive_state = ICL_CONN_STATE_AHS;
587		ic->ic_receive_len = icl_pdu_ahs_length(request);
588		break;
589
590	case ICL_CONN_STATE_AHS:
591		//ICL_DEBUG("receiving AHS");
592		error = icl_pdu_receive_ahs(request, availablep);
593		if (error != 0) {
594			ICL_DEBUG("failed to receive AHS; "
595			    "dropping connection");
596			break;
597		}
598		ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST;
599		if (ic->ic_header_crc32c == false)
600			ic->ic_receive_len = 0;
601		else
602			ic->ic_receive_len = ISCSI_HEADER_DIGEST_SIZE;
603		break;
604
605	case ICL_CONN_STATE_HEADER_DIGEST:
606		//ICL_DEBUG("receiving header digest");
607		error = icl_pdu_check_header_digest(request, availablep);
608		if (error != 0) {
609			ICL_DEBUG("header digest failed; "
610			    "dropping connection");
611			break;
612		}
613
614		ic->ic_receive_state = ICL_CONN_STATE_DATA;
615		ic->ic_receive_len =
616		    icl_pdu_data_segment_receive_len(request);
617		break;
618
619	case ICL_CONN_STATE_DATA:
620		//ICL_DEBUG("receiving data segment");
621		error = icl_pdu_receive_data_segment(request, availablep,
622		    &more_needed);
623		if (error != 0) {
624			ICL_DEBUG("failed to receive data segment;"
625			    "dropping connection");
626			break;
627		}
628
629		if (more_needed)
630			break;
631
632		ic->ic_receive_state = ICL_CONN_STATE_DATA_DIGEST;
633		if (request->ip_data_len == 0 || ic->ic_data_crc32c == false)
634			ic->ic_receive_len = 0;
635		else
636			ic->ic_receive_len = ISCSI_DATA_DIGEST_SIZE;
637		break;
638
639	case ICL_CONN_STATE_DATA_DIGEST:
640		//ICL_DEBUG("receiving data digest");
641		error = icl_pdu_check_data_digest(request, availablep);
642		if (error != 0) {
643			ICL_DEBUG("data digest failed; "
644			    "dropping connection");
645			break;
646		}
647
648		/*
649		 * We've received complete PDU; reset the receive state machine
650		 * and return the PDU.
651		 */
652		ic->ic_receive_state = ICL_CONN_STATE_BHS;
653		ic->ic_receive_len = sizeof(struct iscsi_bhs);
654		ic->ic_receive_pdu = NULL;
655		return (request);
656
657	default:
658		panic("invalid ic_receive_state %d\n", ic->ic_receive_state);
659	}
660
661	if (error != 0) {
662		icl_pdu_free(request);
663		icl_conn_fail(ic);
664	}
665
666	return (NULL);
667}
668
669static void
670icl_conn_receive_pdus(struct icl_conn *ic, size_t available)
671{
672	struct icl_pdu *response;
673	struct socket *so;
674
675	so = ic->ic_socket;
676
677	/*
678	 * This can never happen; we're careful to only mess with ic->ic_socket
679	 * pointer when the send/receive threads are not running.
680	 */
681	KASSERT(so != NULL, ("NULL socket"));
682
683	for (;;) {
684		if (ic->ic_disconnecting)
685			return;
686
687		if (so->so_error != 0) {
688			ICL_DEBUG("connection error %d; "
689			    "dropping connection", so->so_error);
690			icl_conn_fail(ic);
691			return;
692		}
693
694		/*
695		 * Loop until we have a complete PDU or there is not enough
696		 * data in the socket buffer.
697		 */
698		if (available < ic->ic_receive_len) {
699#if 0
700			ICL_DEBUG("not enough data; have %zd, "
701			    "need %zd", available,
702			    ic->ic_receive_len);
703#endif
704			return;
705		}
706
707		response = icl_conn_receive_pdu(ic, &available);
708		if (response == NULL)
709			continue;
710
711		if (response->ip_ahs_len > 0) {
712			ICL_WARN("received PDU with unsupported "
713			    "AHS; opcode 0x%x; dropping connection",
714			    response->ip_bhs->bhs_opcode);
715			icl_pdu_free(response);
716			icl_conn_fail(ic);
717			return;
718		}
719
720		(ic->ic_receive)(response);
721	}
722}
723
724static void
725icl_receive_thread(void *arg)
726{
727	struct icl_conn *ic;
728	size_t available;
729	struct socket *so;
730
731	ic = arg;
732	so = ic->ic_socket;
733
734	ICL_CONN_LOCK(ic);
735	ic->ic_receive_running = true;
736	ICL_CONN_UNLOCK(ic);
737
738	for (;;) {
739		if (ic->ic_disconnecting) {
740			//ICL_DEBUG("terminating");
741			break;
742		}
743
744		SOCKBUF_LOCK(&so->so_rcv);
745		available = so->so_rcv.sb_cc;
746		if (available < ic->ic_receive_len) {
747			so->so_rcv.sb_lowat = ic->ic_receive_len;
748			cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx);
749		}
750		SOCKBUF_UNLOCK(&so->so_rcv);
751
752		icl_conn_receive_pdus(ic, available);
753	}
754
755	ICL_CONN_LOCK(ic);
756	ic->ic_receive_running = false;
757	ICL_CONN_UNLOCK(ic);
758	kthread_exit();
759}
760
761static int
762icl_soupcall_receive(struct socket *so, void *arg, int waitflag)
763{
764	struct icl_conn *ic;
765
766	ic = arg;
767	cv_signal(&ic->ic_receive_cv);
768	return (SU_OK);
769}
770
771static int
772icl_pdu_send(struct icl_pdu *request)
773{
774	size_t padding, pdu_len;
775	uint32_t digest, zero = 0;
776	int error, ok;
777	struct socket *so;
778	struct icl_conn *ic;
779
780	ic = request->ip_conn;
781	so = request->ip_conn->ic_socket;
782
783	ICL_CONN_LOCK_ASSERT(ic);
784
785	icl_pdu_set_data_segment_length(request, request->ip_data_len);
786
787	pdu_len = icl_pdu_size(request);
788
789	if (ic->ic_header_crc32c) {
790		digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
791		ok = m_append(request->ip_bhs_mbuf, sizeof(digest),
792		    (void *)&digest);
793		if (ok != 1) {
794			ICL_WARN("failed to append header digest");
795			return (1);
796		}
797	}
798
799	if (request->ip_data_len != 0) {
800		padding = icl_pdu_padding(request);
801		if (padding > 0) {
802			ok = m_append(request->ip_data_mbuf, padding,
803			    (void *)&zero);
804			if (ok != 1) {
805				ICL_WARN("failed to append padding");
806				return (1);
807			}
808		}
809
810		if (ic->ic_data_crc32c) {
811			digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
812
813			ok = m_append(request->ip_data_mbuf, sizeof(digest),
814			    (void *)&digest);
815			if (ok != 1) {
816				ICL_WARN("failed to append header digest");
817				return (1);
818			}
819		}
820
821		m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf);
822		request->ip_data_mbuf = NULL;
823	}
824
825	request->ip_bhs_mbuf->m_pkthdr.len = pdu_len;
826
827	error = sosend(so, NULL, NULL, request->ip_bhs_mbuf,
828	    NULL, MSG_DONTWAIT, curthread);
829	request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */
830	if (error != 0) {
831		ICL_DEBUG("sosend error %d", error);
832		return (error);
833	}
834
835	return (0);
836}
837
838static void
839icl_conn_send_pdus(struct icl_conn *ic)
840{
841	struct icl_pdu *request;
842	struct socket *so;
843	size_t available, size;
844	int error;
845
846	ICL_CONN_LOCK_ASSERT(ic);
847
848	so = ic->ic_socket;
849
850	SOCKBUF_LOCK(&so->so_snd);
851	available = sbspace(&so->so_snd);
852	SOCKBUF_UNLOCK(&so->so_snd);
853
854	while (!STAILQ_EMPTY(&ic->ic_to_send)) {
855		if (ic->ic_disconnecting)
856			return;
857
858		request = STAILQ_FIRST(&ic->ic_to_send);
859		size = icl_pdu_size(request);
860		if (available < size) {
861			/*
862			 * Set the low watermark on the socket,
863			 * to avoid waking up until there is enough
864			 * space.
865			 */
866			SOCKBUF_LOCK(&so->so_snd);
867			so->so_snd.sb_lowat = size;
868			SOCKBUF_UNLOCK(&so->so_snd);
869#if 1
870			ICL_DEBUG("no space to send; "
871			    "have %zd, need %zd",
872			    available, size);
873#endif
874			return;
875		}
876		available -= size;
877		STAILQ_REMOVE_HEAD(&ic->ic_to_send, ip_next);
878		error = icl_pdu_send(request);
879		if (error != 0) {
880			ICL_DEBUG("failed to send PDU; "
881			    "dropping connection");
882			icl_conn_fail(ic);
883			return;
884		}
885		icl_pdu_free(request);
886	}
887}
888
889static void
890icl_send_thread(void *arg)
891{
892	struct icl_conn *ic;
893
894	ic = arg;
895
896	ICL_CONN_LOCK(ic);
897	ic->ic_send_running = true;
898
899	for (;;) {
900		if (ic->ic_disconnecting) {
901			//ICL_DEBUG("terminating");
902			break;
903		}
904		icl_conn_send_pdus(ic);
905		cv_wait(&ic->ic_send_cv, ic->ic_lock);
906	}
907
908	ic->ic_send_running = false;
909	ICL_CONN_UNLOCK(ic);
910	kthread_exit();
911}
912
913static int
914icl_soupcall_send(struct socket *so, void *arg, int waitflag)
915{
916	struct icl_conn *ic;
917
918	ic = arg;
919	cv_signal(&ic->ic_send_cv);
920	return (SU_OK);
921}
922
923int
924icl_pdu_append_data(struct icl_pdu *request, const void *addr, size_t len, int flags)
925{
926	struct mbuf *mb, *newmb;
927	size_t copylen, off = 0;
928
929	KASSERT(len > 0, ("len == 0"));
930
931	newmb = m_getm2(NULL, len, flags, MT_DATA, M_PKTHDR);
932	if (newmb == NULL) {
933		ICL_WARN("failed to allocate mbuf for %zd bytes", len);
934		return (ENOMEM);
935	}
936
937	for (mb = newmb; mb != NULL; mb = mb->m_next) {
938		copylen = min(M_TRAILINGSPACE(mb), len - off);
939		memcpy(mtod(mb, char *), (const char *)addr + off, copylen);
940		mb->m_len = copylen;
941		off += copylen;
942	}
943	KASSERT(off == len, ("%s: off != len", __func__));
944
945	if (request->ip_data_mbuf == NULL) {
946		request->ip_data_mbuf = newmb;
947		request->ip_data_len = len;
948	} else {
949		m_cat(request->ip_data_mbuf, newmb);
950		request->ip_data_len += len;
951	}
952
953	return (0);
954}
955
956void
957icl_pdu_get_data(struct icl_pdu *ip, size_t off, void *addr, size_t len)
958{
959
960	m_copydata(ip->ip_data_mbuf, off, len, addr);
961}
962
963void
964icl_pdu_queue(struct icl_pdu *ip)
965{
966	struct icl_conn *ic;
967
968	ic = ip->ip_conn;
969
970	ICL_CONN_LOCK_ASSERT(ic);
971
972	if (ic->ic_disconnecting || ic->ic_socket == NULL) {
973		ICL_DEBUG("icl_pdu_queue on closed connection");
974		icl_pdu_free(ip);
975		return;
976	}
977	STAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next);
978	cv_signal(&ic->ic_send_cv);
979}
980
981struct icl_conn *
982icl_conn_new(const char *name, struct mtx *lock)
983{
984	struct icl_conn *ic;
985
986	refcount_acquire(&icl_ncons);
987
988	ic = uma_zalloc(icl_conn_zone, M_WAITOK | M_ZERO);
989
990	STAILQ_INIT(&ic->ic_to_send);
991	ic->ic_lock = lock;
992	cv_init(&ic->ic_send_cv, "icl_tx");
993	cv_init(&ic->ic_receive_cv, "icl_rx");
994#ifdef DIAGNOSTIC
995	refcount_init(&ic->ic_outstanding_pdus, 0);
996#endif
997	ic->ic_max_data_segment_length = ICL_MAX_DATA_SEGMENT_LENGTH;
998	ic->ic_name = name;
999
1000	return (ic);
1001}
1002
1003void
1004icl_conn_free(struct icl_conn *ic)
1005{
1006
1007	cv_destroy(&ic->ic_send_cv);
1008	cv_destroy(&ic->ic_receive_cv);
1009	uma_zfree(icl_conn_zone, ic);
1010	refcount_release(&icl_ncons);
1011}
1012
1013static int
1014icl_conn_start(struct icl_conn *ic)
1015{
1016	size_t minspace;
1017	struct sockopt opt;
1018	int error, one = 1;
1019
1020	ICL_CONN_LOCK(ic);
1021
1022	/*
1023	 * XXX: Ugly hack.
1024	 */
1025	if (ic->ic_socket == NULL) {
1026		ICL_CONN_UNLOCK(ic);
1027		return (EINVAL);
1028	}
1029
1030	ic->ic_receive_state = ICL_CONN_STATE_BHS;
1031	ic->ic_receive_len = sizeof(struct iscsi_bhs);
1032	ic->ic_disconnecting = false;
1033
1034	ICL_CONN_UNLOCK(ic);
1035
1036	/*
1037	 * For sendspace, this is required because the current code cannot
1038	 * send a PDU in pieces; thus, the minimum buffer size is equal
1039	 * to the maximum PDU size.  "+4" is to account for possible padding.
1040	 *
1041	 * What we should actually do here is to use autoscaling, but set
1042	 * some minimal buffer size to "minspace".  I don't know a way to do
1043	 * that, though.
1044	 */
1045	minspace = sizeof(struct iscsi_bhs) + ic->ic_max_data_segment_length +
1046	    ISCSI_HEADER_DIGEST_SIZE + ISCSI_DATA_DIGEST_SIZE + 4;
1047	if (sendspace < minspace) {
1048		ICL_WARN("kern.icl.sendspace too low; must be at least %zd",
1049		    minspace);
1050		sendspace = minspace;
1051	}
1052	if (recvspace < minspace) {
1053		ICL_WARN("kern.icl.recvspace too low; must be at least %zd",
1054		    minspace);
1055		recvspace = minspace;
1056	}
1057
1058	error = soreserve(ic->ic_socket, sendspace, recvspace);
1059	if (error != 0) {
1060		ICL_WARN("soreserve failed with error %d", error);
1061		icl_conn_close(ic);
1062		return (error);
1063	}
1064
1065	/*
1066	 * Disable Nagle.
1067	 */
1068	bzero(&opt, sizeof(opt));
1069	opt.sopt_dir = SOPT_SET;
1070	opt.sopt_level = IPPROTO_TCP;
1071	opt.sopt_name = TCP_NODELAY;
1072	opt.sopt_val = &one;
1073	opt.sopt_valsize = sizeof(one);
1074	error = sosetopt(ic->ic_socket, &opt);
1075	if (error != 0) {
1076		ICL_WARN("disabling TCP_NODELAY failed with error %d", error);
1077		icl_conn_close(ic);
1078		return (error);
1079	}
1080
1081	/*
1082	 * Start threads.
1083	 */
1084	error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "%stx",
1085	    ic->ic_name);
1086	if (error != 0) {
1087		ICL_WARN("kthread_add(9) failed with error %d", error);
1088		icl_conn_close(ic);
1089		return (error);
1090	}
1091
1092	error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "%srx",
1093	    ic->ic_name);
1094	if (error != 0) {
1095		ICL_WARN("kthread_add(9) failed with error %d", error);
1096		icl_conn_close(ic);
1097		return (error);
1098	}
1099
1100	/*
1101	 * Register socket upcall, to get notified about incoming PDUs
1102	 * and free space to send outgoing ones.
1103	 */
1104	SOCKBUF_LOCK(&ic->ic_socket->so_snd);
1105	soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic);
1106	SOCKBUF_UNLOCK(&ic->ic_socket->so_snd);
1107	SOCKBUF_LOCK(&ic->ic_socket->so_rcv);
1108	soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic);
1109	SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv);
1110
1111	return (0);
1112}
1113
1114int
1115icl_conn_handoff(struct icl_conn *ic, int fd)
1116{
1117	struct file *fp;
1118	struct socket *so;
1119	cap_rights_t rights;
1120	int error;
1121
1122	ICL_CONN_LOCK_ASSERT_NOT(ic);
1123
1124	/*
1125	 * Steal the socket from userland.
1126	 */
1127	error = fget(curthread, fd,
1128	    cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
1129	if (error != 0)
1130		return (error);
1131	if (fp->f_type != DTYPE_SOCKET) {
1132		fdrop(fp, curthread);
1133		return (EINVAL);
1134	}
1135	so = fp->f_data;
1136	if (so->so_type != SOCK_STREAM) {
1137		fdrop(fp, curthread);
1138		return (EINVAL);
1139	}
1140
1141	ICL_CONN_LOCK(ic);
1142
1143	if (ic->ic_socket != NULL) {
1144		ICL_CONN_UNLOCK(ic);
1145		fdrop(fp, curthread);
1146		return (EBUSY);
1147	}
1148
1149	ic->ic_socket = fp->f_data;
1150	fp->f_ops = &badfileops;
1151	fp->f_data = NULL;
1152	fdrop(fp, curthread);
1153	ICL_CONN_UNLOCK(ic);
1154
1155	error = icl_conn_start(ic);
1156
1157	return (error);
1158}
1159
1160void
1161icl_conn_shutdown(struct icl_conn *ic)
1162{
1163	ICL_CONN_LOCK_ASSERT_NOT(ic);
1164
1165	ICL_CONN_LOCK(ic);
1166	if (ic->ic_socket == NULL) {
1167		ICL_CONN_UNLOCK(ic);
1168		return;
1169	}
1170	ICL_CONN_UNLOCK(ic);
1171
1172	soshutdown(ic->ic_socket, SHUT_RDWR);
1173}
1174
1175void
1176icl_conn_close(struct icl_conn *ic)
1177{
1178	struct icl_pdu *pdu;
1179
1180	ICL_CONN_LOCK_ASSERT_NOT(ic);
1181
1182	ICL_CONN_LOCK(ic);
1183	if (ic->ic_socket == NULL) {
1184		ICL_CONN_UNLOCK(ic);
1185		return;
1186	}
1187
1188	ic->ic_disconnecting = true;
1189
1190	/*
1191	 * Wake up the threads, so they can properly terminate.
1192	 */
1193	cv_signal(&ic->ic_receive_cv);
1194	cv_signal(&ic->ic_send_cv);
1195	while (ic->ic_receive_running || ic->ic_send_running) {
1196		//ICL_DEBUG("waiting for send/receive threads to terminate");
1197		ICL_CONN_UNLOCK(ic);
1198		cv_signal(&ic->ic_receive_cv);
1199		cv_signal(&ic->ic_send_cv);
1200		pause("icl_close", 1 * hz);
1201		ICL_CONN_LOCK(ic);
1202	}
1203	//ICL_DEBUG("send/receive threads terminated");
1204
1205	soclose(ic->ic_socket);
1206	ic->ic_socket = NULL;
1207
1208	if (ic->ic_receive_pdu != NULL) {
1209		//ICL_DEBUG("freeing partially received PDU");
1210		icl_pdu_free(ic->ic_receive_pdu);
1211		ic->ic_receive_pdu = NULL;
1212	}
1213
1214	/*
1215	 * Remove any outstanding PDUs from the send queue.
1216	 */
1217	while (!STAILQ_EMPTY(&ic->ic_to_send)) {
1218		pdu = STAILQ_FIRST(&ic->ic_to_send);
1219		STAILQ_REMOVE_HEAD(&ic->ic_to_send, ip_next);
1220		icl_pdu_free(pdu);
1221	}
1222
1223	KASSERT(STAILQ_EMPTY(&ic->ic_to_send),
1224	    ("destroying session with non-empty send queue"));
1225#ifdef DIAGNOSTIC
1226	KASSERT(ic->ic_outstanding_pdus == 0,
1227	    ("destroying session with %d outstanding PDUs",
1228	     ic->ic_outstanding_pdus));
1229#endif
1230	ICL_CONN_UNLOCK(ic);
1231}
1232
1233bool
1234icl_conn_connected(struct icl_conn *ic)
1235{
1236	ICL_CONN_LOCK_ASSERT_NOT(ic);
1237
1238	ICL_CONN_LOCK(ic);
1239	if (ic->ic_socket == NULL) {
1240		ICL_CONN_UNLOCK(ic);
1241		return (false);
1242	}
1243	if (ic->ic_socket->so_error != 0) {
1244		ICL_CONN_UNLOCK(ic);
1245		return (false);
1246	}
1247	ICL_CONN_UNLOCK(ic);
1248	return (true);
1249}
1250
#ifdef ICL_KERNEL_PROXY
/*
 * Variant of icl_conn_handoff() that takes a kernel socket directly.
 * Returns EINVAL when "so" is not a SOCK_STREAM socket, EBUSY when the
 * connection already has a socket, or the result of icl_conn_start().
 * Must be called without the connection lock held.
 */
int
icl_conn_handoff_sock(struct icl_conn *ic, struct socket *so)
{
	bool busy;

	ICL_CONN_LOCK_ASSERT_NOT(ic);

	if (so->so_type != SOCK_STREAM)
		return (EINVAL);

	ICL_CONN_LOCK(ic);
	busy = (ic->ic_socket != NULL);
	if (!busy)
		ic->ic_socket = so;
	ICL_CONN_UNLOCK(ic);

	if (busy)
		return (EBUSY);

	return (icl_conn_start(ic));
}
#endif /* ICL_KERNEL_PROXY */
1275
1276static int
1277icl_unload(void)
1278{
1279
1280	if (icl_ncons != 0)
1281		return (EBUSY);
1282
1283	uma_zdestroy(icl_conn_zone);
1284	uma_zdestroy(icl_pdu_zone);
1285
1286	return (0);
1287}
1288
1289static void
1290icl_load(void)
1291{
1292
1293	icl_conn_zone = uma_zcreate("icl_conn",
1294	    sizeof(struct icl_conn), NULL, NULL, NULL, NULL,
1295	    UMA_ALIGN_PTR, 0);
1296	icl_pdu_zone = uma_zcreate("icl_pdu",
1297	    sizeof(struct icl_pdu), NULL, NULL, NULL, NULL,
1298	    UMA_ALIGN_PTR, 0);
1299
1300	refcount_init(&icl_ncons, 0);
1301}
1302
1303static int
1304icl_modevent(module_t mod, int what, void *arg)
1305{
1306
1307	switch (what) {
1308	case MOD_LOAD:
1309		icl_load();
1310		return (0);
1311	case MOD_UNLOAD:
1312		return (icl_unload());
1313	default:
1314		return (EINVAL);
1315	}
1316}
1317
1318moduledata_t icl_data = {
1319	"icl",
1320	icl_modevent,
1321	0
1322};
1323
1324DECLARE_MODULE(icl, icl_data, SI_SUB_DRIVERS, SI_ORDER_FIRST);
1325MODULE_VERSION(icl, 1);
1326