1258945Sroberto/*
2280849Scy * Copyright (C) 2004-2012  Internet Systems Consortium, Inc. ("ISC")
3258945Sroberto * Copyright (C) 2000-2003  Internet Software Consortium.
4258945Sroberto *
5258945Sroberto * Permission to use, copy, modify, and/or distribute this software for any
6258945Sroberto * purpose with or without fee is hereby granted, provided that the above
7258945Sroberto * copyright notice and this permission notice appear in all copies.
8258945Sroberto *
9258945Sroberto * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10258945Sroberto * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11258945Sroberto * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12258945Sroberto * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13258945Sroberto * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14258945Sroberto * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15258945Sroberto * PERFORMANCE OF THIS SOFTWARE.
16258945Sroberto */
17258945Sroberto
18280849Scy/* $Id$ */
19258945Sroberto
20258945Sroberto/* This code uses functions which are only available on Server 2003 and
21258945Sroberto * higher, and Windows XP and higher.
22258945Sroberto *
23258945Sroberto * This code is by nature multithreaded and takes advantage of various
24258945Sroberto * features to pass on information through the completion port for
25258945Sroberto * when I/O is completed.  All sends, receives, accepts, and connects are
26258945Sroberto * completed through the completion port.
27258945Sroberto *
28258945Sroberto * The number of Completion Port Worker threads used is the total number
29258945Sroberto * of CPU's + 1. This increases the likelihood that a Worker Thread is
30258945Sroberto * available for processing a completed request.
31258945Sroberto *
32258945Sroberto * XXXPDM 5 August, 2002
33258945Sroberto */
34258945Sroberto
35258945Sroberto#define MAKE_EXTERNAL 1
36258945Sroberto#include <config.h>
37258945Sroberto
38258945Sroberto#include <sys/types.h>
39258945Sroberto
40258945Sroberto#ifndef _WINSOCKAPI_
41258945Sroberto#define _WINSOCKAPI_   /* Prevent inclusion of winsock.h in windows.h */
42258945Sroberto#endif
43258945Sroberto
44258945Sroberto#include <errno.h>
45258945Sroberto#include <stddef.h>
46258945Sroberto#include <stdlib.h>
47258945Sroberto#include <string.h>
48258945Sroberto#include <unistd.h>
49258945Sroberto#include <io.h>
50258945Sroberto#include <fcntl.h>
51258945Sroberto#include <process.h>
52258945Sroberto
53258945Sroberto#include <isc/buffer.h>
54258945Sroberto#include <isc/bufferlist.h>
55258945Sroberto#include <isc/condition.h>
56258945Sroberto#include <isc/list.h>
57258945Sroberto#include <isc/log.h>
58258945Sroberto#include <isc/mem.h>
59258945Sroberto#include <isc/msgs.h>
60258945Sroberto#include <isc/mutex.h>
61258945Sroberto#include <isc/net.h>
62258945Sroberto#include <isc/once.h>
63258945Sroberto#include <isc/os.h>
64258945Sroberto#include <isc/platform.h>
65258945Sroberto#include <isc/print.h>
66258945Sroberto#include <isc/region.h>
67258945Sroberto#include <isc/socket.h>
68258945Sroberto#include <isc/stats.h>
69258945Sroberto#include <isc/strerror.h>
70258945Sroberto#include <isc/syslog.h>
71258945Sroberto#include <isc/task.h>
72258945Sroberto#include <isc/thread.h>
73258945Sroberto#include <isc/util.h>
74258945Sroberto#include <isc/win32os.h>
75258945Sroberto
76258945Sroberto#include <mswsock.h>
77258945Sroberto
78258945Sroberto#include "errno2result.h"
79258945Sroberto
80258945Sroberto/*
81258945Sroberto * How in the world can Microsoft exist with APIs like this?
82258945Sroberto * We can't actually call this directly, because it turns out
83258945Sroberto * no library exports this function.  Instead, we need to
84258945Sroberto * issue a runtime call to get the address.
85258945Sroberto */
/*
 * Winsock extension entry points.  They start out unset; initialise()
 * fills each one in by calling WSAIoctl() with
 * SIO_GET_EXTENSION_FUNCTION_POINTER and the matching WSAID_* GUID.
 */
LPFN_CONNECTEX ISCConnectEx;
LPFN_ACCEPTEX ISCAcceptEx;
LPFN_GETACCEPTEXSOCKADDRS ISCGetAcceptExSockaddrs;
89258945Sroberto
90258945Sroberto/*
91258945Sroberto * Run expensive internal consistency checks.
92258945Sroberto */
/*
 * CONSISTENT(sock) calls consistent() only when
 * ISC_SOCKET_CONSISTENCY_CHECKS is defined; otherwise it expands to an
 * empty statement so call sites need no #ifdef of their own.
 */
#ifdef ISC_SOCKET_CONSISTENCY_CHECKS
#define CONSISTENT(sock) consistent(sock)
#else
#define CONSISTENT(sock) do {} while (0)
#endif
/* Declared unconditionally, even when CONSISTENT() compiles to nothing. */
static void consistent(isc_socket_t *sock);
99258945Sroberto
100258945Sroberto/*
101258945Sroberto * Define this macro to control the behavior of connection
102258945Sroberto * resets on UDP sockets.  See Microsoft KnowledgeBase Article Q263823
103258945Sroberto * for details.
104258945Sroberto * NOTE: This requires that Windows 2000 systems install Service Pack 2
105258945Sroberto * or later.
106258945Sroberto */
#ifndef SIO_UDP_CONNRESET
/* Fallback definition for when the included headers do not provide one. */
#define SIO_UDP_CONNRESET _WSAIOW(IOC_VENDOR,12)
#endif

/*
 * Some systems define the socket length argument as an int, some as size_t,
 * some as socklen_t.  This is here so it can be easily changed if needed.
 */
#ifndef ISC_SOCKADDR_LEN_T
/* Default used when nothing earlier in the build has chosen a type. */
#define ISC_SOCKADDR_LEN_T unsigned int
#endif
118258945Sroberto
119258945Sroberto/*
120258945Sroberto * Define what the possible "soft" errors can be.  These are non-fatal returns
121258945Sroberto * of various network related functions, like recv() and so on.
122258945Sroberto */
/*
 * Note that 0 ("no error") is also classified as soft, and that the
 * argument is evaluated several times, so it must be free of side effects.
 */
#define SOFT_ERROR(e)	((e) == WSAEINTR || \
			 (e) == WSAEWOULDBLOCK || \
			 (e) == EWOULDBLOCK || \
			 (e) == EINTR || \
			 (e) == EAGAIN || \
			 (e) == 0)

/*
 * Pending errors are not really errors and should be
 * kept separate
 */
/* True for WSA_IO_PENDING and for 0; the argument is evaluated twice. */
#define PENDING_ERROR(e) ((e) == WSA_IO_PENDING || (e) == 0)

/* Result codes describing the outcome of an I/O attempt. */
#define DOIO_SUCCESS	  0       /* i/o ok, event sent */
#define DOIO_SOFT	  1       /* i/o ok, soft error, no event sent */
#define DOIO_HARD	  2       /* i/o error, event sent */
#define DOIO_EOF	  3       /* EOF, no event sent */
#define DOIO_PENDING	  4       /* status when i/o is in process */
#define DOIO_NEEDMORE	  5       /* IO was processed, but we need more due to minimum */
142258945Sroberto
/*
 * Expands to three comma-separated arguments (log category, log module,
 * debug level x), so it can be dropped into a logging call's argument list.
 */
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/*
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
#define TRACE_LEVEL		90
#define CORRECTNESS_LEVEL	70
#define IOEVENT_LEVEL		60
#define EVENT_LEVEL		50
#define CREATION_LEVEL		20

/* Ready-made DLVL() argument triples, one per level defined above. */
#define TRACE		DLVL(TRACE_LEVEL)
#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define CREATION	DLVL(CREATION_LEVEL)

/* File-local alias for isc_event_t. */
typedef isc_event_t intev_t;
165258945Sroberto
166258945Sroberto/*
167258945Sroberto * Socket State
168258945Sroberto */
enum {
	SOCK_INITIALIZED = 0,	/* structure set up, nothing opened yet */
	SOCK_OPEN = 1,		/* opened, but no work to do so far */
	SOCK_DATA = 2,		/* sending or receiving data */
	SOCK_LISTEN = 3,	/* TCP: listening for connects */
	SOCK_ACCEPT = 4,	/* TCP: waiting to accept */
	SOCK_CONNECT = 5,	/* TCP: connect in progress */
	SOCK_CLOSED = 6		/* socket has been closed */
};

/* Magic number stored in isc_socket.magic, and the validity test for it. */
#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
#define VALID_SOCKET(t)		ISC_MAGIC_VALID(t, SOCKET_MAGIC)
181258945Sroberto
182258945Sroberto/*
183258945Sroberto * IPv6 control information.  If the socket is an IPv6 socket we want
184258945Sroberto * to collect the destination address and interface so the client can
185258945Sroberto * set them on outgoing packets.
186258945Sroberto */
#ifdef ISC_PLATFORM_HAVEIPV6
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

/*
 * We really don't want to try and use these control messages. Win32
 * doesn't have this mechanism before XP.
 *
 * Net effect: whatever the block above decided, USE_CMSG is undefined
 * from this point on.
 */
#undef USE_CMSG
198258945Sroberto
199258945Sroberto/*
200258945Sroberto * Message header for recvmsg and sendmsg calls.
201258945Sroberto * Used value-result for recvmsg, value only for sendmsg.
202258945Sroberto */
/*
 * NOTE(review): the closing "} msghdr;" also defines a file-scope object
 * named msghdr in addition to the struct type.  Nothing in the visible
 * part of the file uses that object -- confirm before removing it.
 */
struct msghdr {
	SOCKADDR_STORAGE to_addr;	/* UDP send/recv address */
	int      to_addr_len;		/* length of the address */
	WSABUF  *msg_iov;		/* scatter/gather array */
	u_int   msg_iovlen;             /* # elements in msg_iov */
	void	*msg_control;           /* ancillary data, see below */
	u_int   msg_controllen;         /* ancillary data buffer len */
	int	msg_totallen;		/* total length of this message */
} msghdr;

/*
 * The size to raise the receive buffer to.
 */
#define RCVBUFSIZE (32*1024)

/*
 * The number of times a send operation is repeated if the result
 * is WSAEINTR.
 */
#define NRETRIES 10
223258945Sroberto
/*
 * Per-socket state.  The section comments inside the structure say which
 * members are protected by the socket's own lock.
 */
struct isc_socket {
	/* Not locked. */
	unsigned int		magic;
	isc_socketmgr_t	       *manager;
	isc_mutex_t		lock;
	isc_sockettype_t	type;

	/* Pointers to scatter/gather buffers */
	WSABUF			iov[ISC_SOCKET_MAXSCATTERGATHER];

	/* Locked by socket lock. */
	ISC_LINK(isc_socket_t)	link;
	unsigned int		references; /* EXTERNAL references */
	SOCKET			fd;	/* file handle */
	int			pf;	/* protocol family */
	char			name[16];
	void *			tag;

	/*
	 * Each recv() call uses this buffer.  It is a per-socket receive
	 * buffer that allows us to decouple the system recv() from the
	 * recv_list done events.  This means the items on the recv_list
	 * can be removed without having to cancel pending system recv()
	 * calls.  It also allows us to read-ahead in some cases.
	 */
	struct {
		SOCKADDR_STORAGE	from_addr;	   /* UDP send/recv address */
		int		from_addr_len;	   /* length of the address */
		char		*base;		   /* the base of the buffer */
		char		*consume_position; /* where to start copying data from next */
		unsigned int	len;		   /* the actual size of this buffer */
		unsigned int	remaining;	   /* the number of bytes remaining */
	} recvbuf;

	ISC_LIST(isc_socketevent_t)		send_list;
	ISC_LIST(isc_socketevent_t)		recv_list;
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
	isc_socket_connev_t		       *connect_ev;

	isc_sockaddr_t		address;  /* remote address */

	unsigned int		listener : 1,	/* listener socket */
				connected : 1,
				pending_connect : 1, /* connect pending */
				bound : 1,	/* bound to local addr */
				dupped : 1;     /* created by isc_socket_dup() */
	unsigned int		pending_iocp;	/* Should equal the counters below. Debug. */
	unsigned int		pending_recv;  /* Number of outstanding recv() calls. */
	unsigned int		pending_send;  /* Number of outstanding send() calls. */
	unsigned int		pending_accept; /* Number of outstanding accept() calls. */
	unsigned int		state; /* Socket state. Debugging and consistency checking. */
	int			state_lineno;  /* line which last touched state */
};

/*
 * Store a new state in the socket and remember the source line of the
 * macro invocation in state_lineno.
 */
#define _set_state(sock, _state) do { (sock)->state = (_state); (sock)->state_lineno = __LINE__; } while (0)
279258945Sroberto
280258945Sroberto/*
281258945Sroberto * Buffer structure
282258945Sroberto */
typedef struct buflist buflist_t;

/* One entry of the buffer list carried by IoCompletionInfo.bufferlist. */
struct buflist {
	void			*buf;		/* the buffer itself */
	unsigned int		buflen;		/* presumably the size of buf in bytes -- confirm */
	ISC_LINK(buflist_t)	link;		/* list linkage */
};
290258945Sroberto
291258945Sroberto/*
292258945Sroberto * I/O Completion ports Info structures
293258945Sroberto */
294258945Sroberto
/*
 * Private heap for IoCompletionInfo structures; created by
 * iocompletionport_init() with HeapCreate().
 */
static HANDLE hHeapHandle = NULL;

/*
 * Per-request context handed to overlapped Winsock calls.
 */
typedef struct IoCompletionInfo {
	/*
	 * Must remain the first member: a pointer to this structure is
	 * passed where an LPWSAOVERLAPPED is expected (see
	 * internal_sendmsg()).
	 */
	OVERLAPPED		overlapped;
	isc_socketevent_t	*dev;  /* send()/recv() done event */
	isc_socket_connev_t	*cdev; /* connect() done event */
	isc_socket_newconnev_t	*adev; /* accept() done event */
	void			*acceptbuffer;
	DWORD			received_bytes;
	int			request_type;	/* presumably one of the SOCKET_* request codes -- confirm */
	struct msghdr		messagehdr;
	ISC_LIST(buflist_t)	bufferlist;	/*%< list of buffers */
} IoCompletionInfo;
307258945Sroberto
308258945Sroberto/*
309258945Sroberto * Define a maximum number of I/O Completion Port worker threads
310258945Sroberto * to handle the load on the Completion Port. The actual number
311258945Sroberto * used is the number of CPU's + 1.
312258945Sroberto */
/* Also the size of the per-thread arrays in struct isc_socketmgr. */
#define MAX_IOCPTHREADS 20

#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)

/*
 * Socket manager: owns the I/O completion port, its worker threads and
 * the list of sockets.
 */
struct isc_socketmgr {
	/* Not locked. */
	unsigned int			magic;
	isc_mem_t		       *mctx;
	isc_mutex_t			lock;
	isc_stats_t		       *stats;

	/* Locked by manager lock. */
	ISC_LIST(isc_socket_t)		socklist;
	isc_boolean_t			bShutdown;
	isc_condition_t			shutdown_ok;
	/* Created in iocompletionport_init(). */
	HANDLE				hIoCompletionPort;
	/* min(number of CPUs + 1, MAX_IOCPTHREADS); set in iocompletionport_init(). */
	int				maxIOCPThreads;
	/* Filled in by iocompletionport_createthreads(). */
	HANDLE				hIOCPThreads[MAX_IOCPTHREADS];
	DWORD				dwIOCPThreadIds[MAX_IOCPTHREADS];

	/*
	 * Debugging.
	 * Modified by InterlockedIncrement() and InterlockedDecrement()
	 */
	LONG				totalSockets;	/* decremented by socket_close() */
	LONG				iocp_total;	/* incremented by iocompletionport_update() */
};
341258945Sroberto
/* Kinds of request that can be queued on the completion port. */
enum {
	SOCKET_RECV = 0,
	SOCKET_SEND = 1,
	SOCKET_ACCEPT = 2,
	SOCKET_CONNECT = 3
};

/*
 * Upper bounds on the number of iovec entries handed to send() and
 * recv(); both follow ISC_SOCKET_MAXSCATTERGATHER.
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
#define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
354258945Sroberto
/*
 * Forward declarations of file-local (static) helpers.
 */
static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
				  isc_sockettype_t type,
				  isc_socket_t **socketp,
				  isc_socket_t *dup_socket);
/* Thread entry point passed to CreateThread() for each IOCP worker. */
static isc_threadresult_t WINAPI SocketIoThread(LPVOID ThreadContext);
static void maybe_free_socket(isc_socket_t **, int);
static void free_socket(isc_socket_t **, int);
static isc_boolean_t senddone_is_active(isc_socket_t *sock, isc_socketevent_t *dev);
static isc_boolean_t acceptdone_is_active(isc_socket_t *sock, isc_socket_newconnev_t *dev);
static isc_boolean_t connectdone_is_active(isc_socket_t *sock, isc_socket_connev_t *dev);
static void send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev);
static void send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev);
static void send_acceptdone_event(isc_socket_t *sock, isc_socket_newconnev_t **adev);
static void send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **cdev);
static void send_recvdone_abort(isc_socket_t *sock, isc_result_t result);
static void queue_receive_event(isc_socket_t *sock, isc_task_t *task, isc_socketevent_t *dev);
static void queue_receive_request(isc_socket_t *sock);
372258945Sroberto
373258945Sroberto/*
374258945Sroberto * This is used to dump the contents of the sock structure
375258945Sroberto * You should make sure that the sock is locked before
376258945Sroberto * dumping it. Since the code uses simple printf() statements
377258945Sroberto * it should only be used interactively.
378258945Sroberto */
379258945Srobertovoid
380258945Srobertosock_dump(isc_socket_t *sock) {
381258945Sroberto	isc_socketevent_t *ldev;
382258945Sroberto	isc_socket_newconnev_t *ndev;
383258945Sroberto
384258945Sroberto#if 0
385258945Sroberto	isc_sockaddr_t addr;
386258945Sroberto	char socktext[256];
387258945Sroberto
388258945Sroberto	isc_socket_getpeername(sock, &addr);
389258945Sroberto	isc_sockaddr_format(&addr, socktext, sizeof(socktext));
390258945Sroberto	printf("Remote Socket: %s\n", socktext);
391258945Sroberto	isc_socket_getsockname(sock, &addr);
392258945Sroberto	isc_sockaddr_format(&addr, socktext, sizeof(socktext));
393258945Sroberto	printf("This Socket: %s\n", socktext);
394258945Sroberto#endif
395258945Sroberto
396258945Sroberto	printf("\n\t\tSock Dump\n");
397258945Sroberto	printf("\t\tfd: %u\n", sock->fd);
398258945Sroberto	printf("\t\treferences: %d\n", sock->references);
399258945Sroberto	printf("\t\tpending_accept: %d\n", sock->pending_accept);
400258945Sroberto	printf("\t\tconnecting: %d\n", sock->pending_connect);
401258945Sroberto	printf("\t\tconnected: %d\n", sock->connected);
402258945Sroberto	printf("\t\tbound: %d\n", sock->bound);
403258945Sroberto	printf("\t\tpending_iocp: %d\n", sock->pending_iocp);
404258945Sroberto	printf("\t\tsocket type: %d\n", sock->type);
405258945Sroberto
406258945Sroberto	printf("\n\t\tSock Recv List\n");
407258945Sroberto	ldev = ISC_LIST_HEAD(sock->recv_list);
408258945Sroberto	while (ldev != NULL) {
409258945Sroberto		printf("\t\tdev: %p\n", ldev);
410258945Sroberto		ldev = ISC_LIST_NEXT(ldev, ev_link);
411258945Sroberto	}
412258945Sroberto
413258945Sroberto	printf("\n\t\tSock Send List\n");
414258945Sroberto	ldev = ISC_LIST_HEAD(sock->send_list);
415258945Sroberto	while (ldev != NULL) {
416258945Sroberto		printf("\t\tdev: %p\n", ldev);
417258945Sroberto		ldev = ISC_LIST_NEXT(ldev, ev_link);
418258945Sroberto	}
419258945Sroberto
420258945Sroberto	printf("\n\t\tSock Accept List\n");
421258945Sroberto	ndev = ISC_LIST_HEAD(sock->accept_list);
422258945Sroberto	while (ndev != NULL) {
423258945Sroberto		printf("\t\tdev: %p\n", ldev);
424258945Sroberto		ndev = ISC_LIST_NEXT(ndev, ev_link);
425258945Sroberto	}
426258945Sroberto}
427258945Sroberto
428258945Srobertostatic void
429258945Srobertosocket_log(int lineno, isc_socket_t *sock, isc_sockaddr_t *address,
430258945Sroberto	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
431258945Sroberto	   isc_msgcat_t *msgcat, int msgset, int message,
432258945Sroberto	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
433258945Sroberto
434258945Sroberto/*  This function will add an entry to the I/O completion port
435258945Sroberto *  that will signal the I/O thread to exit (gracefully)
436258945Sroberto */
437258945Srobertostatic void
438258945Srobertosignal_iocompletionport_exit(isc_socketmgr_t *manager) {
439258945Sroberto	int i;
440258945Sroberto	int errval;
441258945Sroberto	char strbuf[ISC_STRERRORSIZE];
442258945Sroberto
443258945Sroberto	REQUIRE(VALID_MANAGER(manager));
444258945Sroberto	for (i = 0; i < manager->maxIOCPThreads; i++) {
445258945Sroberto		if (!PostQueuedCompletionStatus(manager->hIoCompletionPort,
446258945Sroberto						0, 0, 0)) {
447258945Sroberto			errval = GetLastError();
448258945Sroberto			isc__strerror(errval, strbuf, sizeof(strbuf));
449258945Sroberto			FATAL_ERROR(__FILE__, __LINE__,
450258945Sroberto				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
451258945Sroberto				ISC_MSG_FAILED,
452258945Sroberto				"Can't request service thread to exit: %s"),
453258945Sroberto				strbuf);
454258945Sroberto		}
455258945Sroberto	}
456258945Sroberto}
457258945Sroberto
458258945Sroberto/*
459258945Sroberto * Create the worker threads for the I/O Completion Port
460258945Sroberto */
461258945Srobertovoid
462258945Srobertoiocompletionport_createthreads(int total_threads, isc_socketmgr_t *manager) {
463258945Sroberto	int errval;
464258945Sroberto	char strbuf[ISC_STRERRORSIZE];
465258945Sroberto	int i;
466258945Sroberto
467258945Sroberto	INSIST(total_threads > 0);
468258945Sroberto	REQUIRE(VALID_MANAGER(manager));
469258945Sroberto	/*
470258945Sroberto	 * We need at least one
471258945Sroberto	 */
472258945Sroberto	for (i = 0; i < total_threads; i++) {
473258945Sroberto		manager->hIOCPThreads[i] = CreateThread(NULL, 0, SocketIoThread,
474258945Sroberto						manager, 0,
475258945Sroberto						&manager->dwIOCPThreadIds[i]);
476258945Sroberto		if (manager->hIOCPThreads[i] == NULL) {
477258945Sroberto			errval = GetLastError();
478258945Sroberto			isc__strerror(errval, strbuf, sizeof(strbuf));
479258945Sroberto			FATAL_ERROR(__FILE__, __LINE__,
480258945Sroberto				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
481258945Sroberto				ISC_MSG_FAILED,
482258945Sroberto				"Can't create IOCP thread: %s"),
483258945Sroberto				strbuf);
484258945Sroberto			exit(1);
485258945Sroberto		}
486258945Sroberto	}
487258945Sroberto}
488258945Sroberto
489258945Sroberto/*
490258945Sroberto *  Create/initialise the I/O completion port
491258945Sroberto */
492258945Srobertovoid
493258945Srobertoiocompletionport_init(isc_socketmgr_t *manager) {
494258945Sroberto	int errval;
495258945Sroberto	char strbuf[ISC_STRERRORSIZE];
496258945Sroberto
497258945Sroberto	REQUIRE(VALID_MANAGER(manager));
498258945Sroberto	/*
499258945Sroberto	 * Create a private heap to handle the socket overlapped structure
500258945Sroberto	 * The minimum number of structures is 10, there is no maximum
501258945Sroberto	 */
502258945Sroberto	hHeapHandle = HeapCreate(0, 10 * sizeof(IoCompletionInfo), 0);
503258945Sroberto	if (hHeapHandle == NULL) {
504258945Sroberto		errval = GetLastError();
505258945Sroberto		isc__strerror(errval, strbuf, sizeof(strbuf));
506258945Sroberto		FATAL_ERROR(__FILE__, __LINE__,
507258945Sroberto			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
508258945Sroberto					   ISC_MSG_FAILED,
509258945Sroberto					   "HeapCreate() failed during "
510258945Sroberto					   "initialization: %s"),
511258945Sroberto			    strbuf);
512258945Sroberto		exit(1);
513258945Sroberto	}
514258945Sroberto
515258945Sroberto	manager->maxIOCPThreads = min(isc_os_ncpus() + 1, MAX_IOCPTHREADS);
516258945Sroberto
517258945Sroberto	/* Now Create the Completion Port */
518258945Sroberto	manager->hIoCompletionPort = CreateIoCompletionPort(
519258945Sroberto			INVALID_HANDLE_VALUE, NULL,
520258945Sroberto			0, manager->maxIOCPThreads);
521258945Sroberto	if (manager->hIoCompletionPort == NULL) {
522258945Sroberto		errval = GetLastError();
523258945Sroberto		isc__strerror(errval, strbuf, sizeof(strbuf));
524258945Sroberto		FATAL_ERROR(__FILE__, __LINE__,
525258945Sroberto				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
526258945Sroberto				ISC_MSG_FAILED,
527258945Sroberto				"CreateIoCompletionPort() failed "
528258945Sroberto				"during initialization: %s"),
529258945Sroberto				strbuf);
530258945Sroberto		exit(1);
531258945Sroberto	}
532258945Sroberto
533258945Sroberto	/*
534258945Sroberto	 * Worker threads for servicing the I/O
535258945Sroberto	 */
536258945Sroberto	iocompletionport_createthreads(manager->maxIOCPThreads, manager);
537258945Sroberto}
538258945Sroberto
539258945Sroberto/*
540258945Sroberto * Associate a socket with an IO Completion Port.  This allows us to queue events for it
541258945Sroberto * and have our worker pool of threads process them.
542258945Sroberto */
543258945Srobertovoid
544258945Srobertoiocompletionport_update(isc_socket_t *sock) {
545258945Sroberto	HANDLE hiocp;
546258945Sroberto	char strbuf[ISC_STRERRORSIZE];
547258945Sroberto
548258945Sroberto	REQUIRE(VALID_SOCKET(sock));
549258945Sroberto
550258945Sroberto	hiocp = CreateIoCompletionPort((HANDLE)sock->fd,
551258945Sroberto		sock->manager->hIoCompletionPort, (ULONG_PTR)sock, 0);
552258945Sroberto
553258945Sroberto	if (hiocp == NULL) {
554258945Sroberto		DWORD errval = GetLastError();
555258945Sroberto		isc__strerror(errval, strbuf, sizeof(strbuf));
556258945Sroberto		isc_log_iwrite(isc_lctx,
557258945Sroberto				ISC_LOGCATEGORY_GENERAL,
558258945Sroberto				ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
559258945Sroberto				isc_msgcat, ISC_MSGSET_SOCKET,
560258945Sroberto				ISC_MSG_TOOMANYHANDLES,
561258945Sroberto				"iocompletionport_update: failed to open"
562258945Sroberto				" io completion port: %s",
563258945Sroberto				strbuf);
564258945Sroberto
565258945Sroberto		/* XXXMLG temporary hack to make failures detected.
566258945Sroberto		 * This function should return errors to the caller, not
567258945Sroberto		 * exit here.
568258945Sroberto		 */
569258945Sroberto		FATAL_ERROR(__FILE__, __LINE__,
570258945Sroberto				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
571258945Sroberto				ISC_MSG_FAILED,
572258945Sroberto				"CreateIoCompletionPort() failed "
573258945Sroberto				"during initialization: %s"),
574258945Sroberto				strbuf);
575258945Sroberto		exit(1);
576258945Sroberto	}
577258945Sroberto
578258945Sroberto	InterlockedIncrement(&sock->manager->iocp_total);
579258945Sroberto}
580258945Sroberto
581258945Sroberto/*
582258945Sroberto * Routine to cleanup and then close the socket.
583258945Sroberto * Only close the socket here if it is NOT associated
584258945Sroberto * with an event, otherwise the WSAWaitForMultipleEvents
585258945Sroberto * may fail due to the fact that the Wait should not
586258945Sroberto * be running while closing an event or a socket.
587258945Sroberto * The socket is locked before calling this function
588258945Sroberto */
589258945Srobertovoid
590258945Srobertosocket_close(isc_socket_t *sock) {
591258945Sroberto
592258945Sroberto	REQUIRE(sock != NULL);
593258945Sroberto
594258945Sroberto	if (sock->fd != INVALID_SOCKET) {
595258945Sroberto		closesocket(sock->fd);
596258945Sroberto		sock->fd = INVALID_SOCKET;
597258945Sroberto		_set_state(sock, SOCK_CLOSED);
598258945Sroberto		InterlockedDecrement(&sock->manager->totalSockets);
599258945Sroberto	}
600258945Sroberto}
601258945Sroberto
602258945Srobertostatic isc_once_t initialise_once = ISC_ONCE_INIT;
603258945Srobertostatic isc_boolean_t initialised = ISC_FALSE;
604258945Sroberto
605258945Srobertostatic void
606258945Srobertoinitialise(void) {
607258945Sroberto	WORD wVersionRequested;
608258945Sroberto	WSADATA wsaData;
609258945Sroberto	int err;
610258945Sroberto	SOCKET sock;
611258945Sroberto	GUID GUIDConnectEx = WSAID_CONNECTEX;
612258945Sroberto	GUID GUIDAcceptEx = WSAID_ACCEPTEX;
613258945Sroberto	GUID GUIDGetAcceptExSockaddrs = WSAID_GETACCEPTEXSOCKADDRS;
614258945Sroberto	DWORD dwBytes;
615258945Sroberto
616258945Sroberto	/* Need Winsock 2.2 or better */
617258945Sroberto	wVersionRequested = MAKEWORD(2, 2);
618258945Sroberto
619258945Sroberto	err = WSAStartup(wVersionRequested, &wsaData);
620258945Sroberto	if (err != 0) {
621258945Sroberto		char strbuf[ISC_STRERRORSIZE];
622258945Sroberto		isc__strerror(err, strbuf, sizeof(strbuf));
623258945Sroberto		FATAL_ERROR(__FILE__, __LINE__, "WSAStartup() %s: %s",
624258945Sroberto			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
625258945Sroberto					   ISC_MSG_FAILED, "failed"),
626258945Sroberto			    strbuf);
627258945Sroberto		exit(1);
628258945Sroberto	}
629258945Sroberto	/*
630258945Sroberto	 * The following APIs do not exist as functions in a library, but we must
631258945Sroberto	 * ask winsock for them.  They are "extensions" -- but why they cannot be
632258945Sroberto	 * actual functions is beyond me.  So, ask winsock for the pointers to the
633258945Sroberto	 * functions we need.
634258945Sroberto	 */
635258945Sroberto	sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
636258945Sroberto	INSIST(sock != INVALID_SOCKET);
637258945Sroberto	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
638258945Sroberto		 &GUIDConnectEx, sizeof(GUIDConnectEx),
639258945Sroberto		 &ISCConnectEx, sizeof(ISCConnectEx),
640258945Sroberto		 &dwBytes, NULL, NULL);
641258945Sroberto	INSIST(err == 0);
642258945Sroberto
643258945Sroberto	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
644258945Sroberto		 &GUIDAcceptEx, sizeof(GUIDAcceptEx),
645258945Sroberto		 &ISCAcceptEx, sizeof(ISCAcceptEx),
646258945Sroberto		 &dwBytes, NULL, NULL);
647258945Sroberto	INSIST(err == 0);
648258945Sroberto
649258945Sroberto	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
650258945Sroberto		 &GUIDGetAcceptExSockaddrs, sizeof(GUIDGetAcceptExSockaddrs),
651258945Sroberto		 &ISCGetAcceptExSockaddrs, sizeof(ISCGetAcceptExSockaddrs),
652258945Sroberto		 &dwBytes, NULL, NULL);
653258945Sroberto	INSIST(err == 0);
654258945Sroberto
655258945Sroberto	closesocket(sock);
656258945Sroberto
657258945Sroberto	initialised = ISC_TRUE;
658258945Sroberto}
659258945Sroberto
660258945Sroberto/*
661258945Sroberto * Initialize socket services
662258945Sroberto */
663258945Srobertovoid
664258945SrobertoInitSockets(void) {
665258945Sroberto	RUNTIME_CHECK(isc_once_do(&initialise_once,
666258945Sroberto				  initialise) == ISC_R_SUCCESS);
667258945Sroberto	if (!initialised)
668258945Sroberto		exit(1);
669258945Sroberto}
670258945Sroberto
671258945Srobertoint
672258945Srobertointernal_sendmsg(isc_socket_t *sock, IoCompletionInfo *lpo,
673258945Sroberto		 struct msghdr *messagehdr, int flags, int *Error)
674258945Sroberto{
675258945Sroberto	int Result;
676258945Sroberto	DWORD BytesSent;
677258945Sroberto	DWORD Flags = flags;
678258945Sroberto	int total_sent;
679258945Sroberto
680258945Sroberto	*Error = 0;
681258945Sroberto	Result = WSASendTo(sock->fd, messagehdr->msg_iov,
682258945Sroberto			   messagehdr->msg_iovlen, &BytesSent,
683258945Sroberto			   Flags, (SOCKADDR *)&messagehdr->to_addr,
684258945Sroberto			   messagehdr->to_addr_len, (LPWSAOVERLAPPED)lpo,
685258945Sroberto			   NULL);
686258945Sroberto
687258945Sroberto	total_sent = (int)BytesSent;
688258945Sroberto
689258945Sroberto	/* Check for errors.*/
690258945Sroberto	if (Result == SOCKET_ERROR) {
691258945Sroberto		*Error = WSAGetLastError();
692258945Sroberto
693258945Sroberto		switch (*Error) {
694258945Sroberto		case WSA_IO_INCOMPLETE:
695258945Sroberto		case WSA_WAIT_IO_COMPLETION:
696258945Sroberto		case WSA_IO_PENDING:
697258945Sroberto		case NO_ERROR:		/* Strange, but okay */
698258945Sroberto			sock->pending_iocp++;
699258945Sroberto			sock->pending_send++;
700258945Sroberto			break;
701258945Sroberto
702258945Sroberto		default:
703258945Sroberto			return (-1);
704258945Sroberto			break;
705258945Sroberto		}
706258945Sroberto	} else {
707258945Sroberto		sock->pending_iocp++;
708258945Sroberto		sock->pending_send++;
709258945Sroberto	}
710258945Sroberto
711258945Sroberto	if (lpo != NULL)
712258945Sroberto		return (0);
713258945Sroberto	else
714258945Sroberto		return (total_sent);
715258945Sroberto}
716258945Sroberto
717258945Srobertostatic void
718258945Srobertoqueue_receive_request(isc_socket_t *sock) {
719258945Sroberto	DWORD Flags = 0;
720258945Sroberto	DWORD NumBytes = 0;
721258945Sroberto	int total_bytes = 0;
722258945Sroberto	int Result;
723258945Sroberto	int Error;
724280849Scy	int need_retry;
725258945Sroberto	WSABUF iov[1];
726280849Scy	IoCompletionInfo *lpo = NULL;
727258945Sroberto	isc_result_t isc_result;
728258945Sroberto
729280849Scy retry:
730280849Scy	need_retry = ISC_FALSE;
731280849Scy
732258945Sroberto	/*
733258945Sroberto	 * If we already have a receive pending, do nothing.
734258945Sroberto	 */
735280849Scy	if (sock->pending_recv > 0) {
736280849Scy		if (lpo != NULL)
737280849Scy			HeapFree(hHeapHandle, 0, lpo);
738258945Sroberto		return;
739280849Scy	}
740258945Sroberto
741258945Sroberto	/*
742258945Sroberto	 * If no one is waiting, do nothing.
743258945Sroberto	 */
744280849Scy	if (ISC_LIST_EMPTY(sock->recv_list)) {
745280849Scy		if (lpo != NULL)
746280849Scy			HeapFree(hHeapHandle, 0, lpo);
747258945Sroberto		return;
748280849Scy	}
749258945Sroberto
750258945Sroberto	INSIST(sock->recvbuf.remaining == 0);
751258945Sroberto	INSIST(sock->fd != INVALID_SOCKET);
752258945Sroberto
753258945Sroberto	iov[0].len = sock->recvbuf.len;
754258945Sroberto	iov[0].buf = sock->recvbuf.base;
755258945Sroberto
756280849Scy	if (lpo == NULL) {
757280849Scy		lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
758280849Scy						    HEAP_ZERO_MEMORY,
759280849Scy						    sizeof(IoCompletionInfo));
760280849Scy		RUNTIME_CHECK(lpo != NULL);
761280849Scy	} else
762280849Scy		ZeroMemory(lpo, sizeof(IoCompletionInfo));
763258945Sroberto	lpo->request_type = SOCKET_RECV;
764258945Sroberto
765258945Sroberto	sock->recvbuf.from_addr_len = sizeof(sock->recvbuf.from_addr);
766258945Sroberto
767258945Sroberto	Error = 0;
768258945Sroberto	Result = WSARecvFrom((SOCKET)sock->fd, iov, 1,
769258945Sroberto			     &NumBytes, &Flags,
770258945Sroberto			     (SOCKADDR *)&sock->recvbuf.from_addr,
771258945Sroberto			     &sock->recvbuf.from_addr_len,
772258945Sroberto			     (LPWSAOVERLAPPED)lpo, NULL);
773258945Sroberto
774258945Sroberto	/* Check for errors. */
775258945Sroberto	if (Result == SOCKET_ERROR) {
776258945Sroberto		Error = WSAGetLastError();
777258945Sroberto
778258945Sroberto		switch (Error) {
779258945Sroberto		case WSA_IO_PENDING:
780258945Sroberto			sock->pending_iocp++;
781258945Sroberto			sock->pending_recv++;
782258945Sroberto			break;
783258945Sroberto
784280849Scy		/* direct error: no completion event */
785280849Scy		case ERROR_HOST_UNREACHABLE:
786280849Scy		case WSAENETRESET:
787280849Scy		case WSAECONNRESET:
788280849Scy			if (!sock->connected) {
789280849Scy				/* soft error */
790280849Scy				need_retry = ISC_TRUE;
791280849Scy				break;
792280849Scy			}
793280849Scy			/* FALLTHROUGH */
794280849Scy
795258945Sroberto		default:
796258945Sroberto			isc_result = isc__errno2result(Error);
797258945Sroberto			if (isc_result == ISC_R_UNEXPECTED)
798258945Sroberto				UNEXPECTED_ERROR(__FILE__, __LINE__,
799258945Sroberto					"WSARecvFrom: Windows error code: %d, isc result %d",
800258945Sroberto					Error, isc_result);
801258945Sroberto			send_recvdone_abort(sock, isc_result);
802280849Scy			HeapFree(hHeapHandle, 0, lpo);
803280849Scy			lpo = NULL;
804258945Sroberto			break;
805258945Sroberto		}
806258945Sroberto	} else {
807258945Sroberto		/*
808258945Sroberto		 * The recv() finished immediately, but we will still get
809258945Sroberto		 * a completion event.  Rather than duplicate code, let
810258945Sroberto		 * that thread handle sending the data along its way.
811258945Sroberto		 */
812258945Sroberto		sock->pending_iocp++;
813258945Sroberto		sock->pending_recv++;
814258945Sroberto	}
815258945Sroberto
816258945Sroberto	socket_log(__LINE__, sock, NULL, IOEVENT,
817258945Sroberto		   isc_msgcat, ISC_MSGSET_SOCKET,
818258945Sroberto		   ISC_MSG_DOIORECV,
819258945Sroberto		   "queue_io_request: fd %d result %d error %d",
820258945Sroberto		   sock->fd, Result, Error);
821258945Sroberto
822258945Sroberto	CONSISTENT(sock);
823280849Scy
824280849Scy	if (need_retry)
825280849Scy		goto retry;
826258945Sroberto}
827258945Sroberto
828258945Srobertostatic void
829258945Srobertomanager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
830258945Sroberto	    isc_logmodule_t *module, int level, const char *fmt, ...)
831258945Sroberto{
832258945Sroberto	char msgbuf[2048];
833258945Sroberto	va_list ap;
834258945Sroberto
835258945Sroberto	if (!isc_log_wouldlog(isc_lctx, level))
836258945Sroberto		return;
837258945Sroberto
838258945Sroberto	va_start(ap, fmt);
839258945Sroberto	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
840258945Sroberto	va_end(ap);
841258945Sroberto
842258945Sroberto	isc_log_write(isc_lctx, category, module, level,
843258945Sroberto		      "sockmgr %p: %s", sockmgr, msgbuf);
844258945Sroberto}
845258945Sroberto
846258945Srobertostatic void
847258945Srobertosocket_log(int lineno, isc_socket_t *sock, isc_sockaddr_t *address,
848258945Sroberto	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
849258945Sroberto	   isc_msgcat_t *msgcat, int msgset, int message,
850258945Sroberto	   const char *fmt, ...)
851258945Sroberto{
852258945Sroberto	char msgbuf[2048];
853258945Sroberto	char peerbuf[256];
854258945Sroberto	va_list ap;
855258945Sroberto
856258945Sroberto
857258945Sroberto	if (!isc_log_wouldlog(isc_lctx, level))
858258945Sroberto		return;
859258945Sroberto
860258945Sroberto	va_start(ap, fmt);
861258945Sroberto	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
862258945Sroberto	va_end(ap);
863258945Sroberto
864258945Sroberto	if (address == NULL) {
865258945Sroberto		isc_log_iwrite(isc_lctx, category, module, level,
866258945Sroberto			       msgcat, msgset, message,
867258945Sroberto			       "socket %p line %d: %s", sock, lineno, msgbuf);
868258945Sroberto	} else {
869258945Sroberto		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
870258945Sroberto		isc_log_iwrite(isc_lctx, category, module, level,
871258945Sroberto			       msgcat, msgset, message,
872258945Sroberto				   "socket %p line %d peer %s: %s", sock, lineno,
873258945Sroberto				   peerbuf, msgbuf);
874258945Sroberto	}
875258945Sroberto
876258945Sroberto}
877258945Sroberto
878258945Sroberto/*
879258945Sroberto * Make an fd SOCKET non-blocking.
880258945Sroberto */
881258945Srobertostatic isc_result_t
882258945Srobertomake_nonblock(SOCKET fd) {
883258945Sroberto	int ret;
884258945Sroberto	unsigned long flags = 1;
885258945Sroberto	char strbuf[ISC_STRERRORSIZE];
886258945Sroberto
887258945Sroberto	/* Set the socket to non-blocking */
888258945Sroberto	ret = ioctlsocket(fd, FIONBIO, &flags);
889258945Sroberto
890258945Sroberto	if (ret == -1) {
891258945Sroberto		isc__strerror(errno, strbuf, sizeof(strbuf));
892258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__,
893258945Sroberto				 "ioctlsocket(%d, FIOBIO, %d): %s",
894258945Sroberto				 fd, flags, strbuf);
895258945Sroberto
896258945Sroberto		return (ISC_R_UNEXPECTED);
897258945Sroberto	}
898258945Sroberto
899258945Sroberto	return (ISC_R_SUCCESS);
900258945Sroberto}
901258945Sroberto
902258945Sroberto/*
903280849Scy * Windows 2000 systems incorrectly cause UDP sockets using WSARecvFrom
904258945Sroberto * to not work correctly, returning a WSACONNRESET error when a WSASendTo
905258945Sroberto * fails with an "ICMP port unreachable" response and preventing the
906258945Sroberto * socket from using the WSARecvFrom in subsequent operations.
907258945Sroberto * The function below fixes this, but requires that Windows 2000
908258945Sroberto * Service Pack 2 or later be installed on the system.  NT 4.0
909258945Sroberto * systems are not affected by this and work correctly.
910258945Sroberto * See Microsoft Knowledge Base Article Q263823 for details of this.
911258945Sroberto */
912258945Srobertoisc_result_t
913258945Srobertoconnection_reset_fix(SOCKET fd) {
914258945Sroberto	DWORD dwBytesReturned = 0;
915258945Sroberto	BOOL  bNewBehavior = FALSE;
916258945Sroberto	DWORD status;
917258945Sroberto
918258945Sroberto	if (isc_win32os_majorversion() < 5)
919258945Sroberto		return (ISC_R_SUCCESS); /*  NT 4.0 has no problem */
920258945Sroberto
921258945Sroberto	/* disable bad behavior using IOCTL: SIO_UDP_CONNRESET */
922258945Sroberto	status = WSAIoctl(fd, SIO_UDP_CONNRESET, &bNewBehavior,
923258945Sroberto			  sizeof(bNewBehavior), NULL, 0,
924258945Sroberto			  &dwBytesReturned, NULL, NULL);
925258945Sroberto	if (status != SOCKET_ERROR)
926258945Sroberto		return (ISC_R_SUCCESS);
927258945Sroberto	else {
928258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__,
929258945Sroberto				 "WSAIoctl(SIO_UDP_CONNRESET, oldBehaviour) %s",
930258945Sroberto				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
931258945Sroberto						ISC_MSG_FAILED, "failed"));
932258945Sroberto		return (ISC_R_UNEXPECTED);
933258945Sroberto	}
934258945Sroberto}
935258945Sroberto
936258945Sroberto/*
937258945Sroberto * Construct an iov array and attach it to the msghdr passed in.  This is
938258945Sroberto * the SEND constructor, which will use the used region of the buffer
939258945Sroberto * (if using a buffer list) or will use the internal region (if a single
940258945Sroberto * buffer I/O is requested).
941258945Sroberto *
942258945Sroberto * Nothing can be NULL, and the done event must list at least one buffer
943258945Sroberto * on the buffer linked list for this function to be meaningful.
944258945Sroberto */
945258945Srobertostatic void
946258945Srobertobuild_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
947258945Sroberto		  struct msghdr *msg, char *cmsg, WSABUF *iov,
948258945Sroberto		  IoCompletionInfo  *lpo)
949258945Sroberto{
950258945Sroberto	unsigned int iovcount;
951258945Sroberto	isc_buffer_t *buffer;
952258945Sroberto	buflist_t  *cpbuffer;
953258945Sroberto	isc_region_t used;
954258945Sroberto	size_t write_count;
955258945Sroberto	size_t skip_count;
956258945Sroberto
957258945Sroberto	memset(msg, 0, sizeof(*msg));
958258945Sroberto
959258945Sroberto	memcpy(&msg->to_addr, &dev->address.type, dev->address.length);
960258945Sroberto	msg->to_addr_len = dev->address.length;
961258945Sroberto
962258945Sroberto	buffer = ISC_LIST_HEAD(dev->bufferlist);
963258945Sroberto	write_count = 0;
964258945Sroberto	iovcount = 0;
965258945Sroberto
966258945Sroberto	/*
967258945Sroberto	 * Single buffer I/O?  Skip what we've done so far in this region.
968258945Sroberto	 */
969258945Sroberto	if (buffer == NULL) {
970258945Sroberto		write_count = dev->region.length - dev->n;
971258945Sroberto		cpbuffer = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, sizeof(buflist_t));
972258945Sroberto		RUNTIME_CHECK(cpbuffer != NULL);
973258945Sroberto		cpbuffer->buf = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, write_count);
974258945Sroberto		RUNTIME_CHECK(cpbuffer->buf != NULL);
975258945Sroberto
976258945Sroberto		socket_log(__LINE__, sock, NULL, TRACE,
977258945Sroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
978258945Sroberto		   "alloc_buffer %p %d %p %d", cpbuffer, sizeof(buflist_t),
979258945Sroberto		   cpbuffer->buf, write_count);
980258945Sroberto
981258945Sroberto		memcpy(cpbuffer->buf,(dev->region.base + dev->n), write_count);
982258945Sroberto		cpbuffer->buflen = write_count;
983258945Sroberto		ISC_LIST_ENQUEUE(lpo->bufferlist, cpbuffer, link);
984258945Sroberto		iov[0].buf = cpbuffer->buf;
985258945Sroberto		iov[0].len = write_count;
986258945Sroberto		iovcount = 1;
987258945Sroberto
988258945Sroberto		goto config;
989258945Sroberto	}
990258945Sroberto
991258945Sroberto	/*
992258945Sroberto	 * Multibuffer I/O.
993258945Sroberto	 * Skip the data in the buffer list that we have already written.
994258945Sroberto	 */
995258945Sroberto	skip_count = dev->n;
996258945Sroberto	while (buffer != NULL) {
997258945Sroberto		REQUIRE(ISC_BUFFER_VALID(buffer));
998258945Sroberto		if (skip_count < isc_buffer_usedlength(buffer))
999258945Sroberto			break;
1000258945Sroberto		skip_count -= isc_buffer_usedlength(buffer);
1001258945Sroberto		buffer = ISC_LIST_NEXT(buffer, link);
1002258945Sroberto	}
1003258945Sroberto
1004258945Sroberto	while (buffer != NULL) {
1005258945Sroberto		INSIST(iovcount < MAXSCATTERGATHER_SEND);
1006258945Sroberto
1007258945Sroberto		isc_buffer_usedregion(buffer, &used);
1008258945Sroberto
1009258945Sroberto		if (used.length > 0) {
1010258945Sroberto			int uselen = used.length - skip_count;
1011258945Sroberto			cpbuffer = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, sizeof(buflist_t));
1012258945Sroberto			RUNTIME_CHECK(cpbuffer != NULL);
1013258945Sroberto			cpbuffer->buf = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, uselen);
1014258945Sroberto			RUNTIME_CHECK(cpbuffer->buf != NULL);
1015258945Sroberto
1016258945Sroberto			socket_log(__LINE__, sock, NULL, TRACE,
1017258945Sroberto			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
1018258945Sroberto			   "alloc_buffer %p %d %p %d", cpbuffer, sizeof(buflist_t),
1019258945Sroberto			   cpbuffer->buf, write_count);
1020258945Sroberto
1021258945Sroberto			memcpy(cpbuffer->buf,(used.base + skip_count), uselen);
1022258945Sroberto			cpbuffer->buflen = uselen;
1023258945Sroberto			iov[iovcount].buf = cpbuffer->buf;
1024258945Sroberto			iov[iovcount].len = used.length - skip_count;
1025258945Sroberto			write_count += uselen;
1026258945Sroberto			skip_count = 0;
1027258945Sroberto			iovcount++;
1028258945Sroberto		}
1029258945Sroberto		buffer = ISC_LIST_NEXT(buffer, link);
1030258945Sroberto	}
1031258945Sroberto
1032258945Sroberto	INSIST(skip_count == 0);
1033258945Sroberto
1034258945Sroberto config:
1035258945Sroberto	msg->msg_iov = iov;
1036258945Sroberto	msg->msg_iovlen = iovcount;
1037258945Sroberto	msg->msg_totallen = write_count;
1038258945Sroberto}
1039258945Sroberto
1040258945Srobertostatic void
1041258945Srobertoset_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
1042258945Sroberto		isc_socketevent_t *dev)
1043258945Sroberto{
1044258945Sroberto	if (sock->type == isc_sockettype_udp) {
1045258945Sroberto		if (address != NULL)
1046258945Sroberto			dev->address = *address;
1047258945Sroberto		else
1048258945Sroberto			dev->address = sock->address;
1049258945Sroberto	} else if (sock->type == isc_sockettype_tcp) {
1050258945Sroberto		INSIST(address == NULL);
1051258945Sroberto		dev->address = sock->address;
1052258945Sroberto	}
1053258945Sroberto}
1054258945Sroberto
1055258945Srobertostatic void
1056258945Srobertodestroy_socketevent(isc_event_t *event) {
1057258945Sroberto	isc_socketevent_t *ev = (isc_socketevent_t *)event;
1058258945Sroberto
1059258945Sroberto	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1060258945Sroberto
1061258945Sroberto	(ev->destroy)(event);
1062258945Sroberto}
1063258945Sroberto
1064258945Srobertostatic isc_socketevent_t *
1065258945Srobertoallocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
1066258945Sroberto		     isc_taskaction_t action, const void *arg)
1067258945Sroberto{
1068258945Sroberto	isc_socketevent_t *ev;
1069258945Sroberto
1070258945Sroberto	ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
1071258945Sroberto						     sock, eventtype,
1072258945Sroberto						     action, arg,
1073258945Sroberto						     sizeof(*ev));
1074258945Sroberto	if (ev == NULL)
1075258945Sroberto		return (NULL);
1076258945Sroberto
1077258945Sroberto	ev->result = ISC_R_IOERROR; // XXXMLG temporary change to detect failure to set
1078258945Sroberto	ISC_LINK_INIT(ev, ev_link);
1079258945Sroberto	ISC_LIST_INIT(ev->bufferlist);
1080258945Sroberto	ev->region.base = NULL;
1081258945Sroberto	ev->n = 0;
1082258945Sroberto	ev->offset = 0;
1083258945Sroberto	ev->attributes = 0;
1084258945Sroberto	ev->destroy = ev->ev_destroy;
1085258945Sroberto	ev->ev_destroy = destroy_socketevent;
1086258945Sroberto
1087258945Sroberto	return (ev);
1088258945Sroberto}
1089258945Sroberto
1090258945Sroberto#if defined(ISC_SOCKET_DEBUG)
1091258945Srobertostatic void
1092258945Srobertodump_msg(struct msghdr *msg, isc_socket_t *sock) {
1093258945Sroberto	unsigned int i;
1094258945Sroberto
1095258945Sroberto	printf("MSGHDR %p, Socket #: %u\n", msg, sock->fd);
1096258945Sroberto	printf("\tname %p, namelen %d\n", msg->msg_name, msg->msg_namelen);
1097258945Sroberto	printf("\tiov %p, iovlen %d\n", msg->msg_iov, msg->msg_iovlen);
1098258945Sroberto	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
1099258945Sroberto		printf("\t\t%d\tbase %p, len %d\n", i,
1100258945Sroberto		       msg->msg_iov[i].buf,
1101258945Sroberto		       msg->msg_iov[i].len);
1102258945Sroberto}
1103258945Sroberto#endif
1104258945Sroberto
1105258945Sroberto/*
1106258945Sroberto * map the error code
1107258945Sroberto */
1108258945Srobertoint
1109258945Srobertomap_socket_error(isc_socket_t *sock, int windows_errno, int *isc_errno,
1110258945Sroberto		 char *errorstring, size_t bufsize) {
1111258945Sroberto
1112258945Sroberto	int doreturn;
1113258945Sroberto	switch (windows_errno) {
1114258945Sroberto	case WSAECONNREFUSED:
1115258945Sroberto		*isc_errno = ISC_R_CONNREFUSED;
1116258945Sroberto		if (sock->connected)
1117258945Sroberto			doreturn = DOIO_HARD;
1118258945Sroberto		else
1119258945Sroberto			doreturn = DOIO_SOFT;
1120258945Sroberto		break;
1121258945Sroberto	case WSAENETUNREACH:
1122258945Sroberto	case ERROR_NETWORK_UNREACHABLE:
1123258945Sroberto		*isc_errno = ISC_R_NETUNREACH;
1124258945Sroberto		if (sock->connected)
1125258945Sroberto			doreturn = DOIO_HARD;
1126258945Sroberto		else
1127258945Sroberto			doreturn = DOIO_SOFT;
1128258945Sroberto		break;
1129258945Sroberto	case ERROR_PORT_UNREACHABLE:
1130258945Sroberto	case ERROR_HOST_UNREACHABLE:
1131258945Sroberto	case WSAEHOSTUNREACH:
1132258945Sroberto		*isc_errno = ISC_R_HOSTUNREACH;
1133258945Sroberto		if (sock->connected)
1134258945Sroberto			doreturn = DOIO_HARD;
1135258945Sroberto		else
1136258945Sroberto			doreturn = DOIO_SOFT;
1137258945Sroberto		break;
1138258945Sroberto	case WSAENETDOWN:
1139258945Sroberto		*isc_errno = ISC_R_NETDOWN;
1140258945Sroberto		if (sock->connected)
1141258945Sroberto			doreturn = DOIO_HARD;
1142258945Sroberto		else
1143258945Sroberto			doreturn = DOIO_SOFT;
1144258945Sroberto		break;
1145258945Sroberto	case WSAEHOSTDOWN:
1146258945Sroberto		*isc_errno = ISC_R_HOSTDOWN;
1147258945Sroberto		if (sock->connected)
1148258945Sroberto			doreturn = DOIO_HARD;
1149258945Sroberto		else
1150258945Sroberto			doreturn = DOIO_SOFT;
1151258945Sroberto		break;
1152258945Sroberto	case WSAEACCES:
1153258945Sroberto		*isc_errno = ISC_R_NOPERM;
1154258945Sroberto		if (sock->connected)
1155258945Sroberto			doreturn = DOIO_HARD;
1156258945Sroberto		else
1157258945Sroberto			doreturn = DOIO_SOFT;
1158258945Sroberto		break;
1159258945Sroberto	case WSAECONNRESET:
1160258945Sroberto	case WSAENETRESET:
1161258945Sroberto	case WSAECONNABORTED:
1162258945Sroberto	case WSAEDISCON:
1163258945Sroberto		*isc_errno = ISC_R_CONNECTIONRESET;
1164258945Sroberto		if (sock->connected)
1165258945Sroberto			doreturn = DOIO_HARD;
1166258945Sroberto		else
1167258945Sroberto			doreturn = DOIO_SOFT;
1168258945Sroberto		break;
1169258945Sroberto	case WSAENOTCONN:
1170258945Sroberto		*isc_errno = ISC_R_NOTCONNECTED;
1171258945Sroberto		if (sock->connected)
1172258945Sroberto			doreturn = DOIO_HARD;
1173258945Sroberto		else
1174258945Sroberto			doreturn = DOIO_SOFT;
1175258945Sroberto		break;
1176258945Sroberto	case ERROR_OPERATION_ABORTED:
1177258945Sroberto	case ERROR_CONNECTION_ABORTED:
1178258945Sroberto	case ERROR_REQUEST_ABORTED:
1179258945Sroberto		*isc_errno = ISC_R_CONNECTIONRESET;
1180258945Sroberto		doreturn = DOIO_HARD;
1181258945Sroberto		break;
1182258945Sroberto	case WSAENOBUFS:
1183258945Sroberto		*isc_errno = ISC_R_NORESOURCES;
1184258945Sroberto		doreturn = DOIO_HARD;
1185258945Sroberto		break;
1186258945Sroberto	case WSAEAFNOSUPPORT:
1187258945Sroberto		*isc_errno = ISC_R_FAMILYNOSUPPORT;
1188258945Sroberto		doreturn = DOIO_HARD;
1189258945Sroberto		break;
1190258945Sroberto	case WSAEADDRNOTAVAIL:
1191258945Sroberto		*isc_errno = ISC_R_ADDRNOTAVAIL;
1192258945Sroberto		doreturn = DOIO_HARD;
1193258945Sroberto		break;
1194258945Sroberto	case WSAEDESTADDRREQ:
1195258945Sroberto		*isc_errno = ISC_R_BADADDRESSFORM;
1196258945Sroberto		doreturn = DOIO_HARD;
1197258945Sroberto		break;
1198258945Sroberto	case ERROR_NETNAME_DELETED:
1199258945Sroberto		*isc_errno = ISC_R_NETDOWN;
1200258945Sroberto		doreturn = DOIO_HARD;
1201258945Sroberto		break;
1202258945Sroberto	default:
1203258945Sroberto		*isc_errno = ISC_R_IOERROR;
1204258945Sroberto		doreturn = DOIO_HARD;
1205258945Sroberto		break;
1206258945Sroberto	}
1207258945Sroberto	if (doreturn == DOIO_HARD) {
1208258945Sroberto		isc__strerror(windows_errno, errorstring, bufsize);
1209258945Sroberto	}
1210258945Sroberto	return (doreturn);
1211258945Sroberto}
1212258945Sroberto
1213258945Srobertostatic void
1214258945Srobertofill_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
1215258945Sroberto	isc_region_t r;
1216258945Sroberto	int copylen;
1217258945Sroberto	isc_buffer_t *buffer;
1218258945Sroberto
1219258945Sroberto	INSIST(dev->n < dev->minimum);
1220258945Sroberto	INSIST(sock->recvbuf.remaining > 0);
1221258945Sroberto	INSIST(sock->pending_recv == 0);
1222258945Sroberto
1223258945Sroberto	if (sock->type == isc_sockettype_udp) {
1224258945Sroberto		dev->address.length = sock->recvbuf.from_addr_len;
1225258945Sroberto		memcpy(&dev->address.type, &sock->recvbuf.from_addr,
1226258945Sroberto		    sock->recvbuf.from_addr_len);
1227258945Sroberto		if (isc_sockaddr_getport(&dev->address) == 0) {
1228258945Sroberto			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1229258945Sroberto				socket_log(__LINE__, sock, &dev->address, IOEVENT,
1230258945Sroberto					   isc_msgcat, ISC_MSGSET_SOCKET,
1231258945Sroberto					   ISC_MSG_ZEROPORT,
1232258945Sroberto					   "dropping source port zero packet");
1233258945Sroberto			}
1234258945Sroberto			sock->recvbuf.remaining = 0;
1235258945Sroberto			return;
1236258945Sroberto		}
1237258945Sroberto	} else if (sock->type == isc_sockettype_tcp) {
1238258945Sroberto		dev->address = sock->address;
1239258945Sroberto	}
1240258945Sroberto
1241258945Sroberto	/*
1242258945Sroberto	 * Run through the list of buffers we were given, and find the
1243258945Sroberto	 * first one with space.  Once it is found, loop through, filling
1244258945Sroberto	 * the buffers as much as possible.
1245258945Sroberto	 */
1246258945Sroberto	buffer = ISC_LIST_HEAD(dev->bufferlist);
1247258945Sroberto	if (buffer != NULL) { // Multi-buffer receive
1248258945Sroberto		while (buffer != NULL && sock->recvbuf.remaining > 0) {
1249258945Sroberto			REQUIRE(ISC_BUFFER_VALID(buffer));
1250258945Sroberto			if (isc_buffer_availablelength(buffer) > 0) {
1251258945Sroberto				isc_buffer_availableregion(buffer, &r);
1252258945Sroberto				copylen = min(r.length, sock->recvbuf.remaining);
1253258945Sroberto				memcpy(r.base, sock->recvbuf.consume_position, copylen);
1254258945Sroberto				sock->recvbuf.consume_position += copylen;
1255258945Sroberto				sock->recvbuf.remaining -= copylen;
1256258945Sroberto				isc_buffer_add(buffer, copylen);
1257258945Sroberto				dev->n += copylen;
1258258945Sroberto			}
1259258945Sroberto			buffer = ISC_LIST_NEXT(buffer, link);
1260258945Sroberto		}
1261258945Sroberto	} else { // Single-buffer receive
1262258945Sroberto		copylen = min(dev->region.length - dev->n, sock->recvbuf.remaining);
1263258945Sroberto		memcpy(dev->region.base + dev->n, sock->recvbuf.consume_position, copylen);
1264258945Sroberto		sock->recvbuf.consume_position += copylen;
1265258945Sroberto		sock->recvbuf.remaining -= copylen;
1266258945Sroberto		dev->n += copylen;
1267258945Sroberto	}
1268258945Sroberto
1269258945Sroberto	/*
1270258945Sroberto	 * UDP receives are all-consuming.  That is, if we have 4k worth of
1271258945Sroberto	 * data in our receive buffer, and the caller only gave us
1272258945Sroberto	 * 1k of space, we will toss the remaining 3k of data.  TCP
1273258945Sroberto	 * will keep the extra data around and use it for later requests.
1274258945Sroberto	 */
1275258945Sroberto	if (sock->type == isc_sockettype_udp)
1276258945Sroberto		sock->recvbuf.remaining = 0;
1277258945Sroberto}
1278258945Sroberto
1279258945Sroberto/*
1280258945Sroberto * Copy out as much data from the internal buffer to done events.
1281258945Sroberto * As each done event is filled, send it along its way.
1282258945Sroberto */
1283258945Srobertostatic void
1284258945Srobertocompleteio_recv(isc_socket_t *sock)
1285258945Sroberto{
1286258945Sroberto	isc_socketevent_t *dev;
1287258945Sroberto
1288258945Sroberto	/*
1289258945Sroberto	 * If we are in the process of filling our buffer, we cannot
1290258945Sroberto	 * touch it yet, so don't.
1291258945Sroberto	 */
1292258945Sroberto	if (sock->pending_recv > 0)
1293258945Sroberto		return;
1294258945Sroberto
1295258945Sroberto	while (sock->recvbuf.remaining > 0 && !ISC_LIST_EMPTY(sock->recv_list)) {
1296258945Sroberto		dev = ISC_LIST_HEAD(sock->recv_list);
1297258945Sroberto
1298258945Sroberto		/*
1299258945Sroberto		 * See if we have sufficient data in our receive buffer
1300258945Sroberto		 * to handle this.  If we do, copy out the data.
1301258945Sroberto		 */
1302258945Sroberto		fill_recv(sock, dev);
1303258945Sroberto
1304258945Sroberto		/*
1305258945Sroberto		 * Did we satisfy it?
1306258945Sroberto		 */
1307258945Sroberto		if (dev->n >= dev->minimum) {
1308258945Sroberto			dev->result = ISC_R_SUCCESS;
1309258945Sroberto			send_recvdone_event(sock, &dev);
1310258945Sroberto		}
1311258945Sroberto	}
1312258945Sroberto}
1313258945Sroberto
1314258945Sroberto/*
1315258945Sroberto * Returns:
1316258945Sroberto *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1317258945Sroberto *			ISC_R_SUCCESS.
1318258945Sroberto *
1319258945Sroberto *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1320258945Sroberto *			dev->result contains the appropriate error.
1321258945Sroberto *
1322258945Sroberto *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1323258945Sroberto *			event was sent.  The operation should be retried.
1324258945Sroberto *
1325258945Sroberto *	No other return values are possible.
1326258945Sroberto */
1327258945Srobertostatic int
1328258945Srobertocompleteio_send(isc_socket_t *sock, isc_socketevent_t *dev,
1329258945Sroberto		struct msghdr *messagehdr, int cc, int send_errno)
1330258945Sroberto{
1331258945Sroberto	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1332258945Sroberto	char strbuf[ISC_STRERRORSIZE];
1333258945Sroberto
1334258945Sroberto	if (send_errno != 0) {
1335258945Sroberto		if (SOFT_ERROR(send_errno))
1336258945Sroberto			return (DOIO_SOFT);
1337258945Sroberto
1338258945Sroberto		return (map_socket_error(sock, send_errno, &dev->result,
1339258945Sroberto			strbuf, sizeof(strbuf)));
1340258945Sroberto
1341258945Sroberto		/*
1342258945Sroberto		 * The other error types depend on whether or not the
1343258945Sroberto		 * socket is UDP or TCP.  If it is UDP, some errors
1344258945Sroberto		 * that we expect to be fatal under TCP are merely
1345258945Sroberto		 * annoying, and are really soft errors.
1346258945Sroberto		 *
1347258945Sroberto		 * However, these soft errors are still returned as
1348258945Sroberto		 * a status.
1349258945Sroberto		 */
1350258945Sroberto		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1351258945Sroberto		isc__strerror(send_errno, strbuf, sizeof(strbuf));
1352258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__, "completeio_send: %s: %s",
1353258945Sroberto				 addrbuf, strbuf);
1354258945Sroberto		dev->result = isc__errno2result(send_errno);
1355280849Scy		return (DOIO_HARD);
1356258945Sroberto	}
1357258945Sroberto
1358258945Sroberto	/*
1359258945Sroberto	 * If we write less than we expected, update counters, poke.
1360258945Sroberto	 */
1361258945Sroberto	dev->n += cc;
1362258945Sroberto	if (cc != messagehdr->msg_totallen)
1363258945Sroberto		return (DOIO_SOFT);
1364258945Sroberto
1365258945Sroberto	/*
1366258945Sroberto	 * Exactly what we wanted to write.  We're done with this
1367258945Sroberto	 * entry.  Post its completion event.
1368258945Sroberto	 */
1369258945Sroberto	dev->result = ISC_R_SUCCESS;
1370258945Sroberto	return (DOIO_SUCCESS);
1371258945Sroberto}
1372258945Sroberto
1373258945Srobertostatic int
1374258945Srobertostartio_send(isc_socket_t *sock, isc_socketevent_t *dev, int *nbytes,
1375258945Sroberto	     int *send_errno)
1376258945Sroberto{
1377258945Sroberto	char *cmsg = NULL;
1378258945Sroberto	char strbuf[ISC_STRERRORSIZE];
1379258945Sroberto	IoCompletionInfo *lpo;
1380258945Sroberto	int status;
1381258945Sroberto	struct msghdr *msghdr;
1382258945Sroberto
1383258945Sroberto	lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
1384258945Sroberto					    HEAP_ZERO_MEMORY,
1385258945Sroberto					    sizeof(IoCompletionInfo));
1386258945Sroberto	RUNTIME_CHECK(lpo != NULL);
1387258945Sroberto	lpo->request_type = SOCKET_SEND;
1388258945Sroberto	lpo->dev = dev;
1389258945Sroberto	msghdr = &lpo->messagehdr;
1390258945Sroberto	memset(msghdr, 0, sizeof(struct msghdr));
1391258945Sroberto	ISC_LIST_INIT(lpo->bufferlist);
1392258945Sroberto
1393258945Sroberto	build_msghdr_send(sock, dev, msghdr, cmsg, sock->iov, lpo);
1394258945Sroberto
1395258945Sroberto	*nbytes = internal_sendmsg(sock, lpo, msghdr, 0, send_errno);
1396258945Sroberto
1397258945Sroberto	if (*nbytes < 0) {
1398258945Sroberto		/*
1399258945Sroberto		 * I/O has been initiated
1400258945Sroberto		 * completion will be through the completion port
1401258945Sroberto		 */
1402258945Sroberto		if (PENDING_ERROR(*send_errno)) {
1403258945Sroberto			status = DOIO_PENDING;
1404258945Sroberto			goto done;
1405258945Sroberto		}
1406258945Sroberto
1407258945Sroberto		if (SOFT_ERROR(*send_errno)) {
1408258945Sroberto			status = DOIO_SOFT;
1409258945Sroberto			goto done;
1410258945Sroberto		}
1411258945Sroberto
1412258945Sroberto		/*
1413258945Sroberto		 * If we got this far then something is wrong
1414258945Sroberto		 */
1415258945Sroberto		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1416258945Sroberto			isc__strerror(*send_errno, strbuf, sizeof(strbuf));
1417258945Sroberto			socket_log(__LINE__, sock, NULL, IOEVENT,
1418258945Sroberto				   isc_msgcat, ISC_MSGSET_SOCKET,
1419258945Sroberto				   ISC_MSG_INTERNALSEND,
1420258945Sroberto				   "startio_send: internal_sendmsg(%d) %d "
1421258945Sroberto				   "bytes, err %d/%s",
1422258945Sroberto				   sock->fd, *nbytes, *send_errno, strbuf);
1423258945Sroberto		}
1424280849Scy		status = DOIO_HARD;
1425258945Sroberto		goto done;
1426258945Sroberto	}
1427258945Sroberto	dev->result = ISC_R_SUCCESS;
1428258945Sroberto	status = DOIO_SOFT;
1429258945Sroberto done:
1430258945Sroberto	_set_state(sock, SOCK_DATA);
1431258945Sroberto	return (status);
1432258945Sroberto}
1433258945Sroberto
1434258945Srobertostatic isc_result_t
1435258945Srobertoallocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1436258945Sroberto		isc_socket_t **socketp) {
1437258945Sroberto	isc_socket_t *sock;
1438258945Sroberto	isc_result_t result;
1439258945Sroberto
1440258945Sroberto	sock = isc_mem_get(manager->mctx, sizeof(*sock));
1441258945Sroberto
1442258945Sroberto	if (sock == NULL)
1443258945Sroberto		return (ISC_R_NOMEMORY);
1444258945Sroberto
1445258945Sroberto	sock->magic = 0;
1446258945Sroberto	sock->references = 0;
1447258945Sroberto
1448258945Sroberto	sock->manager = manager;
1449258945Sroberto	sock->type = type;
1450258945Sroberto	sock->fd = INVALID_SOCKET;
1451258945Sroberto
1452258945Sroberto	ISC_LINK_INIT(sock, link);
1453258945Sroberto
1454258945Sroberto	/*
1455258945Sroberto	 * set up list of readers and writers to be initially empty
1456258945Sroberto	 */
1457258945Sroberto	ISC_LIST_INIT(sock->recv_list);
1458258945Sroberto	ISC_LIST_INIT(sock->send_list);
1459258945Sroberto	ISC_LIST_INIT(sock->accept_list);
1460258945Sroberto	sock->connect_ev = NULL;
1461258945Sroberto	sock->pending_accept = 0;
1462258945Sroberto	sock->pending_recv = 0;
1463258945Sroberto	sock->pending_send = 0;
1464258945Sroberto	sock->pending_iocp = 0;
1465258945Sroberto	sock->listener = 0;
1466258945Sroberto	sock->connected = 0;
1467258945Sroberto	sock->pending_connect = 0;
1468258945Sroberto	sock->bound = 0;
1469280849Scy	sock->dupped = 0;
1470258945Sroberto	memset(sock->name, 0, sizeof(sock->name));	// zero the name field
1471258945Sroberto	_set_state(sock, SOCK_INITIALIZED);
1472258945Sroberto
1473258945Sroberto	sock->recvbuf.len = 65536;
1474258945Sroberto	sock->recvbuf.consume_position = sock->recvbuf.base;
1475258945Sroberto	sock->recvbuf.remaining = 0;
1476258945Sroberto	sock->recvbuf.base = isc_mem_get(manager->mctx, sock->recvbuf.len); // max buffer size
1477258945Sroberto	if (sock->recvbuf.base == NULL) {
1478258945Sroberto		sock->magic = 0;
1479258945Sroberto		goto error;
1480258945Sroberto	}
1481258945Sroberto
1482258945Sroberto	/*
1483258945Sroberto	 * initialize the lock
1484258945Sroberto	 */
1485258945Sroberto	result = isc_mutex_init(&sock->lock);
1486258945Sroberto	if (result != ISC_R_SUCCESS) {
1487258945Sroberto		sock->magic = 0;
1488258945Sroberto		isc_mem_put(manager->mctx, sock->recvbuf.base, sock->recvbuf.len);
1489258945Sroberto		sock->recvbuf.base = NULL;
1490258945Sroberto		goto error;
1491258945Sroberto	}
1492258945Sroberto
1493258945Sroberto	socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1494258945Sroberto		   "allocated");
1495258945Sroberto
1496258945Sroberto	sock->magic = SOCKET_MAGIC;
1497258945Sroberto	*socketp = sock;
1498258945Sroberto
1499258945Sroberto	return (ISC_R_SUCCESS);
1500258945Sroberto
1501258945Sroberto error:
1502258945Sroberto	isc_mem_put(manager->mctx, sock, sizeof(*sock));
1503258945Sroberto
1504258945Sroberto	return (result);
1505258945Sroberto}
1506258945Sroberto
1507258945Sroberto/*
1508258945Sroberto * Verify that the socket state is consistent.
1509258945Sroberto */
1510258945Srobertostatic void
1511258945Srobertoconsistent(isc_socket_t *sock) {
1512258945Sroberto
1513258945Sroberto	isc_socketevent_t *dev;
1514258945Sroberto	isc_socket_newconnev_t *nev;
1515258945Sroberto	unsigned int count;
1516258945Sroberto	char *crash_reason;
1517258945Sroberto	isc_boolean_t crash = ISC_FALSE;
1518258945Sroberto
1519258945Sroberto	REQUIRE(sock->pending_iocp == sock->pending_recv + sock->pending_send
1520258945Sroberto		+ sock->pending_accept + sock->pending_connect);
1521258945Sroberto
1522258945Sroberto	dev = ISC_LIST_HEAD(sock->send_list);
1523258945Sroberto	count = 0;
1524258945Sroberto	while (dev != NULL) {
1525258945Sroberto		count++;
1526258945Sroberto		dev = ISC_LIST_NEXT(dev, ev_link);
1527258945Sroberto	}
1528258945Sroberto	if (count > sock->pending_send) {
1529258945Sroberto		crash = ISC_TRUE;
1530258945Sroberto		crash_reason = "send_list > sock->pending_send";
1531258945Sroberto	}
1532258945Sroberto
1533258945Sroberto	nev = ISC_LIST_HEAD(sock->accept_list);
1534258945Sroberto	count = 0;
1535258945Sroberto	while (nev != NULL) {
1536258945Sroberto		count++;
1537258945Sroberto		nev = ISC_LIST_NEXT(nev, ev_link);
1538258945Sroberto	}
1539258945Sroberto	if (count > sock->pending_accept) {
1540258945Sroberto		crash = ISC_TRUE;
1541258945Sroberto		crash_reason = "send_list > sock->pending_send";
1542258945Sroberto	}
1543258945Sroberto
1544258945Sroberto	if (crash) {
1545258945Sroberto		socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1546258945Sroberto			   ISC_MSG_DESTROYING, "SOCKET INCONSISTENT: %s",
1547258945Sroberto			   crash_reason);
1548258945Sroberto		sock_dump(sock);
1549258945Sroberto		INSIST(crash == ISC_FALSE);
1550258945Sroberto	}
1551258945Sroberto}
1552258945Sroberto
1553258945Sroberto/*
1554258945Sroberto * Maybe free the socket.
1555258945Sroberto *
1556258945Sroberto * This function will verify tht the socket is no longer in use in any way,
1557258945Sroberto * either internally or externally.  This is the only place where this
1558258945Sroberto * check is to be made; if some bit of code believes that IT is done with
1559258945Sroberto * the socket (e.g., some reference counter reaches zero), it should call
1560258945Sroberto * this function.
1561258945Sroberto *
1562258945Sroberto * When calling this function, the socket must be locked, and the manager
1563258945Sroberto * must be unlocked.
1564258945Sroberto *
1565258945Sroberto * When this function returns, *socketp will be NULL.  No tricks to try
1566258945Sroberto * to hold on to this pointer are allowed.
1567258945Sroberto */
1568258945Srobertostatic void
1569258945Srobertomaybe_free_socket(isc_socket_t **socketp, int lineno) {
1570258945Sroberto	isc_socket_t *sock = *socketp;
1571258945Sroberto	*socketp = NULL;
1572258945Sroberto
1573258945Sroberto	INSIST(VALID_SOCKET(sock));
1574258945Sroberto	CONSISTENT(sock);
1575258945Sroberto
1576258945Sroberto	if (sock->pending_iocp > 0
1577258945Sroberto	    || sock->pending_recv > 0
1578258945Sroberto	    || sock->pending_send > 0
1579258945Sroberto	    || sock->pending_accept > 0
1580258945Sroberto	    || sock->references > 0
1581258945Sroberto	    || sock->pending_connect == 1
1582258945Sroberto	    || !ISC_LIST_EMPTY(sock->recv_list)
1583258945Sroberto	    || !ISC_LIST_EMPTY(sock->send_list)
1584258945Sroberto	    || !ISC_LIST_EMPTY(sock->accept_list)
1585258945Sroberto	    || sock->fd != INVALID_SOCKET) {
1586258945Sroberto		UNLOCK(&sock->lock);
1587258945Sroberto		return;
1588258945Sroberto	}
1589258945Sroberto	UNLOCK(&sock->lock);
1590258945Sroberto
1591258945Sroberto	free_socket(&sock, lineno);
1592258945Sroberto}
1593258945Sroberto
1594258945Srobertovoid
1595258945Srobertofree_socket(isc_socket_t **sockp, int lineno) {
1596258945Sroberto	isc_socketmgr_t *manager;
1597258945Sroberto	isc_socket_t *sock = *sockp;
1598258945Sroberto	*sockp = NULL;
1599258945Sroberto
1600258945Sroberto	manager = sock->manager;
1601258945Sroberto
1602258945Sroberto	/*
1603258945Sroberto	 * Seems we can free the socket after all.
1604258945Sroberto	 */
1605258945Sroberto	manager = sock->manager;
1606258945Sroberto	socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1607258945Sroberto		   ISC_MSG_DESTROYING, "freeing socket line %d fd %d lock %p semaphore %p",
1608258945Sroberto		   lineno, sock->fd, &sock->lock, sock->lock.LockSemaphore);
1609258945Sroberto
1610258945Sroberto	sock->magic = 0;
1611258945Sroberto	DESTROYLOCK(&sock->lock);
1612258945Sroberto
1613258945Sroberto	if (sock->recvbuf.base != NULL)
1614258945Sroberto		isc_mem_put(manager->mctx, sock->recvbuf.base, sock->recvbuf.len);
1615258945Sroberto
1616258945Sroberto	LOCK(&manager->lock);
1617258945Sroberto	if (ISC_LINK_LINKED(sock, link))
1618258945Sroberto		ISC_LIST_UNLINK(manager->socklist, sock, link);
1619258945Sroberto	isc_mem_put(manager->mctx, sock, sizeof(*sock));
1620258945Sroberto
1621258945Sroberto	if (ISC_LIST_EMPTY(manager->socklist))
1622258945Sroberto		SIGNAL(&manager->shutdown_ok);
1623258945Sroberto	UNLOCK(&manager->lock);
1624258945Sroberto}
1625258945Sroberto
1626258945Sroberto/*
1627258945Sroberto * Create a new 'type' socket managed by 'manager'.  Events
1628258945Sroberto * will be posted to 'task' and when dispatched 'action' will be
1629258945Sroberto * called with 'arg' as the arg value.  The new socket is returned
1630258945Sroberto * in 'socketp'.
1631258945Sroberto */
1632280849Scystatic isc_result_t
1633280849Scysocket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
1634280849Scy	      isc_socket_t **socketp, isc_socket_t *dup_socket)
1635280849Scy{
1636258945Sroberto	isc_socket_t *sock = NULL;
1637258945Sroberto	isc_result_t result;
1638258945Sroberto#if defined(USE_CMSG)
1639258945Sroberto	int on = 1;
1640258945Sroberto#endif
1641258945Sroberto#if defined(SO_RCVBUF)
1642258945Sroberto	ISC_SOCKADDR_LEN_T optlen;
1643258945Sroberto	int size;
1644258945Sroberto#endif
1645258945Sroberto	int socket_errno;
1646258945Sroberto	char strbuf[ISC_STRERRORSIZE];
1647258945Sroberto
1648258945Sroberto	REQUIRE(VALID_MANAGER(manager));
1649258945Sroberto	REQUIRE(socketp != NULL && *socketp == NULL);
1650258945Sroberto	REQUIRE(type != isc_sockettype_fdwatch);
1651258945Sroberto
1652280849Scy	if (dup_socket != NULL)
1653280849Scy		return (ISC_R_NOTIMPLEMENTED);
1654280849Scy
1655258945Sroberto	result = allocate_socket(manager, type, &sock);
1656258945Sroberto	if (result != ISC_R_SUCCESS)
1657258945Sroberto		return (result);
1658258945Sroberto
1659258945Sroberto	sock->pf = pf;
1660280849Scy#if 0
1661280849Scy	if (dup_socket == NULL) {
1662280849Scy#endif
1663280849Scy		switch (type) {
1664280849Scy		case isc_sockettype_udp:
1665280849Scy			sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP);
1666280849Scy			if (sock->fd != INVALID_SOCKET) {
1667280849Scy				result = connection_reset_fix(sock->fd);
1668280849Scy				if (result != ISC_R_SUCCESS) {
1669280849Scy					socket_log(__LINE__, sock,
1670280849Scy						NULL, EVENT, NULL, 0, 0,
1671280849Scy						"closed %d %d %d "
1672280849Scy						"con_reset_fix_failed",
1673280849Scy						sock->pending_recv,
1674280849Scy						sock->pending_send,
1675280849Scy						sock->references);
1676280849Scy					closesocket(sock->fd);
1677280849Scy					_set_state(sock, SOCK_CLOSED);
1678280849Scy					sock->fd = INVALID_SOCKET;
1679280849Scy					free_socket(&sock, __LINE__);
1680280849Scy					return (result);
1681280849Scy				}
1682258945Sroberto			}
1683280849Scy			break;
1684280849Scy		case isc_sockettype_tcp:
1685280849Scy			sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP);
1686280849Scy			break;
1687258945Sroberto		}
1688280849Scy#if 0
1689280849Scy	} else {
1690280849Scy		/*
1691280849Scy		 * XXX: dup() is deprecated in windows, use _dup()
1692280849Scy		 * instead.  In future we may want to investigate
1693280849Scy		 * WSADuplicateSocket().
1694280849Scy		 */
1695280849Scy		sock->fd = _dup(dup_socket->fd);
1696280849Scy		sock->dupped = 1;
1697280849Scy		sock->bound = dup_socket->bound;
1698258945Sroberto	}
1699280849Scy#endif
1700258945Sroberto
1701258945Sroberto	if (sock->fd == INVALID_SOCKET) {
1702258945Sroberto		socket_errno = WSAGetLastError();
1703258945Sroberto		free_socket(&sock, __LINE__);
1704258945Sroberto
1705258945Sroberto		switch (socket_errno) {
1706258945Sroberto		case WSAEMFILE:
1707258945Sroberto		case WSAENOBUFS:
1708258945Sroberto			return (ISC_R_NORESOURCES);
1709258945Sroberto
1710258945Sroberto		case WSAEPROTONOSUPPORT:
1711258945Sroberto		case WSAEPFNOSUPPORT:
1712258945Sroberto		case WSAEAFNOSUPPORT:
1713258945Sroberto			return (ISC_R_FAMILYNOSUPPORT);
1714258945Sroberto
1715258945Sroberto		default:
1716258945Sroberto			isc__strerror(socket_errno, strbuf, sizeof(strbuf));
1717258945Sroberto			UNEXPECTED_ERROR(__FILE__, __LINE__,
1718258945Sroberto					 "socket() %s: %s",
1719258945Sroberto					 isc_msgcat_get(isc_msgcat,
1720258945Sroberto							ISC_MSGSET_GENERAL,
1721258945Sroberto							ISC_MSG_FAILED,
1722258945Sroberto							"failed"),
1723258945Sroberto					 strbuf);
1724258945Sroberto			return (ISC_R_UNEXPECTED);
1725258945Sroberto		}
1726258945Sroberto	}
1727258945Sroberto
1728258945Sroberto	result = make_nonblock(sock->fd);
1729258945Sroberto	if (result != ISC_R_SUCCESS) {
1730258945Sroberto		socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1731258945Sroberto			"closed %d %d %d make_nonblock_failed",
1732258945Sroberto			sock->pending_recv, sock->pending_send,
1733258945Sroberto			sock->references);
1734258945Sroberto		closesocket(sock->fd);
1735258945Sroberto		sock->fd = INVALID_SOCKET;
1736258945Sroberto		free_socket(&sock, __LINE__);
1737258945Sroberto		return (result);
1738258945Sroberto	}
1739258945Sroberto
1740258945Sroberto
1741258945Sroberto#if defined(USE_CMSG) || defined(SO_RCVBUF)
1742258945Sroberto	if (type == isc_sockettype_udp) {
1743258945Sroberto
1744258945Sroberto#if defined(USE_CMSG)
1745258945Sroberto#if defined(ISC_PLATFORM_HAVEIPV6)
1746258945Sroberto#ifdef IPV6_RECVPKTINFO
1747258945Sroberto		/* 2292bis */
1748258945Sroberto		if ((pf == AF_INET6)
1749258945Sroberto		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1750280849Scy				   (char *)&on, sizeof(on)) < 0)) {
1751258945Sroberto			isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
1752258945Sroberto			UNEXPECTED_ERROR(__FILE__, __LINE__,
1753258945Sroberto					 "setsockopt(%d, IPV6_RECVPKTINFO) "
1754258945Sroberto					 "%s: %s", sock->fd,
1755258945Sroberto					 isc_msgcat_get(isc_msgcat,
1756258945Sroberto							ISC_MSGSET_GENERAL,
1757258945Sroberto							ISC_MSG_FAILED,
1758258945Sroberto							"failed"),
1759258945Sroberto					 strbuf);
1760258945Sroberto		}
1761258945Sroberto#else
1762258945Sroberto		/* 2292 */
1763258945Sroberto		if ((pf == AF_INET6)
1764258945Sroberto		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
1765280849Scy				   (char *)&on, sizeof(on)) < 0)) {
1766258945Sroberto			isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
1767258945Sroberto			UNEXPECTED_ERROR(__FILE__, __LINE__,
1768258945Sroberto					 "setsockopt(%d, IPV6_PKTINFO) %s: %s",
1769258945Sroberto					 sock->fd,
1770258945Sroberto					 isc_msgcat_get(isc_msgcat,
1771258945Sroberto							ISC_MSGSET_GENERAL,
1772258945Sroberto							ISC_MSG_FAILED,
1773258945Sroberto							"failed"),
1774258945Sroberto					 strbuf);
1775258945Sroberto		}
1776258945Sroberto#endif /* IPV6_RECVPKTINFO */
1777258945Sroberto#ifdef IPV6_USE_MIN_MTU	/*2292bis, not too common yet*/
1778258945Sroberto		/* use minimum MTU */
1779258945Sroberto		if (pf == AF_INET6) {
1780258945Sroberto			(void)setsockopt(sock->fd, IPPROTO_IPV6,
1781258945Sroberto					 IPV6_USE_MIN_MTU,
1782280849Scy					 (char *)&on, sizeof(on));
1783258945Sroberto		}
1784258945Sroberto#endif
1785258945Sroberto#endif /* ISC_PLATFORM_HAVEIPV6 */
1786258945Sroberto#endif /* defined(USE_CMSG) */
1787258945Sroberto
1788258945Sroberto#if defined(SO_RCVBUF)
1789258945Sroberto	       optlen = sizeof(size);
1790258945Sroberto	       if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
1791280849Scy			      (char *)&size, &optlen) >= 0 &&
1792258945Sroberto		    size < RCVBUFSIZE) {
1793258945Sroberto		       size = RCVBUFSIZE;
1794258945Sroberto		       (void)setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
1795280849Scy					(char *)&size, sizeof(size));
1796258945Sroberto	       }
1797258945Sroberto#endif
1798258945Sroberto
1799258945Sroberto	}
1800258945Sroberto#endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
1801258945Sroberto
1802258945Sroberto	_set_state(sock, SOCK_OPEN);
1803258945Sroberto	sock->references = 1;
1804258945Sroberto	*socketp = sock;
1805258945Sroberto
1806258945Sroberto	iocompletionport_update(sock);
1807258945Sroberto
1808258945Sroberto	/*
1809258945Sroberto	 * Note we don't have to lock the socket like we normally would because
1810258945Sroberto	 * there are no external references to it yet.
1811258945Sroberto	 */
1812258945Sroberto	LOCK(&manager->lock);
1813258945Sroberto	ISC_LIST_APPEND(manager->socklist, sock, link);
1814258945Sroberto	InterlockedIncrement(&manager->totalSockets);
1815258945Sroberto	UNLOCK(&manager->lock);
1816258945Sroberto
1817280849Scy	socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat,
1818280849Scy		   ISC_MSGSET_SOCKET, ISC_MSG_CREATED,
1819280849Scy		   "created %u type %u", sock->fd, type);
1820258945Sroberto
1821258945Sroberto	return (ISC_R_SUCCESS);
1822258945Sroberto}
1823258945Sroberto
1824258945Srobertoisc_result_t
1825280849Scyisc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
1826280849Scy		   isc_socket_t **socketp)
1827280849Scy{
1828280849Scy	return (socket_create(manager, pf, type, socketp, NULL));
1829280849Scy}
1830280849Scy
1831280849Scyisc_result_t
1832280849Scyisc__socket_dup(isc_socket_t *sock, isc_socket_t **socketp) {
1833280849Scy	REQUIRE(VALID_SOCKET(sock));
1834280849Scy	REQUIRE(socketp != NULL && *socketp == NULL);
1835280849Scy
1836280849Scy#if 1
1837280849Scy	return (ISC_R_NOTIMPLEMENTED);
1838280849Scy#else
1839280849Scy	return (socket_create(sock->manager, sock->pf, sock->type,
1840280849Scy			      socketp, sock));
1841280849Scy#endif
1842280849Scy}
1843280849Scy
1844280849Scyisc_result_t
1845258945Srobertoisc_socket_open(isc_socket_t *sock) {
1846258945Sroberto	REQUIRE(VALID_SOCKET(sock));
1847258945Sroberto	REQUIRE(sock->type != isc_sockettype_fdwatch);
1848258945Sroberto
1849258945Sroberto	return (ISC_R_NOTIMPLEMENTED);
1850258945Sroberto}
1851258945Sroberto
1852258945Sroberto/*
1853258945Sroberto * Attach to a socket.  Caller must explicitly detach when it is done.
1854258945Sroberto */
1855258945Srobertovoid
1856280849Scyisc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
1857258945Sroberto	REQUIRE(VALID_SOCKET(sock));
1858258945Sroberto	REQUIRE(socketp != NULL && *socketp == NULL);
1859258945Sroberto
1860258945Sroberto	LOCK(&sock->lock);
1861258945Sroberto	CONSISTENT(sock);
1862258945Sroberto	sock->references++;
1863258945Sroberto	UNLOCK(&sock->lock);
1864258945Sroberto
1865258945Sroberto	*socketp = sock;
1866258945Sroberto}
1867258945Sroberto
1868258945Sroberto/*
1869258945Sroberto * Dereference a socket.  If this is the last reference to it, clean things
1870258945Sroberto * up by destroying the socket.
1871258945Sroberto */
1872258945Srobertovoid
1873280849Scyisc__socket_detach(isc_socket_t **socketp) {
1874258945Sroberto	isc_socket_t *sock;
1875258945Sroberto	isc_boolean_t kill_socket = ISC_FALSE;
1876258945Sroberto
1877258945Sroberto	REQUIRE(socketp != NULL);
1878258945Sroberto	sock = *socketp;
1879258945Sroberto	REQUIRE(VALID_SOCKET(sock));
1880258945Sroberto	REQUIRE(sock->type != isc_sockettype_fdwatch);
1881258945Sroberto
1882258945Sroberto	LOCK(&sock->lock);
1883258945Sroberto	CONSISTENT(sock);
1884258945Sroberto	REQUIRE(sock->references > 0);
1885258945Sroberto	sock->references--;
1886258945Sroberto
1887258945Sroberto	socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1888258945Sroberto		"detach_socket %d %d %d",
1889258945Sroberto		sock->pending_recv, sock->pending_send,
1890258945Sroberto		sock->references);
1891258945Sroberto
1892258945Sroberto	if (sock->references == 0 && sock->fd != INVALID_SOCKET) {
1893258945Sroberto		closesocket(sock->fd);
1894258945Sroberto		sock->fd = INVALID_SOCKET;
1895258945Sroberto		_set_state(sock, SOCK_CLOSED);
1896258945Sroberto	}
1897258945Sroberto
1898258945Sroberto	maybe_free_socket(&sock, __LINE__);
1899258945Sroberto
1900258945Sroberto	*socketp = NULL;
1901258945Sroberto}
1902258945Sroberto
1903258945Srobertoisc_result_t
1904258945Srobertoisc_socket_close(isc_socket_t *sock) {
1905258945Sroberto	REQUIRE(VALID_SOCKET(sock));
1906258945Sroberto	REQUIRE(sock->type != isc_sockettype_fdwatch);
1907258945Sroberto
1908258945Sroberto	return (ISC_R_NOTIMPLEMENTED);
1909258945Sroberto}
1910258945Sroberto
1911258945Sroberto/*
1912258945Sroberto * Dequeue an item off the given socket's read queue, set the result code
1913258945Sroberto * in the done event to the one provided, and send it to the task it was
1914258945Sroberto * destined for.
1915258945Sroberto *
1916258945Sroberto * If the event to be sent is on a list, remove it before sending.  If
1917258945Sroberto * asked to, send and detach from the task as well.
1918258945Sroberto *
1919258945Sroberto * Caller must have the socket locked if the event is attached to the socket.
1920258945Sroberto */
1921258945Srobertostatic void
1922258945Srobertosend_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1923258945Sroberto	isc_task_t *task;
1924258945Sroberto
1925258945Sroberto	task = (*dev)->ev_sender;
1926258945Sroberto	(*dev)->ev_sender = sock;
1927258945Sroberto
1928258945Sroberto	if (ISC_LINK_LINKED(*dev, ev_link))
1929258945Sroberto		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1930258945Sroberto
1931258945Sroberto	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1932258945Sroberto	    == ISC_SOCKEVENTATTR_ATTACHED)
1933258945Sroberto		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1934258945Sroberto	else
1935258945Sroberto		isc_task_send(task, (isc_event_t **)dev);
1936258945Sroberto
1937258945Sroberto	CONSISTENT(sock);
1938258945Sroberto}
1939258945Sroberto
1940258945Sroberto/*
1941258945Sroberto * See comments for send_recvdone_event() above.
1942258945Sroberto */
1943258945Srobertostatic void
1944258945Srobertosend_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1945258945Sroberto	isc_task_t *task;
1946258945Sroberto
1947258945Sroberto	INSIST(dev != NULL && *dev != NULL);
1948258945Sroberto
1949258945Sroberto	task = (*dev)->ev_sender;
1950258945Sroberto	(*dev)->ev_sender = sock;
1951258945Sroberto
1952258945Sroberto	if (ISC_LINK_LINKED(*dev, ev_link))
1953258945Sroberto		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1954258945Sroberto
1955258945Sroberto	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1956258945Sroberto	    == ISC_SOCKEVENTATTR_ATTACHED)
1957258945Sroberto		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1958258945Sroberto	else
1959258945Sroberto		isc_task_send(task, (isc_event_t **)dev);
1960258945Sroberto
1961258945Sroberto	CONSISTENT(sock);
1962258945Sroberto}
1963258945Sroberto
1964258945Sroberto/*
1965258945Sroberto * See comments for send_recvdone_event() above.
1966258945Sroberto */
1967258945Srobertostatic void
1968258945Srobertosend_acceptdone_event(isc_socket_t *sock, isc_socket_newconnev_t **adev) {
1969258945Sroberto	isc_task_t *task;
1970258945Sroberto
1971258945Sroberto	INSIST(adev != NULL && *adev != NULL);
1972258945Sroberto
1973258945Sroberto	task = (*adev)->ev_sender;
1974258945Sroberto	(*adev)->ev_sender = sock;
1975258945Sroberto
1976258945Sroberto	if (ISC_LINK_LINKED(*adev, ev_link))
1977258945Sroberto		ISC_LIST_DEQUEUE(sock->accept_list, *adev, ev_link);
1978258945Sroberto
1979258945Sroberto	isc_task_sendanddetach(&task, (isc_event_t **)adev);
1980258945Sroberto
1981258945Sroberto	CONSISTENT(sock);
1982258945Sroberto}
1983258945Sroberto
1984258945Sroberto/*
1985258945Sroberto * See comments for send_recvdone_event() above.
1986258945Sroberto */
1987258945Srobertostatic void
1988258945Srobertosend_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **cdev) {
1989258945Sroberto	isc_task_t *task;
1990258945Sroberto
1991258945Sroberto	INSIST(cdev != NULL && *cdev != NULL);
1992258945Sroberto
1993258945Sroberto	task = (*cdev)->ev_sender;
1994258945Sroberto	(*cdev)->ev_sender = sock;
1995258945Sroberto
1996258945Sroberto	sock->connect_ev = NULL;
1997258945Sroberto
1998258945Sroberto	isc_task_sendanddetach(&task, (isc_event_t **)cdev);
1999258945Sroberto
2000258945Sroberto	CONSISTENT(sock);
2001258945Sroberto}
2002258945Sroberto
2003258945Sroberto/*
2004258945Sroberto * On entry to this function, the event delivered is the internal
2005258945Sroberto * readable event, and the first item on the accept_list should be
2006258945Sroberto * the done event we want to send.  If the list is empty, this is a no-op,
2007258945Sroberto * so just close the new connection, unlock, and return.
2008258945Sroberto *
2009258945Sroberto * Note the socket is locked before entering here
2010258945Sroberto */
2011258945Srobertostatic void
2012258945Srobertointernal_accept(isc_socket_t *sock, IoCompletionInfo *lpo, int accept_errno) {
2013258945Sroberto	isc_socket_newconnev_t *adev;
2014258945Sroberto	isc_result_t result = ISC_R_SUCCESS;
2015258945Sroberto	isc_socket_t *nsock;
2016258945Sroberto	struct sockaddr *localaddr;
2017258945Sroberto	int localaddr_len = sizeof(*localaddr);
2018258945Sroberto	struct sockaddr *remoteaddr;
2019258945Sroberto	int remoteaddr_len = sizeof(*remoteaddr);
2020258945Sroberto
2021258945Sroberto	INSIST(VALID_SOCKET(sock));
2022258945Sroberto	LOCK(&sock->lock);
2023258945Sroberto	CONSISTENT(sock);
2024258945Sroberto
2025258945Sroberto	socket_log(__LINE__, sock, NULL, TRACE,
2026258945Sroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2027258945Sroberto		   "internal_accept called");
2028258945Sroberto
2029258945Sroberto	INSIST(sock->listener);
2030258945Sroberto
2031258945Sroberto	INSIST(sock->pending_iocp > 0);
2032258945Sroberto	sock->pending_iocp--;
2033258945Sroberto	INSIST(sock->pending_accept > 0);
2034258945Sroberto	sock->pending_accept--;
2035258945Sroberto
2036258945Sroberto	adev = lpo->adev;
2037258945Sroberto
2038258945Sroberto	/*
2039258945Sroberto	 * If the event is no longer in the list we can just return.
2040258945Sroberto	 */
2041258945Sroberto	if (!acceptdone_is_active(sock, adev))
2042258945Sroberto		goto done;
2043258945Sroberto
2044258945Sroberto	nsock = adev->newsocket;
2045258945Sroberto
2046258945Sroberto	/*
2047258945Sroberto	 * Pull off the done event.
2048258945Sroberto	 */
2049258945Sroberto	ISC_LIST_UNLINK(sock->accept_list, adev, ev_link);
2050258945Sroberto
2051258945Sroberto	/*
2052258945Sroberto	 * Extract the addresses from the socket, copy them into the structure,
2053258945Sroberto	 * and return the new socket.
2054258945Sroberto	 */
2055258945Sroberto	ISCGetAcceptExSockaddrs(lpo->acceptbuffer, 0,
2056258945Sroberto		sizeof(SOCKADDR_STORAGE) + 16, sizeof(SOCKADDR_STORAGE) + 16,
2057258945Sroberto		(LPSOCKADDR *)&localaddr, &localaddr_len,
2058258945Sroberto		(LPSOCKADDR *)&remoteaddr, &remoteaddr_len);
2059258945Sroberto	memcpy(&adev->address.type, remoteaddr, remoteaddr_len);
2060258945Sroberto	adev->address.length = remoteaddr_len;
2061258945Sroberto	nsock->address = adev->address;
2062258945Sroberto	nsock->pf = adev->address.type.sa.sa_family;
2063258945Sroberto
2064258945Sroberto	socket_log(__LINE__, nsock, &nsock->address, TRACE,
2065258945Sroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2066258945Sroberto		   "internal_accept parent %p", sock);
2067258945Sroberto
2068258945Sroberto	result = make_nonblock(adev->newsocket->fd);
2069258945Sroberto	INSIST(result == ISC_R_SUCCESS);
2070258945Sroberto
2071258945Sroberto	INSIST(setsockopt(nsock->fd, SOL_SOCKET, SO_UPDATE_ACCEPT_CONTEXT,
2072280849Scy			  (char *)&sock->fd, sizeof(sock->fd)) == 0);
2073258945Sroberto
2074258945Sroberto	/*
2075258945Sroberto	 * Hook it up into the manager.
2076258945Sroberto	 */
2077258945Sroberto	nsock->bound = 1;
2078258945Sroberto	nsock->connected = 1;
2079258945Sroberto	_set_state(nsock, SOCK_OPEN);
2080258945Sroberto
2081258945Sroberto	LOCK(&nsock->manager->lock);
2082258945Sroberto	ISC_LIST_APPEND(nsock->manager->socklist, nsock, link);
2083258945Sroberto	InterlockedIncrement(&nsock->manager->totalSockets);
2084258945Sroberto	UNLOCK(&nsock->manager->lock);
2085258945Sroberto
2086258945Sroberto	socket_log(__LINE__, sock, &nsock->address, CREATION,
2087258945Sroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2088258945Sroberto		   "accepted_connection new_socket %p fd %d",
2089258945Sroberto		   nsock, nsock->fd);
2090258945Sroberto
2091258945Sroberto	adev->result = result;
2092258945Sroberto	send_acceptdone_event(sock, &adev);
2093258945Sroberto
2094258945Srobertodone:
2095258945Sroberto	CONSISTENT(sock);
2096258945Sroberto	UNLOCK(&sock->lock);
2097258945Sroberto
2098258945Sroberto	HeapFree(hHeapHandle, 0, lpo->acceptbuffer);
2099258945Sroberto	lpo->acceptbuffer = NULL;
2100258945Sroberto}
2101258945Sroberto
2102258945Sroberto/*
2103258945Sroberto * Called when a socket with a pending connect() finishes.
2104258945Sroberto * Note that the socket is locked before entering.
2105258945Sroberto */
2106258945Srobertostatic void
2107258945Srobertointernal_connect(isc_socket_t *sock, IoCompletionInfo *lpo, int connect_errno) {
2108258945Sroberto	isc_socket_connev_t *cdev;
2109258945Sroberto	char strbuf[ISC_STRERRORSIZE];
2110258945Sroberto
2111258945Sroberto	INSIST(VALID_SOCKET(sock));
2112258945Sroberto
2113258945Sroberto	LOCK(&sock->lock);
2114258945Sroberto
2115258945Sroberto	INSIST(sock->pending_iocp > 0);
2116258945Sroberto	sock->pending_iocp--;
2117258945Sroberto	INSIST(sock->pending_connect == 1);
2118258945Sroberto	sock->pending_connect = 0;
2119258945Sroberto
2120258945Sroberto	/*
2121258945Sroberto	 * Has this event been canceled?
2122258945Sroberto	 */
2123258945Sroberto	cdev = lpo->cdev;
2124258945Sroberto	if (!connectdone_is_active(sock, cdev)) {
2125258945Sroberto		sock->pending_connect = 0;
2126258945Sroberto		if (sock->fd != INVALID_SOCKET) {
2127258945Sroberto			closesocket(sock->fd);
2128258945Sroberto			sock->fd = INVALID_SOCKET;
2129258945Sroberto			_set_state(sock, SOCK_CLOSED);
2130258945Sroberto		}
2131258945Sroberto		CONSISTENT(sock);
2132258945Sroberto		UNLOCK(&sock->lock);
2133258945Sroberto		return;
2134258945Sroberto	}
2135258945Sroberto
2136258945Sroberto	/*
2137258945Sroberto	 * Check possible Windows network event error status here.
2138258945Sroberto	 */
2139258945Sroberto	if (connect_errno != 0) {
2140258945Sroberto		/*
2141258945Sroberto		 * If the error is SOFT, just try again on this
2142258945Sroberto		 * fd and pretend nothing strange happened.
2143258945Sroberto		 */
2144258945Sroberto		if (SOFT_ERROR(connect_errno) ||
2145258945Sroberto		    connect_errno == WSAEINPROGRESS) {
2146258945Sroberto			sock->pending_connect = 1;
2147258945Sroberto			CONSISTENT(sock);
2148258945Sroberto			UNLOCK(&sock->lock);
2149258945Sroberto			return;
2150258945Sroberto		}
2151258945Sroberto
2152258945Sroberto		/*
2153258945Sroberto		 * Translate other errors into ISC_R_* flavors.
2154258945Sroberto		 */
2155258945Sroberto		switch (connect_errno) {
2156258945Sroberto#define ERROR_MATCH(a, b) case a: cdev->result = b; break;
2157258945Sroberto			ERROR_MATCH(WSAEACCES, ISC_R_NOPERM);
2158258945Sroberto			ERROR_MATCH(WSAEADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2159258945Sroberto			ERROR_MATCH(WSAEAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2160258945Sroberto			ERROR_MATCH(WSAECONNREFUSED, ISC_R_CONNREFUSED);
2161258945Sroberto			ERROR_MATCH(WSAEHOSTUNREACH, ISC_R_HOSTUNREACH);
2162258945Sroberto			ERROR_MATCH(WSAEHOSTDOWN, ISC_R_HOSTDOWN);
2163258945Sroberto			ERROR_MATCH(WSAENETUNREACH, ISC_R_NETUNREACH);
2164258945Sroberto			ERROR_MATCH(WSAENETDOWN, ISC_R_NETDOWN);
2165258945Sroberto			ERROR_MATCH(WSAENOBUFS, ISC_R_NORESOURCES);
2166258945Sroberto			ERROR_MATCH(WSAECONNRESET, ISC_R_CONNECTIONRESET);
2167258945Sroberto			ERROR_MATCH(WSAECONNABORTED, ISC_R_CONNECTIONRESET);
2168258945Sroberto			ERROR_MATCH(WSAETIMEDOUT, ISC_R_TIMEDOUT);
2169258945Sroberto#undef ERROR_MATCH
2170258945Sroberto		default:
2171258945Sroberto			cdev->result = ISC_R_UNEXPECTED;
2172258945Sroberto			isc__strerror(connect_errno, strbuf, sizeof(strbuf));
2173258945Sroberto			UNEXPECTED_ERROR(__FILE__, __LINE__,
2174258945Sroberto					 "internal_connect: connect() %s",
2175258945Sroberto					 strbuf);
2176258945Sroberto		}
2177258945Sroberto	} else {
2178280849Scy		INSIST(setsockopt(sock->fd, SOL_SOCKET,
2179280849Scy				  SO_UPDATE_CONNECT_CONTEXT, NULL, 0) == 0);
2180258945Sroberto		cdev->result = ISC_R_SUCCESS;
2181258945Sroberto		sock->connected = 1;
2182258945Sroberto		socket_log(__LINE__, sock, &sock->address, IOEVENT,
2183258945Sroberto			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2184258945Sroberto			   "internal_connect: success");
2185258945Sroberto	}
2186258945Sroberto
2187258945Sroberto	send_connectdone_event(sock, &cdev);
2188258945Sroberto
2189258945Sroberto	UNLOCK(&sock->lock);
2190258945Sroberto}
2191258945Sroberto
2192258945Sroberto/*
2193258945Sroberto * Loop through the socket, returning ISC_R_EOF for each done event pending.
2194258945Sroberto */
2195258945Srobertostatic void
2196258945Srobertosend_recvdone_abort(isc_socket_t *sock, isc_result_t result) {
2197258945Sroberto	isc_socketevent_t *dev;
2198258945Sroberto
2199258945Sroberto	while (!ISC_LIST_EMPTY(sock->recv_list)) {
2200258945Sroberto		dev = ISC_LIST_HEAD(sock->recv_list);
2201258945Sroberto		dev->result = result;
2202258945Sroberto		send_recvdone_event(sock, &dev);
2203258945Sroberto	}
2204258945Sroberto}
2205258945Sroberto
2206258945Sroberto/*
2207258945Sroberto * Take the data we received in our private buffer, and if any recv() calls on
2208258945Sroberto * our list are satisfied, send the corresponding done event.
2209258945Sroberto *
2210258945Sroberto * If we need more data (there are still items on the recv_list after we consume all
2211258945Sroberto * our data) then arrange for another system recv() call to fill our buffers.
2212258945Sroberto */
2213258945Srobertostatic void
2214258945Srobertointernal_recv(isc_socket_t *sock, int nbytes)
2215258945Sroberto{
2216258945Sroberto	INSIST(VALID_SOCKET(sock));
2217258945Sroberto
2218258945Sroberto	LOCK(&sock->lock);
2219258945Sroberto	CONSISTENT(sock);
2220258945Sroberto
2221258945Sroberto	socket_log(__LINE__, sock, NULL, IOEVENT,
2222258945Sroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
2223258945Sroberto		   "internal_recv: %d bytes received", nbytes);
2224258945Sroberto
2225258945Sroberto	/*
2226258945Sroberto	 * If we got here, the I/O operation succeeded.  However, we might still have removed this
2227258945Sroberto	 * event from our notification list (or never placed it on it due to immediate completion.)
2228258945Sroberto	 * Handle the reference counting here, and handle the cancellation event just after.
2229258945Sroberto	 */
2230258945Sroberto	INSIST(sock->pending_iocp > 0);
2231258945Sroberto	sock->pending_iocp--;
2232258945Sroberto	INSIST(sock->pending_recv > 0);
2233258945Sroberto	sock->pending_recv--;
2234258945Sroberto
2235258945Sroberto	/*
2236258945Sroberto	 * The only way we could have gotten here is that our I/O has successfully completed.
2237258945Sroberto	 * Update our pointers, and move on.  The only odd case here is that we might not
2238258945Sroberto	 * have received enough data on a TCP stream to satisfy the minimum requirements.  If
2239258945Sroberto	 * this is the case, we will re-issue the recv() call for what we need.
2240258945Sroberto	 *
2241258945Sroberto	 * We do check for a recv() of 0 bytes on a TCP stream.  This means the remote end
2242258945Sroberto	 * has closed.
2243258945Sroberto	 */
2244258945Sroberto	if (nbytes == 0 && sock->type == isc_sockettype_tcp) {
2245258945Sroberto		send_recvdone_abort(sock, ISC_R_EOF);
2246258945Sroberto		maybe_free_socket(&sock, __LINE__);
2247258945Sroberto		return;
2248258945Sroberto	}
2249258945Sroberto	sock->recvbuf.remaining = nbytes;
2250258945Sroberto	sock->recvbuf.consume_position = sock->recvbuf.base;
2251258945Sroberto	completeio_recv(sock);
2252258945Sroberto
2253258945Sroberto	/*
2254258945Sroberto	 * If there are more receivers waiting for data, queue another receive
2255258945Sroberto	 * here.
2256258945Sroberto	 */
2257258945Sroberto	queue_receive_request(sock);
2258258945Sroberto
2259258945Sroberto	/*
2260258945Sroberto	 * Unlock and/or destroy if we are the last thing this socket has left to do.
2261258945Sroberto	 */
2262258945Sroberto	maybe_free_socket(&sock, __LINE__);
2263258945Sroberto}
2264258945Sroberto
2265258945Srobertostatic void
2266258945Srobertointernal_send(isc_socket_t *sock, isc_socketevent_t *dev,
2267258945Sroberto	      struct msghdr *messagehdr, int nbytes, int send_errno, IoCompletionInfo *lpo)
2268258945Sroberto{
2269258945Sroberto	buflist_t *buffer;
2270258945Sroberto
2271258945Sroberto	/*
2272258945Sroberto	 * Find out what socket this is and lock it.
2273258945Sroberto	 */
2274258945Sroberto	INSIST(VALID_SOCKET(sock));
2275258945Sroberto
2276258945Sroberto	LOCK(&sock->lock);
2277258945Sroberto	CONSISTENT(sock);
2278258945Sroberto
2279258945Sroberto	socket_log(__LINE__, sock, NULL, IOEVENT,
2280258945Sroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
2281258945Sroberto		   "internal_send: task got socket event %p", dev);
2282258945Sroberto
2283258945Sroberto	buffer = ISC_LIST_HEAD(lpo->bufferlist);
2284258945Sroberto	while (buffer != NULL) {
2285258945Sroberto		ISC_LIST_DEQUEUE(lpo->bufferlist, buffer, link);
2286258945Sroberto
2287258945Sroberto		socket_log(__LINE__, sock, NULL, TRACE,
2288258945Sroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2289258945Sroberto		   "free_buffer %p %p", buffer, buffer->buf);
2290258945Sroberto
2291258945Sroberto		HeapFree(hHeapHandle, 0, buffer->buf);
2292258945Sroberto		HeapFree(hHeapHandle, 0, buffer);
2293258945Sroberto		buffer = ISC_LIST_HEAD(lpo->bufferlist);
2294258945Sroberto	}
2295258945Sroberto
2296258945Sroberto	INSIST(sock->pending_iocp > 0);
2297258945Sroberto	sock->pending_iocp--;
2298258945Sroberto	INSIST(sock->pending_send > 0);
2299258945Sroberto	sock->pending_send--;
2300258945Sroberto
2301258945Sroberto	/* If the event is no longer in the list we can just return */
2302258945Sroberto	if (!senddone_is_active(sock, dev))
2303258945Sroberto		goto done;
2304258945Sroberto
2305258945Sroberto	/*
2306258945Sroberto	 * Set the error code and send things on its way.
2307258945Sroberto	 */
2308258945Sroberto	switch (completeio_send(sock, dev, messagehdr, nbytes, send_errno)) {
2309258945Sroberto	case DOIO_SOFT:
2310258945Sroberto		break;
2311258945Sroberto	case DOIO_HARD:
2312258945Sroberto	case DOIO_SUCCESS:
2313258945Sroberto		send_senddone_event(sock, &dev);
2314258945Sroberto		break;
2315258945Sroberto	}
2316258945Sroberto
2317258945Sroberto done:
2318258945Sroberto	maybe_free_socket(&sock, __LINE__);
2319258945Sroberto}
2320258945Sroberto
2321258945Sroberto/*
2322258945Sroberto * These return if the done event passed in is on the list (or for connect, is
2323258945Sroberto * the one we're waiting for.  Using these ensures we will not double-send an
2324258945Sroberto * event.
2325258945Sroberto */
2326258945Srobertostatic isc_boolean_t
2327258945Srobertosenddone_is_active(isc_socket_t *sock, isc_socketevent_t *dev)
2328258945Sroberto{
2329258945Sroberto	isc_socketevent_t *ldev;
2330258945Sroberto
2331258945Sroberto	ldev = ISC_LIST_HEAD(sock->send_list);
2332258945Sroberto	while (ldev != NULL && ldev != dev)
2333258945Sroberto		ldev = ISC_LIST_NEXT(ldev, ev_link);
2334258945Sroberto
2335258945Sroberto	return (ldev == NULL ? ISC_FALSE : ISC_TRUE);
2336258945Sroberto}
2337258945Sroberto
2338258945Srobertostatic isc_boolean_t
2339258945Srobertoacceptdone_is_active(isc_socket_t *sock, isc_socket_newconnev_t *dev)
2340258945Sroberto{
2341258945Sroberto	isc_socket_newconnev_t *ldev;
2342258945Sroberto
2343258945Sroberto	ldev = ISC_LIST_HEAD(sock->accept_list);
2344258945Sroberto	while (ldev != NULL && ldev != dev)
2345258945Sroberto		ldev = ISC_LIST_NEXT(ldev, ev_link);
2346258945Sroberto
2347258945Sroberto	return (ldev == NULL ? ISC_FALSE : ISC_TRUE);
2348258945Sroberto}
2349258945Sroberto
2350258945Srobertostatic isc_boolean_t
2351258945Srobertoconnectdone_is_active(isc_socket_t *sock, isc_socket_connev_t *dev)
2352258945Sroberto{
2353258945Sroberto	return (sock->connect_ev == dev ? ISC_TRUE : ISC_FALSE);
2354258945Sroberto}
2355258945Sroberto
2356280849Scy//
2357280849Scy// The Windows network stack seems to have two very distinct paths depending
2358280849Scy// on what is installed.  Specifically, if something is looking at network
2359280849Scy// connections (like an anti-virus or anti-malware application, such as
2360280849Scy// McAfee products) Windows may return additional error conditions which
2361280849Scy// were not previously returned.
2362280849Scy//
2363280849Scy// One specific one is when a TCP SYN scan is used.  In this situation,
2364280849Scy// Windows responds with the SYN-ACK, but the scanner never responds with
2365280849Scy// the 3rd packet, the ACK.  Windows consiers this a partially open connection.
2366280849Scy// Most Unix networking stacks, and Windows without McAfee installed, will
2367280849Scy// not return this to the caller.  However, with this product installed,
2368280849Scy// Windows returns this as a failed status on the Accept() call.  Here, we
2369280849Scy// will just re-issue the ISCAcceptEx() call as if nothing had happened.
2370280849Scy//
2371280849Scy// This code should only be called when the listening socket has received
2372280849Scy// such an error.  Additionally, the "parent" socket must be locked.
2373280849Scy// Additionally, the lpo argument is re-used here, and must not be freed
2374280849Scy// by the caller.
2375280849Scy//
2376280849Scystatic isc_result_t
2377280849Scyrestart_accept(isc_socket_t *parent, IoCompletionInfo *lpo)
2378280849Scy{
2379280849Scy	isc_socket_t *nsock = lpo->adev->newsocket;
2380280849Scy	SOCKET new_fd;
2381280849Scy
2382280849Scy	/*
2383280849Scy	 * AcceptEx() requires we pass in a socket.  Note that we carefully
2384280849Scy	 * do not close the previous socket in case of an error message returned by
2385280849Scy	 * our new socket() call.  If we return an error here, our caller will
2386280849Scy	 * clean up.
2387280849Scy	 */
2388280849Scy	new_fd = socket(parent->pf, SOCK_STREAM, IPPROTO_TCP);
2389280849Scy	if (nsock->fd == INVALID_SOCKET) {
2390280849Scy		return (ISC_R_FAILURE); // parent will ask windows for error message
2391280849Scy	}
2392280849Scy	closesocket(nsock->fd);
2393280849Scy	nsock->fd = new_fd;
2394280849Scy
2395280849Scy	memset(&lpo->overlapped, 0, sizeof(lpo->overlapped));
2396280849Scy
2397280849Scy	ISCAcceptEx(parent->fd,
2398280849Scy		    nsock->fd,				/* Accepted Socket */
2399280849Scy		    lpo->acceptbuffer,			/* Buffer for initial Recv */
2400280849Scy		    0,					/* Length of Buffer */
2401280849Scy		    sizeof(SOCKADDR_STORAGE) + 16,	/* Local address length + 16 */
2402280849Scy		    sizeof(SOCKADDR_STORAGE) + 16,	/* Remote address lengh + 16 */
2403280849Scy		    (LPDWORD)&lpo->received_bytes,	/* Bytes Recved */
2404280849Scy		    (LPOVERLAPPED)lpo			/* Overlapped structure */
2405280849Scy		    );
2406280849Scy
2407280849Scy	InterlockedDecrement(&nsock->manager->iocp_total);
2408280849Scy	iocompletionport_update(nsock);
2409280849Scy
2410280849Scy	return (ISC_R_SUCCESS);
2411280849Scy}
2412280849Scy
2413258945Sroberto/*
2414258945Sroberto * This is the I/O Completion Port Worker Function. It loops forever
2415258945Sroberto * waiting for I/O to complete and then forwards them for further
2416258945Sroberto * processing. There are a number of these in separate threads.
2417258945Sroberto */
2418258945Srobertostatic isc_threadresult_t WINAPI
2419258945SrobertoSocketIoThread(LPVOID ThreadContext) {
2420258945Sroberto	isc_socketmgr_t *manager = ThreadContext;
2421258945Sroberto	BOOL bSuccess = FALSE;
2422258945Sroberto	DWORD nbytes;
2423258945Sroberto	IoCompletionInfo *lpo = NULL;
2424258945Sroberto	isc_socket_t *sock = NULL;
2425258945Sroberto	int request;
2426258945Sroberto	struct msghdr *messagehdr = NULL;
2427258945Sroberto	int errval;
2428258945Sroberto	char strbuf[ISC_STRERRORSIZE];
2429258945Sroberto	int errstatus;
2430258945Sroberto
2431258945Sroberto	REQUIRE(VALID_MANAGER(manager));
2432258945Sroberto
2433258945Sroberto	/*
2434258945Sroberto	 * Set the thread priority high enough so I/O will
2435258945Sroberto	 * preempt normal recv packet processing, but not
2436258945Sroberto	 * higher than the timer sync thread.
2437258945Sroberto	 */
2438258945Sroberto	if (!SetThreadPriority(GetCurrentThread(),
2439258945Sroberto			       THREAD_PRIORITY_ABOVE_NORMAL)) {
2440258945Sroberto		errval = GetLastError();
2441258945Sroberto		isc__strerror(errval, strbuf, sizeof(strbuf));
2442258945Sroberto		FATAL_ERROR(__FILE__, __LINE__,
2443258945Sroberto				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2444258945Sroberto				ISC_MSG_FAILED,
2445258945Sroberto				"Can't set thread priority: %s"),
2446258945Sroberto				strbuf);
2447258945Sroberto	}
2448258945Sroberto
2449258945Sroberto	/*
2450258945Sroberto	 * Loop forever waiting on I/O Completions and then processing them
2451258945Sroberto	 */
2452258945Sroberto	while (TRUE) {
2453280849Scy		wait_again:
2454258945Sroberto		bSuccess = GetQueuedCompletionStatus(manager->hIoCompletionPort,
2455258945Sroberto						     &nbytes, (LPDWORD)&sock,
2456258945Sroberto						     (LPWSAOVERLAPPED *)&lpo,
2457258945Sroberto						     INFINITE);
2458258945Sroberto		if (lpo == NULL) /* Received request to exit */
2459258945Sroberto			break;
2460258945Sroberto
2461258945Sroberto		REQUIRE(VALID_SOCKET(sock));
2462258945Sroberto
2463258945Sroberto		request = lpo->request_type;
2464258945Sroberto
2465258945Sroberto		errstatus = 0;
2466258945Sroberto		if (!bSuccess) {
2467258945Sroberto			isc_result_t isc_result;
2468258945Sroberto
2469258945Sroberto			/*
2470258945Sroberto			 * Did the I/O operation complete?
2471258945Sroberto			 */
2472280849Scy			errstatus = GetLastError();
2473258945Sroberto			isc_result = isc__errno2resultx(errstatus, __FILE__, __LINE__);
2474258945Sroberto
2475258945Sroberto			LOCK(&sock->lock);
2476258945Sroberto			CONSISTENT(sock);
2477258945Sroberto			switch (request) {
2478258945Sroberto			case SOCKET_RECV:
2479258945Sroberto				INSIST(sock->pending_iocp > 0);
2480258945Sroberto				sock->pending_iocp--;
2481258945Sroberto				INSIST(sock->pending_recv > 0);
2482258945Sroberto				sock->pending_recv--;
2483280849Scy				if (!sock->connected &&
2484280849Scy				    ((errstatus == ERROR_HOST_UNREACHABLE) ||
2485280849Scy				     (errstatus == WSAENETRESET) ||
2486280849Scy				     (errstatus == WSAECONNRESET))) {
2487280849Scy					/* ignore soft errors */
2488280849Scy					queue_receive_request(sock);
2489280849Scy					break;
2490280849Scy				}
2491258945Sroberto				send_recvdone_abort(sock, isc_result);
2492258945Sroberto				if (isc_result == ISC_R_UNEXPECTED) {
2493258945Sroberto					UNEXPECTED_ERROR(__FILE__, __LINE__,
2494258945Sroberto						"SOCKET_RECV: Windows error code: %d, returning ISC error %d",
2495258945Sroberto						errstatus, isc_result);
2496258945Sroberto				}
2497258945Sroberto				break;
2498258945Sroberto
2499258945Sroberto			case SOCKET_SEND:
2500258945Sroberto				INSIST(sock->pending_iocp > 0);
2501258945Sroberto				sock->pending_iocp--;
2502258945Sroberto				INSIST(sock->pending_send > 0);
2503258945Sroberto				sock->pending_send--;
2504258945Sroberto				if (senddone_is_active(sock, lpo->dev)) {
2505258945Sroberto					lpo->dev->result = isc_result;
2506258945Sroberto					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2507258945Sroberto						"canceled_send");
2508258945Sroberto					send_senddone_event(sock, &lpo->dev);
2509258945Sroberto				}
2510258945Sroberto				break;
2511258945Sroberto
2512258945Sroberto			case SOCKET_ACCEPT:
2513258945Sroberto				INSIST(sock->pending_iocp > 0);
2514280849Scy				INSIST(sock->pending_accept > 0);
2515280849Scy
2516280849Scy				socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2517280849Scy					"Accept: errstatus=%d isc_result=%d", errstatus, isc_result);
2518280849Scy
2519280849Scy				if (acceptdone_is_active(sock, lpo->adev)) {
2520280849Scy					if (restart_accept(sock, lpo) == ISC_R_SUCCESS) {
2521280849Scy						UNLOCK(&sock->lock);
2522280849Scy						goto wait_again;
2523280849Scy					} else {
2524280849Scy						errstatus = GetLastError();
2525280849Scy						isc_result = isc__errno2resultx(errstatus, __FILE__, __LINE__);
2526280849Scy						socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2527280849Scy							"restart_accept() failed: errstatus=%d isc_result=%d",
2528280849Scy							errstatus, isc_result);
2529280849Scy					}
2530280849Scy				}
2531280849Scy
2532258945Sroberto				sock->pending_iocp--;
2533258945Sroberto				sock->pending_accept--;
2534258945Sroberto				if (acceptdone_is_active(sock, lpo->adev)) {
2535258945Sroberto					closesocket(lpo->adev->newsocket->fd);
2536258945Sroberto					lpo->adev->newsocket->fd = INVALID_SOCKET;
2537258945Sroberto					lpo->adev->newsocket->references--;
2538258945Sroberto					free_socket(&lpo->adev->newsocket, __LINE__);
2539258945Sroberto					lpo->adev->result = isc_result;
2540258945Sroberto					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2541258945Sroberto						"canceled_accept");
2542258945Sroberto					send_acceptdone_event(sock, &lpo->adev);
2543258945Sroberto				}
2544258945Sroberto				break;
2545258945Sroberto
2546258945Sroberto			case SOCKET_CONNECT:
2547258945Sroberto				INSIST(sock->pending_iocp > 0);
2548258945Sroberto				sock->pending_iocp--;
2549258945Sroberto				INSIST(sock->pending_connect == 1);
2550258945Sroberto				sock->pending_connect = 0;
2551258945Sroberto				if (connectdone_is_active(sock, lpo->cdev)) {
2552258945Sroberto					lpo->cdev->result = isc_result;
2553258945Sroberto					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2554258945Sroberto						"canceled_connect");
2555258945Sroberto					send_connectdone_event(sock, &lpo->cdev);
2556258945Sroberto				}
2557258945Sroberto				break;
2558258945Sroberto			}
2559258945Sroberto			maybe_free_socket(&sock, __LINE__);
2560258945Sroberto
2561258945Sroberto			if (lpo != NULL)
2562258945Sroberto				HeapFree(hHeapHandle, 0, lpo);
2563258945Sroberto			continue;
2564258945Sroberto		}
2565258945Sroberto
2566258945Sroberto		messagehdr = &lpo->messagehdr;
2567258945Sroberto
2568258945Sroberto		switch (request) {
2569258945Sroberto		case SOCKET_RECV:
2570258945Sroberto			internal_recv(sock, nbytes);
2571258945Sroberto			break;
2572258945Sroberto		case SOCKET_SEND:
2573258945Sroberto			internal_send(sock, lpo->dev, messagehdr, nbytes, errstatus, lpo);
2574258945Sroberto			break;
2575258945Sroberto		case SOCKET_ACCEPT:
2576258945Sroberto			internal_accept(sock, lpo, errstatus);
2577258945Sroberto			break;
2578258945Sroberto		case SOCKET_CONNECT:
2579258945Sroberto			internal_connect(sock, lpo, errstatus);
2580258945Sroberto			break;
2581258945Sroberto		}
2582258945Sroberto
2583258945Sroberto		if (lpo != NULL)
2584258945Sroberto			HeapFree(hHeapHandle, 0, lpo);
2585258945Sroberto	}
2586258945Sroberto
2587258945Sroberto	/*
2588258945Sroberto	 * Exit Completion Port Thread
2589258945Sroberto	 */
2590258945Sroberto	manager_log(manager, TRACE,
2591258945Sroberto		    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2592258945Sroberto				   ISC_MSG_EXITING, "SocketIoThread exiting"));
2593258945Sroberto	return ((isc_threadresult_t)0);
2594258945Sroberto}
2595258945Sroberto
2596258945Sroberto/*
2597258945Sroberto * Create a new socket manager.
2598258945Sroberto */
2599258945Srobertoisc_result_t
2600280849Scyisc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
2601258945Sroberto	return (isc_socketmgr_create2(mctx, managerp, 0));
2602258945Sroberto}
2603258945Sroberto
2604258945Srobertoisc_result_t
2605280849Scyisc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
2606280849Scy		       unsigned int maxsocks)
2607258945Sroberto{
2608258945Sroberto	isc_socketmgr_t *manager;
2609258945Sroberto	isc_result_t result;
2610258945Sroberto
2611258945Sroberto	REQUIRE(managerp != NULL && *managerp == NULL);
2612258945Sroberto
2613258945Sroberto	if (maxsocks != 0)
2614258945Sroberto		return (ISC_R_NOTIMPLEMENTED);
2615258945Sroberto
2616258945Sroberto	manager = isc_mem_get(mctx, sizeof(*manager));
2617258945Sroberto	if (manager == NULL)
2618258945Sroberto		return (ISC_R_NOMEMORY);
2619258945Sroberto
2620258945Sroberto	InitSockets();
2621258945Sroberto
2622258945Sroberto	manager->magic = SOCKET_MANAGER_MAGIC;
2623258945Sroberto	manager->mctx = NULL;
2624258945Sroberto	manager->stats = NULL;
2625258945Sroberto	ISC_LIST_INIT(manager->socklist);
2626258945Sroberto	result = isc_mutex_init(&manager->lock);
2627258945Sroberto	if (result != ISC_R_SUCCESS) {
2628258945Sroberto		isc_mem_put(mctx, manager, sizeof(*manager));
2629258945Sroberto		return (result);
2630258945Sroberto	}
2631258945Sroberto	if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
2632258945Sroberto		DESTROYLOCK(&manager->lock);
2633258945Sroberto		isc_mem_put(mctx, manager, sizeof(*manager));
2634258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__,
2635258945Sroberto				 "isc_condition_init() %s",
2636258945Sroberto				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2637258945Sroberto						ISC_MSG_FAILED, "failed"));
2638258945Sroberto		return (ISC_R_UNEXPECTED);
2639258945Sroberto	}
2640258945Sroberto
2641258945Sroberto	isc_mem_attach(mctx, &manager->mctx);
2642258945Sroberto
2643258945Sroberto	iocompletionport_init(manager);	/* Create the Completion Ports */
2644258945Sroberto
2645258945Sroberto	manager->bShutdown = ISC_FALSE;
2646258945Sroberto	manager->totalSockets = 0;
2647258945Sroberto	manager->iocp_total = 0;
2648258945Sroberto
2649258945Sroberto	*managerp = manager;
2650258945Sroberto
2651258945Sroberto	return (ISC_R_SUCCESS);
2652258945Sroberto}
2653258945Sroberto
2654258945Srobertoisc_result_t
2655280849Scyisc__socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
2656258945Sroberto	REQUIRE(VALID_MANAGER(manager));
2657258945Sroberto	REQUIRE(nsockp != NULL);
2658258945Sroberto
2659258945Sroberto	return (ISC_R_NOTIMPLEMENTED);
2660258945Sroberto}
2661258945Sroberto
2662258945Srobertovoid
2663280849Scyisc__socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
2664258945Sroberto	REQUIRE(VALID_MANAGER(manager));
2665258945Sroberto	REQUIRE(ISC_LIST_EMPTY(manager->socklist));
2666258945Sroberto	REQUIRE(manager->stats == NULL);
2667258945Sroberto	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
2668258945Sroberto
2669258945Sroberto	isc_stats_attach(stats, &manager->stats);
2670258945Sroberto}
2671258945Sroberto
2672258945Srobertovoid
2673280849Scyisc__socketmgr_destroy(isc_socketmgr_t **managerp) {
2674258945Sroberto	isc_socketmgr_t *manager;
2675258945Sroberto	int i;
2676258945Sroberto	isc_mem_t *mctx;
2677258945Sroberto
2678258945Sroberto	/*
2679258945Sroberto	 * Destroy a socket manager.
2680258945Sroberto	 */
2681258945Sroberto
2682258945Sroberto	REQUIRE(managerp != NULL);
2683258945Sroberto	manager = *managerp;
2684258945Sroberto	REQUIRE(VALID_MANAGER(manager));
2685258945Sroberto
2686258945Sroberto	LOCK(&manager->lock);
2687258945Sroberto
2688258945Sroberto	/*
2689258945Sroberto	 * Wait for all sockets to be destroyed.
2690258945Sroberto	 */
2691258945Sroberto	while (!ISC_LIST_EMPTY(manager->socklist)) {
2692258945Sroberto		manager_log(manager, CREATION,
2693258945Sroberto			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2694258945Sroberto					   ISC_MSG_SOCKETSREMAIN,
2695258945Sroberto					   "sockets exist"));
2696258945Sroberto		WAIT(&manager->shutdown_ok, &manager->lock);
2697258945Sroberto	}
2698258945Sroberto
2699258945Sroberto	UNLOCK(&manager->lock);
2700258945Sroberto
2701258945Sroberto	/*
2702258945Sroberto	 * Here, we need to had some wait code for the completion port
2703258945Sroberto	 * thread.
2704258945Sroberto	 */
2705258945Sroberto	signal_iocompletionport_exit(manager);
2706258945Sroberto	manager->bShutdown = ISC_TRUE;
2707258945Sroberto
2708258945Sroberto	/*
2709258945Sroberto	 * Wait for threads to exit.
2710258945Sroberto	 */
2711258945Sroberto	for (i = 0; i < manager->maxIOCPThreads; i++) {
2712258945Sroberto		if (isc_thread_join((isc_thread_t) manager->hIOCPThreads[i],
2713258945Sroberto			NULL) != ISC_R_SUCCESS)
2714258945Sroberto			UNEXPECTED_ERROR(__FILE__, __LINE__,
2715258945Sroberto				 "isc_thread_join() for Completion Port %s",
2716258945Sroberto				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2717258945Sroberto						ISC_MSG_FAILED, "failed"));
2718258945Sroberto	}
2719258945Sroberto	/*
2720258945Sroberto	 * Clean up.
2721258945Sroberto	 */
2722258945Sroberto
2723258945Sroberto	CloseHandle(manager->hIoCompletionPort);
2724258945Sroberto
2725258945Sroberto	(void)isc_condition_destroy(&manager->shutdown_ok);
2726258945Sroberto
2727258945Sroberto	DESTROYLOCK(&manager->lock);
2728258945Sroberto	if (manager->stats != NULL)
2729258945Sroberto		isc_stats_detach(&manager->stats);
2730258945Sroberto	manager->magic = 0;
2731258945Sroberto	mctx= manager->mctx;
2732258945Sroberto	isc_mem_put(mctx, manager, sizeof(*manager));
2733258945Sroberto
2734258945Sroberto	isc_mem_detach(&mctx);
2735258945Sroberto
2736258945Sroberto	*managerp = NULL;
2737258945Sroberto}
2738258945Sroberto
2739258945Srobertostatic void
2740258945Srobertoqueue_receive_event(isc_socket_t *sock, isc_task_t *task, isc_socketevent_t *dev)
2741258945Sroberto{
2742258945Sroberto	isc_task_t *ntask = NULL;
2743258945Sroberto
2744258945Sroberto	isc_task_attach(task, &ntask);
2745258945Sroberto	dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2746258945Sroberto
2747258945Sroberto	/*
2748258945Sroberto	 * Enqueue the request.
2749258945Sroberto	 */
2750258945Sroberto	INSIST(!ISC_LINK_LINKED(dev, ev_link));
2751258945Sroberto	ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2752258945Sroberto
2753258945Sroberto	socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2754258945Sroberto		   "queue_receive_event: event %p -> task %p",
2755258945Sroberto		   dev, ntask);
2756258945Sroberto}
2757258945Sroberto
2758258945Sroberto/*
2759258945Sroberto * Check the pending receive queue, and if we have data pending, give it to this
2760258945Sroberto * caller.  If we have none, queue an I/O request.  If this caller is not the first
2761258945Sroberto * on the list, then we will just queue this event and return.
2762258945Sroberto *
2763258945Sroberto * Caller must have the socket locked.
2764258945Sroberto */
2765258945Srobertostatic isc_result_t
2766258945Srobertosocket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2767258945Sroberto	    unsigned int flags)
2768258945Sroberto{
2769258945Sroberto	int cc = 0;
2770258945Sroberto	isc_task_t *ntask = NULL;
2771258945Sroberto	isc_result_t result = ISC_R_SUCCESS;
2772258945Sroberto	int recv_errno = 0;
2773258945Sroberto
2774258945Sroberto	dev->ev_sender = task;
2775258945Sroberto
2776258945Sroberto	if (sock->fd == INVALID_SOCKET)
2777258945Sroberto		return (ISC_R_EOF);
2778258945Sroberto
2779258945Sroberto	/*
2780258945Sroberto	 * Queue our event on the list of things to do.  Call our function to
2781258945Sroberto	 * attempt to fill buffers as much as possible, and return done events.
2782258945Sroberto	 * We are going to lie about our handling of the ISC_SOCKFLAG_IMMEDIATE
2783258945Sroberto	 * here and tell our caller that we could not satisfy it immediately.
2784258945Sroberto	 */
2785258945Sroberto	queue_receive_event(sock, task, dev);
2786258945Sroberto	if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2787258945Sroberto		result = ISC_R_INPROGRESS;
2788258945Sroberto
2789258945Sroberto	completeio_recv(sock);
2790258945Sroberto
2791258945Sroberto	/*
2792258945Sroberto	 * If there are more receivers waiting for data, queue another receive
2793258945Sroberto	 * here.  If the
2794258945Sroberto	 */
2795258945Sroberto	queue_receive_request(sock);
2796258945Sroberto
2797258945Sroberto	return (result);
2798258945Sroberto}
2799258945Sroberto
2800258945Srobertoisc_result_t
2801280849Scyisc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2802258945Sroberto		 unsigned int minimum, isc_task_t *task,
2803258945Sroberto		 isc_taskaction_t action, const void *arg)
2804258945Sroberto{
2805258945Sroberto	isc_socketevent_t *dev;
2806258945Sroberto	isc_socketmgr_t *manager;
2807258945Sroberto	unsigned int iocount;
2808258945Sroberto	isc_buffer_t *buffer;
2809258945Sroberto	isc_result_t ret;
2810258945Sroberto
2811258945Sroberto	REQUIRE(VALID_SOCKET(sock));
2812258945Sroberto	LOCK(&sock->lock);
2813258945Sroberto	CONSISTENT(sock);
2814258945Sroberto
2815258945Sroberto	/*
2816258945Sroberto	 * Make sure that the socket is not closed.  XXXMLG change error here?
2817258945Sroberto	 */
2818258945Sroberto	if (sock->fd == INVALID_SOCKET) {
2819258945Sroberto		UNLOCK(&sock->lock);
2820258945Sroberto		return (ISC_R_CONNREFUSED);
2821258945Sroberto	}
2822258945Sroberto
2823258945Sroberto	REQUIRE(buflist != NULL);
2824258945Sroberto	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2825258945Sroberto	REQUIRE(task != NULL);
2826258945Sroberto	REQUIRE(action != NULL);
2827258945Sroberto
2828258945Sroberto	manager = sock->manager;
2829258945Sroberto	REQUIRE(VALID_MANAGER(manager));
2830258945Sroberto
2831258945Sroberto	iocount = isc_bufferlist_availablecount(buflist);
2832258945Sroberto	REQUIRE(iocount > 0);
2833258945Sroberto
2834258945Sroberto	INSIST(sock->bound);
2835258945Sroberto
2836258945Sroberto	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2837258945Sroberto	if (dev == NULL) {
2838258945Sroberto		UNLOCK(&sock->lock);
2839258945Sroberto		return (ISC_R_NOMEMORY);
2840258945Sroberto	}
2841258945Sroberto
2842258945Sroberto	/*
2843258945Sroberto	 * UDP sockets are always partial read
2844258945Sroberto	 */
2845258945Sroberto	if (sock->type == isc_sockettype_udp)
2846258945Sroberto		dev->minimum = 1;
2847258945Sroberto	else {
2848258945Sroberto		if (minimum == 0)
2849258945Sroberto			dev->minimum = iocount;
2850258945Sroberto		else
2851258945Sroberto			dev->minimum = minimum;
2852258945Sroberto	}
2853258945Sroberto
2854258945Sroberto	/*
2855258945Sroberto	 * Move each buffer from the passed in list to our internal one.
2856258945Sroberto	 */
2857258945Sroberto	buffer = ISC_LIST_HEAD(*buflist);
2858258945Sroberto	while (buffer != NULL) {
2859258945Sroberto		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2860258945Sroberto		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2861258945Sroberto		buffer = ISC_LIST_HEAD(*buflist);
2862258945Sroberto	}
2863258945Sroberto
2864258945Sroberto	ret = socket_recv(sock, dev, task, 0);
2865258945Sroberto
2866258945Sroberto	UNLOCK(&sock->lock);
2867258945Sroberto	return (ret);
2868258945Sroberto}
2869258945Sroberto
2870258945Srobertoisc_result_t
2871280849Scyisc__socket_recv(isc_socket_t *sock, isc_region_t *region,
2872280849Scy		 unsigned int minimum, isc_task_t *task,
2873280849Scy		 isc_taskaction_t action, const void *arg)
2874258945Sroberto{
2875258945Sroberto	isc_socketevent_t *dev;
2876258945Sroberto	isc_socketmgr_t *manager;
2877258945Sroberto	isc_result_t ret;
2878258945Sroberto
2879258945Sroberto	REQUIRE(VALID_SOCKET(sock));
2880258945Sroberto	LOCK(&sock->lock);
2881258945Sroberto	CONSISTENT(sock);
2882258945Sroberto
2883258945Sroberto	/*
2884258945Sroberto	 * make sure that the socket's not closed
2885258945Sroberto	 */
2886258945Sroberto	if (sock->fd == INVALID_SOCKET) {
2887258945Sroberto		UNLOCK(&sock->lock);
2888258945Sroberto		return (ISC_R_CONNREFUSED);
2889258945Sroberto	}
2890258945Sroberto	REQUIRE(action != NULL);
2891258945Sroberto
2892258945Sroberto	manager = sock->manager;
2893258945Sroberto	REQUIRE(VALID_MANAGER(manager));
2894258945Sroberto
2895258945Sroberto	INSIST(sock->bound);
2896258945Sroberto
2897258945Sroberto	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2898258945Sroberto	if (dev == NULL) {
2899258945Sroberto		UNLOCK(&sock->lock);
2900258945Sroberto		return (ISC_R_NOMEMORY);
2901258945Sroberto	}
2902258945Sroberto
2903258945Sroberto	ret = isc_socket_recv2(sock, region, minimum, task, dev, 0);
2904258945Sroberto	UNLOCK(&sock->lock);
2905258945Sroberto	return (ret);
2906258945Sroberto}
2907258945Sroberto
2908258945Srobertoisc_result_t
2909280849Scyisc__socket_recv2(isc_socket_t *sock, isc_region_t *region,
2910280849Scy		  unsigned int minimum, isc_task_t *task,
2911280849Scy		  isc_socketevent_t *event, unsigned int flags)
2912258945Sroberto{
2913258945Sroberto	isc_result_t ret;
2914258945Sroberto
2915258945Sroberto	REQUIRE(VALID_SOCKET(sock));
2916258945Sroberto	LOCK(&sock->lock);
2917258945Sroberto	CONSISTENT(sock);
2918258945Sroberto
2919258945Sroberto	event->result = ISC_R_UNEXPECTED;
2920258945Sroberto	event->ev_sender = sock;
2921258945Sroberto	/*
2922258945Sroberto	 * make sure that the socket's not closed
2923258945Sroberto	 */
2924258945Sroberto	if (sock->fd == INVALID_SOCKET) {
2925258945Sroberto		UNLOCK(&sock->lock);
2926258945Sroberto		return (ISC_R_CONNREFUSED);
2927258945Sroberto	}
2928258945Sroberto
2929258945Sroberto	ISC_LIST_INIT(event->bufferlist);
2930258945Sroberto	event->region = *region;
2931258945Sroberto	event->n = 0;
2932258945Sroberto	event->offset = 0;
2933258945Sroberto	event->attributes = 0;
2934258945Sroberto
2935258945Sroberto	/*
2936258945Sroberto	 * UDP sockets are always partial read.
2937258945Sroberto	 */
2938258945Sroberto	if (sock->type == isc_sockettype_udp)
2939258945Sroberto		event->minimum = 1;
2940258945Sroberto	else {
2941258945Sroberto		if (minimum == 0)
2942258945Sroberto			event->minimum = region->length;
2943258945Sroberto		else
2944258945Sroberto			event->minimum = minimum;
2945258945Sroberto	}
2946258945Sroberto
2947258945Sroberto	ret = socket_recv(sock, event, task, flags);
2948258945Sroberto	UNLOCK(&sock->lock);
2949258945Sroberto	return (ret);
2950258945Sroberto}
2951258945Sroberto
2952258945Sroberto/*
2953258945Sroberto * Caller must have the socket locked.
2954258945Sroberto */
2955258945Srobertostatic isc_result_t
2956258945Srobertosocket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2957258945Sroberto	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2958258945Sroberto	    unsigned int flags)
2959258945Sroberto{
2960258945Sroberto	int io_state;
2961258945Sroberto	int send_errno = 0;
2962258945Sroberto	int cc = 0;
2963258945Sroberto	isc_task_t *ntask = NULL;
2964258945Sroberto	isc_result_t result = ISC_R_SUCCESS;
2965258945Sroberto
2966258945Sroberto	dev->ev_sender = task;
2967258945Sroberto
2968258945Sroberto	set_dev_address(address, sock, dev);
2969258945Sroberto	if (pktinfo != NULL) {
2970258945Sroberto		socket_log(__LINE__, sock, NULL, TRACE, isc_msgcat, ISC_MSGSET_SOCKET,
2971258945Sroberto			   ISC_MSG_PKTINFOPROVIDED,
2972258945Sroberto			   "pktinfo structure provided, ifindex %u (set to 0)",
2973258945Sroberto			   pktinfo->ipi6_ifindex);
2974258945Sroberto
2975258945Sroberto		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2976258945Sroberto		dev->pktinfo = *pktinfo;
2977258945Sroberto		/*
2978258945Sroberto		 * Set the pktinfo index to 0 here, to let the kernel decide
2979258945Sroberto		 * what interface it should send on.
2980258945Sroberto		 */
2981258945Sroberto		dev->pktinfo.ipi6_ifindex = 0;
2982258945Sroberto	}
2983258945Sroberto
2984258945Sroberto	io_state = startio_send(sock, dev, &cc, &send_errno);
2985258945Sroberto	switch (io_state) {
2986258945Sroberto	case DOIO_PENDING:	/* I/O started. Nothing more to do */
2987258945Sroberto	case DOIO_SOFT:
2988258945Sroberto		/*
2989258945Sroberto		 * We couldn't send all or part of the request right now, so
2990258945Sroberto		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
2991258945Sroberto		 */
2992258945Sroberto		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2993258945Sroberto			isc_task_attach(task, &ntask);
2994258945Sroberto			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2995258945Sroberto
2996258945Sroberto			/*
2997258945Sroberto			 * Enqueue the request.
2998258945Sroberto			 */
2999258945Sroberto			INSIST(!ISC_LINK_LINKED(dev, ev_link));
3000258945Sroberto			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
3001258945Sroberto
3002258945Sroberto			socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
3003258945Sroberto				   "socket_send: event %p -> task %p",
3004258945Sroberto				   dev, ntask);
3005258945Sroberto
3006258945Sroberto			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
3007258945Sroberto				result = ISC_R_INPROGRESS;
3008258945Sroberto			break;
3009258945Sroberto		}
3010258945Sroberto
3011258945Sroberto	case DOIO_SUCCESS:
3012258945Sroberto		break;
3013258945Sroberto	}
3014258945Sroberto
3015258945Sroberto	return (result);
3016258945Sroberto}
3017258945Sroberto
3018258945Srobertoisc_result_t
3019280849Scyisc__socket_send(isc_socket_t *sock, isc_region_t *region,
3020280849Scy		 isc_task_t *task, isc_taskaction_t action, const void *arg)
3021258945Sroberto{
3022258945Sroberto	/*
3023258945Sroberto	 * REQUIRE() checking is performed in isc_socket_sendto().
3024258945Sroberto	 */
3025258945Sroberto	return (isc_socket_sendto(sock, region, task, action, arg, NULL,
3026258945Sroberto				  NULL));
3027258945Sroberto}
3028258945Sroberto
3029258945Srobertoisc_result_t
3030280849Scyisc__socket_sendto(isc_socket_t *sock, isc_region_t *region,
3031280849Scy		   isc_task_t *task, isc_taskaction_t action, const void *arg,
3032280849Scy		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
3033258945Sroberto{
3034258945Sroberto	isc_socketevent_t *dev;
3035258945Sroberto	isc_socketmgr_t *manager;
3036258945Sroberto	isc_result_t ret;
3037258945Sroberto
3038258945Sroberto	REQUIRE(VALID_SOCKET(sock));
3039258945Sroberto	REQUIRE(sock->type != isc_sockettype_fdwatch);
3040258945Sroberto
3041258945Sroberto	LOCK(&sock->lock);
3042258945Sroberto	CONSISTENT(sock);
3043258945Sroberto
3044258945Sroberto	/*
3045258945Sroberto	 * make sure that the socket's not closed
3046258945Sroberto	 */
3047258945Sroberto	if (sock->fd == INVALID_SOCKET) {
3048258945Sroberto		UNLOCK(&sock->lock);
3049258945Sroberto		return (ISC_R_CONNREFUSED);
3050258945Sroberto	}
3051258945Sroberto	REQUIRE(region != NULL);
3052258945Sroberto	REQUIRE(task != NULL);
3053258945Sroberto	REQUIRE(action != NULL);
3054258945Sroberto
3055258945Sroberto	manager = sock->manager;
3056258945Sroberto	REQUIRE(VALID_MANAGER(manager));
3057258945Sroberto
3058258945Sroberto	INSIST(sock->bound);
3059258945Sroberto
3060258945Sroberto	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
3061258945Sroberto	if (dev == NULL) {
3062258945Sroberto		UNLOCK(&sock->lock);
3063258945Sroberto		return (ISC_R_NOMEMORY);
3064258945Sroberto	}
3065258945Sroberto	dev->region = *region;
3066258945Sroberto
3067258945Sroberto	ret = socket_send(sock, dev, task, address, pktinfo, 0);
3068258945Sroberto	UNLOCK(&sock->lock);
3069258945Sroberto	return (ret);
3070258945Sroberto}
3071258945Sroberto
3072258945Srobertoisc_result_t
3073280849Scyisc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
3074280849Scy		  isc_task_t *task, isc_taskaction_t action, const void *arg)
3075258945Sroberto{
3076258945Sroberto	return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
3077258945Sroberto				   NULL));
3078258945Sroberto}
3079258945Sroberto
3080258945Srobertoisc_result_t
3081280849Scyisc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
3082280849Scy		    isc_task_t *task, isc_taskaction_t action, const void *arg,
3083280849Scy		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
3084258945Sroberto{
3085258945Sroberto	isc_socketevent_t *dev;
3086258945Sroberto	isc_socketmgr_t *manager;
3087258945Sroberto	unsigned int iocount;
3088258945Sroberto	isc_buffer_t *buffer;
3089258945Sroberto	isc_result_t ret;
3090258945Sroberto
3091258945Sroberto	REQUIRE(VALID_SOCKET(sock));
3092258945Sroberto
3093258945Sroberto	LOCK(&sock->lock);
3094258945Sroberto	CONSISTENT(sock);
3095258945Sroberto
3096258945Sroberto	/*
3097258945Sroberto	 * make sure that the socket's not closed
3098258945Sroberto	 */
3099258945Sroberto	if (sock->fd == INVALID_SOCKET) {
3100258945Sroberto		UNLOCK(&sock->lock);
3101258945Sroberto		return (ISC_R_CONNREFUSED);
3102258945Sroberto	}
3103258945Sroberto	REQUIRE(buflist != NULL);
3104258945Sroberto	REQUIRE(!ISC_LIST_EMPTY(*buflist));
3105258945Sroberto	REQUIRE(task != NULL);
3106258945Sroberto	REQUIRE(action != NULL);
3107258945Sroberto
3108258945Sroberto	manager = sock->manager;
3109258945Sroberto	REQUIRE(VALID_MANAGER(manager));
3110258945Sroberto
3111258945Sroberto	iocount = isc_bufferlist_usedcount(buflist);
3112258945Sroberto	REQUIRE(iocount > 0);
3113258945Sroberto
3114258945Sroberto	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
3115258945Sroberto	if (dev == NULL) {
3116258945Sroberto		UNLOCK(&sock->lock);
3117258945Sroberto		return (ISC_R_NOMEMORY);
3118258945Sroberto	}
3119258945Sroberto
3120258945Sroberto	/*
3121258945Sroberto	 * Move each buffer from the passed in list to our internal one.
3122258945Sroberto	 */
3123258945Sroberto	buffer = ISC_LIST_HEAD(*buflist);
3124258945Sroberto	while (buffer != NULL) {
3125258945Sroberto		ISC_LIST_DEQUEUE(*buflist, buffer, link);
3126258945Sroberto		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
3127258945Sroberto		buffer = ISC_LIST_HEAD(*buflist);
3128258945Sroberto	}
3129258945Sroberto
3130258945Sroberto	ret = socket_send(sock, dev, task, address, pktinfo, 0);
3131258945Sroberto	UNLOCK(&sock->lock);
3132258945Sroberto	return (ret);
3133258945Sroberto}
3134258945Sroberto
3135258945Srobertoisc_result_t
3136280849Scyisc__socket_sendto2(isc_socket_t *sock, isc_region_t *region,
3137280849Scy		    isc_task_t *task,
3138280849Scy		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
3139280849Scy		    isc_socketevent_t *event, unsigned int flags)
3140258945Sroberto{
3141258945Sroberto	isc_result_t ret;
3142258945Sroberto
3143258945Sroberto	REQUIRE(VALID_SOCKET(sock));
3144258945Sroberto	LOCK(&sock->lock);
3145258945Sroberto	CONSISTENT(sock);
3146258945Sroberto
3147258945Sroberto	REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
3148258945Sroberto	if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
3149258945Sroberto		REQUIRE(sock->type == isc_sockettype_udp);
3150258945Sroberto	event->ev_sender = sock;
3151258945Sroberto	event->result = ISC_R_UNEXPECTED;
3152258945Sroberto	/*
3153258945Sroberto	 * make sure that the socket's not closed
3154258945Sroberto	 */
3155258945Sroberto	if (sock->fd == INVALID_SOCKET) {
3156258945Sroberto		UNLOCK(&sock->lock);
3157258945Sroberto		return (ISC_R_CONNREFUSED);
3158258945Sroberto	}
3159258945Sroberto	ISC_LIST_INIT(event->bufferlist);
3160258945Sroberto	event->region = *region;
3161258945Sroberto	event->n = 0;
3162258945Sroberto	event->offset = 0;
3163258945Sroberto	event->attributes = 0;
3164258945Sroberto
3165258945Sroberto	ret = socket_send(sock, event, task, address, pktinfo, flags);
3166258945Sroberto	UNLOCK(&sock->lock);
3167258945Sroberto	return (ret);
3168258945Sroberto}
3169258945Sroberto
3170258945Srobertoisc_result_t
3171280849Scyisc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
3172280849Scy		 unsigned int options) {
3173258945Sroberto	int bind_errno;
3174258945Sroberto	char strbuf[ISC_STRERRORSIZE];
3175258945Sroberto	int on = 1;
3176258945Sroberto
3177258945Sroberto	REQUIRE(VALID_SOCKET(sock));
3178258945Sroberto	LOCK(&sock->lock);
3179258945Sroberto	CONSISTENT(sock);
3180258945Sroberto
3181258945Sroberto	/*
3182258945Sroberto	 * make sure that the socket's not closed
3183258945Sroberto	 */
3184258945Sroberto	if (sock->fd == INVALID_SOCKET) {
3185258945Sroberto		UNLOCK(&sock->lock);
3186258945Sroberto		return (ISC_R_CONNREFUSED);
3187258945Sroberto	}
3188258945Sroberto
3189258945Sroberto	INSIST(!sock->bound);
3190280849Scy	INSIST(!sock->dupped);
3191258945Sroberto
3192258945Sroberto	if (sock->pf != sockaddr->type.sa.sa_family) {
3193258945Sroberto		UNLOCK(&sock->lock);
3194258945Sroberto		return (ISC_R_FAMILYMISMATCH);
3195258945Sroberto	}
3196258945Sroberto	/*
3197258945Sroberto	 * Only set SO_REUSEADDR when we want a specific port.
3198258945Sroberto	 */
3199258945Sroberto	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
3200258945Sroberto	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
3201280849Scy	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
3202258945Sroberto		       sizeof(on)) < 0) {
3203258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__,
3204258945Sroberto				 "setsockopt(%d) %s", sock->fd,
3205258945Sroberto				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3206258945Sroberto						ISC_MSG_FAILED, "failed"));
3207258945Sroberto		/* Press on... */
3208258945Sroberto	}
3209258945Sroberto	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
3210258945Sroberto		bind_errno = WSAGetLastError();
3211258945Sroberto		UNLOCK(&sock->lock);
3212258945Sroberto		switch (bind_errno) {
3213258945Sroberto		case WSAEACCES:
3214258945Sroberto			return (ISC_R_NOPERM);
3215258945Sroberto		case WSAEADDRNOTAVAIL:
3216258945Sroberto			return (ISC_R_ADDRNOTAVAIL);
3217258945Sroberto		case WSAEADDRINUSE:
3218258945Sroberto			return (ISC_R_ADDRINUSE);
3219258945Sroberto		case WSAEINVAL:
3220258945Sroberto			return (ISC_R_BOUND);
3221258945Sroberto		default:
3222258945Sroberto			isc__strerror(bind_errno, strbuf, sizeof(strbuf));
3223258945Sroberto			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
3224258945Sroberto					 strbuf);
3225258945Sroberto			return (ISC_R_UNEXPECTED);
3226258945Sroberto		}
3227258945Sroberto	}
3228258945Sroberto
3229258945Sroberto	socket_log(__LINE__, sock, sockaddr, TRACE,
3230258945Sroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
3231258945Sroberto	sock->bound = 1;
3232258945Sroberto
3233258945Sroberto	UNLOCK(&sock->lock);
3234258945Sroberto	return (ISC_R_SUCCESS);
3235258945Sroberto}
3236258945Sroberto
3237258945Srobertoisc_result_t
3238280849Scyisc__socket_filter(isc_socket_t *sock, const char *filter) {
3239258945Sroberto	UNUSED(sock);
3240258945Sroberto	UNUSED(filter);
3241258945Sroberto
3242258945Sroberto	REQUIRE(VALID_SOCKET(sock));
3243258945Sroberto	return (ISC_R_NOTIMPLEMENTED);
3244258945Sroberto}
3245258945Sroberto
3246258945Sroberto/*
3247258945Sroberto * Set up to listen on a given socket.  We do this by creating an internal
3248258945Sroberto * event that will be dispatched when the socket has read activity.  The
3249258945Sroberto * watcher will send the internal event to the task when there is a new
3250258945Sroberto * connection.
3251258945Sroberto *
3252258945Sroberto * Unlike in read, we don't preallocate a done event here.  Every time there
3253258945Sroberto * is a new connection we'll have to allocate a new one anyway, so we might
3254258945Sroberto * as well keep things simple rather than having to track them.
3255258945Sroberto */
3256258945Srobertoisc_result_t
3257280849Scyisc__socket_listen(isc_socket_t *sock, unsigned int backlog) {
3258258945Sroberto	char strbuf[ISC_STRERRORSIZE];
3259258945Sroberto
3260258945Sroberto	REQUIRE(VALID_SOCKET(sock));
3261258945Sroberto
3262258945Sroberto	LOCK(&sock->lock);
3263258945Sroberto	CONSISTENT(sock);
3264258945Sroberto
3265258945Sroberto	/*
3266258945Sroberto	 * make sure that the socket's not closed
3267258945Sroberto	 */
3268258945Sroberto	if (sock->fd == INVALID_SOCKET) {
3269258945Sroberto		UNLOCK(&sock->lock);
3270258945Sroberto		return (ISC_R_CONNREFUSED);
3271258945Sroberto	}
3272258945Sroberto
3273258945Sroberto	REQUIRE(!sock->listener);
3274258945Sroberto	REQUIRE(sock->bound);
3275258945Sroberto	REQUIRE(sock->type == isc_sockettype_tcp);
3276258945Sroberto
3277258945Sroberto	if (backlog == 0)
3278258945Sroberto		backlog = SOMAXCONN;
3279258945Sroberto
3280258945Sroberto	if (listen(sock->fd, (int)backlog) < 0) {
3281258945Sroberto		UNLOCK(&sock->lock);
3282258945Sroberto		isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
3283258945Sroberto
3284258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
3285258945Sroberto
3286258945Sroberto		return (ISC_R_UNEXPECTED);
3287258945Sroberto	}
3288258945Sroberto
3289258945Sroberto	socket_log(__LINE__, sock, NULL, TRACE,
3290258945Sroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "listening");
3291258945Sroberto	sock->listener = 1;
3292258945Sroberto	_set_state(sock, SOCK_LISTEN);
3293258945Sroberto
3294258945Sroberto	UNLOCK(&sock->lock);
3295258945Sroberto	return (ISC_R_SUCCESS);
3296258945Sroberto}
3297258945Sroberto
3298258945Sroberto/*
3299258945Sroberto * This should try to do aggressive accept() XXXMLG
3300258945Sroberto */
3301258945Srobertoisc_result_t
3302280849Scyisc__socket_accept(isc_socket_t *sock,
3303280849Scy		   isc_task_t *task, isc_taskaction_t action, const void *arg)
3304258945Sroberto{
3305258945Sroberto	isc_socket_newconnev_t *adev;
3306258945Sroberto	isc_socketmgr_t *manager;
3307258945Sroberto	isc_task_t *ntask = NULL;
3308258945Sroberto	isc_socket_t *nsock;
3309258945Sroberto	isc_result_t result;
3310258945Sroberto	IoCompletionInfo *lpo;
3311258945Sroberto
3312258945Sroberto	REQUIRE(VALID_SOCKET(sock));
3313258945Sroberto
3314258945Sroberto	manager = sock->manager;
3315258945Sroberto	REQUIRE(VALID_MANAGER(manager));
3316258945Sroberto
3317258945Sroberto	LOCK(&sock->lock);
3318258945Sroberto	CONSISTENT(sock);
3319258945Sroberto
3320258945Sroberto	/*
3321258945Sroberto	 * make sure that the socket's not closed
3322258945Sroberto	 */
3323258945Sroberto	if (sock->fd == INVALID_SOCKET) {
3324258945Sroberto		UNLOCK(&sock->lock);
3325258945Sroberto		return (ISC_R_CONNREFUSED);
3326258945Sroberto	}
3327258945Sroberto
3328258945Sroberto	REQUIRE(sock->listener);
3329258945Sroberto
3330258945Sroberto	/*
3331258945Sroberto	 * Sender field is overloaded here with the task we will be sending
3332258945Sroberto	 * this event to.  Just before the actual event is delivered the
3333258945Sroberto	 * actual ev_sender will be touched up to be the socket.
3334258945Sroberto	 */
3335258945Sroberto	adev = (isc_socket_newconnev_t *)
3336258945Sroberto		isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
3337258945Sroberto				   action, arg, sizeof(*adev));
3338258945Sroberto	if (adev == NULL) {
3339258945Sroberto		UNLOCK(&sock->lock);
3340258945Sroberto		return (ISC_R_NOMEMORY);
3341258945Sroberto	}
3342258945Sroberto	ISC_LINK_INIT(adev, ev_link);
3343258945Sroberto
3344258945Sroberto	result = allocate_socket(manager, sock->type, &nsock);
3345258945Sroberto	if (result != ISC_R_SUCCESS) {
3346258945Sroberto		isc_event_free((isc_event_t **)&adev);
3347258945Sroberto		UNLOCK(&sock->lock);
3348258945Sroberto		return (result);
3349258945Sroberto	}
3350258945Sroberto
3351258945Sroberto	/*
3352258945Sroberto	 * AcceptEx() requires we pass in a socket.
3353258945Sroberto	 */
3354258945Sroberto	nsock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
3355258945Sroberto	if (nsock->fd == INVALID_SOCKET) {
3356258945Sroberto		free_socket(&nsock, __LINE__);
3357258945Sroberto		isc_event_free((isc_event_t **)&adev);
3358258945Sroberto		UNLOCK(&sock->lock);
3359258945Sroberto		return (ISC_R_FAILURE); // XXXMLG need real error message
3360258945Sroberto	}
3361258945Sroberto
3362258945Sroberto	/*
3363258945Sroberto	 * Attach to socket and to task.
3364258945Sroberto	 */
3365258945Sroberto	isc_task_attach(task, &ntask);
3366280849Scy	if (isc_task_exiting(ntask)) {
3367280849Scy		free_socket(&nsock, __LINE__);
3368280849Scy		isc_task_detach(&ntask);
3369280849Scy		isc_event_free(ISC_EVENT_PTR(&adev));
3370280849Scy		UNLOCK(&sock->lock);
3371280849Scy		return (ISC_R_SHUTTINGDOWN);
3372280849Scy	}
3373258945Sroberto	nsock->references++;
3374258945Sroberto
3375258945Sroberto	adev->ev_sender = ntask;
3376258945Sroberto	adev->newsocket = nsock;
3377258945Sroberto	_set_state(nsock, SOCK_ACCEPT);
3378258945Sroberto
3379258945Sroberto	/*
3380258945Sroberto	 * Queue io completion for an accept().
3381258945Sroberto	 */
3382258945Sroberto	lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
3383258945Sroberto					    HEAP_ZERO_MEMORY,
3384258945Sroberto					    sizeof(IoCompletionInfo));
3385258945Sroberto	RUNTIME_CHECK(lpo != NULL);
3386258945Sroberto	lpo->acceptbuffer = (void *)HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY,
3387258945Sroberto		(sizeof(SOCKADDR_STORAGE) + 16) * 2);
3388258945Sroberto	RUNTIME_CHECK(lpo->acceptbuffer != NULL);
3389258945Sroberto
3390258945Sroberto	lpo->adev = adev;
3391258945Sroberto	lpo->request_type = SOCKET_ACCEPT;
3392258945Sroberto
3393258945Sroberto	ISCAcceptEx(sock->fd,
3394258945Sroberto		    nsock->fd,				/* Accepted Socket */
3395258945Sroberto		    lpo->acceptbuffer,			/* Buffer for initial Recv */
3396258945Sroberto		    0,					/* Length of Buffer */
3397258945Sroberto		    sizeof(SOCKADDR_STORAGE) + 16,		/* Local address length + 16 */
3398258945Sroberto		    sizeof(SOCKADDR_STORAGE) + 16,		/* Remote address lengh + 16 */
3399258945Sroberto		    (LPDWORD)&lpo->received_bytes,	/* Bytes Recved */
3400258945Sroberto		    (LPOVERLAPPED)lpo			/* Overlapped structure */
3401258945Sroberto		    );
3402258945Sroberto	iocompletionport_update(nsock);
3403258945Sroberto
3404258945Sroberto	socket_log(__LINE__, sock, NULL, TRACE,
3405258945Sroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND,
3406258945Sroberto		   "accepting for nsock %p fd %d", nsock, nsock->fd);
3407258945Sroberto
3408258945Sroberto	/*
3409258945Sroberto	 * Enqueue the event
3410258945Sroberto	 */
3411258945Sroberto	ISC_LIST_ENQUEUE(sock->accept_list, adev, ev_link);
3412258945Sroberto	sock->pending_accept++;
3413258945Sroberto	sock->pending_iocp++;
3414258945Sroberto
3415258945Sroberto	UNLOCK(&sock->lock);
3416258945Sroberto	return (ISC_R_SUCCESS);
3417258945Sroberto}
3418258945Sroberto
3419258945Srobertoisc_result_t
3420280849Scyisc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
3421280849Scy		    isc_task_t *task, isc_taskaction_t action, const void *arg)
3422258945Sroberto{
3423258945Sroberto	char strbuf[ISC_STRERRORSIZE];
3424258945Sroberto	isc_socket_connev_t *cdev;
3425258945Sroberto	isc_task_t *ntask = NULL;
3426258945Sroberto	isc_socketmgr_t *manager;
3427258945Sroberto	IoCompletionInfo *lpo;
3428258945Sroberto	int bind_errno;
3429258945Sroberto
3430258945Sroberto	REQUIRE(VALID_SOCKET(sock));
3431258945Sroberto	REQUIRE(addr != NULL);
3432258945Sroberto	REQUIRE(task != NULL);
3433258945Sroberto	REQUIRE(action != NULL);
3434258945Sroberto
3435258945Sroberto	manager = sock->manager;
3436258945Sroberto	REQUIRE(VALID_MANAGER(manager));
3437258945Sroberto	REQUIRE(addr != NULL);
3438258945Sroberto
3439258945Sroberto	if (isc_sockaddr_ismulticast(addr))
3440258945Sroberto		return (ISC_R_MULTICAST);
3441258945Sroberto
3442258945Sroberto	LOCK(&sock->lock);
3443258945Sroberto	CONSISTENT(sock);
3444258945Sroberto
3445258945Sroberto	/*
3446258945Sroberto	 * make sure that the socket's not closed
3447258945Sroberto	 */
3448258945Sroberto	if (sock->fd == INVALID_SOCKET) {
3449258945Sroberto		UNLOCK(&sock->lock);
3450258945Sroberto		return (ISC_R_CONNREFUSED);
3451258945Sroberto	}
3452258945Sroberto
3453258945Sroberto	/*
3454258945Sroberto	 * Windows sockets won't connect unless the socket is bound.
3455258945Sroberto	 */
3456258945Sroberto	if (!sock->bound) {
3457258945Sroberto		isc_sockaddr_t any;
3458258945Sroberto
3459258945Sroberto		isc_sockaddr_anyofpf(&any, isc_sockaddr_pf(addr));
3460258945Sroberto		if (bind(sock->fd, &any.type.sa, any.length) < 0) {
3461258945Sroberto			bind_errno = WSAGetLastError();
3462258945Sroberto			UNLOCK(&sock->lock);
3463258945Sroberto			switch (bind_errno) {
3464258945Sroberto			case WSAEACCES:
3465258945Sroberto				return (ISC_R_NOPERM);
3466258945Sroberto			case WSAEADDRNOTAVAIL:
3467258945Sroberto				return (ISC_R_ADDRNOTAVAIL);
3468258945Sroberto			case WSAEADDRINUSE:
3469258945Sroberto				return (ISC_R_ADDRINUSE);
3470258945Sroberto			case WSAEINVAL:
3471258945Sroberto				return (ISC_R_BOUND);
3472258945Sroberto			default:
3473258945Sroberto				isc__strerror(bind_errno, strbuf,
3474258945Sroberto					      sizeof(strbuf));
3475258945Sroberto				UNEXPECTED_ERROR(__FILE__, __LINE__,
3476258945Sroberto						 "bind: %s", strbuf);
3477258945Sroberto				return (ISC_R_UNEXPECTED);
3478258945Sroberto			}
3479258945Sroberto		}
3480258945Sroberto		sock->bound = 1;
3481258945Sroberto	}
3482258945Sroberto
3483258945Sroberto	REQUIRE(!sock->pending_connect);
3484258945Sroberto
3485258945Sroberto	cdev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
3486258945Sroberto							ISC_SOCKEVENT_CONNECT,
3487258945Sroberto							action,	arg,
3488258945Sroberto							sizeof(*cdev));
3489258945Sroberto	if (cdev == NULL) {
3490258945Sroberto		UNLOCK(&sock->lock);
3491258945Sroberto		return (ISC_R_NOMEMORY);
3492258945Sroberto	}
3493258945Sroberto	ISC_LINK_INIT(cdev, ev_link);
3494258945Sroberto
3495258945Sroberto	if (sock->type == isc_sockettype_tcp) {
3496258945Sroberto		/*
3497258945Sroberto		 * Queue io completion for an accept().
3498258945Sroberto		 */
3499258945Sroberto		lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
3500258945Sroberto						    HEAP_ZERO_MEMORY,
3501258945Sroberto						    sizeof(IoCompletionInfo));
3502258945Sroberto		lpo->cdev = cdev;
3503258945Sroberto		lpo->request_type = SOCKET_CONNECT;
3504258945Sroberto
3505258945Sroberto		sock->address = *addr;
3506258945Sroberto		ISCConnectEx(sock->fd, &addr->type.sa, addr->length,
3507258945Sroberto			NULL, 0, NULL, (LPOVERLAPPED)lpo);
3508258945Sroberto
3509258945Sroberto		/*
3510258945Sroberto		 * Attach to task.
3511258945Sroberto		 */
3512258945Sroberto		isc_task_attach(task, &ntask);
3513258945Sroberto		cdev->ev_sender = ntask;
3514258945Sroberto
3515258945Sroberto		sock->pending_connect = 1;
3516258945Sroberto		_set_state(sock, SOCK_CONNECT);
3517258945Sroberto
3518258945Sroberto		/*
3519258945Sroberto		 * Enqueue the request.
3520258945Sroberto		 */
3521258945Sroberto		sock->connect_ev = cdev;
3522258945Sroberto		sock->pending_iocp++;
3523258945Sroberto	} else {
3524258945Sroberto		WSAConnect(sock->fd, &addr->type.sa, addr->length, NULL, NULL, NULL, NULL);
3525258945Sroberto		cdev->result = ISC_R_SUCCESS;
3526258945Sroberto		isc_task_send(task, (isc_event_t **)&cdev);
3527258945Sroberto	}
3528258945Sroberto	CONSISTENT(sock);
3529258945Sroberto	UNLOCK(&sock->lock);
3530258945Sroberto
3531258945Sroberto	return (ISC_R_SUCCESS);
3532258945Sroberto}
3533258945Sroberto
3534258945Srobertoisc_result_t
3535280849Scyisc__socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3536258945Sroberto	isc_result_t result;
3537258945Sroberto
3538258945Sroberto	REQUIRE(VALID_SOCKET(sock));
3539258945Sroberto	REQUIRE(addressp != NULL);
3540258945Sroberto
3541258945Sroberto	LOCK(&sock->lock);
3542258945Sroberto	CONSISTENT(sock);
3543258945Sroberto
3544258945Sroberto	/*
3545258945Sroberto	 * make sure that the socket's not closed
3546258945Sroberto	 */
3547258945Sroberto	if (sock->fd == INVALID_SOCKET) {
3548258945Sroberto		UNLOCK(&sock->lock);
3549258945Sroberto		return (ISC_R_CONNREFUSED);
3550258945Sroberto	}
3551258945Sroberto
3552258945Sroberto	if (sock->connected) {
3553258945Sroberto		*addressp = sock->address;
3554258945Sroberto		result = ISC_R_SUCCESS;
3555258945Sroberto	} else {
3556258945Sroberto		result = ISC_R_NOTCONNECTED;
3557258945Sroberto	}
3558258945Sroberto
3559258945Sroberto	UNLOCK(&sock->lock);
3560258945Sroberto
3561258945Sroberto	return (result);
3562258945Sroberto}
3563258945Sroberto
3564258945Srobertoisc_result_t
3565280849Scyisc__socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3566258945Sroberto	ISC_SOCKADDR_LEN_T len;
3567258945Sroberto	isc_result_t result;
3568258945Sroberto	char strbuf[ISC_STRERRORSIZE];
3569258945Sroberto
3570258945Sroberto	REQUIRE(VALID_SOCKET(sock));
3571258945Sroberto	REQUIRE(addressp != NULL);
3572258945Sroberto
3573258945Sroberto	LOCK(&sock->lock);
3574258945Sroberto	CONSISTENT(sock);
3575258945Sroberto
3576258945Sroberto	/*
3577258945Sroberto	 * make sure that the socket's not closed
3578258945Sroberto	 */
3579258945Sroberto	if (sock->fd == INVALID_SOCKET) {
3580258945Sroberto		UNLOCK(&sock->lock);
3581258945Sroberto		return (ISC_R_CONNREFUSED);
3582258945Sroberto	}
3583258945Sroberto
3584258945Sroberto	if (!sock->bound) {
3585258945Sroberto		result = ISC_R_NOTBOUND;
3586258945Sroberto		goto out;
3587258945Sroberto	}
3588258945Sroberto
3589258945Sroberto	result = ISC_R_SUCCESS;
3590258945Sroberto
3591258945Sroberto	len = sizeof(addressp->type);
3592258945Sroberto	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
3593258945Sroberto		isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
3594258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
3595258945Sroberto				 strbuf);
3596258945Sroberto		result = ISC_R_UNEXPECTED;
3597258945Sroberto		goto out;
3598258945Sroberto	}
3599258945Sroberto	addressp->length = (unsigned int)len;
3600258945Sroberto
3601258945Sroberto out:
3602258945Sroberto	UNLOCK(&sock->lock);
3603258945Sroberto
3604258945Sroberto	return (result);
3605258945Sroberto}
3606258945Sroberto
3607258945Sroberto/*
3608258945Sroberto * Run through the list of events on this socket, and cancel the ones
3609258945Sroberto * queued for task "task" of type "how".  "how" is a bitmask.
3610258945Sroberto */
3611258945Srobertovoid
3612280849Scyisc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
3613258945Sroberto
3614258945Sroberto	REQUIRE(VALID_SOCKET(sock));
3615258945Sroberto
3616258945Sroberto	/*
3617258945Sroberto	 * Quick exit if there is nothing to do.  Don't even bother locking
3618258945Sroberto	 * in this case.
3619258945Sroberto	 */
3620258945Sroberto	if (how == 0)
3621258945Sroberto		return;
3622258945Sroberto
3623258945Sroberto	LOCK(&sock->lock);
3624258945Sroberto	CONSISTENT(sock);
3625258945Sroberto
3626258945Sroberto	/*
3627258945Sroberto	 * make sure that the socket's not closed
3628258945Sroberto	 */
3629258945Sroberto	if (sock->fd == INVALID_SOCKET) {
3630258945Sroberto		UNLOCK(&sock->lock);
3631258945Sroberto		return;
3632258945Sroberto	}
3633258945Sroberto
3634258945Sroberto	/*
3635258945Sroberto	 * All of these do the same thing, more or less.
3636258945Sroberto	 * Each will:
3637258945Sroberto	 *	o If the internal event is marked as "posted" try to
3638258945Sroberto	 *	  remove it from the task's queue.  If this fails, mark it
3639258945Sroberto	 *	  as canceled instead, and let the task clean it up later.
3640258945Sroberto	 *	o For each I/O request for that task of that type, post
3641258945Sroberto	 *	  its done event with status of "ISC_R_CANCELED".
3642258945Sroberto	 *	o Reset any state needed.
3643258945Sroberto	 */
3644258945Sroberto
3645258945Sroberto	if ((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV) {
3646258945Sroberto		isc_socketevent_t      *dev;
3647258945Sroberto		isc_socketevent_t      *next;
3648258945Sroberto		isc_task_t	       *current_task;
3649258945Sroberto
3650258945Sroberto		dev = ISC_LIST_HEAD(sock->recv_list);
3651258945Sroberto		while (dev != NULL) {
3652258945Sroberto			current_task = dev->ev_sender;
3653258945Sroberto			next = ISC_LIST_NEXT(dev, ev_link);
3654258945Sroberto			if ((task == NULL) || (task == current_task)) {
3655258945Sroberto				dev->result = ISC_R_CANCELED;
3656258945Sroberto				send_recvdone_event(sock, &dev);
3657258945Sroberto			}
3658258945Sroberto			dev = next;
3659258945Sroberto		}
3660258945Sroberto	}
3661258945Sroberto	how &= ~ISC_SOCKCANCEL_RECV;
3662258945Sroberto
3663258945Sroberto	if ((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND) {
3664258945Sroberto		isc_socketevent_t      *dev;
3665258945Sroberto		isc_socketevent_t      *next;
3666258945Sroberto		isc_task_t	       *current_task;
3667258945Sroberto
3668258945Sroberto		dev = ISC_LIST_HEAD(sock->send_list);
3669258945Sroberto
3670258945Sroberto		while (dev != NULL) {
3671258945Sroberto			current_task = dev->ev_sender;
3672258945Sroberto			next = ISC_LIST_NEXT(dev, ev_link);
3673258945Sroberto			if ((task == NULL) || (task == current_task)) {
3674258945Sroberto				dev->result = ISC_R_CANCELED;
3675258945Sroberto				send_senddone_event(sock, &dev);
3676258945Sroberto			}
3677258945Sroberto			dev = next;
3678258945Sroberto		}
3679258945Sroberto	}
3680258945Sroberto	how &= ~ISC_SOCKCANCEL_SEND;
3681258945Sroberto
3682258945Sroberto	if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
3683258945Sroberto	    && !ISC_LIST_EMPTY(sock->accept_list)) {
3684258945Sroberto		isc_socket_newconnev_t *dev;
3685258945Sroberto		isc_socket_newconnev_t *next;
3686258945Sroberto		isc_task_t	       *current_task;
3687258945Sroberto
3688258945Sroberto		dev = ISC_LIST_HEAD(sock->accept_list);
3689258945Sroberto		while (dev != NULL) {
3690258945Sroberto			current_task = dev->ev_sender;
3691258945Sroberto			next = ISC_LIST_NEXT(dev, ev_link);
3692258945Sroberto
3693258945Sroberto			if ((task == NULL) || (task == current_task)) {
3694258945Sroberto
3695258945Sroberto				dev->newsocket->references--;
3696258945Sroberto				closesocket(dev->newsocket->fd);
3697258945Sroberto				dev->newsocket->fd = INVALID_SOCKET;
3698258945Sroberto				free_socket(&dev->newsocket, __LINE__);
3699258945Sroberto
3700258945Sroberto				dev->result = ISC_R_CANCELED;
3701258945Sroberto				send_acceptdone_event(sock, &dev);
3702258945Sroberto			}
3703258945Sroberto
3704258945Sroberto			dev = next;
3705258945Sroberto		}
3706258945Sroberto	}
3707258945Sroberto	how &= ~ISC_SOCKCANCEL_ACCEPT;
3708258945Sroberto
3709258945Sroberto	/*
3710258945Sroberto	 * Connecting is not a list.
3711258945Sroberto	 */
3712258945Sroberto	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
3713258945Sroberto	    && sock->connect_ev != NULL) {
3714258945Sroberto		isc_socket_connev_t    *dev;
3715258945Sroberto		isc_task_t	       *current_task;
3716258945Sroberto
3717258945Sroberto		INSIST(sock->pending_connect);
3718258945Sroberto
3719258945Sroberto		dev = sock->connect_ev;
3720258945Sroberto		current_task = dev->ev_sender;
3721258945Sroberto
3722258945Sroberto		if ((task == NULL) || (task == current_task)) {
3723258945Sroberto			closesocket(sock->fd);
3724258945Sroberto			sock->fd = INVALID_SOCKET;
3725258945Sroberto			_set_state(sock, SOCK_CLOSED);
3726258945Sroberto
3727258945Sroberto			sock->connect_ev = NULL;
3728258945Sroberto			dev->result = ISC_R_CANCELED;
3729258945Sroberto			send_connectdone_event(sock, &dev);
3730258945Sroberto		}
3731258945Sroberto	}
3732258945Sroberto	how &= ~ISC_SOCKCANCEL_CONNECT;
3733258945Sroberto
3734258945Sroberto	maybe_free_socket(&sock, __LINE__);
3735258945Sroberto}
3736258945Sroberto
3737258945Srobertoisc_sockettype_t
3738280849Scyisc__socket_gettype(isc_socket_t *sock) {
3739258945Sroberto	isc_sockettype_t type;
3740258945Sroberto
3741258945Sroberto	REQUIRE(VALID_SOCKET(sock));
3742258945Sroberto
3743258945Sroberto	LOCK(&sock->lock);
3744258945Sroberto
3745258945Sroberto	/*
3746258945Sroberto	 * make sure that the socket's not closed
3747258945Sroberto	 */
3748258945Sroberto	if (sock->fd == INVALID_SOCKET) {
3749258945Sroberto		UNLOCK(&sock->lock);
3750258945Sroberto		return (ISC_R_CONNREFUSED);
3751258945Sroberto	}
3752258945Sroberto
3753258945Sroberto	type = sock->type;
3754258945Sroberto	UNLOCK(&sock->lock);
3755258945Sroberto	return (type);
3756258945Sroberto}
3757258945Sroberto
3758258945Srobertoisc_boolean_t
3759280849Scyisc__socket_isbound(isc_socket_t *sock) {
3760258945Sroberto	isc_boolean_t val;
3761258945Sroberto
3762258945Sroberto	REQUIRE(VALID_SOCKET(sock));
3763258945Sroberto
3764258945Sroberto	LOCK(&sock->lock);
3765258945Sroberto	CONSISTENT(sock);
3766258945Sroberto
3767258945Sroberto	/*
3768258945Sroberto	 * make sure that the socket's not closed
3769258945Sroberto	 */
3770258945Sroberto	if (sock->fd == INVALID_SOCKET) {
3771258945Sroberto		UNLOCK(&sock->lock);
3772258945Sroberto		return (ISC_FALSE);
3773258945Sroberto	}
3774258945Sroberto
3775258945Sroberto	val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
3776258945Sroberto	UNLOCK(&sock->lock);
3777258945Sroberto
3778258945Sroberto	return (val);
3779258945Sroberto}
3780258945Sroberto
3781258945Srobertovoid
3782280849Scyisc__socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
3783258945Sroberto#if defined(IPV6_V6ONLY)
3784258945Sroberto	int onoff = yes ? 1 : 0;
3785258945Sroberto#else
3786258945Sroberto	UNUSED(yes);
3787258945Sroberto#endif
3788258945Sroberto
3789258945Sroberto	REQUIRE(VALID_SOCKET(sock));
3790258945Sroberto
3791258945Sroberto#ifdef IPV6_V6ONLY
3792258945Sroberto	if (sock->pf == AF_INET6) {
3793258945Sroberto		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
3794280849Scy				 (char *)&onoff, sizeof(onoff));
3795258945Sroberto	}
3796258945Sroberto#endif
3797258945Sroberto}
3798258945Sroberto
3799258945Srobertovoid
3800280849Scyisc__socket_cleanunix(isc_sockaddr_t *addr, isc_boolean_t active) {
3801258945Sroberto	UNUSED(addr);
3802258945Sroberto	UNUSED(active);
3803258945Sroberto}
3804258945Sroberto
3805258945Srobertoisc_result_t
3806280849Scyisc__socket_permunix(isc_sockaddr_t *addr, isc_uint32_t perm,
3807280849Scy		     isc_uint32_t owner,	isc_uint32_t group)
3808258945Sroberto{
3809258945Sroberto	UNUSED(addr);
3810258945Sroberto	UNUSED(perm);
3811258945Sroberto	UNUSED(owner);
3812258945Sroberto	UNUSED(group);
3813258945Sroberto	return (ISC_R_NOTIMPLEMENTED);
3814258945Sroberto}
3815258945Sroberto
3816258945Srobertovoid
3817280849Scyisc__socket_setname(isc_socket_t *socket, const char *name, void *tag) {
3818258945Sroberto
3819258945Sroberto	/*
3820258945Sroberto	 * Name 'socket'.
3821258945Sroberto	 */
3822258945Sroberto
3823258945Sroberto	REQUIRE(VALID_SOCKET(socket));
3824258945Sroberto
3825258945Sroberto	LOCK(&socket->lock);
3826258945Sroberto	memset(socket->name, 0, sizeof(socket->name));
3827258945Sroberto	strncpy(socket->name, name, sizeof(socket->name) - 1);
3828258945Sroberto	socket->tag = tag;
3829258945Sroberto	UNLOCK(&socket->lock);
3830258945Sroberto}
3831258945Sroberto
3832258945Srobertoconst char *
3833280849Scyisc__socket_getname(isc_socket_t *socket) {
3834258945Sroberto	return (socket->name);
3835258945Sroberto}
3836258945Sroberto
3837258945Srobertovoid *
3838280849Scyisc__socket_gettag(isc_socket_t *socket) {
3839258945Sroberto	return (socket->tag);
3840258945Sroberto}
3841258945Sroberto
3842280849Scyint
3843280849Scyisc__socket_getfd(isc_socket_t *socket) {
3844280849Scy	return ((short) socket->fd);
3845280849Scy}
3846280849Scy
3847258945Srobertovoid
3848258945Srobertoisc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) {
3849258945Sroberto	UNUSED(manager);
3850258945Sroberto	UNUSED(reserved);
3851258945Sroberto}
3852280849Scy
3853280849Scyvoid
3854280849Scyisc___socketmgr_maxudp(isc_socketmgr_t *manager, int maxudp) {
3855280849Scy
3856280849Scy	UNUSED(manager);
3857280849Scy	UNUSED(maxudp);
3858280849Scy}
3859280849Scy
3860280849Scy#ifdef HAVE_LIBXML2
3861280849Scy
3862280849Scystatic const char *
3863280849Scy_socktype(isc_sockettype_t type)
3864280849Scy{
3865280849Scy	if (type == isc_sockettype_udp)
3866280849Scy		return ("udp");
3867280849Scy	else if (type == isc_sockettype_tcp)
3868280849Scy		return ("tcp");
3869280849Scy	else if (type == isc_sockettype_unix)
3870280849Scy		return ("unix");
3871280849Scy	else if (type == isc_sockettype_fdwatch)
3872280849Scy		return ("fdwatch");
3873280849Scy	else
3874280849Scy		return ("not-initialized");
3875280849Scy}
3876280849Scy
3877280849Scyvoid
3878280849Scyisc_socketmgr_renderxml(isc_socketmgr_t *mgr, xmlTextWriterPtr writer)
3879280849Scy{
3880280849Scy	isc_socket_t *sock;
3881280849Scy	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
3882280849Scy	isc_sockaddr_t addr;
3883280849Scy	ISC_SOCKADDR_LEN_T len;
3884280849Scy
3885280849Scy	LOCK(&mgr->lock);
3886280849Scy
3887280849Scy#ifndef ISC_PLATFORM_USETHREADS
3888280849Scy	xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
3889280849Scy	xmlTextWriterWriteFormatString(writer, "%d", mgr->refs);
3890280849Scy	xmlTextWriterEndElement(writer);
3891280849Scy#endif
3892280849Scy
3893280849Scy	xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets");
3894280849Scy	sock = ISC_LIST_HEAD(mgr->socklist);
3895280849Scy	while (sock != NULL) {
3896280849Scy		LOCK(&sock->lock);
3897280849Scy		xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket");
3898280849Scy
3899280849Scy		xmlTextWriterStartElement(writer, ISC_XMLCHAR "id");
3900280849Scy		xmlTextWriterWriteFormatString(writer, "%p", sock);
3901280849Scy		xmlTextWriterEndElement(writer);
3902280849Scy
3903280849Scy		if (sock->name[0] != 0) {
3904280849Scy			xmlTextWriterStartElement(writer, ISC_XMLCHAR "name");
3905280849Scy			xmlTextWriterWriteFormatString(writer, "%s",
3906280849Scy						       sock->name);
3907280849Scy			xmlTextWriterEndElement(writer); /* name */
3908280849Scy		}
3909280849Scy
3910280849Scy		xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
3911280849Scy		xmlTextWriterWriteFormatString(writer, "%d", sock->references);
3912280849Scy		xmlTextWriterEndElement(writer);
3913280849Scy
3914280849Scy		xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type",
3915280849Scy					  ISC_XMLCHAR _socktype(sock->type));
3916280849Scy
3917280849Scy		if (sock->connected) {
3918280849Scy			isc_sockaddr_format(&sock->address, peerbuf,
3919280849Scy					    sizeof(peerbuf));
3920280849Scy			xmlTextWriterWriteElement(writer,
3921280849Scy						  ISC_XMLCHAR "peer-address",
3922280849Scy						  ISC_XMLCHAR peerbuf);
3923280849Scy		}
3924280849Scy
3925280849Scy		len = sizeof(addr);
3926280849Scy		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
3927280849Scy			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
3928280849Scy			xmlTextWriterWriteElement(writer,
3929280849Scy						  ISC_XMLCHAR "local-address",
3930280849Scy						  ISC_XMLCHAR peerbuf);
3931280849Scy		}
3932280849Scy
3933280849Scy		xmlTextWriterStartElement(writer, ISC_XMLCHAR "states");
3934280849Scy		if (sock->pending_recv)
3935280849Scy			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3936280849Scy						ISC_XMLCHAR "pending-receive");
3937280849Scy		if (sock->pending_send)
3938280849Scy			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3939280849Scy						  ISC_XMLCHAR "pending-send");
3940280849Scy		if (sock->pending_accept)
3941280849Scy			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3942280849Scy						 ISC_XMLCHAR "pending_accept");
3943280849Scy		if (sock->listener)
3944280849Scy			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3945280849Scy						  ISC_XMLCHAR "listener");
3946280849Scy		if (sock->connected)
3947280849Scy			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3948280849Scy						  ISC_XMLCHAR "connected");
3949280849Scy		if (sock->pending_connect)
3950280849Scy			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3951280849Scy						  ISC_XMLCHAR "connecting");
3952280849Scy		if (sock->bound)
3953280849Scy			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
3954280849Scy						  ISC_XMLCHAR "bound");
3955280849Scy
3956280849Scy		xmlTextWriterEndElement(writer); /* states */
3957280849Scy
3958280849Scy		xmlTextWriterEndElement(writer); /* socket */
3959280849Scy
3960280849Scy		UNLOCK(&sock->lock);
3961280849Scy		sock = ISC_LIST_NEXT(sock, link);
3962280849Scy	}
3963280849Scy	xmlTextWriterEndElement(writer); /* sockets */
3964280849Scy
3965280849Scy	UNLOCK(&mgr->lock);
3966280849Scy}
3967280849Scy#endif /* HAVE_LIBXML2 */
3968