1258945Sroberto/*
2280849Scy * Copyright (C) 2004-2012  Internet Systems Consortium, Inc. ("ISC")
3258945Sroberto * Copyright (C) 1998-2003  Internet Software Consortium.
4258945Sroberto *
5258945Sroberto * Permission to use, copy, modify, and/or distribute this software for any
6258945Sroberto * purpose with or without fee is hereby granted, provided that the above
7258945Sroberto * copyright notice and this permission notice appear in all copies.
8258945Sroberto *
9258945Sroberto * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10258945Sroberto * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11258945Sroberto * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12258945Sroberto * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13258945Sroberto * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14258945Sroberto * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15258945Sroberto * PERFORMANCE OF THIS SOFTWARE.
16258945Sroberto */
17258945Sroberto
18280849Scy/* $Id$ */
19258945Sroberto
20258945Sroberto/*! \file */
21258945Sroberto
22258945Sroberto#include <config.h>
23258945Sroberto
24258945Sroberto#include <sys/param.h>
25258945Sroberto#include <sys/types.h>
26258945Sroberto#include <sys/socket.h>
27258945Sroberto#include <sys/stat.h>
28258945Sroberto#include <sys/time.h>
29258945Sroberto#include <sys/uio.h>
30258945Sroberto
31258945Sroberto#include <errno.h>
32258945Sroberto#include <fcntl.h>
33258945Sroberto#include <stddef.h>
34258945Sroberto#include <stdlib.h>
35258945Sroberto#include <string.h>
36258945Sroberto#include <unistd.h>
37258945Sroberto
38258945Sroberto#include <isc/buffer.h>
39258945Sroberto#include <isc/bufferlist.h>
40258945Sroberto#include <isc/condition.h>
41258945Sroberto#include <isc/formatcheck.h>
42258945Sroberto#include <isc/list.h>
43258945Sroberto#include <isc/log.h>
44258945Sroberto#include <isc/mem.h>
45258945Sroberto#include <isc/msgs.h>
46258945Sroberto#include <isc/mutex.h>
47258945Sroberto#include <isc/net.h>
48258945Sroberto#include <isc/once.h>
49258945Sroberto#include <isc/platform.h>
50258945Sroberto#include <isc/print.h>
51258945Sroberto#include <isc/region.h>
52258945Sroberto#include <isc/socket.h>
53258945Sroberto#include <isc/stats.h>
54258945Sroberto#include <isc/strerror.h>
55258945Sroberto#include <isc/task.h>
56258945Sroberto#include <isc/thread.h>
57258945Sroberto#include <isc/util.h>
58258945Sroberto#include <isc/xml.h>
59258945Sroberto
60258945Sroberto#ifdef ISC_PLATFORM_HAVESYSUNH
61258945Sroberto#include <sys/un.h>
62258945Sroberto#endif
63258945Sroberto#ifdef ISC_PLATFORM_HAVEKQUEUE
64258945Sroberto#include <sys/event.h>
65258945Sroberto#endif
66258945Sroberto#ifdef ISC_PLATFORM_HAVEEPOLL
67258945Sroberto#include <sys/epoll.h>
68258945Sroberto#endif
69258945Sroberto#ifdef ISC_PLATFORM_HAVEDEVPOLL
70280849Scy#if defined(HAVE_SYS_DEVPOLL_H)
71258945Sroberto#include <sys/devpoll.h>
72280849Scy#elif defined(HAVE_DEVPOLL_H)
73280849Scy#include <devpoll.h>
74258945Sroberto#endif
75280849Scy#endif
76258945Sroberto
77258945Sroberto#include "errno2result.h"
78258945Sroberto
79280849Scy/* See task.c about the following definition: */
80280849Scy#ifdef BIND9
81280849Scy#ifdef ISC_PLATFORM_USETHREADS
82280849Scy#define USE_WATCHER_THREAD
83280849Scy#else
84280849Scy#define USE_SHARED_MANAGER
85280849Scy#endif	/* ISC_PLATFORM_USETHREADS */
86280849Scy#endif	/* BIND9 */
87280849Scy
88280849Scy#ifndef USE_WATCHER_THREAD
89258945Sroberto#include "socket_p.h"
90280849Scy#include "../task_p.h"
91280849Scy#endif /* USE_WATCHER_THREAD */
92258945Sroberto
93258945Sroberto#if defined(SO_BSDCOMPAT) && defined(__linux__)
94258945Sroberto#include <sys/utsname.h>
95258945Sroberto#endif
96258945Sroberto
97258945Sroberto/*%
98258945Sroberto * Choose the most preferable multiplex method.
99258945Sroberto */
100258945Sroberto#ifdef ISC_PLATFORM_HAVEKQUEUE
101258945Sroberto#define USE_KQUEUE
102258945Sroberto#elif defined (ISC_PLATFORM_HAVEEPOLL)
103258945Sroberto#define USE_EPOLL
104258945Sroberto#elif defined (ISC_PLATFORM_HAVEDEVPOLL)
105258945Sroberto#define USE_DEVPOLL
/*%
 * Interest flags used with the /dev/poll backend (this type only exists
 * when USE_DEVPOLL is selected): one bit says a read is wanted, the other
 * that a write is wanted.
 */
typedef struct {
	unsigned int want_read : 1;
	unsigned int want_write : 1;
} pollinfo_t;
110258945Sroberto#else
111258945Sroberto#define USE_SELECT
112258945Sroberto#endif	/* ISC_PLATFORM_HAVEKQUEUE */
113258945Sroberto
#ifndef USE_WATCHER_THREAD
/*%
 * Wait-result record, only defined for builds without a watcher thread.
 *
 * For the event-based backends (kqueue, epoll, /dev/poll) it carries just
 * an event count.  For select() it carries pointers to the read and write
 * descriptor sets plus 'nfds' and 'maxfd'.
 * NOTE(review): the code that fills in and consumes this record is outside
 * this part of the file; confirm the exact meaning of 'nfds' and 'maxfd'
 * there.
 */
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
struct isc_socketwait {
	int nevents;
};
#elif defined (USE_SELECT)
struct isc_socketwait {
	fd_set *readset;
	fd_set *writeset;
	int nfds;
	int maxfd;
};
#endif	/* USE_KQUEUE */
#endif /* !USE_WATCHER_THREAD */
128258945Sroberto
/*%
 * Maximum number of allowable open sockets.  This is also the maximum
 * allowable socket file descriptor.
 *
 * Care should be taken before modifying this value for select():
 * The API standard doesn't ensure that select() accepts more than (the system
 * default of) FD_SETSIZE descriptors, and the default size should in fact be
 * fine in the vast majority of cases.  This constant should therefore be
 * increased only when absolutely necessary and possible, i.e., the server is
 * exhausting all available file descriptors (up to FD_SETSIZE) and the
 * select() function and FD_xxx macros support larger values than FD_SETSIZE
 * (which may not always be true, but we keep using some of them to ensure as
 * much portability as possible).  Note also that overall server performance
 * may be considerably worse with a larger value of this constant due to
 * inherent scalability problems of select().
 *
 * As a special note, this value shouldn't have to be touched if
 * this is a build for an authoritative only DNS server.
 */
148258945Sroberto#ifndef ISC_SOCKET_MAXSOCKETS
149258945Sroberto#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
150258945Sroberto#define ISC_SOCKET_MAXSOCKETS 4096
151258945Sroberto#elif defined(USE_SELECT)
152258945Sroberto#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
153258945Sroberto#endif	/* USE_KQUEUE... */
154258945Sroberto#endif	/* ISC_SOCKET_MAXSOCKETS */
155258945Sroberto
156258945Sroberto#ifdef USE_SELECT
157258945Sroberto/*%
158258945Sroberto * Mac OS X needs a special definition to support larger values in select().
159258945Sroberto * We always define this because a larger value can be specified run-time.
160258945Sroberto */
161258945Sroberto#ifdef __APPLE__
162258945Sroberto#define _DARWIN_UNLIMITED_SELECT
163258945Sroberto#endif	/* __APPLE__ */
164258945Sroberto#endif	/* USE_SELECT */
165258945Sroberto
166258945Sroberto#ifdef ISC_SOCKET_USE_POLLWATCH
167258945Sroberto/*%
168258945Sroberto * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
169258945Sroberto * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
170258945Sroberto * some of the specified FD.  The idea is based on the observation that it's
171258945Sroberto * likely for a busy server to keep receiving packets.  It specifically works
172258945Sroberto * as follows: the socket watcher is first initialized with the state of
173258945Sroberto * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
174258945Sroberto * event occurs.  When it wakes up for a socket I/O event, it moves to the
175258945Sroberto * poll_active state, and sets the poll timeout to a short period
176258945Sroberto * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
177258945Sroberto * watcher goes to the poll_checking state with the same timeout period.
178258945Sroberto * In this state, the watcher tries to detect whether this is a break
179258945Sroberto * during intermittent events or the kernel bug is triggered.  If the next
180258945Sroberto * polling reports an event within the short period, the previous timeout is
181258945Sroberto * likely to be a kernel bug, and so the watcher goes back to the active state.
182258945Sroberto * Otherwise, it moves to the idle state again.
183258945Sroberto *
184258945Sroberto * It's not clear whether this is a thread-related bug, but since we've only
185258945Sroberto * seen this with threads, this workaround is used only when enabling threads.
186258945Sroberto */
187258945Sroberto
/*% States of the poll watcher used by the /dev/poll workaround. */
typedef enum {
	poll_idle,
	poll_active,
	poll_checking
} pollstate_t;

/*%
 * Short poll timeout, in milliseconds, used while the watcher is in the
 * active or checking state.  May be overridden at build time.
 */
#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
#define ISC_SOCKET_POLLWATCH_TIMEOUT 10
#endif	/* ISC_SOCKET_POLLWATCH_TIMEOUT */
193258945Sroberto#endif	/* ISC_SOCKET_USE_POLLWATCH */
194258945Sroberto
/*%
 * Size of per-FD lock buckets.
 *
 * With threads, FDLOCK_ID() maps a descriptor to one of FDLOCK_COUNT
 * buckets by taking it modulo the bucket count.  Without threads there is
 * a single bucket and every descriptor maps to index 0 (the argument is
 * not evaluated in that case).
 */
#ifdef ISC_PLATFORM_USETHREADS
#define FDLOCK_COUNT		1024
#define FDLOCK_ID(fd)		((fd) % FDLOCK_COUNT)
#else
#define FDLOCK_COUNT		1
#define FDLOCK_ID(fd)		0
#endif	/* ISC_PLATFORM_USETHREADS */
205258945Sroberto
206258945Sroberto/*%
207258945Sroberto * Maximum number of events communicated with the kernel.  There should normally
208258945Sroberto * be no need for having a large number.
209258945Sroberto */
210258945Sroberto#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
211258945Sroberto#ifndef ISC_SOCKET_MAXEVENTS
212258945Sroberto#define ISC_SOCKET_MAXEVENTS	64
213258945Sroberto#endif
214258945Sroberto#endif
215258945Sroberto
/*%
 * Some systems define the socket length argument as an int, some as size_t,
 * some as socklen_t.  This is here so it can be easily changed if needed;
 * a definition supplied before this point takes precedence.
 */
#ifndef ISC_SOCKADDR_LEN_T
#define ISC_SOCKADDR_LEN_T unsigned int
#endif
223258945Sroberto
/*%
 * Define what the possible "soft" errors can be.  These are non-fatal returns
 * of various network related functions, like recv() and so on.
 *
 * For some reason, BSDI (and perhaps others) will sometimes return <0
 * from recv() but will have errno==0.  This is broken, but we have to
 * work around it here.
 *
 * Note: the argument is evaluated more than once, so it must not have
 * side effects.
 */
#define SOFT_ERROR(e)	((e) == EAGAIN || \
			 (e) == EWOULDBLOCK || \
			 (e) == EINTR || \
			 (e) == 0)
236258945Sroberto
/*%
 * Expands to the three leading log arguments (category, module, level) for
 * a debug message at level 'x' in the socket module.
 */
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/*!<
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
#define TRACE_LEVEL		90
#define CORRECTNESS_LEVEL	70
#define IOEVENT_LEVEL		60
#define EVENT_LEVEL		50
#define CREATION_LEVEL		20

/* Ready-made argument bundles for each of the levels above. */
#define TRACE		DLVL(TRACE_LEVEL)
#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define CREATION	DLVL(CREATION_LEVEL)
257258945Sroberto
258258945Srobertotypedef isc_event_t intev_t;
259258945Sroberto
260258945Sroberto#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
261280849Scy#define VALID_SOCKET(s)		ISC_MAGIC_VALID(s, SOCKET_MAGIC)
262258945Sroberto
263258945Sroberto/*!
264258945Sroberto * IPv6 control information.  If the socket is an IPv6 socket we want
265258945Sroberto * to collect the destination address and interface so the client can
266258945Sroberto * set them on outgoing packets.
267258945Sroberto */
268258945Sroberto#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
269258945Sroberto#ifndef USE_CMSG
270258945Sroberto#define USE_CMSG	1
271258945Sroberto#endif
272258945Sroberto#endif
273258945Sroberto
274258945Sroberto/*%
275258945Sroberto * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
276258945Sroberto * a setsockopt() like interface to request timestamps, and if the OS
277258945Sroberto * doesn't do it for us, call gettimeofday() on every UDP receive?
278258945Sroberto */
279258945Sroberto#ifdef SO_TIMESTAMP
280258945Sroberto#ifndef USE_CMSG
281258945Sroberto#define USE_CMSG	1
282258945Sroberto#endif
283258945Sroberto#endif
284258945Sroberto
/*%
 * The size to raise the receive buffer to (from BIND 8): 32 KiB.
 */
#define RCVBUFSIZE (32*1024)

/*%
 * The number of times a send operation is repeated if the result is EINTR.
 */
#define NRETRIES 10
294258945Sroberto
295280849Scytypedef struct isc__socket isc__socket_t;
296280849Scytypedef struct isc__socketmgr isc__socketmgr_t;
297280849Scy
298280849Scy#define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket)
299280849Scy
300280849Scystruct isc__socket {
301258945Sroberto	/* Not locked. */
302280849Scy	isc_socket_t		common;
303280849Scy	isc__socketmgr_t	*manager;
304258945Sroberto	isc_mutex_t		lock;
305258945Sroberto	isc_sockettype_t	type;
306258945Sroberto	const isc_statscounter_t	*statsindex;
307258945Sroberto
308258945Sroberto	/* Locked by socket lock. */
309280849Scy	ISC_LINK(isc__socket_t)	link;
310258945Sroberto	unsigned int		references;
311258945Sroberto	int			fd;
312258945Sroberto	int			pf;
313258945Sroberto	char				name[16];
314258945Sroberto	void *				tag;
315258945Sroberto
316258945Sroberto	ISC_LIST(isc_socketevent_t)		send_list;
317258945Sroberto	ISC_LIST(isc_socketevent_t)		recv_list;
318258945Sroberto	ISC_LIST(isc_socket_newconnev_t)	accept_list;
319258945Sroberto	isc_socket_connev_t		       *connect_ev;
320258945Sroberto
321258945Sroberto	/*
322258945Sroberto	 * Internal events.  Posted when a descriptor is readable or
323258945Sroberto	 * writable.  These are statically allocated and never freed.
324258945Sroberto	 * They will be set to non-purgable before use.
325258945Sroberto	 */
326258945Sroberto	intev_t			readable_ev;
327258945Sroberto	intev_t			writable_ev;
328258945Sroberto
329258945Sroberto	isc_sockaddr_t		peer_address;  /* remote address */
330258945Sroberto
331258945Sroberto	unsigned int		pending_recv : 1,
332258945Sroberto				pending_send : 1,
333258945Sroberto				pending_accept : 1,
334258945Sroberto				listener : 1, /* listener socket */
335258945Sroberto				connected : 1,
336258945Sroberto				connecting : 1, /* connect pending */
337280849Scy				bound : 1, /* bound to local addr */
338280849Scy				dupped : 1;
339258945Sroberto
340258945Sroberto#ifdef ISC_NET_RECVOVERFLOW
341258945Sroberto	unsigned char		overflow; /* used for MSG_TRUNC fake */
342258945Sroberto#endif
343258945Sroberto
344258945Sroberto	char			*recvcmsgbuf;
345258945Sroberto	ISC_SOCKADDR_LEN_T	recvcmsgbuflen;
346258945Sroberto	char			*sendcmsgbuf;
347258945Sroberto	ISC_SOCKADDR_LEN_T	sendcmsgbuflen;
348258945Sroberto
349258945Sroberto	void			*fdwatcharg;
350258945Sroberto	isc_sockfdwatch_t	fdwatchcb;
351258945Sroberto	int			fdwatchflags;
352258945Sroberto	isc_task_t		*fdwatchtask;
353258945Sroberto};
354258945Sroberto
355258945Sroberto#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
356258945Sroberto#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
357258945Sroberto
358280849Scystruct isc__socketmgr {
359258945Sroberto	/* Not locked. */
360280849Scy	isc_socketmgr_t		common;
361258945Sroberto	isc_mem_t	       *mctx;
362258945Sroberto	isc_mutex_t		lock;
363258945Sroberto	isc_mutex_t		*fdlock;
364258945Sroberto	isc_stats_t		*stats;
365258945Sroberto#ifdef USE_KQUEUE
366258945Sroberto	int			kqueue_fd;
367258945Sroberto	int			nevents;
368258945Sroberto	struct kevent		*events;
369258945Sroberto#endif	/* USE_KQUEUE */
370258945Sroberto#ifdef USE_EPOLL
371258945Sroberto	int			epoll_fd;
372258945Sroberto	int			nevents;
373258945Sroberto	struct epoll_event	*events;
374258945Sroberto#endif	/* USE_EPOLL */
375258945Sroberto#ifdef USE_DEVPOLL
376258945Sroberto	int			devpoll_fd;
377258945Sroberto	int			nevents;
378258945Sroberto	struct pollfd		*events;
379258945Sroberto#endif	/* USE_DEVPOLL */
380258945Sroberto#ifdef USE_SELECT
381258945Sroberto	int			fd_bufsize;
382258945Sroberto#endif	/* USE_SELECT */
383258945Sroberto	unsigned int		maxsocks;
384258945Sroberto#ifdef ISC_PLATFORM_USETHREADS
385258945Sroberto	int			pipe_fds[2];
386258945Sroberto#endif
387258945Sroberto
388258945Sroberto	/* Locked by fdlock. */
389280849Scy	isc__socket_t	       **fds;
390258945Sroberto	int			*fdstate;
391258945Sroberto#ifdef USE_DEVPOLL
392258945Sroberto	pollinfo_t		*fdpollinfo;
393258945Sroberto#endif
394258945Sroberto
395258945Sroberto	/* Locked by manager lock. */
396280849Scy	ISC_LIST(isc__socket_t)	socklist;
397258945Sroberto#ifdef USE_SELECT
398258945Sroberto	fd_set			*read_fds;
399258945Sroberto	fd_set			*read_fds_copy;
400258945Sroberto	fd_set			*write_fds;
401258945Sroberto	fd_set			*write_fds_copy;
402258945Sroberto	int			maxfd;
403258945Sroberto#endif	/* USE_SELECT */
404258945Sroberto	int			reserved;	/* unlocked */
405280849Scy#ifdef USE_WATCHER_THREAD
406258945Sroberto	isc_thread_t		watcher;
407258945Sroberto	isc_condition_t		shutdown_ok;
408280849Scy#else /* USE_WATCHER_THREAD */
409258945Sroberto	unsigned int		refs;
410280849Scy#endif /* USE_WATCHER_THREAD */
411280849Scy	int			maxudp;
412258945Sroberto};
413258945Sroberto
#ifdef USE_SHARED_MANAGER
/*% The single manager instance used by shared-manager builds; starts NULL. */
static isc__socketmgr_t *socketmgr = NULL;
#endif /* USE_SHARED_MANAGER */
417258945Sroberto
/*
 * Descriptor state values.
 */
#define CLOSED			0	/* this one must be zero */
#define MANAGED			1
#define CLOSE_PENDING		2

/*
 * send() and recv() iovec counts.  When ISC_NET_RECVOVERFLOW is defined,
 * receives get one more iovec than sends.
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
#ifdef ISC_NET_RECVOVERFLOW
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER + 1)
#else
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
#endif
431258945Sroberto
432280849Scystatic isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
433280849Scy				  isc_sockettype_t type,
434280849Scy				  isc_socket_t **socketp,
435280849Scy				  isc_socket_t *dup_socket);
436280849Scystatic void send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
437280849Scystatic void send_senddone_event(isc__socket_t *, isc_socketevent_t **);
438280849Scystatic void free_socket(isc__socket_t **);
439280849Scystatic isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t,
440280849Scy				    isc__socket_t **);
441280849Scystatic void destroy(isc__socket_t **);
442258945Srobertostatic void internal_accept(isc_task_t *, isc_event_t *);
443258945Srobertostatic void internal_connect(isc_task_t *, isc_event_t *);
444258945Srobertostatic void internal_recv(isc_task_t *, isc_event_t *);
445258945Srobertostatic void internal_send(isc_task_t *, isc_event_t *);
446258945Srobertostatic void internal_fdwatch_write(isc_task_t *, isc_event_t *);
447258945Srobertostatic void internal_fdwatch_read(isc_task_t *, isc_event_t *);
448280849Scystatic void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
449280849Scystatic void build_msghdr_send(isc__socket_t *, isc_socketevent_t *,
450258945Sroberto			      struct msghdr *, struct iovec *, size_t *);
451280849Scystatic void build_msghdr_recv(isc__socket_t *, isc_socketevent_t *,
452258945Sroberto			      struct msghdr *, struct iovec *, size_t *);
453280849Scy#ifdef USE_WATCHER_THREAD
454280849Scystatic isc_boolean_t process_ctlfd(isc__socketmgr_t *manager);
455258945Sroberto#endif
456258945Sroberto
457280849Scy/*%
458280849Scy * The following can be either static or public, depending on build environment.
459280849Scy */
460280849Scy
461280849Scy#ifdef BIND9
462280849Scy#define ISC_SOCKETFUNC_SCOPE
463280849Scy#else
464280849Scy#define ISC_SOCKETFUNC_SCOPE static
465280849Scy#endif
466280849Scy
467280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
468280849Scyisc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
469280849Scy		   isc_socket_t **socketp);
470280849ScyISC_SOCKETFUNC_SCOPE void
471280849Scyisc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp);
472280849ScyISC_SOCKETFUNC_SCOPE void
473280849Scyisc__socket_detach(isc_socket_t **socketp);
474280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
475280849Scyisc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp);
476280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
477280849Scyisc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
478280849Scy		       unsigned int maxsocks);
479280849ScyISC_SOCKETFUNC_SCOPE void
480280849Scyisc__socketmgr_destroy(isc_socketmgr_t **managerp);
481280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
482280849Scyisc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
483280849Scy		 unsigned int minimum, isc_task_t *task,
484280849Scy		  isc_taskaction_t action, const void *arg);
485280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
486280849Scyisc__socket_recv(isc_socket_t *sock, isc_region_t *region,
487280849Scy		 unsigned int minimum, isc_task_t *task,
488280849Scy		 isc_taskaction_t action, const void *arg);
489280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
490280849Scyisc__socket_recv2(isc_socket_t *sock, isc_region_t *region,
491280849Scy		  unsigned int minimum, isc_task_t *task,
492280849Scy		  isc_socketevent_t *event, unsigned int flags);
493280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
494280849Scyisc__socket_send(isc_socket_t *sock, isc_region_t *region,
495280849Scy		 isc_task_t *task, isc_taskaction_t action, const void *arg);
496280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
497280849Scyisc__socket_sendto(isc_socket_t *sock, isc_region_t *region,
498280849Scy		   isc_task_t *task, isc_taskaction_t action, const void *arg,
499280849Scy		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
500280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
501280849Scyisc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
502280849Scy		  isc_task_t *task, isc_taskaction_t action, const void *arg);
503280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
504280849Scyisc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
505280849Scy		    isc_task_t *task, isc_taskaction_t action, const void *arg,
506280849Scy		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
507280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
508280849Scyisc__socket_sendto2(isc_socket_t *sock, isc_region_t *region,
509280849Scy		    isc_task_t *task,
510280849Scy		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
511280849Scy		    isc_socketevent_t *event, unsigned int flags);
512280849ScyISC_SOCKETFUNC_SCOPE void
513280849Scyisc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active);
514280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
515280849Scyisc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
516280849Scy		     isc_uint32_t owner, isc_uint32_t group);
517280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
518280849Scyisc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
519280849Scy		 unsigned int options);
520280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
521280849Scyisc__socket_filter(isc_socket_t *sock, const char *filter);
522280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
523280849Scyisc__socket_listen(isc_socket_t *sock, unsigned int backlog);
524280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
525280849Scyisc__socket_accept(isc_socket_t *sock,
526280849Scy		   isc_task_t *task, isc_taskaction_t action, const void *arg);
527280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
528280849Scyisc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
529280849Scy		    isc_task_t *task, isc_taskaction_t action,
530280849Scy		    const void *arg);
531280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
532280849Scyisc__socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp);
533280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
534280849Scyisc__socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp);
535280849ScyISC_SOCKETFUNC_SCOPE void
536280849Scyisc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how);
537280849ScyISC_SOCKETFUNC_SCOPE isc_sockettype_t
538280849Scyisc__socket_gettype(isc_socket_t *sock);
539280849ScyISC_SOCKETFUNC_SCOPE isc_boolean_t
540280849Scyisc__socket_isbound(isc_socket_t *sock);
541280849ScyISC_SOCKETFUNC_SCOPE void
542280849Scyisc__socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes);
543280849Scy#if defined(HAVE_LIBXML2) && defined(BIND9)
544280849ScyISC_SOCKETFUNC_SCOPE void
545280849Scyisc__socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer);
546280849Scy#endif
547280849Scy
548280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
549280849Scyisc__socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
550280849Scy			  isc_sockfdwatch_t callback, void *cbarg,
551280849Scy			  isc_task_t *task, isc_socket_t **socketp);
552280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
553280849Scyisc__socket_fdwatchpoke(isc_socket_t *sock, int flags);
554280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
555280849Scyisc__socket_dup(isc_socket_t *sock, isc_socket_t **socketp);
556280849ScyISC_SOCKETFUNC_SCOPE int
557280849Scyisc__socket_getfd(isc_socket_t *sock);
558280849Scy
559280849Scystatic struct {
560280849Scy	isc_socketmethods_t methods;
561280849Scy
562280849Scy	/*%
563280849Scy	 * The following are defined just for avoiding unused static functions.
564280849Scy	 */
565280849Scy#ifndef BIND9
566280849Scy	void *recvv, *send, *sendv, *sendto2, *cleanunix, *permunix, *filter,
567280849Scy		*listen, *accept, *getpeername, *isbound;
568280849Scy#endif
569280849Scy} socketmethods = {
570280849Scy	{
571280849Scy		isc__socket_attach,
572280849Scy		isc__socket_detach,
573280849Scy		isc__socket_bind,
574280849Scy		isc__socket_sendto,
575280849Scy		isc__socket_connect,
576280849Scy		isc__socket_recv,
577280849Scy		isc__socket_cancel,
578280849Scy		isc__socket_getsockname,
579280849Scy		isc__socket_gettype,
580280849Scy		isc__socket_ipv6only,
581280849Scy		isc__socket_fdwatchpoke,
582280849Scy		isc__socket_dup,
583280849Scy		isc__socket_getfd
584280849Scy	}
585280849Scy#ifndef BIND9
586280849Scy	,
587280849Scy	(void *)isc__socket_recvv, (void *)isc__socket_send,
588280849Scy	(void *)isc__socket_sendv, (void *)isc__socket_sendto2,
589280849Scy	(void *)isc__socket_cleanunix, (void *)isc__socket_permunix,
590280849Scy	(void *)isc__socket_filter, (void *)isc__socket_listen,
591280849Scy	(void *)isc__socket_accept, (void *)isc__socket_getpeername,
592280849Scy	(void *)isc__socket_isbound
593280849Scy#endif
594280849Scy};
595280849Scy
596280849Scystatic isc_socketmgrmethods_t socketmgrmethods = {
597280849Scy	isc__socketmgr_destroy,
598280849Scy	isc__socket_create,
599280849Scy	isc__socket_fdwatchcreate
600280849Scy};
601280849Scy
/*
 * Poke message codes.  All are negative; ACCEPT shares its value with
 * READ, and CONNECT shares its value with WRITE.
 */
#define SELECT_POKE_SHUTDOWN		(-1)
#define SELECT_POKE_NOTHING		(-2)
#define SELECT_POKE_READ		(-3)
#define SELECT_POKE_ACCEPT		(-3) /*%< Same as _READ */
#define SELECT_POKE_WRITE		(-4)
#define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
#define SELECT_POKE_CLOSE		(-5)

/*% True when socket 's' has no references left. */
#define SOCK_DEAD(s)			((s)->references == 0)
611258945Sroberto
/*%
 * Shortcut index arrays to get access to statistics counters.
 *
 * Each STATID_* value is a position in the per-socket-type
 * *statsindex[] tables that follow.
 */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL = 1,
	STATID_CLOSE = 2,
	STATID_BINDFAIL = 3,
	STATID_CONNECTFAIL = 4,
	STATID_CONNECT = 5,
	STATID_ACCEPTFAIL = 6,
	STATID_ACCEPT = 7,
	STATID_SENDFAIL = 8,
	STATID_RECVFAIL = 9
};
627258945Srobertostatic const isc_statscounter_t upd4statsindex[] = {
628258945Sroberto	isc_sockstatscounter_udp4open,
629258945Sroberto	isc_sockstatscounter_udp4openfail,
630258945Sroberto	isc_sockstatscounter_udp4close,
631258945Sroberto	isc_sockstatscounter_udp4bindfail,
632258945Sroberto	isc_sockstatscounter_udp4connectfail,
633258945Sroberto	isc_sockstatscounter_udp4connect,
634258945Sroberto	-1,
635258945Sroberto	-1,
636258945Sroberto	isc_sockstatscounter_udp4sendfail,
637258945Sroberto	isc_sockstatscounter_udp4recvfail
638258945Sroberto};
639258945Srobertostatic const isc_statscounter_t upd6statsindex[] = {
640258945Sroberto	isc_sockstatscounter_udp6open,
641258945Sroberto	isc_sockstatscounter_udp6openfail,
642258945Sroberto	isc_sockstatscounter_udp6close,
643258945Sroberto	isc_sockstatscounter_udp6bindfail,
644258945Sroberto	isc_sockstatscounter_udp6connectfail,
645258945Sroberto	isc_sockstatscounter_udp6connect,
646258945Sroberto	-1,
647258945Sroberto	-1,
648258945Sroberto	isc_sockstatscounter_udp6sendfail,
649258945Sroberto	isc_sockstatscounter_udp6recvfail
650258945Sroberto};
651258945Srobertostatic const isc_statscounter_t tcp4statsindex[] = {
652258945Sroberto	isc_sockstatscounter_tcp4open,
653258945Sroberto	isc_sockstatscounter_tcp4openfail,
654258945Sroberto	isc_sockstatscounter_tcp4close,
655258945Sroberto	isc_sockstatscounter_tcp4bindfail,
656258945Sroberto	isc_sockstatscounter_tcp4connectfail,
657258945Sroberto	isc_sockstatscounter_tcp4connect,
658258945Sroberto	isc_sockstatscounter_tcp4acceptfail,
659258945Sroberto	isc_sockstatscounter_tcp4accept,
660258945Sroberto	isc_sockstatscounter_tcp4sendfail,
661258945Sroberto	isc_sockstatscounter_tcp4recvfail
662258945Sroberto};
663258945Srobertostatic const isc_statscounter_t tcp6statsindex[] = {
664258945Sroberto	isc_sockstatscounter_tcp6open,
665258945Sroberto	isc_sockstatscounter_tcp6openfail,
666258945Sroberto	isc_sockstatscounter_tcp6close,
667258945Sroberto	isc_sockstatscounter_tcp6bindfail,
668258945Sroberto	isc_sockstatscounter_tcp6connectfail,
669258945Sroberto	isc_sockstatscounter_tcp6connect,
670258945Sroberto	isc_sockstatscounter_tcp6acceptfail,
671258945Sroberto	isc_sockstatscounter_tcp6accept,
672258945Sroberto	isc_sockstatscounter_tcp6sendfail,
673258945Sroberto	isc_sockstatscounter_tcp6recvfail
674258945Sroberto};
675258945Srobertostatic const isc_statscounter_t unixstatsindex[] = {
676258945Sroberto	isc_sockstatscounter_unixopen,
677258945Sroberto	isc_sockstatscounter_unixopenfail,
678258945Sroberto	isc_sockstatscounter_unixclose,
679258945Sroberto	isc_sockstatscounter_unixbindfail,
680258945Sroberto	isc_sockstatscounter_unixconnectfail,
681258945Sroberto	isc_sockstatscounter_unixconnect,
682258945Sroberto	isc_sockstatscounter_unixacceptfail,
683258945Sroberto	isc_sockstatscounter_unixaccept,
684258945Sroberto	isc_sockstatscounter_unixsendfail,
685258945Sroberto	isc_sockstatscounter_unixrecvfail
686258945Sroberto};
687258945Srobertostatic const isc_statscounter_t fdwatchstatsindex[] = {
688258945Sroberto	-1,
689258945Sroberto	-1,
690258945Sroberto	isc_sockstatscounter_fdwatchclose,
691258945Sroberto	isc_sockstatscounter_fdwatchbindfail,
692258945Sroberto	isc_sockstatscounter_fdwatchconnectfail,
693258945Sroberto	isc_sockstatscounter_fdwatchconnect,
694258945Sroberto	-1,
695258945Sroberto	-1,
696258945Sroberto	isc_sockstatscounter_fdwatchsendfail,
697258945Sroberto	isc_sockstatscounter_fdwatchrecvfail
698258945Sroberto};
699258945Sroberto
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) || \
    defined(USE_WATCHER_THREAD)
static void
manager_log(isc__socketmgr_t *sockmgr,
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);

/*
 * Log a printf-style message on behalf of a socket manager.
 *
 * The formatted text is prefixed with "sockmgr <pointer>: " and written to
 * isc_lctx with the given category, module and level.  Nothing is formatted
 * when isc_log_wouldlog() says the level would not be logged.  The message
 * is formatted into a 2048-byte buffer; vsnprintf() truncates anything
 * longer.
 */
static void
manager_log(isc__socketmgr_t *sockmgr,
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...)
{
	va_list args;
	char text[2048];

	if (isc_log_wouldlog(isc_lctx, level)) {
		va_start(args, fmt);
		vsnprintf(text, sizeof(text), fmt, args);
		va_end(args);

		isc_log_write(isc_lctx, category, module, level,
			      "sockmgr %p: %s", sockmgr, text);
	}
}
#endif
725258945Sroberto
726258945Srobertostatic void
727280849Scysocket_log(isc__socket_t *sock, isc_sockaddr_t *address,
728258945Sroberto	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
729258945Sroberto	   isc_msgcat_t *msgcat, int msgset, int message,
730258945Sroberto	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
731258945Srobertostatic void
732280849Scysocket_log(isc__socket_t *sock, isc_sockaddr_t *address,
733258945Sroberto	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
734258945Sroberto	   isc_msgcat_t *msgcat, int msgset, int message,
735258945Sroberto	   const char *fmt, ...)
736258945Sroberto{
737258945Sroberto	char msgbuf[2048];
738258945Sroberto	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
739258945Sroberto	va_list ap;
740258945Sroberto
741258945Sroberto	if (! isc_log_wouldlog(isc_lctx, level))
742258945Sroberto		return;
743258945Sroberto
744258945Sroberto	va_start(ap, fmt);
745258945Sroberto	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
746258945Sroberto	va_end(ap);
747258945Sroberto
748258945Sroberto	if (address == NULL) {
749258945Sroberto		isc_log_iwrite(isc_lctx, category, module, level,
750258945Sroberto			       msgcat, msgset, message,
751258945Sroberto			       "socket %p: %s", sock, msgbuf);
752258945Sroberto	} else {
753258945Sroberto		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
754258945Sroberto		isc_log_iwrite(isc_lctx, category, module, level,
755258945Sroberto			       msgcat, msgset, message,
756258945Sroberto			       "socket %p %s: %s", sock, peerbuf, msgbuf);
757258945Sroberto	}
758258945Sroberto}
759258945Sroberto
#if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
    defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
/*
 * Workaround for an AIX kernel bug: setting IPV6_V6ONLY clears
 * IPV6_RECVPKTINFO, so turn IPV6_RECVPKTINFO back on.
 *
 * Only IPv6 UDP sockets are touched.  A setsockopt() failure is reported
 * through UNEXPECTED_ERROR and otherwise ignored.
 */
static void
FIX_IPV6_RECVPKTINFO(isc__socket_t *sock)
{
	int enable = 1;
	char errtext[ISC_STRERRORSIZE];

	if (sock->pf == AF_INET6 && sock->type == isc_sockettype_udp &&
	    setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
		       (void *)&enable, sizeof(enable)) < 0) {
		isc__strerror(errno, errtext, sizeof(errtext));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IPV6_RECVPKTINFO) "
				 "%s: %s", sock->fd,
				 isc_msgcat_get(isc_msgcat,
						ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED,
						"failed"),
				 errtext);
	}
}
#else
/* No workaround needed on this platform: expands to a no-op. */
#define FIX_IPV6_RECVPKTINFO(sock) (void)0
#endif
792258945Sroberto
793258945Sroberto/*%
794258945Sroberto * Increment socket-related statistics counters.
795258945Sroberto */
796258945Srobertostatic inline void
797258945Srobertoinc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
798258945Sroberto	REQUIRE(counterid != -1);
799258945Sroberto
800258945Sroberto	if (stats != NULL)
801258945Sroberto		isc_stats_increment(stats, counterid);
802258945Sroberto}
803258945Sroberto
804258945Srobertostatic inline isc_result_t
805280849Scywatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
806258945Sroberto	isc_result_t result = ISC_R_SUCCESS;
807258945Sroberto
808258945Sroberto#ifdef USE_KQUEUE
809258945Sroberto	struct kevent evchange;
810258945Sroberto
811258945Sroberto	memset(&evchange, 0, sizeof(evchange));
812258945Sroberto	if (msg == SELECT_POKE_READ)
813258945Sroberto		evchange.filter = EVFILT_READ;
814258945Sroberto	else
815258945Sroberto		evchange.filter = EVFILT_WRITE;
816258945Sroberto	evchange.flags = EV_ADD;
817258945Sroberto	evchange.ident = fd;
818258945Sroberto	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
819258945Sroberto		result = isc__errno2result(errno);
820258945Sroberto
821258945Sroberto	return (result);
822258945Sroberto#elif defined(USE_EPOLL)
823258945Sroberto	struct epoll_event event;
824258945Sroberto
825258945Sroberto	if (msg == SELECT_POKE_READ)
826258945Sroberto		event.events = EPOLLIN;
827258945Sroberto	else
828258945Sroberto		event.events = EPOLLOUT;
829280849Scy	memset(&event.data, 0, sizeof(event.data));
830258945Sroberto	event.data.fd = fd;
831258945Sroberto	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
832258945Sroberto	    errno != EEXIST) {
833258945Sroberto		result = isc__errno2result(errno);
834258945Sroberto	}
835258945Sroberto
836258945Sroberto	return (result);
837258945Sroberto#elif defined(USE_DEVPOLL)
838258945Sroberto	struct pollfd pfd;
839258945Sroberto	int lockid = FDLOCK_ID(fd);
840258945Sroberto
841258945Sroberto	memset(&pfd, 0, sizeof(pfd));
842258945Sroberto	if (msg == SELECT_POKE_READ)
843258945Sroberto		pfd.events = POLLIN;
844258945Sroberto	else
845258945Sroberto		pfd.events = POLLOUT;
846258945Sroberto	pfd.fd = fd;
847258945Sroberto	pfd.revents = 0;
848258945Sroberto	LOCK(&manager->fdlock[lockid]);
849258945Sroberto	if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
850258945Sroberto		result = isc__errno2result(errno);
851258945Sroberto	else {
852258945Sroberto		if (msg == SELECT_POKE_READ)
853258945Sroberto			manager->fdpollinfo[fd].want_read = 1;
854258945Sroberto		else
855258945Sroberto			manager->fdpollinfo[fd].want_write = 1;
856258945Sroberto	}
857258945Sroberto	UNLOCK(&manager->fdlock[lockid]);
858258945Sroberto
859258945Sroberto	return (result);
860258945Sroberto#elif defined(USE_SELECT)
861258945Sroberto	LOCK(&manager->lock);
862258945Sroberto	if (msg == SELECT_POKE_READ)
863258945Sroberto		FD_SET(fd, manager->read_fds);
864258945Sroberto	if (msg == SELECT_POKE_WRITE)
865258945Sroberto		FD_SET(fd, manager->write_fds);
866258945Sroberto	UNLOCK(&manager->lock);
867258945Sroberto
868258945Sroberto	return (result);
869258945Sroberto#endif
870258945Sroberto}
871258945Sroberto
872258945Srobertostatic inline isc_result_t
873280849Scyunwatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
874258945Sroberto	isc_result_t result = ISC_R_SUCCESS;
875258945Sroberto
876258945Sroberto#ifdef USE_KQUEUE
877258945Sroberto	struct kevent evchange;
878258945Sroberto
879258945Sroberto	memset(&evchange, 0, sizeof(evchange));
880258945Sroberto	if (msg == SELECT_POKE_READ)
881258945Sroberto		evchange.filter = EVFILT_READ;
882258945Sroberto	else
883258945Sroberto		evchange.filter = EVFILT_WRITE;
884258945Sroberto	evchange.flags = EV_DELETE;
885258945Sroberto	evchange.ident = fd;
886258945Sroberto	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
887258945Sroberto		result = isc__errno2result(errno);
888258945Sroberto
889258945Sroberto	return (result);
890258945Sroberto#elif defined(USE_EPOLL)
891258945Sroberto	struct epoll_event event;
892258945Sroberto
893258945Sroberto	if (msg == SELECT_POKE_READ)
894258945Sroberto		event.events = EPOLLIN;
895258945Sroberto	else
896258945Sroberto		event.events = EPOLLOUT;
897280849Scy	memset(&event.data, 0, sizeof(event.data));
898258945Sroberto	event.data.fd = fd;
899258945Sroberto	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
900258945Sroberto	    errno != ENOENT) {
901258945Sroberto		char strbuf[ISC_STRERRORSIZE];
902258945Sroberto		isc__strerror(errno, strbuf, sizeof(strbuf));
903258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__,
904258945Sroberto				 "epoll_ctl(DEL), %d: %s", fd, strbuf);
905258945Sroberto		result = ISC_R_UNEXPECTED;
906258945Sroberto	}
907258945Sroberto	return (result);
908258945Sroberto#elif defined(USE_DEVPOLL)
909258945Sroberto	struct pollfd pfds[2];
910258945Sroberto	size_t writelen = sizeof(pfds[0]);
911258945Sroberto	int lockid = FDLOCK_ID(fd);
912258945Sroberto
913258945Sroberto	memset(pfds, 0, sizeof(pfds));
914258945Sroberto	pfds[0].events = POLLREMOVE;
915258945Sroberto	pfds[0].fd = fd;
916258945Sroberto
917258945Sroberto	/*
918258945Sroberto	 * Canceling read or write polling via /dev/poll is tricky.  Since it
919258945Sroberto	 * only provides a way of canceling per FD, we may need to re-poll the
920258945Sroberto	 * socket for the other operation.
921258945Sroberto	 */
922258945Sroberto	LOCK(&manager->fdlock[lockid]);
923258945Sroberto	if (msg == SELECT_POKE_READ &&
924258945Sroberto	    manager->fdpollinfo[fd].want_write == 1) {
925258945Sroberto		pfds[1].events = POLLOUT;
926258945Sroberto		pfds[1].fd = fd;
927258945Sroberto		writelen += sizeof(pfds[1]);
928258945Sroberto	}
929258945Sroberto	if (msg == SELECT_POKE_WRITE &&
930258945Sroberto	    manager->fdpollinfo[fd].want_read == 1) {
931258945Sroberto		pfds[1].events = POLLIN;
932258945Sroberto		pfds[1].fd = fd;
933258945Sroberto		writelen += sizeof(pfds[1]);
934258945Sroberto	}
935258945Sroberto
936258945Sroberto	if (write(manager->devpoll_fd, pfds, writelen) == -1)
937258945Sroberto		result = isc__errno2result(errno);
938258945Sroberto	else {
939258945Sroberto		if (msg == SELECT_POKE_READ)
940258945Sroberto			manager->fdpollinfo[fd].want_read = 0;
941258945Sroberto		else
942258945Sroberto			manager->fdpollinfo[fd].want_write = 0;
943258945Sroberto	}
944258945Sroberto	UNLOCK(&manager->fdlock[lockid]);
945258945Sroberto
946258945Sroberto	return (result);
947258945Sroberto#elif defined(USE_SELECT)
948258945Sroberto	LOCK(&manager->lock);
949258945Sroberto	if (msg == SELECT_POKE_READ)
950258945Sroberto		FD_CLR(fd, manager->read_fds);
951258945Sroberto	else if (msg == SELECT_POKE_WRITE)
952258945Sroberto		FD_CLR(fd, manager->write_fds);
953258945Sroberto	UNLOCK(&manager->lock);
954258945Sroberto
955258945Sroberto	return (result);
956258945Sroberto#endif
957258945Sroberto}
958258945Sroberto
959258945Srobertostatic void
960280849Scywakeup_socket(isc__socketmgr_t *manager, int fd, int msg) {
961258945Sroberto	isc_result_t result;
962258945Sroberto	int lockid = FDLOCK_ID(fd);
963258945Sroberto
964258945Sroberto	/*
965258945Sroberto	 * This is a wakeup on a socket.  If the socket is not in the
966258945Sroberto	 * process of being closed, start watching it for either reads
967258945Sroberto	 * or writes.
968258945Sroberto	 */
969258945Sroberto
970258945Sroberto	INSIST(fd >= 0 && fd < (int)manager->maxsocks);
971258945Sroberto
972258945Sroberto	if (msg == SELECT_POKE_CLOSE) {
973258945Sroberto		/* No one should be updating fdstate, so no need to lock it */
974258945Sroberto		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
975258945Sroberto		manager->fdstate[fd] = CLOSED;
976258945Sroberto		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
977258945Sroberto		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
978258945Sroberto		(void)close(fd);
979258945Sroberto		return;
980258945Sroberto	}
981258945Sroberto
982258945Sroberto	LOCK(&manager->fdlock[lockid]);
983258945Sroberto	if (manager->fdstate[fd] == CLOSE_PENDING) {
984258945Sroberto		UNLOCK(&manager->fdlock[lockid]);
985258945Sroberto
986258945Sroberto		/*
987258945Sroberto		 * We accept (and ignore) any error from unwatch_fd() as we are
988258945Sroberto		 * closing the socket, hoping it doesn't leave dangling state in
989258945Sroberto		 * the kernel.
990258945Sroberto		 * Note that unwatch_fd() must be called after releasing the
991258945Sroberto		 * fdlock; otherwise it could cause deadlock due to a lock order
992258945Sroberto		 * reversal.
993258945Sroberto		 */
994258945Sroberto		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
995258945Sroberto		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
996258945Sroberto		return;
997258945Sroberto	}
998258945Sroberto	if (manager->fdstate[fd] != MANAGED) {
999258945Sroberto		UNLOCK(&manager->fdlock[lockid]);
1000258945Sroberto		return;
1001258945Sroberto	}
1002258945Sroberto	UNLOCK(&manager->fdlock[lockid]);
1003258945Sroberto
1004258945Sroberto	/*
1005258945Sroberto	 * Set requested bit.
1006258945Sroberto	 */
1007258945Sroberto	result = watch_fd(manager, fd, msg);
1008258945Sroberto	if (result != ISC_R_SUCCESS) {
1009258945Sroberto		/*
1010258945Sroberto		 * XXXJT: what should we do?  Ignoring the failure of watching
1011258945Sroberto		 * a socket will make the application dysfunctional, but there
1012258945Sroberto		 * seems to be no reasonable recovery process.
1013258945Sroberto		 */
1014258945Sroberto		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1015258945Sroberto			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1016258945Sroberto			      "failed to start watching FD (%d): %s",
1017258945Sroberto			      fd, isc_result_totext(result));
1018258945Sroberto	}
1019258945Sroberto}
1020258945Sroberto
1021280849Scy#ifdef USE_WATCHER_THREAD
1022258945Sroberto/*
1023258945Sroberto * Poke the select loop when there is something for us to do.
1024258945Sroberto * The write is required (by POSIX) to complete.  That is, we
1025258945Sroberto * will not get partial writes.
1026258945Sroberto */
1027258945Srobertostatic void
1028280849Scyselect_poke(isc__socketmgr_t *mgr, int fd, int msg) {
1029258945Sroberto	int cc;
1030258945Sroberto	int buf[2];
1031258945Sroberto	char strbuf[ISC_STRERRORSIZE];
1032258945Sroberto
1033258945Sroberto	buf[0] = fd;
1034258945Sroberto	buf[1] = msg;
1035258945Sroberto
1036258945Sroberto	do {
1037258945Sroberto		cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
1038258945Sroberto#ifdef ENOSR
1039258945Sroberto		/*
1040258945Sroberto		 * Treat ENOSR as EAGAIN but loop slowly as it is
1041258945Sroberto		 * unlikely to clear fast.
1042258945Sroberto		 */
1043258945Sroberto		if (cc < 0 && errno == ENOSR) {
1044258945Sroberto			sleep(1);
1045258945Sroberto			errno = EAGAIN;
1046258945Sroberto		}
1047258945Sroberto#endif
1048258945Sroberto	} while (cc < 0 && SOFT_ERROR(errno));
1049258945Sroberto
1050258945Sroberto	if (cc < 0) {
1051258945Sroberto		isc__strerror(errno, strbuf, sizeof(strbuf));
1052258945Sroberto		FATAL_ERROR(__FILE__, __LINE__,
1053258945Sroberto			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
1054258945Sroberto					   ISC_MSG_WRITEFAILED,
1055258945Sroberto					   "write() failed "
1056258945Sroberto					   "during watcher poke: %s"),
1057258945Sroberto			    strbuf);
1058258945Sroberto	}
1059258945Sroberto
1060258945Sroberto	INSIST(cc == sizeof(buf));
1061258945Sroberto}
1062258945Sroberto
1063258945Sroberto/*
1064258945Sroberto * Read a message on the internal fd.
1065258945Sroberto */
1066258945Srobertostatic void
1067280849Scyselect_readmsg(isc__socketmgr_t *mgr, int *fd, int *msg) {
1068258945Sroberto	int buf[2];
1069258945Sroberto	int cc;
1070258945Sroberto	char strbuf[ISC_STRERRORSIZE];
1071258945Sroberto
1072258945Sroberto	cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
1073258945Sroberto	if (cc < 0) {
1074258945Sroberto		*msg = SELECT_POKE_NOTHING;
1075258945Sroberto		*fd = -1;	/* Silence compiler. */
1076258945Sroberto		if (SOFT_ERROR(errno))
1077258945Sroberto			return;
1078258945Sroberto
1079258945Sroberto		isc__strerror(errno, strbuf, sizeof(strbuf));
1080258945Sroberto		FATAL_ERROR(__FILE__, __LINE__,
1081258945Sroberto			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
1082258945Sroberto					   ISC_MSG_READFAILED,
1083258945Sroberto					   "read() failed "
1084258945Sroberto					   "during watcher poke: %s"),
1085258945Sroberto			    strbuf);
1086258945Sroberto
1087258945Sroberto		return;
1088258945Sroberto	}
1089258945Sroberto	INSIST(cc == sizeof(buf));
1090258945Sroberto
1091258945Sroberto	*fd = buf[0];
1092258945Sroberto	*msg = buf[1];
1093258945Sroberto}
1094280849Scy#else /* USE_WATCHER_THREAD */
1095258945Sroberto/*
1096258945Sroberto * Update the state of the socketmgr when something changes.
1097258945Sroberto */
1098258945Srobertostatic void
1099280849Scyselect_poke(isc__socketmgr_t *manager, int fd, int msg) {
1100258945Sroberto	if (msg == SELECT_POKE_SHUTDOWN)
1101258945Sroberto		return;
1102258945Sroberto	else if (fd >= 0)
1103258945Sroberto		wakeup_socket(manager, fd, msg);
1104258945Sroberto	return;
1105258945Sroberto}
1106280849Scy#endif /* USE_WATCHER_THREAD */
1107258945Sroberto
1108258945Sroberto/*
1109258945Sroberto * Make a fd non-blocking.
1110258945Sroberto */
1111258945Srobertostatic isc_result_t
1112258945Srobertomake_nonblock(int fd) {
1113258945Sroberto	int ret;
1114258945Sroberto	int flags;
1115258945Sroberto	char strbuf[ISC_STRERRORSIZE];
1116258945Sroberto#ifdef USE_FIONBIO_IOCTL
1117258945Sroberto	int on = 1;
1118258945Sroberto
1119258945Sroberto	ret = ioctl(fd, FIONBIO, (char *)&on);
1120258945Sroberto#else
1121258945Sroberto	flags = fcntl(fd, F_GETFL, 0);
1122258945Sroberto	flags |= PORT_NONBLOCK;
1123258945Sroberto	ret = fcntl(fd, F_SETFL, flags);
1124258945Sroberto#endif
1125258945Sroberto
1126258945Sroberto	if (ret == -1) {
1127258945Sroberto		isc__strerror(errno, strbuf, sizeof(strbuf));
1128258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__,
1129258945Sroberto#ifdef USE_FIONBIO_IOCTL
1130258945Sroberto				 "ioctl(%d, FIONBIO, &on): %s", fd,
1131258945Sroberto#else
1132258945Sroberto				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
1133258945Sroberto#endif
1134258945Sroberto				 strbuf);
1135258945Sroberto
1136258945Sroberto		return (ISC_R_UNEXPECTED);
1137258945Sroberto	}
1138258945Sroberto
1139258945Sroberto	return (ISC_R_SUCCESS);
1140258945Sroberto}
1141258945Sroberto
1142258945Sroberto#ifdef USE_CMSG
1143258945Sroberto/*
1144258945Sroberto * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
1145258945Sroberto * In order to ensure as much portability as possible, we provide wrapper
1146258945Sroberto * functions of these macros.
1147258945Sroberto * Note that cmsg_space() could run slow on OSes that do not have
1148258945Sroberto * CMSG_SPACE.
1149258945Sroberto */
1150258945Srobertostatic inline ISC_SOCKADDR_LEN_T
1151258945Srobertocmsg_len(ISC_SOCKADDR_LEN_T len) {
1152258945Sroberto#ifdef CMSG_LEN
1153258945Sroberto	return (CMSG_LEN(len));
1154258945Sroberto#else
1155258945Sroberto	ISC_SOCKADDR_LEN_T hdrlen;
1156258945Sroberto
1157258945Sroberto	/*
1158258945Sroberto	 * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
1159258945Sroberto	 * is correct.
1160258945Sroberto	 */
1161258945Sroberto	hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
1162258945Sroberto	return (hdrlen + len);
1163258945Sroberto#endif
1164258945Sroberto}
1165258945Sroberto
1166258945Srobertostatic inline ISC_SOCKADDR_LEN_T
1167258945Srobertocmsg_space(ISC_SOCKADDR_LEN_T len) {
1168258945Sroberto#ifdef CMSG_SPACE
1169258945Sroberto	return (CMSG_SPACE(len));
1170258945Sroberto#else
1171258945Sroberto	struct msghdr msg;
1172258945Sroberto	struct cmsghdr *cmsgp;
1173258945Sroberto	/*
1174258945Sroberto	 * XXX: The buffer length is an ad-hoc value, but should be enough
1175258945Sroberto	 * in a practical sense.
1176258945Sroberto	 */
1177258945Sroberto	char dummybuf[sizeof(struct cmsghdr) + 1024];
1178258945Sroberto
1179258945Sroberto	memset(&msg, 0, sizeof(msg));
1180258945Sroberto	msg.msg_control = dummybuf;
1181258945Sroberto	msg.msg_controllen = sizeof(dummybuf);
1182258945Sroberto
1183258945Sroberto	cmsgp = (struct cmsghdr *)dummybuf;
1184258945Sroberto	cmsgp->cmsg_len = cmsg_len(len);
1185258945Sroberto
1186258945Sroberto	cmsgp = CMSG_NXTHDR(&msg, cmsgp);
1187258945Sroberto	if (cmsgp != NULL)
1188258945Sroberto		return ((char *)cmsgp - (char *)msg.msg_control);
1189258945Sroberto	else
1190258945Sroberto		return (0);
1191258945Sroberto#endif
1192258945Sroberto}
1193258945Sroberto#endif /* USE_CMSG */
1194258945Sroberto
1195258945Sroberto/*
1196258945Sroberto * Process control messages received on a socket.
1197258945Sroberto */
1198258945Srobertostatic void
1199280849Scyprocess_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
1200258945Sroberto#ifdef USE_CMSG
1201258945Sroberto	struct cmsghdr *cmsgp;
1202258945Sroberto#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1203258945Sroberto	struct in6_pktinfo *pktinfop;
1204258945Sroberto#endif
1205258945Sroberto#ifdef SO_TIMESTAMP
1206258945Sroberto	struct timeval *timevalp;
1207258945Sroberto#endif
1208258945Sroberto#endif
1209258945Sroberto
1210258945Sroberto	/*
1211258945Sroberto	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
1212258945Sroberto	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
1213258945Sroberto	 * They are all here, outside of the CPP tests, because it is
1214258945Sroberto	 * more consistent with the usual ISC coding style.
1215258945Sroberto	 */
1216258945Sroberto	UNUSED(sock);
1217258945Sroberto	UNUSED(msg);
1218258945Sroberto	UNUSED(dev);
1219258945Sroberto
1220258945Sroberto#ifdef ISC_NET_BSD44MSGHDR
1221258945Sroberto
1222258945Sroberto#ifdef MSG_TRUNC
1223258945Sroberto	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
1224258945Sroberto		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1225258945Sroberto#endif
1226258945Sroberto
1227258945Sroberto#ifdef MSG_CTRUNC
1228258945Sroberto	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
1229258945Sroberto		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
1230258945Sroberto#endif
1231258945Sroberto
1232258945Sroberto#ifndef USE_CMSG
1233258945Sroberto	return;
1234258945Sroberto#else
1235258945Sroberto	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
1236258945Sroberto		return;
1237258945Sroberto
1238258945Sroberto#ifdef SO_TIMESTAMP
1239258945Sroberto	timevalp = NULL;
1240258945Sroberto#endif
1241258945Sroberto#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1242258945Sroberto	pktinfop = NULL;
1243258945Sroberto#endif
1244258945Sroberto
1245258945Sroberto	cmsgp = CMSG_FIRSTHDR(msg);
1246258945Sroberto	while (cmsgp != NULL) {
1247258945Sroberto		socket_log(sock, NULL, TRACE,
1248258945Sroberto			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
1249258945Sroberto			   "processing cmsg %p", cmsgp);
1250258945Sroberto
1251258945Sroberto#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1252258945Sroberto		if (cmsgp->cmsg_level == IPPROTO_IPV6
1253258945Sroberto		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
1254258945Sroberto
1255258945Sroberto			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1256258945Sroberto			memcpy(&dev->pktinfo, pktinfop,
1257258945Sroberto			       sizeof(struct in6_pktinfo));
1258258945Sroberto			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
1259258945Sroberto			socket_log(sock, NULL, TRACE,
1260258945Sroberto				   isc_msgcat, ISC_MSGSET_SOCKET,
1261258945Sroberto				   ISC_MSG_IFRECEIVED,
1262258945Sroberto				   "interface received on ifindex %u",
1263258945Sroberto				   dev->pktinfo.ipi6_ifindex);
1264258945Sroberto			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
1265258945Sroberto				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
1266258945Sroberto			goto next;
1267258945Sroberto		}
1268258945Sroberto#endif
1269258945Sroberto
1270258945Sroberto#ifdef SO_TIMESTAMP
1271258945Sroberto		if (cmsgp->cmsg_level == SOL_SOCKET
1272258945Sroberto		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
1273258945Sroberto			timevalp = (struct timeval *)CMSG_DATA(cmsgp);
1274258945Sroberto			dev->timestamp.seconds = timevalp->tv_sec;
1275258945Sroberto			dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
1276258945Sroberto			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
1277258945Sroberto			goto next;
1278258945Sroberto		}
1279258945Sroberto#endif
1280258945Sroberto
1281258945Sroberto	next:
1282258945Sroberto		cmsgp = CMSG_NXTHDR(msg, cmsgp);
1283258945Sroberto	}
1284258945Sroberto#endif /* USE_CMSG */
1285258945Sroberto
1286258945Sroberto#endif /* ISC_NET_BSD44MSGHDR */
1287258945Sroberto}
1288258945Sroberto
1289258945Sroberto/*
1290258945Sroberto * Construct an iov array and attach it to the msghdr passed in.  This is
1291258945Sroberto * the SEND constructor, which will use the used region of the buffer
1292258945Sroberto * (if using a buffer list) or will use the internal region (if a single
1293258945Sroberto * buffer I/O is requested).
1294258945Sroberto *
1295258945Sroberto * Nothing can be NULL, and the done event must list at least one buffer
1296258945Sroberto * on the buffer linked list for this function to be meaningful.
1297258945Sroberto *
1298258945Sroberto * If write_countp != NULL, *write_countp will hold the number of bytes
1299258945Sroberto * this transaction can send.
1300258945Sroberto */
1301258945Srobertostatic void
1302280849Scybuild_msghdr_send(isc__socket_t *sock, isc_socketevent_t *dev,
1303258945Sroberto		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
1304258945Sroberto{
1305258945Sroberto	unsigned int iovcount;
1306258945Sroberto	isc_buffer_t *buffer;
1307258945Sroberto	isc_region_t used;
1308258945Sroberto	size_t write_count;
1309258945Sroberto	size_t skip_count;
1310258945Sroberto
1311258945Sroberto	memset(msg, 0, sizeof(*msg));
1312258945Sroberto
1313258945Sroberto	if (!sock->connected) {
1314258945Sroberto		msg->msg_name = (void *)&dev->address.type.sa;
1315258945Sroberto		msg->msg_namelen = dev->address.length;
1316258945Sroberto	} else {
1317258945Sroberto		msg->msg_name = NULL;
1318258945Sroberto		msg->msg_namelen = 0;
1319258945Sroberto	}
1320258945Sroberto
1321258945Sroberto	buffer = ISC_LIST_HEAD(dev->bufferlist);
1322258945Sroberto	write_count = 0;
1323258945Sroberto	iovcount = 0;
1324258945Sroberto
1325258945Sroberto	/*
1326258945Sroberto	 * Single buffer I/O?  Skip what we've done so far in this region.
1327258945Sroberto	 */
1328258945Sroberto	if (buffer == NULL) {
1329258945Sroberto		write_count = dev->region.length - dev->n;
1330258945Sroberto		iov[0].iov_base = (void *)(dev->region.base + dev->n);
1331258945Sroberto		iov[0].iov_len = write_count;
1332258945Sroberto		iovcount = 1;
1333258945Sroberto
1334258945Sroberto		goto config;
1335258945Sroberto	}
1336258945Sroberto
1337258945Sroberto	/*
1338258945Sroberto	 * Multibuffer I/O.
1339258945Sroberto	 * Skip the data in the buffer list that we have already written.
1340258945Sroberto	 */
1341258945Sroberto	skip_count = dev->n;
1342258945Sroberto	while (buffer != NULL) {
1343258945Sroberto		REQUIRE(ISC_BUFFER_VALID(buffer));
1344258945Sroberto		if (skip_count < isc_buffer_usedlength(buffer))
1345258945Sroberto			break;
1346258945Sroberto		skip_count -= isc_buffer_usedlength(buffer);
1347258945Sroberto		buffer = ISC_LIST_NEXT(buffer, link);
1348258945Sroberto	}
1349258945Sroberto
1350258945Sroberto	while (buffer != NULL) {
1351258945Sroberto		INSIST(iovcount < MAXSCATTERGATHER_SEND);
1352258945Sroberto
1353258945Sroberto		isc_buffer_usedregion(buffer, &used);
1354258945Sroberto
1355258945Sroberto		if (used.length > 0) {
1356258945Sroberto			iov[iovcount].iov_base = (void *)(used.base
1357258945Sroberto							  + skip_count);
1358258945Sroberto			iov[iovcount].iov_len = used.length - skip_count;
1359258945Sroberto			write_count += (used.length - skip_count);
1360258945Sroberto			skip_count = 0;
1361258945Sroberto			iovcount++;
1362258945Sroberto		}
1363258945Sroberto		buffer = ISC_LIST_NEXT(buffer, link);
1364258945Sroberto	}
1365258945Sroberto
1366258945Sroberto	INSIST(skip_count == 0U);
1367258945Sroberto
1368258945Sroberto config:
1369258945Sroberto	msg->msg_iov = iov;
1370258945Sroberto	msg->msg_iovlen = iovcount;
1371258945Sroberto
1372258945Sroberto#ifdef ISC_NET_BSD44MSGHDR
1373258945Sroberto	msg->msg_control = NULL;
1374258945Sroberto	msg->msg_controllen = 0;
1375258945Sroberto	msg->msg_flags = 0;
1376258945Sroberto#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1377258945Sroberto	if ((sock->type == isc_sockettype_udp)
1378258945Sroberto	    && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
1379280849Scy#if defined(IPV6_USE_MIN_MTU)
1380280849Scy		int use_min_mtu = 1;	/* -1, 0, 1 */
1381280849Scy#endif
1382258945Sroberto		struct cmsghdr *cmsgp;
1383258945Sroberto		struct in6_pktinfo *pktinfop;
1384258945Sroberto
1385258945Sroberto		socket_log(sock, NULL, TRACE,
1386258945Sroberto			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
1387258945Sroberto			   "sendto pktinfo data, ifindex %u",
1388258945Sroberto			   dev->pktinfo.ipi6_ifindex);
1389258945Sroberto
1390258945Sroberto		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
1391258945Sroberto		INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
1392258945Sroberto		msg->msg_control = (void *)sock->sendcmsgbuf;
1393258945Sroberto
1394258945Sroberto		cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
1395258945Sroberto		cmsgp->cmsg_level = IPPROTO_IPV6;
1396258945Sroberto		cmsgp->cmsg_type = IPV6_PKTINFO;
1397258945Sroberto		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
1398258945Sroberto		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1399258945Sroberto		memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
1400280849Scy#if defined(IPV6_USE_MIN_MTU)
1401280849Scy		/*
1402280849Scy		 * Set IPV6_USE_MIN_MTU as a per packet option as FreeBSD
1403280849Scy		 * ignores setsockopt(IPV6_USE_MIN_MTU) when IPV6_PKTINFO
1404280849Scy		 * is used.
1405280849Scy		 */
1406280849Scy		cmsgp = (struct cmsghdr *)(sock->sendcmsgbuf +
1407280849Scy					   msg->msg_controllen);
1408280849Scy		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
1409280849Scy		INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
1410280849Scy
1411280849Scy		cmsgp->cmsg_level = IPPROTO_IPV6;
1412280849Scy		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
1413280849Scy		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
1414280849Scy		memcpy(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
1415280849Scy#endif
1416258945Sroberto	}
1417258945Sroberto#endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
1418258945Sroberto#else /* ISC_NET_BSD44MSGHDR */
1419258945Sroberto	msg->msg_accrights = NULL;
1420258945Sroberto	msg->msg_accrightslen = 0;
1421258945Sroberto#endif /* ISC_NET_BSD44MSGHDR */
1422258945Sroberto
1423258945Sroberto	if (write_countp != NULL)
1424258945Sroberto		*write_countp = write_count;
1425258945Sroberto}
1426258945Sroberto
1427258945Sroberto/*
1428258945Sroberto * Construct an iov array and attach it to the msghdr passed in.  This is
1429258945Sroberto * the RECV constructor, which will use the available region of the buffer
1430258945Sroberto * (if using a buffer list) or will use the internal region (if a single
1431258945Sroberto * buffer I/O is requested).
1432258945Sroberto *
1433258945Sroberto * sock, dev, msg and iov are dereferenced unconditionally.  When
1434258945Sroberto * dev->bufferlist is empty, dev->region past its first dev->n bytes is used.
1435258945Sroberto *
1436258945Sroberto * If read_countp != NULL, *read_countp will hold the number of bytes
1437258945Sroberto * this transaction can receive.
 *
 * UDP sockets get dev->address (zeroed first) as the msg_name buffer so
 * that recvmsg() can report the sender; every other socket type gets a
 * NULL msg_name and dev->address is copied from sock->peer_address.
 *
 * With ISC_NET_BSD44MSGHDR and USE_CMSG, UDP sockets also get
 * sock->recvcmsgbuf attached as the control-message buffer.
1438258945Sroberto */
1439258945Srobertostatic void
1440280849Scybuild_msghdr_recv(isc__socket_t *sock, isc_socketevent_t *dev,
1441258945Sroberto		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
1442258945Sroberto{
1443258945Sroberto	unsigned int iovcount;
1444258945Sroberto	isc_buffer_t *buffer;
1445258945Sroberto	isc_region_t available;
1446258945Sroberto	size_t read_count;
1447258945Sroberto
1448258945Sroberto	memset(msg, 0, sizeof(struct msghdr));
1449258945Sroberto
1450258945Sroberto	if (sock->type == isc_sockettype_udp) {
1451258945Sroberto		memset(&dev->address, 0, sizeof(dev->address));
1452258945Sroberto#ifdef BROKEN_RECVMSG
		/* Size the name buffer by the socket's protocol family. */
1453258945Sroberto		if (sock->pf == AF_INET) {
1454258945Sroberto			msg->msg_name = (void *)&dev->address.type.sin;
			/*
			 * NOTE(review): this AF_INET branch advertises
			 * sizeof(sin6) rather than sizeof(sin) -- confirm
			 * whether that is intentional.
			 */
1455258945Sroberto			msg->msg_namelen = sizeof(dev->address.type.sin6);
1456258945Sroberto		} else if (sock->pf == AF_INET6) {
1457258945Sroberto			msg->msg_name = (void *)&dev->address.type.sin6;
1458258945Sroberto			msg->msg_namelen = sizeof(dev->address.type.sin6);
1459258945Sroberto#ifdef ISC_PLATFORM_HAVESYSUNH
1460258945Sroberto		} else if (sock->pf == AF_UNIX) {
1461258945Sroberto			msg->msg_name = (void *)&dev->address.type.sunix;
1462258945Sroberto			msg->msg_namelen = sizeof(dev->address.type.sunix);
1463258945Sroberto#endif
1464258945Sroberto		} else {
1465258945Sroberto			msg->msg_name = (void *)&dev->address.type.sa;
1466258945Sroberto			msg->msg_namelen = sizeof(dev->address.type);
1467258945Sroberto		}
1468258945Sroberto#else
1469258945Sroberto		msg->msg_name = (void *)&dev->address.type.sa;
1470258945Sroberto		msg->msg_namelen = sizeof(dev->address.type);
1471258945Sroberto#endif
1472258945Sroberto#ifdef ISC_NET_RECVOVERFLOW
1473258945Sroberto		/* If needed, steal one iovec for overflow detection. */
		/*
		 * NOTE(review): 'maxiov' is not declared in this function and
		 * is not consulted by the INSIST() in the loop below; unless
		 * it exists at file scope this cannot compile when
		 * ISC_NET_RECVOVERFLOW is defined -- confirm.
		 */
1474258945Sroberto		maxiov--;
1475258945Sroberto#endif
1476258945Sroberto	} else { /* TCP */
1477258945Sroberto		msg->msg_name = NULL;
1478258945Sroberto		msg->msg_namelen = 0;
1479258945Sroberto		dev->address = sock->peer_address;
1480258945Sroberto	}
1481258945Sroberto
1482258945Sroberto	buffer = ISC_LIST_HEAD(dev->bufferlist);
1483258945Sroberto	read_count = 0;
1484258945Sroberto
1485258945Sroberto	/*
1486258945Sroberto	 * Single buffer I/O?  Skip what we've done so far in this region.
1487258945Sroberto	 */
1488258945Sroberto	if (buffer == NULL) {
1489258945Sroberto		read_count = dev->region.length - dev->n;
1490258945Sroberto		iov[0].iov_base = (void *)(dev->region.base + dev->n);
1491258945Sroberto		iov[0].iov_len = read_count;
1492258945Sroberto		iovcount = 1;
1493258945Sroberto
1494258945Sroberto		goto config;
1495258945Sroberto	}
1496258945Sroberto
1497258945Sroberto	/*
1498258945Sroberto	 * Multibuffer I/O.
1499258945Sroberto	 * Skip empty buffers.
1500258945Sroberto	 */
1501258945Sroberto	while (buffer != NULL) {
1502258945Sroberto		REQUIRE(ISC_BUFFER_VALID(buffer));
1503258945Sroberto		if (isc_buffer_availablelength(buffer) != 0)
1504258945Sroberto			break;
1505258945Sroberto		buffer = ISC_LIST_NEXT(buffer, link);
1506258945Sroberto	}
1507258945Sroberto
	/*
	 * Add one iov entry per remaining buffer that still has free space,
	 * accumulating the total receivable byte count.
	 */
1508258945Sroberto	iovcount = 0;
1509258945Sroberto	while (buffer != NULL) {
1510258945Sroberto		INSIST(iovcount < MAXSCATTERGATHER_RECV);
1511258945Sroberto
1512258945Sroberto		isc_buffer_availableregion(buffer, &available);
1513258945Sroberto
1514258945Sroberto		if (available.length > 0) {
1515258945Sroberto			iov[iovcount].iov_base = (void *)(available.base);
1516258945Sroberto			iov[iovcount].iov_len = available.length;
1517258945Sroberto			read_count += available.length;
1518258945Sroberto			iovcount++;
1519258945Sroberto		}
1520258945Sroberto		buffer = ISC_LIST_NEXT(buffer, link);
1521258945Sroberto	}
1522258945Sroberto
1523258945Sroberto config:
1524258945Sroberto
1525258945Sroberto	/*
1526258945Sroberto	 * If needed, set up to receive that one extra byte.  Note that
1527258945Sroberto	 * we know there is at least one iov left, since we stole it
1528258945Sroberto	 * at the top of this function.
1529258945Sroberto	 */
1530258945Sroberto#ifdef ISC_NET_RECVOVERFLOW
1531258945Sroberto	if (sock->type == isc_sockettype_udp) {
1532258945Sroberto		iov[iovcount].iov_base = (void *)(&sock->overflow);
1533258945Sroberto		iov[iovcount].iov_len = 1;
1534258945Sroberto		iovcount++;
1535258945Sroberto	}
1536258945Sroberto#endif
1537258945Sroberto
1538258945Sroberto	msg->msg_iov = iov;
1539258945Sroberto	msg->msg_iovlen = iovcount;
1540258945Sroberto
1541258945Sroberto#ifdef ISC_NET_BSD44MSGHDR
1542258945Sroberto	msg->msg_control = NULL;
1543258945Sroberto	msg->msg_controllen = 0;
1544258945Sroberto	msg->msg_flags = 0;
1545258945Sroberto#if defined(USE_CMSG)
	/* Only UDP receives are given a control-message buffer. */
1546258945Sroberto	if (sock->type == isc_sockettype_udp) {
1547258945Sroberto		msg->msg_control = sock->recvcmsgbuf;
1548258945Sroberto		msg->msg_controllen = sock->recvcmsgbuflen;
1549258945Sroberto	}
1550258945Sroberto#endif /* USE_CMSG */
1551258945Sroberto#else /* ISC_NET_BSD44MSGHDR */
1552258945Sroberto	msg->msg_accrights = NULL;
1553258945Sroberto	msg->msg_accrightslen = 0;
1554258945Sroberto#endif /* ISC_NET_BSD44MSGHDR */
1555258945Sroberto
1556258945Sroberto	if (read_countp != NULL)
1557258945Sroberto		*read_countp = read_count;
1558258945Sroberto}
1559258945Sroberto
/*
 * Fill in dev->address for an I/O request on 'sock'.
 *
 * UDP: use '*address' when the caller supplied one, otherwise the
 * socket's recorded peer address.  TCP: 'address' must be NULL and the
 * peer address is always used.  For any other socket type dev->address
 * is left untouched.
 */
1560258945Srobertostatic void
1561280849Scyset_dev_address(isc_sockaddr_t *address, isc__socket_t *sock,
1562258945Sroberto		isc_socketevent_t *dev)
1563258945Sroberto{
1564258945Sroberto	if (sock->type == isc_sockettype_udp) {
1565258945Sroberto		if (address != NULL)
1566258945Sroberto			dev->address = *address;
1567258945Sroberto		else
1568258945Sroberto			dev->address = sock->peer_address;
1569258945Sroberto	} else if (sock->type == isc_sockettype_tcp) {
1570258945Sroberto		INSIST(address == NULL);
1571258945Sroberto		dev->address = sock->peer_address;
1572258945Sroberto	}
1573258945Sroberto}
1574258945Sroberto
/*
 * Destructor installed on socket events by allocate_socketevent().
 * Insists that the event no longer holds any buffers, then chains to the
 * destructor that was saved in ev->destroy.
 */
1575258945Srobertostatic void
1576258945Srobertodestroy_socketevent(isc_event_t *event) {
1577258945Sroberto	isc_socketevent_t *ev = (isc_socketevent_t *)event;
1578258945Sroberto
1579258945Sroberto	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1580258945Sroberto
1581258945Sroberto	(ev->destroy)(event);
1582258945Sroberto}
1583258945Sroberto
/*
 * Allocate a socket event from sock->manager->mctx via
 * isc_event_allocate(), passing 'sock', 'eventtype', 'action' and 'arg'
 * through, and reset its socket-specific fields.
 *
 * The destructor set up by isc_event_allocate() is saved in ev->destroy
 * and replaced with destroy_socketevent(), which chains back to it.
 *
 * Returns the new event, or NULL if the allocation fails.
 */
1584258945Srobertostatic isc_socketevent_t *
1585280849Scyallocate_socketevent(isc__socket_t *sock, isc_eventtype_t eventtype,
1586258945Sroberto		     isc_taskaction_t action, const void *arg)
1587258945Sroberto{
1588258945Sroberto	isc_socketevent_t *ev;
1589258945Sroberto
1590258945Sroberto	ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
1591258945Sroberto						     sock, eventtype,
1592258945Sroberto						     action, arg,
1593258945Sroberto						     sizeof(*ev));
1594258945Sroberto
1595258945Sroberto	if (ev == NULL)
1596258945Sroberto		return (NULL);
1597258945Sroberto
1598280849Scy	ev->result = ISC_R_UNSET;
1599258945Sroberto	ISC_LINK_INIT(ev, ev_link);
1600258945Sroberto	ISC_LIST_INIT(ev->bufferlist);
1601258945Sroberto	ev->region.base = NULL;
1602258945Sroberto	ev->n = 0;
1603258945Sroberto	ev->offset = 0;
1604258945Sroberto	ev->attributes = 0;
	/* Interpose our destructor, remembering the original one. */
1605258945Sroberto	ev->destroy = ev->ev_destroy;
1606258945Sroberto	ev->ev_destroy = destroy_socketevent;
1607258945Sroberto
1608258945Sroberto	return (ev);
1609258945Sroberto}
1610258945Sroberto
1611258945Sroberto#if defined(ISC_SOCKET_DEBUG)
/*
 * Debug helper (only built with ISC_SOCKET_DEBUG): print the name, iov
 * array and, with ISC_NET_BSD44MSGHDR, the control fields of 'msg' to
 * stdout.
 */
1612258945Srobertostatic void
1613258945Srobertodump_msg(struct msghdr *msg) {
1614258945Sroberto	unsigned int i;
1615258945Sroberto
1616258945Sroberto	printf("MSGHDR %p\n", msg);
1617258945Sroberto	printf("\tname %p, namelen %ld\n", msg->msg_name,
1618258945Sroberto	       (long) msg->msg_namelen);
1619258945Sroberto	printf("\tiov %p, iovlen %ld\n", msg->msg_iov,
1620258945Sroberto	       (long) msg->msg_iovlen);
	/* NOTE(review): 'i' is unsigned int but is printed with %d. */
1621258945Sroberto	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
1622258945Sroberto		printf("\t\t%d\tbase %p, len %ld\n", i,
1623258945Sroberto		       msg->msg_iov[i].iov_base,
1624258945Sroberto		       (long) msg->msg_iov[i].iov_len);
1625258945Sroberto#ifdef ISC_NET_BSD44MSGHDR
1626258945Sroberto	printf("\tcontrol %p, controllen %ld\n", msg->msg_control,
1627258945Sroberto	       (long) msg->msg_controllen);
1628258945Sroberto#endif
1629258945Sroberto}
1630258945Sroberto#endif
1631258945Sroberto
/*
 * Status codes returned by doio_recv() and doio_send().  DOIO_EOF is
 * only produced by doio_recv().
 */
1632258945Sroberto#define DOIO_SUCCESS		0	/* i/o ok, event sent */
1633258945Sroberto#define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
1634258945Sroberto#define DOIO_HARD		2	/* i/o error, event sent */
1635258945Sroberto#define DOIO_EOF		3	/* EOF, no event sent */
1636258945Sroberto
/*
 * Perform one recvmsg() on 'sock' for the request described by 'dev'.
 *
 * Returns:
 *	DOIO_SUCCESS	Data was received and the request is satisfied;
 *			dev->result is ISC_R_SUCCESS.
 *
 *	DOIO_SOFT	Nothing final happened: recvmsg() failed with a soft
 *			errno (or one of the SOFT_OR_HARD errnos on an
 *			unconnected socket), a UDP datagram was dropped
 *			(source port zero, or larger than a non-zero
 *			manager->maxudp), or a short read left dev->n
 *			below dev->minimum.
 *
 *	DOIO_HARD	recvmsg() failed with a hard error.  dev->result
 *			holds the mapped result and the RECVFAIL statistic
 *			has been incremented.
 *
 *	DOIO_EOF	A zero-length read on a TCP or UNIX socket.
 */
1637258945Srobertostatic int
1638280849Scydoio_recv(isc__socket_t *sock, isc_socketevent_t *dev) {
1639258945Sroberto	int cc;
1640258945Sroberto	struct iovec iov[MAXSCATTERGATHER_RECV];
1641258945Sroberto	size_t read_count;
1642258945Sroberto	size_t actual_count;
1643258945Sroberto	struct msghdr msghdr;
1644258945Sroberto	isc_buffer_t *buffer;
1645258945Sroberto	int recv_errno;
1646258945Sroberto	char strbuf[ISC_STRERRORSIZE];
1647258945Sroberto
1648258945Sroberto	build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
1649258945Sroberto
1650258945Sroberto#if defined(ISC_SOCKET_DEBUG)
1651258945Sroberto	dump_msg(&msghdr);
1652258945Sroberto#endif
1653258945Sroberto
1654258945Sroberto	cc = recvmsg(sock->fd, &msghdr, 0);
	/* Save errno immediately; later calls may overwrite it. */
1655258945Sroberto	recv_errno = errno;
1656258945Sroberto
1657258945Sroberto#if defined(ISC_SOCKET_DEBUG)
1658258945Sroberto	dump_msg(&msghdr);
1659258945Sroberto#endif
1660258945Sroberto
1661258945Sroberto	if (cc < 0) {
1662258945Sroberto		if (SOFT_ERROR(recv_errno))
1663258945Sroberto			return (DOIO_SOFT);
1664258945Sroberto
1665258945Sroberto		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1666258945Sroberto			isc__strerror(recv_errno, strbuf, sizeof(strbuf));
1667258945Sroberto			socket_log(sock, NULL, IOEVENT,
1668258945Sroberto				   isc_msgcat, ISC_MSGSET_SOCKET,
1669258945Sroberto				   ISC_MSG_DOIORECV,
1670258945Sroberto				  "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
1671258945Sroberto				   sock->fd, cc, recv_errno, strbuf);
1672258945Sroberto		}
1673258945Sroberto
		/*
		 * SOFT_OR_HARD: the errno is a hard failure only when the
		 * socket is connected; otherwise it is treated as soft.
		 * ALWAYS_HARD: the errno is a hard failure unconditionally.
		 * Both macros return from doio_recv() when the errno matches.
		 */
1674258945Sroberto#define SOFT_OR_HARD(_system, _isc) \
1675258945Sroberto	if (recv_errno == _system) { \
1676258945Sroberto		if (sock->connected) { \
1677258945Sroberto			dev->result = _isc; \
1678258945Sroberto			inc_stats(sock->manager->stats, \
1679258945Sroberto				  sock->statsindex[STATID_RECVFAIL]); \
1680258945Sroberto			return (DOIO_HARD); \
1681258945Sroberto		} \
1682258945Sroberto		return (DOIO_SOFT); \
1683258945Sroberto	}
1684258945Sroberto#define ALWAYS_HARD(_system, _isc) \
1685258945Sroberto	if (recv_errno == _system) { \
1686258945Sroberto		dev->result = _isc; \
1687258945Sroberto		inc_stats(sock->manager->stats, \
1688258945Sroberto			  sock->statsindex[STATID_RECVFAIL]); \
1689258945Sroberto		return (DOIO_HARD); \
1690258945Sroberto	}
1691258945Sroberto
1692258945Sroberto		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1693258945Sroberto		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1694258945Sroberto		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1695258945Sroberto		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
1696258945Sroberto		/* HPUX 11.11 can return EADDRNOTAVAIL. */
1697258945Sroberto		SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1698258945Sroberto		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1699258945Sroberto		/*
1700258945Sroberto		 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
1701258945Sroberto		 * errors.
1702258945Sroberto		 */
1703258945Sroberto#ifdef EPROTO
1704258945Sroberto		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
1705258945Sroberto#endif
1706258945Sroberto		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
1707258945Sroberto
1708258945Sroberto#undef SOFT_OR_HARD
1709258945Sroberto#undef ALWAYS_HARD
1710258945Sroberto
		/* Any other errno: map it generically and fail hard. */
1711258945Sroberto		dev->result = isc__errno2result(recv_errno);
1712258945Sroberto		inc_stats(sock->manager->stats,
1713258945Sroberto			  sock->statsindex[STATID_RECVFAIL]);
1714258945Sroberto		return (DOIO_HARD);
1715258945Sroberto	}
1716258945Sroberto
1717258945Sroberto	/*
1718280849Scy	 * On TCP and UNIX sockets, zero length reads indicate EOF,
1719280849Scy	 * while on UDP sockets, zero length reads are perfectly valid,
1720280849Scy	 * although strange.
1721258945Sroberto	 */
1722280849Scy	switch (sock->type) {
1723280849Scy	case isc_sockettype_tcp:
1724280849Scy	case isc_sockettype_unix:
1725280849Scy		if (cc == 0)
1726280849Scy			return (DOIO_EOF);
1727280849Scy		break;
1728280849Scy	case isc_sockettype_udp:
1729280849Scy		break;
1730280849Scy	case isc_sockettype_fdwatch:
1731280849Scy	default:
1732280849Scy		INSIST(0);
1733280849Scy	}
1734258945Sroberto
1735258945Sroberto	if (sock->type == isc_sockettype_udp) {
		/* Record the sender address length reported by recvmsg(). */
1736258945Sroberto		dev->address.length = msghdr.msg_namelen;
1737258945Sroberto		if (isc_sockaddr_getport(&dev->address) == 0) {
1738258945Sroberto			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1739258945Sroberto				socket_log(sock, &dev->address, IOEVENT,
1740258945Sroberto					   isc_msgcat, ISC_MSGSET_SOCKET,
1741258945Sroberto					   ISC_MSG_ZEROPORT,
1742258945Sroberto					   "dropping source port zero packet");
1743258945Sroberto			}
1744258945Sroberto			return (DOIO_SOFT);
1745258945Sroberto		}
1746280849Scy		/*
1747280849Scy		 * Simulate a firewall blocking UDP responses bigger than
1748280849Scy		 * manager->maxudp bytes (only when maxudp is non-zero).
1749280849Scy		 */
1750280849Scy		if (sock->manager->maxudp != 0 && cc > sock->manager->maxudp)
1751280849Scy			return (DOIO_SOFT);
1752258945Sroberto	}
1753258945Sroberto
1754258945Sroberto	socket_log(sock, &dev->address, IOEVENT,
1755258945Sroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
1756258945Sroberto		   "packet received correctly");
1757258945Sroberto
1758258945Sroberto	/*
1759258945Sroberto	 * Overflow bit detection.  If we received MORE bytes than we should,
1760258945Sroberto	 * this indicates an overflow situation.  Set the flag in the
1761258945Sroberto	 * dev entry and adjust how much we read by one.
1762258945Sroberto	 */
1763258945Sroberto#ifdef ISC_NET_RECVOVERFLOW
1764258945Sroberto	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
1765258945Sroberto		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1766258945Sroberto		cc--;
1767258945Sroberto	}
1768258945Sroberto#endif
1769258945Sroberto
1770258945Sroberto	/*
1771258945Sroberto	 * If there are control messages attached, run through them and pull
1772258945Sroberto	 * out the interesting bits.
1773258945Sroberto	 */
1774258945Sroberto	if (sock->type == isc_sockettype_udp)
1775258945Sroberto		process_cmsg(sock, &msghdr, dev);
1776258945Sroberto
1777258945Sroberto	/*
1778258945Sroberto	 * update the buffers (if any) and the i/o count
1779258945Sroberto	 */
1780258945Sroberto	dev->n += cc;
1781258945Sroberto	actual_count = cc;
1782258945Sroberto	buffer = ISC_LIST_HEAD(dev->bufferlist);
	/*
	 * Walk the buffer list in order, marking as used the bytes that
	 * landed in each buffer: full buffers consume their whole available
	 * length, the last one takes whatever remains.
	 */
1783258945Sroberto	while (buffer != NULL && actual_count > 0U) {
1784258945Sroberto		REQUIRE(ISC_BUFFER_VALID(buffer));
1785258945Sroberto		if (isc_buffer_availablelength(buffer) <= actual_count) {
1786258945Sroberto			actual_count -= isc_buffer_availablelength(buffer);
1787258945Sroberto			isc_buffer_add(buffer,
1788258945Sroberto				       isc_buffer_availablelength(buffer));
1789258945Sroberto		} else {
1790258945Sroberto			isc_buffer_add(buffer, actual_count);
1791258945Sroberto			actual_count = 0;
1792280849Scy			POST(actual_count);
1793258945Sroberto			break;
1794258945Sroberto		}
1795258945Sroberto		buffer = ISC_LIST_NEXT(buffer, link);
		/* Running out of buffers with bytes left over is a bug. */
1796258945Sroberto		if (buffer == NULL) {
1797258945Sroberto			INSIST(actual_count == 0U);
1798258945Sroberto		}
1799258945Sroberto	}
1800258945Sroberto
1801258945Sroberto	/*
1802258945Sroberto	 * If we read less than we expected, update counters,
1803258945Sroberto	 * and let the upper layer poke the descriptor.
1804258945Sroberto	 */
1805258945Sroberto	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1806258945Sroberto		return (DOIO_SOFT);
1807258945Sroberto
1808258945Sroberto	/*
1809258945Sroberto	 * Full reads are posted, or partials if partials are ok.
1810258945Sroberto	 */
1811258945Sroberto	dev->result = ISC_R_SUCCESS;
1812258945Sroberto	return (DOIO_SUCCESS);
1813258945Sroberto}
1814258945Sroberto
1815258945Sroberto/*
 * Perform one sendmsg() on 'sock' for the request described by 'dev'.
 * The call is reissued when it fails with EINTR, a bounded number of
 * times (NRETRIES).
 *
1816258945Sroberto * Returns:
1817258945Sroberto *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1818258945Sroberto *			ISC_R_SUCCESS.
1819258945Sroberto *
1820258945Sroberto *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1821258945Sroberto *			dev->result contains the appropriate error.
1822258945Sroberto *
1823258945Sroberto *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1824258945Sroberto *			event was sent.  The operation should be retried.
 *			Also returned when fewer bytes were written than
 *			requested; dev->n has been advanced by the amount
 *			actually written.
1825258945Sroberto *
1826258945Sroberto *	No other return values are possible.
1827258945Sroberto */
1828258945Srobertostatic int
1829280849Scydoio_send(isc__socket_t *sock, isc_socketevent_t *dev) {
1830258945Sroberto	int cc;
1831258945Sroberto	struct iovec iov[MAXSCATTERGATHER_SEND];
1832258945Sroberto	size_t write_count;
1833258945Sroberto	struct msghdr msghdr;
1834258945Sroberto	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1835258945Sroberto	int attempts = 0;
1836258945Sroberto	int send_errno;
1837258945Sroberto	char strbuf[ISC_STRERRORSIZE];
1838258945Sroberto
1839258945Sroberto	build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
1840258945Sroberto
1841258945Sroberto resend:
1842258945Sroberto	cc = sendmsg(sock->fd, &msghdr, 0);
	/* Save errno immediately; later calls may overwrite it. */
1843258945Sroberto	send_errno = errno;
1844258945Sroberto
1845258945Sroberto	/*
1846258945Sroberto	 * Check for error or block condition.
1847258945Sroberto	 */
1848258945Sroberto	if (cc < 0) {
1849258945Sroberto		if (send_errno == EINTR && ++attempts < NRETRIES)
1850258945Sroberto			goto resend;
1851258945Sroberto
1852258945Sroberto		if (SOFT_ERROR(send_errno))
1853258945Sroberto			return (DOIO_SOFT);
1854258945Sroberto
		/*
		 * SOFT_OR_HARD: hard failure only when the socket is
		 * connected, soft otherwise.  ALWAYS_HARD: hard failure
		 * unconditionally.  Both return from doio_send() when the
		 * errno matches.
		 */
1855258945Sroberto#define SOFT_OR_HARD(_system, _isc) \
1856258945Sroberto	if (send_errno == _system) { \
1857258945Sroberto		if (sock->connected) { \
1858258945Sroberto			dev->result = _isc; \
1859258945Sroberto			inc_stats(sock->manager->stats, \
1860258945Sroberto				  sock->statsindex[STATID_SENDFAIL]); \
1861258945Sroberto			return (DOIO_HARD); \
1862258945Sroberto		} \
1863258945Sroberto		return (DOIO_SOFT); \
1864258945Sroberto	}
1865258945Sroberto#define ALWAYS_HARD(_system, _isc) \
1866258945Sroberto	if (send_errno == _system) { \
1867258945Sroberto		dev->result = _isc; \
1868258945Sroberto		inc_stats(sock->manager->stats, \
1869258945Sroberto			  sock->statsindex[STATID_SENDFAIL]); \
1870258945Sroberto		return (DOIO_HARD); \
1871258945Sroberto	}
1872258945Sroberto
1873258945Sroberto		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1874258945Sroberto		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1875258945Sroberto		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1876258945Sroberto		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1877258945Sroberto		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1878258945Sroberto#ifdef EHOSTDOWN
1879258945Sroberto		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1880258945Sroberto#endif
1881258945Sroberto		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1882258945Sroberto		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1883258945Sroberto		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1884258945Sroberto		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1885258945Sroberto		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1886258945Sroberto
1887258945Sroberto#undef SOFT_OR_HARD
1888258945Sroberto#undef ALWAYS_HARD
1889258945Sroberto
1890258945Sroberto		/*
1891258945Sroberto		 * The other error types depend on whether or not the
1892258945Sroberto		 * socket is UDP or TCP.  If it is UDP, some errors
1893258945Sroberto		 * that we expect to be fatal under TCP are merely
1894258945Sroberto		 * annoying, and are really soft errors.
1895258945Sroberto		 *
1896258945Sroberto		 * However, these soft errors are still returned as
1897258945Sroberto		 * a status.
1898258945Sroberto		 */
		/*
		 * NOTE(review): the message below is tagged "internal_send"
		 * although it is emitted from doio_send().
		 */
1899258945Sroberto		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1900258945Sroberto		isc__strerror(send_errno, strbuf, sizeof(strbuf));
1901258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1902258945Sroberto				 addrbuf, strbuf);
1903258945Sroberto		dev->result = isc__errno2result(send_errno);
1904258945Sroberto		inc_stats(sock->manager->stats,
1905258945Sroberto			  sock->statsindex[STATID_SENDFAIL]);
1906258945Sroberto		return (DOIO_HARD);
1907258945Sroberto	}
1908258945Sroberto
	/*
	 * A zero-byte send is counted as a failure and reported, but
	 * processing continues with the normal short-write check below.
	 */
1909258945Sroberto	if (cc == 0) {
1910258945Sroberto		inc_stats(sock->manager->stats,
1911258945Sroberto			  sock->statsindex[STATID_SENDFAIL]);
1912258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__,
1913258945Sroberto				 "doio_send: send() %s 0",
1914258945Sroberto				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1915258945Sroberto						ISC_MSG_RETURNED, "returned"));
1916258945Sroberto	}
1917258945Sroberto
1918258945Sroberto	/*
1919258945Sroberto	 * If we write less than we expected, update counters, poke.
1920258945Sroberto	 */
1921258945Sroberto	dev->n += cc;
1922258945Sroberto	if ((size_t)cc != write_count)
1923258945Sroberto		return (DOIO_SOFT);
1924258945Sroberto
1925258945Sroberto	/*
1926258945Sroberto	 * Exactly what we wanted to write.  We're done with this
1927258945Sroberto	 * entry.  Post its completion event.
1928258945Sroberto	 */
1929258945Sroberto	dev->result = ISC_R_SUCCESS;
1930258945Sroberto	return (DOIO_SUCCESS);
1931258945Sroberto}
1932258945Sroberto
1933258945Sroberto/*
1934258945Sroberto * Kill.
 *
 * Remove descriptor 'fd' of 'sock' from the manager's tables: clear
 * manager->fds[fd], mark the fd state CLOSED (fdwatch sockets) or
 * CLOSE_PENDING (all others), stop watching it or poke the watcher, and
 * bump the CLOSE statistic.  With USE_SELECT, manager->maxfd is
 * recomputed if 'fd' was the highest managed descriptor.
 *
 * This function does not call close() on 'fd' itself; for non-fdwatch
 * sockets it only issues SELECT_POKE_CLOSE (presumably the poke handler
 * performs the close -- confirm).
1935258945Sroberto *
1936258945Sroberto * Caller must ensure that the socket is not locked and no external
1937258945Sroberto * references exist.
1938258945Sroberto */
1939258945Srobertostatic void
1940280849Scyclosesocket(isc__socketmgr_t *manager, isc__socket_t *sock, int fd) {
1941258945Sroberto	isc_sockettype_t type = sock->type;
1942258945Sroberto	int lockid = FDLOCK_ID(fd);
1943258945Sroberto
1944258945Sroberto	/*
1945258945Sroberto	 * No one has this socket open, so the watcher doesn't have to be
1946258945Sroberto	 * poked, and the socket doesn't have to be locked.
1947258945Sroberto	 */
1948258945Sroberto	LOCK(&manager->fdlock[lockid]);
1949258945Sroberto	manager->fds[fd] = NULL;
1950258945Sroberto	if (type == isc_sockettype_fdwatch)
1951258945Sroberto		manager->fdstate[fd] = CLOSED;
1952258945Sroberto	else
1953258945Sroberto		manager->fdstate[fd] = CLOSE_PENDING;
1954258945Sroberto	UNLOCK(&manager->fdlock[lockid]);
1955258945Sroberto	if (type == isc_sockettype_fdwatch) {
1956258945Sroberto		/*
1957258945Sroberto		 * The caller may close the socket once this function returns,
1958258945Sroberto		 * and `fd' may be reassigned for a new socket.  So we do
1959258945Sroberto		 * unwatch_fd() here, rather than defer it via select_poke().
1960258945Sroberto		 * Note: this may complicate data protection among threads and
1961258945Sroberto		 * may reduce performance due to additional locks.  One way to
1962258945Sroberto		 * solve this would be to dup() the watched descriptor, but we
1963258945Sroberto		 * take a simpler approach at this moment.
1964258945Sroberto		 */
1965258945Sroberto		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1966258945Sroberto		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1967258945Sroberto	} else
1968258945Sroberto		select_poke(manager, fd, SELECT_POKE_CLOSE);
1969258945Sroberto
1970258945Sroberto	inc_stats(manager->stats, sock->statsindex[STATID_CLOSE]);
1971258945Sroberto
1972258945Sroberto	/*
1973258945Sroberto	 * update manager->maxfd here (XXX: this should be implemented more
1974258945Sroberto	 * efficiently)
1975258945Sroberto	 */
1976258945Sroberto#ifdef USE_SELECT
1977258945Sroberto	LOCK(&manager->lock);
1978258945Sroberto	if (manager->maxfd == fd) {
1979258945Sroberto		int i;
1980258945Sroberto
		/* Scan downwards for the next descriptor still MANAGED. */
1981258945Sroberto		manager->maxfd = 0;
1982258945Sroberto		for (i = fd - 1; i >= 0; i--) {
1983258945Sroberto			lockid = FDLOCK_ID(i);
1984258945Sroberto
1985258945Sroberto			LOCK(&manager->fdlock[lockid]);
1986258945Sroberto			if (manager->fdstate[i] == MANAGED) {
1987258945Sroberto				manager->maxfd = i;
1988258945Sroberto				UNLOCK(&manager->fdlock[lockid]);
1989258945Sroberto				break;
1990258945Sroberto			}
1991258945Sroberto			UNLOCK(&manager->fdlock[lockid]);
1992258945Sroberto		}
1993258945Sroberto#ifdef ISC_PLATFORM_USETHREADS
		/* Never let maxfd drop below the manager's pipe read end. */
1994258945Sroberto		if (manager->maxfd < manager->pipe_fds[0])
1995258945Sroberto			manager->maxfd = manager->pipe_fds[0];
1996258945Sroberto#endif
1997258945Sroberto	}
1998258945Sroberto	UNLOCK(&manager->lock);
1999258945Sroberto#endif	/* USE_SELECT */
2000258945Sroberto}
2001258945Sroberto
/*
 * Tear down the socket *sockp: run closesocket() on its descriptor if it
 * still has one, unlink it from the manager's socket list and release it
 * with free_socket(), which also sets *sockp to NULL.
 *
 * The accept, recv and send lists must be empty and no connect event may
 * be pending.  With USE_WATCHER_THREAD, manager->shutdown_ok is signalled
 * when the manager's socket list becomes empty.
 */
2002258945Srobertostatic void
2003280849Scydestroy(isc__socket_t **sockp) {
2004258945Sroberto	int fd;
2005280849Scy	isc__socket_t *sock = *sockp;
2006280849Scy	isc__socketmgr_t *manager = sock->manager;
2007258945Sroberto
2008258945Sroberto	socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2009258945Sroberto		   ISC_MSG_DESTROYING, "destroying");
2010258945Sroberto
2011258945Sroberto	INSIST(ISC_LIST_EMPTY(sock->accept_list));
2012258945Sroberto	INSIST(ISC_LIST_EMPTY(sock->recv_list));
2013258945Sroberto	INSIST(ISC_LIST_EMPTY(sock->send_list));
2014258945Sroberto	INSIST(sock->connect_ev == NULL);
2015258945Sroberto	REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks);
2016258945Sroberto
	/* Mark the socket as closed before handing the fd to closesocket(). */
2017258945Sroberto	if (sock->fd >= 0) {
2018258945Sroberto		fd = sock->fd;
2019258945Sroberto		sock->fd = -1;
2020258945Sroberto		closesocket(manager, sock, fd);
2021258945Sroberto	}
2022258945Sroberto
2023258945Sroberto	LOCK(&manager->lock);
2024258945Sroberto
2025258945Sroberto	ISC_LIST_UNLINK(manager->socklist, sock, link);
2026258945Sroberto
2027280849Scy#ifdef USE_WATCHER_THREAD
2028258945Sroberto	if (ISC_LIST_EMPTY(manager->socklist))
2029258945Sroberto		SIGNAL(&manager->shutdown_ok);
2030280849Scy#endif /* USE_WATCHER_THREAD */
2031258945Sroberto
2032280849Scy	/* can't unlock manager as its memory context is still used */
2033280849Scy	free_socket(sockp);
2034280849Scy
	/* 'manager' was cached above, so it is still usable after the free. */
2035258945Sroberto	UNLOCK(&manager->lock);
2036258945Sroberto}
2037258945Sroberto
/*
 * Allocate a socket object of the given 'type' from manager->mctx and
 * initialize it.  No descriptor is opened here: sock->fd starts at -1 and
 * the reference count at 0.
 *
 * Receive and send control-message buffers are allocated only when the
 * compile-time options (USE_CMSG with ISC_PLATFORM_HAVEIN6PKTINFO,
 * SO_TIMESTAMP, IPV6_USE_MIN_MTU) give them a non-zero size.
 *
 * Returns:
 *	ISC_R_SUCCESS	*socketp holds the new socket, with its magic
 *			numbers set.
 *	ISC_R_NOMEMORY	An allocation failed.
 *	other		The result of a failed isc_mutex_init().
 *
 * On failure everything allocated here is released and *socketp is not
 * written.
 */
2038258945Srobertostatic isc_result_t
2039280849Scyallocate_socket(isc__socketmgr_t *manager, isc_sockettype_t type,
2040280849Scy		isc__socket_t **socketp)
2041258945Sroberto{
2042280849Scy	isc__socket_t *sock;
2043258945Sroberto	isc_result_t result;
2044258945Sroberto	ISC_SOCKADDR_LEN_T cmsgbuflen;
2045258945Sroberto
2046258945Sroberto	sock = isc_mem_get(manager->mctx, sizeof(*sock));
2047258945Sroberto
2048258945Sroberto	if (sock == NULL)
2049258945Sroberto		return (ISC_R_NOMEMORY);
2050258945Sroberto
	/* Magic stays 0 until the object is fully initialized. */
2051280849Scy	sock->common.magic = 0;
2052280849Scy	sock->common.impmagic = 0;
2053258945Sroberto	sock->references = 0;
2054258945Sroberto
2055258945Sroberto	sock->manager = manager;
2056258945Sroberto	sock->type = type;
2057258945Sroberto	sock->fd = -1;
2058280849Scy	sock->dupped = 0;
2059258945Sroberto	sock->statsindex = NULL;
2060258945Sroberto
2061258945Sroberto	ISC_LINK_INIT(sock, link);
2062258945Sroberto
	/* NULL the buffer pointers first so the error path can test them. */
2063258945Sroberto	sock->recvcmsgbuf = NULL;
2064258945Sroberto	sock->sendcmsgbuf = NULL;
2065258945Sroberto
2066258945Sroberto	/*
2067258945Sroberto	 * set up cmsg buffers
2068258945Sroberto	 */
2069258945Sroberto	cmsgbuflen = 0;
2070258945Sroberto#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
2071280849Scy	cmsgbuflen += cmsg_space(sizeof(struct in6_pktinfo));
2072258945Sroberto#endif
2073258945Sroberto#if defined(USE_CMSG) && defined(SO_TIMESTAMP)
2074258945Sroberto	cmsgbuflen += cmsg_space(sizeof(struct timeval));
2075258945Sroberto#endif
2076258945Sroberto	sock->recvcmsgbuflen = cmsgbuflen;
2077258945Sroberto	if (sock->recvcmsgbuflen != 0U) {
2078258945Sroberto		sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
2079280849Scy		if (sock->recvcmsgbuf == NULL) {
2080280849Scy			result = ISC_R_NOMEMORY;
2081258945Sroberto			goto error;
2082280849Scy		}
2083258945Sroberto	}
2084258945Sroberto
	/* The send side needs room for the pktinfo (and min-MTU) cmsgs only. */
2085258945Sroberto	cmsgbuflen = 0;
2086258945Sroberto#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
2087280849Scy	cmsgbuflen += cmsg_space(sizeof(struct in6_pktinfo));
2088280849Scy#if defined(IPV6_USE_MIN_MTU)
2089280849Scy	/*
2090280849Scy	 * Provide space for working around FreeBSD's broken IPV6_USE_MIN_MTU
2091280849Scy	 * support.
2092280849Scy	 */
2093280849Scy	cmsgbuflen += cmsg_space(sizeof(int));
2094258945Sroberto#endif
2095280849Scy#endif
2096258945Sroberto	sock->sendcmsgbuflen = cmsgbuflen;
2097258945Sroberto	if (sock->sendcmsgbuflen != 0U) {
2098258945Sroberto		sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
2099280849Scy		if (sock->sendcmsgbuf == NULL) {
2100280849Scy			result = ISC_R_NOMEMORY;
2101258945Sroberto			goto error;
2102280849Scy		}
2103258945Sroberto	}
2104258945Sroberto
2105258945Sroberto	memset(sock->name, 0, sizeof(sock->name));
2106258945Sroberto	sock->tag = NULL;
2107258945Sroberto
2108258945Sroberto	/*
2109258945Sroberto	 * set up list of readers and writers to be initially empty
2110258945Sroberto	 */
2111258945Sroberto	ISC_LIST_INIT(sock->recv_list);
2112258945Sroberto	ISC_LIST_INIT(sock->send_list);
2113258945Sroberto	ISC_LIST_INIT(sock->accept_list);
2114258945Sroberto	sock->connect_ev = NULL;
2115258945Sroberto	sock->pending_recv = 0;
2116258945Sroberto	sock->pending_send = 0;
2117258945Sroberto	sock->pending_accept = 0;
2118258945Sroberto	sock->listener = 0;
2119258945Sroberto	sock->connected = 0;
2120258945Sroberto	sock->connecting = 0;
2121258945Sroberto	sock->bound = 0;
2122258945Sroberto
2123258945Sroberto	/*
2124258945Sroberto	 * initialize the lock
2125258945Sroberto	 */
2126258945Sroberto	result = isc_mutex_init(&sock->lock);
2127258945Sroberto	if (result != ISC_R_SUCCESS) {
2128280849Scy		sock->common.magic = 0;
2129280849Scy		sock->common.impmagic = 0;
2130258945Sroberto		goto error;
2131258945Sroberto	}
2132258945Sroberto
2133258945Sroberto	/*
2134258945Sroberto	 * Initialize readable and writable events
2135258945Sroberto	 */
2136258945Sroberto	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
2137258945Sroberto		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
2138258945Sroberto		       NULL, sock, sock, NULL, NULL);
2139258945Sroberto	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
2140258945Sroberto		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
2141258945Sroberto		       NULL, sock, sock, NULL, NULL);
2142258945Sroberto
	/* Fully initialized: stamp the magic numbers and publish. */
2143280849Scy	sock->common.magic = ISCAPI_SOCKET_MAGIC;
2144280849Scy	sock->common.impmagic = SOCKET_MAGIC;
2145258945Sroberto	*socketp = sock;
2146258945Sroberto
2147258945Sroberto	return (ISC_R_SUCCESS);
2148258945Sroberto
2149258945Sroberto error:
	/* Release whichever cmsg buffers were obtained, then the socket. */
2150258945Sroberto	if (sock->recvcmsgbuf != NULL)
2151258945Sroberto		isc_mem_put(manager->mctx, sock->recvcmsgbuf,
2152258945Sroberto			    sock->recvcmsgbuflen);
2153258945Sroberto	if (sock->sendcmsgbuf != NULL)
2154258945Sroberto		isc_mem_put(manager->mctx, sock->sendcmsgbuf,
2155258945Sroberto			    sock->sendcmsgbuflen);
2156258945Sroberto	isc_mem_put(manager->mctx, sock, sizeof(*sock));
2157258945Sroberto
2158258945Sroberto	return (result);
2159258945Sroberto}
2160258945Sroberto
2161258945Sroberto/*
2162258945Sroberto * This routine requires that the various lists be empty, that the reference
2163258945Sroberto * count be 0, and that the magic number is valid.  The other socket bits,
2164258945Sroberto * like the lock, must be initialized as well.  The socket must already be
2165258945Sroberto * unlinked from the manager's list and have no connect, recv, send or
2166258945Sroberto * accept operation pending.  This routine does not examine or close sock->fd.
 *
 * The cmsg buffers, the lock and the socket itself are released, and
 * *socketp is set to NULL.
2167258945Sroberto */
2168258945Srobertostatic void
2169280849Scyfree_socket(isc__socket_t **socketp) {
2170280849Scy	isc__socket_t *sock = *socketp;
2171258945Sroberto
2172258945Sroberto	INSIST(sock->references == 0);
2173258945Sroberto	INSIST(VALID_SOCKET(sock));
2174258945Sroberto	INSIST(!sock->connecting);
2175258945Sroberto	INSIST(!sock->pending_recv);
2176258945Sroberto	INSIST(!sock->pending_send);
2177258945Sroberto	INSIST(!sock->pending_accept);
2178258945Sroberto	INSIST(ISC_LIST_EMPTY(sock->recv_list));
2179258945Sroberto	INSIST(ISC_LIST_EMPTY(sock->send_list));
2180258945Sroberto	INSIST(ISC_LIST_EMPTY(sock->accept_list));
2181258945Sroberto	INSIST(!ISC_LINK_LINKED(sock, link));
2182258945Sroberto
2183258945Sroberto	if (sock->recvcmsgbuf != NULL)
2184258945Sroberto		isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
2185258945Sroberto			    sock->recvcmsgbuflen);
2186258945Sroberto	if (sock->sendcmsgbuf != NULL)
2187258945Sroberto		isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
2188258945Sroberto			    sock->sendcmsgbuflen);
2189258945Sroberto
	/* Invalidate the object before its memory is returned. */
2190280849Scy	sock->common.magic = 0;
2191280849Scy	sock->common.impmagic = 0;
2192258945Sroberto
2193258945Sroberto	DESTROYLOCK(&sock->lock);
2194258945Sroberto
2195258945Sroberto	isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
2196258945Sroberto
2197258945Sroberto	*socketp = NULL;
2198258945Sroberto}
2199258945Sroberto
2200258945Sroberto#ifdef SO_BSDCOMPAT
/*
 * This really should not be necessary to do.  Having to work out
 * which kernel version we are on at run time so that we don't cause
 * the kernel to issue a warning about us using a deprecated socket option.
 * Such warnings should *never* be on by default in production kernels.
 *
 * We can't do this at build time because executables are moved between
 * machines and hence kernels.
 *
 * We can't just not set SO_BSDCOMPAT because some kernels require it.
 */
2212258945Sroberto
2213258945Srobertostatic isc_once_t         bsdcompat_once = ISC_ONCE_INIT;
2214258945Srobertoisc_boolean_t bsdcompat = ISC_TRUE;
2215258945Sroberto
2216258945Srobertostatic void
2217258945Srobertoclear_bsdcompat(void) {
2218258945Sroberto#ifdef __linux__
2219258945Sroberto	 struct utsname buf;
2220258945Sroberto	 char *endp;
2221258945Sroberto	 long int major;
2222258945Sroberto	 long int minor;
2223258945Sroberto
2224258945Sroberto	 uname(&buf);    /* Can only fail if buf is bad in Linux. */
2225258945Sroberto
2226258945Sroberto	 /* Paranoia in parsing can be increased, but we trust uname(). */
2227258945Sroberto	 major = strtol(buf.release, &endp, 10);
2228258945Sroberto	 if (*endp == '.') {
2229258945Sroberto		minor = strtol(endp+1, &endp, 10);
2230258945Sroberto		if ((major > 2) || ((major == 2) && (minor >= 4))) {
2231258945Sroberto			bsdcompat = ISC_FALSE;
2232258945Sroberto		}
2233258945Sroberto	 }
2234258945Sroberto#endif /* __linux __ */
2235258945Sroberto}
2236258945Sroberto#endif
2237258945Sroberto
2238258945Srobertostatic isc_result_t
2239280849Scyopensocket(isc__socketmgr_t *manager, isc__socket_t *sock,
2240280849Scy	   isc__socket_t *dup_socket)
2241280849Scy{
2242280849Scy	isc_result_t result;
2243258945Sroberto	char strbuf[ISC_STRERRORSIZE];
2244258945Sroberto	const char *err = "socket";
2245258945Sroberto	int tries = 0;
2246258945Sroberto#if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
2247258945Sroberto	int on = 1;
2248258945Sroberto#endif
2249258945Sroberto#if defined(SO_RCVBUF)
2250258945Sroberto	ISC_SOCKADDR_LEN_T optlen;
2251258945Sroberto	int size;
2252258945Sroberto#endif
2253258945Sroberto
2254258945Sroberto again:
2255280849Scy	if (dup_socket == NULL) {
2256280849Scy		switch (sock->type) {
2257280849Scy		case isc_sockettype_udp:
2258280849Scy			sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
2259280849Scy			break;
2260280849Scy		case isc_sockettype_tcp:
2261280849Scy			sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
2262280849Scy			break;
2263280849Scy		case isc_sockettype_unix:
2264280849Scy			sock->fd = socket(sock->pf, SOCK_STREAM, 0);
2265280849Scy			break;
2266280849Scy		case isc_sockettype_fdwatch:
2267280849Scy			/*
2268280849Scy			 * We should not be called for isc_sockettype_fdwatch
2269280849Scy			 * sockets.
2270280849Scy			 */
2271280849Scy			INSIST(0);
2272280849Scy			break;
2273280849Scy		}
2274280849Scy	} else {
2275280849Scy		sock->fd = dup(dup_socket->fd);
2276280849Scy		sock->dupped = 1;
2277280849Scy		sock->bound = dup_socket->bound;
2278258945Sroberto	}
2279258945Sroberto	if (sock->fd == -1 && errno == EINTR && tries++ < 42)
2280258945Sroberto		goto again;
2281258945Sroberto
2282258945Sroberto#ifdef F_DUPFD
2283258945Sroberto	/*
2284258945Sroberto	 * Leave a space for stdio and TCP to work in.
2285258945Sroberto	 */
2286258945Sroberto	if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
2287258945Sroberto	    sock->fd >= 0 && sock->fd < manager->reserved) {
2288258945Sroberto		int new, tmp;
2289258945Sroberto		new = fcntl(sock->fd, F_DUPFD, manager->reserved);
2290258945Sroberto		tmp = errno;
2291258945Sroberto		(void)close(sock->fd);
2292258945Sroberto		errno = tmp;
2293258945Sroberto		sock->fd = new;
2294258945Sroberto		err = "isc_socket_create: fcntl/reserved";
2295258945Sroberto	} else if (sock->fd >= 0 && sock->fd < 20) {
2296258945Sroberto		int new, tmp;
2297258945Sroberto		new = fcntl(sock->fd, F_DUPFD, 20);
2298258945Sroberto		tmp = errno;
2299258945Sroberto		(void)close(sock->fd);
2300258945Sroberto		errno = tmp;
2301258945Sroberto		sock->fd = new;
2302258945Sroberto		err = "isc_socket_create: fcntl";
2303258945Sroberto	}
2304258945Sroberto#endif
2305258945Sroberto
2306258945Sroberto	if (sock->fd >= (int)manager->maxsocks) {
2307258945Sroberto		(void)close(sock->fd);
2308258945Sroberto		isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2309258945Sroberto			       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2310258945Sroberto			       isc_msgcat, ISC_MSGSET_SOCKET,
2311258945Sroberto			       ISC_MSG_TOOMANYFDS,
2312258945Sroberto			       "socket: file descriptor exceeds limit (%d/%u)",
2313258945Sroberto			       sock->fd, manager->maxsocks);
2314258945Sroberto		return (ISC_R_NORESOURCES);
2315258945Sroberto	}
2316258945Sroberto
2317258945Sroberto	if (sock->fd < 0) {
2318258945Sroberto		switch (errno) {
2319258945Sroberto		case EMFILE:
2320258945Sroberto		case ENFILE:
2321258945Sroberto			isc__strerror(errno, strbuf, sizeof(strbuf));
2322258945Sroberto			isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2323258945Sroberto				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2324258945Sroberto				       isc_msgcat, ISC_MSGSET_SOCKET,
2325258945Sroberto				       ISC_MSG_TOOMANYFDS,
2326258945Sroberto				       "%s: %s", err, strbuf);
2327258945Sroberto			/* fallthrough */
2328258945Sroberto		case ENOBUFS:
2329258945Sroberto			return (ISC_R_NORESOURCES);
2330258945Sroberto
2331258945Sroberto		case EPROTONOSUPPORT:
2332258945Sroberto		case EPFNOSUPPORT:
2333258945Sroberto		case EAFNOSUPPORT:
2334258945Sroberto		/*
2335258945Sroberto		 * Linux 2.2 (and maybe others) return EINVAL instead of
2336258945Sroberto		 * EAFNOSUPPORT.
2337258945Sroberto		 */
2338258945Sroberto		case EINVAL:
2339258945Sroberto			return (ISC_R_FAMILYNOSUPPORT);
2340258945Sroberto
2341258945Sroberto		default:
2342258945Sroberto			isc__strerror(errno, strbuf, sizeof(strbuf));
2343258945Sroberto			UNEXPECTED_ERROR(__FILE__, __LINE__,
2344258945Sroberto					 "%s() %s: %s", err,
2345258945Sroberto					 isc_msgcat_get(isc_msgcat,
2346258945Sroberto							ISC_MSGSET_GENERAL,
2347258945Sroberto							ISC_MSG_FAILED,
2348258945Sroberto							"failed"),
2349258945Sroberto					 strbuf);
2350258945Sroberto			return (ISC_R_UNEXPECTED);
2351258945Sroberto		}
2352258945Sroberto	}
2353258945Sroberto
2354280849Scy	if (dup_socket != NULL)
2355280849Scy		goto setup_done;
2356280849Scy
2357280849Scy	result = make_nonblock(sock->fd);
2358280849Scy	if (result != ISC_R_SUCCESS) {
2359258945Sroberto		(void)close(sock->fd);
2360280849Scy		return (result);
2361258945Sroberto	}
2362258945Sroberto
2363258945Sroberto#ifdef SO_BSDCOMPAT
2364258945Sroberto	RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
2365258945Sroberto				  clear_bsdcompat) == ISC_R_SUCCESS);
2366258945Sroberto	if (sock->type != isc_sockettype_unix && bsdcompat &&
2367258945Sroberto	    setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
2368258945Sroberto		       (void *)&on, sizeof(on)) < 0) {
2369258945Sroberto		isc__strerror(errno, strbuf, sizeof(strbuf));
2370258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__,
2371258945Sroberto				 "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
2372258945Sroberto				 sock->fd,
2373258945Sroberto				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2374258945Sroberto						ISC_MSG_FAILED, "failed"),
2375258945Sroberto				 strbuf);
2376258945Sroberto		/* Press on... */
2377258945Sroberto	}
2378258945Sroberto#endif
2379258945Sroberto
2380258945Sroberto#ifdef SO_NOSIGPIPE
2381258945Sroberto	if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE,
2382258945Sroberto		       (void *)&on, sizeof(on)) < 0) {
2383258945Sroberto		isc__strerror(errno, strbuf, sizeof(strbuf));
2384258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__,
2385258945Sroberto				 "setsockopt(%d, SO_NOSIGPIPE) %s: %s",
2386258945Sroberto				 sock->fd,
2387258945Sroberto				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2388258945Sroberto						ISC_MSG_FAILED, "failed"),
2389258945Sroberto				 strbuf);
2390258945Sroberto		/* Press on... */
2391258945Sroberto	}
2392258945Sroberto#endif
2393258945Sroberto
2394258945Sroberto#if defined(USE_CMSG) || defined(SO_RCVBUF)
2395258945Sroberto	if (sock->type == isc_sockettype_udp) {
2396258945Sroberto
2397258945Sroberto#if defined(USE_CMSG)
2398258945Sroberto#if defined(SO_TIMESTAMP)
2399258945Sroberto		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
2400258945Sroberto			       (void *)&on, sizeof(on)) < 0
2401258945Sroberto		    && errno != ENOPROTOOPT) {
2402258945Sroberto			isc__strerror(errno, strbuf, sizeof(strbuf));
2403258945Sroberto			UNEXPECTED_ERROR(__FILE__, __LINE__,
2404258945Sroberto					 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
2405258945Sroberto					 sock->fd,
2406258945Sroberto					 isc_msgcat_get(isc_msgcat,
2407258945Sroberto							ISC_MSGSET_GENERAL,
2408258945Sroberto							ISC_MSG_FAILED,
2409258945Sroberto							"failed"),
2410258945Sroberto					 strbuf);
2411258945Sroberto			/* Press on... */
2412258945Sroberto		}
2413258945Sroberto#endif /* SO_TIMESTAMP */
2414258945Sroberto
2415258945Sroberto#if defined(ISC_PLATFORM_HAVEIPV6)
2416258945Sroberto		if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
2417258945Sroberto			/*
2418258945Sroberto			 * Warn explicitly because this anomaly can be hidden
2419258945Sroberto			 * in usual operation (and unexpectedly appear later).
2420258945Sroberto			 */
2421258945Sroberto			UNEXPECTED_ERROR(__FILE__, __LINE__,
2422258945Sroberto					 "No buffer available to receive "
2423258945Sroberto					 "IPv6 destination");
2424258945Sroberto		}
2425258945Sroberto#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
2426258945Sroberto#ifdef IPV6_RECVPKTINFO
2427258945Sroberto		/* RFC 3542 */
2428258945Sroberto		if ((sock->pf == AF_INET6)
2429258945Sroberto		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
2430258945Sroberto				   (void *)&on, sizeof(on)) < 0)) {
2431258945Sroberto			isc__strerror(errno, strbuf, sizeof(strbuf));
2432258945Sroberto			UNEXPECTED_ERROR(__FILE__, __LINE__,
2433258945Sroberto					 "setsockopt(%d, IPV6_RECVPKTINFO) "
2434258945Sroberto					 "%s: %s", sock->fd,
2435258945Sroberto					 isc_msgcat_get(isc_msgcat,
2436258945Sroberto							ISC_MSGSET_GENERAL,
2437258945Sroberto							ISC_MSG_FAILED,
2438258945Sroberto							"failed"),
2439258945Sroberto					 strbuf);
2440258945Sroberto		}
2441258945Sroberto#else
2442258945Sroberto		/* RFC 2292 */
2443258945Sroberto		if ((sock->pf == AF_INET6)
2444258945Sroberto		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
2445258945Sroberto				   (void *)&on, sizeof(on)) < 0)) {
2446258945Sroberto			isc__strerror(errno, strbuf, sizeof(strbuf));
2447258945Sroberto			UNEXPECTED_ERROR(__FILE__, __LINE__,
2448258945Sroberto					 "setsockopt(%d, IPV6_PKTINFO) %s: %s",
2449258945Sroberto					 sock->fd,
2450258945Sroberto					 isc_msgcat_get(isc_msgcat,
2451258945Sroberto							ISC_MSGSET_GENERAL,
2452258945Sroberto							ISC_MSG_FAILED,
2453258945Sroberto							"failed"),
2454258945Sroberto					 strbuf);
2455258945Sroberto		}
2456258945Sroberto#endif /* IPV6_RECVPKTINFO */
2457258945Sroberto#endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
2458258945Sroberto#ifdef IPV6_USE_MIN_MTU        /* RFC 3542, not too common yet*/
2459258945Sroberto		/* use minimum MTU */
2460280849Scy		if (sock->pf == AF_INET6 &&
2461280849Scy		    setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
2462280849Scy			       (void *)&on, sizeof(on)) < 0) {
2463280849Scy			isc__strerror(errno, strbuf, sizeof(strbuf));
2464280849Scy			UNEXPECTED_ERROR(__FILE__, __LINE__,
2465280849Scy					 "setsockopt(%d, IPV6_USE_MIN_MTU) "
2466280849Scy					 "%s: %s", sock->fd,
2467280849Scy					 isc_msgcat_get(isc_msgcat,
2468280849Scy							ISC_MSGSET_GENERAL,
2469280849Scy							ISC_MSG_FAILED,
2470280849Scy							"failed"),
2471280849Scy					 strbuf);
2472280849Scy		}
2473280849Scy#endif
2474280849Scy#if defined(IPV6_MTU)
2475280849Scy		/*
2476280849Scy		 * Use minimum MTU on IPv6 sockets.
2477280849Scy		 */
2478258945Sroberto		if (sock->pf == AF_INET6) {
2479280849Scy			int mtu = 1280;
2480280849Scy			(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU,
2481280849Scy					 &mtu, sizeof(mtu));
2482280849Scy		}
2483280849Scy#endif
2484280849Scy#if defined(IPV6_MTU_DISCOVER) && defined(IPV6_PMTUDISC_DONT)
2485280849Scy		/*
2486280849Scy		 * Turn off Path MTU discovery on IPv6/UDP sockets.
2487280849Scy		 */
2488280849Scy		if (sock->pf == AF_INET6) {
2489280849Scy			int action = IPV6_PMTUDISC_DONT;
2490258945Sroberto			(void)setsockopt(sock->fd, IPPROTO_IPV6,
2491280849Scy					 IPV6_MTU_DISCOVER, &action,
2492280849Scy					 sizeof(action));
2493258945Sroberto		}
2494258945Sroberto#endif
2495258945Sroberto#endif /* ISC_PLATFORM_HAVEIPV6 */
2496258945Sroberto#endif /* defined(USE_CMSG) */
2497258945Sroberto
2498258945Sroberto#if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
2499258945Sroberto		/*
2500258945Sroberto		 * Turn off Path MTU discovery on IPv4/UDP sockets.
2501258945Sroberto		 */
2502258945Sroberto		if (sock->pf == AF_INET) {
2503258945Sroberto			int action = IP_PMTUDISC_DONT;
2504258945Sroberto			(void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
2505258945Sroberto					 &action, sizeof(action));
2506258945Sroberto		}
2507258945Sroberto#endif
2508258945Sroberto#if defined(IP_DONTFRAG)
2509258945Sroberto		/*
2510258945Sroberto		 * Turn off Path MTU discovery on IPv4/UDP sockets.
2511258945Sroberto		 */
2512258945Sroberto		if (sock->pf == AF_INET) {
2513258945Sroberto			int off = 0;
2514258945Sroberto			(void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG,
2515258945Sroberto					 &off, sizeof(off));
2516258945Sroberto		}
2517258945Sroberto#endif
2518258945Sroberto
2519258945Sroberto#if defined(SO_RCVBUF)
2520258945Sroberto		optlen = sizeof(size);
2521258945Sroberto		if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2522258945Sroberto			       (void *)&size, &optlen) >= 0 &&
2523258945Sroberto		     size < RCVBUFSIZE) {
2524258945Sroberto			size = RCVBUFSIZE;
2525258945Sroberto			if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2526258945Sroberto				       (void *)&size, sizeof(size)) == -1) {
2527258945Sroberto				isc__strerror(errno, strbuf, sizeof(strbuf));
2528258945Sroberto				UNEXPECTED_ERROR(__FILE__, __LINE__,
2529258945Sroberto					"setsockopt(%d, SO_RCVBUF, %d) %s: %s",
2530258945Sroberto					sock->fd, size,
2531258945Sroberto					isc_msgcat_get(isc_msgcat,
2532258945Sroberto						       ISC_MSGSET_GENERAL,
2533258945Sroberto						       ISC_MSG_FAILED,
2534258945Sroberto						       "failed"),
2535258945Sroberto					strbuf);
2536258945Sroberto			}
2537258945Sroberto		}
2538258945Sroberto#endif
2539258945Sroberto	}
2540258945Sroberto#endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
2541258945Sroberto
2542280849Scysetup_done:
2543258945Sroberto	inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
2544258945Sroberto
2545258945Sroberto	return (ISC_R_SUCCESS);
2546258945Sroberto}
2547258945Sroberto
2548280849Scy/*
2549280849Scy * Create a 'type' socket or duplicate an existing socket, managed
2550280849Scy * by 'manager'.  Events will be posted to 'task' and when dispatched
2551280849Scy * 'action' will be called with 'arg' as the arg value.  The new
2552280849Scy * socket is returned in 'socketp'.
2553258945Sroberto */
2554280849Scystatic isc_result_t
2555280849Scysocket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
2556280849Scy	      isc_socket_t **socketp, isc_socket_t *dup_socket)
2557258945Sroberto{
2558280849Scy	isc__socket_t *sock = NULL;
2559280849Scy	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2560258945Sroberto	isc_result_t result;
2561258945Sroberto	int lockid;
2562258945Sroberto
2563258945Sroberto	REQUIRE(VALID_MANAGER(manager));
2564258945Sroberto	REQUIRE(socketp != NULL && *socketp == NULL);
2565258945Sroberto	REQUIRE(type != isc_sockettype_fdwatch);
2566258945Sroberto
2567258945Sroberto	result = allocate_socket(manager, type, &sock);
2568258945Sroberto	if (result != ISC_R_SUCCESS)
2569258945Sroberto		return (result);
2570258945Sroberto
2571258945Sroberto	switch (sock->type) {
2572258945Sroberto	case isc_sockettype_udp:
2573258945Sroberto		sock->statsindex =
2574258945Sroberto			(pf == AF_INET) ? upd4statsindex : upd6statsindex;
2575258945Sroberto		break;
2576258945Sroberto	case isc_sockettype_tcp:
2577258945Sroberto		sock->statsindex =
2578258945Sroberto			(pf == AF_INET) ? tcp4statsindex : tcp6statsindex;
2579258945Sroberto		break;
2580258945Sroberto	case isc_sockettype_unix:
2581258945Sroberto		sock->statsindex = unixstatsindex;
2582258945Sroberto		break;
2583258945Sroberto	default:
2584258945Sroberto		INSIST(0);
2585258945Sroberto	}
2586258945Sroberto
2587258945Sroberto	sock->pf = pf;
2588280849Scy
2589280849Scy	result = opensocket(manager, sock, (isc__socket_t *)dup_socket);
2590258945Sroberto	if (result != ISC_R_SUCCESS) {
2591258945Sroberto		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
2592258945Sroberto		free_socket(&sock);
2593258945Sroberto		return (result);
2594258945Sroberto	}
2595258945Sroberto
2596280849Scy	sock->common.methods = (isc_socketmethods_t *)&socketmethods;
2597258945Sroberto	sock->references = 1;
2598280849Scy	*socketp = (isc_socket_t *)sock;
2599258945Sroberto
2600258945Sroberto	/*
2601258945Sroberto	 * Note we don't have to lock the socket like we normally would because
2602258945Sroberto	 * there are no external references to it yet.
2603258945Sroberto	 */
2604258945Sroberto
2605258945Sroberto	lockid = FDLOCK_ID(sock->fd);
2606258945Sroberto	LOCK(&manager->fdlock[lockid]);
2607258945Sroberto	manager->fds[sock->fd] = sock;
2608258945Sroberto	manager->fdstate[sock->fd] = MANAGED;
2609258945Sroberto#ifdef USE_DEVPOLL
2610258945Sroberto	INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
2611258945Sroberto	       sock->manager->fdpollinfo[sock->fd].want_write == 0);
2612258945Sroberto#endif
2613258945Sroberto	UNLOCK(&manager->fdlock[lockid]);
2614258945Sroberto
2615258945Sroberto	LOCK(&manager->lock);
2616258945Sroberto	ISC_LIST_APPEND(manager->socklist, sock, link);
2617258945Sroberto#ifdef USE_SELECT
2618258945Sroberto	if (manager->maxfd < sock->fd)
2619258945Sroberto		manager->maxfd = sock->fd;
2620258945Sroberto#endif
2621258945Sroberto	UNLOCK(&manager->lock);
2622258945Sroberto
2623258945Sroberto	socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2624280849Scy		   ISC_MSG_CREATED, dup_socket == NULL ? "dupped" : "created");
2625258945Sroberto
2626258945Sroberto	return (ISC_R_SUCCESS);
2627258945Sroberto}
2628258945Sroberto
2629280849Scy/*%
2630280849Scy * Create a new 'type' socket managed by 'manager'.  Events
2631280849Scy * will be posted to 'task' and when dispatched 'action' will be
2632280849Scy * called with 'arg' as the arg value.  The new socket is returned
2633280849Scy * in 'socketp'.
2634280849Scy */
2635280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
2636280849Scyisc__socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
2637280849Scy		   isc_socket_t **socketp)
2638280849Scy{
2639280849Scy	return (socket_create(manager0, pf, type, socketp, NULL));
2640280849Scy}
2641280849Scy
2642280849Scy/*%
2643280849Scy * Duplicate an existing socket.  The new socket is returned
2644280849Scy * in 'socketp'.
2645280849Scy */
2646280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
2647280849Scyisc__socket_dup(isc_socket_t *sock0, isc_socket_t **socketp) {
2648280849Scy	isc__socket_t *sock = (isc__socket_t *)sock0;
2649280849Scy
2650280849Scy	REQUIRE(VALID_SOCKET(sock));
2651280849Scy	REQUIRE(socketp != NULL && *socketp == NULL);
2652280849Scy
2653280849Scy	return (socket_create((isc_socketmgr_t *) sock->manager,
2654280849Scy			      sock->pf, sock->type, socketp,
2655280849Scy			      sock0));
2656280849Scy}
2657280849Scy
2658280849Scy#ifdef BIND9
2659280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
2660280849Scyisc__socket_open(isc_socket_t *sock0) {
2661258945Sroberto	isc_result_t result;
2662280849Scy	isc__socket_t *sock = (isc__socket_t *)sock0;
2663258945Sroberto
2664258945Sroberto	REQUIRE(VALID_SOCKET(sock));
2665258945Sroberto
2666258945Sroberto	LOCK(&sock->lock);
2667258945Sroberto	REQUIRE(sock->references == 1);
2668258945Sroberto	REQUIRE(sock->type != isc_sockettype_fdwatch);
2669258945Sroberto	UNLOCK(&sock->lock);
2670258945Sroberto	/*
2671258945Sroberto	 * We don't need to retain the lock hereafter, since no one else has
2672258945Sroberto	 * this socket.
2673258945Sroberto	 */
2674258945Sroberto	REQUIRE(sock->fd == -1);
2675258945Sroberto
2676280849Scy	result = opensocket(sock->manager, sock, NULL);
2677258945Sroberto	if (result != ISC_R_SUCCESS)
2678258945Sroberto		sock->fd = -1;
2679258945Sroberto
2680258945Sroberto	if (result == ISC_R_SUCCESS) {
2681258945Sroberto		int lockid = FDLOCK_ID(sock->fd);
2682258945Sroberto
2683258945Sroberto		LOCK(&sock->manager->fdlock[lockid]);
2684258945Sroberto		sock->manager->fds[sock->fd] = sock;
2685258945Sroberto		sock->manager->fdstate[sock->fd] = MANAGED;
2686258945Sroberto#ifdef USE_DEVPOLL
2687258945Sroberto		INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
2688258945Sroberto		       sock->manager->fdpollinfo[sock->fd].want_write == 0);
2689258945Sroberto#endif
2690258945Sroberto		UNLOCK(&sock->manager->fdlock[lockid]);
2691258945Sroberto
2692258945Sroberto#ifdef USE_SELECT
2693258945Sroberto		LOCK(&sock->manager->lock);
2694258945Sroberto		if (sock->manager->maxfd < sock->fd)
2695258945Sroberto			sock->manager->maxfd = sock->fd;
2696258945Sroberto		UNLOCK(&sock->manager->lock);
2697258945Sroberto#endif
2698258945Sroberto	}
2699258945Sroberto
2700258945Sroberto	return (result);
2701258945Sroberto}
2702280849Scy#endif	/* BIND9 */
2703258945Sroberto
2704258945Sroberto/*
2705258945Sroberto * Create a new 'type' socket managed by 'manager'.  Events
2706258945Sroberto * will be posted to 'task' and when dispatched 'action' will be
2707258945Sroberto * called with 'arg' as the arg value.  The new socket is returned
2708258945Sroberto * in 'socketp'.
2709258945Sroberto */
2710280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
2711280849Scyisc__socket_fdwatchcreate(isc_socketmgr_t *manager0, int fd, int flags,
2712280849Scy			  isc_sockfdwatch_t callback, void *cbarg,
2713280849Scy			  isc_task_t *task, isc_socket_t **socketp)
2714258945Sroberto{
2715280849Scy	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2716280849Scy	isc__socket_t *sock = NULL;
2717258945Sroberto	isc_result_t result;
2718258945Sroberto	int lockid;
2719258945Sroberto
2720258945Sroberto	REQUIRE(VALID_MANAGER(manager));
2721258945Sroberto	REQUIRE(socketp != NULL && *socketp == NULL);
2722258945Sroberto
2723258945Sroberto	result = allocate_socket(manager, isc_sockettype_fdwatch, &sock);
2724258945Sroberto	if (result != ISC_R_SUCCESS)
2725258945Sroberto		return (result);
2726258945Sroberto
2727258945Sroberto	sock->fd = fd;
2728258945Sroberto	sock->fdwatcharg = cbarg;
2729258945Sroberto	sock->fdwatchcb = callback;
2730258945Sroberto	sock->fdwatchflags = flags;
2731258945Sroberto	sock->fdwatchtask = task;
2732258945Sroberto	sock->statsindex = fdwatchstatsindex;
2733258945Sroberto
2734280849Scy	sock->common.methods = (isc_socketmethods_t *)&socketmethods;
2735258945Sroberto	sock->references = 1;
2736280849Scy	*socketp = (isc_socket_t *)sock;
2737258945Sroberto
2738258945Sroberto	/*
2739258945Sroberto	 * Note we don't have to lock the socket like we normally would because
2740258945Sroberto	 * there are no external references to it yet.
2741258945Sroberto	 */
2742258945Sroberto
2743258945Sroberto	lockid = FDLOCK_ID(sock->fd);
2744258945Sroberto	LOCK(&manager->fdlock[lockid]);
2745258945Sroberto	manager->fds[sock->fd] = sock;
2746258945Sroberto	manager->fdstate[sock->fd] = MANAGED;
2747258945Sroberto	UNLOCK(&manager->fdlock[lockid]);
2748258945Sroberto
2749258945Sroberto	LOCK(&manager->lock);
2750258945Sroberto	ISC_LIST_APPEND(manager->socklist, sock, link);
2751258945Sroberto#ifdef USE_SELECT
2752258945Sroberto	if (manager->maxfd < sock->fd)
2753258945Sroberto		manager->maxfd = sock->fd;
2754258945Sroberto#endif
2755258945Sroberto	UNLOCK(&manager->lock);
2756258945Sroberto
2757258945Sroberto	if (flags & ISC_SOCKFDWATCH_READ)
2758258945Sroberto		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2759258945Sroberto	if (flags & ISC_SOCKFDWATCH_WRITE)
2760258945Sroberto		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
2761258945Sroberto
2762258945Sroberto	socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
2763258945Sroberto		   ISC_MSG_CREATED, "fdwatch-created");
2764258945Sroberto
2765258945Sroberto	return (ISC_R_SUCCESS);
2766258945Sroberto}
2767258945Sroberto
2768258945Sroberto/*
2769280849Scy * Indicate to the manager that it should watch the socket again.
2770280849Scy * This can be used to restart watching if the previous event handler
2771280849Scy * didn't indicate there was more data to be processed.  Primarily
2772280849Scy * it is for writing but could be used for reading if desired
2773280849Scy */
2774280849Scy
2775280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
2776280849Scyisc__socket_fdwatchpoke(isc_socket_t *sock0, int flags)
2777280849Scy{
2778280849Scy	isc__socket_t *sock = (isc__socket_t *)sock0;
2779280849Scy
2780280849Scy	REQUIRE(VALID_SOCKET(sock));
2781280849Scy
2782280849Scy	/*
2783280849Scy	 * We check both flags first to allow us to get the lock
2784280849Scy	 * once but only if we need it.
2785280849Scy	 */
2786280849Scy
2787280849Scy	if ((flags & (ISC_SOCKFDWATCH_READ | ISC_SOCKFDWATCH_WRITE)) != 0) {
2788280849Scy		LOCK(&sock->lock);
2789280849Scy		if (((flags & ISC_SOCKFDWATCH_READ) != 0) &&
2790280849Scy		    !sock->pending_recv)
2791280849Scy			select_poke(sock->manager, sock->fd,
2792280849Scy				    SELECT_POKE_READ);
2793280849Scy		if (((flags & ISC_SOCKFDWATCH_WRITE) != 0) &&
2794280849Scy		    !sock->pending_send)
2795280849Scy			select_poke(sock->manager, sock->fd,
2796280849Scy				    SELECT_POKE_WRITE);
2797280849Scy		UNLOCK(&sock->lock);
2798280849Scy	}
2799280849Scy
2800280849Scy	socket_log(sock, NULL, TRACE, isc_msgcat, ISC_MSGSET_SOCKET,
2801280849Scy		   ISC_MSG_POKED, "fdwatch-poked flags: %d", flags);
2802280849Scy
2803280849Scy	return (ISC_R_SUCCESS);
2804280849Scy}
2805280849Scy
2806280849Scy/*
2807258945Sroberto * Attach to a socket.  Caller must explicitly detach when it is done.
2808258945Sroberto */
2809280849ScyISC_SOCKETFUNC_SCOPE void
2810280849Scyisc__socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) {
2811280849Scy	isc__socket_t *sock = (isc__socket_t *)sock0;
2812280849Scy
2813258945Sroberto	REQUIRE(VALID_SOCKET(sock));
2814258945Sroberto	REQUIRE(socketp != NULL && *socketp == NULL);
2815258945Sroberto
2816258945Sroberto	LOCK(&sock->lock);
2817258945Sroberto	sock->references++;
2818258945Sroberto	UNLOCK(&sock->lock);
2819258945Sroberto
2820280849Scy	*socketp = (isc_socket_t *)sock;
2821258945Sroberto}
2822258945Sroberto
2823258945Sroberto/*
2824258945Sroberto * Dereference a socket.  If this is the last reference to it, clean things
2825258945Sroberto * up by destroying the socket.
2826258945Sroberto */
2827280849ScyISC_SOCKETFUNC_SCOPE void
2828280849Scyisc__socket_detach(isc_socket_t **socketp) {
2829280849Scy	isc__socket_t *sock;
2830258945Sroberto	isc_boolean_t kill_socket = ISC_FALSE;
2831258945Sroberto
2832258945Sroberto	REQUIRE(socketp != NULL);
2833280849Scy	sock = (isc__socket_t *)*socketp;
2834258945Sroberto	REQUIRE(VALID_SOCKET(sock));
2835258945Sroberto
2836258945Sroberto	LOCK(&sock->lock);
2837258945Sroberto	REQUIRE(sock->references > 0);
2838258945Sroberto	sock->references--;
2839258945Sroberto	if (sock->references == 0)
2840258945Sroberto		kill_socket = ISC_TRUE;
2841258945Sroberto	UNLOCK(&sock->lock);
2842258945Sroberto
2843258945Sroberto	if (kill_socket)
2844258945Sroberto		destroy(&sock);
2845258945Sroberto
2846258945Sroberto	*socketp = NULL;
2847258945Sroberto}
2848258945Sroberto
2849280849Scy#ifdef BIND9
2850280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
2851280849Scyisc__socket_close(isc_socket_t *sock0) {
2852280849Scy	isc__socket_t *sock = (isc__socket_t *)sock0;
2853258945Sroberto	int fd;
2854280849Scy	isc__socketmgr_t *manager;
2855258945Sroberto
2856280849Scy	fflush(stdout);
2857258945Sroberto	REQUIRE(VALID_SOCKET(sock));
2858258945Sroberto
2859258945Sroberto	LOCK(&sock->lock);
2860258945Sroberto
2861258945Sroberto	REQUIRE(sock->references == 1);
2862258945Sroberto	REQUIRE(sock->type != isc_sockettype_fdwatch);
2863258945Sroberto	REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);
2864258945Sroberto
2865258945Sroberto	INSIST(!sock->connecting);
2866258945Sroberto	INSIST(!sock->pending_recv);
2867258945Sroberto	INSIST(!sock->pending_send);
2868258945Sroberto	INSIST(!sock->pending_accept);
2869258945Sroberto	INSIST(ISC_LIST_EMPTY(sock->recv_list));
2870258945Sroberto	INSIST(ISC_LIST_EMPTY(sock->send_list));
2871258945Sroberto	INSIST(ISC_LIST_EMPTY(sock->accept_list));
2872258945Sroberto	INSIST(sock->connect_ev == NULL);
2873258945Sroberto
2874258945Sroberto	manager = sock->manager;
2875258945Sroberto	fd = sock->fd;
2876258945Sroberto	sock->fd = -1;
2877280849Scy	sock->dupped = 0;
2878258945Sroberto	memset(sock->name, 0, sizeof(sock->name));
2879258945Sroberto	sock->tag = NULL;
2880258945Sroberto	sock->listener = 0;
2881258945Sroberto	sock->connected = 0;
2882258945Sroberto	sock->connecting = 0;
2883258945Sroberto	sock->bound = 0;
2884258945Sroberto	isc_sockaddr_any(&sock->peer_address);
2885258945Sroberto
2886258945Sroberto	UNLOCK(&sock->lock);
2887258945Sroberto
2888258945Sroberto	closesocket(manager, sock, fd);
2889258945Sroberto
2890258945Sroberto	return (ISC_R_SUCCESS);
2891258945Sroberto}
2892280849Scy#endif	/* BIND9 */
2893258945Sroberto
2894258945Sroberto/*
2895258945Sroberto * I/O is possible on a given socket.  Schedule an event to this task that
2896258945Sroberto * will call an internal function to do the I/O.  This will charge the
2897258945Sroberto * task with the I/O operation and let our select loop handler get back
2898258945Sroberto * to doing something real as fast as possible.
2899258945Sroberto *
2900258945Sroberto * The socket and manager must be locked before calling this function.
2901258945Sroberto */
2902258945Srobertostatic void
2903280849Scydispatch_recv(isc__socket_t *sock) {
2904258945Sroberto	intev_t *iev;
2905258945Sroberto	isc_socketevent_t *ev;
2906258945Sroberto	isc_task_t *sender;
2907258945Sroberto
2908258945Sroberto	INSIST(!sock->pending_recv);
2909258945Sroberto
2910258945Sroberto	if (sock->type != isc_sockettype_fdwatch) {
2911258945Sroberto		ev = ISC_LIST_HEAD(sock->recv_list);
2912258945Sroberto		if (ev == NULL)
2913258945Sroberto			return;
2914258945Sroberto		socket_log(sock, NULL, EVENT, NULL, 0, 0,
2915258945Sroberto			   "dispatch_recv:  event %p -> task %p",
2916258945Sroberto			   ev, ev->ev_sender);
2917258945Sroberto		sender = ev->ev_sender;
2918258945Sroberto	} else {
2919258945Sroberto		sender = sock->fdwatchtask;
2920258945Sroberto	}
2921258945Sroberto
2922258945Sroberto	sock->pending_recv = 1;
2923258945Sroberto	iev = &sock->readable_ev;
2924258945Sroberto
2925258945Sroberto	sock->references++;
2926258945Sroberto	iev->ev_sender = sock;
2927258945Sroberto	if (sock->type == isc_sockettype_fdwatch)
2928258945Sroberto		iev->ev_action = internal_fdwatch_read;
2929258945Sroberto	else
2930258945Sroberto		iev->ev_action = internal_recv;
2931258945Sroberto	iev->ev_arg = sock;
2932258945Sroberto
2933258945Sroberto	isc_task_send(sender, (isc_event_t **)&iev);
2934258945Sroberto}
2935258945Sroberto
2936258945Srobertostatic void
2937280849Scydispatch_send(isc__socket_t *sock) {
2938258945Sroberto	intev_t *iev;
2939258945Sroberto	isc_socketevent_t *ev;
2940258945Sroberto	isc_task_t *sender;
2941258945Sroberto
2942258945Sroberto	INSIST(!sock->pending_send);
2943258945Sroberto
2944258945Sroberto	if (sock->type != isc_sockettype_fdwatch) {
2945258945Sroberto		ev = ISC_LIST_HEAD(sock->send_list);
2946258945Sroberto		if (ev == NULL)
2947258945Sroberto			return;
2948258945Sroberto		socket_log(sock, NULL, EVENT, NULL, 0, 0,
2949258945Sroberto			   "dispatch_send:  event %p -> task %p",
2950258945Sroberto			   ev, ev->ev_sender);
2951258945Sroberto		sender = ev->ev_sender;
2952258945Sroberto	} else {
2953258945Sroberto		sender = sock->fdwatchtask;
2954258945Sroberto	}
2955258945Sroberto
2956258945Sroberto	sock->pending_send = 1;
2957258945Sroberto	iev = &sock->writable_ev;
2958258945Sroberto
2959258945Sroberto	sock->references++;
2960258945Sroberto	iev->ev_sender = sock;
2961258945Sroberto	if (sock->type == isc_sockettype_fdwatch)
2962258945Sroberto		iev->ev_action = internal_fdwatch_write;
2963258945Sroberto	else
2964258945Sroberto		iev->ev_action = internal_send;
2965258945Sroberto	iev->ev_arg = sock;
2966258945Sroberto
2967258945Sroberto	isc_task_send(sender, (isc_event_t **)&iev);
2968258945Sroberto}
2969258945Sroberto
2970258945Sroberto/*
2971258945Sroberto * Dispatch an internal accept event.
2972258945Sroberto */
2973258945Srobertostatic void
2974280849Scydispatch_accept(isc__socket_t *sock) {
2975258945Sroberto	intev_t *iev;
2976258945Sroberto	isc_socket_newconnev_t *ev;
2977258945Sroberto
2978258945Sroberto	INSIST(!sock->pending_accept);
2979258945Sroberto
2980258945Sroberto	/*
2981258945Sroberto	 * Are there any done events left, or were they all canceled
2982258945Sroberto	 * before the manager got the socket lock?
2983258945Sroberto	 */
2984258945Sroberto	ev = ISC_LIST_HEAD(sock->accept_list);
2985258945Sroberto	if (ev == NULL)
2986258945Sroberto		return;
2987258945Sroberto
2988258945Sroberto	sock->pending_accept = 1;
2989258945Sroberto	iev = &sock->readable_ev;
2990258945Sroberto
2991258945Sroberto	sock->references++;  /* keep socket around for this internal event */
2992258945Sroberto	iev->ev_sender = sock;
2993258945Sroberto	iev->ev_action = internal_accept;
2994258945Sroberto	iev->ev_arg = sock;
2995258945Sroberto
2996258945Sroberto	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
2997258945Sroberto}
2998258945Sroberto
2999258945Srobertostatic void
3000280849Scydispatch_connect(isc__socket_t *sock) {
3001258945Sroberto	intev_t *iev;
3002258945Sroberto	isc_socket_connev_t *ev;
3003258945Sroberto
3004258945Sroberto	iev = &sock->writable_ev;
3005258945Sroberto
3006258945Sroberto	ev = sock->connect_ev;
3007258945Sroberto	INSIST(ev != NULL); /* XXX */
3008258945Sroberto
3009258945Sroberto	INSIST(sock->connecting);
3010258945Sroberto
3011258945Sroberto	sock->references++;  /* keep socket around for this internal event */
3012258945Sroberto	iev->ev_sender = sock;
3013258945Sroberto	iev->ev_action = internal_connect;
3014258945Sroberto	iev->ev_arg = sock;
3015258945Sroberto
3016258945Sroberto	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
3017258945Sroberto}
3018258945Sroberto
3019258945Sroberto/*
3020258945Sroberto * Dequeue an item off the given socket's read queue, set the result code
3021258945Sroberto * in the done event to the one provided, and send it to the task it was
3022258945Sroberto * destined for.
3023258945Sroberto *
3024258945Sroberto * If the event to be sent is on a list, remove it before sending.  If
3025258945Sroberto * asked to, send and detach from the socket as well.
3026258945Sroberto *
3027258945Sroberto * Caller must have the socket locked if the event is attached to the socket.
3028258945Sroberto */
3029258945Srobertostatic void
3030280849Scysend_recvdone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
3031258945Sroberto	isc_task_t *task;
3032258945Sroberto
3033258945Sroberto	task = (*dev)->ev_sender;
3034258945Sroberto
3035258945Sroberto	(*dev)->ev_sender = sock;
3036258945Sroberto
3037258945Sroberto	if (ISC_LINK_LINKED(*dev, ev_link))
3038258945Sroberto		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
3039258945Sroberto
3040258945Sroberto	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
3041258945Sroberto	    == ISC_SOCKEVENTATTR_ATTACHED)
3042258945Sroberto		isc_task_sendanddetach(&task, (isc_event_t **)dev);
3043258945Sroberto	else
3044258945Sroberto		isc_task_send(task, (isc_event_t **)dev);
3045258945Sroberto}
3046258945Sroberto
3047258945Sroberto/*
3048258945Sroberto * See comments for send_recvdone_event() above.
3049258945Sroberto *
3050258945Sroberto * Caller must have the socket locked if the event is attached to the socket.
3051258945Sroberto */
3052258945Srobertostatic void
3053280849Scysend_senddone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
3054258945Sroberto	isc_task_t *task;
3055258945Sroberto
3056258945Sroberto	INSIST(dev != NULL && *dev != NULL);
3057258945Sroberto
3058258945Sroberto	task = (*dev)->ev_sender;
3059258945Sroberto	(*dev)->ev_sender = sock;
3060258945Sroberto
3061258945Sroberto	if (ISC_LINK_LINKED(*dev, ev_link))
3062258945Sroberto		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
3063258945Sroberto
3064258945Sroberto	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
3065258945Sroberto	    == ISC_SOCKEVENTATTR_ATTACHED)
3066258945Sroberto		isc_task_sendanddetach(&task, (isc_event_t **)dev);
3067258945Sroberto	else
3068258945Sroberto		isc_task_send(task, (isc_event_t **)dev);
3069258945Sroberto}
3070258945Sroberto
3071258945Sroberto/*
3072258945Sroberto * Call accept() on a socket, to get the new file descriptor.  The listen
3073258945Sroberto * socket is used as a prototype to create a new isc_socket_t.  The new
3074258945Sroberto * socket has one outstanding reference.  The task receiving the event
3075258945Sroberto * will be detached from just after the event is delivered.
3076258945Sroberto *
3077258945Sroberto * On entry to this function, the event delivered is the internal
3078258945Sroberto * readable event, and the first item on the accept_list should be
3079258945Sroberto * the done event we want to send.  If the list is empty, this is a no-op,
3080258945Sroberto * so just unlock and return.
3081258945Sroberto */
3082258945Srobertostatic void
3083258945Srobertointernal_accept(isc_task_t *me, isc_event_t *ev) {
3084280849Scy	isc__socket_t *sock;
3085280849Scy	isc__socketmgr_t *manager;
3086258945Sroberto	isc_socket_newconnev_t *dev;
3087258945Sroberto	isc_task_t *task;
3088258945Sroberto	ISC_SOCKADDR_LEN_T addrlen;
3089258945Sroberto	int fd;
3090258945Sroberto	isc_result_t result = ISC_R_SUCCESS;
3091258945Sroberto	char strbuf[ISC_STRERRORSIZE];
3092258945Sroberto	const char *err = "accept";
3093258945Sroberto
3094258945Sroberto	UNUSED(me);
3095258945Sroberto
3096258945Sroberto	sock = ev->ev_sender;
3097258945Sroberto	INSIST(VALID_SOCKET(sock));
3098258945Sroberto
3099258945Sroberto	LOCK(&sock->lock);
3100258945Sroberto	socket_log(sock, NULL, TRACE,
3101258945Sroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
3102258945Sroberto		   "internal_accept called, locked socket");
3103258945Sroberto
3104258945Sroberto	manager = sock->manager;
3105258945Sroberto	INSIST(VALID_MANAGER(manager));
3106258945Sroberto
3107258945Sroberto	INSIST(sock->listener);
3108258945Sroberto	INSIST(sock->pending_accept == 1);
3109258945Sroberto	sock->pending_accept = 0;
3110258945Sroberto
3111258945Sroberto	INSIST(sock->references > 0);
3112258945Sroberto	sock->references--;  /* the internal event is done with this socket */
3113258945Sroberto	if (sock->references == 0) {
3114258945Sroberto		UNLOCK(&sock->lock);
3115258945Sroberto		destroy(&sock);
3116258945Sroberto		return;
3117258945Sroberto	}
3118258945Sroberto
3119258945Sroberto	/*
3120258945Sroberto	 * Get the first item off the accept list.
3121258945Sroberto	 * If it is empty, unlock the socket and return.
3122258945Sroberto	 */
3123258945Sroberto	dev = ISC_LIST_HEAD(sock->accept_list);
3124258945Sroberto	if (dev == NULL) {
3125258945Sroberto		UNLOCK(&sock->lock);
3126258945Sroberto		return;
3127258945Sroberto	}
3128258945Sroberto
3129258945Sroberto	/*
3130258945Sroberto	 * Try to accept the new connection.  If the accept fails with
3131258945Sroberto	 * EAGAIN or EINTR, simply poke the watcher to watch this socket
3132258945Sroberto	 * again.  Also ignore ECONNRESET, which has been reported to
3133258945Sroberto	 * be spuriously returned on Linux 2.2.19 although it is not
3134258945Sroberto	 * a documented error for accept().  ECONNABORTED has been
3135258945Sroberto	 * reported for Solaris 8.  The rest are thrown in not because
3136258945Sroberto	 * we have seen them but because they are ignored by other
3137258945Sroberto	 * daemons such as BIND 8 and Apache.
3138258945Sroberto	 */
3139258945Sroberto
3140280849Scy	addrlen = sizeof(NEWCONNSOCK(dev)->peer_address.type);
3141280849Scy	memset(&NEWCONNSOCK(dev)->peer_address.type, 0, addrlen);
3142280849Scy	fd = accept(sock->fd, &NEWCONNSOCK(dev)->peer_address.type.sa,
3143258945Sroberto		    (void *)&addrlen);
3144258945Sroberto
3145258945Sroberto#ifdef F_DUPFD
3146258945Sroberto	/*
3147258945Sroberto	 * Leave a space for stdio to work in.
3148258945Sroberto	 */
3149258945Sroberto	if (fd >= 0 && fd < 20) {
3150258945Sroberto		int new, tmp;
3151258945Sroberto		new = fcntl(fd, F_DUPFD, 20);
3152258945Sroberto		tmp = errno;
3153258945Sroberto		(void)close(fd);
3154258945Sroberto		errno = tmp;
3155258945Sroberto		fd = new;
3156258945Sroberto		err = "accept/fcntl";
3157258945Sroberto	}
3158258945Sroberto#endif
3159258945Sroberto
3160258945Sroberto	if (fd < 0) {
3161258945Sroberto		if (SOFT_ERROR(errno))
3162258945Sroberto			goto soft_error;
3163258945Sroberto		switch (errno) {
3164258945Sroberto		case ENFILE:
3165258945Sroberto		case EMFILE:
3166258945Sroberto			isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3167258945Sroberto				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3168258945Sroberto				       isc_msgcat, ISC_MSGSET_SOCKET,
3169258945Sroberto				       ISC_MSG_TOOMANYFDS,
3170258945Sroberto				       "%s: too many open file descriptors",
3171258945Sroberto				       err);
3172258945Sroberto			goto soft_error;
3173258945Sroberto
3174258945Sroberto		case ENOBUFS:
3175258945Sroberto		case ENOMEM:
3176258945Sroberto		case ECONNRESET:
3177258945Sroberto		case ECONNABORTED:
3178258945Sroberto		case EHOSTUNREACH:
3179258945Sroberto		case EHOSTDOWN:
3180258945Sroberto		case ENETUNREACH:
3181258945Sroberto		case ENETDOWN:
3182258945Sroberto		case ECONNREFUSED:
3183258945Sroberto#ifdef EPROTO
3184258945Sroberto		case EPROTO:
3185258945Sroberto#endif
3186258945Sroberto#ifdef ENONET
3187258945Sroberto		case ENONET:
3188258945Sroberto#endif
3189258945Sroberto			goto soft_error;
3190258945Sroberto		default:
3191258945Sroberto			break;
3192258945Sroberto		}
3193258945Sroberto		isc__strerror(errno, strbuf, sizeof(strbuf));
3194258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__,
3195258945Sroberto				 "internal_accept: %s() %s: %s", err,
3196258945Sroberto				 isc_msgcat_get(isc_msgcat,
3197258945Sroberto						ISC_MSGSET_GENERAL,
3198258945Sroberto						ISC_MSG_FAILED,
3199258945Sroberto						"failed"),
3200258945Sroberto				 strbuf);
3201258945Sroberto		fd = -1;
3202258945Sroberto		result = ISC_R_UNEXPECTED;
3203258945Sroberto	} else {
3204258945Sroberto		if (addrlen == 0U) {
3205258945Sroberto			UNEXPECTED_ERROR(__FILE__, __LINE__,
3206258945Sroberto					 "internal_accept(): "
3207258945Sroberto					 "accept() failed to return "
3208258945Sroberto					 "remote address");
3209258945Sroberto
3210258945Sroberto			(void)close(fd);
3211258945Sroberto			goto soft_error;
3212280849Scy		} else if (NEWCONNSOCK(dev)->peer_address.type.sa.sa_family !=
3213258945Sroberto			   sock->pf)
3214258945Sroberto		{
3215258945Sroberto			UNEXPECTED_ERROR(__FILE__, __LINE__,
3216258945Sroberto					 "internal_accept(): "
3217258945Sroberto					 "accept() returned peer address "
3218258945Sroberto					 "family %u (expected %u)",
3219280849Scy					 NEWCONNSOCK(dev)->peer_address.
3220258945Sroberto					 type.sa.sa_family,
3221258945Sroberto					 sock->pf);
3222258945Sroberto			(void)close(fd);
3223258945Sroberto			goto soft_error;
3224258945Sroberto		} else if (fd >= (int)manager->maxsocks) {
3225258945Sroberto			isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
3226258945Sroberto				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
3227258945Sroberto				       isc_msgcat, ISC_MSGSET_SOCKET,
3228258945Sroberto				       ISC_MSG_TOOMANYFDS,
3229258945Sroberto				       "accept: "
3230258945Sroberto				       "file descriptor exceeds limit (%d/%u)",
3231258945Sroberto				       fd, manager->maxsocks);
3232258945Sroberto			(void)close(fd);
3233258945Sroberto			goto soft_error;
3234258945Sroberto		}
3235258945Sroberto	}
3236258945Sroberto
3237258945Sroberto	if (fd != -1) {
3238280849Scy		NEWCONNSOCK(dev)->peer_address.length = addrlen;
3239280849Scy		NEWCONNSOCK(dev)->pf = sock->pf;
3240258945Sroberto	}
3241258945Sroberto
3242258945Sroberto	/*
3243258945Sroberto	 * Pull off the done event.
3244258945Sroberto	 */
3245258945Sroberto	ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
3246258945Sroberto
3247258945Sroberto	/*
3248258945Sroberto	 * Poke watcher if there are more pending accepts.
3249258945Sroberto	 */
3250258945Sroberto	if (!ISC_LIST_EMPTY(sock->accept_list))
3251258945Sroberto		select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
3252258945Sroberto
3253258945Sroberto	UNLOCK(&sock->lock);
3254258945Sroberto
3255280849Scy	if (fd != -1) {
3256280849Scy		result = make_nonblock(fd);
3257280849Scy		if (result != ISC_R_SUCCESS) {
3258280849Scy			(void)close(fd);
3259280849Scy			fd = -1;
3260280849Scy		}
3261258945Sroberto	}
3262258945Sroberto
3263258945Sroberto	/*
3264258945Sroberto	 * -1 means the new socket didn't happen.
3265258945Sroberto	 */
3266258945Sroberto	if (fd != -1) {
3267258945Sroberto		int lockid = FDLOCK_ID(fd);
3268258945Sroberto
3269258945Sroberto		LOCK(&manager->fdlock[lockid]);
3270280849Scy		manager->fds[fd] = NEWCONNSOCK(dev);
3271258945Sroberto		manager->fdstate[fd] = MANAGED;
3272258945Sroberto		UNLOCK(&manager->fdlock[lockid]);
3273258945Sroberto
3274258945Sroberto		LOCK(&manager->lock);
3275280849Scy		ISC_LIST_APPEND(manager->socklist, NEWCONNSOCK(dev), link);
3276258945Sroberto
3277280849Scy		NEWCONNSOCK(dev)->fd = fd;
3278280849Scy		NEWCONNSOCK(dev)->bound = 1;
3279280849Scy		NEWCONNSOCK(dev)->connected = 1;
3280258945Sroberto
3281258945Sroberto		/*
3282258945Sroberto		 * Save away the remote address
3283258945Sroberto		 */
3284280849Scy		dev->address = NEWCONNSOCK(dev)->peer_address;
3285258945Sroberto
3286258945Sroberto#ifdef USE_SELECT
3287258945Sroberto		if (manager->maxfd < fd)
3288258945Sroberto			manager->maxfd = fd;
3289258945Sroberto#endif
3290258945Sroberto
3291280849Scy		socket_log(sock, &NEWCONNSOCK(dev)->peer_address, CREATION,
3292258945Sroberto			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
3293258945Sroberto			   "accepted connection, new socket %p",
3294258945Sroberto			   dev->newsocket);
3295258945Sroberto
3296258945Sroberto		UNLOCK(&manager->lock);
3297258945Sroberto
3298258945Sroberto		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
3299258945Sroberto	} else {
3300258945Sroberto		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
3301280849Scy		NEWCONNSOCK(dev)->references--;
3302280849Scy		free_socket((isc__socket_t **)&dev->newsocket);
3303258945Sroberto	}
3304258945Sroberto
3305258945Sroberto	/*
3306258945Sroberto	 * Fill in the done event details and send it off.
3307258945Sroberto	 */
3308258945Sroberto	dev->result = result;
3309258945Sroberto	task = dev->ev_sender;
3310258945Sroberto	dev->ev_sender = sock;
3311258945Sroberto
3312258945Sroberto	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
3313258945Sroberto	return;
3314258945Sroberto
3315258945Sroberto soft_error:
3316258945Sroberto	select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
3317258945Sroberto	UNLOCK(&sock->lock);
3318258945Sroberto
3319258945Sroberto	inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
3320258945Sroberto	return;
3321258945Sroberto}
3322258945Sroberto
3323258945Srobertostatic void
3324258945Srobertointernal_recv(isc_task_t *me, isc_event_t *ev) {
3325258945Sroberto	isc_socketevent_t *dev;
3326280849Scy	isc__socket_t *sock;
3327258945Sroberto
3328258945Sroberto	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
3329258945Sroberto
3330258945Sroberto	sock = ev->ev_sender;
3331258945Sroberto	INSIST(VALID_SOCKET(sock));
3332258945Sroberto
3333258945Sroberto	LOCK(&sock->lock);
3334258945Sroberto	socket_log(sock, NULL, IOEVENT,
3335258945Sroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
3336258945Sroberto		   "internal_recv: task %p got event %p", me, ev);
3337258945Sroberto
3338258945Sroberto	INSIST(sock->pending_recv == 1);
3339258945Sroberto	sock->pending_recv = 0;
3340258945Sroberto
3341258945Sroberto	INSIST(sock->references > 0);
3342258945Sroberto	sock->references--;  /* the internal event is done with this socket */
3343258945Sroberto	if (sock->references == 0) {
3344258945Sroberto		UNLOCK(&sock->lock);
3345258945Sroberto		destroy(&sock);
3346258945Sroberto		return;
3347258945Sroberto	}
3348258945Sroberto
3349258945Sroberto	/*
3350258945Sroberto	 * Try to do as much I/O as possible on this socket.  There are no
3351258945Sroberto	 * limits here, currently.
3352258945Sroberto	 */
3353258945Sroberto	dev = ISC_LIST_HEAD(sock->recv_list);
3354258945Sroberto	while (dev != NULL) {
3355258945Sroberto		switch (doio_recv(sock, dev)) {
3356258945Sroberto		case DOIO_SOFT:
3357258945Sroberto			goto poke;
3358258945Sroberto
3359258945Sroberto		case DOIO_EOF:
3360258945Sroberto			/*
3361258945Sroberto			 * read of 0 means the remote end was closed.
3362258945Sroberto			 * Run through the event queue and dispatch all
3363258945Sroberto			 * the events with an EOF result code.
3364258945Sroberto			 */
3365258945Sroberto			do {
3366258945Sroberto				dev->result = ISC_R_EOF;
3367258945Sroberto				send_recvdone_event(sock, &dev);
3368258945Sroberto				dev = ISC_LIST_HEAD(sock->recv_list);
3369258945Sroberto			} while (dev != NULL);
3370258945Sroberto			goto poke;
3371258945Sroberto
3372258945Sroberto		case DOIO_SUCCESS:
3373258945Sroberto		case DOIO_HARD:
3374258945Sroberto			send_recvdone_event(sock, &dev);
3375258945Sroberto			break;
3376258945Sroberto		}
3377258945Sroberto
3378258945Sroberto		dev = ISC_LIST_HEAD(sock->recv_list);
3379258945Sroberto	}
3380258945Sroberto
3381258945Sroberto poke:
3382258945Sroberto	if (!ISC_LIST_EMPTY(sock->recv_list))
3383258945Sroberto		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
3384258945Sroberto
3385258945Sroberto	UNLOCK(&sock->lock);
3386258945Sroberto}
3387258945Sroberto
3388258945Srobertostatic void
3389258945Srobertointernal_send(isc_task_t *me, isc_event_t *ev) {
3390258945Sroberto	isc_socketevent_t *dev;
3391280849Scy	isc__socket_t *sock;
3392258945Sroberto
3393258945Sroberto	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3394258945Sroberto
3395258945Sroberto	/*
3396258945Sroberto	 * Find out what socket this is and lock it.
3397258945Sroberto	 */
3398280849Scy	sock = (isc__socket_t *)ev->ev_sender;
3399258945Sroberto	INSIST(VALID_SOCKET(sock));
3400258945Sroberto
3401258945Sroberto	LOCK(&sock->lock);
3402258945Sroberto	socket_log(sock, NULL, IOEVENT,
3403258945Sroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
3404258945Sroberto		   "internal_send: task %p got event %p", me, ev);
3405258945Sroberto
3406258945Sroberto	INSIST(sock->pending_send == 1);
3407258945Sroberto	sock->pending_send = 0;
3408258945Sroberto
3409258945Sroberto	INSIST(sock->references > 0);
3410258945Sroberto	sock->references--;  /* the internal event is done with this socket */
3411258945Sroberto	if (sock->references == 0) {
3412258945Sroberto		UNLOCK(&sock->lock);
3413258945Sroberto		destroy(&sock);
3414258945Sroberto		return;
3415258945Sroberto	}
3416258945Sroberto
3417258945Sroberto	/*
3418258945Sroberto	 * Try to do as much I/O as possible on this socket.  There are no
3419258945Sroberto	 * limits here, currently.
3420258945Sroberto	 */
3421258945Sroberto	dev = ISC_LIST_HEAD(sock->send_list);
3422258945Sroberto	while (dev != NULL) {
3423258945Sroberto		switch (doio_send(sock, dev)) {
3424258945Sroberto		case DOIO_SOFT:
3425258945Sroberto			goto poke;
3426258945Sroberto
3427258945Sroberto		case DOIO_HARD:
3428258945Sroberto		case DOIO_SUCCESS:
3429258945Sroberto			send_senddone_event(sock, &dev);
3430258945Sroberto			break;
3431258945Sroberto		}
3432258945Sroberto
3433258945Sroberto		dev = ISC_LIST_HEAD(sock->send_list);
3434258945Sroberto	}
3435258945Sroberto
3436258945Sroberto poke:
3437258945Sroberto	if (!ISC_LIST_EMPTY(sock->send_list))
3438258945Sroberto		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
3439258945Sroberto
3440258945Sroberto	UNLOCK(&sock->lock);
3441258945Sroberto}
3442258945Sroberto
3443258945Srobertostatic void
3444258945Srobertointernal_fdwatch_write(isc_task_t *me, isc_event_t *ev) {
3445280849Scy	isc__socket_t *sock;
3446258945Sroberto	int more_data;
3447258945Sroberto
3448258945Sroberto	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3449258945Sroberto
3450258945Sroberto	/*
3451258945Sroberto	 * Find out what socket this is and lock it.
3452258945Sroberto	 */
3453280849Scy	sock = (isc__socket_t *)ev->ev_sender;
3454258945Sroberto	INSIST(VALID_SOCKET(sock));
3455258945Sroberto
3456258945Sroberto	LOCK(&sock->lock);
3457258945Sroberto	socket_log(sock, NULL, IOEVENT,
3458258945Sroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
3459258945Sroberto		   "internal_fdwatch_write: task %p got event %p", me, ev);
3460258945Sroberto
3461258945Sroberto	INSIST(sock->pending_send == 1);
3462258945Sroberto
3463258945Sroberto	UNLOCK(&sock->lock);
3464280849Scy	more_data = (sock->fdwatchcb)(me, (isc_socket_t *)sock,
3465280849Scy				      sock->fdwatcharg, ISC_SOCKFDWATCH_WRITE);
3466258945Sroberto	LOCK(&sock->lock);
3467258945Sroberto
3468258945Sroberto	sock->pending_send = 0;
3469258945Sroberto
3470258945Sroberto	INSIST(sock->references > 0);
3471258945Sroberto	sock->references--;  /* the internal event is done with this socket */
3472258945Sroberto	if (sock->references == 0) {
3473258945Sroberto		UNLOCK(&sock->lock);
3474258945Sroberto		destroy(&sock);
3475258945Sroberto		return;
3476258945Sroberto	}
3477258945Sroberto
3478258945Sroberto	if (more_data)
3479258945Sroberto		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
3480258945Sroberto
3481258945Sroberto	UNLOCK(&sock->lock);
3482258945Sroberto}
3483258945Sroberto
3484258945Srobertostatic void
3485258945Srobertointernal_fdwatch_read(isc_task_t *me, isc_event_t *ev) {
3486280849Scy	isc__socket_t *sock;
3487258945Sroberto	int more_data;
3488258945Sroberto
3489258945Sroberto	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
3490258945Sroberto
3491258945Sroberto	/*
3492258945Sroberto	 * Find out what socket this is and lock it.
3493258945Sroberto	 */
3494280849Scy	sock = (isc__socket_t *)ev->ev_sender;
3495258945Sroberto	INSIST(VALID_SOCKET(sock));
3496258945Sroberto
3497258945Sroberto	LOCK(&sock->lock);
3498258945Sroberto	socket_log(sock, NULL, IOEVENT,
3499258945Sroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
3500258945Sroberto		   "internal_fdwatch_read: task %p got event %p", me, ev);
3501258945Sroberto
3502258945Sroberto	INSIST(sock->pending_recv == 1);
3503258945Sroberto
3504258945Sroberto	UNLOCK(&sock->lock);
3505280849Scy	more_data = (sock->fdwatchcb)(me, (isc_socket_t *)sock,
3506280849Scy				      sock->fdwatcharg, ISC_SOCKFDWATCH_READ);
3507258945Sroberto	LOCK(&sock->lock);
3508258945Sroberto
3509258945Sroberto	sock->pending_recv = 0;
3510258945Sroberto
3511258945Sroberto	INSIST(sock->references > 0);
3512258945Sroberto	sock->references--;  /* the internal event is done with this socket */
3513258945Sroberto	if (sock->references == 0) {
3514258945Sroberto		UNLOCK(&sock->lock);
3515258945Sroberto		destroy(&sock);
3516258945Sroberto		return;
3517258945Sroberto	}
3518258945Sroberto
3519258945Sroberto	if (more_data)
3520258945Sroberto		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
3521258945Sroberto
3522258945Sroberto	UNLOCK(&sock->lock);
3523258945Sroberto}
3524258945Sroberto
3525258945Sroberto/*
3526258945Sroberto * Process read/writes on each fd here.  Avoid locking
3527258945Sroberto * and unlocking twice if both reads and writes are possible.
3528258945Sroberto */
3529258945Srobertostatic void
3530280849Scyprocess_fd(isc__socketmgr_t *manager, int fd, isc_boolean_t readable,
3531258945Sroberto	   isc_boolean_t writeable)
3532258945Sroberto{
3533280849Scy	isc__socket_t *sock;
3534258945Sroberto	isc_boolean_t unlock_sock;
3535258945Sroberto	isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
3536258945Sroberto	int lockid = FDLOCK_ID(fd);
3537258945Sroberto
3538258945Sroberto	/*
3539258945Sroberto	 * If the socket is going to be closed, don't do more I/O.
3540258945Sroberto	 */
3541258945Sroberto	LOCK(&manager->fdlock[lockid]);
3542258945Sroberto	if (manager->fdstate[fd] == CLOSE_PENDING) {
3543258945Sroberto		UNLOCK(&manager->fdlock[lockid]);
3544258945Sroberto
3545258945Sroberto		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
3546258945Sroberto		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
3547258945Sroberto		return;
3548258945Sroberto	}
3549258945Sroberto
3550258945Sroberto	sock = manager->fds[fd];
3551258945Sroberto	unlock_sock = ISC_FALSE;
3552258945Sroberto	if (readable) {
3553258945Sroberto		if (sock == NULL) {
3554258945Sroberto			unwatch_read = ISC_TRUE;
3555258945Sroberto			goto check_write;
3556258945Sroberto		}
3557258945Sroberto		unlock_sock = ISC_TRUE;
3558258945Sroberto		LOCK(&sock->lock);
3559258945Sroberto		if (!SOCK_DEAD(sock)) {
3560258945Sroberto			if (sock->listener)
3561258945Sroberto				dispatch_accept(sock);
3562258945Sroberto			else
3563258945Sroberto				dispatch_recv(sock);
3564258945Sroberto		}
3565258945Sroberto		unwatch_read = ISC_TRUE;
3566258945Sroberto	}
3567258945Srobertocheck_write:
3568258945Sroberto	if (writeable) {
3569258945Sroberto		if (sock == NULL) {
3570258945Sroberto			unwatch_write = ISC_TRUE;
3571258945Sroberto			goto unlock_fd;
3572258945Sroberto		}
3573258945Sroberto		if (!unlock_sock) {
3574258945Sroberto			unlock_sock = ISC_TRUE;
3575258945Sroberto			LOCK(&sock->lock);
3576258945Sroberto		}
3577258945Sroberto		if (!SOCK_DEAD(sock)) {
3578258945Sroberto			if (sock->connecting)
3579258945Sroberto				dispatch_connect(sock);
3580258945Sroberto			else
3581258945Sroberto				dispatch_send(sock);
3582258945Sroberto		}
3583258945Sroberto		unwatch_write = ISC_TRUE;
3584258945Sroberto	}
3585258945Sroberto	if (unlock_sock)
3586258945Sroberto		UNLOCK(&sock->lock);
3587258945Sroberto
3588258945Sroberto unlock_fd:
3589258945Sroberto	UNLOCK(&manager->fdlock[lockid]);
3590258945Sroberto	if (unwatch_read)
3591258945Sroberto		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
3592258945Sroberto	if (unwatch_write)
3593258945Sroberto		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
3594258945Sroberto
3595258945Sroberto}
3596258945Sroberto
3597258945Sroberto#ifdef USE_KQUEUE
3598258945Srobertostatic isc_boolean_t
3599280849Scyprocess_fds(isc__socketmgr_t *manager, struct kevent *events, int nevents) {
3600258945Sroberto	int i;
3601258945Sroberto	isc_boolean_t readable, writable;
3602258945Sroberto	isc_boolean_t done = ISC_FALSE;
3603280849Scy#ifdef USE_WATCHER_THREAD
3604258945Sroberto	isc_boolean_t have_ctlevent = ISC_FALSE;
3605258945Sroberto#endif
3606258945Sroberto
3607258945Sroberto	if (nevents == manager->nevents) {
3608258945Sroberto		/*
3609258945Sroberto		 * This is not an error, but something unexpected.  If this
3610258945Sroberto		 * happens, it may indicate the need for increasing
3611258945Sroberto		 * ISC_SOCKET_MAXEVENTS.
3612258945Sroberto		 */
3613258945Sroberto		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3614258945Sroberto			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3615258945Sroberto			    "maximum number of FD events (%d) received",
3616258945Sroberto			    nevents);
3617258945Sroberto	}
3618258945Sroberto
3619258945Sroberto	for (i = 0; i < nevents; i++) {
3620258945Sroberto		REQUIRE(events[i].ident < manager->maxsocks);
3621280849Scy#ifdef USE_WATCHER_THREAD
3622258945Sroberto		if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) {
3623258945Sroberto			have_ctlevent = ISC_TRUE;
3624258945Sroberto			continue;
3625258945Sroberto		}
3626258945Sroberto#endif
3627258945Sroberto		readable = ISC_TF(events[i].filter == EVFILT_READ);
3628258945Sroberto		writable = ISC_TF(events[i].filter == EVFILT_WRITE);
3629258945Sroberto		process_fd(manager, events[i].ident, readable, writable);
3630258945Sroberto	}
3631258945Sroberto
3632280849Scy#ifdef USE_WATCHER_THREAD
3633258945Sroberto	if (have_ctlevent)
3634258945Sroberto		done = process_ctlfd(manager);
3635258945Sroberto#endif
3636258945Sroberto
3637258945Sroberto	return (done);
3638258945Sroberto}
3639258945Sroberto#elif defined(USE_EPOLL)
3640258945Srobertostatic isc_boolean_t
3641280849Scyprocess_fds(isc__socketmgr_t *manager, struct epoll_event *events, int nevents)
3642280849Scy{
3643258945Sroberto	int i;
3644258945Sroberto	isc_boolean_t done = ISC_FALSE;
3645280849Scy#ifdef USE_WATCHER_THREAD
3646258945Sroberto	isc_boolean_t have_ctlevent = ISC_FALSE;
3647258945Sroberto#endif
3648258945Sroberto
3649258945Sroberto	if (nevents == manager->nevents) {
3650258945Sroberto		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3651258945Sroberto			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3652258945Sroberto			    "maximum number of FD events (%d) received",
3653258945Sroberto			    nevents);
3654258945Sroberto	}
3655258945Sroberto
3656258945Sroberto	for (i = 0; i < nevents; i++) {
3657258945Sroberto		REQUIRE(events[i].data.fd < (int)manager->maxsocks);
3658280849Scy#ifdef USE_WATCHER_THREAD
3659258945Sroberto		if (events[i].data.fd == manager->pipe_fds[0]) {
3660258945Sroberto			have_ctlevent = ISC_TRUE;
3661258945Sroberto			continue;
3662258945Sroberto		}
3663258945Sroberto#endif
3664258945Sroberto		if ((events[i].events & EPOLLERR) != 0 ||
3665258945Sroberto		    (events[i].events & EPOLLHUP) != 0) {
3666258945Sroberto			/*
3667258945Sroberto			 * epoll does not set IN/OUT bits on an erroneous
3668258945Sroberto			 * condition, so we need to try both anyway.  This is a
3669258945Sroberto			 * bit inefficient, but should be okay for such rare
3670258945Sroberto			 * events.  Note also that the read or write attempt
3671258945Sroberto			 * won't block because we use non-blocking sockets.
3672258945Sroberto			 */
3673258945Sroberto			events[i].events |= (EPOLLIN | EPOLLOUT);
3674258945Sroberto		}
3675258945Sroberto		process_fd(manager, events[i].data.fd,
3676258945Sroberto			   (events[i].events & EPOLLIN) != 0,
3677258945Sroberto			   (events[i].events & EPOLLOUT) != 0);
3678258945Sroberto	}
3679258945Sroberto
3680280849Scy#ifdef USE_WATCHER_THREAD
3681258945Sroberto	if (have_ctlevent)
3682258945Sroberto		done = process_ctlfd(manager);
3683258945Sroberto#endif
3684258945Sroberto
3685258945Sroberto	return (done);
3686258945Sroberto}
3687258945Sroberto#elif defined(USE_DEVPOLL)
3688258945Srobertostatic isc_boolean_t
3689280849Scyprocess_fds(isc__socketmgr_t *manager, struct pollfd *events, int nevents) {
3690258945Sroberto	int i;
3691258945Sroberto	isc_boolean_t done = ISC_FALSE;
3692280849Scy#ifdef USE_WATCHER_THREAD
3693258945Sroberto	isc_boolean_t have_ctlevent = ISC_FALSE;
3694258945Sroberto#endif
3695258945Sroberto
3696258945Sroberto	if (nevents == manager->nevents) {
3697258945Sroberto		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
3698258945Sroberto			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3699258945Sroberto			    "maximum number of FD events (%d) received",
3700258945Sroberto			    nevents);
3701258945Sroberto	}
3702258945Sroberto
3703258945Sroberto	for (i = 0; i < nevents; i++) {
3704258945Sroberto		REQUIRE(events[i].fd < (int)manager->maxsocks);
3705280849Scy#ifdef USE_WATCHER_THREAD
3706258945Sroberto		if (events[i].fd == manager->pipe_fds[0]) {
3707258945Sroberto			have_ctlevent = ISC_TRUE;
3708258945Sroberto			continue;
3709258945Sroberto		}
3710258945Sroberto#endif
3711258945Sroberto		process_fd(manager, events[i].fd,
3712258945Sroberto			   (events[i].events & POLLIN) != 0,
3713258945Sroberto			   (events[i].events & POLLOUT) != 0);
3714258945Sroberto	}
3715258945Sroberto
3716280849Scy#ifdef USE_WATCHER_THREAD
3717258945Sroberto	if (have_ctlevent)
3718258945Sroberto		done = process_ctlfd(manager);
3719258945Sroberto#endif
3720258945Sroberto
3721258945Sroberto	return (done);
3722258945Sroberto}
3723258945Sroberto#elif defined(USE_SELECT)
3724258945Srobertostatic void
3725280849Scyprocess_fds(isc__socketmgr_t *manager, int maxfd, fd_set *readfds,
3726280849Scy	    fd_set *writefds)
3727258945Sroberto{
3728258945Sroberto	int i;
3729258945Sroberto
3730258945Sroberto	REQUIRE(maxfd <= (int)manager->maxsocks);
3731258945Sroberto
3732258945Sroberto	for (i = 0; i < maxfd; i++) {
3733280849Scy#ifdef USE_WATCHER_THREAD
3734258945Sroberto		if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
3735258945Sroberto			continue;
3736280849Scy#endif /* USE_WATCHER_THREAD */
3737258945Sroberto		process_fd(manager, i, FD_ISSET(i, readfds),
3738258945Sroberto			   FD_ISSET(i, writefds));
3739258945Sroberto	}
3740258945Sroberto}
3741258945Sroberto#endif
3742258945Sroberto
3743280849Scy#ifdef USE_WATCHER_THREAD
3744258945Srobertostatic isc_boolean_t
3745280849Scyprocess_ctlfd(isc__socketmgr_t *manager) {
3746258945Sroberto	int msg, fd;
3747258945Sroberto
3748258945Sroberto	for (;;) {
3749258945Sroberto		select_readmsg(manager, &fd, &msg);
3750258945Sroberto
3751258945Sroberto		manager_log(manager, IOEVENT,
3752258945Sroberto			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
3753258945Sroberto					   ISC_MSG_WATCHERMSG,
3754258945Sroberto					   "watcher got message %d "
3755258945Sroberto					   "for socket %d"), msg, fd);
3756258945Sroberto
3757258945Sroberto		/*
3758258945Sroberto		 * Nothing to read?
3759258945Sroberto		 */
3760258945Sroberto		if (msg == SELECT_POKE_NOTHING)
3761258945Sroberto			break;
3762258945Sroberto
3763258945Sroberto		/*
3764258945Sroberto		 * Handle shutdown message.  We really should
3765258945Sroberto		 * jump out of this loop right away, but
3766258945Sroberto		 * it doesn't matter if we have to do a little
3767258945Sroberto		 * more work first.
3768258945Sroberto		 */
3769258945Sroberto		if (msg == SELECT_POKE_SHUTDOWN)
3770258945Sroberto			return (ISC_TRUE);
3771258945Sroberto
3772258945Sroberto		/*
3773258945Sroberto		 * This is a wakeup on a socket.  Look
3774258945Sroberto		 * at the event queue for both read and write,
3775258945Sroberto		 * and decide if we need to watch on it now
3776258945Sroberto		 * or not.
3777258945Sroberto		 */
3778258945Sroberto		wakeup_socket(manager, fd, msg);
3779258945Sroberto	}
3780258945Sroberto
3781258945Sroberto	return (ISC_FALSE);
3782258945Sroberto}
3783258945Sroberto
3784258945Sroberto/*
3785258945Sroberto * This is the thread that will loop forever, always in a select or poll
3786258945Sroberto * call.
3787258945Sroberto *
3788258945Sroberto * When select returns something to do, track down what thread gets to do
3789258945Sroberto * this I/O and post the event to it.
3790258945Sroberto */
3791258945Srobertostatic isc_threadresult_t
3792258945Srobertowatcher(void *uap) {
3793280849Scy	isc__socketmgr_t *manager = uap;
3794258945Sroberto	isc_boolean_t done;
3795258945Sroberto	int cc;
3796258945Sroberto#ifdef USE_KQUEUE
3797258945Sroberto	const char *fnname = "kevent()";
3798258945Sroberto#elif defined (USE_EPOLL)
3799258945Sroberto	const char *fnname = "epoll_wait()";
3800258945Sroberto#elif defined(USE_DEVPOLL)
3801258945Sroberto	const char *fnname = "ioctl(DP_POLL)";
3802258945Sroberto	struct dvpoll dvp;
3803258945Sroberto#elif defined (USE_SELECT)
3804258945Sroberto	const char *fnname = "select()";
3805258945Sroberto	int maxfd;
3806280849Scy	int ctlfd;
3807258945Sroberto#endif
3808258945Sroberto	char strbuf[ISC_STRERRORSIZE];
3809258945Sroberto#ifdef ISC_SOCKET_USE_POLLWATCH
3810258945Sroberto	pollstate_t pollstate = poll_idle;
3811258945Sroberto#endif
3812258945Sroberto
3813280849Scy#if defined (USE_SELECT)
3814258945Sroberto	/*
3815258945Sroberto	 * Get the control fd here.  This will never change.
3816258945Sroberto	 */
3817258945Sroberto	ctlfd = manager->pipe_fds[0];
3818280849Scy#endif
3819258945Sroberto	done = ISC_FALSE;
3820258945Sroberto	while (!done) {
3821258945Sroberto		do {
3822258945Sroberto#ifdef USE_KQUEUE
3823258945Sroberto			cc = kevent(manager->kqueue_fd, NULL, 0,
3824258945Sroberto				    manager->events, manager->nevents, NULL);
3825258945Sroberto#elif defined(USE_EPOLL)
3826258945Sroberto			cc = epoll_wait(manager->epoll_fd, manager->events,
3827258945Sroberto					manager->nevents, -1);
3828258945Sroberto#elif defined(USE_DEVPOLL)
3829258945Sroberto			dvp.dp_fds = manager->events;
3830258945Sroberto			dvp.dp_nfds = manager->nevents;
3831258945Sroberto#ifndef ISC_SOCKET_USE_POLLWATCH
3832258945Sroberto			dvp.dp_timeout = -1;
3833258945Sroberto#else
3834258945Sroberto			if (pollstate == poll_idle)
3835258945Sroberto				dvp.dp_timeout = -1;
3836258945Sroberto			else
3837258945Sroberto				dvp.dp_timeout = ISC_SOCKET_POLLWATCH_TIMEOUT;
3838258945Sroberto#endif	/* ISC_SOCKET_USE_POLLWATCH */
3839258945Sroberto			cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
3840258945Sroberto#elif defined(USE_SELECT)
3841258945Sroberto			LOCK(&manager->lock);
3842258945Sroberto			memcpy(manager->read_fds_copy, manager->read_fds,
3843258945Sroberto			       manager->fd_bufsize);
3844258945Sroberto			memcpy(manager->write_fds_copy, manager->write_fds,
3845258945Sroberto			       manager->fd_bufsize);
3846258945Sroberto			maxfd = manager->maxfd + 1;
3847258945Sroberto			UNLOCK(&manager->lock);
3848258945Sroberto
3849258945Sroberto			cc = select(maxfd, manager->read_fds_copy,
3850258945Sroberto				    manager->write_fds_copy, NULL, NULL);
3851258945Sroberto#endif	/* USE_KQUEUE */
3852258945Sroberto
3853258945Sroberto			if (cc < 0 && !SOFT_ERROR(errno)) {
3854258945Sroberto				isc__strerror(errno, strbuf, sizeof(strbuf));
3855258945Sroberto				FATAL_ERROR(__FILE__, __LINE__,
3856258945Sroberto					    "%s %s: %s", fnname,
3857258945Sroberto					    isc_msgcat_get(isc_msgcat,
3858258945Sroberto							   ISC_MSGSET_GENERAL,
3859258945Sroberto							   ISC_MSG_FAILED,
3860258945Sroberto							   "failed"), strbuf);
3861258945Sroberto			}
3862258945Sroberto
3863258945Sroberto#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
3864258945Sroberto			if (cc == 0) {
3865258945Sroberto				if (pollstate == poll_active)
3866258945Sroberto					pollstate = poll_checking;
3867258945Sroberto				else if (pollstate == poll_checking)
3868258945Sroberto					pollstate = poll_idle;
3869258945Sroberto			} else if (cc > 0) {
3870258945Sroberto				if (pollstate == poll_checking) {
3871258945Sroberto					/*
3872258945Sroberto					 * XXX: We'd like to use a more
3873258945Sroberto					 * verbose log level as it's actually an
3874258945Sroberto					 * unexpected event, but the kernel bug
3875258945Sroberto					 * reportedly happens pretty frequently
3876258945Sroberto					 * (and it can also be a false positive)
3877258945Sroberto					 * so it would be just too noisy.
3878258945Sroberto					 */
3879258945Sroberto					manager_log(manager,
3880258945Sroberto						    ISC_LOGCATEGORY_GENERAL,
3881258945Sroberto						    ISC_LOGMODULE_SOCKET,
3882258945Sroberto						    ISC_LOG_DEBUG(1),
3883258945Sroberto						    "unexpected POLL timeout");
3884258945Sroberto				}
3885258945Sroberto				pollstate = poll_active;
3886258945Sroberto			}
3887258945Sroberto#endif
3888258945Sroberto		} while (cc < 0);
3889258945Sroberto
3890258945Sroberto#if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL)
3891258945Sroberto		done = process_fds(manager, manager->events, cc);
3892258945Sroberto#elif defined(USE_SELECT)
3893258945Sroberto		process_fds(manager, maxfd, manager->read_fds_copy,
3894258945Sroberto			    manager->write_fds_copy);
3895258945Sroberto
3896258945Sroberto		/*
3897258945Sroberto		 * Process reads on internal, control fd.
3898258945Sroberto		 */
3899258945Sroberto		if (FD_ISSET(ctlfd, manager->read_fds_copy))
3900258945Sroberto			done = process_ctlfd(manager);
3901258945Sroberto#endif
3902258945Sroberto	}
3903258945Sroberto
3904258945Sroberto	manager_log(manager, TRACE, "%s",
3905258945Sroberto		    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3906258945Sroberto				   ISC_MSG_EXITING, "watcher exiting"));
3907258945Sroberto
3908258945Sroberto	return ((isc_threadresult_t)0);
3909258945Sroberto}
3910280849Scy#endif /* USE_WATCHER_THREAD */
3911258945Sroberto
3912280849Scy#ifdef BIND9
3913280849ScyISC_SOCKETFUNC_SCOPE void
3914280849Scyisc__socketmgr_setreserved(isc_socketmgr_t *manager0, isc_uint32_t reserved) {
3915280849Scy	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
3916258945Sroberto
3917258945Sroberto	REQUIRE(VALID_MANAGER(manager));
3918258945Sroberto
3919258945Sroberto	manager->reserved = reserved;
3920258945Sroberto}
3921258945Sroberto
3922280849ScyISC_SOCKETFUNC_SCOPE void
3923280849Scyisc___socketmgr_maxudp(isc_socketmgr_t *manager0, int maxudp) {
3924280849Scy	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
3925280849Scy
3926280849Scy	REQUIRE(VALID_MANAGER(manager));
3927280849Scy
3928280849Scy	manager->maxudp = maxudp;
3929280849Scy}
3930280849Scy#endif	/* BIND9 */
3931280849Scy
3932258945Sroberto/*
3933258945Sroberto * Create a new socket manager.
3934258945Sroberto */
3935258945Sroberto
3936258945Srobertostatic isc_result_t
3937280849Scysetup_watcher(isc_mem_t *mctx, isc__socketmgr_t *manager) {
3938258945Sroberto	isc_result_t result;
3939258945Sroberto#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
3940258945Sroberto	char strbuf[ISC_STRERRORSIZE];
3941258945Sroberto#endif
3942258945Sroberto
3943258945Sroberto#ifdef USE_KQUEUE
3944258945Sroberto	manager->nevents = ISC_SOCKET_MAXEVENTS;
3945258945Sroberto	manager->events = isc_mem_get(mctx, sizeof(struct kevent) *
3946258945Sroberto				      manager->nevents);
3947258945Sroberto	if (manager->events == NULL)
3948258945Sroberto		return (ISC_R_NOMEMORY);
3949258945Sroberto	manager->kqueue_fd = kqueue();
3950258945Sroberto	if (manager->kqueue_fd == -1) {
3951258945Sroberto		result = isc__errno2result(errno);
3952258945Sroberto		isc__strerror(errno, strbuf, sizeof(strbuf));
3953258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__,
3954258945Sroberto				 "kqueue %s: %s",
3955258945Sroberto				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3956258945Sroberto						ISC_MSG_FAILED, "failed"),
3957258945Sroberto				 strbuf);
3958258945Sroberto		isc_mem_put(mctx, manager->events,
3959258945Sroberto			    sizeof(struct kevent) * manager->nevents);
3960258945Sroberto		return (result);
3961258945Sroberto	}
3962258945Sroberto
3963280849Scy#ifdef USE_WATCHER_THREAD
3964258945Sroberto	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3965258945Sroberto	if (result != ISC_R_SUCCESS) {
3966258945Sroberto		close(manager->kqueue_fd);
3967258945Sroberto		isc_mem_put(mctx, manager->events,
3968258945Sroberto			    sizeof(struct kevent) * manager->nevents);
3969258945Sroberto		return (result);
3970258945Sroberto	}
3971280849Scy#endif	/* USE_WATCHER_THREAD */
3972258945Sroberto#elif defined(USE_EPOLL)
3973258945Sroberto	manager->nevents = ISC_SOCKET_MAXEVENTS;
3974258945Sroberto	manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) *
3975258945Sroberto				      manager->nevents);
3976258945Sroberto	if (manager->events == NULL)
3977258945Sroberto		return (ISC_R_NOMEMORY);
3978258945Sroberto	manager->epoll_fd = epoll_create(manager->nevents);
3979258945Sroberto	if (manager->epoll_fd == -1) {
3980258945Sroberto		result = isc__errno2result(errno);
3981258945Sroberto		isc__strerror(errno, strbuf, sizeof(strbuf));
3982258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__,
3983258945Sroberto				 "epoll_create %s: %s",
3984258945Sroberto				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
3985258945Sroberto						ISC_MSG_FAILED, "failed"),
3986258945Sroberto				 strbuf);
3987258945Sroberto		isc_mem_put(mctx, manager->events,
3988258945Sroberto			    sizeof(struct epoll_event) * manager->nevents);
3989258945Sroberto		return (result);
3990258945Sroberto	}
3991280849Scy#ifdef USE_WATCHER_THREAD
3992258945Sroberto	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
3993258945Sroberto	if (result != ISC_R_SUCCESS) {
3994258945Sroberto		close(manager->epoll_fd);
3995258945Sroberto		isc_mem_put(mctx, manager->events,
3996258945Sroberto			    sizeof(struct epoll_event) * manager->nevents);
3997258945Sroberto		return (result);
3998258945Sroberto	}
3999280849Scy#endif	/* USE_WATCHER_THREAD */
4000258945Sroberto#elif defined(USE_DEVPOLL)
4001258945Sroberto	/*
4002258945Sroberto	 * XXXJT: /dev/poll seems to reject large numbers of events,
4003258945Sroberto	 * so we should be careful about redefining ISC_SOCKET_MAXEVENTS.
4004258945Sroberto	 */
4005258945Sroberto	manager->nevents = ISC_SOCKET_MAXEVENTS;
4006258945Sroberto	manager->events = isc_mem_get(mctx, sizeof(struct pollfd) *
4007258945Sroberto				      manager->nevents);
4008258945Sroberto	if (manager->events == NULL)
4009258945Sroberto		return (ISC_R_NOMEMORY);
4010258945Sroberto	/*
4011258945Sroberto	 * Note: fdpollinfo should be able to support all possible FDs, so
4012258945Sroberto	 * it must have maxsocks entries (not nevents).
4013258945Sroberto	 */
4014258945Sroberto	manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) *
4015258945Sroberto					  manager->maxsocks);
4016258945Sroberto	if (manager->fdpollinfo == NULL) {
4017258945Sroberto		isc_mem_put(mctx, manager->events,
4018280849Scy			    sizeof(struct pollfd) * manager->nevents);
4019258945Sroberto		return (ISC_R_NOMEMORY);
4020258945Sroberto	}
4021258945Sroberto	memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks);
4022258945Sroberto	manager->devpoll_fd = open("/dev/poll", O_RDWR);
4023258945Sroberto	if (manager->devpoll_fd == -1) {
4024258945Sroberto		result = isc__errno2result(errno);
4025258945Sroberto		isc__strerror(errno, strbuf, sizeof(strbuf));
4026258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__,
4027258945Sroberto				 "open(/dev/poll) %s: %s",
4028258945Sroberto				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4029258945Sroberto						ISC_MSG_FAILED, "failed"),
4030258945Sroberto				 strbuf);
4031258945Sroberto		isc_mem_put(mctx, manager->events,
4032258945Sroberto			    sizeof(struct pollfd) * manager->nevents);
4033258945Sroberto		isc_mem_put(mctx, manager->fdpollinfo,
4034258945Sroberto			    sizeof(pollinfo_t) * manager->maxsocks);
4035258945Sroberto		return (result);
4036258945Sroberto	}
4037280849Scy#ifdef USE_WATCHER_THREAD
4038258945Sroberto	result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
4039258945Sroberto	if (result != ISC_R_SUCCESS) {
4040258945Sroberto		close(manager->devpoll_fd);
4041258945Sroberto		isc_mem_put(mctx, manager->events,
4042258945Sroberto			    sizeof(struct pollfd) * manager->nevents);
4043258945Sroberto		isc_mem_put(mctx, manager->fdpollinfo,
4044258945Sroberto			    sizeof(pollinfo_t) * manager->maxsocks);
4045258945Sroberto		return (result);
4046258945Sroberto	}
4047280849Scy#endif	/* USE_WATCHER_THREAD */
4048258945Sroberto#elif defined(USE_SELECT)
4049258945Sroberto	UNUSED(result);
4050258945Sroberto
4051258945Sroberto#if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
4052258945Sroberto	/*
4053258945Sroberto	 * Note: this code should also cover the case of MAXSOCKETS <=
4054258945Sroberto	 * FD_SETSIZE, but we separate the cases to avoid possible portability
4055258945Sroberto	 * issues regarding howmany() and the actual representation of fd_set.
4056258945Sroberto	 */
4057258945Sroberto	manager->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
4058258945Sroberto		sizeof(fd_mask);
4059258945Sroberto#else
4060258945Sroberto	manager->fd_bufsize = sizeof(fd_set);
4061258945Sroberto#endif
4062258945Sroberto
4063258945Sroberto	manager->read_fds = NULL;
4064258945Sroberto	manager->read_fds_copy = NULL;
4065258945Sroberto	manager->write_fds = NULL;
4066258945Sroberto	manager->write_fds_copy = NULL;
4067258945Sroberto
4068258945Sroberto	manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize);
4069258945Sroberto	if (manager->read_fds != NULL)
4070258945Sroberto		manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
4071258945Sroberto	if (manager->read_fds_copy != NULL)
4072258945Sroberto		manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize);
4073258945Sroberto	if (manager->write_fds != NULL) {
4074258945Sroberto		manager->write_fds_copy = isc_mem_get(mctx,
4075258945Sroberto						      manager->fd_bufsize);
4076258945Sroberto	}
4077258945Sroberto	if (manager->write_fds_copy == NULL) {
4078258945Sroberto		if (manager->write_fds != NULL) {
4079258945Sroberto			isc_mem_put(mctx, manager->write_fds,
4080258945Sroberto				    manager->fd_bufsize);
4081258945Sroberto		}
4082258945Sroberto		if (manager->read_fds_copy != NULL) {
4083258945Sroberto			isc_mem_put(mctx, manager->read_fds_copy,
4084258945Sroberto				    manager->fd_bufsize);
4085258945Sroberto		}
4086258945Sroberto		if (manager->read_fds != NULL) {
4087258945Sroberto			isc_mem_put(mctx, manager->read_fds,
4088258945Sroberto				    manager->fd_bufsize);
4089258945Sroberto		}
4090258945Sroberto		return (ISC_R_NOMEMORY);
4091258945Sroberto	}
4092258945Sroberto	memset(manager->read_fds, 0, manager->fd_bufsize);
4093258945Sroberto	memset(manager->write_fds, 0, manager->fd_bufsize);
4094258945Sroberto
4095280849Scy#ifdef USE_WATCHER_THREAD
4096258945Sroberto	(void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
4097258945Sroberto	manager->maxfd = manager->pipe_fds[0];
4098280849Scy#else /* USE_WATCHER_THREAD */
4099258945Sroberto	manager->maxfd = 0;
4100280849Scy#endif /* USE_WATCHER_THREAD */
4101258945Sroberto#endif	/* USE_KQUEUE */
4102258945Sroberto
4103258945Sroberto	return (ISC_R_SUCCESS);
4104258945Sroberto}
4105258945Sroberto
4106258945Srobertostatic void
4107280849Scycleanup_watcher(isc_mem_t *mctx, isc__socketmgr_t *manager) {
4108280849Scy#ifdef USE_WATCHER_THREAD
4109258945Sroberto	isc_result_t result;
4110258945Sroberto
4111258945Sroberto	result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
4112258945Sroberto	if (result != ISC_R_SUCCESS) {
4113258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__,
4114258945Sroberto				 "epoll_ctl(DEL) %s",
4115258945Sroberto				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4116258945Sroberto						ISC_MSG_FAILED, "failed"));
4117258945Sroberto	}
4118280849Scy#endif	/* USE_WATCHER_THREAD */
4119258945Sroberto
4120258945Sroberto#ifdef USE_KQUEUE
4121258945Sroberto	close(manager->kqueue_fd);
4122258945Sroberto	isc_mem_put(mctx, manager->events,
4123258945Sroberto		    sizeof(struct kevent) * manager->nevents);
4124258945Sroberto#elif defined(USE_EPOLL)
4125258945Sroberto	close(manager->epoll_fd);
4126258945Sroberto	isc_mem_put(mctx, manager->events,
4127258945Sroberto		    sizeof(struct epoll_event) * manager->nevents);
4128258945Sroberto#elif defined(USE_DEVPOLL)
4129258945Sroberto	close(manager->devpoll_fd);
4130258945Sroberto	isc_mem_put(mctx, manager->events,
4131258945Sroberto		    sizeof(struct pollfd) * manager->nevents);
4132258945Sroberto	isc_mem_put(mctx, manager->fdpollinfo,
4133258945Sroberto		    sizeof(pollinfo_t) * manager->maxsocks);
4134258945Sroberto#elif defined(USE_SELECT)
4135258945Sroberto	if (manager->read_fds != NULL)
4136258945Sroberto		isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize);
4137258945Sroberto	if (manager->read_fds_copy != NULL)
4138258945Sroberto		isc_mem_put(mctx, manager->read_fds_copy, manager->fd_bufsize);
4139258945Sroberto	if (manager->write_fds != NULL)
4140258945Sroberto		isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize);
4141258945Sroberto	if (manager->write_fds_copy != NULL)
4142258945Sroberto		isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize);
4143258945Sroberto#endif	/* USE_KQUEUE */
4144258945Sroberto}
4145258945Sroberto
4146280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
4147280849Scyisc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
4148280849Scy	return (isc__socketmgr_create2(mctx, managerp, 0));
4149258945Sroberto}
4150258945Sroberto
4151280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
4152280849Scyisc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
4153280849Scy		       unsigned int maxsocks)
4154258945Sroberto{
4155258945Sroberto	int i;
4156280849Scy	isc__socketmgr_t *manager;
4157280849Scy#ifdef USE_WATCHER_THREAD
4158258945Sroberto	char strbuf[ISC_STRERRORSIZE];
4159258945Sroberto#endif
4160258945Sroberto	isc_result_t result;
4161258945Sroberto
4162258945Sroberto	REQUIRE(managerp != NULL && *managerp == NULL);
4163258945Sroberto
4164280849Scy#ifdef USE_SHARED_MANAGER
4165258945Sroberto	if (socketmgr != NULL) {
4166258945Sroberto		/* Don't allow maxsocks to be updated */
4167258945Sroberto		if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
4168258945Sroberto			return (ISC_R_EXISTS);
4169258945Sroberto
4170258945Sroberto		socketmgr->refs++;
4171280849Scy		*managerp = (isc_socketmgr_t *)socketmgr;
4172258945Sroberto		return (ISC_R_SUCCESS);
4173258945Sroberto	}
4174280849Scy#endif /* USE_SHARED_MANAGER */
4175258945Sroberto
4176258945Sroberto	if (maxsocks == 0)
4177258945Sroberto		maxsocks = ISC_SOCKET_MAXSOCKETS;
4178258945Sroberto
4179258945Sroberto	manager = isc_mem_get(mctx, sizeof(*manager));
4180258945Sroberto	if (manager == NULL)
4181258945Sroberto		return (ISC_R_NOMEMORY);
4182258945Sroberto
4183258945Sroberto	/* zero-clear so that necessary cleanup on failure will be easy */
4184258945Sroberto	memset(manager, 0, sizeof(*manager));
4185258945Sroberto	manager->maxsocks = maxsocks;
4186258945Sroberto	manager->reserved = 0;
4187280849Scy	manager->maxudp = 0;
4188258945Sroberto	manager->fds = isc_mem_get(mctx,
4189280849Scy				   manager->maxsocks * sizeof(isc__socket_t *));
4190258945Sroberto	if (manager->fds == NULL) {
4191258945Sroberto		result = ISC_R_NOMEMORY;
4192258945Sroberto		goto free_manager;
4193258945Sroberto	}
4194258945Sroberto	manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int));
4195258945Sroberto	if (manager->fdstate == NULL) {
4196258945Sroberto		result = ISC_R_NOMEMORY;
4197258945Sroberto		goto free_manager;
4198258945Sroberto	}
4199258945Sroberto	manager->stats = NULL;
4200258945Sroberto
4201280849Scy	manager->common.methods = &socketmgrmethods;
4202280849Scy	manager->common.magic = ISCAPI_SOCKETMGR_MAGIC;
4203280849Scy	manager->common.impmagic = SOCKET_MANAGER_MAGIC;
4204258945Sroberto	manager->mctx = NULL;
4205258945Sroberto	memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
4206258945Sroberto	ISC_LIST_INIT(manager->socklist);
4207258945Sroberto	result = isc_mutex_init(&manager->lock);
4208258945Sroberto	if (result != ISC_R_SUCCESS)
4209258945Sroberto		goto free_manager;
4210258945Sroberto	manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t));
4211258945Sroberto	if (manager->fdlock == NULL) {
4212258945Sroberto		result = ISC_R_NOMEMORY;
4213258945Sroberto		goto cleanup_lock;
4214258945Sroberto	}
4215258945Sroberto	for (i = 0; i < FDLOCK_COUNT; i++) {
4216258945Sroberto		result = isc_mutex_init(&manager->fdlock[i]);
4217258945Sroberto		if (result != ISC_R_SUCCESS) {
4218258945Sroberto			while (--i >= 0)
4219258945Sroberto				DESTROYLOCK(&manager->fdlock[i]);
4220258945Sroberto			isc_mem_put(mctx, manager->fdlock,
4221258945Sroberto				    FDLOCK_COUNT * sizeof(isc_mutex_t));
4222258945Sroberto			manager->fdlock = NULL;
4223258945Sroberto			goto cleanup_lock;
4224258945Sroberto		}
4225258945Sroberto	}
4226258945Sroberto
4227280849Scy#ifdef USE_WATCHER_THREAD
4228258945Sroberto	if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
4229258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__,
4230258945Sroberto				 "isc_condition_init() %s",
4231258945Sroberto				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4232258945Sroberto						ISC_MSG_FAILED, "failed"));
4233258945Sroberto		result = ISC_R_UNEXPECTED;
4234258945Sroberto		goto cleanup_lock;
4235258945Sroberto	}
4236258945Sroberto
4237258945Sroberto	/*
4238258945Sroberto	 * Create the special fds that will be used to wake up the
4239258945Sroberto	 * select/poll loop when something internal needs to be done.
4240258945Sroberto	 */
4241258945Sroberto	if (pipe(manager->pipe_fds) != 0) {
4242258945Sroberto		isc__strerror(errno, strbuf, sizeof(strbuf));
4243258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__,
4244258945Sroberto				 "pipe() %s: %s",
4245258945Sroberto				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4246258945Sroberto						ISC_MSG_FAILED, "failed"),
4247258945Sroberto				 strbuf);
4248258945Sroberto		result = ISC_R_UNEXPECTED;
4249258945Sroberto		goto cleanup_condition;
4250258945Sroberto	}
4251258945Sroberto
4252258945Sroberto	RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
4253258945Sroberto#if 0
4254258945Sroberto	RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
4255258945Sroberto#endif
4256280849Scy#endif	/* USE_WATCHER_THREAD */
4257280849Scy
4258280849Scy#ifdef USE_SHARED_MANAGER
4259258945Sroberto	manager->refs = 1;
4260280849Scy#endif /* USE_SHARED_MANAGER */
4261258945Sroberto
4262258945Sroberto	/*
4263258945Sroberto	 * Set up initial state for the select loop
4264258945Sroberto	 */
4265258945Sroberto	result = setup_watcher(mctx, manager);
4266258945Sroberto	if (result != ISC_R_SUCCESS)
4267258945Sroberto		goto cleanup;
4268258945Sroberto	memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
4269280849Scy#ifdef USE_WATCHER_THREAD
4270258945Sroberto	/*
4271258945Sroberto	 * Start up the select/poll thread.
4272258945Sroberto	 */
4273258945Sroberto	if (isc_thread_create(watcher, manager, &manager->watcher) !=
4274258945Sroberto	    ISC_R_SUCCESS) {
4275258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__,
4276258945Sroberto				 "isc_thread_create() %s",
4277258945Sroberto				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
4278258945Sroberto						ISC_MSG_FAILED, "failed"));
4279258945Sroberto		cleanup_watcher(mctx, manager);
4280258945Sroberto		result = ISC_R_UNEXPECTED;
4281258945Sroberto		goto cleanup;
4282258945Sroberto	}
4283280849Scy#endif /* USE_WATCHER_THREAD */
4284258945Sroberto	isc_mem_attach(mctx, &manager->mctx);
4285258945Sroberto
4286280849Scy#ifdef USE_SHARED_MANAGER
4287258945Sroberto	socketmgr = manager;
4288280849Scy#endif /* USE_SHARED_MANAGER */
4289280849Scy	*managerp = (isc_socketmgr_t *)manager;
4290258945Sroberto
4291258945Sroberto	return (ISC_R_SUCCESS);
4292258945Sroberto
4293258945Srobertocleanup:
4294280849Scy#ifdef USE_WATCHER_THREAD
4295258945Sroberto	(void)close(manager->pipe_fds[0]);
4296258945Sroberto	(void)close(manager->pipe_fds[1]);
4297280849Scy#endif	/* USE_WATCHER_THREAD */
4298258945Sroberto
4299280849Scy#ifdef USE_WATCHER_THREAD
4300258945Srobertocleanup_condition:
4301258945Sroberto	(void)isc_condition_destroy(&manager->shutdown_ok);
4302280849Scy#endif	/* USE_WATCHER_THREAD */
4303258945Sroberto
4304258945Sroberto
4305258945Srobertocleanup_lock:
4306258945Sroberto	if (manager->fdlock != NULL) {
4307258945Sroberto		for (i = 0; i < FDLOCK_COUNT; i++)
4308258945Sroberto			DESTROYLOCK(&manager->fdlock[i]);
4309258945Sroberto	}
4310258945Sroberto	DESTROYLOCK(&manager->lock);
4311258945Sroberto
4312258945Srobertofree_manager:
4313258945Sroberto	if (manager->fdlock != NULL) {
4314258945Sroberto		isc_mem_put(mctx, manager->fdlock,
4315258945Sroberto			    FDLOCK_COUNT * sizeof(isc_mutex_t));
4316258945Sroberto	}
4317258945Sroberto	if (manager->fdstate != NULL) {
4318258945Sroberto		isc_mem_put(mctx, manager->fdstate,
4319258945Sroberto			    manager->maxsocks * sizeof(int));
4320258945Sroberto	}
4321258945Sroberto	if (manager->fds != NULL) {
4322258945Sroberto		isc_mem_put(mctx, manager->fds,
4323258945Sroberto			    manager->maxsocks * sizeof(isc_socket_t *));
4324258945Sroberto	}
4325258945Sroberto	isc_mem_put(mctx, manager, sizeof(*manager));
4326258945Sroberto
4327258945Sroberto	return (result);
4328258945Sroberto}
4329258945Sroberto
4330280849Scy#ifdef BIND9
4331258945Srobertoisc_result_t
4332280849Scyisc__socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp) {
4333280849Scy	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
4334258945Sroberto	REQUIRE(VALID_MANAGER(manager));
4335258945Sroberto	REQUIRE(nsockp != NULL);
4336258945Sroberto
4337258945Sroberto	*nsockp = manager->maxsocks;
4338258945Sroberto
4339258945Sroberto	return (ISC_R_SUCCESS);
4340258945Sroberto}
4341258945Sroberto
4342258945Srobertovoid
4343280849Scyisc__socketmgr_setstats(isc_socketmgr_t *manager0, isc_stats_t *stats) {
4344280849Scy	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
4345280849Scy
4346258945Sroberto	REQUIRE(VALID_MANAGER(manager));
4347258945Sroberto	REQUIRE(ISC_LIST_EMPTY(manager->socklist));
4348258945Sroberto	REQUIRE(manager->stats == NULL);
4349258945Sroberto	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
4350258945Sroberto
4351258945Sroberto	isc_stats_attach(stats, &manager->stats);
4352258945Sroberto}
4353280849Scy#endif
4354258945Sroberto
/*
 * Tear down a socket manager and release everything it owns.
 *
 * 'managerp' must be non-NULL and '*managerp' must be a valid manager;
 * '*managerp' is set to NULL before returning.  When USE_SHARED_MANAGER is
 * defined the manager is reference counted: a call that leaves
 * manager->refs above zero only clears '*managerp' and returns, and the
 * real teardown happens on the call that drops the count to zero.
 *
 * The call does not proceed past the wait loop until the manager's socket
 * list is empty.
 */
ISC_SOCKETFUNC_SCOPE void
isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
	isc__socketmgr_t *manager;
	int i;
	isc_mem_t *mctx;

	/*
	 * Destroy a socket manager.
	 */

	REQUIRE(managerp != NULL);
	manager = (isc__socketmgr_t *)*managerp;
	REQUIRE(VALID_MANAGER(manager));

#ifdef USE_SHARED_MANAGER
	/* Other holders remain: just drop the caller's reference. */
	manager->refs--;
	if (manager->refs > 0) {
		*managerp = NULL;
		return;
	}
	socketmgr = NULL;
#endif /* USE_SHARED_MANAGER */

	LOCK(&manager->lock);

	/*
	 * Wait for all sockets to be destroyed.
	 */
	while (!ISC_LIST_EMPTY(manager->socklist)) {
#ifdef USE_WATCHER_THREAD
		manager_log(manager, CREATION, "%s",
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_SOCKETSREMAIN,
					   "sockets exist"));
		WAIT(&manager->shutdown_ok, &manager->lock);
#else /* USE_WATCHER_THREAD */
		/*
		 * No watcher thread: release the lock and run the task
		 * dispatcher, then re-check the socket list.
		 */
		UNLOCK(&manager->lock);
		isc__taskmgr_dispatch(NULL);
		LOCK(&manager->lock);
#endif /* USE_WATCHER_THREAD */
	}

	UNLOCK(&manager->lock);

	/*
	 * Here, poke our select/poll thread.  Do this by closing the write
	 * half of the pipe, which will send EOF to the read half.
	 * This is currently a no-op in the non-threaded case.
	 */
	select_poke(manager, 0, SELECT_POKE_SHUTDOWN);

#ifdef USE_WATCHER_THREAD
	/*
	 * Wait for thread to exit.
	 */
	if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "isc_thread_join() %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
#endif /* USE_WATCHER_THREAD */

	/*
	 * Clean up.
	 */
	cleanup_watcher(manager->mctx, manager);

#ifdef USE_WATCHER_THREAD
	(void)close(manager->pipe_fds[0]);
	(void)close(manager->pipe_fds[1]);
	(void)isc_condition_destroy(&manager->shutdown_ok);
#endif /* USE_WATCHER_THREAD */

	/* Close every descriptor still marked CLOSE_PENDING. */
	for (i = 0; i < (int)manager->maxsocks; i++)
		if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
			(void)close(i);

	/* Both per-descriptor tables hold maxsocks entries. */
	isc_mem_put(manager->mctx, manager->fds,
		    manager->maxsocks * sizeof(isc__socket_t *));
	isc_mem_put(manager->mctx, manager->fdstate,
		    manager->maxsocks * sizeof(int));

	if (manager->stats != NULL)
		isc_stats_detach(&manager->stats);

	if (manager->fdlock != NULL) {
		for (i = 0; i < FDLOCK_COUNT; i++)
			DESTROYLOCK(&manager->fdlock[i]);
		isc_mem_put(manager->mctx, manager->fdlock,
			    FDLOCK_COUNT * sizeof(isc_mutex_t));
	}
	DESTROYLOCK(&manager->lock);
	/* Invalidate the magic numbers so VALID_MANAGER() no longer passes. */
	manager->common.magic = 0;
	manager->common.impmagic = 0;
	/*
	 * Copy the memory context pointer out first: 'manager' itself is
	 * handed back by the isc_mem_put() below, so manager->mctx cannot
	 * be used for the detach afterwards.
	 */
	mctx= manager->mctx;
	isc_mem_put(mctx, manager, sizeof(*manager));

	isc_mem_detach(&mctx);

	*managerp = NULL;

#ifdef USE_SHARED_MANAGER
	/* Already cleared before the teardown started; repeated here. */
	socketmgr = NULL;
#endif
}
4460258945Sroberto
/*
 * Common back end of the receive entry points: try to complete the read
 * described by 'dev' right away and, if that is not possible, queue it on
 * sock->recv_list.
 *
 * Returns ISC_R_INPROGRESS when the request was queued and
 * ISC_SOCKFLAG_IMMEDIATE is set in 'flags', ISC_R_SUCCESS in every other
 * case.  When ISC_SOCKFLAG_IMMEDIATE is not set, a request that finished
 * immediately (success, EOF or hard error) is reported through
 * send_recvdone_event().
 */
static isc_result_t
socket_recv(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    unsigned int flags)
{
	int io_state;
	isc_boolean_t have_lock = ISC_FALSE;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	/*
	 * UDP sockets attempt the read without taking sock->lock.  All
	 * other types take the lock and only attempt the read when nothing
	 * is queued already; otherwise the request is treated as DOIO_SOFT
	 * and ends up queued behind the existing entries.
	 */
	if (sock->type == isc_sockettype_udp) {
		io_state = doio_recv(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = ISC_TRUE;

		if (ISC_LIST_EMPTY(sock->recv_list))
			io_state = doio_recv(sock, dev);
		else
			io_state = DOIO_SOFT;
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't read all or part of the request right now, so
		 * queue it.
		 *
		 * Attach to socket and to task
		 */
		isc_task_attach(task, &ntask);
		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

		/* The UDP path arrives here without the lock held. */
		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = ISC_TRUE;
		}

		/*
		 * Enqueue the request.  If the socket was previously not being
		 * watched, poke the watcher to start paying attention to it.
		 */
		if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
			select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);

		socket_log(sock, NULL, EVENT, NULL, 0, 0,
			   "socket_recv: event %p -> task %p",
			   dev, ntask);

		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
			result = ISC_R_INPROGRESS;
		break;

	case DOIO_EOF:
		dev->result = ISC_R_EOF;
		/* fallthrough */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		/*
		 * The request is finished.  With ISC_SOCKFLAG_IMMEDIATE set
		 * no completion event is sent from here.
		 */
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
			send_recvdone_event(sock, &dev);
		break;
	}

	if (have_lock)
		UNLOCK(&sock->lock);

	return (result);
}
4532258945Sroberto
4533280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
4534280849Scyisc__socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist,
4535280849Scy		  unsigned int minimum, isc_task_t *task,
4536280849Scy		  isc_taskaction_t action, const void *arg)
4537258945Sroberto{
4538280849Scy	isc__socket_t *sock = (isc__socket_t *)sock0;
4539258945Sroberto	isc_socketevent_t *dev;
4540280849Scy	isc__socketmgr_t *manager;
4541258945Sroberto	unsigned int iocount;
4542258945Sroberto	isc_buffer_t *buffer;
4543258945Sroberto
4544258945Sroberto	REQUIRE(VALID_SOCKET(sock));
4545258945Sroberto	REQUIRE(buflist != NULL);
4546258945Sroberto	REQUIRE(!ISC_LIST_EMPTY(*buflist));
4547258945Sroberto	REQUIRE(task != NULL);
4548258945Sroberto	REQUIRE(action != NULL);
4549258945Sroberto
4550258945Sroberto	manager = sock->manager;
4551258945Sroberto	REQUIRE(VALID_MANAGER(manager));
4552258945Sroberto
4553258945Sroberto	iocount = isc_bufferlist_availablecount(buflist);
4554258945Sroberto	REQUIRE(iocount > 0);
4555258945Sroberto
4556258945Sroberto	INSIST(sock->bound);
4557258945Sroberto
4558258945Sroberto	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
4559280849Scy	if (dev == NULL)
4560258945Sroberto		return (ISC_R_NOMEMORY);
4561258945Sroberto
4562258945Sroberto	/*
4563258945Sroberto	 * UDP sockets are always partial read
4564258945Sroberto	 */
4565258945Sroberto	if (sock->type == isc_sockettype_udp)
4566258945Sroberto		dev->minimum = 1;
4567258945Sroberto	else {
4568258945Sroberto		if (minimum == 0)
4569258945Sroberto			dev->minimum = iocount;
4570258945Sroberto		else
4571258945Sroberto			dev->minimum = minimum;
4572258945Sroberto	}
4573258945Sroberto
4574258945Sroberto	/*
4575258945Sroberto	 * Move each buffer from the passed in list to our internal one.
4576258945Sroberto	 */
4577258945Sroberto	buffer = ISC_LIST_HEAD(*buflist);
4578258945Sroberto	while (buffer != NULL) {
4579258945Sroberto		ISC_LIST_DEQUEUE(*buflist, buffer, link);
4580258945Sroberto		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4581258945Sroberto		buffer = ISC_LIST_HEAD(*buflist);
4582258945Sroberto	}
4583258945Sroberto
4584258945Sroberto	return (socket_recv(sock, dev, task, 0));
4585258945Sroberto}
4586258945Sroberto
4587280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
4588280849Scyisc__socket_recv(isc_socket_t *sock0, isc_region_t *region,
4589280849Scy		 unsigned int minimum, isc_task_t *task,
4590280849Scy		 isc_taskaction_t action, const void *arg)
4591258945Sroberto{
4592280849Scy	isc__socket_t *sock = (isc__socket_t *)sock0;
4593258945Sroberto	isc_socketevent_t *dev;
4594280849Scy	isc__socketmgr_t *manager;
4595258945Sroberto
4596258945Sroberto	REQUIRE(VALID_SOCKET(sock));
4597258945Sroberto	REQUIRE(action != NULL);
4598258945Sroberto
4599258945Sroberto	manager = sock->manager;
4600258945Sroberto	REQUIRE(VALID_MANAGER(manager));
4601258945Sroberto
4602258945Sroberto	INSIST(sock->bound);
4603258945Sroberto
4604258945Sroberto	dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
4605258945Sroberto	if (dev == NULL)
4606258945Sroberto		return (ISC_R_NOMEMORY);
4607258945Sroberto
4608280849Scy	return (isc__socket_recv2(sock0, region, minimum, task, dev, 0));
4609258945Sroberto}
4610258945Sroberto
4611280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
4612280849Scyisc__socket_recv2(isc_socket_t *sock0, isc_region_t *region,
4613280849Scy		  unsigned int minimum, isc_task_t *task,
4614280849Scy		  isc_socketevent_t *event, unsigned int flags)
4615258945Sroberto{
4616280849Scy	isc__socket_t *sock = (isc__socket_t *)sock0;
4617280849Scy
4618258945Sroberto	event->ev_sender = sock;
4619280849Scy	event->result = ISC_R_UNSET;
4620258945Sroberto	ISC_LIST_INIT(event->bufferlist);
4621258945Sroberto	event->region = *region;
4622258945Sroberto	event->n = 0;
4623258945Sroberto	event->offset = 0;
4624258945Sroberto	event->attributes = 0;
4625258945Sroberto
4626258945Sroberto	/*
4627258945Sroberto	 * UDP sockets are always partial read.
4628258945Sroberto	 */
4629258945Sroberto	if (sock->type == isc_sockettype_udp)
4630258945Sroberto		event->minimum = 1;
4631258945Sroberto	else {
4632258945Sroberto		if (minimum == 0)
4633258945Sroberto			event->minimum = region->length;
4634258945Sroberto		else
4635258945Sroberto			event->minimum = minimum;
4636258945Sroberto	}
4637258945Sroberto
4638258945Sroberto	return (socket_recv(sock, event, task, flags));
4639258945Sroberto}
4640258945Sroberto
/*
 * Common back end of the send entry points: try to complete the write
 * described by 'dev' right away and, if that is not possible, queue it on
 * sock->send_list unless ISC_SOCKFLAG_NORETRY is set.
 *
 * 'address' is passed to set_dev_address().  A non-NULL 'pktinfo' is
 * copied into the event and flagged with ISC_SOCKEVENTATTR_PKTINFO.
 *
 * Returns ISC_R_INPROGRESS when the request was queued and
 * ISC_SOCKFLAG_IMMEDIATE is set in 'flags', ISC_R_SUCCESS in every other
 * case.  When ISC_SOCKFLAG_IMMEDIATE is not set, a request that was not
 * queued is reported through send_senddone_event().
 */
static isc_result_t
socket_send(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
	    unsigned int flags)
{
	int io_state;
	isc_boolean_t have_lock = ISC_FALSE;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	set_dev_address(address, sock, dev);
	if (pktinfo != NULL) {
		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
		dev->pktinfo = *pktinfo;

		/*
		 * The caller's interface index is kept only when the
		 * event's address is site-local or link-local.
		 */
		if (!isc_sockaddr_issitelocal(&dev->address) &&
		    !isc_sockaddr_islinklocal(&dev->address)) {
			socket_log(sock, NULL, TRACE, isc_msgcat,
				   ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED,
				   "pktinfo structure provided, ifindex %u "
				   "(set to 0)", pktinfo->ipi6_ifindex);

			/*
			 * Set the pktinfo index to 0 here, to let the
			 * kernel decide what interface it should send on.
			 */
			dev->pktinfo.ipi6_ifindex = 0;
		}
	}

	/*
	 * UDP sockets attempt the write without taking sock->lock.  All
	 * other types take the lock and only attempt the write when nothing
	 * is queued already; otherwise the request is treated as DOIO_SOFT.
	 */
	if (sock->type == isc_sockettype_udp)
		io_state = doio_send(sock, dev);
	else {
		LOCK(&sock->lock);
		have_lock = ISC_TRUE;

		if (ISC_LIST_EMPTY(sock->send_list))
			io_state = doio_send(sock, dev);
		else
			io_state = DOIO_SOFT;
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't send all or part of the request right now, so
		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
		 */
		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
			isc_task_attach(task, &ntask);
			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

			/* The UDP path arrives here without the lock held. */
			if (!have_lock) {
				LOCK(&sock->lock);
				have_lock = ISC_TRUE;
			}

			/*
			 * Enqueue the request.  If the socket was previously
			 * not being watched, poke the watcher to start
			 * paying attention to it.
			 */
			if (ISC_LIST_EMPTY(sock->send_list) &&
			    !sock->pending_send)
				select_poke(sock->manager, sock->fd,
					    SELECT_POKE_WRITE);
			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);

			socket_log(sock, NULL, EVENT, NULL, 0, 0,
				   "socket_send: event %p -> task %p",
				   dev, ntask);

			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
				result = ISC_R_INPROGRESS;
			break;
		}
		/*
		 * FALLTHROUGH: with ISC_SOCKFLAG_NORETRY the request is not
		 * queued and is finished off like the cases below.
		 */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		/*
		 * With ISC_SOCKFLAG_IMMEDIATE set no completion event is
		 * sent from here.
		 */
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
			send_senddone_event(sock, &dev);
		break;
	}

	if (have_lock)
		UNLOCK(&sock->lock);

	return (result);
}
4732258945Sroberto
4733280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
4734280849Scyisc__socket_send(isc_socket_t *sock, isc_region_t *region,
4735280849Scy		 isc_task_t *task, isc_taskaction_t action, const void *arg)
4736258945Sroberto{
4737258945Sroberto	/*
4738258945Sroberto	 * REQUIRE() checking is performed in isc_socket_sendto().
4739258945Sroberto	 */
4740280849Scy	return (isc__socket_sendto(sock, region, task, action, arg, NULL,
4741280849Scy				   NULL));
4742258945Sroberto}
4743258945Sroberto
4744280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
4745280849Scyisc__socket_sendto(isc_socket_t *sock0, isc_region_t *region,
4746280849Scy		   isc_task_t *task, isc_taskaction_t action, const void *arg,
4747280849Scy		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4748258945Sroberto{
4749280849Scy	isc__socket_t *sock = (isc__socket_t *)sock0;
4750258945Sroberto	isc_socketevent_t *dev;
4751280849Scy	isc__socketmgr_t *manager;
4752258945Sroberto
4753258945Sroberto	REQUIRE(VALID_SOCKET(sock));
4754258945Sroberto	REQUIRE(region != NULL);
4755258945Sroberto	REQUIRE(task != NULL);
4756258945Sroberto	REQUIRE(action != NULL);
4757258945Sroberto
4758258945Sroberto	manager = sock->manager;
4759258945Sroberto	REQUIRE(VALID_MANAGER(manager));
4760258945Sroberto
4761258945Sroberto	INSIST(sock->bound);
4762258945Sroberto
4763258945Sroberto	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4764280849Scy	if (dev == NULL)
4765258945Sroberto		return (ISC_R_NOMEMORY);
4766258945Sroberto
4767258945Sroberto	dev->region = *region;
4768258945Sroberto
4769258945Sroberto	return (socket_send(sock, dev, task, address, pktinfo, 0));
4770258945Sroberto}
4771258945Sroberto
4772280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
4773280849Scyisc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
4774280849Scy		  isc_task_t *task, isc_taskaction_t action, const void *arg)
4775258945Sroberto{
4776280849Scy	return (isc__socket_sendtov(sock, buflist, task, action, arg, NULL,
4777280849Scy				    NULL));
4778258945Sroberto}
4779258945Sroberto
4780280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
4781280849Scyisc__socket_sendtov(isc_socket_t *sock0, isc_bufferlist_t *buflist,
4782280849Scy		    isc_task_t *task, isc_taskaction_t action, const void *arg,
4783280849Scy		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
4784258945Sroberto{
4785280849Scy	isc__socket_t *sock = (isc__socket_t *)sock0;
4786258945Sroberto	isc_socketevent_t *dev;
4787280849Scy	isc__socketmgr_t *manager;
4788258945Sroberto	unsigned int iocount;
4789258945Sroberto	isc_buffer_t *buffer;
4790258945Sroberto
4791258945Sroberto	REQUIRE(VALID_SOCKET(sock));
4792258945Sroberto	REQUIRE(buflist != NULL);
4793258945Sroberto	REQUIRE(!ISC_LIST_EMPTY(*buflist));
4794258945Sroberto	REQUIRE(task != NULL);
4795258945Sroberto	REQUIRE(action != NULL);
4796258945Sroberto
4797258945Sroberto	manager = sock->manager;
4798258945Sroberto	REQUIRE(VALID_MANAGER(manager));
4799258945Sroberto
4800258945Sroberto	iocount = isc_bufferlist_usedcount(buflist);
4801258945Sroberto	REQUIRE(iocount > 0);
4802258945Sroberto
4803258945Sroberto	dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
4804280849Scy	if (dev == NULL)
4805258945Sroberto		return (ISC_R_NOMEMORY);
4806258945Sroberto
4807258945Sroberto	/*
4808258945Sroberto	 * Move each buffer from the passed in list to our internal one.
4809258945Sroberto	 */
4810258945Sroberto	buffer = ISC_LIST_HEAD(*buflist);
4811258945Sroberto	while (buffer != NULL) {
4812258945Sroberto		ISC_LIST_DEQUEUE(*buflist, buffer, link);
4813258945Sroberto		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
4814258945Sroberto		buffer = ISC_LIST_HEAD(*buflist);
4815258945Sroberto	}
4816258945Sroberto
4817258945Sroberto	return (socket_send(sock, dev, task, address, pktinfo, 0));
4818258945Sroberto}
4819258945Sroberto
4820280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
4821280849Scyisc__socket_sendto2(isc_socket_t *sock0, isc_region_t *region,
4822280849Scy		    isc_task_t *task,
4823280849Scy		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4824280849Scy		    isc_socketevent_t *event, unsigned int flags)
4825258945Sroberto{
4826280849Scy	isc__socket_t *sock = (isc__socket_t *)sock0;
4827280849Scy
4828280849Scy	REQUIRE(VALID_SOCKET(sock));
4829258945Sroberto	REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
4830258945Sroberto	if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
4831258945Sroberto		REQUIRE(sock->type == isc_sockettype_udp);
4832258945Sroberto	event->ev_sender = sock;
4833280849Scy	event->result = ISC_R_UNSET;
4834258945Sroberto	ISC_LIST_INIT(event->bufferlist);
4835258945Sroberto	event->region = *region;
4836258945Sroberto	event->n = 0;
4837258945Sroberto	event->offset = 0;
4838258945Sroberto	event->attributes = 0;
4839258945Sroberto
4840258945Sroberto	return (socket_send(sock, event, task, address, pktinfo, flags));
4841258945Sroberto}
4842258945Sroberto
4843280849ScyISC_SOCKETFUNC_SCOPE void
4844280849Scyisc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) {
4845258945Sroberto#ifdef ISC_PLATFORM_HAVESYSUNH
4846258945Sroberto	int s;
4847258945Sroberto	struct stat sb;
4848258945Sroberto	char strbuf[ISC_STRERRORSIZE];
4849258945Sroberto
4850258945Sroberto	if (sockaddr->type.sa.sa_family != AF_UNIX)
4851258945Sroberto		return;
4852258945Sroberto
4853258945Sroberto#ifndef S_ISSOCK
4854258945Sroberto#if defined(S_IFMT) && defined(S_IFSOCK)
4855258945Sroberto#define S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK)
4856258945Sroberto#elif defined(_S_IFMT) && defined(S_IFSOCK)
4857258945Sroberto#define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK)
4858258945Sroberto#endif
4859258945Sroberto#endif
4860258945Sroberto
4861258945Sroberto#ifndef S_ISFIFO
4862258945Sroberto#if defined(S_IFMT) && defined(S_IFIFO)
4863258945Sroberto#define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO)
4864258945Sroberto#elif defined(_S_IFMT) && defined(S_IFIFO)
4865258945Sroberto#define S_ISFIFO(mode) ((mode & _S_IFMT)==S_IFIFO)
4866258945Sroberto#endif
4867258945Sroberto#endif
4868258945Sroberto
4869258945Sroberto#if !defined(S_ISFIFO) && !defined(S_ISSOCK)
4870258945Sroberto#error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
4871258945Sroberto#endif
4872258945Sroberto
4873258945Sroberto#ifndef S_ISFIFO
4874258945Sroberto#define S_ISFIFO(mode) 0
4875258945Sroberto#endif
4876258945Sroberto
4877258945Sroberto#ifndef S_ISSOCK
4878258945Sroberto#define S_ISSOCK(mode) 0
4879258945Sroberto#endif
4880258945Sroberto
4881258945Sroberto	if (active) {
4882258945Sroberto		if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
4883258945Sroberto			isc__strerror(errno, strbuf, sizeof(strbuf));
4884258945Sroberto			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4885258945Sroberto				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4886258945Sroberto				      "isc_socket_cleanunix: stat(%s): %s",
4887258945Sroberto				      sockaddr->type.sunix.sun_path, strbuf);
4888258945Sroberto			return;
4889258945Sroberto		}
4890258945Sroberto		if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
4891258945Sroberto			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4892258945Sroberto				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4893258945Sroberto				      "isc_socket_cleanunix: %s: not a socket",
4894258945Sroberto				      sockaddr->type.sunix.sun_path);
4895258945Sroberto			return;
4896258945Sroberto		}
4897258945Sroberto		if (unlink(sockaddr->type.sunix.sun_path) < 0) {
4898258945Sroberto			isc__strerror(errno, strbuf, sizeof(strbuf));
4899258945Sroberto			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4900258945Sroberto				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4901258945Sroberto				      "isc_socket_cleanunix: unlink(%s): %s",
4902258945Sroberto				      sockaddr->type.sunix.sun_path, strbuf);
4903258945Sroberto		}
4904258945Sroberto		return;
4905258945Sroberto	}
4906258945Sroberto
4907258945Sroberto	s = socket(AF_UNIX, SOCK_STREAM, 0);
4908258945Sroberto	if (s < 0) {
4909258945Sroberto		isc__strerror(errno, strbuf, sizeof(strbuf));
4910258945Sroberto		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4911258945Sroberto			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4912258945Sroberto			      "isc_socket_cleanunix: socket(%s): %s",
4913258945Sroberto			      sockaddr->type.sunix.sun_path, strbuf);
4914258945Sroberto		return;
4915258945Sroberto	}
4916258945Sroberto
4917258945Sroberto	if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
4918258945Sroberto		switch (errno) {
4919258945Sroberto		case ENOENT:    /* We exited cleanly last time */
4920258945Sroberto			break;
4921258945Sroberto		default:
4922258945Sroberto			isc__strerror(errno, strbuf, sizeof(strbuf));
4923258945Sroberto			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4924258945Sroberto				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4925258945Sroberto				      "isc_socket_cleanunix: stat(%s): %s",
4926258945Sroberto				      sockaddr->type.sunix.sun_path, strbuf);
4927258945Sroberto			break;
4928258945Sroberto		}
4929258945Sroberto		goto cleanup;
4930258945Sroberto	}
4931258945Sroberto
4932258945Sroberto	if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
4933258945Sroberto		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4934258945Sroberto			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4935258945Sroberto			      "isc_socket_cleanunix: %s: not a socket",
4936258945Sroberto			      sockaddr->type.sunix.sun_path);
4937258945Sroberto		goto cleanup;
4938258945Sroberto	}
4939258945Sroberto
4940258945Sroberto	if (connect(s, (struct sockaddr *)&sockaddr->type.sunix,
4941258945Sroberto		    sizeof(sockaddr->type.sunix)) < 0) {
4942258945Sroberto		switch (errno) {
4943258945Sroberto		case ECONNREFUSED:
4944258945Sroberto		case ECONNRESET:
4945258945Sroberto			if (unlink(sockaddr->type.sunix.sun_path) < 0) {
4946258945Sroberto				isc__strerror(errno, strbuf, sizeof(strbuf));
4947258945Sroberto				isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4948258945Sroberto					      ISC_LOGMODULE_SOCKET,
4949258945Sroberto					      ISC_LOG_WARNING,
4950258945Sroberto					      "isc_socket_cleanunix: "
4951258945Sroberto					      "unlink(%s): %s",
4952258945Sroberto					      sockaddr->type.sunix.sun_path,
4953258945Sroberto					      strbuf);
4954258945Sroberto			}
4955258945Sroberto			break;
4956258945Sroberto		default:
4957258945Sroberto			isc__strerror(errno, strbuf, sizeof(strbuf));
4958258945Sroberto			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4959258945Sroberto				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4960258945Sroberto				      "isc_socket_cleanunix: connect(%s): %s",
4961258945Sroberto				      sockaddr->type.sunix.sun_path, strbuf);
4962258945Sroberto			break;
4963258945Sroberto		}
4964258945Sroberto	}
4965258945Sroberto cleanup:
4966258945Sroberto	close(s);
4967258945Sroberto#else
4968258945Sroberto	UNUSED(sockaddr);
4969258945Sroberto	UNUSED(active);
4970258945Sroberto#endif
4971258945Sroberto}
4972258945Sroberto
4973280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
4974280849Scyisc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
4975258945Sroberto		    isc_uint32_t owner, isc_uint32_t group)
4976258945Sroberto{
4977258945Sroberto#ifdef ISC_PLATFORM_HAVESYSUNH
4978258945Sroberto	isc_result_t result = ISC_R_SUCCESS;
4979258945Sroberto	char strbuf[ISC_STRERRORSIZE];
4980258945Sroberto	char path[sizeof(sockaddr->type.sunix.sun_path)];
4981258945Sroberto#ifdef NEED_SECURE_DIRECTORY
4982258945Sroberto	char *slash;
4983258945Sroberto#endif
4984258945Sroberto
4985258945Sroberto	REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
4986258945Sroberto	INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
4987258945Sroberto	strcpy(path, sockaddr->type.sunix.sun_path);
4988258945Sroberto
4989258945Sroberto#ifdef NEED_SECURE_DIRECTORY
4990258945Sroberto	slash = strrchr(path, '/');
4991258945Sroberto	if (slash != NULL) {
4992258945Sroberto		if (slash != path)
4993258945Sroberto			*slash = '\0';
4994258945Sroberto		else
4995258945Sroberto			strcpy(path, "/");
4996258945Sroberto	} else
4997258945Sroberto		strcpy(path, ".");
4998258945Sroberto#endif
4999258945Sroberto
5000258945Sroberto	if (chmod(path, perm) < 0) {
5001258945Sroberto		isc__strerror(errno, strbuf, sizeof(strbuf));
5002258945Sroberto		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
5003258945Sroberto			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
5004258945Sroberto			      "isc_socket_permunix: chmod(%s, %d): %s",
5005258945Sroberto			      path, perm, strbuf);
5006258945Sroberto		result = ISC_R_FAILURE;
5007258945Sroberto	}
5008258945Sroberto	if (chown(path, owner, group) < 0) {
5009258945Sroberto		isc__strerror(errno, strbuf, sizeof(strbuf));
5010258945Sroberto		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
5011258945Sroberto			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
5012258945Sroberto			      "isc_socket_permunix: chown(%s, %d, %d): %s",
5013258945Sroberto			      path, owner, group,
5014258945Sroberto			      strbuf);
5015258945Sroberto		result = ISC_R_FAILURE;
5016258945Sroberto	}
5017258945Sroberto	return (result);
5018258945Sroberto#else
5019258945Sroberto	UNUSED(sockaddr);
5020258945Sroberto	UNUSED(perm);
5021258945Sroberto	UNUSED(owner);
5022258945Sroberto	UNUSED(group);
5023258945Sroberto	return (ISC_R_NOTIMPLEMENTED);
5024258945Sroberto#endif
5025258945Sroberto}
5026258945Sroberto
5027280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
5028280849Scyisc__socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr,
5029280849Scy		 unsigned int options) {
5030280849Scy	isc__socket_t *sock = (isc__socket_t *)sock0;
5031258945Sroberto	char strbuf[ISC_STRERRORSIZE];
5032258945Sroberto	int on = 1;
5033258945Sroberto
5034280849Scy	REQUIRE(VALID_SOCKET(sock));
5035280849Scy
5036258945Sroberto	LOCK(&sock->lock);
5037258945Sroberto
5038258945Sroberto	INSIST(!sock->bound);
5039280849Scy	INSIST(!sock->dupped);
5040258945Sroberto
5041258945Sroberto	if (sock->pf != sockaddr->type.sa.sa_family) {
5042258945Sroberto		UNLOCK(&sock->lock);
5043258945Sroberto		return (ISC_R_FAMILYMISMATCH);
5044258945Sroberto	}
5045280849Scy
5046258945Sroberto	/*
5047258945Sroberto	 * Only set SO_REUSEADDR when we want a specific port.
5048258945Sroberto	 */
5049258945Sroberto#ifdef AF_UNIX
5050258945Sroberto	if (sock->pf == AF_UNIX)
5051258945Sroberto		goto bind_socket;
5052258945Sroberto#endif
5053258945Sroberto	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
5054258945Sroberto	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
5055258945Sroberto	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
5056258945Sroberto		       sizeof(on)) < 0) {
5057258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__,
5058258945Sroberto				 "setsockopt(%d) %s", sock->fd,
5059258945Sroberto				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
5060258945Sroberto						ISC_MSG_FAILED, "failed"));
5061258945Sroberto		/* Press on... */
5062258945Sroberto	}
5063258945Sroberto#ifdef AF_UNIX
5064258945Sroberto bind_socket:
5065258945Sroberto#endif
5066258945Sroberto	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
5067258945Sroberto		inc_stats(sock->manager->stats,
5068258945Sroberto			  sock->statsindex[STATID_BINDFAIL]);
5069258945Sroberto
5070258945Sroberto		UNLOCK(&sock->lock);
5071258945Sroberto		switch (errno) {
5072258945Sroberto		case EACCES:
5073258945Sroberto			return (ISC_R_NOPERM);
5074258945Sroberto		case EADDRNOTAVAIL:
5075258945Sroberto			return (ISC_R_ADDRNOTAVAIL);
5076258945Sroberto		case EADDRINUSE:
5077258945Sroberto			return (ISC_R_ADDRINUSE);
5078258945Sroberto		case EINVAL:
5079258945Sroberto			return (ISC_R_BOUND);
5080258945Sroberto		default:
5081258945Sroberto			isc__strerror(errno, strbuf, sizeof(strbuf));
5082258945Sroberto			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
5083258945Sroberto					 strbuf);
5084258945Sroberto			return (ISC_R_UNEXPECTED);
5085258945Sroberto		}
5086258945Sroberto	}
5087258945Sroberto
5088258945Sroberto	socket_log(sock, sockaddr, TRACE,
5089258945Sroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
5090258945Sroberto	sock->bound = 1;
5091258945Sroberto
5092258945Sroberto	UNLOCK(&sock->lock);
5093258945Sroberto	return (ISC_R_SUCCESS);
5094258945Sroberto}
5095258945Sroberto
/*
 * Enable this only for specific OS versions, and only when they have repaired
 * their problems with it.  Until then, this is broken and needs to be
 * disabled by default.  See RT22589 for details.
 */
5101280849Scy#undef ENABLE_ACCEPTFILTER
5102280849Scy
5103280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
5104280849Scyisc__socket_filter(isc_socket_t *sock0, const char *filter) {
5105280849Scy	isc__socket_t *sock = (isc__socket_t *)sock0;
5106280849Scy#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
5107258945Sroberto	char strbuf[ISC_STRERRORSIZE];
5108258945Sroberto	struct accept_filter_arg afa;
5109258945Sroberto#else
5110258945Sroberto	UNUSED(sock);
5111258945Sroberto	UNUSED(filter);
5112258945Sroberto#endif
5113258945Sroberto
5114258945Sroberto	REQUIRE(VALID_SOCKET(sock));
5115258945Sroberto
5116280849Scy#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
5117258945Sroberto	bzero(&afa, sizeof(afa));
5118258945Sroberto	strncpy(afa.af_name, filter, sizeof(afa.af_name));
5119258945Sroberto	if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER,
5120258945Sroberto			 &afa, sizeof(afa)) == -1) {
5121258945Sroberto		isc__strerror(errno, strbuf, sizeof(strbuf));
5122258945Sroberto		socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
5123258945Sroberto			   ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s",
5124258945Sroberto			   strbuf);
5125258945Sroberto		return (ISC_R_FAILURE);
5126258945Sroberto	}
5127258945Sroberto	return (ISC_R_SUCCESS);
5128258945Sroberto#else
5129258945Sroberto	return (ISC_R_NOTIMPLEMENTED);
5130258945Sroberto#endif
5131258945Sroberto}
5132258945Sroberto
5133258945Sroberto/*
5134258945Sroberto * Set up to listen on a given socket.  We do this by creating an internal
5135258945Sroberto * event that will be dispatched when the socket has read activity.  The
5136258945Sroberto * watcher will send the internal event to the task when there is a new
5137258945Sroberto * connection.
5138258945Sroberto *
5139258945Sroberto * Unlike in read, we don't preallocate a done event here.  Every time there
5140258945Sroberto * is a new connection we'll have to allocate a new one anyway, so we might
5141258945Sroberto * as well keep things simple rather than having to track them.
5142258945Sroberto */
5143280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
5144280849Scyisc__socket_listen(isc_socket_t *sock0, unsigned int backlog) {
5145280849Scy	isc__socket_t *sock = (isc__socket_t *)sock0;
5146258945Sroberto	char strbuf[ISC_STRERRORSIZE];
5147258945Sroberto
5148258945Sroberto	REQUIRE(VALID_SOCKET(sock));
5149258945Sroberto
5150258945Sroberto	LOCK(&sock->lock);
5151258945Sroberto
5152258945Sroberto	REQUIRE(!sock->listener);
5153258945Sroberto	REQUIRE(sock->bound);
5154258945Sroberto	REQUIRE(sock->type == isc_sockettype_tcp ||
5155258945Sroberto		sock->type == isc_sockettype_unix);
5156258945Sroberto
5157258945Sroberto	if (backlog == 0)
5158258945Sroberto		backlog = SOMAXCONN;
5159258945Sroberto
5160258945Sroberto	if (listen(sock->fd, (int)backlog) < 0) {
5161258945Sroberto		UNLOCK(&sock->lock);
5162258945Sroberto		isc__strerror(errno, strbuf, sizeof(strbuf));
5163258945Sroberto
5164258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
5165258945Sroberto
5166258945Sroberto		return (ISC_R_UNEXPECTED);
5167258945Sroberto	}
5168258945Sroberto
5169258945Sroberto	sock->listener = 1;
5170258945Sroberto
5171258945Sroberto	UNLOCK(&sock->lock);
5172258945Sroberto	return (ISC_R_SUCCESS);
5173258945Sroberto}
5174258945Sroberto
5175258945Sroberto/*
5176258945Sroberto * This should try to do aggressive accept() XXXMLG
5177258945Sroberto */
5178280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
5179280849Scyisc__socket_accept(isc_socket_t *sock0,
5180258945Sroberto		  isc_task_t *task, isc_taskaction_t action, const void *arg)
5181258945Sroberto{
5182280849Scy	isc__socket_t *sock = (isc__socket_t *)sock0;
5183258945Sroberto	isc_socket_newconnev_t *dev;
5184280849Scy	isc__socketmgr_t *manager;
5185258945Sroberto	isc_task_t *ntask = NULL;
5186280849Scy	isc__socket_t *nsock;
5187258945Sroberto	isc_result_t result;
5188258945Sroberto	isc_boolean_t do_poke = ISC_FALSE;
5189258945Sroberto
5190258945Sroberto	REQUIRE(VALID_SOCKET(sock));
5191258945Sroberto	manager = sock->manager;
5192258945Sroberto	REQUIRE(VALID_MANAGER(manager));
5193258945Sroberto
5194258945Sroberto	LOCK(&sock->lock);
5195258945Sroberto
5196258945Sroberto	REQUIRE(sock->listener);
5197258945Sroberto
5198258945Sroberto	/*
5199258945Sroberto	 * Sender field is overloaded here with the task we will be sending
5200258945Sroberto	 * this event to.  Just before the actual event is delivered the
5201258945Sroberto	 * actual ev_sender will be touched up to be the socket.
5202258945Sroberto	 */
5203258945Sroberto	dev = (isc_socket_newconnev_t *)
5204258945Sroberto		isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
5205258945Sroberto				   action, arg, sizeof(*dev));
5206258945Sroberto	if (dev == NULL) {
5207258945Sroberto		UNLOCK(&sock->lock);
5208258945Sroberto		return (ISC_R_NOMEMORY);
5209258945Sroberto	}
5210258945Sroberto	ISC_LINK_INIT(dev, ev_link);
5211258945Sroberto
5212258945Sroberto	result = allocate_socket(manager, sock->type, &nsock);
5213258945Sroberto	if (result != ISC_R_SUCCESS) {
5214258945Sroberto		isc_event_free(ISC_EVENT_PTR(&dev));
5215258945Sroberto		UNLOCK(&sock->lock);
5216258945Sroberto		return (result);
5217258945Sroberto	}
5218258945Sroberto
5219258945Sroberto	/*
5220258945Sroberto	 * Attach to socket and to task.
5221258945Sroberto	 */
5222258945Sroberto	isc_task_attach(task, &ntask);
5223280849Scy	if (isc_task_exiting(ntask)) {
5224280849Scy		free_socket(&nsock);
5225280849Scy		isc_task_detach(&ntask);
5226280849Scy		isc_event_free(ISC_EVENT_PTR(&dev));
5227280849Scy		UNLOCK(&sock->lock);
5228280849Scy		return (ISC_R_SHUTTINGDOWN);
5229280849Scy	}
5230258945Sroberto	nsock->references++;
5231258945Sroberto	nsock->statsindex = sock->statsindex;
5232258945Sroberto
5233258945Sroberto	dev->ev_sender = ntask;
5234280849Scy	dev->newsocket = (isc_socket_t *)nsock;
5235258945Sroberto
5236258945Sroberto	/*
5237258945Sroberto	 * Poke watcher here.  We still have the socket locked, so there
5238258945Sroberto	 * is no race condition.  We will keep the lock for such a short
5239258945Sroberto	 * bit of time waking it up now or later won't matter all that much.
5240258945Sroberto	 */
5241258945Sroberto	if (ISC_LIST_EMPTY(sock->accept_list))
5242258945Sroberto		do_poke = ISC_TRUE;
5243258945Sroberto
5244258945Sroberto	ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
5245258945Sroberto
5246258945Sroberto	if (do_poke)
5247258945Sroberto		select_poke(manager, sock->fd, SELECT_POKE_ACCEPT);
5248258945Sroberto
5249258945Sroberto	UNLOCK(&sock->lock);
5250258945Sroberto	return (ISC_R_SUCCESS);
5251258945Sroberto}
5252258945Sroberto
5253280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
5254280849Scyisc__socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr,
5255258945Sroberto		   isc_task_t *task, isc_taskaction_t action, const void *arg)
5256258945Sroberto{
5257280849Scy	isc__socket_t *sock = (isc__socket_t *)sock0;
5258258945Sroberto	isc_socket_connev_t *dev;
5259258945Sroberto	isc_task_t *ntask = NULL;
5260280849Scy	isc__socketmgr_t *manager;
5261258945Sroberto	int cc;
5262258945Sroberto	char strbuf[ISC_STRERRORSIZE];
5263280849Scy	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
5264258945Sroberto
5265258945Sroberto	REQUIRE(VALID_SOCKET(sock));
5266258945Sroberto	REQUIRE(addr != NULL);
5267258945Sroberto	REQUIRE(task != NULL);
5268258945Sroberto	REQUIRE(action != NULL);
5269258945Sroberto
5270258945Sroberto	manager = sock->manager;
5271258945Sroberto	REQUIRE(VALID_MANAGER(manager));
5272258945Sroberto	REQUIRE(addr != NULL);
5273258945Sroberto
5274258945Sroberto	if (isc_sockaddr_ismulticast(addr))
5275258945Sroberto		return (ISC_R_MULTICAST);
5276258945Sroberto
5277258945Sroberto	LOCK(&sock->lock);
5278258945Sroberto
5279258945Sroberto	REQUIRE(!sock->connecting);
5280258945Sroberto
5281258945Sroberto	dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
5282258945Sroberto							ISC_SOCKEVENT_CONNECT,
5283258945Sroberto							action,	arg,
5284258945Sroberto							sizeof(*dev));
5285258945Sroberto	if (dev == NULL) {
5286258945Sroberto		UNLOCK(&sock->lock);
5287258945Sroberto		return (ISC_R_NOMEMORY);
5288258945Sroberto	}
5289258945Sroberto	ISC_LINK_INIT(dev, ev_link);
5290258945Sroberto
5291258945Sroberto	/*
5292258945Sroberto	 * Try to do the connect right away, as there can be only one
5293258945Sroberto	 * outstanding, and it might happen to complete.
5294258945Sroberto	 */
5295258945Sroberto	sock->peer_address = *addr;
5296258945Sroberto	cc = connect(sock->fd, &addr->type.sa, addr->length);
5297258945Sroberto	if (cc < 0) {
5298258945Sroberto		/*
5299258945Sroberto		 * HP-UX "fails" to connect a UDP socket and sets errno to
5300258945Sroberto		 * EINPROGRESS if it's non-blocking.  We'd rather regard this as
5301258945Sroberto		 * a success and let the user detect it if it's really an error
5302258945Sroberto		 * at the time of sending a packet on the socket.
5303258945Sroberto		 */
5304258945Sroberto		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
5305258945Sroberto			cc = 0;
5306258945Sroberto			goto success;
5307258945Sroberto		}
5308258945Sroberto		if (SOFT_ERROR(errno) || errno == EINPROGRESS)
5309258945Sroberto			goto queue;
5310258945Sroberto
5311258945Sroberto		switch (errno) {
5312258945Sroberto#define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
5313258945Sroberto			ERROR_MATCH(EACCES, ISC_R_NOPERM);
5314258945Sroberto			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
5315258945Sroberto			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
5316258945Sroberto			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
5317258945Sroberto			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
5318258945Sroberto#ifdef EHOSTDOWN
5319258945Sroberto			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
5320258945Sroberto#endif
5321258945Sroberto			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
5322258945Sroberto			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
5323258945Sroberto			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
5324258945Sroberto			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
5325258945Sroberto			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
5326258945Sroberto#undef ERROR_MATCH
5327258945Sroberto		}
5328258945Sroberto
5329258945Sroberto		sock->connected = 0;
5330258945Sroberto
5331258945Sroberto		isc__strerror(errno, strbuf, sizeof(strbuf));
5332280849Scy		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
5333280849Scy		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
5334280849Scy				 addrbuf, errno, strbuf);
5335258945Sroberto
5336258945Sroberto		UNLOCK(&sock->lock);
5337258945Sroberto		inc_stats(sock->manager->stats,
5338258945Sroberto			  sock->statsindex[STATID_CONNECTFAIL]);
5339258945Sroberto		isc_event_free(ISC_EVENT_PTR(&dev));
5340258945Sroberto		return (ISC_R_UNEXPECTED);
5341258945Sroberto
5342258945Sroberto	err_exit:
5343258945Sroberto		sock->connected = 0;
5344258945Sroberto		isc_task_send(task, ISC_EVENT_PTR(&dev));
5345258945Sroberto
5346258945Sroberto		UNLOCK(&sock->lock);
5347258945Sroberto		inc_stats(sock->manager->stats,
5348258945Sroberto			  sock->statsindex[STATID_CONNECTFAIL]);
5349258945Sroberto		return (ISC_R_SUCCESS);
5350258945Sroberto	}
5351258945Sroberto
5352258945Sroberto	/*
5353258945Sroberto	 * If connect completed, fire off the done event.
5354258945Sroberto	 */
5355258945Sroberto success:
5356258945Sroberto	if (cc == 0) {
5357258945Sroberto		sock->connected = 1;
5358258945Sroberto		sock->bound = 1;
5359258945Sroberto		dev->result = ISC_R_SUCCESS;
5360258945Sroberto		isc_task_send(task, ISC_EVENT_PTR(&dev));
5361258945Sroberto
5362258945Sroberto		UNLOCK(&sock->lock);
5363258945Sroberto
5364258945Sroberto		inc_stats(sock->manager->stats,
5365258945Sroberto			  sock->statsindex[STATID_CONNECT]);
5366258945Sroberto
5367258945Sroberto		return (ISC_R_SUCCESS);
5368258945Sroberto	}
5369258945Sroberto
5370258945Sroberto queue:
5371258945Sroberto
5372258945Sroberto	/*
5373258945Sroberto	 * Attach to task.
5374258945Sroberto	 */
5375258945Sroberto	isc_task_attach(task, &ntask);
5376258945Sroberto
5377258945Sroberto	sock->connecting = 1;
5378258945Sroberto
5379258945Sroberto	dev->ev_sender = ntask;
5380258945Sroberto
5381258945Sroberto	/*
5382258945Sroberto	 * Poke watcher here.  We still have the socket locked, so there
5383258945Sroberto	 * is no race condition.  We will keep the lock for such a short
5384258945Sroberto	 * bit of time waking it up now or later won't matter all that much.
5385258945Sroberto	 */
5386258945Sroberto	if (sock->connect_ev == NULL)
5387258945Sroberto		select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
5388258945Sroberto
5389258945Sroberto	sock->connect_ev = dev;
5390258945Sroberto
5391258945Sroberto	UNLOCK(&sock->lock);
5392258945Sroberto	return (ISC_R_SUCCESS);
5393258945Sroberto}
5394258945Sroberto
5395258945Sroberto/*
5396258945Sroberto * Called when a socket with a pending connect() finishes.
5397258945Sroberto */
5398258945Srobertostatic void
5399258945Srobertointernal_connect(isc_task_t *me, isc_event_t *ev) {
5400280849Scy	isc__socket_t *sock;
5401258945Sroberto	isc_socket_connev_t *dev;
5402258945Sroberto	isc_task_t *task;
5403258945Sroberto	int cc;
5404258945Sroberto	ISC_SOCKADDR_LEN_T optlen;
5405258945Sroberto	char strbuf[ISC_STRERRORSIZE];
5406258945Sroberto	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
5407258945Sroberto
5408258945Sroberto	UNUSED(me);
5409258945Sroberto	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
5410258945Sroberto
5411258945Sroberto	sock = ev->ev_sender;
5412258945Sroberto	INSIST(VALID_SOCKET(sock));
5413258945Sroberto
5414258945Sroberto	LOCK(&sock->lock);
5415258945Sroberto
5416258945Sroberto	/*
5417258945Sroberto	 * When the internal event was sent the reference count was bumped
5418258945Sroberto	 * to keep the socket around for us.  Decrement the count here.
5419258945Sroberto	 */
5420258945Sroberto	INSIST(sock->references > 0);
5421258945Sroberto	sock->references--;
5422258945Sroberto	if (sock->references == 0) {
5423258945Sroberto		UNLOCK(&sock->lock);
5424258945Sroberto		destroy(&sock);
5425258945Sroberto		return;
5426258945Sroberto	}
5427258945Sroberto
5428258945Sroberto	/*
5429258945Sroberto	 * Has this event been canceled?
5430258945Sroberto	 */
5431258945Sroberto	dev = sock->connect_ev;
5432258945Sroberto	if (dev == NULL) {
5433258945Sroberto		INSIST(!sock->connecting);
5434258945Sroberto		UNLOCK(&sock->lock);
5435258945Sroberto		return;
5436258945Sroberto	}
5437258945Sroberto
5438258945Sroberto	INSIST(sock->connecting);
5439258945Sroberto	sock->connecting = 0;
5440258945Sroberto
5441258945Sroberto	/*
5442258945Sroberto	 * Get any possible error status here.
5443258945Sroberto	 */
5444258945Sroberto	optlen = sizeof(cc);
5445258945Sroberto	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
5446258945Sroberto		       (void *)&cc, (void *)&optlen) < 0)
5447258945Sroberto		cc = errno;
5448258945Sroberto	else
5449258945Sroberto		errno = cc;
5450258945Sroberto
5451258945Sroberto	if (errno != 0) {
5452258945Sroberto		/*
5453258945Sroberto		 * If the error is EAGAIN, just re-select on this
5454258945Sroberto		 * fd and pretend nothing strange happened.
5455258945Sroberto		 */
5456258945Sroberto		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
5457258945Sroberto			sock->connecting = 1;
5458258945Sroberto			select_poke(sock->manager, sock->fd,
5459258945Sroberto				    SELECT_POKE_CONNECT);
5460258945Sroberto			UNLOCK(&sock->lock);
5461258945Sroberto
5462258945Sroberto			return;
5463258945Sroberto		}
5464258945Sroberto
5465258945Sroberto		inc_stats(sock->manager->stats,
5466258945Sroberto			  sock->statsindex[STATID_CONNECTFAIL]);
5467258945Sroberto
5468258945Sroberto		/*
5469258945Sroberto		 * Translate other errors into ISC_R_* flavors.
5470258945Sroberto		 */
5471258945Sroberto		switch (errno) {
5472258945Sroberto#define ERROR_MATCH(a, b) case a: dev->result = b; break;
5473258945Sroberto			ERROR_MATCH(EACCES, ISC_R_NOPERM);
5474258945Sroberto			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
5475258945Sroberto			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
5476258945Sroberto			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
5477258945Sroberto			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
5478258945Sroberto#ifdef EHOSTDOWN
5479258945Sroberto			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
5480258945Sroberto#endif
5481258945Sroberto			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
5482258945Sroberto			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
5483258945Sroberto			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
5484258945Sroberto			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
5485258945Sroberto			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
5486258945Sroberto			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
5487258945Sroberto#undef ERROR_MATCH
5488258945Sroberto		default:
5489258945Sroberto			dev->result = ISC_R_UNEXPECTED;
5490258945Sroberto			isc_sockaddr_format(&sock->peer_address, peerbuf,
5491258945Sroberto					    sizeof(peerbuf));
5492258945Sroberto			isc__strerror(errno, strbuf, sizeof(strbuf));
5493258945Sroberto			UNEXPECTED_ERROR(__FILE__, __LINE__,
5494258945Sroberto					 "internal_connect: connect(%s) %s",
5495258945Sroberto					 peerbuf, strbuf);
5496258945Sroberto		}
5497258945Sroberto	} else {
5498258945Sroberto		inc_stats(sock->manager->stats,
5499258945Sroberto			  sock->statsindex[STATID_CONNECT]);
5500258945Sroberto		dev->result = ISC_R_SUCCESS;
5501258945Sroberto		sock->connected = 1;
5502258945Sroberto		sock->bound = 1;
5503258945Sroberto	}
5504258945Sroberto
5505258945Sroberto	sock->connect_ev = NULL;
5506258945Sroberto
5507258945Sroberto	UNLOCK(&sock->lock);
5508258945Sroberto
5509258945Sroberto	task = dev->ev_sender;
5510258945Sroberto	dev->ev_sender = sock;
5511258945Sroberto	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
5512258945Sroberto}
5513258945Sroberto
5514280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
5515280849Scyisc__socket_getpeername(isc_socket_t *sock0, isc_sockaddr_t *addressp) {
5516280849Scy	isc__socket_t *sock = (isc__socket_t *)sock0;
5517258945Sroberto	isc_result_t result;
5518258945Sroberto
5519258945Sroberto	REQUIRE(VALID_SOCKET(sock));
5520258945Sroberto	REQUIRE(addressp != NULL);
5521258945Sroberto
5522258945Sroberto	LOCK(&sock->lock);
5523258945Sroberto
5524258945Sroberto	if (sock->connected) {
5525258945Sroberto		*addressp = sock->peer_address;
5526258945Sroberto		result = ISC_R_SUCCESS;
5527258945Sroberto	} else {
5528258945Sroberto		result = ISC_R_NOTCONNECTED;
5529258945Sroberto	}
5530258945Sroberto
5531258945Sroberto	UNLOCK(&sock->lock);
5532258945Sroberto
5533258945Sroberto	return (result);
5534258945Sroberto}
5535258945Sroberto
5536280849ScyISC_SOCKETFUNC_SCOPE isc_result_t
5537280849Scyisc__socket_getsockname(isc_socket_t *sock0, isc_sockaddr_t *addressp) {
5538280849Scy	isc__socket_t *sock = (isc__socket_t *)sock0;
5539258945Sroberto	ISC_SOCKADDR_LEN_T len;
5540258945Sroberto	isc_result_t result;
5541258945Sroberto	char strbuf[ISC_STRERRORSIZE];
5542258945Sroberto
5543258945Sroberto	REQUIRE(VALID_SOCKET(sock));
5544258945Sroberto	REQUIRE(addressp != NULL);
5545258945Sroberto
5546258945Sroberto	LOCK(&sock->lock);
5547258945Sroberto
5548258945Sroberto	if (!sock->bound) {
5549258945Sroberto		result = ISC_R_NOTBOUND;
5550258945Sroberto		goto out;
5551258945Sroberto	}
5552258945Sroberto
5553258945Sroberto	result = ISC_R_SUCCESS;
5554258945Sroberto
5555258945Sroberto	len = sizeof(addressp->type);
5556258945Sroberto	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
5557258945Sroberto		isc__strerror(errno, strbuf, sizeof(strbuf));
5558258945Sroberto		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
5559258945Sroberto				 strbuf);
5560258945Sroberto		result = ISC_R_UNEXPECTED;
5561258945Sroberto		goto out;
5562258945Sroberto	}
5563258945Sroberto	addressp->length = (unsigned int)len;
5564258945Sroberto
5565258945Sroberto out:
5566258945Sroberto	UNLOCK(&sock->lock);
5567258945Sroberto
5568258945Sroberto	return (result);
5569258945Sroberto}
5570258945Sroberto
5571258945Sroberto/*
5572258945Sroberto * Run through the list of events on this socket, and cancel the ones
5573258945Sroberto * queued for task "task" of type "how".  "how" is a bitmask.
5574258945Sroberto */
5575280849ScyISC_SOCKETFUNC_SCOPE void
5576280849Scyisc__socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
5577280849Scy	isc__socket_t *sock = (isc__socket_t *)sock0;
5578258945Sroberto
5579258945Sroberto	REQUIRE(VALID_SOCKET(sock));
5580258945Sroberto
5581258945Sroberto	/*
5582258945Sroberto	 * Quick exit if there is nothing to do.  Don't even bother locking
5583258945Sroberto	 * in this case.
5584258945Sroberto	 */
5585258945Sroberto	if (how == 0)
5586258945Sroberto		return;
5587258945Sroberto
5588258945Sroberto	LOCK(&sock->lock);
5589258945Sroberto
5590258945Sroberto	/*
5591258945Sroberto	 * All of these do the same thing, more or less.
5592258945Sroberto	 * Each will:
5593258945Sroberto	 *	o If the internal event is marked as "posted" try to
5594258945Sroberto	 *	  remove it from the task's queue.  If this fails, mark it
5595258945Sroberto	 *	  as canceled instead, and let the task clean it up later.
5596258945Sroberto	 *	o For each I/O request for that task of that type, post
5597258945Sroberto	 *	  its done event with status of "ISC_R_CANCELED".
5598258945Sroberto	 *	o Reset any state needed.
5599258945Sroberto	 */
5600258945Sroberto	if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
5601258945Sroberto	    && !ISC_LIST_EMPTY(sock->recv_list)) {
5602258945Sroberto		isc_socketevent_t      *dev;
5603258945Sroberto		isc_socketevent_t      *next;
5604258945Sroberto		isc_task_t	       *current_task;
5605258945Sroberto
5606258945Sroberto		dev = ISC_LIST_HEAD(sock->recv_list);
5607258945Sroberto
5608258945Sroberto		while (dev != NULL) {
5609258945Sroberto			current_task = dev->ev_sender;
5610258945Sroberto			next = ISC_LIST_NEXT(dev, ev_link);
5611258945Sroberto
5612258945Sroberto			if ((task == NULL) || (task == current_task)) {
5613258945Sroberto				dev->result = ISC_R_CANCELED;
5614258945Sroberto				send_recvdone_event(sock, &dev);
5615258945Sroberto			}
5616258945Sroberto			dev = next;
5617258945Sroberto		}
5618258945Sroberto	}
5619258945Sroberto
5620258945Sroberto	if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
5621258945Sroberto	    && !ISC_LIST_EMPTY(sock->send_list)) {
5622258945Sroberto		isc_socketevent_t      *dev;
5623258945Sroberto		isc_socketevent_t      *next;
5624258945Sroberto		isc_task_t	       *current_task;
5625258945Sroberto
5626258945Sroberto		dev = ISC_LIST_HEAD(sock->send_list);
5627258945Sroberto
5628258945Sroberto		while (dev != NULL) {
5629258945Sroberto			current_task = dev->ev_sender;
5630258945Sroberto			next = ISC_LIST_NEXT(dev, ev_link);
5631258945Sroberto
5632258945Sroberto			if ((task == NULL) || (task == current_task)) {
5633258945Sroberto				dev->result = ISC_R_CANCELED;
5634258945Sroberto				send_senddone_event(sock, &dev);
5635258945Sroberto			}
5636258945Sroberto			dev = next;
5637258945Sroberto		}
5638258945Sroberto	}
5639258945Sroberto
5640258945Sroberto	if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
5641258945Sroberto	    && !ISC_LIST_EMPTY(sock->accept_list)) {
5642258945Sroberto		isc_socket_newconnev_t *dev;
5643258945Sroberto		isc_socket_newconnev_t *next;
5644258945Sroberto		isc_task_t	       *current_task;
5645258945Sroberto
5646258945Sroberto		dev = ISC_LIST_HEAD(sock->accept_list);
5647258945Sroberto		while (dev != NULL) {
5648258945Sroberto			current_task = dev->ev_sender;
5649258945Sroberto			next = ISC_LIST_NEXT(dev, ev_link);
5650258945Sroberto
5651258945Sroberto			if ((task == NULL) || (task == current_task)) {
5652258945Sroberto
5653258945Sroberto				ISC_LIST_UNLINK(sock->accept_list, dev,
5654258945Sroberto						ev_link);
5655258945Sroberto
5656280849Scy				NEWCONNSOCK(dev)->references--;
5657280849Scy				free_socket((isc__socket_t **)&dev->newsocket);
5658258945Sroberto
5659258945Sroberto				dev->result = ISC_R_CANCELED;
5660258945Sroberto				dev->ev_sender = sock;
5661258945Sroberto				isc_task_sendanddetach(&current_task,
5662258945Sroberto						       ISC_EVENT_PTR(&dev));
5663258945Sroberto			}
5664258945Sroberto
5665258945Sroberto			dev = next;
5666258945Sroberto		}
5667258945Sroberto	}
5668258945Sroberto
5669258945Sroberto	/*
5670258945Sroberto	 * Connecting is not a list.
5671258945Sroberto	 */
5672258945Sroberto	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
5673258945Sroberto	    && sock->connect_ev != NULL) {
5674258945Sroberto		isc_socket_connev_t    *dev;
5675258945Sroberto		isc_task_t	       *current_task;
5676258945Sroberto
5677258945Sroberto		INSIST(sock->connecting);
5678258945Sroberto		sock->connecting = 0;
5679258945Sroberto
5680258945Sroberto		dev = sock->connect_ev;
5681258945Sroberto		current_task = dev->ev_sender;
5682258945Sroberto
5683258945Sroberto		if ((task == NULL) || (task == current_task)) {
5684258945Sroberto			sock->connect_ev = NULL;
5685258945Sroberto
5686258945Sroberto			dev->result = ISC_R_CANCELED;
5687258945Sroberto			dev->ev_sender = sock;
5688258945Sroberto			isc_task_sendanddetach(&current_task,
5689258945Sroberto					       ISC_EVENT_PTR(&dev));
5690258945Sroberto		}
5691258945Sroberto	}
5692258945Sroberto
5693258945Sroberto	UNLOCK(&sock->lock);
5694258945Sroberto}
5695258945Sroberto
5696280849ScyISC_SOCKETFUNC_SCOPE isc_sockettype_t
5697280849Scyisc__socket_gettype(isc_socket_t *sock0) {
5698280849Scy	isc__socket_t *sock = (isc__socket_t *)sock0;
5699280849Scy
5700258945Sroberto	REQUIRE(VALID_SOCKET(sock));
5701258945Sroberto
5702258945Sroberto	return (sock->type);
5703258945Sroberto}
5704258945Sroberto
5705280849ScyISC_SOCKETFUNC_SCOPE isc_boolean_t
5706280849Scyisc__socket_isbound(isc_socket_t *sock0) {
5707280849Scy	isc__socket_t *sock = (isc__socket_t *)sock0;
5708258945Sroberto	isc_boolean_t val;
5709258945Sroberto
5710280849Scy	REQUIRE(VALID_SOCKET(sock));
5711280849Scy
5712258945Sroberto	LOCK(&sock->lock);
5713258945Sroberto	val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
5714258945Sroberto	UNLOCK(&sock->lock);
5715258945Sroberto
5716258945Sroberto	return (val);
5717258945Sroberto}
5718258945Sroberto
5719280849ScyISC_SOCKETFUNC_SCOPE void
5720280849Scyisc__socket_ipv6only(isc_socket_t *sock0, isc_boolean_t yes) {
5721280849Scy	isc__socket_t *sock = (isc__socket_t *)sock0;
5722258945Sroberto#if defined(IPV6_V6ONLY)
5723258945Sroberto	int onoff = yes ? 1 : 0;
5724258945Sroberto#else
5725258945Sroberto	UNUSED(yes);
5726258945Sroberto	UNUSED(sock);
5727258945Sroberto#endif
5728258945Sroberto
5729258945Sroberto	REQUIRE(VALID_SOCKET(sock));
5730280849Scy	INSIST(!sock->dupped);
5731258945Sroberto
5732258945Sroberto#ifdef IPV6_V6ONLY
5733258945Sroberto	if (sock->pf == AF_INET6) {
5734258945Sroberto		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
5735258945Sroberto			       (void *)&onoff, sizeof(int)) < 0) {
5736258945Sroberto			char strbuf[ISC_STRERRORSIZE];
5737280849Scy			isc__strerror(errno, strbuf, sizeof(strbuf));
5738258945Sroberto			UNEXPECTED_ERROR(__FILE__, __LINE__,
5739258945Sroberto					 "setsockopt(%d, IPV6_V6ONLY) "
5740258945Sroberto					 "%s: %s", sock->fd,
5741258945Sroberto					 isc_msgcat_get(isc_msgcat,
5742258945Sroberto							ISC_MSGSET_GENERAL,
5743258945Sroberto							ISC_MSG_FAILED,
5744258945Sroberto							"failed"),
5745258945Sroberto					 strbuf);
5746258945Sroberto		}
5747258945Sroberto	}
5748258945Sroberto	FIX_IPV6_RECVPKTINFO(sock);	/* AIX */
5749258945Sroberto#endif
5750258945Sroberto}
5751258945Sroberto
5752280849Scy#ifndef USE_WATCHER_THREAD
5753280849Scy/*
5754280849Scy * In our assumed scenario, we can simply use a single static object.
5755280849Scy * XXX: this is not true if the application uses multiple threads with
5756280849Scy *      'multi-context' mode.  Fixing this is a future TODO item.
5757280849Scy */
5758258945Srobertostatic isc_socketwait_t swait_private;
5759258945Sroberto
5760258945Srobertoint
5761280849Scyisc__socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp,
5762280849Scy			  isc_socketwait_t **swaitp)
5763280849Scy{
5764280849Scy	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
5765280849Scy
5766280849Scy
5767258945Sroberto	int n;
5768258945Sroberto#ifdef USE_KQUEUE
5769258945Sroberto	struct timespec ts, *tsp;
5770258945Sroberto#endif
5771258945Sroberto#ifdef USE_EPOLL
5772258945Sroberto	int timeout;
5773258945Sroberto#endif
5774258945Sroberto#ifdef USE_DEVPOLL
5775258945Sroberto	struct dvpoll dvp;
5776258945Sroberto#endif
5777258945Sroberto
5778258945Sroberto	REQUIRE(swaitp != NULL && *swaitp == NULL);
5779258945Sroberto
5780280849Scy#ifdef USE_SHARED_MANAGER
5781280849Scy	if (manager == NULL)
5782280849Scy		manager = socketmgr;
5783280849Scy#endif
5784280849Scy	if (manager == NULL)
5785258945Sroberto		return (0);
5786258945Sroberto
5787258945Sroberto#ifdef USE_KQUEUE
5788258945Sroberto	if (tvp != NULL) {
5789258945Sroberto		ts.tv_sec = tvp->tv_sec;
5790258945Sroberto		ts.tv_nsec = tvp->tv_usec * 1000;
5791258945Sroberto		tsp = &ts;
5792258945Sroberto	} else
5793258945Sroberto		tsp = NULL;
5794280849Scy	swait_private.nevents = kevent(manager->kqueue_fd, NULL, 0,
5795280849Scy				       manager->events, manager->nevents,
5796258945Sroberto				       tsp);
5797258945Sroberto	n = swait_private.nevents;
5798258945Sroberto#elif defined(USE_EPOLL)
5799258945Sroberto	if (tvp != NULL)
5800258945Sroberto		timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000;
5801258945Sroberto	else
5802258945Sroberto		timeout = -1;
5803280849Scy	swait_private.nevents = epoll_wait(manager->epoll_fd,
5804280849Scy					   manager->events,
5805280849Scy					   manager->nevents, timeout);
5806258945Sroberto	n = swait_private.nevents;
5807258945Sroberto#elif defined(USE_DEVPOLL)
5808280849Scy	dvp.dp_fds = manager->events;
5809280849Scy	dvp.dp_nfds = manager->nevents;
5810258945Sroberto	if (tvp != NULL) {
5811258945Sroberto		dvp.dp_timeout = tvp->tv_sec * 1000 +
5812258945Sroberto			(tvp->tv_usec + 999) / 1000;
5813258945Sroberto	} else
5814258945Sroberto		dvp.dp_timeout = -1;
5815280849Scy	swait_private.nevents = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
5816258945Sroberto	n = swait_private.nevents;
5817258945Sroberto#elif defined(USE_SELECT)
5818280849Scy	memcpy(manager->read_fds_copy, manager->read_fds,  manager->fd_bufsize);
5819280849Scy	memcpy(manager->write_fds_copy, manager->write_fds,
5820280849Scy	       manager->fd_bufsize);
5821258945Sroberto
5822280849Scy	swait_private.readset = manager->read_fds_copy;
5823280849Scy	swait_private.writeset = manager->write_fds_copy;
5824280849Scy	swait_private.maxfd = manager->maxfd + 1;
5825258945Sroberto
5826258945Sroberto	n = select(swait_private.maxfd, swait_private.readset,
5827258945Sroberto		   swait_private.writeset, NULL, tvp);
5828258945Sroberto#endif
5829258945Sroberto
5830258945Sroberto	*swaitp = &swait_private;
5831258945Sroberto	return (n);
5832258945Sroberto}
5833258945Sroberto
5834258945Srobertoisc_result_t
5835280849Scyisc__socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) {
5836280849Scy	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
5837280849Scy
5838258945Sroberto	REQUIRE(swait == &swait_private);
5839258945Sroberto
5840280849Scy#ifdef USE_SHARED_MANAGER
5841280849Scy	if (manager == NULL)
5842280849Scy		manager = socketmgr;
5843280849Scy#endif
5844280849Scy	if (manager == NULL)
5845258945Sroberto		return (ISC_R_NOTFOUND);
5846258945Sroberto
5847258945Sroberto#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
5848280849Scy	(void)process_fds(manager, manager->events, swait->nevents);
5849258945Sroberto	return (ISC_R_SUCCESS);
5850258945Sroberto#elif defined(USE_SELECT)
5851280849Scy	process_fds(manager, swait->maxfd, swait->readset, swait->writeset);
5852258945Sroberto	return (ISC_R_SUCCESS);
5853258945Sroberto#endif
5854258945Sroberto}
5855280849Scy#endif /* USE_WATCHER_THREAD */
5856258945Sroberto
5857280849Scy#ifdef BIND9
5858258945Srobertovoid
5859280849Scyisc__socket_setname(isc_socket_t *socket0, const char *name, void *tag) {
5860280849Scy	isc__socket_t *socket = (isc__socket_t *)socket0;
5861258945Sroberto
5862258945Sroberto	/*
5863258945Sroberto	 * Name 'socket'.
5864258945Sroberto	 */
5865258945Sroberto
5866258945Sroberto	REQUIRE(VALID_SOCKET(socket));
5867258945Sroberto
5868258945Sroberto	LOCK(&socket->lock);
5869258945Sroberto	memset(socket->name, 0, sizeof(socket->name));
5870258945Sroberto	strncpy(socket->name, name, sizeof(socket->name) - 1);
5871258945Sroberto	socket->tag = tag;
5872258945Sroberto	UNLOCK(&socket->lock);
5873258945Sroberto}
5874258945Sroberto
5875280849ScyISC_SOCKETFUNC_SCOPE const char *
5876280849Scyisc__socket_getname(isc_socket_t *socket0) {
5877280849Scy	isc__socket_t *socket = (isc__socket_t *)socket0;
5878280849Scy
5879258945Sroberto	return (socket->name);
5880258945Sroberto}
5881258945Sroberto
5882258945Srobertovoid *
5883280849Scyisc__socket_gettag(isc_socket_t *socket0) {
5884280849Scy	isc__socket_t *socket = (isc__socket_t *)socket0;
5885280849Scy
5886258945Sroberto	return (socket->tag);
5887258945Sroberto}
5888280849Scy#endif	/* BIND9 */
5889258945Sroberto
5890280849Scy#ifdef USE_SOCKETIMPREGISTER
5891280849Scyisc_result_t
5892280849Scyisc__socket_register() {
5893280849Scy	return (isc_socket_register(isc__socketmgr_create));
5894280849Scy}
5895280849Scy#endif
5896258945Sroberto
5897280849ScyISC_SOCKETFUNC_SCOPE int
5898280849Scyisc__socket_getfd(isc_socket_t *socket0) {
5899280849Scy	isc__socket_t *socket = (isc__socket_t *)socket0;
5900280849Scy
5901280849Scy	return ((short) socket->fd);
5902280849Scy}
5903280849Scy
5904280849Scy#if defined(HAVE_LIBXML2) && defined(BIND9)
5905280849Scy
5906258945Srobertostatic const char *
5907258945Sroberto_socktype(isc_sockettype_t type)
5908258945Sroberto{
5909258945Sroberto	if (type == isc_sockettype_udp)
5910258945Sroberto		return ("udp");
5911258945Sroberto	else if (type == isc_sockettype_tcp)
5912258945Sroberto		return ("tcp");
5913258945Sroberto	else if (type == isc_sockettype_unix)
5914258945Sroberto		return ("unix");
5915258945Sroberto	else if (type == isc_sockettype_fdwatch)
5916258945Sroberto		return ("fdwatch");
5917258945Sroberto	else
5918258945Sroberto		return ("not-initialized");
5919258945Sroberto}
5920258945Sroberto
5921280849ScyISC_SOCKETFUNC_SCOPE void
5922280849Scyisc_socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer) {
5923280849Scy	isc__socketmgr_t *mgr = (isc__socketmgr_t *)mgr0;
5924280849Scy	isc__socket_t *sock;
5925258945Sroberto	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
5926258945Sroberto	isc_sockaddr_t addr;
5927258945Sroberto	ISC_SOCKADDR_LEN_T len;
5928258945Sroberto
5929258945Sroberto	LOCK(&mgr->lock);
5930258945Sroberto
5931280849Scy#ifdef USE_SHARED_MANAGER
5932258945Sroberto	xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
5933258945Sroberto	xmlTextWriterWriteFormatString(writer, "%d", mgr->refs);
5934258945Sroberto	xmlTextWriterEndElement(writer);
5935280849Scy#endif	/* USE_SHARED_MANAGER */
5936258945Sroberto
5937258945Sroberto	xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets");
5938258945Sroberto	sock = ISC_LIST_HEAD(mgr->socklist);
5939258945Sroberto	while (sock != NULL) {
5940258945Sroberto		LOCK(&sock->lock);
5941258945Sroberto		xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket");
5942258945Sroberto
5943258945Sroberto		xmlTextWriterStartElement(writer, ISC_XMLCHAR "id");
5944258945Sroberto		xmlTextWriterWriteFormatString(writer, "%p", sock);
5945258945Sroberto		xmlTextWriterEndElement(writer);
5946258945Sroberto
5947258945Sroberto		if (sock->name[0] != 0) {
5948258945Sroberto			xmlTextWriterStartElement(writer, ISC_XMLCHAR "name");
5949258945Sroberto			xmlTextWriterWriteFormatString(writer, "%s",
5950258945Sroberto						       sock->name);
5951258945Sroberto			xmlTextWriterEndElement(writer); /* name */
5952258945Sroberto		}
5953258945Sroberto
5954258945Sroberto		xmlTextWriterStartElement(writer, ISC_XMLCHAR "references");
5955258945Sroberto		xmlTextWriterWriteFormatString(writer, "%d", sock->references);
5956258945Sroberto		xmlTextWriterEndElement(writer);
5957258945Sroberto
5958258945Sroberto		xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type",
5959258945Sroberto					  ISC_XMLCHAR _socktype(sock->type));
5960258945Sroberto
5961258945Sroberto		if (sock->connected) {
5962258945Sroberto			isc_sockaddr_format(&sock->peer_address, peerbuf,
5963258945Sroberto					    sizeof(peerbuf));
5964258945Sroberto			xmlTextWriterWriteElement(writer,
5965258945Sroberto						  ISC_XMLCHAR "peer-address",
5966258945Sroberto						  ISC_XMLCHAR peerbuf);
5967258945Sroberto		}
5968258945Sroberto
5969258945Sroberto		len = sizeof(addr);
5970258945Sroberto		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
5971258945Sroberto			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
5972258945Sroberto			xmlTextWriterWriteElement(writer,
5973258945Sroberto						  ISC_XMLCHAR "local-address",
5974258945Sroberto						  ISC_XMLCHAR peerbuf);
5975258945Sroberto		}
5976258945Sroberto
5977258945Sroberto		xmlTextWriterStartElement(writer, ISC_XMLCHAR "states");
5978258945Sroberto		if (sock->pending_recv)
5979258945Sroberto			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5980258945Sroberto						ISC_XMLCHAR "pending-receive");
5981258945Sroberto		if (sock->pending_send)
5982258945Sroberto			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5983258945Sroberto						  ISC_XMLCHAR "pending-send");
5984258945Sroberto		if (sock->pending_accept)
5985258945Sroberto			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5986258945Sroberto						 ISC_XMLCHAR "pending_accept");
5987258945Sroberto		if (sock->listener)
5988258945Sroberto			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5989258945Sroberto						  ISC_XMLCHAR "listener");
5990258945Sroberto		if (sock->connected)
5991258945Sroberto			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5992258945Sroberto						  ISC_XMLCHAR "connected");
5993258945Sroberto		if (sock->connecting)
5994258945Sroberto			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5995258945Sroberto						  ISC_XMLCHAR "connecting");
5996258945Sroberto		if (sock->bound)
5997258945Sroberto			xmlTextWriterWriteElement(writer, ISC_XMLCHAR "state",
5998258945Sroberto						  ISC_XMLCHAR "bound");
5999258945Sroberto
6000258945Sroberto		xmlTextWriterEndElement(writer); /* states */
6001258945Sroberto
6002258945Sroberto		xmlTextWriterEndElement(writer); /* socket */
6003258945Sroberto
6004258945Sroberto		UNLOCK(&sock->lock);
6005258945Sroberto		sock = ISC_LIST_NEXT(sock, link);
6006258945Sroberto	}
6007258945Sroberto	xmlTextWriterEndElement(writer); /* sockets */
6008258945Sroberto
6009258945Sroberto	UNLOCK(&mgr->lock);
6010258945Sroberto}
6011258945Sroberto#endif /* HAVE_LIBXML2 */
6012