netevent.c revision 291767
1/*
2 * util/netevent.c - event notification
3 *
4 * Copyright (c) 2007, NLnet Labs. All rights reserved.
5 *
6 * This software is open source.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * Redistributions of source code must retain the above copyright notice,
13 * this list of conditions and the following disclaimer.
14 *
15 * Redistributions in binary form must reproduce the above copyright notice,
16 * this list of conditions and the following disclaimer in the documentation
17 * and/or other materials provided with the distribution.
18 *
19 * Neither the name of the NLNET LABS nor the names of its contributors may
20 * be used to endorse or promote products derived from this software without
21 * specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
29 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 */
35
36/**
37 * \file
38 *
39 * This file contains event notification functions.
40 */
41#include "config.h"
42#include "util/netevent.h"
43#include "util/log.h"
44#include "util/net_help.h"
45#include "util/fptr_wlist.h"
46#include "sldns/pkthdr.h"
47#include "sldns/sbuffer.h"
48#include "dnstap/dnstap.h"
49#ifdef HAVE_OPENSSL_SSL_H
50#include <openssl/ssl.h>
51#endif
52#ifdef HAVE_OPENSSL_ERR_H
53#include <openssl/err.h>
54#endif
55
56/* -------- Start of local definitions -------- */
/** fallback for platforms that do not provide CMSG_ALIGN */
#if !defined(CMSG_ALIGN)
#  if defined(_CMSG_DATA_ALIGN)
#    define CMSG_ALIGN _CMSG_DATA_ALIGN
#  else
     /* round len up to the next multiple of sizeof(long) */
#    define CMSG_ALIGN(len) (((len)+sizeof(long)-1) & ~(sizeof(long)-1))
#  endif
#endif

/** fallback for platforms that do not provide CMSG_LEN */
#if !defined(CMSG_LEN)
#  define CMSG_LEN(len) (CMSG_ALIGN(sizeof(struct cmsghdr))+(len))
#endif

/** fallback for platforms that do not provide CMSG_SPACE */
#if !defined(CMSG_SPACE)
#  if defined(_CMSG_HDR_ALIGN)
#    define CMSG_SPACE(l) (CMSG_ALIGN(l)+_CMSG_HDR_ALIGN(sizeof(struct cmsghdr)))
#  else
#    define CMSG_SPACE(l) (CMSG_ALIGN(l)+CMSG_ALIGN(sizeof(struct cmsghdr)))
#  endif
#endif
79
/** Timeout, in seconds, for reading or writing a query over TCP */
#define TCP_QUERY_TIMEOUT 120

/** number of UDP reads to perform per read indication from select */
#ifdef NONBLOCKING_IS_BROKEN
   /* nonblocking is flagged as broken: one read per indication */
#  define NUM_UDP_PER_SELECT 1
#else
#  define NUM_UDP_PER_SELECT 100
#endif
89
90/* We define libevent structures here to hide the libevent stuff. */
91
92#ifdef USE_MINI_EVENT
93#  ifdef USE_WINSOCK
94#    include "util/winsock_event.h"
95#  else
96#    include "util/mini_event.h"
97#  endif /* USE_WINSOCK */
98#else /* USE_MINI_EVENT */
99   /* we use libevent */
100#  ifdef HAVE_EVENT_H
101#    include <event.h>
102#  else
103#    include "event2/event.h"
104#    include "event2/event_struct.h"
105#    include "event2/event_compat.h"
106#  endif
107#endif /* USE_MINI_EVENT */
108
109/**
110 * The internal event structure for keeping libevent info for the event.
111 * Possibly other structures (list, tree) this is part of.
112 */
113struct internal_event {
114	/** the comm base */
115	struct comm_base* base;
116	/** libevent event type, alloced here */
117	struct event ev;
118};
119
120/**
121 * Internal base structure, so that every thread has its own events.
122 */
123struct internal_base {
124	/** libevent event_base type. */
125	struct event_base* base;
126	/** seconds time pointer points here */
127	time_t secs;
128	/** timeval with current time */
129	struct timeval now;
130	/** the event used for slow_accept timeouts */
131	struct event slow_accept;
132	/** true if slow_accept is enabled */
133	int slow_accept_enabled;
134};
135
136/**
137 * Internal timer structure, to store timer event in.
138 */
139struct internal_timer {
140	/** the comm base */
141	struct comm_base* base;
142	/** libevent event type, alloced here */
143	struct event ev;
144	/** is timer enabled */
145	uint8_t enabled;
146};
147
148/**
149 * Internal signal structure, to store signal event in.
150 */
151struct internal_signal {
152	/** libevent event type, alloced here */
153	struct event ev;
154	/** next in signal list */
155	struct internal_signal* next;
156};
157
158/** create a tcp handler with a parent */
159static struct comm_point* comm_point_create_tcp_handler(
160	struct comm_base *base, struct comm_point* parent, size_t bufsize,
161        comm_point_callback_t* callback, void* callback_arg);
162
163/* -------- End of local definitions -------- */
164
165#ifdef USE_MINI_EVENT
166/** minievent updates the time when it blocks. */
167#define comm_base_now(x) /* nothing to do */
168#else /* !USE_MINI_EVENT */
169/** fillup the time values in the event base */
170static void
171comm_base_now(struct comm_base* b)
172{
173	if(gettimeofday(&b->eb->now, NULL) < 0) {
174		log_err("gettimeofday: %s", strerror(errno));
175	}
176	b->eb->secs = (time_t)b->eb->now.tv_sec;
177}
178#endif /* USE_MINI_EVENT */
179
180struct comm_base*
181comm_base_create(int sigs)
182{
183	struct comm_base* b = (struct comm_base*)calloc(1,
184		sizeof(struct comm_base));
185	if(!b)
186		return NULL;
187	b->eb = (struct internal_base*)calloc(1, sizeof(struct internal_base));
188	if(!b->eb) {
189		free(b);
190		return NULL;
191	}
192#ifdef USE_MINI_EVENT
193	(void)sigs;
194	/* use mini event time-sharing feature */
195	b->eb->base = event_init(&b->eb->secs, &b->eb->now);
196#else
197#  if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
198	/* libev */
199	if(sigs)
200		b->eb->base=(struct event_base *)ev_default_loop(EVFLAG_AUTO);
201	else
202		b->eb->base=(struct event_base *)ev_loop_new(EVFLAG_AUTO);
203#  else
204	(void)sigs;
205#    ifdef HAVE_EVENT_BASE_NEW
206	b->eb->base = event_base_new();
207#    else
208	b->eb->base = event_init();
209#    endif
210#  endif
211#endif
212	if(!b->eb->base) {
213		free(b->eb);
214		free(b);
215		return NULL;
216	}
217	comm_base_now(b);
218	/* avoid event_get_method call which causes crashes even when
219	 * not printing, because its result is passed */
220	verbose(VERB_ALGO,
221#if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
222		"libev"
223#elif defined(USE_MINI_EVENT)
224		"event "
225#else
226		"libevent "
227#endif
228		"%s uses %s method.",
229		event_get_version(),
230#ifdef HAVE_EVENT_BASE_GET_METHOD
231		event_base_get_method(b->eb->base)
232#else
233		"not_obtainable"
234#endif
235	);
236	return b;
237}
238
239struct comm_base*
240comm_base_create_event(struct event_base* base)
241{
242	struct comm_base* b = (struct comm_base*)calloc(1,
243		sizeof(struct comm_base));
244	if(!b)
245		return NULL;
246	b->eb = (struct internal_base*)calloc(1, sizeof(struct internal_base));
247	if(!b->eb) {
248		free(b);
249		return NULL;
250	}
251	b->eb->base = base;
252	comm_base_now(b);
253	return b;
254}
255
256void
257comm_base_delete(struct comm_base* b)
258{
259	if(!b)
260		return;
261	if(b->eb->slow_accept_enabled) {
262		if(event_del(&b->eb->slow_accept) != 0) {
263			log_err("could not event_del slow_accept");
264		}
265	}
266#ifdef USE_MINI_EVENT
267	event_base_free(b->eb->base);
268#elif defined(HAVE_EVENT_BASE_FREE) && defined(HAVE_EVENT_BASE_ONCE)
269	/* only libevent 1.2+ has it, but in 1.2 it is broken -
270	   assertion fails on signal handling ev that is not deleted
271 	   in libevent 1.3c (event_base_once appears) this is fixed. */
272	event_base_free(b->eb->base);
273#endif /* HAVE_EVENT_BASE_FREE and HAVE_EVENT_BASE_ONCE */
274	b->eb->base = NULL;
275	free(b->eb);
276	free(b);
277}
278
279void
280comm_base_delete_no_base(struct comm_base* b)
281{
282	if(!b)
283		return;
284	if(b->eb->slow_accept_enabled) {
285		if(event_del(&b->eb->slow_accept) != 0) {
286			log_err("could not event_del slow_accept");
287		}
288	}
289	b->eb->base = NULL;
290	free(b->eb);
291	free(b);
292}
293
294void
295comm_base_timept(struct comm_base* b, time_t** tt, struct timeval** tv)
296{
297	*tt = &b->eb->secs;
298	*tv = &b->eb->now;
299}
300
301void
302comm_base_dispatch(struct comm_base* b)
303{
304	int retval;
305	retval = event_base_dispatch(b->eb->base);
306	if(retval != 0) {
307		fatal_exit("event_dispatch returned error %d, "
308			"errno is %s", retval, strerror(errno));
309	}
310}
311
312void comm_base_exit(struct comm_base* b)
313{
314	if(event_base_loopexit(b->eb->base, NULL) != 0) {
315		log_err("Could not loopexit");
316	}
317}
318
319void comm_base_set_slow_accept_handlers(struct comm_base* b,
320	void (*stop_acc)(void*), void (*start_acc)(void*), void* arg)
321{
322	b->stop_accept = stop_acc;
323	b->start_accept = start_acc;
324	b->cb_arg = arg;
325}
326
327struct event_base* comm_base_internal(struct comm_base* b)
328{
329	return b->eb->base;
330}
331
332/** see if errno for udp has to be logged or not uses globals */
333static int
334udp_send_errno_needs_log(struct sockaddr* addr, socklen_t addrlen)
335{
336	/* do not log transient errors (unless high verbosity) */
337#if defined(ENETUNREACH) || defined(EHOSTDOWN) || defined(EHOSTUNREACH) || defined(ENETDOWN)
338	switch(errno) {
339#  ifdef ENETUNREACH
340		case ENETUNREACH:
341#  endif
342#  ifdef EHOSTDOWN
343		case EHOSTDOWN:
344#  endif
345#  ifdef EHOSTUNREACH
346		case EHOSTUNREACH:
347#  endif
348#  ifdef ENETDOWN
349		case ENETDOWN:
350#  endif
351			if(verbosity < VERB_ALGO)
352				return 0;
353		default:
354			break;
355	}
356#endif
357	/* permission denied is gotten for every send if the
358	 * network is disconnected (on some OS), squelch it */
359	if(errno == EPERM && verbosity < VERB_DETAIL)
360		return 0;
361	/* squelch errors where people deploy AAAA ::ffff:bla for
362	 * authority servers, which we try for intranets. */
363	if(errno == EINVAL && addr_is_ip4mapped(
364		(struct sockaddr_storage*)addr, addrlen) &&
365		verbosity < VERB_DETAIL)
366		return 0;
367	/* SO_BROADCAST sockopt can give access to 255.255.255.255,
368	 * but a dns cache does not need it. */
369	if(errno == EACCES && addr_is_broadcast(
370		(struct sockaddr_storage*)addr, addrlen) &&
371		verbosity < VERB_DETAIL)
372		return 0;
373	return 1;
374}
375
/** errno of a failed tcp connect is judged by the same rules as udp send */
int tcp_connect_errno_needs_log(struct sockaddr* addr, socklen_t addrlen)
{
	int needs_log = udp_send_errno_needs_log(addr, addrlen);
	return needs_log;
}
380
381/* send a UDP reply */
382int
383comm_point_send_udp_msg(struct comm_point *c, sldns_buffer* packet,
384	struct sockaddr* addr, socklen_t addrlen)
385{
386	ssize_t sent;
387	log_assert(c->fd != -1);
388#ifdef UNBOUND_DEBUG
389	if(sldns_buffer_remaining(packet) == 0)
390		log_err("error: send empty UDP packet");
391#endif
392	log_assert(addr && addrlen > 0);
393	sent = sendto(c->fd, (void*)sldns_buffer_begin(packet),
394		sldns_buffer_remaining(packet), 0,
395		addr, addrlen);
396	if(sent == -1) {
397		if(!udp_send_errno_needs_log(addr, addrlen))
398			return 0;
399#ifndef USE_WINSOCK
400		verbose(VERB_OPS, "sendto failed: %s", strerror(errno));
401#else
402		verbose(VERB_OPS, "sendto failed: %s",
403			wsa_strerror(WSAGetLastError()));
404#endif
405		log_addr(VERB_OPS, "remote address is",
406			(struct sockaddr_storage*)addr, addrlen);
407		return 0;
408	} else if((size_t)sent != sldns_buffer_remaining(packet)) {
409		log_err("sent %d in place of %d bytes",
410			(int)sent, (int)sldns_buffer_remaining(packet));
411		return 0;
412	}
413	return 1;
414}
415
416#if defined(AF_INET6) && defined(IPV6_PKTINFO) && (defined(HAVE_RECVMSG) || defined(HAVE_SENDMSG))
417/** print debug ancillary info */
418static void p_ancil(const char* str, struct comm_reply* r)
419{
420	if(r->srctype != 4 && r->srctype != 6) {
421		log_info("%s: unknown srctype %d", str, r->srctype);
422		return;
423	}
424	if(r->srctype == 6) {
425		char buf[1024];
426		if(inet_ntop(AF_INET6, &r->pktinfo.v6info.ipi6_addr,
427			buf, (socklen_t)sizeof(buf)) == 0) {
428			(void)strlcpy(buf, "(inet_ntop error)", sizeof(buf));
429		}
430		buf[sizeof(buf)-1]=0;
431		log_info("%s: %s %d", str, buf, r->pktinfo.v6info.ipi6_ifindex);
432	} else if(r->srctype == 4) {
433#ifdef IP_PKTINFO
434		char buf1[1024], buf2[1024];
435		if(inet_ntop(AF_INET, &r->pktinfo.v4info.ipi_addr,
436			buf1, (socklen_t)sizeof(buf1)) == 0) {
437			(void)strlcpy(buf1, "(inet_ntop error)", sizeof(buf1));
438		}
439		buf1[sizeof(buf1)-1]=0;
440#ifdef HAVE_STRUCT_IN_PKTINFO_IPI_SPEC_DST
441		if(inet_ntop(AF_INET, &r->pktinfo.v4info.ipi_spec_dst,
442			buf2, (socklen_t)sizeof(buf2)) == 0) {
443			(void)strlcpy(buf2, "(inet_ntop error)", sizeof(buf2));
444		}
445		buf2[sizeof(buf2)-1]=0;
446#else
447		buf2[0]=0;
448#endif
449		log_info("%s: %d %s %s", str, r->pktinfo.v4info.ipi_ifindex,
450			buf1, buf2);
451#elif defined(IP_RECVDSTADDR)
452		char buf1[1024];
453		if(inet_ntop(AF_INET, &r->pktinfo.v4addr,
454			buf1, (socklen_t)sizeof(buf1)) == 0) {
455			(void)strlcpy(buf1, "(inet_ntop error)", sizeof(buf1));
456		}
457		buf1[sizeof(buf1)-1]=0;
458		log_info("%s: %s", str, buf1);
459#endif /* IP_PKTINFO or PI_RECVDSTDADDR */
460	}
461}
462#endif /* AF_INET6 && IPV6_PKTINFO && HAVE_RECVMSG||HAVE_SENDMSG */
463
464/** send a UDP reply over specified interface*/
465static int
466comm_point_send_udp_msg_if(struct comm_point *c, sldns_buffer* packet,
467	struct sockaddr* addr, socklen_t addrlen, struct comm_reply* r)
468{
469#if defined(AF_INET6) && defined(IPV6_PKTINFO) && defined(HAVE_SENDMSG)
470	ssize_t sent;
471	struct msghdr msg;
472	struct iovec iov[1];
473	char control[256];
474#ifndef S_SPLINT_S
475	struct cmsghdr *cmsg;
476#endif /* S_SPLINT_S */
477
478	log_assert(c->fd != -1);
479#ifdef UNBOUND_DEBUG
480	if(sldns_buffer_remaining(packet) == 0)
481		log_err("error: send empty UDP packet");
482#endif
483	log_assert(addr && addrlen > 0);
484
485	msg.msg_name = addr;
486	msg.msg_namelen = addrlen;
487	iov[0].iov_base = sldns_buffer_begin(packet);
488	iov[0].iov_len = sldns_buffer_remaining(packet);
489	msg.msg_iov = iov;
490	msg.msg_iovlen = 1;
491	msg.msg_control = control;
492#ifndef S_SPLINT_S
493	msg.msg_controllen = sizeof(control);
494#endif /* S_SPLINT_S */
495	msg.msg_flags = 0;
496
497#ifndef S_SPLINT_S
498	cmsg = CMSG_FIRSTHDR(&msg);
499	if(r->srctype == 4) {
500#ifdef IP_PKTINFO
501		void* cmsg_data;
502		msg.msg_controllen = CMSG_SPACE(sizeof(struct in_pktinfo));
503		log_assert(msg.msg_controllen <= sizeof(control));
504		cmsg->cmsg_level = IPPROTO_IP;
505		cmsg->cmsg_type = IP_PKTINFO;
506		memmove(CMSG_DATA(cmsg), &r->pktinfo.v4info,
507			sizeof(struct in_pktinfo));
508		/* unset the ifindex to not bypass the routing tables */
509		cmsg_data = CMSG_DATA(cmsg);
510		((struct in_pktinfo *) cmsg_data)->ipi_ifindex = 0;
511		cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_pktinfo));
512#elif defined(IP_SENDSRCADDR)
513		msg.msg_controllen = CMSG_SPACE(sizeof(struct in_addr));
514		log_assert(msg.msg_controllen <= sizeof(control));
515		cmsg->cmsg_level = IPPROTO_IP;
516		cmsg->cmsg_type = IP_SENDSRCADDR;
517		memmove(CMSG_DATA(cmsg), &r->pktinfo.v4addr,
518			sizeof(struct in_addr));
519		cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_addr));
520#else
521		verbose(VERB_ALGO, "no IP_PKTINFO or IP_SENDSRCADDR");
522		msg.msg_control = NULL;
523#endif /* IP_PKTINFO or IP_SENDSRCADDR */
524	} else if(r->srctype == 6) {
525		void* cmsg_data;
526		msg.msg_controllen = CMSG_SPACE(sizeof(struct in6_pktinfo));
527		log_assert(msg.msg_controllen <= sizeof(control));
528		cmsg->cmsg_level = IPPROTO_IPV6;
529		cmsg->cmsg_type = IPV6_PKTINFO;
530		memmove(CMSG_DATA(cmsg), &r->pktinfo.v6info,
531			sizeof(struct in6_pktinfo));
532		/* unset the ifindex to not bypass the routing tables */
533		cmsg_data = CMSG_DATA(cmsg);
534		((struct in6_pktinfo *) cmsg_data)->ipi6_ifindex = 0;
535		cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo));
536	} else {
537		/* try to pass all 0 to use default route */
538		msg.msg_controllen = CMSG_SPACE(sizeof(struct in6_pktinfo));
539		log_assert(msg.msg_controllen <= sizeof(control));
540		cmsg->cmsg_level = IPPROTO_IPV6;
541		cmsg->cmsg_type = IPV6_PKTINFO;
542		memset(CMSG_DATA(cmsg), 0, sizeof(struct in6_pktinfo));
543		cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo));
544	}
545#endif /* S_SPLINT_S */
546	if(verbosity >= VERB_ALGO)
547		p_ancil("send_udp over interface", r);
548	sent = sendmsg(c->fd, &msg, 0);
549	if(sent == -1) {
550		if(!udp_send_errno_needs_log(addr, addrlen))
551			return 0;
552		verbose(VERB_OPS, "sendmsg failed: %s", strerror(errno));
553		log_addr(VERB_OPS, "remote address is",
554			(struct sockaddr_storage*)addr, addrlen);
555		return 0;
556	} else if((size_t)sent != sldns_buffer_remaining(packet)) {
557		log_err("sent %d in place of %d bytes",
558			(int)sent, (int)sldns_buffer_remaining(packet));
559		return 0;
560	}
561	return 1;
562#else
563	(void)c;
564	(void)packet;
565	(void)addr;
566	(void)addrlen;
567	(void)r;
568	log_err("sendmsg: IPV6_PKTINFO not supported");
569	return 0;
570#endif /* AF_INET6 && IPV6_PKTINFO && HAVE_SENDMSG */
571}
572
/**
 * Event callback for a UDP comm point that wants ancillary data: reads
 * up to NUM_UDP_PER_SELECT datagrams with recvmsg, notes the packet info
 * from the control messages, and calls the comm point callback for each.
 * When the callback returns nonzero, the buffer is sent back right away
 * over the same interface.
 */
void
comm_point_udp_ancil_callback(int fd, short event, void* arg)
{
#if defined(AF_INET6) && defined(IPV6_PKTINFO) && defined(HAVE_RECVMSG)
	struct comm_reply rep;
	struct msghdr hdr;
	struct iovec vec[1];
	ssize_t got;
	char ctrl[256];
	int round;
#ifndef S_SPLINT_S
	struct cmsghdr* cm;
#endif /* S_SPLINT_S */

	rep.c = (struct comm_point*)arg;
	log_assert(rep.c->type == comm_udp);

	if((event & EV_READ) == 0)
		return;
	log_assert(rep.c && rep.c->buffer && rep.c->fd == fd);
	comm_base_now(rep.c->ev->base);
	for(round = 0; round < NUM_UDP_PER_SELECT; round++) {
		sldns_buffer_clear(rep.c->buffer);
		rep.addrlen = (socklen_t)sizeof(rep.addr);
		log_assert(fd != -1);
		log_assert(sldns_buffer_remaining(rep.c->buffer) > 0);
		/* payload goes into the comm point buffer */
		vec[0].iov_base = sldns_buffer_begin(rep.c->buffer);
		vec[0].iov_len = sldns_buffer_remaining(rep.c->buffer);
		hdr.msg_iov = vec;
		hdr.msg_iovlen = 1;
		/* sender address goes into the reply info */
		hdr.msg_name = &rep.addr;
		hdr.msg_namelen = (socklen_t)sizeof(rep.addr);
		/* ancillary data goes into the local control buffer */
		hdr.msg_control = ctrl;
#ifndef S_SPLINT_S
		hdr.msg_controllen = sizeof(ctrl);
#endif /* S_SPLINT_S */
		hdr.msg_flags = 0;
		got = recvmsg(fd, &hdr, 0);
		if(got == -1) {
			if(errno != EAGAIN && errno != EINTR) {
				log_err("recvmsg failed: %s", strerror(errno));
			}
			return;
		}
		rep.addrlen = hdr.msg_namelen;
		sldns_buffer_skip(rep.c->buffer, got);
		sldns_buffer_flip(rep.c->buffer);
		rep.srctype = 0;
#ifndef S_SPLINT_S
		/* the first packet info control message found is used */
		for(cm = CMSG_FIRSTHDR(&hdr); cm != NULL;
			cm = CMSG_NXTHDR(&hdr, cm)) {
			if(cm->cmsg_level == IPPROTO_IPV6 &&
				cm->cmsg_type == IPV6_PKTINFO) {
				rep.srctype = 6;
				memmove(&rep.pktinfo.v6info, CMSG_DATA(cm),
					sizeof(struct in6_pktinfo));
				break;
			}
#ifdef IP_PKTINFO
			if(cm->cmsg_level == IPPROTO_IP &&
				cm->cmsg_type == IP_PKTINFO) {
				rep.srctype = 4;
				memmove(&rep.pktinfo.v4info, CMSG_DATA(cm),
					sizeof(struct in_pktinfo));
				break;
			}
#elif defined(IP_RECVDSTADDR)
			if(cm->cmsg_level == IPPROTO_IP &&
				cm->cmsg_type == IP_RECVDSTADDR) {
				rep.srctype = 4;
				memmove(&rep.pktinfo.v4addr, CMSG_DATA(cm),
					sizeof(struct in_addr));
				break;
			}
#endif /* IP_PKTINFO or IP_RECVDSTADDR */
		}
		if(verbosity >= VERB_ALGO)
			p_ancil("receive_udp on interface", &rep);
#endif /* S_SPLINT_S */
		fptr_ok(fptr_whitelist_comm_point(rep.c->callback));
		if((*rep.c->callback)(rep.c, rep.c->cb_arg, NETEVENT_NOERROR, &rep)) {
			/* send back immediate reply */
			(void)comm_point_send_udp_msg_if(rep.c, rep.c->buffer,
				(struct sockaddr*)&rep.addr, rep.addrlen, &rep);
		}
		if(rep.c->fd == -1) /* commpoint closed */
			break;
	}
#else
	(void)fd;
	(void)event;
	(void)arg;
	fatal_exit("recvmsg: No support for IPV6_PKTINFO. "
		"Please disable interface-automatic");
#endif /* AF_INET6 && IPV6_PKTINFO && HAVE_RECVMSG */
}
667
668void
669comm_point_udp_callback(int fd, short event, void* arg)
670{
671	struct comm_reply rep;
672	ssize_t rcv;
673	int i;
674
675	rep.c = (struct comm_point*)arg;
676	log_assert(rep.c->type == comm_udp);
677
678	if(!(event&EV_READ))
679		return;
680	log_assert(rep.c && rep.c->buffer && rep.c->fd == fd);
681	comm_base_now(rep.c->ev->base);
682	for(i=0; i<NUM_UDP_PER_SELECT; i++) {
683		sldns_buffer_clear(rep.c->buffer);
684		rep.addrlen = (socklen_t)sizeof(rep.addr);
685		log_assert(fd != -1);
686		log_assert(sldns_buffer_remaining(rep.c->buffer) > 0);
687		rcv = recvfrom(fd, (void*)sldns_buffer_begin(rep.c->buffer),
688			sldns_buffer_remaining(rep.c->buffer), 0,
689			(struct sockaddr*)&rep.addr, &rep.addrlen);
690		if(rcv == -1) {
691#ifndef USE_WINSOCK
692			if(errno != EAGAIN && errno != EINTR)
693				log_err("recvfrom %d failed: %s",
694					fd, strerror(errno));
695#else
696			if(WSAGetLastError() != WSAEINPROGRESS &&
697				WSAGetLastError() != WSAECONNRESET &&
698				WSAGetLastError()!= WSAEWOULDBLOCK)
699				log_err("recvfrom failed: %s",
700					wsa_strerror(WSAGetLastError()));
701#endif
702			return;
703		}
704		sldns_buffer_skip(rep.c->buffer, rcv);
705		sldns_buffer_flip(rep.c->buffer);
706		rep.srctype = 0;
707		fptr_ok(fptr_whitelist_comm_point(rep.c->callback));
708		if((*rep.c->callback)(rep.c, rep.c->cb_arg, NETEVENT_NOERROR, &rep)) {
709			/* send back immediate reply */
710			(void)comm_point_send_udp_msg(rep.c, rep.c->buffer,
711				(struct sockaddr*)&rep.addr, rep.addrlen);
712		}
713		if(rep.c->fd != fd) /* commpoint closed to -1 or reused for
714		another UDP port. Note rep.c cannot be reused with TCP fd. */
715			break;
716	}
717}
718
719/** Use a new tcp handler for new query fd, set to read query */
720static void
721setup_tcp_handler(struct comm_point* c, int fd)
722{
723	log_assert(c->type == comm_tcp);
724	log_assert(c->fd == -1);
725	sldns_buffer_clear(c->buffer);
726	c->tcp_is_reading = 1;
727	c->tcp_byte_count = 0;
728	comm_point_start_listening(c, fd, TCP_QUERY_TIMEOUT);
729}
730
731void comm_base_handle_slow_accept(int ATTR_UNUSED(fd),
732	short ATTR_UNUSED(event), void* arg)
733{
734	struct comm_base* b = (struct comm_base*)arg;
735	/* timeout for the slow accept, re-enable accepts again */
736	if(b->start_accept) {
737		verbose(VERB_ALGO, "wait is over, slow accept disabled");
738		fptr_ok(fptr_whitelist_start_accept(b->start_accept));
739		(*b->start_accept)(b->cb_arg);
740		b->eb->slow_accept_enabled = 0;
741	}
742}
743
744int comm_point_perform_accept(struct comm_point* c,
745	struct sockaddr_storage* addr, socklen_t* addrlen)
746{
747	int new_fd;
748	*addrlen = (socklen_t)sizeof(*addr);
749	new_fd = accept(c->fd, (struct sockaddr*)addr, addrlen);
750	if(new_fd == -1) {
751#ifndef USE_WINSOCK
752		/* EINTR is signal interrupt. others are closed connection. */
753		if(	errno == EINTR || errno == EAGAIN
754#ifdef EWOULDBLOCK
755			|| errno == EWOULDBLOCK
756#endif
757#ifdef ECONNABORTED
758			|| errno == ECONNABORTED
759#endif
760#ifdef EPROTO
761			|| errno == EPROTO
762#endif /* EPROTO */
763			)
764			return -1;
765#if defined(ENFILE) && defined(EMFILE)
766		if(errno == ENFILE || errno == EMFILE) {
767			/* out of file descriptors, likely outside of our
768			 * control. stop accept() calls for some time */
769			if(c->ev->base->stop_accept) {
770				struct comm_base* b = c->ev->base;
771				struct timeval tv;
772				verbose(VERB_ALGO, "out of file descriptors: "
773					"slow accept");
774				b->eb->slow_accept_enabled = 1;
775				fptr_ok(fptr_whitelist_stop_accept(
776					b->stop_accept));
777				(*b->stop_accept)(b->cb_arg);
778				/* set timeout, no mallocs */
779				tv.tv_sec = NETEVENT_SLOW_ACCEPT_TIME/1000;
780				tv.tv_usec = NETEVENT_SLOW_ACCEPT_TIME%1000;
781				event_set(&b->eb->slow_accept, -1, EV_TIMEOUT,
782					comm_base_handle_slow_accept, b);
783				if(event_base_set(b->eb->base,
784					&b->eb->slow_accept) != 0) {
785					/* we do not want to log here, because
786					 * that would spam the logfiles.
787					 * error: "event_base_set failed." */
788				}
789				if(event_add(&b->eb->slow_accept, &tv) != 0) {
790					/* we do not want to log here,
791					 * error: "event_add failed." */
792				}
793			}
794			return -1;
795		}
796#endif
797		log_err_addr("accept failed", strerror(errno), addr, *addrlen);
798#else /* USE_WINSOCK */
799		if(WSAGetLastError() == WSAEINPROGRESS ||
800			WSAGetLastError() == WSAECONNRESET)
801			return -1;
802		if(WSAGetLastError() == WSAEWOULDBLOCK) {
803			winsock_tcp_wouldblock(&c->ev->ev, EV_READ);
804			return -1;
805		}
806		log_err_addr("accept failed", wsa_strerror(WSAGetLastError()),
807			addr, *addrlen);
808#endif
809		return -1;
810	}
811	fd_set_nonblock(new_fd);
812	return new_fd;
813}
814
815#ifdef USE_WINSOCK
816static long win_bio_cb(BIO *b, int oper, const char* ATTR_UNUSED(argp),
817        int ATTR_UNUSED(argi), long argl, long retvalue)
818{
819	verbose(VERB_ALGO, "bio_cb %d, %s %s %s", oper,
820		(oper&BIO_CB_RETURN)?"return":"before",
821		(oper&BIO_CB_READ)?"read":((oper&BIO_CB_WRITE)?"write":"other"),
822		WSAGetLastError()==WSAEWOULDBLOCK?"wsawb":"");
823	/* on windows, check if previous operation caused EWOULDBLOCK */
824	if( (oper == (BIO_CB_READ|BIO_CB_RETURN) && argl == 0) ||
825		(oper == (BIO_CB_GETS|BIO_CB_RETURN) && argl == 0)) {
826		if(WSAGetLastError() == WSAEWOULDBLOCK)
827			winsock_tcp_wouldblock((struct event*)
828				BIO_get_callback_arg(b), EV_READ);
829	}
830	if( (oper == (BIO_CB_WRITE|BIO_CB_RETURN) && argl == 0) ||
831		(oper == (BIO_CB_PUTS|BIO_CB_RETURN) && argl == 0)) {
832		if(WSAGetLastError() == WSAEWOULDBLOCK)
833			winsock_tcp_wouldblock((struct event*)
834				BIO_get_callback_arg(b), EV_WRITE);
835	}
836	/* return original return value */
837	return retvalue;
838}
839
840/** set win bio callbacks for nonblocking operations */
841void
842comm_point_tcp_win_bio_cb(struct comm_point* c, void* thessl)
843{
844	SSL* ssl = (SSL*)thessl;
845	/* set them both just in case, but usually they are the same BIO */
846	BIO_set_callback(SSL_get_rbio(ssl), &win_bio_cb);
847	BIO_set_callback_arg(SSL_get_rbio(ssl), (char*)&c->ev->ev);
848	BIO_set_callback(SSL_get_wbio(ssl), &win_bio_cb);
849	BIO_set_callback_arg(SSL_get_wbio(ssl), (char*)&c->ev->ev);
850}
851#endif
852
853void
854comm_point_tcp_accept_callback(int fd, short event, void* arg)
855{
856	struct comm_point* c = (struct comm_point*)arg, *c_hdl;
857	int new_fd;
858	log_assert(c->type == comm_tcp_accept);
859	if(!(event & EV_READ)) {
860		log_info("ignoring tcp accept event %d", (int)event);
861		return;
862	}
863	comm_base_now(c->ev->base);
864	/* find free tcp handler. */
865	if(!c->tcp_free) {
866		log_warn("accepted too many tcp, connections full");
867		return;
868	}
869	/* accept incoming connection. */
870	c_hdl = c->tcp_free;
871	log_assert(fd != -1);
872	new_fd = comm_point_perform_accept(c, &c_hdl->repinfo.addr,
873		&c_hdl->repinfo.addrlen);
874	if(new_fd == -1)
875		return;
876	if(c->ssl) {
877		c_hdl->ssl = incoming_ssl_fd(c->ssl, new_fd);
878		if(!c_hdl->ssl) {
879			c_hdl->fd = new_fd;
880			comm_point_close(c_hdl);
881			return;
882		}
883		c_hdl->ssl_shake_state = comm_ssl_shake_read;
884#ifdef USE_WINSOCK
885		comm_point_tcp_win_bio_cb(c_hdl, c_hdl->ssl);
886#endif
887	}
888
889	/* grab the tcp handler buffers */
890	c->cur_tcp_count++;
891	c->tcp_free = c_hdl->tcp_free;
892	if(!c->tcp_free) {
893		/* stop accepting incoming queries for now. */
894		comm_point_stop_listening(c);
895	}
896	setup_tcp_handler(c_hdl, new_fd);
897}
898
899/** Make tcp handler free for next assignment */
900static void
901reclaim_tcp_handler(struct comm_point* c)
902{
903	log_assert(c->type == comm_tcp);
904	if(c->ssl) {
905#ifdef HAVE_SSL
906		SSL_shutdown(c->ssl);
907		SSL_free(c->ssl);
908		c->ssl = NULL;
909#endif
910	}
911	comm_point_close(c);
912	if(c->tcp_parent) {
913		c->tcp_parent->cur_tcp_count--;
914		c->tcp_free = c->tcp_parent->tcp_free;
915		c->tcp_parent->tcp_free = c;
916		if(!c->tcp_free) {
917			/* re-enable listening on accept socket */
918			comm_point_start_listening(c->tcp_parent, -1, -1);
919		}
920	}
921}
922
923/** do the callback when writing is done */
924static void
925tcp_callback_writer(struct comm_point* c)
926{
927	log_assert(c->type == comm_tcp);
928	sldns_buffer_clear(c->buffer);
929	if(c->tcp_do_toggle_rw)
930		c->tcp_is_reading = 1;
931	c->tcp_byte_count = 0;
932	/* switch from listening(write) to listening(read) */
933	comm_point_stop_listening(c);
934	comm_point_start_listening(c, -1, -1);
935}
936
937/** do the callback when reading is done */
938static void
939tcp_callback_reader(struct comm_point* c)
940{
941	log_assert(c->type == comm_tcp || c->type == comm_local);
942	sldns_buffer_flip(c->buffer);
943	if(c->tcp_do_toggle_rw)
944		c->tcp_is_reading = 0;
945	c->tcp_byte_count = 0;
946	if(c->type == comm_tcp)
947		comm_point_stop_listening(c);
948	fptr_ok(fptr_whitelist_comm_point(c->callback));
949	if( (*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, &c->repinfo) ) {
950		comm_point_start_listening(c, -1, TCP_QUERY_TIMEOUT);
951	}
952}
953
/**
 * continue ssl handshake
 * Returns 1 if the connection can continue (the handshake is done or
 * waits for more read or write events), 0 if it is closed or failed.
 */
#ifdef HAVE_SSL
static int
ssl_handshake(struct comm_point* c)
{
	int ret, err;

	if(c->ssl_shake_state == comm_ssl_shake_hs_read ||
		c->ssl_shake_state == comm_ssl_shake_hs_write) {
		/* hs_read: read condition satisfied back to writing;
		 * hs_write: write condition satisfied, back to reading */
		comm_point_listen_for_rw(c, 1,
			c->ssl_shake_state == comm_ssl_shake_hs_read);
		c->ssl_shake_state = comm_ssl_shake_none;
		return 1;
	}

	ERR_clear_error();
	ret = SSL_do_handshake(c->ssl);
	if(ret == 1) {
		/* this is where peer verification could take place */
		log_addr(VERB_ALGO, "SSL DNS connection", &c->repinfo.addr,
			c->repinfo.addrlen);

		/* setup listen rw correctly */
		if(!c->tcp_is_reading)
			comm_point_listen_for_rw(c, 1, 1);
		else if(c->ssl_shake_state != comm_ssl_shake_read)
			comm_point_listen_for_rw(c, 1, 0);
		c->ssl_shake_state = comm_ssl_shake_none;
		return 1;
	}

	/* the handshake is not done, find out why */
	err = SSL_get_error(c->ssl, ret);
	if(err == SSL_ERROR_WANT_READ) {
		/* only change the listening if not waiting for read yet */
		if(c->ssl_shake_state != comm_ssl_shake_read) {
			c->ssl_shake_state = comm_ssl_shake_read;
			comm_point_listen_for_rw(c, 1, 0);
		}
		return 1;
	}
	if(err == SSL_ERROR_WANT_WRITE) {
		/* only change the listening if not waiting for write yet */
		if(c->ssl_shake_state != comm_ssl_shake_write) {
			c->ssl_shake_state = comm_ssl_shake_write;
			comm_point_listen_for_rw(c, 0, 1);
		}
		return 1;
	}
	if(ret == 0)
		return 0; /* closed */
	if(err == SSL_ERROR_SYSCALL) {
		/* SYSCALL and errno==0 means closed uncleanly */
		if(errno != 0)
			log_err("SSL_handshake syscall: %s",
				strerror(errno));
		return 0;
	}
	log_crypto_err("ssl handshake failed");
	log_addr(1, "ssl handshake failed", &c->repinfo.addr,
		c->repinfo.addrlen);
	return 0;
}
#endif /* HAVE_SSL */
1019
/**
 * SSL read callback on TCP.
 * A pending handshake step is completed first. After that the two byte
 * length prefix is read, followed by the message bytes, into c->buffer.
 * @param c: comm point that has an ssl session.
 * @return: 0 on error or closed connection, 1 otherwise (also when the
 *	read has to be continued later).
 */
static int
ssl_handle_read(struct comm_point* c)
{
#ifdef HAVE_SSL
	int nread;
	if(c->ssl_shake_state != comm_ssl_shake_none) {
		if(!ssl_handshake(c))
			return 0;
		/* handshake not done yet, wait for the next event */
		if(c->ssl_shake_state != comm_ssl_shake_none)
			return 1;
	}
	if(c->tcp_byte_count < sizeof(uint16_t)) {
		uint16_t msglen;
		/* the length prefix is still incomplete */
		ERR_clear_error();
		nread = SSL_read(c->ssl,
			(void*)sldns_buffer_at(c->buffer, c->tcp_byte_count),
			(int)(sizeof(uint16_t) - c->tcp_byte_count));
		if(nread <= 0) {
			switch(SSL_get_error(c->ssl, nread)) {
			case SSL_ERROR_ZERO_RETURN:
				return 0; /* shutdown, closed */
			case SSL_ERROR_WANT_READ:
				return 1; /* read more later */
			case SSL_ERROR_WANT_WRITE:
				/* the read needs a write condition first */
				c->ssl_shake_state = comm_ssl_shake_hs_write;
				comm_point_listen_for_rw(c, 0, 1);
				return 1;
			case SSL_ERROR_SYSCALL:
				if(errno != 0)
					log_err("SSL_read syscall: %s",
						strerror(errno));
				return 0;
			default:
				break;
			}
			log_crypto_err("could not SSL_read");
			return 0;
		}
		c->tcp_byte_count += nread;
		if(c->tcp_byte_count != sizeof(uint16_t))
			return 1;
		msglen = sldns_buffer_read_u16_at(c->buffer, 0);
		if(msglen > sldns_buffer_capacity(c->buffer)) {
			verbose(VERB_QUERY, "ssl: dropped larger than buffer");
			return 0;
		}
		sldns_buffer_set_limit(c->buffer, msglen);
		if(sldns_buffer_limit(c->buffer) < LDNS_HEADER_SIZE) {
			verbose(VERB_QUERY, "ssl: dropped bogus too short.");
			return 0;
		}
		verbose(VERB_ALGO, "Reading ssl tcp query of length %d",
			(int)sldns_buffer_limit(c->buffer));
	}
	/* read (more of) the message itself */
	log_assert(sldns_buffer_remaining(c->buffer) > 0);
	ERR_clear_error();
	nread = SSL_read(c->ssl, (void*)sldns_buffer_current(c->buffer),
		(int)sldns_buffer_remaining(c->buffer));
	if(nread <= 0) {
		switch(SSL_get_error(c->ssl, nread)) {
		case SSL_ERROR_ZERO_RETURN:
			return 0; /* shutdown, closed */
		case SSL_ERROR_WANT_READ:
			return 1; /* read more later */
		case SSL_ERROR_WANT_WRITE:
			c->ssl_shake_state = comm_ssl_shake_hs_write;
			comm_point_listen_for_rw(c, 0, 1);
			return 1;
		case SSL_ERROR_SYSCALL:
			if(errno != 0)
				log_err("SSL_read syscall: %s",
					strerror(errno));
			return 0;
		default:
			break;
		}
		log_crypto_err("could not SSL_read");
		return 0;
	}
	sldns_buffer_skip(c->buffer, (ssize_t)nread);
	if(sldns_buffer_remaining(c->buffer) <= 0) {
		/* message complete */
		tcp_callback_reader(c);
	}
	return 1;
#else
	(void)c;
	return 0;
#endif /* HAVE_SSL */
}
1106
/**
 * SSL write callback on TCP.
 * A pending handshake step is completed first. After that the two byte
 * length prefix is written, followed by the contents of c->buffer.
 * @param c: comm point that has an ssl session.
 * @return: 0 on error or closed connection, 1 otherwise (also when the
 *	write has to be continued later).
 */
static int
ssl_handle_write(struct comm_point* c)
{
#ifdef HAVE_SSL
	int nwritten;
	if(c->ssl_shake_state != comm_ssl_shake_none) {
		if(!ssl_handshake(c))
			return 0;
		/* handshake not done yet, wait for the next event */
		if(c->ssl_shake_state != comm_ssl_shake_none)
			return 1;
	}
	/* ignore return, if fails we may simply block */
	(void)SSL_set_mode(c->ssl, SSL_MODE_ENABLE_PARTIAL_WRITE);
	if(c->tcp_byte_count < sizeof(uint16_t)) {
		/* (rest of) the length prefix, in network byte order */
		uint16_t len = htons(sldns_buffer_limit(c->buffer));
		ERR_clear_error();
		nwritten = SSL_write(c->ssl,
			(void*)(((uint8_t*)&len)+c->tcp_byte_count),
			(int)(sizeof(uint16_t)-c->tcp_byte_count));
		if(nwritten <= 0) {
			switch(SSL_get_error(c->ssl, nwritten)) {
			case SSL_ERROR_ZERO_RETURN:
				return 0; /* closed */
			case SSL_ERROR_WANT_READ:
				/* wait for read condition */
				c->ssl_shake_state = comm_ssl_shake_read;
				comm_point_listen_for_rw(c, 1, 0);
				return 1;
			case SSL_ERROR_WANT_WRITE:
				return 1; /* write more later */
			case SSL_ERROR_SYSCALL:
				if(errno != 0)
					log_err("SSL_write syscall: %s",
						strerror(errno));
				return 0;
			default:
				break;
			}
			log_crypto_err("could not SSL_write");
			return 0;
		}
		c->tcp_byte_count += nwritten;
		if(c->tcp_byte_count < sizeof(uint16_t))
			return 1;
		sldns_buffer_set_position(c->buffer, c->tcp_byte_count -
			sizeof(uint16_t));
		if(sldns_buffer_remaining(c->buffer) == 0) {
			tcp_callback_writer(c);
			return 1;
		}
	}
	/* write (more of) the message itself */
	log_assert(sldns_buffer_remaining(c->buffer) > 0);
	ERR_clear_error();
	nwritten = SSL_write(c->ssl, (void*)sldns_buffer_current(c->buffer),
		(int)sldns_buffer_remaining(c->buffer));
	if(nwritten <= 0) {
		switch(SSL_get_error(c->ssl, nwritten)) {
		case SSL_ERROR_ZERO_RETURN:
			return 0; /* closed */
		case SSL_ERROR_WANT_READ:
			/* wait for read condition */
			c->ssl_shake_state = comm_ssl_shake_read;
			comm_point_listen_for_rw(c, 1, 0);
			return 1;
		case SSL_ERROR_WANT_WRITE:
			return 1; /* write more later */
		case SSL_ERROR_SYSCALL:
			if(errno != 0)
				log_err("SSL_write syscall: %s",
					strerror(errno));
			return 0;
		default:
			break;
		}
		log_crypto_err("could not SSL_write");
		return 0;
	}
	sldns_buffer_skip(c->buffer, (ssize_t)nwritten);

	if(sldns_buffer_remaining(c->buffer) == 0) {
		/* everything has been written */
		tcp_callback_writer(c);
	}
	return 1;
#else
	(void)c;
	return 0;
#endif /* HAVE_SSL */
}
1190
1191/** handle ssl tcp connection with dns contents */
1192static int
1193ssl_handle_it(struct comm_point* c)
1194{
1195	if(c->tcp_is_reading)
1196		return ssl_handle_read(c);
1197	return ssl_handle_write(c);
1198}
1199
/** Handle tcp reading callback.
 * Reads the two byte length prefix first (possibly over several calls,
 * tracked in c->tcp_byte_count), then the message bytes into c->buffer.
 * When the buffer has been filled up to its limit tcp_callback_reader is
 * called. Comm points with an ssl session are handed to ssl_handle_it.
 * @param fd: file descriptor of socket.
 * @param c: comm point to read from into buffer.
 * @param short_ok: if true, very short packets are OK (for comm_local).
 * @return: 0 on error or when recv reports 0 bytes; 1 otherwise, also
 * 	when the read has to be continued on a later event.
 */
static int
comm_point_tcp_handle_read(int fd, struct comm_point* c, int short_ok)
{
	ssize_t r;
	log_assert(c->type == comm_tcp || c->type == comm_local);
	if(c->ssl)
		return ssl_handle_it(c);
	/* a read event while the comm point is in writing mode is an error */
	if(!c->tcp_is_reading)
		return 0;

	log_assert(fd != -1);
	if(c->tcp_byte_count < sizeof(uint16_t)) {
		/* read length bytes */
		r = recv(fd,(void*)sldns_buffer_at(c->buffer,c->tcp_byte_count),
			sizeof(uint16_t)-c->tcp_byte_count, 0);
		if(r == 0)
			return 0;
		else if(r == -1) {
#ifndef USE_WINSOCK
			/* interrupted or nothing available: try again later */
			if(errno == EINTR || errno == EAGAIN)
				return 1;
#ifdef ECONNRESET
			if(errno == ECONNRESET && verbosity < 2)
				return 0; /* silence reset by peer */
#endif
			log_err_addr("read (in tcp s)", strerror(errno),
				&c->repinfo.addr, c->repinfo.addrlen);
#else /* USE_WINSOCK */
			if(WSAGetLastError() == WSAECONNRESET)
				return 0;
			if(WSAGetLastError() == WSAEINPROGRESS)
				return 1;
			if(WSAGetLastError() == WSAEWOULDBLOCK) {
				winsock_tcp_wouldblock(&c->ev->ev, EV_READ);
				return 1;
			}
			log_err_addr("read (in tcp s)",
				wsa_strerror(WSAGetLastError()),
				&c->repinfo.addr, c->repinfo.addrlen);
#endif
			return 0;
		}
		c->tcp_byte_count += r;
		/* length prefix still incomplete, wait for more */
		if(c->tcp_byte_count != sizeof(uint16_t))
			return 1;
		/* the announced length must fit in the buffer */
		if(sldns_buffer_read_u16_at(c->buffer, 0) >
			sldns_buffer_capacity(c->buffer)) {
			verbose(VERB_QUERY, "tcp: dropped larger than buffer");
			return 0;
		}
		/* the limit makes 'remaining' count the message bytes to read */
		sldns_buffer_set_limit(c->buffer,
			sldns_buffer_read_u16_at(c->buffer, 0));
		if(!short_ok &&
			sldns_buffer_limit(c->buffer) < LDNS_HEADER_SIZE) {
			verbose(VERB_QUERY, "tcp: dropped bogus too short.");
			return 0;
		}
		verbose(VERB_ALGO, "Reading tcp query of length %d",
			(int)sldns_buffer_limit(c->buffer));
	}

	/* read (more of) the message itself */
	log_assert(sldns_buffer_remaining(c->buffer) > 0);
	r = recv(fd, (void*)sldns_buffer_current(c->buffer),
		sldns_buffer_remaining(c->buffer), 0);
	if(r == 0) {
		return 0;
	} else if(r == -1) {
#ifndef USE_WINSOCK
		if(errno == EINTR || errno == EAGAIN)
			return 1;
		/* NOTE(review): unlike the length read above, a reset by peer
		 * is always logged here, confirm that this is intended */
		log_err_addr("read (in tcp r)", strerror(errno),
			&c->repinfo.addr, c->repinfo.addrlen);
#else /* USE_WINSOCK */
		if(WSAGetLastError() == WSAECONNRESET)
			return 0;
		if(WSAGetLastError() == WSAEINPROGRESS)
			return 1;
		if(WSAGetLastError() == WSAEWOULDBLOCK) {
			winsock_tcp_wouldblock(&c->ev->ev, EV_READ);
			return 1;
		}
		log_err_addr("read (in tcp r)",
			wsa_strerror(WSAGetLastError()),
			&c->repinfo.addr, c->repinfo.addrlen);
#endif
		return 0;
	}
	sldns_buffer_skip(c->buffer, r);
	/* all announced bytes are in: hand the message over */
	if(sldns_buffer_remaining(c->buffer) <= 0) {
		tcp_callback_reader(c);
	}
	return 1;
}
1299
/**
 * Handle tcp writing callback.
 * On the first write of a comm point with tcp_check_nb_connect set, the
 * pending socket error is examined first. Then the two byte length prefix
 * is written (tracked in c->tcp_byte_count), followed by the contents of
 * c->buffer. When nothing remains tcp_callback_writer is called. Comm
 * points with an ssl session are handed to ssl_handle_it.
 * @param fd: file descriptor of socket.
 * @param c: comm point to write buffer out of.
 * @return: 0 on error; 1 otherwise, also when the write has to be
 * 	continued on a later event.
 */
static int
comm_point_tcp_handle_write(int fd, struct comm_point* c)
{
	ssize_t r;
	log_assert(c->type == comm_tcp);
	/* a write event in reading mode is only acceptable for ssl */
	if(c->tcp_is_reading && !c->ssl)
		return 0;
	log_assert(fd != -1);
	if(c->tcp_byte_count == 0 && c->tcp_check_nb_connect) {
		/* check for pending error from nonblocking connect */
		/* from Stevens, unix network programming, vol1, 3rd ed, p450*/
		int error = 0;
		socklen_t len = (socklen_t)sizeof(error);
		if(getsockopt(fd, SOL_SOCKET, SO_ERROR, (void*)&error,
			&len) < 0){
#ifndef USE_WINSOCK
			error = errno; /* on solaris errno is error */
#else /* USE_WINSOCK */
			error = WSAGetLastError();
#endif
		}
		/* both preprocessor branches below open the final
		 * 'else if(error != 0) {' block; it is closed once, after
		 * the #endif */
#ifndef USE_WINSOCK
#if defined(EINPROGRESS) && defined(EWOULDBLOCK)
		if(error == EINPROGRESS || error == EWOULDBLOCK)
			return 1; /* try again later */
		else
#endif
		if(error != 0 && verbosity < 2)
			return 0; /* silence lots of chatter in the logs */
                else if(error != 0) {
			log_err_addr("tcp connect", strerror(error),
				&c->repinfo.addr, c->repinfo.addrlen);
#else /* USE_WINSOCK */
		/* examine error */
		if(error == WSAEINPROGRESS)
			return 1;
		else if(error == WSAEWOULDBLOCK) {
			winsock_tcp_wouldblock(&c->ev->ev, EV_WRITE);
			return 1;
		} else if(error != 0 && verbosity < 2)
			return 0;
		else if(error != 0) {
			log_err_addr("tcp connect", wsa_strerror(error),
				&c->repinfo.addr, c->repinfo.addrlen);
#endif /* USE_WINSOCK */
			return 0;
		}
	}
	if(c->ssl)
		return ssl_handle_it(c);

	if(c->tcp_byte_count < sizeof(uint16_t)) {
		/* length prefix in network byte order */
		uint16_t len = htons(sldns_buffer_limit(c->buffer));
#ifdef HAVE_WRITEV
		/* offer the rest of the prefix and the whole message in one
		 * call; r can then be larger than the prefix bytes that
		 * were left */
		struct iovec iov[2];
		iov[0].iov_base = (uint8_t*)&len + c->tcp_byte_count;
		iov[0].iov_len = sizeof(uint16_t) - c->tcp_byte_count;
		iov[1].iov_base = sldns_buffer_begin(c->buffer);
		iov[1].iov_len = sldns_buffer_limit(c->buffer);
		log_assert(iov[0].iov_len > 0);
		log_assert(iov[1].iov_len > 0);
		r = writev(fd, iov, 2);
#else /* HAVE_WRITEV */
		r = send(fd, (void*)(((uint8_t*)&len)+c->tcp_byte_count),
			sizeof(uint16_t)-c->tcp_byte_count, 0);
#endif /* HAVE_WRITEV */
		if(r == -1) {
#ifndef USE_WINSOCK
#  ifdef EPIPE
                	if(errno == EPIPE && verbosity < 2)
                        	return 0; /* silence 'broken pipe' */
  #endif
			/* interrupted or would block: try again later */
			if(errno == EINTR || errno == EAGAIN)
				return 1;
#  ifdef HAVE_WRITEV
			log_err_addr("tcp writev", strerror(errno),
				&c->repinfo.addr, c->repinfo.addrlen);
#  else /* HAVE_WRITEV */
			log_err_addr("tcp send s", strerror(errno),
				&c->repinfo.addr, c->repinfo.addrlen);
#  endif /* HAVE_WRITEV */
#else
			if(WSAGetLastError() == WSAENOTCONN)
				return 1;
			if(WSAGetLastError() == WSAEINPROGRESS)
				return 1;
			if(WSAGetLastError() == WSAEWOULDBLOCK) {
				winsock_tcp_wouldblock(&c->ev->ev, EV_WRITE);
				return 1;
			}
			log_err_addr("tcp send s",
				wsa_strerror(WSAGetLastError()),
				&c->repinfo.addr, c->repinfo.addrlen);
#endif
			return 0;
		}
		c->tcp_byte_count += r;
		/* length prefix not completely sent yet */
		if(c->tcp_byte_count < sizeof(uint16_t))
			return 1;
		/* bytes counted beyond the prefix are message bytes that
		 * have been written already */
		sldns_buffer_set_position(c->buffer, c->tcp_byte_count -
			sizeof(uint16_t));
		if(sldns_buffer_remaining(c->buffer) == 0) {
			tcp_callback_writer(c);
			return 1;
		}
	}
	/* write (more of) the message itself */
	log_assert(sldns_buffer_remaining(c->buffer) > 0);
	r = send(fd, (void*)sldns_buffer_current(c->buffer),
		sldns_buffer_remaining(c->buffer), 0);
	if(r == -1) {
#ifndef USE_WINSOCK
		if(errno == EINTR || errno == EAGAIN)
			return 1;
		log_err_addr("tcp send r", strerror(errno),
			&c->repinfo.addr, c->repinfo.addrlen);
#else
		if(WSAGetLastError() == WSAEINPROGRESS)
			return 1;
		if(WSAGetLastError() == WSAEWOULDBLOCK) {
			winsock_tcp_wouldblock(&c->ev->ev, EV_WRITE);
			return 1;
		}
		log_err_addr("tcp send r", wsa_strerror(WSAGetLastError()),
			&c->repinfo.addr, c->repinfo.addrlen);
#endif
		return 0;
	}
	sldns_buffer_skip(c->buffer, r);

	/* everything has been written */
	if(sldns_buffer_remaining(c->buffer) == 0) {
		tcp_callback_writer(c);
	}

	return 1;
}
1441
1442void
1443comm_point_tcp_handle_callback(int fd, short event, void* arg)
1444{
1445	struct comm_point* c = (struct comm_point*)arg;
1446	log_assert(c->type == comm_tcp);
1447	comm_base_now(c->ev->base);
1448
1449	if(event&EV_READ) {
1450		if(!comm_point_tcp_handle_read(fd, c, 0)) {
1451			reclaim_tcp_handler(c);
1452			if(!c->tcp_do_close) {
1453				fptr_ok(fptr_whitelist_comm_point(
1454					c->callback));
1455				(void)(*c->callback)(c, c->cb_arg,
1456					NETEVENT_CLOSED, NULL);
1457			}
1458		}
1459		return;
1460	}
1461	if(event&EV_WRITE) {
1462		if(!comm_point_tcp_handle_write(fd, c)) {
1463			reclaim_tcp_handler(c);
1464			if(!c->tcp_do_close) {
1465				fptr_ok(fptr_whitelist_comm_point(
1466					c->callback));
1467				(void)(*c->callback)(c, c->cb_arg,
1468					NETEVENT_CLOSED, NULL);
1469			}
1470		}
1471		return;
1472	}
1473	if(event&EV_TIMEOUT) {
1474		verbose(VERB_QUERY, "tcp took too long, dropped");
1475		reclaim_tcp_handler(c);
1476		if(!c->tcp_do_close) {
1477			fptr_ok(fptr_whitelist_comm_point(c->callback));
1478			(void)(*c->callback)(c, c->cb_arg,
1479				NETEVENT_TIMEOUT, NULL);
1480		}
1481		return;
1482	}
1483	log_err("Ignored event %d for tcphdl.", event);
1484}
1485
1486void comm_point_local_handle_callback(int fd, short event, void* arg)
1487{
1488	struct comm_point* c = (struct comm_point*)arg;
1489	log_assert(c->type == comm_local);
1490	comm_base_now(c->ev->base);
1491
1492	if(event&EV_READ) {
1493		if(!comm_point_tcp_handle_read(fd, c, 1)) {
1494			fptr_ok(fptr_whitelist_comm_point(c->callback));
1495			(void)(*c->callback)(c, c->cb_arg, NETEVENT_CLOSED,
1496				NULL);
1497		}
1498		return;
1499	}
1500	log_err("Ignored event %d for localhdl.", event);
1501}
1502
1503void comm_point_raw_handle_callback(int ATTR_UNUSED(fd),
1504	short event, void* arg)
1505{
1506	struct comm_point* c = (struct comm_point*)arg;
1507	int err = NETEVENT_NOERROR;
1508	log_assert(c->type == comm_raw);
1509	comm_base_now(c->ev->base);
1510
1511	if(event&EV_TIMEOUT)
1512		err = NETEVENT_TIMEOUT;
1513	fptr_ok(fptr_whitelist_comm_point_raw(c->callback));
1514	(void)(*c->callback)(c, c->cb_arg, err, NULL);
1515}
1516
1517struct comm_point*
1518comm_point_create_udp(struct comm_base *base, int fd, sldns_buffer* buffer,
1519	comm_point_callback_t* callback, void* callback_arg)
1520{
1521	struct comm_point* c = (struct comm_point*)calloc(1,
1522		sizeof(struct comm_point));
1523	short evbits;
1524	if(!c)
1525		return NULL;
1526	c->ev = (struct internal_event*)calloc(1,
1527		sizeof(struct internal_event));
1528	if(!c->ev) {
1529		free(c);
1530		return NULL;
1531	}
1532	c->ev->base = base;
1533	c->fd = fd;
1534	c->buffer = buffer;
1535	c->timeout = NULL;
1536	c->tcp_is_reading = 0;
1537	c->tcp_byte_count = 0;
1538	c->tcp_parent = NULL;
1539	c->max_tcp_count = 0;
1540	c->cur_tcp_count = 0;
1541	c->tcp_handlers = NULL;
1542	c->tcp_free = NULL;
1543	c->type = comm_udp;
1544	c->tcp_do_close = 0;
1545	c->do_not_close = 0;
1546	c->tcp_do_toggle_rw = 0;
1547	c->tcp_check_nb_connect = 0;
1548	c->inuse = 0;
1549	c->callback = callback;
1550	c->cb_arg = callback_arg;
1551	evbits = EV_READ | EV_PERSIST;
1552	/* libevent stuff */
1553	event_set(&c->ev->ev, c->fd, evbits, comm_point_udp_callback, c);
1554	if(event_base_set(base->eb->base, &c->ev->ev) != 0) {
1555		log_err("could not baseset udp event");
1556		comm_point_delete(c);
1557		return NULL;
1558	}
1559	if(fd!=-1 && event_add(&c->ev->ev, c->timeout) != 0 ) {
1560		log_err("could not add udp event");
1561		comm_point_delete(c);
1562		return NULL;
1563	}
1564	return c;
1565}
1566
1567struct comm_point*
1568comm_point_create_udp_ancil(struct comm_base *base, int fd,
1569	sldns_buffer* buffer,
1570	comm_point_callback_t* callback, void* callback_arg)
1571{
1572	struct comm_point* c = (struct comm_point*)calloc(1,
1573		sizeof(struct comm_point));
1574	short evbits;
1575	if(!c)
1576		return NULL;
1577	c->ev = (struct internal_event*)calloc(1,
1578		sizeof(struct internal_event));
1579	if(!c->ev) {
1580		free(c);
1581		return NULL;
1582	}
1583	c->ev->base = base;
1584	c->fd = fd;
1585	c->buffer = buffer;
1586	c->timeout = NULL;
1587	c->tcp_is_reading = 0;
1588	c->tcp_byte_count = 0;
1589	c->tcp_parent = NULL;
1590	c->max_tcp_count = 0;
1591	c->cur_tcp_count = 0;
1592	c->tcp_handlers = NULL;
1593	c->tcp_free = NULL;
1594	c->type = comm_udp;
1595	c->tcp_do_close = 0;
1596	c->do_not_close = 0;
1597	c->inuse = 0;
1598	c->tcp_do_toggle_rw = 0;
1599	c->tcp_check_nb_connect = 0;
1600	c->callback = callback;
1601	c->cb_arg = callback_arg;
1602	evbits = EV_READ | EV_PERSIST;
1603	/* libevent stuff */
1604	event_set(&c->ev->ev, c->fd, evbits, comm_point_udp_ancil_callback, c);
1605	if(event_base_set(base->eb->base, &c->ev->ev) != 0) {
1606		log_err("could not baseset udp event");
1607		comm_point_delete(c);
1608		return NULL;
1609	}
1610	if(fd!=-1 && event_add(&c->ev->ev, c->timeout) != 0 ) {
1611		log_err("could not add udp event");
1612		comm_point_delete(c);
1613		return NULL;
1614	}
1615	return c;
1616}
1617
1618static struct comm_point*
1619comm_point_create_tcp_handler(struct comm_base *base,
1620	struct comm_point* parent, size_t bufsize,
1621        comm_point_callback_t* callback, void* callback_arg)
1622{
1623	struct comm_point* c = (struct comm_point*)calloc(1,
1624		sizeof(struct comm_point));
1625	short evbits;
1626	if(!c)
1627		return NULL;
1628	c->ev = (struct internal_event*)calloc(1,
1629		sizeof(struct internal_event));
1630	if(!c->ev) {
1631		free(c);
1632		return NULL;
1633	}
1634	c->ev->base = base;
1635	c->fd = -1;
1636	c->buffer = sldns_buffer_new(bufsize);
1637	if(!c->buffer) {
1638		free(c->ev);
1639		free(c);
1640		return NULL;
1641	}
1642	c->timeout = (struct timeval*)malloc(sizeof(struct timeval));
1643	if(!c->timeout) {
1644		sldns_buffer_free(c->buffer);
1645		free(c->ev);
1646		free(c);
1647		return NULL;
1648	}
1649	c->tcp_is_reading = 0;
1650	c->tcp_byte_count = 0;
1651	c->tcp_parent = parent;
1652	c->max_tcp_count = 0;
1653	c->cur_tcp_count = 0;
1654	c->tcp_handlers = NULL;
1655	c->tcp_free = NULL;
1656	c->type = comm_tcp;
1657	c->tcp_do_close = 0;
1658	c->do_not_close = 0;
1659	c->tcp_do_toggle_rw = 1;
1660	c->tcp_check_nb_connect = 0;
1661	c->repinfo.c = c;
1662	c->callback = callback;
1663	c->cb_arg = callback_arg;
1664	/* add to parent free list */
1665	c->tcp_free = parent->tcp_free;
1666	parent->tcp_free = c;
1667	/* libevent stuff */
1668	evbits = EV_PERSIST | EV_READ | EV_TIMEOUT;
1669	event_set(&c->ev->ev, c->fd, evbits, comm_point_tcp_handle_callback, c);
1670	if(event_base_set(base->eb->base, &c->ev->ev) != 0)
1671	{
1672		log_err("could not basetset tcphdl event");
1673		parent->tcp_free = c->tcp_free;
1674		free(c->ev);
1675		free(c);
1676		return NULL;
1677	}
1678	return c;
1679}
1680
1681struct comm_point*
1682comm_point_create_tcp(struct comm_base *base, int fd, int num, size_t bufsize,
1683        comm_point_callback_t* callback, void* callback_arg)
1684{
1685	struct comm_point* c = (struct comm_point*)calloc(1,
1686		sizeof(struct comm_point));
1687	short evbits;
1688	int i;
1689	/* first allocate the TCP accept listener */
1690	if(!c)
1691		return NULL;
1692	c->ev = (struct internal_event*)calloc(1,
1693		sizeof(struct internal_event));
1694	if(!c->ev) {
1695		free(c);
1696		return NULL;
1697	}
1698	c->ev->base = base;
1699	c->fd = fd;
1700	c->buffer = NULL;
1701	c->timeout = NULL;
1702	c->tcp_is_reading = 0;
1703	c->tcp_byte_count = 0;
1704	c->tcp_parent = NULL;
1705	c->max_tcp_count = num;
1706	c->cur_tcp_count = 0;
1707	c->tcp_handlers = (struct comm_point**)calloc((size_t)num,
1708		sizeof(struct comm_point*));
1709	if(!c->tcp_handlers) {
1710		free(c->ev);
1711		free(c);
1712		return NULL;
1713	}
1714	c->tcp_free = NULL;
1715	c->type = comm_tcp_accept;
1716	c->tcp_do_close = 0;
1717	c->do_not_close = 0;
1718	c->tcp_do_toggle_rw = 0;
1719	c->tcp_check_nb_connect = 0;
1720	c->callback = NULL;
1721	c->cb_arg = NULL;
1722	evbits = EV_READ | EV_PERSIST;
1723	/* libevent stuff */
1724	event_set(&c->ev->ev, c->fd, evbits, comm_point_tcp_accept_callback, c);
1725	if(event_base_set(base->eb->base, &c->ev->ev) != 0 ||
1726		event_add(&c->ev->ev, c->timeout) != 0 )
1727	{
1728		log_err("could not add tcpacc event");
1729		comm_point_delete(c);
1730		return NULL;
1731	}
1732
1733	/* now prealloc the tcp handlers */
1734	for(i=0; i<num; i++) {
1735		c->tcp_handlers[i] = comm_point_create_tcp_handler(base,
1736			c, bufsize, callback, callback_arg);
1737		if(!c->tcp_handlers[i]) {
1738			comm_point_delete(c);
1739			return NULL;
1740		}
1741	}
1742
1743	return c;
1744}
1745
1746struct comm_point*
1747comm_point_create_tcp_out(struct comm_base *base, size_t bufsize,
1748        comm_point_callback_t* callback, void* callback_arg)
1749{
1750	struct comm_point* c = (struct comm_point*)calloc(1,
1751		sizeof(struct comm_point));
1752	short evbits;
1753	if(!c)
1754		return NULL;
1755	c->ev = (struct internal_event*)calloc(1,
1756		sizeof(struct internal_event));
1757	if(!c->ev) {
1758		free(c);
1759		return NULL;
1760	}
1761	c->ev->base = base;
1762	c->fd = -1;
1763	c->buffer = sldns_buffer_new(bufsize);
1764	if(!c->buffer) {
1765		free(c->ev);
1766		free(c);
1767		return NULL;
1768	}
1769	c->timeout = NULL;
1770	c->tcp_is_reading = 0;
1771	c->tcp_byte_count = 0;
1772	c->tcp_parent = NULL;
1773	c->max_tcp_count = 0;
1774	c->cur_tcp_count = 0;
1775	c->tcp_handlers = NULL;
1776	c->tcp_free = NULL;
1777	c->type = comm_tcp;
1778	c->tcp_do_close = 0;
1779	c->do_not_close = 0;
1780	c->tcp_do_toggle_rw = 1;
1781	c->tcp_check_nb_connect = 1;
1782	c->repinfo.c = c;
1783	c->callback = callback;
1784	c->cb_arg = callback_arg;
1785	evbits = EV_PERSIST | EV_WRITE;
1786	event_set(&c->ev->ev, c->fd, evbits, comm_point_tcp_handle_callback, c);
1787	if(event_base_set(base->eb->base, &c->ev->ev) != 0)
1788	{
1789		log_err("could not basetset tcpout event");
1790		sldns_buffer_free(c->buffer);
1791		free(c->ev);
1792		free(c);
1793		return NULL;
1794	}
1795
1796	return c;
1797}
1798
1799struct comm_point*
1800comm_point_create_local(struct comm_base *base, int fd, size_t bufsize,
1801        comm_point_callback_t* callback, void* callback_arg)
1802{
1803	struct comm_point* c = (struct comm_point*)calloc(1,
1804		sizeof(struct comm_point));
1805	short evbits;
1806	if(!c)
1807		return NULL;
1808	c->ev = (struct internal_event*)calloc(1,
1809		sizeof(struct internal_event));
1810	if(!c->ev) {
1811		free(c);
1812		return NULL;
1813	}
1814	c->ev->base = base;
1815	c->fd = fd;
1816	c->buffer = sldns_buffer_new(bufsize);
1817	if(!c->buffer) {
1818		free(c->ev);
1819		free(c);
1820		return NULL;
1821	}
1822	c->timeout = NULL;
1823	c->tcp_is_reading = 1;
1824	c->tcp_byte_count = 0;
1825	c->tcp_parent = NULL;
1826	c->max_tcp_count = 0;
1827	c->cur_tcp_count = 0;
1828	c->tcp_handlers = NULL;
1829	c->tcp_free = NULL;
1830	c->type = comm_local;
1831	c->tcp_do_close = 0;
1832	c->do_not_close = 1;
1833	c->tcp_do_toggle_rw = 0;
1834	c->tcp_check_nb_connect = 0;
1835	c->callback = callback;
1836	c->cb_arg = callback_arg;
1837	/* libevent stuff */
1838	evbits = EV_PERSIST | EV_READ;
1839	event_set(&c->ev->ev, c->fd, evbits, comm_point_local_handle_callback,
1840		c);
1841	if(event_base_set(base->eb->base, &c->ev->ev) != 0 ||
1842		event_add(&c->ev->ev, c->timeout) != 0 )
1843	{
1844		log_err("could not add localhdl event");
1845		free(c->ev);
1846		free(c);
1847		return NULL;
1848	}
1849	return c;
1850}
1851
1852struct comm_point*
1853comm_point_create_raw(struct comm_base* base, int fd, int writing,
1854	comm_point_callback_t* callback, void* callback_arg)
1855{
1856	struct comm_point* c = (struct comm_point*)calloc(1,
1857		sizeof(struct comm_point));
1858	short evbits;
1859	if(!c)
1860		return NULL;
1861	c->ev = (struct internal_event*)calloc(1,
1862		sizeof(struct internal_event));
1863	if(!c->ev) {
1864		free(c);
1865		return NULL;
1866	}
1867	c->ev->base = base;
1868	c->fd = fd;
1869	c->buffer = NULL;
1870	c->timeout = NULL;
1871	c->tcp_is_reading = 0;
1872	c->tcp_byte_count = 0;
1873	c->tcp_parent = NULL;
1874	c->max_tcp_count = 0;
1875	c->cur_tcp_count = 0;
1876	c->tcp_handlers = NULL;
1877	c->tcp_free = NULL;
1878	c->type = comm_raw;
1879	c->tcp_do_close = 0;
1880	c->do_not_close = 1;
1881	c->tcp_do_toggle_rw = 0;
1882	c->tcp_check_nb_connect = 0;
1883	c->callback = callback;
1884	c->cb_arg = callback_arg;
1885	/* libevent stuff */
1886	if(writing)
1887		evbits = EV_PERSIST | EV_WRITE;
1888	else 	evbits = EV_PERSIST | EV_READ;
1889	event_set(&c->ev->ev, c->fd, evbits, comm_point_raw_handle_callback,
1890		c);
1891	if(event_base_set(base->eb->base, &c->ev->ev) != 0 ||
1892		event_add(&c->ev->ev, c->timeout) != 0 )
1893	{
1894		log_err("could not add rawhdl event");
1895		free(c->ev);
1896		free(c);
1897		return NULL;
1898	}
1899	return c;
1900}
1901
1902void
1903comm_point_close(struct comm_point* c)
1904{
1905	if(!c)
1906		return;
1907	if(c->fd != -1)
1908		if(event_del(&c->ev->ev) != 0) {
1909			log_err("could not event_del on close");
1910		}
1911	/* close fd after removing from event lists, or epoll.. is messed up */
1912	if(c->fd != -1 && !c->do_not_close) {
1913		verbose(VERB_ALGO, "close fd %d", c->fd);
1914#ifndef USE_WINSOCK
1915		close(c->fd);
1916#else
1917		closesocket(c->fd);
1918#endif
1919	}
1920	c->fd = -1;
1921}
1922
1923void
1924comm_point_delete(struct comm_point* c)
1925{
1926	if(!c)
1927		return;
1928	if(c->type == comm_tcp && c->ssl) {
1929#ifdef HAVE_SSL
1930		SSL_shutdown(c->ssl);
1931		SSL_free(c->ssl);
1932#endif
1933	}
1934	comm_point_close(c);
1935	if(c->tcp_handlers) {
1936		int i;
1937		for(i=0; i<c->max_tcp_count; i++)
1938			comm_point_delete(c->tcp_handlers[i]);
1939		free(c->tcp_handlers);
1940	}
1941	free(c->timeout);
1942	if(c->type == comm_tcp || c->type == comm_local)
1943		sldns_buffer_free(c->buffer);
1944	free(c->ev);
1945	free(c);
1946}
1947
1948void
1949comm_point_send_reply(struct comm_reply *repinfo)
1950{
1951	log_assert(repinfo && repinfo->c);
1952	if(repinfo->c->type == comm_udp) {
1953		if(repinfo->srctype)
1954			comm_point_send_udp_msg_if(repinfo->c,
1955			repinfo->c->buffer, (struct sockaddr*)&repinfo->addr,
1956			repinfo->addrlen, repinfo);
1957		else
1958			comm_point_send_udp_msg(repinfo->c, repinfo->c->buffer,
1959			(struct sockaddr*)&repinfo->addr, repinfo->addrlen);
1960#ifdef USE_DNSTAP
1961		if(repinfo->c->dtenv != NULL &&
1962		   repinfo->c->dtenv->log_client_response_messages)
1963			dt_msg_send_client_response(repinfo->c->dtenv,
1964			&repinfo->addr, repinfo->c->type, repinfo->c->buffer);
1965#endif
1966	} else {
1967#ifdef USE_DNSTAP
1968		if(repinfo->c->tcp_parent->dtenv != NULL &&
1969		   repinfo->c->tcp_parent->dtenv->log_client_response_messages)
1970			dt_msg_send_client_response(repinfo->c->tcp_parent->dtenv,
1971			&repinfo->addr, repinfo->c->type, repinfo->c->buffer);
1972#endif
1973		comm_point_start_listening(repinfo->c, -1, TCP_QUERY_TIMEOUT);
1974	}
1975}
1976
1977void
1978comm_point_drop_reply(struct comm_reply* repinfo)
1979{
1980	if(!repinfo)
1981		return;
1982	log_assert(repinfo && repinfo->c);
1983	log_assert(repinfo->c->type != comm_tcp_accept);
1984	if(repinfo->c->type == comm_udp)
1985		return;
1986	reclaim_tcp_handler(repinfo->c);
1987}
1988
1989void
1990comm_point_stop_listening(struct comm_point* c)
1991{
1992	verbose(VERB_ALGO, "comm point stop listening %d", c->fd);
1993	if(event_del(&c->ev->ev) != 0) {
1994		log_err("event_del error to stoplisten");
1995	}
1996}
1997
1998void
1999comm_point_start_listening(struct comm_point* c, int newfd, int sec)
2000{
2001	verbose(VERB_ALGO, "comm point start listening %d",
2002		c->fd==-1?newfd:c->fd);
2003	if(c->type == comm_tcp_accept && !c->tcp_free) {
2004		/* no use to start listening no free slots. */
2005		return;
2006	}
2007	if(sec != -1 && sec != 0) {
2008		if(!c->timeout) {
2009			c->timeout = (struct timeval*)malloc(sizeof(
2010				struct timeval));
2011			if(!c->timeout) {
2012				log_err("cpsl: malloc failed. No net read.");
2013				return;
2014			}
2015		}
2016		c->ev->ev.ev_events |= EV_TIMEOUT;
2017#ifndef S_SPLINT_S /* splint fails on struct timeval. */
2018		c->timeout->tv_sec = sec;
2019		c->timeout->tv_usec = 0;
2020#endif /* S_SPLINT_S */
2021	}
2022	if(c->type == comm_tcp) {
2023		c->ev->ev.ev_events &= ~(EV_READ|EV_WRITE);
2024		if(c->tcp_is_reading)
2025			c->ev->ev.ev_events |= EV_READ;
2026		else	c->ev->ev.ev_events |= EV_WRITE;
2027	}
2028	if(newfd != -1) {
2029		if(c->fd != -1) {
2030#ifndef USE_WINSOCK
2031			close(c->fd);
2032#else
2033			closesocket(c->fd);
2034#endif
2035		}
2036		c->fd = newfd;
2037		c->ev->ev.ev_fd = c->fd;
2038	}
2039	if(event_add(&c->ev->ev, sec==0?NULL:c->timeout) != 0) {
2040		log_err("event_add failed. in cpsl.");
2041	}
2042}
2043
2044void comm_point_listen_for_rw(struct comm_point* c, int rd, int wr)
2045{
2046	verbose(VERB_ALGO, "comm point listen_for_rw %d %d", c->fd, wr);
2047	if(event_del(&c->ev->ev) != 0) {
2048		log_err("event_del error to cplf");
2049	}
2050	c->ev->ev.ev_events &= ~(EV_READ|EV_WRITE);
2051	if(rd) c->ev->ev.ev_events |= EV_READ;
2052	if(wr) c->ev->ev.ev_events |= EV_WRITE;
2053	if(event_add(&c->ev->ev, c->timeout) != 0) {
2054		log_err("event_add failed. in cplf.");
2055	}
2056}
2057
2058size_t comm_point_get_mem(struct comm_point* c)
2059{
2060	size_t s;
2061	if(!c)
2062		return 0;
2063	s = sizeof(*c) + sizeof(*c->ev);
2064	if(c->timeout)
2065		s += sizeof(*c->timeout);
2066	if(c->type == comm_tcp || c->type == comm_local)
2067		s += sizeof(*c->buffer) + sldns_buffer_capacity(c->buffer);
2068	if(c->type == comm_tcp_accept) {
2069		int i;
2070		for(i=0; i<c->max_tcp_count; i++)
2071			s += comm_point_get_mem(c->tcp_handlers[i]);
2072	}
2073	return s;
2074}
2075
2076struct comm_timer*
2077comm_timer_create(struct comm_base* base, void (*cb)(void*), void* cb_arg)
2078{
2079	struct comm_timer *tm = (struct comm_timer*)calloc(1,
2080		sizeof(struct comm_timer));
2081	if(!tm)
2082		return NULL;
2083	tm->ev_timer = (struct internal_timer*)calloc(1,
2084		sizeof(struct internal_timer));
2085	if(!tm->ev_timer) {
2086		log_err("malloc failed");
2087		free(tm);
2088		return NULL;
2089	}
2090	tm->ev_timer->base = base;
2091	tm->callback = cb;
2092	tm->cb_arg = cb_arg;
2093	event_set(&tm->ev_timer->ev, -1, EV_TIMEOUT,
2094		comm_timer_callback, tm);
2095	if(event_base_set(base->eb->base, &tm->ev_timer->ev) != 0) {
2096		log_err("timer_create: event_base_set failed.");
2097		free(tm->ev_timer);
2098		free(tm);
2099		return NULL;
2100	}
2101	return tm;
2102}
2103
2104void
2105comm_timer_disable(struct comm_timer* timer)
2106{
2107	if(!timer)
2108		return;
2109	evtimer_del(&timer->ev_timer->ev);
2110	timer->ev_timer->enabled = 0;
2111}
2112
2113void
2114comm_timer_set(struct comm_timer* timer, struct timeval* tv)
2115{
2116	log_assert(tv);
2117	if(timer->ev_timer->enabled)
2118		comm_timer_disable(timer);
2119	event_set(&timer->ev_timer->ev, -1, EV_TIMEOUT,
2120		comm_timer_callback, timer);
2121	if(event_base_set(timer->ev_timer->base->eb->base,
2122		&timer->ev_timer->ev) != 0)
2123		log_err("comm_timer_set: set_base failed.");
2124	if(evtimer_add(&timer->ev_timer->ev, tv) != 0)
2125		log_err("comm_timer_set: evtimer_add failed.");
2126	timer->ev_timer->enabled = 1;
2127}
2128
2129void
2130comm_timer_delete(struct comm_timer* timer)
2131{
2132	if(!timer)
2133		return;
2134	comm_timer_disable(timer);
2135	free(timer->ev_timer);
2136	free(timer);
2137}
2138
2139void
2140comm_timer_callback(int ATTR_UNUSED(fd), short event, void* arg)
2141{
2142	struct comm_timer* tm = (struct comm_timer*)arg;
2143	if(!(event&EV_TIMEOUT))
2144		return;
2145	comm_base_now(tm->ev_timer->base);
2146	tm->ev_timer->enabled = 0;
2147	fptr_ok(fptr_whitelist_comm_timer(tm->callback));
2148	(*tm->callback)(tm->cb_arg);
2149}
2150
2151int
2152comm_timer_is_set(struct comm_timer* timer)
2153{
2154	return (int)timer->ev_timer->enabled;
2155}
2156
2157size_t
2158comm_timer_get_mem(struct comm_timer* timer)
2159{
2160	return sizeof(*timer) + sizeof(struct internal_timer);
2161}
2162
2163struct comm_signal*
2164comm_signal_create(struct comm_base* base,
2165        void (*callback)(int, void*), void* cb_arg)
2166{
2167	struct comm_signal* com = (struct comm_signal*)malloc(
2168		sizeof(struct comm_signal));
2169	if(!com) {
2170		log_err("malloc failed");
2171		return NULL;
2172	}
2173	com->base = base;
2174	com->callback = callback;
2175	com->cb_arg = cb_arg;
2176	com->ev_signal = NULL;
2177	return com;
2178}
2179
2180void
2181comm_signal_callback(int sig, short event, void* arg)
2182{
2183	struct comm_signal* comsig = (struct comm_signal*)arg;
2184	if(!(event & EV_SIGNAL))
2185		return;
2186	comm_base_now(comsig->base);
2187	fptr_ok(fptr_whitelist_comm_signal(comsig->callback));
2188	(*comsig->callback)(sig, comsig->cb_arg);
2189}
2190
2191int
2192comm_signal_bind(struct comm_signal* comsig, int sig)
2193{
2194	struct internal_signal* entry = (struct internal_signal*)calloc(1,
2195		sizeof(struct internal_signal));
2196	if(!entry) {
2197		log_err("malloc failed");
2198		return 0;
2199	}
2200	log_assert(comsig);
2201	/* add signal event */
2202	signal_set(&entry->ev, sig, comm_signal_callback, comsig);
2203	if(event_base_set(comsig->base->eb->base, &entry->ev) != 0) {
2204		log_err("Could not set signal base");
2205		free(entry);
2206		return 0;
2207	}
2208	if(signal_add(&entry->ev, NULL) != 0) {
2209		log_err("Could not add signal handler");
2210		free(entry);
2211		return 0;
2212	}
2213	/* link into list */
2214	entry->next = comsig->ev_signal;
2215	comsig->ev_signal = entry;
2216	return 1;
2217}
2218
2219void
2220comm_signal_delete(struct comm_signal* comsig)
2221{
2222	struct internal_signal* p, *np;
2223	if(!comsig)
2224		return;
2225	p=comsig->ev_signal;
2226	while(p) {
2227		np = p->next;
2228		signal_del(&p->ev);
2229		free(p);
2230		p = np;
2231	}
2232	free(comsig);
2233}
2234