1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
5 * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
6 * Copyright (c) 2009 Apple, Inc.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31#include <sys/cdefs.h>
32#include "opt_ktrace.h"
33#include "opt_kqueue.h"
34
35#ifdef COMPAT_FREEBSD11
36#define	_WANT_FREEBSD11_KEVENT
37#endif
38
39#include <sys/param.h>
40#include <sys/systm.h>
41#include <sys/capsicum.h>
42#include <sys/kernel.h>
43#include <sys/limits.h>
44#include <sys/lock.h>
45#include <sys/mutex.h>
46#include <sys/proc.h>
47#include <sys/malloc.h>
48#include <sys/unistd.h>
49#include <sys/file.h>
50#include <sys/filedesc.h>
51#include <sys/filio.h>
52#include <sys/fcntl.h>
53#include <sys/kthread.h>
54#include <sys/selinfo.h>
55#include <sys/queue.h>
56#include <sys/event.h>
57#include <sys/eventvar.h>
58#include <sys/poll.h>
59#include <sys/protosw.h>
60#include <sys/resourcevar.h>
61#include <sys/sigio.h>
62#include <sys/signalvar.h>
63#include <sys/socket.h>
64#include <sys/socketvar.h>
65#include <sys/stat.h>
66#include <sys/sysctl.h>
67#include <sys/sysproto.h>
68#include <sys/syscallsubr.h>
69#include <sys/taskqueue.h>
70#include <sys/uio.h>
71#include <sys/user.h>
72#ifdef KTRACE
73#include <sys/ktrace.h>
74#endif
75#include <machine/atomic.h>
76
77#include <vm/uma.h>
78
79static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
80
81/*
82 * This lock is used if multiple kq locks are required.  This possibly
83 * should be made into a per proc lock.
84 */
85static struct mtx	kq_global;
86MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
87#define KQ_GLOBAL_LOCK(lck, haslck)	do {	\
88	if (!haslck)				\
89		mtx_lock(lck);			\
90	haslck = 1;				\
91} while (0)
92#define KQ_GLOBAL_UNLOCK(lck, haslck)	do {	\
93	if (haslck)				\
94		mtx_unlock(lck);			\
95	haslck = 0;				\
96} while (0)
97
98TASKQUEUE_DEFINE_THREAD(kqueue_ctx);
99
100static int	kevent_copyout(void *arg, struct kevent *kevp, int count);
101static int	kevent_copyin(void *arg, struct kevent *kevp, int count);
102static int	kqueue_register(struct kqueue *kq, struct kevent *kev,
103		    struct thread *td, int mflag);
104static int	kqueue_acquire(struct file *fp, struct kqueue **kqp);
105static void	kqueue_release(struct kqueue *kq, int locked);
106static void	kqueue_destroy(struct kqueue *kq);
107static void	kqueue_drain(struct kqueue *kq, struct thread *td);
108static int	kqueue_expand(struct kqueue *kq, const struct filterops *fops,
109		    uintptr_t ident, int mflag);
110static void	kqueue_task(void *arg, int pending);
111static int	kqueue_scan(struct kqueue *kq, int maxevents,
112		    struct kevent_copyops *k_ops,
113		    const struct timespec *timeout,
114		    struct kevent *keva, struct thread *td);
115static void 	kqueue_wakeup(struct kqueue *kq);
116static const struct filterops *kqueue_fo_find(int filt);
117static void	kqueue_fo_release(int filt);
118struct g_kevent_args;
119static int	kern_kevent_generic(struct thread *td,
120		    struct g_kevent_args *uap,
121		    struct kevent_copyops *k_ops, const char *struct_name);
122
123static fo_ioctl_t	kqueue_ioctl;
124static fo_poll_t	kqueue_poll;
125static fo_kqfilter_t	kqueue_kqfilter;
126static fo_stat_t	kqueue_stat;
127static fo_close_t	kqueue_close;
128static fo_fill_kinfo_t	kqueue_fill_kinfo;
129
130static struct fileops kqueueops = {
131	.fo_read = invfo_rdwr,
132	.fo_write = invfo_rdwr,
133	.fo_truncate = invfo_truncate,
134	.fo_ioctl = kqueue_ioctl,
135	.fo_poll = kqueue_poll,
136	.fo_kqfilter = kqueue_kqfilter,
137	.fo_stat = kqueue_stat,
138	.fo_close = kqueue_close,
139	.fo_chmod = invfo_chmod,
140	.fo_chown = invfo_chown,
141	.fo_sendfile = invfo_sendfile,
142	.fo_cmp = file_kcmp_generic,
143	.fo_fill_kinfo = kqueue_fill_kinfo,
144};
145
146static int 	knote_attach(struct knote *kn, struct kqueue *kq);
147static void 	knote_drop(struct knote *kn, struct thread *td);
148static void 	knote_drop_detached(struct knote *kn, struct thread *td);
149static void 	knote_enqueue(struct knote *kn);
150static void 	knote_dequeue(struct knote *kn);
151static void 	knote_init(void);
152static struct 	knote *knote_alloc(int mflag);
153static void 	knote_free(struct knote *kn);
154
155static void	filt_kqdetach(struct knote *kn);
156static int	filt_kqueue(struct knote *kn, long hint);
157static int	filt_procattach(struct knote *kn);
158static void	filt_procdetach(struct knote *kn);
159static int	filt_proc(struct knote *kn, long hint);
160static int	filt_fileattach(struct knote *kn);
161static void	filt_timerexpire(void *knx);
162static void	filt_timerexpire_l(struct knote *kn, bool proc_locked);
163static int	filt_timerattach(struct knote *kn);
164static void	filt_timerdetach(struct knote *kn);
165static void	filt_timerstart(struct knote *kn, sbintime_t to);
166static void	filt_timertouch(struct knote *kn, struct kevent *kev,
167		    u_long type);
168static int	filt_timervalidate(struct knote *kn, sbintime_t *to);
169static int	filt_timer(struct knote *kn, long hint);
170static int	filt_userattach(struct knote *kn);
171static void	filt_userdetach(struct knote *kn);
172static int	filt_user(struct knote *kn, long hint);
173static void	filt_usertouch(struct knote *kn, struct kevent *kev,
174		    u_long type);
175
176static struct filterops file_filtops = {
177	.f_isfd = 1,
178	.f_attach = filt_fileattach,
179};
180static struct filterops kqread_filtops = {
181	.f_isfd = 1,
182	.f_detach = filt_kqdetach,
183	.f_event = filt_kqueue,
184};
185/* XXX - move to kern_proc.c?  */
186static struct filterops proc_filtops = {
187	.f_isfd = 0,
188	.f_attach = filt_procattach,
189	.f_detach = filt_procdetach,
190	.f_event = filt_proc,
191};
192static struct filterops timer_filtops = {
193	.f_isfd = 0,
194	.f_attach = filt_timerattach,
195	.f_detach = filt_timerdetach,
196	.f_event = filt_timer,
197	.f_touch = filt_timertouch,
198};
199static struct filterops user_filtops = {
200	.f_attach = filt_userattach,
201	.f_detach = filt_userdetach,
202	.f_event = filt_user,
203	.f_touch = filt_usertouch,
204};
205
206static uma_zone_t	knote_zone;
207static unsigned int __exclusive_cache_line	kq_ncallouts;
208static unsigned int 	kq_calloutmax = 4 * 1024;
209SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
210    &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
211
212/* XXX - ensure not influx ? */
213#define KNOTE_ACTIVATE(kn, islock) do { 				\
214	if ((islock))							\
215		mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED);		\
216	else								\
217		KQ_LOCK((kn)->kn_kq);					\
218	(kn)->kn_status |= KN_ACTIVE;					\
219	if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
220		knote_enqueue((kn));					\
221	if (!(islock))							\
222		KQ_UNLOCK((kn)->kn_kq);					\
223} while (0)
224#define KQ_LOCK(kq) do {						\
225	mtx_lock(&(kq)->kq_lock);					\
226} while (0)
227#define KQ_FLUX_WAKEUP(kq) do {						\
228	if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) {		\
229		(kq)->kq_state &= ~KQ_FLUXWAIT;				\
230		wakeup((kq));						\
231	}								\
232} while (0)
233#define KQ_UNLOCK_FLUX(kq) do {						\
234	KQ_FLUX_WAKEUP(kq);						\
235	mtx_unlock(&(kq)->kq_lock);					\
236} while (0)
237#define KQ_UNLOCK(kq) do {						\
238	mtx_unlock(&(kq)->kq_lock);					\
239} while (0)
240#define KQ_OWNED(kq) do {						\
241	mtx_assert(&(kq)->kq_lock, MA_OWNED);				\
242} while (0)
243#define KQ_NOTOWNED(kq) do {						\
244	mtx_assert(&(kq)->kq_lock, MA_NOTOWNED);			\
245} while (0)
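
/*
 * The KQ_LOCK/KQ_UNLOCK macros above wrap the per-kqueue mutex.
 * KQ_FLUX_WAKEUP wakes threads that set KQ_FLUXWAIT and slept ("kqflxwt")
 * while waiting for a knote to leave flux, and KQ_UNLOCK_FLUX issues that
 * wakeup before dropping the lock.
 */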
246
247static struct knlist *
248kn_list_lock(struct knote *kn)
249{
250	struct knlist *knl;
251
252	knl = kn->kn_knlist;
253	if (knl != NULL)
254		knl->kl_lock(knl->kl_lockarg);
255	return (knl);
256}
257
258static void
259kn_list_unlock(struct knlist *knl)
260{
261	bool do_free;
262
263	if (knl == NULL)
264		return;
265	do_free = knl->kl_autodestroy && knlist_empty(knl);
266	knl->kl_unlock(knl->kl_lockarg);
267	if (do_free) {
268		knlist_destroy(knl);
269		free(knl, M_KQUEUE);
270	}
271}
272
273static bool
274kn_in_flux(struct knote *kn)
275{
276
277	return (kn->kn_influx > 0);
278}
279
280static void
281kn_enter_flux(struct knote *kn)
282{
283
284	KQ_OWNED(kn->kn_kq);
285	MPASS(kn->kn_influx < INT_MAX);
286	kn->kn_influx++;
287}
288
289static bool
290kn_leave_flux(struct knote *kn)
291{
292
293	KQ_OWNED(kn->kn_kq);
294	MPASS(kn->kn_influx > 0);
295	kn->kn_influx--;
296	return (kn->kn_influx == 0);
297}
298
299#define	KNL_ASSERT_LOCK(knl, islocked) do {				\
300	if (islocked)							\
301		KNL_ASSERT_LOCKED(knl);				\
302	else								\
303		KNL_ASSERT_UNLOCKED(knl);				\
304} while (0)
305#ifdef INVARIANTS
306#define	KNL_ASSERT_LOCKED(knl) do {					\
307	knl->kl_assert_lock((knl)->kl_lockarg, LA_LOCKED);		\
308} while (0)
309#define	KNL_ASSERT_UNLOCKED(knl) do {					\
310	knl->kl_assert_lock((knl)->kl_lockarg, LA_UNLOCKED);		\
311} while (0)
312#else /* !INVARIANTS */
313#define	KNL_ASSERT_LOCKED(knl) do {} while (0)
314#define	KNL_ASSERT_UNLOCKED(knl) do {} while (0)
315#endif /* INVARIANTS */
316
317#ifndef	KN_HASHSIZE
318#define	KN_HASHSIZE		64		/* XXX should be tunable */
319#endif
320
321#define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
322
323static int
324filt_nullattach(struct knote *kn)
325{
326
327	return (ENXIO);
}
329
330struct filterops null_filtops = {
331	.f_isfd = 0,
332	.f_attach = filt_nullattach,
333};
334
335/* XXX - make SYSINIT to add these, and move into respective modules. */
336extern struct filterops sig_filtops;
337extern struct filterops fs_filtops;
338
339/*
340 * Table for all system-defined filters.
341 */
342static struct mtx	filterops_lock;
343MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops", MTX_DEF);
344static struct {
345	const struct filterops *for_fop;
346	int for_nolock;
347	int for_refcnt;
348} sysfilt_ops[EVFILT_SYSCOUNT] = {
349	{ &file_filtops, 1 },			/* EVFILT_READ */
350	{ &file_filtops, 1 },			/* EVFILT_WRITE */
351	{ &null_filtops },			/* EVFILT_AIO */
352	{ &file_filtops, 1 },			/* EVFILT_VNODE */
353	{ &proc_filtops, 1 },			/* EVFILT_PROC */
354	{ &sig_filtops, 1 },			/* EVFILT_SIGNAL */
355	{ &timer_filtops, 1 },			/* EVFILT_TIMER */
356	{ &file_filtops, 1 },			/* EVFILT_PROCDESC */
357	{ &fs_filtops, 1 },			/* EVFILT_FS */
358	{ &null_filtops },			/* EVFILT_LIO */
359	{ &user_filtops, 1 },			/* EVFILT_USER */
360	{ &null_filtops },			/* EVFILT_SENDFILE */
361	{ &file_filtops, 1 },                   /* EVFILT_EMPTY */
362};
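
/*
 * System filters are identified by small negative numbers (EVFILT_*), so
 * this table and the helpers that manage it are indexed with ~filt, which
 * maps EVFILT_READ (-1) to slot 0, EVFILT_WRITE (-2) to slot 1, and so on.
 */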
363
364/*
365 * Simple redirection for all cdevsw style objects to call their fo_kqfilter
366 * method.
367 */
368static int
369filt_fileattach(struct knote *kn)
370{
371
372	return (fo_kqfilter(kn->kn_fp, kn));
373}
374
375/*ARGSUSED*/
376static int
377kqueue_kqfilter(struct file *fp, struct knote *kn)
378{
379	struct kqueue *kq = kn->kn_fp->f_data;
380
381	if (kn->kn_filter != EVFILT_READ)
382		return (EINVAL);
383
384	kn->kn_status |= KN_KQUEUE;
385	kn->kn_fop = &kqread_filtops;
386	knlist_add(&kq->kq_sel.si_note, kn, 0);
387
388	return (0);
389}
390
391static void
392filt_kqdetach(struct knote *kn)
393{
394	struct kqueue *kq = kn->kn_fp->f_data;
395
396	knlist_remove(&kq->kq_sel.si_note, kn, 0);
397}
398
399/*ARGSUSED*/
400static int
401filt_kqueue(struct knote *kn, long hint)
402{
403	struct kqueue *kq = kn->kn_fp->f_data;
404
405	kn->kn_data = kq->kq_count;
406	return (kn->kn_data > 0);
407}
408
409/* XXX - move to kern_proc.c?  */
410static int
411filt_procattach(struct knote *kn)
412{
413	struct proc *p;
414	int error;
415	bool exiting, immediate;
416
417	exiting = immediate = false;
418	if (kn->kn_sfflags & NOTE_EXIT)
419		p = pfind_any(kn->kn_id);
420	else
421		p = pfind(kn->kn_id);
422	if (p == NULL)
423		return (ESRCH);
424	if (p->p_flag & P_WEXIT)
425		exiting = true;
426
427	if ((error = p_cansee(curthread, p))) {
428		PROC_UNLOCK(p);
429		return (error);
430	}
431
432	kn->kn_ptr.p_proc = p;
433	kn->kn_flags |= EV_CLEAR;		/* automatically set */
434
435	/*
436	 * Internal flag indicating registration done by kernel for the
437	 * purposes of getting a NOTE_CHILD notification.
438	 */
439	if (kn->kn_flags & EV_FLAG2) {
440		kn->kn_flags &= ~EV_FLAG2;
441		kn->kn_data = kn->kn_sdata;		/* ppid */
442		kn->kn_fflags = NOTE_CHILD;
443		kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK);
444		immediate = true; /* Force immediate activation of child note. */
445	}
446	/*
447	 * Internal flag indicating registration done by kernel (for other than
448	 * NOTE_CHILD).
449	 */
450	if (kn->kn_flags & EV_FLAG1) {
451		kn->kn_flags &= ~EV_FLAG1;
452	}
453
454	knlist_add(p->p_klist, kn, 1);
455
456	/*
457	 * Immediately activate any child notes or, in the case of a zombie
458	 * target process, exit notes.  The latter is necessary to handle the
459	 * case where the target process, e.g. a child, dies before the kevent
460	 * is registered.
461	 */
462	if (immediate || (exiting && filt_proc(kn, NOTE_EXIT)))
463		KNOTE_ACTIVATE(kn, 0);
464
465	PROC_UNLOCK(p);
466
467	return (0);
468}
469
/*
 * The knote may be attached to a different process, which may exit,
 * leaving nothing for the knote to be attached to.  So when the process
 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
 * it will be deleted when read out.  However, this routine is called as
 * part of the knote deletion, so a check is needed to avoid actually
 * performing a detach, because the original process no longer exists.
 */
478/* XXX - move to kern_proc.c?  */
479static void
480filt_procdetach(struct knote *kn)
481{
482
483	knlist_remove(kn->kn_knlist, kn, 0);
484	kn->kn_ptr.p_proc = NULL;
485}
486
487/* XXX - move to kern_proc.c?  */
488static int
489filt_proc(struct knote *kn, long hint)
490{
491	struct proc *p;
492	u_int event;
493
494	p = kn->kn_ptr.p_proc;
495	if (p == NULL) /* already activated, from attach filter */
496		return (0);
497
498	/* Mask off extra data. */
499	event = (u_int)hint & NOTE_PCTRLMASK;
500
501	/* If the user is interested in this event, record it. */
502	if (kn->kn_sfflags & event)
503		kn->kn_fflags |= event;
504
505	/* Process is gone, so flag the event as finished. */
506	if (event == NOTE_EXIT) {
507		kn->kn_flags |= EV_EOF | EV_ONESHOT;
508		kn->kn_ptr.p_proc = NULL;
509		if (kn->kn_fflags & NOTE_EXIT)
510			kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig);
511		if (kn->kn_fflags == 0)
512			kn->kn_flags |= EV_DROP;
513		return (1);
514	}
515
516	return (kn->kn_fflags != 0);
517}
518
/*
 * Called when a process forks.  This mostly does the same work as
 * knote(), activating all knotes registered to be activated when the
 * process forks.  Additionally, for each knote attached to the parent,
 * check whether the user wants to track the new process.  If so, attach
 * a new knote to the child and immediately report an event with the
 * child's pid.
 */
527void
528knote_fork(struct knlist *list, int pid)
529{
530	struct kqueue *kq;
531	struct knote *kn;
532	struct kevent kev;
533	int error;
534
535	MPASS(list != NULL);
536	KNL_ASSERT_LOCKED(list);
537	if (SLIST_EMPTY(&list->kl_list))
538		return;
539
540	memset(&kev, 0, sizeof(kev));
541	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
542		kq = kn->kn_kq;
543		KQ_LOCK(kq);
544		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
545			KQ_UNLOCK(kq);
546			continue;
547		}
548
549		/*
550		 * The same as knote(), activate the event.
551		 */
552		if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
553			if (kn->kn_fop->f_event(kn, NOTE_FORK))
554				KNOTE_ACTIVATE(kn, 1);
555			KQ_UNLOCK(kq);
556			continue;
557		}
558
559		/*
560		 * The NOTE_TRACK case. In addition to the activation
561		 * of the event, we need to register new events to
562		 * track the child. Drop the locks in preparation for
563		 * the call to kqueue_register().
564		 */
565		kn_enter_flux(kn);
566		KQ_UNLOCK(kq);
567		list->kl_unlock(list->kl_lockarg);
568
569		/*
570		 * Activate existing knote and register tracking knotes with
571		 * new process.
572		 *
573		 * First register a knote to get just the child notice. This
574		 * must be a separate note from a potential NOTE_EXIT
575		 * notification since both NOTE_CHILD and NOTE_EXIT are defined
576		 * to use the data field (in conflicting ways).
577		 */
578		kev.ident = pid;
579		kev.filter = kn->kn_filter;
580		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT |
581		    EV_FLAG2;
582		kev.fflags = kn->kn_sfflags;
583		kev.data = kn->kn_id;		/* parent */
584		kev.udata = kn->kn_kevent.udata;/* preserve udata */
585		error = kqueue_register(kq, &kev, NULL, M_NOWAIT);
586		if (error)
587			kn->kn_fflags |= NOTE_TRACKERR;
588
589		/*
590		 * Then register another knote to track other potential events
591		 * from the new process.
592		 */
593		kev.ident = pid;
594		kev.filter = kn->kn_filter;
595		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
596		kev.fflags = kn->kn_sfflags;
597		kev.data = kn->kn_id;		/* parent */
598		kev.udata = kn->kn_kevent.udata;/* preserve udata */
599		error = kqueue_register(kq, &kev, NULL, M_NOWAIT);
600		if (error)
601			kn->kn_fflags |= NOTE_TRACKERR;
602		if (kn->kn_fop->f_event(kn, NOTE_FORK))
603			KNOTE_ACTIVATE(kn, 0);
604		list->kl_lock(list->kl_lockarg);
605		KQ_LOCK(kq);
606		kn_leave_flux(kn);
607		KQ_UNLOCK_FLUX(kq);
608	}
609}
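
/*
 * Illustrative userland registration exercising the NOTE_TRACK path above
 * (a sketch, not part of this file; "pid" is a process the caller already
 * knows about and "kq" a kqueue descriptor):
 *
 *	struct kevent kev;
 *	EV_SET(&kev, pid, EVFILT_PROC, EV_ADD, NOTE_TRACK | NOTE_EXIT, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * When the target forks, the kqueue reports a NOTE_CHILD event whose ident
 * is the child's pid and whose data is the parent's pid, and a tracking
 * knote is attached to the child.
 */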
610
611/*
612 * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
613 * interval timer support code.
614 */
615
616#define NOTE_TIMER_PRECMASK						\
617    (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS)
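
/*
 * Illustrative userland usage of the precision flags masked above (a
 * sketch, not part of this file; "kq" is a kqueue descriptor): a periodic
 * 500-millisecond timer.
 *
 *	struct kevent kev;
 *	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, NOTE_MSECONDS, 500, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * With no precision flag set, the data field defaults to milliseconds, as
 * the combined NOTE_MSECONDS/0 case in timer2sbintime() below shows.
 */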
618
619static sbintime_t
620timer2sbintime(int64_t data, int flags)
621{
622	int64_t secs;
623
624        /*
625         * Macros for converting to the fractional second portion of an
626         * sbintime_t using 64bit multiplication to improve precision.
627         */
628#define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32)
629#define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32)
630#define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32)
631	switch (flags & NOTE_TIMER_PRECMASK) {
632	case NOTE_SECONDS:
633#ifdef __LP64__
634		if (data > (SBT_MAX / SBT_1S))
635			return (SBT_MAX);
636#endif
637		return ((sbintime_t)data << 32);
638	case NOTE_MSECONDS: /* FALLTHROUGH */
639	case 0:
640		if (data >= 1000) {
641			secs = data / 1000;
642#ifdef __LP64__
643			if (secs > (SBT_MAX / SBT_1S))
644				return (SBT_MAX);
645#endif
646			return (secs << 32 | MS_TO_SBT(data % 1000));
647		}
648		return (MS_TO_SBT(data));
649	case NOTE_USECONDS:
650		if (data >= 1000000) {
651			secs = data / 1000000;
652#ifdef __LP64__
653			if (secs > (SBT_MAX / SBT_1S))
654				return (SBT_MAX);
655#endif
656			return (secs << 32 | US_TO_SBT(data % 1000000));
657		}
658		return (US_TO_SBT(data));
659	case NOTE_NSECONDS:
660		if (data >= 1000000000) {
661			secs = data / 1000000000;
662#ifdef __LP64__
663			if (secs > (SBT_MAX / SBT_1S))
664				return (SBT_MAX);
665#endif
666			return (secs << 32 | NS_TO_SBT(data % 1000000000));
667		}
668		return (NS_TO_SBT(data));
669	default:
670		break;
671	}
672	return (-1);
673}
674
675struct kq_timer_cb_data {
676	struct callout c;
677	struct proc *p;
678	struct knote *kn;
679	int cpuid;
680	int flags;
681	TAILQ_ENTRY(kq_timer_cb_data) link;
682	sbintime_t next;	/* next timer event fires at */
683	sbintime_t to;		/* precalculated timer period, 0 for abs */
684};
685
686#define	KQ_TIMER_CB_ENQUEUED	0x01
687
688static void
689kqtimer_sched_callout(struct kq_timer_cb_data *kc)
690{
691	callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kc->kn,
692	    kc->cpuid, C_ABSOLUTE);
693}
694
695void
696kqtimer_proc_continue(struct proc *p)
697{
698	struct kq_timer_cb_data *kc, *kc1;
699	struct bintime bt;
700	sbintime_t now;
701
702	PROC_LOCK_ASSERT(p, MA_OWNED);
703
704	getboottimebin(&bt);
705	now = bttosbt(bt);
706
707	TAILQ_FOREACH_SAFE(kc, &p->p_kqtim_stop, link, kc1) {
708		TAILQ_REMOVE(&p->p_kqtim_stop, kc, link);
709		kc->flags &= ~KQ_TIMER_CB_ENQUEUED;
710		if (kc->next <= now)
711			filt_timerexpire_l(kc->kn, true);
712		else
713			kqtimer_sched_callout(kc);
714	}
715}
716
717static void
718filt_timerexpire_l(struct knote *kn, bool proc_locked)
719{
720	struct kq_timer_cb_data *kc;
721	struct proc *p;
722	uint64_t delta;
723	sbintime_t now;
724
725	kc = kn->kn_ptr.p_v;
726
727	if ((kn->kn_flags & EV_ONESHOT) != 0 || kc->to == 0) {
728		kn->kn_data++;
729		KNOTE_ACTIVATE(kn, 0);
730		return;
731	}
732
733	now = sbinuptime();
734	if (now >= kc->next) {
735		delta = (now - kc->next) / kc->to;
736		if (delta == 0)
737			delta = 1;
738		kn->kn_data += delta;
739		kc->next += delta * kc->to;
740		if (now >= kc->next)	/* overflow */
741			kc->next = now + kc->to;
742		KNOTE_ACTIVATE(kn, 0);	/* XXX - handle locking */
743	}
744
	/*
	 * The initial check for a stopped kc->p is racy.  It is fine to
	 * miss the setting of the stop flags; at worst we would schedule
	 * one more callout.  On the other hand, it is not fine to skip
	 * rescheduling because we missed the clearing of the flags, so we
	 * recheck them under the process lock and observe a consistent
	 * state.
	 */
752	p = kc->p;
753	if (P_SHOULDSTOP(p) || P_KILLED(p)) {
754		if (!proc_locked)
755			PROC_LOCK(p);
756		if (P_SHOULDSTOP(p) || P_KILLED(p)) {
757			if ((kc->flags & KQ_TIMER_CB_ENQUEUED) == 0) {
758				kc->flags |= KQ_TIMER_CB_ENQUEUED;
759				TAILQ_INSERT_TAIL(&p->p_kqtim_stop, kc, link);
760			}
761			if (!proc_locked)
762				PROC_UNLOCK(p);
763			return;
764		}
765		if (!proc_locked)
766			PROC_UNLOCK(p);
767	}
768	kqtimer_sched_callout(kc);
769}
770
771static void
772filt_timerexpire(void *knx)
773{
774	filt_timerexpire_l(knx, false);
775}
776
/*
 * data contains the amount of time to sleep.
 */
780static int
781filt_timervalidate(struct knote *kn, sbintime_t *to)
782{
783	struct bintime bt;
784	sbintime_t sbt;
785
786	if (kn->kn_sdata < 0)
787		return (EINVAL);
788	if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0)
789		kn->kn_sdata = 1;
790	/*
791	 * The only fflags values supported are the timer unit
792	 * (precision) and the absolute time indicator.
793	 */
794	if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0)
795		return (EINVAL);
796
797	*to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags);
798	if (*to < 0)
799		return (EINVAL);
800	if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) {
801		getboottimebin(&bt);
802		sbt = bttosbt(bt);
803		*to = MAX(0, *to - sbt);
804	}
805	return (0);
806}
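
/*
 * Illustrative absolute-time registration validated above (a sketch, not
 * part of this file; "when" is a wall-clock deadline in seconds chosen by
 * the caller):
 *
 *	struct kevent kev;
 *	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, NOTE_ABSTIME | NOTE_SECONDS,
 *	    when, NULL);
 *
 * filt_timervalidate() turns the wall-clock deadline into an uptime-based
 * one by subtracting the boot time.
 */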
807
808static int
809filt_timerattach(struct knote *kn)
810{
811	struct kq_timer_cb_data *kc;
812	sbintime_t to;
813	int error;
814
815	to = -1;
816	error = filt_timervalidate(kn, &to);
817	if (error != 0)
818		return (error);
819	KASSERT(to > 0 || (kn->kn_flags & EV_ONESHOT) != 0 ||
820	    (kn->kn_sfflags & NOTE_ABSTIME) != 0,
821	    ("%s: periodic timer has a calculated zero timeout", __func__));
822	KASSERT(to >= 0,
823	    ("%s: timer has a calculated negative timeout", __func__));
824
825	if (atomic_fetchadd_int(&kq_ncallouts, 1) + 1 > kq_calloutmax) {
826		atomic_subtract_int(&kq_ncallouts, 1);
827		return (ENOMEM);
828	}
829
830	if ((kn->kn_sfflags & NOTE_ABSTIME) == 0)
831		kn->kn_flags |= EV_CLEAR;	/* automatically set */
832	kn->kn_status &= ~KN_DETACHED;		/* knlist_add clears it */
833	kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK);
834	kc->kn = kn;
835	kc->p = curproc;
836	kc->cpuid = PCPU_GET(cpuid);
837	kc->flags = 0;
838	callout_init(&kc->c, 1);
839	filt_timerstart(kn, to);
840
841	return (0);
842}
843
844static void
845filt_timerstart(struct knote *kn, sbintime_t to)
846{
847	struct kq_timer_cb_data *kc;
848
849	kc = kn->kn_ptr.p_v;
850	if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) {
851		kc->next = to;
852		kc->to = 0;
853	} else {
854		kc->next = to + sbinuptime();
855		kc->to = to;
856	}
857	kqtimer_sched_callout(kc);
858}
859
860static void
861filt_timerdetach(struct knote *kn)
862{
863	struct kq_timer_cb_data *kc;
864	unsigned int old __unused;
865	bool pending;
866
867	kc = kn->kn_ptr.p_v;
868	do {
869		callout_drain(&kc->c);
870
871		/*
872		 * kqtimer_proc_continue() might have rescheduled this callout.
873		 * Double-check, using the process mutex as an interlock.
874		 */
875		PROC_LOCK(kc->p);
876		if ((kc->flags & KQ_TIMER_CB_ENQUEUED) != 0) {
877			kc->flags &= ~KQ_TIMER_CB_ENQUEUED;
878			TAILQ_REMOVE(&kc->p->p_kqtim_stop, kc, link);
879		}
880		pending = callout_pending(&kc->c);
881		PROC_UNLOCK(kc->p);
882	} while (pending);
883	free(kc, M_KQUEUE);
884	old = atomic_fetchadd_int(&kq_ncallouts, -1);
885	KASSERT(old > 0, ("Number of callouts cannot become negative"));
886	kn->kn_status |= KN_DETACHED;	/* knlist_remove sets it */
887}
888
889static void
890filt_timertouch(struct knote *kn, struct kevent *kev, u_long type)
891{
892	struct kq_timer_cb_data *kc;
893	struct kqueue *kq;
894	sbintime_t to;
895	int error;
896
897	switch (type) {
898	case EVENT_REGISTER:
899		/* Handle re-added timers that update data/fflags */
900		if (kev->flags & EV_ADD) {
901			kc = kn->kn_ptr.p_v;
902
903			/* Drain any existing callout. */
904			callout_drain(&kc->c);
905
			/*
			 * Throw away any existing undelivered record
			 * of the timer expiration.  This is done under
			 * the presumption that if a process is
			 * re-adding this timer with new parameters,
			 * it is no longer interested in what may have
			 * happened under the old parameters.  If it is
			 * interested, it can wait for the expiration,
			 * delete the old timer definition, and then
			 * add the new one.
			 *
			 * This has to be done while the kq is locked:
			 *   - if enqueued, dequeue
			 *   - make it no longer active
			 *   - clear the count of expiration events
			 */
921			kq = kn->kn_kq;
922			KQ_LOCK(kq);
923			if (kn->kn_status & KN_QUEUED)
924				knote_dequeue(kn);
925
926			kn->kn_status &= ~KN_ACTIVE;
927			kn->kn_data = 0;
928			KQ_UNLOCK(kq);
929
930			/* Reschedule timer based on new data/fflags */
931			kn->kn_sfflags = kev->fflags;
932			kn->kn_sdata = kev->data;
933			error = filt_timervalidate(kn, &to);
934			if (error != 0) {
935			  	kn->kn_flags |= EV_ERROR;
936				kn->kn_data = error;
937			} else
938			  	filt_timerstart(kn, to);
939		}
940		break;
941
942        case EVENT_PROCESS:
943		*kev = kn->kn_kevent;
944		if (kn->kn_flags & EV_CLEAR) {
945			kn->kn_data = 0;
946			kn->kn_fflags = 0;
947		}
948		break;
949
950	default:
951		panic("filt_timertouch() - invalid type (%ld)", type);
952		break;
953	}
954}
955
956static int
957filt_timer(struct knote *kn, long hint)
958{
959
960	return (kn->kn_data != 0);
961}
962
963static int
964filt_userattach(struct knote *kn)
965{
966
967	/*
968	 * EVFILT_USER knotes are not attached to anything in the kernel.
969	 */
970	kn->kn_hook = NULL;
971	if (kn->kn_fflags & NOTE_TRIGGER)
972		kn->kn_hookid = 1;
973	else
974		kn->kn_hookid = 0;
975	return (0);
976}
977
978static void
979filt_userdetach(__unused struct knote *kn)
980{
981
982	/*
983	 * EVFILT_USER knotes are not attached to anything in the kernel.
984	 */
985}
986
987static int
988filt_user(struct knote *kn, __unused long hint)
989{
990
991	return (kn->kn_hookid);
992}
993
994static void
995filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
996{
997	u_int ffctrl;
998
999	switch (type) {
1000	case EVENT_REGISTER:
1001		if (kev->fflags & NOTE_TRIGGER)
1002			kn->kn_hookid = 1;
1003
1004		ffctrl = kev->fflags & NOTE_FFCTRLMASK;
1005		kev->fflags &= NOTE_FFLAGSMASK;
1006		switch (ffctrl) {
1007		case NOTE_FFNOP:
1008			break;
1009
1010		case NOTE_FFAND:
1011			kn->kn_sfflags &= kev->fflags;
1012			break;
1013
1014		case NOTE_FFOR:
1015			kn->kn_sfflags |= kev->fflags;
1016			break;
1017
1018		case NOTE_FFCOPY:
1019			kn->kn_sfflags = kev->fflags;
1020			break;
1021
1022		default:
1023			/* XXX Return error? */
1024			break;
1025		}
1026		kn->kn_sdata = kev->data;
1027		if (kev->flags & EV_CLEAR) {
1028			kn->kn_hookid = 0;
1029			kn->kn_data = 0;
1030			kn->kn_fflags = 0;
1031		}
1032		break;
1033
1034        case EVENT_PROCESS:
1035		*kev = kn->kn_kevent;
1036		kev->fflags = kn->kn_sfflags;
1037		kev->data = kn->kn_sdata;
1038		if (kn->kn_flags & EV_CLEAR) {
1039			kn->kn_hookid = 0;
1040			kn->kn_data = 0;
1041			kn->kn_fflags = 0;
1042		}
1043		break;
1044
1045	default:
1046		panic("filt_usertouch() - invalid type (%ld)", type);
1047		break;
1048	}
1049}
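
/*
 * Illustrative EVFILT_USER round trip through the attach/touch/event
 * handlers above (a sketch, not part of this file; "kq" is a kqueue
 * descriptor):
 *
 *	struct kevent kev;
 *	EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *	...
 *	EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * The NOTE_TRIGGER registration sets kn_hookid, which makes filt_user()
 * report the event; EV_CLEAR clears the triggered state when the event is
 * retrieved, so it can be triggered again.
 */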
1050
1051int
1052sys_kqueue(struct thread *td, struct kqueue_args *uap)
1053{
1054
1055	return (kern_kqueue(td, 0, NULL));
1056}
1057
1058int
1059sys_kqueuex(struct thread *td, struct kqueuex_args *uap)
1060{
1061	int flags;
1062
1063	if ((uap->flags & ~(KQUEUE_CLOEXEC)) != 0)
1064		return (EINVAL);
1065	flags = 0;
1066	if ((uap->flags & KQUEUE_CLOEXEC) != 0)
1067		flags |= O_CLOEXEC;
1068	return (kern_kqueue(td, flags, NULL));
1069}
1070
1071static void
1072kqueue_init(struct kqueue *kq)
1073{
1074
1075	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK);
1076	TAILQ_INIT(&kq->kq_head);
1077	knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
1078	TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
1079}
1080
1081int
1082kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps)
1083{
1084	struct filedesc *fdp;
1085	struct kqueue *kq;
1086	struct file *fp;
1087	struct ucred *cred;
1088	int fd, error;
1089
1090	fdp = td->td_proc->p_fd;
1091	cred = td->td_ucred;
1092	if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES)))
1093		return (ENOMEM);
1094
1095	error = falloc_caps(td, &fp, &fd, flags, fcaps);
1096	if (error != 0) {
1097		chgkqcnt(cred->cr_ruidinfo, -1, 0);
1098		return (error);
1099	}
1100
1101	/* An extra reference on `fp' has been held for us by falloc(). */
1102	kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
1103	kqueue_init(kq);
1104	kq->kq_fdp = fdp;
1105	kq->kq_cred = crhold(cred);
1106
1107	FILEDESC_XLOCK(fdp);
1108	TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
1109	FILEDESC_XUNLOCK(fdp);
1110
1111	finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
1112	fdrop(fp, td);
1113
1114	td->td_retval[0] = fd;
1115	return (0);
1116}
1117
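/*
 * Common argument layout shared by the native sys_kevent() and the compat
 * entry points; kern_kevent_generic() consumes it together with a
 * kevent_copyops describing the user-visible kevent structure.
 */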
1118struct g_kevent_args {
1119	int	fd;
1120	const void *changelist;
1121	int	nchanges;
1122	void	*eventlist;
1123	int	nevents;
1124	const struct timespec *timeout;
1125};
1126
1127int
1128sys_kevent(struct thread *td, struct kevent_args *uap)
1129{
1130	struct kevent_copyops k_ops = {
1131		.arg = uap,
1132		.k_copyout = kevent_copyout,
1133		.k_copyin = kevent_copyin,
1134		.kevent_size = sizeof(struct kevent),
1135	};
1136	struct g_kevent_args gk_args = {
1137		.fd = uap->fd,
1138		.changelist = uap->changelist,
1139		.nchanges = uap->nchanges,
1140		.eventlist = uap->eventlist,
1141		.nevents = uap->nevents,
1142		.timeout = uap->timeout,
1143	};
1144
1145	return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent"));
1146}
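
/*
 * Illustrative userland call corresponding to the syscall above (a sketch,
 * not part of this file; "fd" is any descriptor whose fileops implement
 * fo_kqfilter, e.g. a socket):
 *
 *	struct kevent change, event;
 *	EV_SET(&change, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	int n = kevent(kq, &change, 1, &event, 1, NULL);
 *
 * A NULL timeout blocks until at least one event is available.
 */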
1147
1148static int
1149kern_kevent_generic(struct thread *td, struct g_kevent_args *uap,
1150    struct kevent_copyops *k_ops, const char *struct_name)
1151{
1152	struct timespec ts, *tsp;
1153#ifdef KTRACE
1154	struct kevent *eventlist = uap->eventlist;
1155#endif
1156	int error;
1157
1158	if (uap->timeout != NULL) {
1159		error = copyin(uap->timeout, &ts, sizeof(ts));
1160		if (error)
1161			return (error);
1162		tsp = &ts;
1163	} else
1164		tsp = NULL;
1165
1166#ifdef KTRACE
1167	if (KTRPOINT(td, KTR_STRUCT_ARRAY))
1168		ktrstructarray(struct_name, UIO_USERSPACE, uap->changelist,
1169		    uap->nchanges, k_ops->kevent_size);
1170#endif
1171
1172	error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
1173	    k_ops, tsp);
1174
1175#ifdef KTRACE
1176	if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY))
1177		ktrstructarray(struct_name, UIO_USERSPACE, eventlist,
1178		    td->td_retval[0], k_ops->kevent_size);
1179#endif
1180
1181	return (error);
1182}
1183
1184/*
1185 * Copy 'count' items into the destination list pointed to by uap->eventlist.
1186 */
1187static int
1188kevent_copyout(void *arg, struct kevent *kevp, int count)
1189{
1190	struct kevent_args *uap;
1191	int error;
1192
1193	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1194	uap = (struct kevent_args *)arg;
1195
1196	error = copyout(kevp, uap->eventlist, count * sizeof *kevp);
1197	if (error == 0)
1198		uap->eventlist += count;
1199	return (error);
1200}
1201
1202/*
1203 * Copy 'count' items from the list pointed to by uap->changelist.
1204 */
1205static int
1206kevent_copyin(void *arg, struct kevent *kevp, int count)
1207{
1208	struct kevent_args *uap;
1209	int error;
1210
1211	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1212	uap = (struct kevent_args *)arg;
1213
1214	error = copyin(uap->changelist, kevp, count * sizeof *kevp);
1215	if (error == 0)
1216		uap->changelist += count;
1217	return (error);
1218}
1219
1220#ifdef COMPAT_FREEBSD11
1221static int
1222kevent11_copyout(void *arg, struct kevent *kevp, int count)
1223{
1224	struct freebsd11_kevent_args *uap;
1225	struct freebsd11_kevent kev11;
1226	int error, i;
1227
1228	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1229	uap = (struct freebsd11_kevent_args *)arg;
1230
1231	for (i = 0; i < count; i++) {
1232		kev11.ident = kevp->ident;
1233		kev11.filter = kevp->filter;
1234		kev11.flags = kevp->flags;
1235		kev11.fflags = kevp->fflags;
1236		kev11.data = kevp->data;
1237		kev11.udata = kevp->udata;
1238		error = copyout(&kev11, uap->eventlist, sizeof(kev11));
1239		if (error != 0)
1240			break;
1241		uap->eventlist++;
1242		kevp++;
1243	}
1244	return (error);
1245}
1246
1247/*
1248 * Copy 'count' items from the list pointed to by uap->changelist.
1249 */
1250static int
1251kevent11_copyin(void *arg, struct kevent *kevp, int count)
1252{
1253	struct freebsd11_kevent_args *uap;
1254	struct freebsd11_kevent kev11;
1255	int error, i;
1256
1257	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1258	uap = (struct freebsd11_kevent_args *)arg;
1259
1260	for (i = 0; i < count; i++) {
1261		error = copyin(uap->changelist, &kev11, sizeof(kev11));
1262		if (error != 0)
1263			break;
1264		kevp->ident = kev11.ident;
1265		kevp->filter = kev11.filter;
1266		kevp->flags = kev11.flags;
1267		kevp->fflags = kev11.fflags;
1268		kevp->data = (uintptr_t)kev11.data;
1269		kevp->udata = kev11.udata;
1270		bzero(&kevp->ext, sizeof(kevp->ext));
1271		uap->changelist++;
1272		kevp++;
1273	}
1274	return (error);
1275}
1276
1277int
1278freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap)
1279{
1280	struct kevent_copyops k_ops = {
1281		.arg = uap,
1282		.k_copyout = kevent11_copyout,
1283		.k_copyin = kevent11_copyin,
1284		.kevent_size = sizeof(struct freebsd11_kevent),
1285	};
1286	struct g_kevent_args gk_args = {
1287		.fd = uap->fd,
1288		.changelist = uap->changelist,
1289		.nchanges = uap->nchanges,
1290		.eventlist = uap->eventlist,
1291		.nevents = uap->nevents,
1292		.timeout = uap->timeout,
1293	};
1294
1295	return (kern_kevent_generic(td, &gk_args, &k_ops, "freebsd11_kevent"));
1296}
1297#endif
1298
1299int
1300kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
1301    struct kevent_copyops *k_ops, const struct timespec *timeout)
1302{
1303	cap_rights_t rights;
1304	struct file *fp;
1305	int error;
1306
1307	cap_rights_init_zero(&rights);
1308	if (nchanges > 0)
1309		cap_rights_set_one(&rights, CAP_KQUEUE_CHANGE);
1310	if (nevents > 0)
1311		cap_rights_set_one(&rights, CAP_KQUEUE_EVENT);
1312	error = fget(td, fd, &rights, &fp);
1313	if (error != 0)
1314		return (error);
1315
1316	error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout);
1317	fdrop(fp, td);
1318
1319	return (error);
1320}
1321
1322static int
1323kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents,
1324    struct kevent_copyops *k_ops, const struct timespec *timeout)
1325{
1326	struct kevent keva[KQ_NEVENTS];
1327	struct kevent *kevp, *changes;
1328	int i, n, nerrors, error;
1329
1330	if (nchanges < 0)
1331		return (EINVAL);
1332
1333	nerrors = 0;
1334	while (nchanges > 0) {
1335		n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
1336		error = k_ops->k_copyin(k_ops->arg, keva, n);
1337		if (error)
1338			return (error);
1339		changes = keva;
1340		for (i = 0; i < n; i++) {
1341			kevp = &changes[i];
1342			if (!kevp->filter)
1343				continue;
1344			kevp->flags &= ~EV_SYSFLAGS;
1345			error = kqueue_register(kq, kevp, td, M_WAITOK);
1346			if (error || (kevp->flags & EV_RECEIPT)) {
1347				if (nevents == 0)
1348					return (error);
1349				kevp->flags = EV_ERROR;
1350				kevp->data = error;
1351				(void)k_ops->k_copyout(k_ops->arg, kevp, 1);
1352				nevents--;
1353				nerrors++;
1354			}
1355		}
1356		nchanges -= n;
1357	}
1358	if (nerrors) {
1359		td->td_retval[0] = nerrors;
1360		return (0);
1361	}
1362
1363	return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td));
1364}
1365
1366int
1367kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents,
1368    struct kevent_copyops *k_ops, const struct timespec *timeout)
1369{
1370	struct kqueue *kq;
1371	int error;
1372
1373	error = kqueue_acquire(fp, &kq);
1374	if (error != 0)
1375		return (error);
1376	error = kqueue_kevent(kq, td, nchanges, nevents, k_ops, timeout);
1377	kqueue_release(kq, 0);
1378	return (error);
1379}
1380
1381/*
1382 * Performs a kevent() call on a temporarily created kqueue. This can be
1383 * used to perform one-shot polling, similar to poll() and select().
1384 */
1385int
1386kern_kevent_anonymous(struct thread *td, int nevents,
1387    struct kevent_copyops *k_ops)
1388{
1389	struct kqueue kq = {};
1390	int error;
1391
1392	kqueue_init(&kq);
1393	kq.kq_refcnt = 1;
1394	error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL);
1395	kqueue_drain(&kq, td);
1396	kqueue_destroy(&kq);
1397	return (error);
1398}
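
/*
 * The temporary kqueue used above lives entirely on the caller's stack and
 * never appears in a file descriptor table, so kqueue_drain() and
 * kqueue_destroy() are called directly instead of going through
 * kqueue_close().
 */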
1399
1400int
1401kqueue_add_filteropts(int filt, const struct filterops *filtops)
1402{
1403	int error;
1404
1405	error = 0;
1406	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
1407		printf(
1408"trying to add a filterop that is out of range: %d is beyond %d\n",
1409		    ~filt, EVFILT_SYSCOUNT);
1410		return EINVAL;
1411	}
1412	mtx_lock(&filterops_lock);
1413	if (sysfilt_ops[~filt].for_fop != &null_filtops &&
1414	    sysfilt_ops[~filt].for_fop != NULL)
1415		error = EEXIST;
1416	else {
1417		sysfilt_ops[~filt].for_fop = filtops;
1418		sysfilt_ops[~filt].for_refcnt = 0;
1419	}
1420	mtx_unlock(&filterops_lock);
1421
1422	return (error);
1423}
1424
1425int
1426kqueue_del_filteropts(int filt)
1427{
1428	int error;
1429
1430	error = 0;
1431	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1432		return EINVAL;
1433
1434	mtx_lock(&filterops_lock);
1435	if (sysfilt_ops[~filt].for_fop == &null_filtops ||
1436	    sysfilt_ops[~filt].for_fop == NULL)
1437		error = EINVAL;
1438	else if (sysfilt_ops[~filt].for_refcnt != 0)
1439		error = EBUSY;
1440	else {
1441		sysfilt_ops[~filt].for_fop = &null_filtops;
1442		sysfilt_ops[~filt].for_refcnt = 0;
1443	}
1444	mtx_unlock(&filterops_lock);
1445
1446	return error;
1447}
1448
1449static const struct filterops *
1450kqueue_fo_find(int filt)
1451{
1452
1453	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1454		return NULL;
1455
1456	if (sysfilt_ops[~filt].for_nolock)
1457		return sysfilt_ops[~filt].for_fop;
1458
1459	mtx_lock(&filterops_lock);
1460	sysfilt_ops[~filt].for_refcnt++;
1461	if (sysfilt_ops[~filt].for_fop == NULL)
1462		sysfilt_ops[~filt].for_fop = &null_filtops;
1463	mtx_unlock(&filterops_lock);
1464
1465	return sysfilt_ops[~filt].for_fop;
1466}
1467
1468static void
1469kqueue_fo_release(int filt)
1470{
1471
1472	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1473		return;
1474
1475	if (sysfilt_ops[~filt].for_nolock)
1476		return;
1477
1478	mtx_lock(&filterops_lock);
1479	KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
1480	    ("filter object refcount not valid on release"));
1481	sysfilt_ops[~filt].for_refcnt--;
1482	mtx_unlock(&filterops_lock);
1483}
1484
1485/*
1486 * A ref to kq (obtained via kqueue_acquire) must be held.
1487 */
1488static int
1489kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td,
1490    int mflag)
1491{
1492	const struct filterops *fops;
1493	struct file *fp;
1494	struct knote *kn, *tkn;
1495	struct knlist *knl;
1496	int error, filt, event;
1497	int haskqglobal, filedesc_unlock;
1498
1499	if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE))
1500		return (EINVAL);
1501
1502	fp = NULL;
1503	kn = NULL;
1504	knl = NULL;
1505	error = 0;
1506	haskqglobal = 0;
1507	filedesc_unlock = 0;
1508
1509	filt = kev->filter;
1510	fops = kqueue_fo_find(filt);
1511	if (fops == NULL)
1512		return EINVAL;
1513
1514	if (kev->flags & EV_ADD) {
1515		/* Reject an invalid flag pair early */
1516		if (kev->flags & EV_KEEPUDATA) {
1517			tkn = NULL;
1518			error = EINVAL;
1519			goto done;
1520		}
1521
		/*
		 * Allocate the spare knote up front to prevent sleeping
		 * while holding locks.  A non-sleepable allocation failure
		 * is handled in the loop, and only if the spare knote
		 * turns out to actually be required.
		 */
1527		tkn = knote_alloc(mflag);
1528	} else {
1529		tkn = NULL;
1530	}
1531
1532findkn:
1533	if (fops->f_isfd) {
1534		KASSERT(td != NULL, ("td is NULL"));
1535		if (kev->ident > INT_MAX)
1536			error = EBADF;
1537		else
1538			error = fget(td, kev->ident, &cap_event_rights, &fp);
1539		if (error)
1540			goto done;
1541
1542		if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
1543		    kev->ident, M_NOWAIT) != 0) {
1544			/* try again */
1545			fdrop(fp, td);
1546			fp = NULL;
1547			error = kqueue_expand(kq, fops, kev->ident, mflag);
1548			if (error)
1549				goto done;
1550			goto findkn;
1551		}
1552
1553		if (fp->f_type == DTYPE_KQUEUE) {
1554			/*
1555			 * If we add some intelligence about what we are doing,
1556			 * we should be able to support events on ourselves.
1557			 * We need to know when we are doing this to prevent
1558			 * getting both the knlist lock and the kq lock since
1559			 * they are the same thing.
1560			 */
1561			if (fp->f_data == kq) {
1562				error = EINVAL;
1563				goto done;
1564			}
1565
1566			/*
1567			 * Pre-lock the filedesc before the global
1568			 * lock mutex, see the comment in
1569			 * kqueue_close().
1570			 */
1571			FILEDESC_XLOCK(td->td_proc->p_fd);
1572			filedesc_unlock = 1;
1573			KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1574		}
1575
1576		KQ_LOCK(kq);
1577		if (kev->ident < kq->kq_knlistsize) {
1578			SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
1579				if (kev->filter == kn->kn_filter)
1580					break;
1581		}
1582	} else {
1583		if ((kev->flags & EV_ADD) == EV_ADD) {
1584			error = kqueue_expand(kq, fops, kev->ident, mflag);
1585			if (error != 0)
1586				goto done;
1587		}
1588
1589		KQ_LOCK(kq);
1590
1591		/*
1592		 * If possible, find an existing knote to use for this kevent.
1593		 */
1594		if (kev->filter == EVFILT_PROC &&
1595		    (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) {
			/*
			 * This is an internal creation of a process
			 * tracking note.  Don't attempt to coalesce this
			 * with an existing note.
			 */
1600			;
1601		} else if (kq->kq_knhashmask != 0) {
1602			struct klist *list;
1603
1604			list = &kq->kq_knhash[
1605			    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
1606			SLIST_FOREACH(kn, list, kn_link)
1607				if (kev->ident == kn->kn_id &&
1608				    kev->filter == kn->kn_filter)
1609					break;
1610		}
1611	}
1612
1613	/* knote is in the process of changing, wait for it to stabilize. */
1614	if (kn != NULL && kn_in_flux(kn)) {
1615		KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1616		if (filedesc_unlock) {
1617			FILEDESC_XUNLOCK(td->td_proc->p_fd);
1618			filedesc_unlock = 0;
1619		}
1620		kq->kq_state |= KQ_FLUXWAIT;
1621		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
1622		if (fp != NULL) {
1623			fdrop(fp, td);
1624			fp = NULL;
1625		}
1626		goto findkn;
1627	}
1628
1629	/*
1630	 * kn now contains the matching knote, or NULL if no match
1631	 */
1632	if (kn == NULL) {
1633		if (kev->flags & EV_ADD) {
1634			kn = tkn;
1635			tkn = NULL;
1636			if (kn == NULL) {
1637				KQ_UNLOCK(kq);
1638				error = ENOMEM;
1639				goto done;
1640			}
1641			kn->kn_fp = fp;
1642			kn->kn_kq = kq;
1643			kn->kn_fop = fops;
1644			/*
1645			 * apply reference counts to knote structure, and
1646			 * do not release it at the end of this routine.
1647			 */
1648			fops = NULL;
1649			fp = NULL;
1650
1651			kn->kn_sfflags = kev->fflags;
1652			kn->kn_sdata = kev->data;
1653			kev->fflags = 0;
1654			kev->data = 0;
1655			kn->kn_kevent = *kev;
1656			kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
1657			    EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT);
1658			kn->kn_status = KN_DETACHED;
1659			if ((kev->flags & EV_DISABLE) != 0)
1660				kn->kn_status |= KN_DISABLED;
1661			kn_enter_flux(kn);
1662
1663			error = knote_attach(kn, kq);
1664			KQ_UNLOCK(kq);
1665			if (error != 0) {
1666				tkn = kn;
1667				goto done;
1668			}
1669
1670			if ((error = kn->kn_fop->f_attach(kn)) != 0) {
1671				knote_drop_detached(kn, td);
1672				goto done;
1673			}
1674			knl = kn_list_lock(kn);
1675			goto done_ev_add;
1676		} else {
1677			/* No matching knote and the EV_ADD flag is not set. */
1678			KQ_UNLOCK(kq);
1679			error = ENOENT;
1680			goto done;
1681		}
1682	}
1683
1684	if (kev->flags & EV_DELETE) {
1685		kn_enter_flux(kn);
1686		KQ_UNLOCK(kq);
1687		knote_drop(kn, td);
1688		goto done;
1689	}
1690
1691	if (kev->flags & EV_FORCEONESHOT) {
1692		kn->kn_flags |= EV_ONESHOT;
1693		KNOTE_ACTIVATE(kn, 1);
1694	}
1695
1696	if ((kev->flags & EV_ENABLE) != 0)
1697		kn->kn_status &= ~KN_DISABLED;
1698	else if ((kev->flags & EV_DISABLE) != 0)
1699		kn->kn_status |= KN_DISABLED;
1700
1701	/*
1702	 * The user may change some filter values after the initial EV_ADD,
1703	 * but doing so will not reset any filter which has already been
1704	 * triggered.
1705	 */
1706	kn->kn_status |= KN_SCAN;
1707	kn_enter_flux(kn);
1708	KQ_UNLOCK(kq);
1709	knl = kn_list_lock(kn);
1710	if ((kev->flags & EV_KEEPUDATA) == 0)
1711		kn->kn_kevent.udata = kev->udata;
1712	if (!fops->f_isfd && fops->f_touch != NULL) {
1713		fops->f_touch(kn, kev, EVENT_REGISTER);
1714	} else {
1715		kn->kn_sfflags = kev->fflags;
1716		kn->kn_sdata = kev->data;
1717	}
1718
1719done_ev_add:
1720	/*
1721	 * We can get here with kn->kn_knlist == NULL.  This can happen when
1722	 * the initial attach event decides that the event is "completed"
1723	 * already, e.g., filt_procattach() is called on a zombie process.  It
1724	 * will call filt_proc() which will remove it from the list, and NULL
1725	 * kn_knlist.
1726	 *
1727	 * KN_DISABLED will be stable while the knote is in flux, so the
1728	 * unlocked read will not race with an update.
1729	 */
1730	if ((kn->kn_status & KN_DISABLED) == 0)
1731		event = kn->kn_fop->f_event(kn, 0);
1732	else
1733		event = 0;
1734
1735	KQ_LOCK(kq);
1736	if (event)
1737		kn->kn_status |= KN_ACTIVE;
1738	if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) ==
1739	    KN_ACTIVE)
1740		knote_enqueue(kn);
1741	kn->kn_status &= ~KN_SCAN;
1742	kn_leave_flux(kn);
1743	kn_list_unlock(knl);
1744	KQ_UNLOCK_FLUX(kq);
1745
1746done:
1747	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1748	if (filedesc_unlock)
1749		FILEDESC_XUNLOCK(td->td_proc->p_fd);
1750	if (fp != NULL)
1751		fdrop(fp, td);
1752	knote_free(tkn);
1753	if (fops != NULL)
1754		kqueue_fo_release(filt);
1755	return (error);
1756}
1757
1758static int
1759kqueue_acquire(struct file *fp, struct kqueue **kqp)
1760{
1761	int error;
1762	struct kqueue *kq;
1763
1764	error = 0;
1765
1766	kq = fp->f_data;
1767	if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
1768		return (EBADF);
1769	*kqp = kq;
1770	KQ_LOCK(kq);
1771	if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
1772		KQ_UNLOCK(kq);
1773		return (EBADF);
1774	}
1775	kq->kq_refcnt++;
1776	KQ_UNLOCK(kq);
1777
1778	return error;
1779}
1780
1781static void
1782kqueue_release(struct kqueue *kq, int locked)
1783{
1784	if (locked)
1785		KQ_OWNED(kq);
1786	else
1787		KQ_LOCK(kq);
1788	kq->kq_refcnt--;
1789	if (kq->kq_refcnt == 1)
1790		wakeup(&kq->kq_refcnt);
1791	if (!locked)
1792		KQ_UNLOCK(kq);
1793}
1794
1795static void
1796ast_kqueue(struct thread *td, int tda __unused)
1797{
1798	taskqueue_quiesce(taskqueue_kqueue_ctx);
1799}
1800
1801static void
1802kqueue_schedtask(struct kqueue *kq)
1803{
1804	KQ_OWNED(kq);
1805	KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
1806	    ("scheduling kqueue task while draining"));
1807
1808	if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
1809		taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task);
1810		kq->kq_state |= KQ_TASKSCHED;
1811		ast_sched(curthread, TDA_KQUEUE);
1812	}
1813}
1814
/*
 * Expand the kq to make sure we have storage for the fops/ident pair.
 *
 * Return 0 on success (or if no work was necessary), or an errno on failure.
 */
1820static int
1821kqueue_expand(struct kqueue *kq, const struct filterops *fops, uintptr_t ident,
1822    int mflag)
1823{
1824	struct klist *list, *tmp_knhash, *to_free;
1825	u_long tmp_knhashmask;
1826	int error, fd, size;
1827
1828	KQ_NOTOWNED(kq);
1829
1830	error = 0;
1831	to_free = NULL;
1832	if (fops->f_isfd) {
1833		fd = ident;
1834		if (kq->kq_knlistsize <= fd) {
1835			size = kq->kq_knlistsize;
1836			while (size <= fd)
1837				size += KQEXTENT;
1838			list = malloc(size * sizeof(*list), M_KQUEUE, mflag);
1839			if (list == NULL)
1840				return ENOMEM;
1841			KQ_LOCK(kq);
1842			if ((kq->kq_state & KQ_CLOSING) != 0) {
1843				to_free = list;
1844				error = EBADF;
1845			} else if (kq->kq_knlistsize > fd) {
1846				to_free = list;
1847			} else {
1848				if (kq->kq_knlist != NULL) {
1849					bcopy(kq->kq_knlist, list,
1850					    kq->kq_knlistsize * sizeof(*list));
1851					to_free = kq->kq_knlist;
1852					kq->kq_knlist = NULL;
1853				}
1854				bzero((caddr_t)list +
1855				    kq->kq_knlistsize * sizeof(*list),
1856				    (size - kq->kq_knlistsize) * sizeof(*list));
1857				kq->kq_knlistsize = size;
1858				kq->kq_knlist = list;
1859			}
1860			KQ_UNLOCK(kq);
1861		}
1862	} else {
1863		if (kq->kq_knhashmask == 0) {
1864			tmp_knhash = hashinit_flags(KN_HASHSIZE, M_KQUEUE,
1865			    &tmp_knhashmask, (mflag & M_WAITOK) != 0 ?
1866			    HASH_WAITOK : HASH_NOWAIT);
1867			if (tmp_knhash == NULL)
1868				return (ENOMEM);
1869			KQ_LOCK(kq);
1870			if ((kq->kq_state & KQ_CLOSING) != 0) {
1871				to_free = tmp_knhash;
1872				error = EBADF;
1873			} else if (kq->kq_knhashmask == 0) {
1874				kq->kq_knhash = tmp_knhash;
1875				kq->kq_knhashmask = tmp_knhashmask;
1876			} else {
1877				to_free = tmp_knhash;
1878			}
1879			KQ_UNLOCK(kq);
1880		}
1881	}
1882	free(to_free, M_KQUEUE);
1883
1884	KQ_NOTOWNED(kq);
1885	return (error);
1886}
1887
1888static void
1889kqueue_task(void *arg, int pending)
1890{
1891	struct kqueue *kq;
1892	int haskqglobal;
1893
1894	haskqglobal = 0;
1895	kq = arg;
1896
1897	KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1898	KQ_LOCK(kq);
1899
1900	KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
1901
1902	kq->kq_state &= ~KQ_TASKSCHED;
1903	if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
1904		wakeup(&kq->kq_state);
1905	}
1906	KQ_UNLOCK(kq);
1907	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1908}
1909
1910/*
1911 * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
1912 * We treat KN_MARKER knotes as if they are in flux.
1913 */
1914static int
1915kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
1916    const struct timespec *tsp, struct kevent *keva, struct thread *td)
1917{
1918	struct kevent *kevp;
1919	struct knote *kn, *marker;
1920	struct knlist *knl;
1921	sbintime_t asbt, rsbt;
1922	int count, error, haskqglobal, influx, nkev, touch;
1923
1924	count = maxevents;
1925	nkev = 0;
1926	error = 0;
1927	haskqglobal = 0;
1928
1929	if (maxevents == 0)
1930		goto done_nl;
1931	if (maxevents < 0) {
1932		error = EINVAL;
1933		goto done_nl;
1934	}
1935
1936	rsbt = 0;
1937	if (tsp != NULL) {
1938		if (!timespecvalid_interval(tsp)) {
1939			error = EINVAL;
1940			goto done_nl;
1941		}
1942		if (timespecisset(tsp)) {
1943			if (tsp->tv_sec <= INT32_MAX) {
1944				rsbt = tstosbt(*tsp);
1945				if (TIMESEL(&asbt, rsbt))
1946					asbt += tc_tick_sbt;
1947				if (asbt <= SBT_MAX - rsbt)
1948					asbt += rsbt;
1949				else
1950					asbt = 0;
1951				rsbt >>= tc_precexp;
1952			} else
1953				asbt = 0;
1954		} else
1955			asbt = -1;
1956	} else
1957		asbt = 0;
1958	marker = knote_alloc(M_WAITOK);
1959	marker->kn_status = KN_MARKER;
1960	KQ_LOCK(kq);
1961
1962retry:
1963	kevp = keva;
1964	if (kq->kq_count == 0) {
1965		if (asbt == -1) {
1966			error = EWOULDBLOCK;
1967		} else {
1968			kq->kq_state |= KQ_SLEEP;
1969			error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH,
1970			    "kqread", asbt, rsbt, C_ABSOLUTE);
1971		}
1972		if (error == 0)
1973			goto retry;
1974		/* don't restart after signals... */
1975		if (error == ERESTART)
1976			error = EINTR;
1977		else if (error == EWOULDBLOCK)
1978			error = 0;
1979		goto done;
1980	}
1981
1982	TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
1983	influx = 0;
1984	while (count) {
1985		KQ_OWNED(kq);
1986		kn = TAILQ_FIRST(&kq->kq_head);
1987
1988		if ((kn->kn_status == KN_MARKER && kn != marker) ||
1989		    kn_in_flux(kn)) {
1990			if (influx) {
1991				influx = 0;
1992				KQ_FLUX_WAKEUP(kq);
1993			}
1994			kq->kq_state |= KQ_FLUXWAIT;
1995			error = msleep(kq, &kq->kq_lock, PSOCK,
1996			    "kqflxwt", 0);
1997			continue;
1998		}
1999
2000		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2001		if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
2002			kn->kn_status &= ~KN_QUEUED;
2003			kq->kq_count--;
2004			continue;
2005		}
2006		if (kn == marker) {
2007			KQ_FLUX_WAKEUP(kq);
2008			if (count == maxevents)
2009				goto retry;
2010			goto done;
2011		}
2012		KASSERT(!kn_in_flux(kn),
2013		    ("knote %p is unexpectedly in flux", kn));
2014
2015		if ((kn->kn_flags & EV_DROP) == EV_DROP) {
2016			kn->kn_status &= ~KN_QUEUED;
2017			kn_enter_flux(kn);
2018			kq->kq_count--;
2019			KQ_UNLOCK(kq);
2020			/*
2021			 * We don't need to lock the list since we've
2022			 * marked it as in flux.
2023			 */
2024			knote_drop(kn, td);
2025			KQ_LOCK(kq);
2026			continue;
2027		} else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
2028			kn->kn_status &= ~KN_QUEUED;
2029			kn_enter_flux(kn);
2030			kq->kq_count--;
2031			KQ_UNLOCK(kq);
2032			/*
2033			 * We don't need to lock the list since we've
2034			 * marked the knote as being in flux.
2035			 */
2036			*kevp = kn->kn_kevent;
2037			knote_drop(kn, td);
2038			KQ_LOCK(kq);
2039			kn = NULL;
2040		} else {
2041			kn->kn_status |= KN_SCAN;
2042			kn_enter_flux(kn);
2043			KQ_UNLOCK(kq);
2044			if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
2045				KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
2046			knl = kn_list_lock(kn);
2047			if (kn->kn_fop->f_event(kn, 0) == 0) {
2048				KQ_LOCK(kq);
2049				KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
2050				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE |
2051				    KN_SCAN);
2052				kn_leave_flux(kn);
2053				kq->kq_count--;
2054				kn_list_unlock(knl);
2055				influx = 1;
2056				continue;
2057			}
2058			touch = (!kn->kn_fop->f_isfd &&
2059			    kn->kn_fop->f_touch != NULL);
2060			if (touch)
2061				kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS);
2062			else
2063				*kevp = kn->kn_kevent;
2064			KQ_LOCK(kq);
2065			KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
2066			if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
				/*
				 * Manually clear knotes that weren't
				 * 'touch'ed.
				 */
2071				if (touch == 0 && kn->kn_flags & EV_CLEAR) {
2072					kn->kn_data = 0;
2073					kn->kn_fflags = 0;
2074				}
2075				if (kn->kn_flags & EV_DISPATCH)
2076					kn->kn_status |= KN_DISABLED;
2077				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
2078				kq->kq_count--;
2079			} else
2080				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2081
2082			kn->kn_status &= ~KN_SCAN;
2083			kn_leave_flux(kn);
2084			kn_list_unlock(knl);
2085			influx = 1;
2086		}
2087
2088		/* we are returning a copy to the user */
2089		kevp++;
2090		nkev++;
2091		count--;
2092
2093		if (nkev == KQ_NEVENTS) {
2094			influx = 0;
2095			KQ_UNLOCK_FLUX(kq);
2096			error = k_ops->k_copyout(k_ops->arg, keva, nkev);
2097			nkev = 0;
2098			kevp = keva;
2099			KQ_LOCK(kq);
2100			if (error)
2101				break;
2102		}
2103	}
2104	TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
2105done:
2106	KQ_OWNED(kq);
2107	KQ_UNLOCK_FLUX(kq);
2108	knote_free(marker);
2109done_nl:
2110	KQ_NOTOWNED(kq);
2111	if (nkev != 0)
2112		error = k_ops->k_copyout(k_ops->arg, keva, nkev);
2113	td->td_retval[0] = maxevents - count;
2114	return (error);
2115}
2116
2117/*ARGSUSED*/
2118static int
2119kqueue_ioctl(struct file *fp, u_long cmd, void *data,
2120	struct ucred *active_cred, struct thread *td)
2121{
2122	/*
2123	 * Enabling sigio causes two major problems:
2124	 * 1) infinite recursion:
2125	 * Synopsis: kevent is being used to track signals and has FIOASYNC
2126	 * set.  On receipt of a signal this will cause a kqueue to recurse
2127	 * into itself over and over.  Sending the sigio causes the kqueue
2128	 * to become ready, which in turn posts sigio again, forever.
2129	 * Solution: this can be solved by setting a flag in the kqueue that
2130	 * we have a SIGIO in progress.
2131	 * 2) locking problems:
2132	 * Synopsis: Kqueue is a leaf subsystem, but adding signalling puts
2133	 * us above the proc and pgrp locks.
2134	 * Solution: Post a signal using an async mechanism, being sure to
2135	 * record a generation count in the delivery so that we do not deliver
2136	 * a signal to the wrong process.
2137	 *
2138	 * Note, these two mechanisms are somewhat mutually exclusive!
2139	 */
2140#if 0
2141	struct kqueue *kq;
2142
2143	kq = fp->f_data;
2144	switch (cmd) {
2145	case FIOASYNC:
2146		if (*(int *)data) {
2147			kq->kq_state |= KQ_ASYNC;
2148		} else {
2149			kq->kq_state &= ~KQ_ASYNC;
2150		}
2151		return (0);
2152
2153	case FIOSETOWN:
2154		return (fsetown(*(int *)data, &kq->kq_sigio));
2155
2156	case FIOGETOWN:
2157		*(int *)data = fgetown(&kq->kq_sigio);
2158		return (0);
2159	}
2160#endif
2161
2162	return (ENOTTY);
2163}
2164
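/*
 * poll(2)/select(2) support: a kqueue descriptor is readable when at least
 * one event is pending on its queue; otherwise the caller is recorded so it
 * can be woken up later from kqueue_wakeup().
 */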
2165/*ARGSUSED*/
2166static int
2167kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
2168	struct thread *td)
2169{
2170	struct kqueue *kq;
2171	int revents = 0;
2172	int error;
2173
2174	if ((error = kqueue_acquire(fp, &kq)))
2175		return (POLLERR);
2176
2177	KQ_LOCK(kq);
2178	if (events & (POLLIN | POLLRDNORM)) {
2179		if (kq->kq_count) {
2180			revents |= events & (POLLIN | POLLRDNORM);
2181		} else {
2182			selrecord(td, &kq->kq_sel);
2183			if (SEL_WAITING(&kq->kq_sel))
2184				kq->kq_state |= KQ_SEL;
2185		}
2186	}
2187	kqueue_release(kq, 1);
2188	KQ_UNLOCK(kq);
2189	return (revents);
2190}
2191
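/*
 * fstat(2) on a kqueue descriptor: report it as a FIFO-like object and
 * nothing more; see the comment below regarding kq_count.
 */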
2192/*ARGSUSED*/
2193static int
2194kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred)
2195{
2196
2197	bzero((void *)st, sizeof *st);
2198	/*
2199	 * We no longer return kq_count because the unlocked value is useless.
2200	 * If you spent all this time getting the count, why not spend your
2201	 * syscall better by calling kevent?
2202	 *
2203	 * XXX - This is needed for libc_r.
2204	 */
2205	st->st_mode = S_IFIFO;
2206	return (0);
2207}
2208
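/*
 * Tear down all knotes attached to the kqueue and wait for any remaining
 * references, in-flux knotes, and the scheduled task to drain before the
 * kqueue is destroyed.
 */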
2209static void
2210kqueue_drain(struct kqueue *kq, struct thread *td)
2211{
2212	struct knote *kn;
2213	int i;
2214
2215	KQ_LOCK(kq);
2216
2217	KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
2218	    ("kqueue already closing"));
2219	kq->kq_state |= KQ_CLOSING;
2220	if (kq->kq_refcnt > 1)
2221		msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);
2222
2223	KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
2224
2225	KASSERT(knlist_empty(&kq->kq_sel.si_note),
2226	    ("kqueue's knlist not empty"));
2227
2228	for (i = 0; i < kq->kq_knlistsize; i++) {
2229		while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
2230			if (kn_in_flux(kn)) {
2231				kq->kq_state |= KQ_FLUXWAIT;
2232				msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0);
2233				continue;
2234			}
2235			kn_enter_flux(kn);
2236			KQ_UNLOCK(kq);
2237			knote_drop(kn, td);
2238			KQ_LOCK(kq);
2239		}
2240	}
2241	if (kq->kq_knhashmask != 0) {
2242		for (i = 0; i <= kq->kq_knhashmask; i++) {
2243			while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
2244				if (kn_in_flux(kn)) {
2245					kq->kq_state |= KQ_FLUXWAIT;
2246					msleep(kq, &kq->kq_lock, PSOCK,
2247					       "kqclo2", 0);
2248					continue;
2249				}
2250				kn_enter_flux(kn);
2251				KQ_UNLOCK(kq);
2252				knote_drop(kn, td);
2253				KQ_LOCK(kq);
2254			}
2255		}
2256	}
2257
2258	if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
2259		kq->kq_state |= KQ_TASKDRAIN;
2260		msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
2261	}
2262
2263	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
2264		selwakeuppri(&kq->kq_sel, PSOCK);
2265		if (!SEL_WAITING(&kq->kq_sel))
2266			kq->kq_state &= ~KQ_SEL;
2267	}
2268
2269	KQ_UNLOCK(kq);
2270}
2271
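/*
 * Release the resources owned by a drained kqueue: its selinfo, knlist,
 * lock, knote storage, and SIGIO ownership.  The caller frees the kqueue
 * structure itself.
 */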
2272static void
2273kqueue_destroy(struct kqueue *kq)
2274{
2275
2276	KASSERT(kq->kq_fdp == NULL,
2277	    ("kqueue still attached to a file descriptor"));
2278	seldrain(&kq->kq_sel);
2279	knlist_destroy(&kq->kq_sel.si_note);
2280	mtx_destroy(&kq->kq_lock);
2281
2282	if (kq->kq_knhash != NULL)
2283		free(kq->kq_knhash, M_KQUEUE);
2284	if (kq->kq_knlist != NULL)
2285		free(kq->kq_knlist, M_KQUEUE);
2286
2287	funsetown(&kq->kq_sigio);
2288}
2289
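/*
 * Close a kqueue descriptor: drain it, unlink it from the process's list of
 * kqueues, and free it.
 */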
2290/*ARGSUSED*/
2291static int
2292kqueue_close(struct file *fp, struct thread *td)
2293{
2294	struct kqueue *kq = fp->f_data;
2295	struct filedesc *fdp;
2296	int error;
2297	int filedesc_unlock;
2298
2299	if ((error = kqueue_acquire(fp, &kq)))
2300		return (error);
2301	kqueue_drain(kq, td);
2302
2303	/*
2304	 * We could be called from knote_drop() doing fdrop(), itself
2305	 * called from kqueue_register().  In that case the global lock
2306	 * is owned, and the filedesc sx lock was taken before it so that
2307	 * a sleepable lock is not acquired after a non-sleepable one.
2308	 */
2309	fdp = kq->kq_fdp;
2310	kq->kq_fdp = NULL;
2311	if (!sx_xlocked(FILEDESC_LOCK(fdp))) {
2312		FILEDESC_XLOCK(fdp);
2313		filedesc_unlock = 1;
2314	} else
2315		filedesc_unlock = 0;
2316	TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list);
2317	if (filedesc_unlock)
2318		FILEDESC_XUNLOCK(fdp);
2319
2320	kqueue_destroy(kq);
2321	chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0);
2322	crfree(kq->kq_cred);
2323	free(kq, M_KQUEUE);
2324	fp->f_data = NULL;
2325
2326	return (0);
2327}
2328
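/*
 * Export the kqueue's address, event count, and state for the kinfo_file
 * reporting interfaces.
 */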
2329static int
2330kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
2331{
2332	struct kqueue *kq = fp->f_data;
2333
2334	kif->kf_type = KF_TYPE_KQUEUE;
2335	kif->kf_un.kf_kqueue.kf_kqueue_addr = (uintptr_t)kq;
2336	kif->kf_un.kf_kqueue.kf_kqueue_count = kq->kq_count;
2337	kif->kf_un.kf_kqueue.kf_kqueue_state = kq->kq_state;
2338	return (0);
2339}
2340
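/*
 * Notify everything that may be waiting for events on this kqueue: threads
 * sleeping in kqueue_scan(), select/poll waiters, any kqueue watching this
 * one (via the task queue), and SIGIO listeners.
 */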
2341static void
2342kqueue_wakeup(struct kqueue *kq)
2343{
2344	KQ_OWNED(kq);
2345
2346	if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
2347		kq->kq_state &= ~KQ_SLEEP;
2348		wakeup(kq);
2349	}
2350	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
2351		selwakeuppri(&kq->kq_sel, PSOCK);
2352		if (!SEL_WAITING(&kq->kq_sel))
2353			kq->kq_state &= ~KQ_SEL;
2354	}
2355	if (!knlist_empty(&kq->kq_sel.si_note))
2356		kqueue_schedtask(kq);
2357	if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
2358		pgsigio(&kq->kq_sigio, SIGIO, 0);
2359	}
2360}
2361
2362/*
2363 * Walk down a list of knotes, activating them if their event has triggered.
2364 *
2365 * There is a possibility to optimize in the case of one kq watching another.
2366 * Instead of scheduling a task to wake it up, you could pass enough state
2367 * down the chain to wake up the parent kqueue directly.  Make this code
2368 * functional first.
2369 */
2370void
2371knote(struct knlist *list, long hint, int lockflags)
2372{
2373	struct kqueue *kq;
2374	struct knote *kn, *tkn;
2375	int error;
2376
2377	if (list == NULL)
2378		return;
2379
2380	KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);
2381
2382	if ((lockflags & KNF_LISTLOCKED) == 0)
2383		list->kl_lock(list->kl_lockarg);
2384
2385	/*
2386	 * If we unlock the list lock (and enter influx), we can
2387	 * eliminate the kqueue scheduling, but this will introduce
2388	 * four lock/unlock operations for each knote to test.  Also, a
2389	 * marker would be needed to keep the iteration position, since
2390	 * filters or other threads could remove events.
2391	 */
2392	SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) {
2393		kq = kn->kn_kq;
2394		KQ_LOCK(kq);
2395		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
2396			/*
2397			 * Do not process in-flux knotes, except for
2398			 * those put in flux by the kq unlock in
2399			 * kqueue_scan().  In the latter case, we do
2400			 * not interfere with the scan, since the code
2401			 * fragment in kqueue_scan() locks the knlist
2402			 * and cannot proceed until we have finished.
2403			 */
2404			KQ_UNLOCK(kq);
2405		} else if ((lockflags & KNF_NOKQLOCK) != 0) {
2406			kn_enter_flux(kn);
2407			KQ_UNLOCK(kq);
2408			error = kn->kn_fop->f_event(kn, hint);
2409			KQ_LOCK(kq);
2410			kn_leave_flux(kn);
2411			if (error)
2412				KNOTE_ACTIVATE(kn, 1);
2413			KQ_UNLOCK_FLUX(kq);
2414		} else {
2415			if (kn->kn_fop->f_event(kn, hint))
2416				KNOTE_ACTIVATE(kn, 1);
2417			KQ_UNLOCK(kq);
2418		}
2419	}
2420	if ((lockflags & KNF_LISTLOCKED) == 0)
2421		list->kl_unlock(list->kl_lockarg);
2422}
2423
2424/*
2425 * Add a knote to a knlist.
2426 */
2427void
2428knlist_add(struct knlist *knl, struct knote *kn, int islocked)
2429{
2430
2431	KNL_ASSERT_LOCK(knl, islocked);
2432	KQ_NOTOWNED(kn->kn_kq);
2433	KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn));
2434	KASSERT((kn->kn_status & KN_DETACHED) != 0,
2435	    ("knote %p was not detached", kn));
2436	if (!islocked)
2437		knl->kl_lock(knl->kl_lockarg);
2438	SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
2439	if (!islocked)
2440		knl->kl_unlock(knl->kl_lockarg);
2441	KQ_LOCK(kn->kn_kq);
2442	kn->kn_knlist = knl;
2443	kn->kn_status &= ~KN_DETACHED;
2444	KQ_UNLOCK(kn->kn_kq);
2445}
2446
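/*
 * Remove a knote from a knlist and mark it detached.  The caller indicates
 * which of the knlist and kq locks it already holds.
 */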
2447static void
2448knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked,
2449    int kqislocked)
2450{
2451
2452	KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked"));
2453	KNL_ASSERT_LOCK(knl, knlislocked);
2454	mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
2455	KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn));
2456	KASSERT((kn->kn_status & KN_DETACHED) == 0,
2457	    ("knote %p was already detached", kn));
2458	if (!knlislocked)
2459		knl->kl_lock(knl->kl_lockarg);
2460	SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
2461	kn->kn_knlist = NULL;
2462	if (!knlislocked)
2463		kn_list_unlock(knl);
2464	if (!kqislocked)
2465		KQ_LOCK(kn->kn_kq);
2466	kn->kn_status |= KN_DETACHED;
2467	if (!kqislocked)
2468		KQ_UNLOCK(kn->kn_kq);
2469}
2470
2471/*
2472 * Remove a knote from the specified knlist.
2473 */
2474void
2475knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
2476{
2477
2478	knlist_remove_kq(knl, kn, islocked, 0);
2479}
2480
2481int
2482knlist_empty(struct knlist *knl)
2483{
2484
2485	KNL_ASSERT_LOCKED(knl);
2486	return (SLIST_EMPTY(&knl->kl_list));
2487}
2488
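/*
 * Default locking for knlists that are not backed by a caller-supplied lock:
 * a single shared mutex protects all such objects.
 */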
2489static struct mtx knlist_lock;
2490MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
2491    MTX_DEF);
2492static void knlist_mtx_lock(void *arg);
2493static void knlist_mtx_unlock(void *arg);
2494
2495static void
2496knlist_mtx_lock(void *arg)
2497{
2498
2499	mtx_lock((struct mtx *)arg);
2500}
2501
2502static void
2503knlist_mtx_unlock(void *arg)
2504{
2505
2506	mtx_unlock((struct mtx *)arg);
2507}
2508
2509static void
2510knlist_mtx_assert_lock(void *arg, int what)
2511{
2512
2513	if (what == LA_LOCKED)
2514		mtx_assert((struct mtx *)arg, MA_OWNED);
2515	else
2516		mtx_assert((struct mtx *)arg, MA_NOTOWNED);
2517}
2518
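/*
 * Initialize a knlist.  A NULL lock or NULL lock methods select the shared
 * mutex and the mtx-based helpers above.
 */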
2519void
2520knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
2521    void (*kl_unlock)(void *),
2522    void (*kl_assert_lock)(void *, int))
2523{
2524
2525	if (lock == NULL)
2526		knl->kl_lockarg = &knlist_lock;
2527	else
2528		knl->kl_lockarg = lock;
2529
2530	if (kl_lock == NULL)
2531		knl->kl_lock = knlist_mtx_lock;
2532	else
2533		knl->kl_lock = kl_lock;
2534	if (kl_unlock == NULL)
2535		knl->kl_unlock = knlist_mtx_unlock;
2536	else
2537		knl->kl_unlock = kl_unlock;
2538	if (kl_assert_lock == NULL)
2539		knl->kl_assert_lock = knlist_mtx_assert_lock;
2540	else
2541		knl->kl_assert_lock = kl_assert_lock;
2542
2543	knl->kl_autodestroy = 0;
2544	SLIST_INIT(&knl->kl_list);
2545}
2546
2547void
2548knlist_init_mtx(struct knlist *knl, struct mtx *lock)
2549{
2550
2551	knlist_init(knl, lock, NULL, NULL, NULL);
2552}
2553
2554struct knlist *
2555knlist_alloc(struct mtx *lock)
2556{
2557	struct knlist *knl;
2558
2559	knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK);
2560	knlist_init_mtx(knl, lock);
2561	return (knl);
2562}
2563
2564void
2565knlist_destroy(struct knlist *knl)
2566{
2567
2568	KASSERT(KNLIST_EMPTY(knl),
2569	    ("destroying knlist %p with knotes on it", knl));
2570}
2571
2572void
2573knlist_detach(struct knlist *knl)
2574{
2575
2576	KNL_ASSERT_LOCKED(knl);
2577	knl->kl_autodestroy = 1;
2578	if (knlist_empty(knl)) {
2579		knlist_destroy(knl);
2580		free(knl, M_KQUEUE);
2581	}
2582}
2583
2584/*
2585 * Even if we enter with the knlist locked, we may need to drop the lock to
2586 * allow any in-flux knotes time to "settle".
2587 */
2588void
2589knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
2590{
2591	struct knote *kn, *kn2;
2592	struct kqueue *kq;
2593
2594	KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl));
2595	if (islocked)
2596		KNL_ASSERT_LOCKED(knl);
2597	else {
2598		KNL_ASSERT_UNLOCKED(knl);
2599again:		/* need to reacquire lock since we have dropped it */
2600		knl->kl_lock(knl->kl_lockarg);
2601	}
2602
2603	SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
2604		kq = kn->kn_kq;
2605		KQ_LOCK(kq);
2606		if (kn_in_flux(kn)) {
2607			KQ_UNLOCK(kq);
2608			continue;
2609		}
2610		knlist_remove_kq(knl, kn, 1, 1);
2611		if (killkn) {
2612			kn_enter_flux(kn);
2613			KQ_UNLOCK(kq);
2614			knote_drop_detached(kn, td);
2615		} else {
2616			/* Make sure cleared knotes disappear soon */
2617			kn->kn_flags |= EV_EOF | EV_ONESHOT;
2618			KQ_UNLOCK(kq);
2619		}
2620		kq = NULL;
2621	}
2622
2623	if (!SLIST_EMPTY(&knl->kl_list)) {
2624		/* there are still in flux knotes remaining */
2625		kn = SLIST_FIRST(&knl->kl_list);
2626		kq = kn->kn_kq;
2627		KQ_LOCK(kq);
2628		KASSERT(kn_in_flux(kn), ("knote removed w/o list lock"));
2629		knl->kl_unlock(knl->kl_lockarg);
2630		kq->kq_state |= KQ_FLUXWAIT;
2631		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
2632		kq = NULL;
2633		goto again;
2634	}
2635
2636	if (islocked)
2637		KNL_ASSERT_LOCKED(knl);
2638	else {
2639		knl->kl_unlock(knl->kl_lockarg);
2640		KNL_ASSERT_UNLOCKED(knl);
2641	}
2642}
2643
2644/*
2645 * Remove all knotes referencing a specified fd; this must be called with the
2646 * FILEDESC lock held.  Holding the lock prevents a race where a new fd comes
2647 * along, occupies the entry, and we attach a knote to that new fd.
2648 */
2649void
2650knote_fdclose(struct thread *td, int fd)
2651{
2652	struct filedesc *fdp = td->td_proc->p_fd;
2653	struct kqueue *kq;
2654	struct knote *kn;
2655	int influx;
2656
2657	FILEDESC_XLOCK_ASSERT(fdp);
2658
2659	/*
2660	 * We shouldn't have to worry about new kevents appearing on fd
2661	 * since filedesc is locked.
2662	 */
2663	TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
2664		KQ_LOCK(kq);
2665
2666again:
2667		influx = 0;
2668		while (kq->kq_knlistsize > fd &&
2669		    (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
2670			if (kn_in_flux(kn)) {
2671				/* someone else might be waiting on our knote */
2672				if (influx)
2673					wakeup(kq);
2674				kq->kq_state |= KQ_FLUXWAIT;
2675				msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
2676				goto again;
2677			}
2678			kn_enter_flux(kn);
2679			KQ_UNLOCK(kq);
2680			influx = 1;
2681			knote_drop(kn, td);
2682			KQ_LOCK(kq);
2683		}
2684		KQ_UNLOCK_FLUX(kq);
2685	}
2686}
2687
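/*
 * Link a new knote into its kqueue's per-fd list or identity hash.  Fails if
 * the kqueue is closing or its table has not yet been expanded to cover this
 * identifier.
 */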
2688static int
2689knote_attach(struct knote *kn, struct kqueue *kq)
2690{
2691	struct klist *list;
2692
2693	KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn));
2694	KQ_OWNED(kq);
2695
2696	if ((kq->kq_state & KQ_CLOSING) != 0)
2697		return (EBADF);
2698	if (kn->kn_fop->f_isfd) {
2699		if (kn->kn_id >= kq->kq_knlistsize)
2700			return (ENOMEM);
2701		list = &kq->kq_knlist[kn->kn_id];
2702	} else {
2703		if (kq->kq_knhash == NULL)
2704			return (ENOMEM);
2705		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
2706	}
2707	SLIST_INSERT_HEAD(list, kn, kn_link);
2708	return (0);
2709}
2710
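/*
 * Detach a knote from its filter (if it is still attached) and free it.
 */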
2711static void
2712knote_drop(struct knote *kn, struct thread *td)
2713{
2714
2715	if ((kn->kn_status & KN_DETACHED) == 0)
2716		kn->kn_fop->f_detach(kn);
2717	knote_drop_detached(kn, td);
2718}
2719
2720static void
2721knote_drop_detached(struct knote *kn, struct thread *td)
2722{
2723	struct kqueue *kq;
2724	struct klist *list;
2725
2726	kq = kn->kn_kq;
2727
2728	KASSERT((kn->kn_status & KN_DETACHED) != 0,
2729	    ("knote %p still attached", kn));
2730	KQ_NOTOWNED(kq);
2731
2732	KQ_LOCK(kq);
2733	KASSERT(kn->kn_influx == 1,
2734	    ("knote_drop called on %p with influx %d", kn, kn->kn_influx));
2735
2736	if (kn->kn_fop->f_isfd)
2737		list = &kq->kq_knlist[kn->kn_id];
2738	else
2739		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
2740
2741	if (!SLIST_EMPTY(list))
2742		SLIST_REMOVE(list, kn, knote, kn_link);
2743	if (kn->kn_status & KN_QUEUED)
2744		knote_dequeue(kn);
2745	KQ_UNLOCK_FLUX(kq);
2746
2747	if (kn->kn_fop->f_isfd) {
2748		fdrop(kn->kn_fp, td);
2749		kn->kn_fp = NULL;
2750	}
2751	kqueue_fo_release(kn->kn_kevent.filter);
2752	kn->kn_fop = NULL;
2753	knote_free(kn);
2754}
2755
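/*
 * Put a knote on its kqueue's pending queue and wake up any waiters.
 */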
2756static void
2757knote_enqueue(struct knote *kn)
2758{
2759	struct kqueue *kq = kn->kn_kq;
2760
2761	KQ_OWNED(kn->kn_kq);
2762	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
2763
2764	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2765	kn->kn_status |= KN_QUEUED;
2766	kq->kq_count++;
2767	kqueue_wakeup(kq);
2768}
2769
2770static void
2771knote_dequeue(struct knote *kn)
2772{
2773	struct kqueue *kq = kn->kn_kq;
2774
2775	KQ_OWNED(kn->kn_kq);
2776	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
2777
2778	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2779	kn->kn_status &= ~KN_QUEUED;
2780	kq->kq_count--;
2781}
2782
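/*
 * Set up the UMA zone used for knote allocation and register the kqueue AST
 * handler.
 */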
2783static void
2784knote_init(void)
2785{
2786
2787	knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
2788	    NULL, NULL, UMA_ALIGN_PTR, 0);
2789	ast_register(TDA_KQUEUE, ASTR_ASTF_REQUIRED, 0, ast_kqueue);
2790}
2791SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
2792
2793static struct knote *
2794knote_alloc(int mflag)
2795{
2796
2797	return (uma_zalloc(knote_zone, mflag | M_ZERO));
2798}
2799
2800static void
2801knote_free(struct knote *kn)
2802{
2803
2804	uma_zfree(knote_zone, kn);
2805}
2806
2807/*
2808 * Register the kevent kev with the kqueue specified by fd.
2809 */
2810int
2811kqfd_register(int fd, struct kevent *kev, struct thread *td, int mflag)
2812{
2813	struct kqueue *kq;
2814	struct file *fp;
2815	cap_rights_t rights;
2816	int error;
2817
2818	error = fget(td, fd, cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE),
2819	    &fp);
2820	if (error != 0)
2821		return (error);
2822	if ((error = kqueue_acquire(fp, &kq)) != 0)
2823		goto noacquire;
2824
2825	error = kqueue_register(kq, kev, td, mflag);
2826	kqueue_release(kq, 0);
2827
2828noacquire:
2829	fdrop(fp, td);
2830	return (error);
2831}
2832