1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 1997 John S. Dyson.  All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. John S. Dyson's name may not be used to endorse or promote products
12 *    derived from this software without specific prior written permission.
13 *
14 * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
15 * bad that happens because of using this software isn't the responsibility
16 * of the author.  This software is distributed AS-IS.
17 */
18
19/*
20 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
21 */
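
/*
 * A minimal userland usage sketch (illustrative only, not part of this
 * file): an application queues a request with aio_read(2), polls for
 * completion, and reaps the result with aio_return(2).  "fd" is assumed
 * to be an already-open descriptor.
 *
 *	struct aiocb cb = { 0 };
 *	char buf[512];
 *
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *	if (aio_read(&cb) == -1)
 *		err(1, "aio_read");
 *	while (aio_error(&cb) == EINPROGRESS)
 *		usleep(1000);		// or use aio_suspend()/kevent()
 *	ssize_t n = aio_return(&cb);	// final byte count or -1
 *
 * The kernel-side implementation of these entry points lives below.
 */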
22
23#include <sys/param.h>
24#include <sys/systm.h>
25#include <sys/malloc.h>
26#include <sys/bio.h>
27#include <sys/buf.h>
28#include <sys/capsicum.h>
29#include <sys/eventhandler.h>
30#include <sys/sysproto.h>
31#include <sys/filedesc.h>
32#include <sys/kernel.h>
33#include <sys/module.h>
34#include <sys/kthread.h>
35#include <sys/fcntl.h>
36#include <sys/file.h>
37#include <sys/limits.h>
38#include <sys/lock.h>
39#include <sys/mutex.h>
40#include <sys/unistd.h>
41#include <sys/posix4.h>
42#include <sys/proc.h>
43#include <sys/resourcevar.h>
44#include <sys/signalvar.h>
45#include <sys/syscallsubr.h>
46#include <sys/protosw.h>
47#include <sys/rwlock.h>
48#include <sys/sema.h>
49#include <sys/socket.h>
50#include <sys/socketvar.h>
51#include <sys/syscall.h>
52#include <sys/sysctl.h>
53#include <sys/syslog.h>
54#include <sys/sx.h>
55#include <sys/taskqueue.h>
56#include <sys/vnode.h>
57#include <sys/conf.h>
58#include <sys/event.h>
59#include <sys/mount.h>
60#include <geom/geom.h>
61
62#include <machine/atomic.h>
63
64#include <vm/vm.h>
65#include <vm/vm_page.h>
66#include <vm/vm_extern.h>
67#include <vm/pmap.h>
68#include <vm/vm_map.h>
69#include <vm/vm_object.h>
70#include <vm/vnode_pager.h>
71#include <vm/uma.h>
72#include <sys/aio.h>
73
74/*
75 * Counter for allocating reference ids to new jobs.  Wrapped to 1 on
76 * overflow. (XXX will be removed soon.)
77 */
78static u_long jobrefid;
79
80/*
81 * Counter for aio_fsync.
82 */
83static uint64_t jobseqno;
84
85#ifndef MAX_AIO_PER_PROC
86#define MAX_AIO_PER_PROC	32
87#endif
88
89#ifndef MAX_AIO_QUEUE_PER_PROC
90#define MAX_AIO_QUEUE_PER_PROC	256
91#endif
92
93#ifndef MAX_AIO_QUEUE
94#define MAX_AIO_QUEUE		1024 /* Bigger than MAX_AIO_QUEUE_PER_PROC */
95#endif
96
97#ifndef MAX_BUF_AIO
98#define MAX_BUF_AIO		16
99#endif
100
101FEATURE(aio, "Asynchronous I/O");
102SYSCTL_DECL(_p1003_1b);
103
104static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");
105static MALLOC_DEFINE(M_AIO, "aio", "structures for asynchronous I/O");
106
107static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
108    "Async IO management");
109
110static int enable_aio_unsafe = 0;
111SYSCTL_INT(_vfs_aio, OID_AUTO, enable_unsafe, CTLFLAG_RW, &enable_aio_unsafe, 0,
112    "Permit asynchronous IO on all file types, not just known-safe types");
113
114static unsigned int unsafe_warningcnt = 1;
115SYSCTL_UINT(_vfs_aio, OID_AUTO, unsafe_warningcnt, CTLFLAG_RW,
116    &unsafe_warningcnt, 0,
117    "Warnings that will be triggered upon failed IO requests on unsafe files");
118
119static int max_aio_procs = MAX_AIO_PROCS;
120SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0,
121    "Maximum number of kernel processes to use for handling async IO ");
122
123static int num_aio_procs = 0;
124SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0,
125    "Number of presently active kernel processes for async IO");
126
127/*
128 * The code will adjust the actual number of AIO processes towards this
129 * number when it gets a chance.
130 */
131static int target_aio_procs = TARGET_AIO_PROCS;
132SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
133    0,
134    "Preferred number of ready kernel processes for async IO");
135
136static int max_queue_count = MAX_AIO_QUEUE;
137SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
138    "Maximum number of aio requests to queue, globally");
139
140static int num_queue_count = 0;
141SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
142    "Number of queued aio requests");
143
144static int num_buf_aio = 0;
145SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
146    "Number of aio requests presently handled by the buf subsystem");
147
148static int num_unmapped_aio = 0;
149SYSCTL_INT(_vfs_aio, OID_AUTO, num_unmapped_aio, CTLFLAG_RD, &num_unmapped_aio,
150    0,
151    "Number of aio requests presently handled by unmapped I/O buffers");
152
153/* Number of async I/O processes in the process of being started */
154/* XXX This should be local to aio_aqueue() */
155static int num_aio_resv_start = 0;
156
157static int aiod_lifetime;
158SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
159    "Maximum lifetime for idle aiod");
160
161static int max_aio_per_proc = MAX_AIO_PER_PROC;
162SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
163    0,
164    "Maximum active aio requests per process");
165
166static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
167SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
168    &max_aio_queue_per_proc, 0,
169    "Maximum queued aio requests per process");
170
171static int max_buf_aio = MAX_BUF_AIO;
172SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
173    "Maximum buf aio requests per process");
174
175/*
176 * Though redundant with vfs.aio.max_aio_queue_per_proc, POSIX requires
177 * sysconf(3) to support AIO_LISTIO_MAX, and we implement that with
178 * vfs.aio.aio_listio_max.
179 */
180SYSCTL_INT(_p1003_1b, CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max,
181    CTLFLAG_RD | CTLFLAG_CAPRD, &max_aio_queue_per_proc,
182    0, "Maximum aio requests for a single lio_listio call");
183
184#ifdef COMPAT_FREEBSD6
185typedef struct oaiocb {
186	int	aio_fildes;		/* File descriptor */
187	off_t	aio_offset;		/* File offset for I/O */
188	volatile void *aio_buf;         /* I/O buffer in process space */
189	size_t	aio_nbytes;		/* Number of bytes for I/O */
190	struct	osigevent aio_sigevent;	/* Signal to deliver */
191	int	aio_lio_opcode;		/* LIO opcode */
192	int	aio_reqprio;		/* Request priority -- ignored */
193	struct	__aiocb_private	_aiocb_private;
194} oaiocb_t;
195#endif
196
/*
 * Below is a key of locks used to protect each member of struct kaiocb,
 * aioliojob and kaioinfo and any backends.
 *
 * * - need not be protected
 * a - locked by kaioinfo lock
 * b - locked by the backend lock; the backend lock can be null in some
 *     cases, for example for BIO requests, in which case the proc lock
 *     is reused.
 * c - locked by aio_job_mtx, the lock for the generic file I/O backend.
 */
208
209/*
210 * If the routine that services an AIO request blocks while running in an
211 * AIO kernel process it can starve other I/O requests.  BIO requests
212 * queued via aio_qbio() complete asynchronously and do not use AIO kernel
213 * processes at all.  Socket I/O requests use a separate pool of
214 * kprocs and also force non-blocking I/O.  Other file I/O requests
215 * use the generic fo_read/fo_write operations which can block.  The
216 * fsync and mlock operations can also block while executing.  Ideally
217 * none of these requests would block while executing.
218 *
219 * Note that the service routines cannot toggle O_NONBLOCK in the file
220 * structure directly while handling a request due to races with
221 * userland threads.
222 */
223
224/* jobflags */
225#define	KAIOCB_QUEUEING		0x01
226#define	KAIOCB_CANCELLED	0x02
227#define	KAIOCB_CANCELLING	0x04
228#define	KAIOCB_CHECKSYNC	0x08
229#define	KAIOCB_CLEARED		0x10
230#define	KAIOCB_FINISHED		0x20
231
232/* ioflags */
233#define	KAIOCB_IO_FOFFSET	0x01
234
235/*
236 * AIO process info
237 */
238#define AIOP_FREE	0x1			/* proc on free queue */
239
240struct aioproc {
241	int	aioprocflags;			/* (c) AIO proc flags */
242	TAILQ_ENTRY(aioproc) list;		/* (c) list of processes */
243	struct	proc *aioproc;			/* (*) the AIO proc */
244};
245
/*
 * Data structure for lio signal management.
 */
249struct aioliojob {
250	int	lioj_flags;			/* (a) listio flags */
251	int	lioj_count;			/* (a) count of jobs */
252	int	lioj_finished_count;		/* (a) count of finished jobs */
253	struct	sigevent lioj_signal;		/* (a) signal on all I/O done */
254	TAILQ_ENTRY(aioliojob) lioj_list;	/* (a) lio list */
255	struct	knlist klist;			/* (a) list of knotes */
256	ksiginfo_t lioj_ksi;			/* (a) Realtime signal info */
257};
258
259#define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
260#define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
261#define LIOJ_KEVENT_POSTED	0x4	/* kevent triggered */
262
263/*
264 * per process aio data structure
265 */
266struct kaioinfo {
267	struct	mtx kaio_mtx;		/* the lock to protect this struct */
268	int	kaio_flags;		/* (a) per process kaio flags */
269	int	kaio_active_count;	/* (c) number of currently used AIOs */
270	int	kaio_count;		/* (a) size of AIO queue */
271	int	kaio_buffer_count;	/* (a) number of bio buffers */
272	TAILQ_HEAD(,kaiocb) kaio_all;	/* (a) all AIOs in a process */
273	TAILQ_HEAD(,kaiocb) kaio_done;	/* (a) done queue for process */
274	TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
275	TAILQ_HEAD(,kaiocb) kaio_jobqueue;	/* (a) job queue for process */
276	TAILQ_HEAD(,kaiocb) kaio_syncqueue;	/* (a) queue for aio_fsync */
277	TAILQ_HEAD(,kaiocb) kaio_syncready;  /* (a) second q for aio_fsync */
278	struct	task kaio_task;		/* (*) task to kick aio processes */
279	struct	task kaio_sync_task;	/* (*) task to schedule fsync jobs */
280};
281
282#define AIO_LOCK(ki)		mtx_lock(&(ki)->kaio_mtx)
283#define AIO_UNLOCK(ki)		mtx_unlock(&(ki)->kaio_mtx)
284#define AIO_LOCK_ASSERT(ki, f)	mtx_assert(&(ki)->kaio_mtx, (f))
285#define AIO_MTX(ki)		(&(ki)->kaio_mtx)
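
/*
 * Usage sketch (illustrative): manipulation of the per-process queues in
 * struct kaioinfo is done while holding the kaio lock, e.g.
 *
 *	AIO_LOCK(ki);
 *	TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist);
 *	AIO_UNLOCK(ki);
 *
 * as is done (via aio_complete() and aio_bio_done_notify()) below.
 */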
286
287#define KAIO_RUNDOWN	0x1	/* process is being run down */
288#define KAIO_WAKEUP	0x2	/* wakeup process when AIO completes */
289
290/*
291 * Operations used to interact with userland aio control blocks.
292 * Different ABIs provide their own operations.
293 */
294struct aiocb_ops {
295	int	(*aio_copyin)(struct aiocb *ujob, struct kaiocb *kjob, int ty);
296	long	(*fetch_status)(struct aiocb *ujob);
297	long	(*fetch_error)(struct aiocb *ujob);
298	int	(*store_status)(struct aiocb *ujob, long status);
299	int	(*store_error)(struct aiocb *ujob, long error);
300	int	(*store_kernelinfo)(struct aiocb *ujob, long jobref);
301	int	(*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob);
302};
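
/*
 * For illustration: the native ABI's table (aiocb_ops below) points these
 * hooks at aiocb_copyin() and the aiocb_store_*()/aiocb_fetch_*() helpers,
 * while compat ABIs (e.g. the COMPAT_FREEBSD6 aiocb_ops_osigevent table)
 * substitute their own copyin routine.  Generic code such as aio_aqueue()
 * only touches the userland control block through these hooks, e.g.
 *
 *	ops->store_error(ujob, EINPROGRESS);
 *
 * so it never needs to know which ABI the caller used.
 */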
303
304static TAILQ_HEAD(,aioproc) aio_freeproc;		/* (c) Idle daemons */
305static struct sema aio_newproc_sem;
306static struct mtx aio_job_mtx;
307static TAILQ_HEAD(,kaiocb) aio_jobs;			/* (c) Async job list */
308static struct unrhdr *aiod_unr;
309
310static void	aio_biocleanup(struct bio *bp);
311void		aio_init_aioinfo(struct proc *p);
312static int	aio_onceonly(void);
313static int	aio_free_entry(struct kaiocb *job);
314static void	aio_process_rw(struct kaiocb *job);
315static void	aio_process_sync(struct kaiocb *job);
316static void	aio_process_mlock(struct kaiocb *job);
317static void	aio_schedule_fsync(void *context, int pending);
318static int	aio_newproc(int *);
319int		aio_aqueue(struct thread *td, struct aiocb *ujob,
320		    struct aioliojob *lio, int type, struct aiocb_ops *ops);
321static int	aio_queue_file(struct file *fp, struct kaiocb *job);
322static void	aio_biowakeup(struct bio *bp);
323static void	aio_proc_rundown(void *arg, struct proc *p);
324static void	aio_proc_rundown_exec(void *arg, struct proc *p,
325		    struct image_params *imgp);
326static int	aio_qbio(struct proc *p, struct kaiocb *job);
327static void	aio_daemon(void *param);
328static void	aio_bio_done_notify(struct proc *userp, struct kaiocb *job);
329static bool	aio_clear_cancel_function_locked(struct kaiocb *job);
330static int	aio_kick(struct proc *userp);
331static void	aio_kick_nowait(struct proc *userp);
332static void	aio_kick_helper(void *context, int pending);
333static int	filt_aioattach(struct knote *kn);
334static void	filt_aiodetach(struct knote *kn);
335static int	filt_aio(struct knote *kn, long hint);
336static int	filt_lioattach(struct knote *kn);
337static void	filt_liodetach(struct knote *kn);
338static int	filt_lio(struct knote *kn, long hint);
339
340/*
341 * Zones for:
342 * 	kaio	Per process async io info
343 *	aiocb	async io jobs
344 *	aiolio	list io jobs
345 */
346static uma_zone_t kaio_zone, aiocb_zone, aiolio_zone;
347
348/* kqueue filters for aio */
349static struct filterops aio_filtops = {
350	.f_isfd = 0,
351	.f_attach = filt_aioattach,
352	.f_detach = filt_aiodetach,
353	.f_event = filt_aio,
354};
355static struct filterops lio_filtops = {
356	.f_isfd = 0,
357	.f_attach = filt_lioattach,
358	.f_detach = filt_liodetach,
359	.f_event = filt_lio
360};
361
362static eventhandler_tag exit_tag, exec_tag;
363
364TASKQUEUE_DEFINE_THREAD(aiod_kick);
365
366/*
367 * Main operations function for use as a kernel module.
368 */
369static int
370aio_modload(struct module *module, int cmd, void *arg)
371{
372	int error = 0;
373
374	switch (cmd) {
375	case MOD_LOAD:
376		aio_onceonly();
377		break;
378	case MOD_SHUTDOWN:
379		break;
380	default:
381		error = EOPNOTSUPP;
382		break;
383	}
384	return (error);
385}
386
387static moduledata_t aio_mod = {
388	"aio",
389	&aio_modload,
390	NULL
391};
392
393DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY);
394MODULE_VERSION(aio, 1);
395
396/*
397 * Startup initialization
398 */
399static int
400aio_onceonly(void)
401{
402
403	exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
404	    EVENTHANDLER_PRI_ANY);
405	exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec,
406	    NULL, EVENTHANDLER_PRI_ANY);
407	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
408	kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
409	TAILQ_INIT(&aio_freeproc);
410	sema_init(&aio_newproc_sem, 0, "aio_new_proc");
411	mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
412	TAILQ_INIT(&aio_jobs);
413	aiod_unr = new_unrhdr(1, INT_MAX, NULL);
414	kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
415	    NULL, NULL, UMA_ALIGN_PTR, 0);
416	aiocb_zone = uma_zcreate("AIOCB", sizeof(struct kaiocb), NULL, NULL,
417	    NULL, NULL, UMA_ALIGN_PTR, 0);
418	aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
419	    NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
420	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
421	jobrefid = 1;
422	p31b_setcfg(CTL_P1003_1B_ASYNCHRONOUS_IO, _POSIX_ASYNCHRONOUS_IO);
423	p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
424	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
425
426	return (0);
427}
428
429/*
430 * Init the per-process aioinfo structure.  The aioinfo limits are set
431 * per-process for user limit (resource) management.
432 */
433void
434aio_init_aioinfo(struct proc *p)
435{
436	struct kaioinfo *ki;
437
438	ki = uma_zalloc(kaio_zone, M_WAITOK);
439	mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW);
440	ki->kaio_flags = 0;
441	ki->kaio_active_count = 0;
442	ki->kaio_count = 0;
443	ki->kaio_buffer_count = 0;
444	TAILQ_INIT(&ki->kaio_all);
445	TAILQ_INIT(&ki->kaio_done);
446	TAILQ_INIT(&ki->kaio_jobqueue);
447	TAILQ_INIT(&ki->kaio_liojoblist);
448	TAILQ_INIT(&ki->kaio_syncqueue);
449	TAILQ_INIT(&ki->kaio_syncready);
450	TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
451	TASK_INIT(&ki->kaio_sync_task, 0, aio_schedule_fsync, ki);
452	PROC_LOCK(p);
453	if (p->p_aioinfo == NULL) {
454		p->p_aioinfo = ki;
455		PROC_UNLOCK(p);
456	} else {
457		PROC_UNLOCK(p);
458		mtx_destroy(&ki->kaio_mtx);
459		uma_zfree(kaio_zone, ki);
460	}
461
462	while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
463		aio_newproc(NULL);
464}
465
466static int
467aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi, bool ext)
468{
469	struct thread *td;
470	int error;
471
472	error = sigev_findtd(p, sigev, &td);
473	if (error)
474		return (error);
475	if (!KSI_ONQ(ksi)) {
476		ksiginfo_set_sigev(ksi, sigev);
477		ksi->ksi_code = SI_ASYNCIO;
478		ksi->ksi_flags |= ext ? (KSI_EXT | KSI_INS) : 0;
479		tdsendsignal(p, td, ksi->ksi_signo, ksi);
480	}
481	PROC_UNLOCK(p);
482	return (error);
483}
484
485/*
486 * Free a job entry.  Wait for completion if it is currently active, but don't
487 * delay forever.  If we delay, we return a flag that says that we have to
488 * restart the queue scan.
489 */
490static int
491aio_free_entry(struct kaiocb *job)
492{
493	struct kaioinfo *ki;
494	struct aioliojob *lj;
495	struct proc *p;
496
497	p = job->userproc;
498	MPASS(curproc == p);
499	ki = p->p_aioinfo;
500	MPASS(ki != NULL);
501
502	AIO_LOCK_ASSERT(ki, MA_OWNED);
503	MPASS(job->jobflags & KAIOCB_FINISHED);
504
505	atomic_subtract_int(&num_queue_count, 1);
506
507	ki->kaio_count--;
508	MPASS(ki->kaio_count >= 0);
509
510	TAILQ_REMOVE(&ki->kaio_done, job, plist);
511	TAILQ_REMOVE(&ki->kaio_all, job, allist);
512
513	lj = job->lio;
514	if (lj) {
515		lj->lioj_count--;
516		lj->lioj_finished_count--;
517
518		if (lj->lioj_count == 0) {
519			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
520			/* lio is going away, we need to destroy any knotes */
521			knlist_delete(&lj->klist, curthread, 1);
522			PROC_LOCK(p);
523			sigqueue_take(&lj->lioj_ksi);
524			PROC_UNLOCK(p);
525			uma_zfree(aiolio_zone, lj);
526		}
527	}
528
529	/* job is going away, we need to destroy any knotes */
530	knlist_delete(&job->klist, curthread, 1);
531	PROC_LOCK(p);
532	sigqueue_take(&job->ksi);
533	PROC_UNLOCK(p);
534
535	AIO_UNLOCK(ki);
536
537	/*
538	 * The thread argument here is used to find the owning process
539	 * and is also passed to fo_close() which may pass it to various
540	 * places such as devsw close() routines.  Because of that, we
541	 * need a thread pointer from the process owning the job that is
542	 * persistent and won't disappear out from under us or move to
543	 * another process.
544	 *
545	 * Currently, all the callers of this function call it to remove
546	 * a kaiocb from the current process' job list either via a
547	 * syscall or due to the current process calling exit() or
548	 * execve().  Thus, we know that p == curproc.  We also know that
549	 * curthread can't exit since we are curthread.
550	 *
551	 * Therefore, we use curthread as the thread to pass to
552	 * knlist_delete().  This does mean that it is possible for the
553	 * thread pointer at close time to differ from the thread pointer
554	 * at open time, but this is already true of file descriptors in
555	 * a multithreaded process.
556	 */
557	if (job->fd_file)
558		fdrop(job->fd_file, curthread);
559	crfree(job->cred);
560	if (job->uiop != &job->uio)
561		freeuio(job->uiop);
562	uma_zfree(aiocb_zone, job);
563	AIO_LOCK(ki);
564
565	return (0);
566}
567
568static void
569aio_proc_rundown_exec(void *arg, struct proc *p,
570    struct image_params *imgp __unused)
571{
	aio_proc_rundown(arg, p);
573}
574
575static int
576aio_cancel_job(struct proc *p, struct kaioinfo *ki, struct kaiocb *job)
577{
578	aio_cancel_fn_t *func;
579	int cancelled;
580
581	AIO_LOCK_ASSERT(ki, MA_OWNED);
582	if (job->jobflags & (KAIOCB_CANCELLED | KAIOCB_FINISHED))
583		return (0);
584	MPASS((job->jobflags & KAIOCB_CANCELLING) == 0);
585	job->jobflags |= KAIOCB_CANCELLED;
586
587	func = job->cancel_fn;
588
	/*
	 * If there is no cancel routine, just leave the job marked as
	 * cancelled.  The job should be in active use by a caller who
	 * will either complete it normally or cancel it when installing
	 * a cancel routine fails because the job is already cancelled.
	 */
595	if (func == NULL)
596		return (0);
597
598	/*
599	 * Set the CANCELLING flag so that aio_complete() will defer
600	 * completions of this job.  This prevents the job from being
601	 * freed out from under the cancel callback.  After the
602	 * callback any deferred completion (whether from the callback
603	 * or any other source) will be completed.
604	 */
605	job->jobflags |= KAIOCB_CANCELLING;
606	AIO_UNLOCK(ki);
607	func(job);
608	AIO_LOCK(ki);
609	job->jobflags &= ~KAIOCB_CANCELLING;
610	if (job->jobflags & KAIOCB_FINISHED) {
611		cancelled = job->uaiocb._aiocb_private.error == ECANCELED;
612		TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
613		aio_bio_done_notify(p, job);
614	} else {
615		/*
616		 * The cancel callback might have scheduled an
617		 * operation to cancel this request, but it is
618		 * only counted as cancelled if the request is
619		 * cancelled when the callback returns.
620		 */
621		cancelled = 0;
622	}
623	return (cancelled);
624}
625
626/*
627 * Rundown the jobs for a given process.
628 */
629static void
630aio_proc_rundown(void *arg, struct proc *p)
631{
632	struct kaioinfo *ki;
633	struct aioliojob *lj;
634	struct kaiocb *job, *jobn;
635
636	KASSERT(curthread->td_proc == p,
637	    ("%s: called on non-curproc", __func__));
638	ki = p->p_aioinfo;
639	if (ki == NULL)
640		return;
641
642	AIO_LOCK(ki);
643	ki->kaio_flags |= KAIO_RUNDOWN;
644
645restart:
646
647	/*
648	 * Try to cancel all pending requests. This code simulates
649	 * aio_cancel on all pending I/O requests.
650	 */
651	TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) {
652		aio_cancel_job(p, ki, job);
653	}
654
655	/* Wait for all running I/O to be finished */
656	if (TAILQ_FIRST(&ki->kaio_jobqueue) || ki->kaio_active_count != 0) {
657		ki->kaio_flags |= KAIO_WAKEUP;
658		msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
659		goto restart;
660	}
661
662	/* Free all completed I/O requests. */
663	while ((job = TAILQ_FIRST(&ki->kaio_done)) != NULL)
664		aio_free_entry(job);
665
666	while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
667		if (lj->lioj_count == 0) {
668			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
669			knlist_delete(&lj->klist, curthread, 1);
670			PROC_LOCK(p);
671			sigqueue_take(&lj->lioj_ksi);
672			PROC_UNLOCK(p);
673			uma_zfree(aiolio_zone, lj);
674		} else {
675			panic("LIO job not cleaned up: C:%d, FC:%d\n",
676			    lj->lioj_count, lj->lioj_finished_count);
677		}
678	}
679	AIO_UNLOCK(ki);
680	taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task);
681	taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_sync_task);
682	mtx_destroy(&ki->kaio_mtx);
683	uma_zfree(kaio_zone, ki);
684	p->p_aioinfo = NULL;
685}
686
687/*
688 * Select a job to run (called by an AIO daemon).
689 */
690static struct kaiocb *
691aio_selectjob(struct aioproc *aiop)
692{
693	struct kaiocb *job;
694	struct kaioinfo *ki;
695	struct proc *userp;
696
697	mtx_assert(&aio_job_mtx, MA_OWNED);
698restart:
699	TAILQ_FOREACH(job, &aio_jobs, list) {
700		userp = job->userproc;
701		ki = userp->p_aioinfo;
702
703		if (ki->kaio_active_count < max_aio_per_proc) {
704			TAILQ_REMOVE(&aio_jobs, job, list);
705			if (!aio_clear_cancel_function(job))
706				goto restart;
707
708			/* Account for currently active jobs. */
709			ki->kaio_active_count++;
710			break;
711		}
712	}
713	return (job);
714}
715
716/*
717 * Move all data to a permanent storage device.  This code
718 * simulates the fsync and fdatasync syscalls.
719 */
720static int
721aio_fsync_vnode(struct thread *td, struct vnode *vp, int op)
722{
723	struct mount *mp;
724	int error;
725
726	for (;;) {
727		error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
728		if (error != 0)
729			break;
730		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
731		vnode_pager_clean_async(vp);
732		if (op == LIO_DSYNC)
733			error = VOP_FDATASYNC(vp, td);
734		else
735			error = VOP_FSYNC(vp, MNT_WAIT, td);
736
737		VOP_UNLOCK(vp);
738		vn_finished_write(mp);
739		if (error != ERELOOKUP)
740			break;
741	}
742	return (error);
743}
744
745/*
746 * The AIO processing activity for LIO_READ/LIO_WRITE.  This is the code that
747 * does the I/O request for the non-bio version of the operations.  The normal
748 * vn operations are used, and this code should work in all instances for every
749 * type of file, including pipes, sockets, fifos, and regular files.
750 *
 * XXX I don't think it works well for sockets, pipes, and fifos.
752 */
753static void
754aio_process_rw(struct kaiocb *job)
755{
756	struct ucred *td_savedcred;
757	struct thread *td;
758	struct file *fp;
759	ssize_t cnt;
760	long msgsnd_st, msgsnd_end;
761	long msgrcv_st, msgrcv_end;
762	long oublock_st, oublock_end;
763	long inblock_st, inblock_end;
764	int error, opcode;
765
766	KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ ||
767	    job->uaiocb.aio_lio_opcode == LIO_READV ||
768	    job->uaiocb.aio_lio_opcode == LIO_WRITE ||
769	    job->uaiocb.aio_lio_opcode == LIO_WRITEV,
770	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
771
772	aio_switch_vmspace(job);
773	td = curthread;
774	td_savedcred = td->td_ucred;
775	td->td_ucred = job->cred;
776	job->uiop->uio_td = td;
777	fp = job->fd_file;
778
779	opcode = job->uaiocb.aio_lio_opcode;
780	cnt = job->uiop->uio_resid;
781
782	msgrcv_st = td->td_ru.ru_msgrcv;
783	msgsnd_st = td->td_ru.ru_msgsnd;
784	inblock_st = td->td_ru.ru_inblock;
785	oublock_st = td->td_ru.ru_oublock;
786
787	/*
788	 * aio_aqueue() acquires a reference to the file that is
789	 * released in aio_free_entry().
790	 */
791	if (opcode == LIO_READ || opcode == LIO_READV) {
792		if (job->uiop->uio_resid == 0)
793			error = 0;
794		else
795			error = fo_read(fp, job->uiop, fp->f_cred,
796			    (job->ioflags & KAIOCB_IO_FOFFSET) != 0 ? 0 :
797			    FOF_OFFSET, td);
798	} else {
799		if (fp->f_type == DTYPE_VNODE)
800			bwillwrite();
801		error = fo_write(fp, job->uiop, fp->f_cred, (job->ioflags &
802		    KAIOCB_IO_FOFFSET) != 0 ? 0 : FOF_OFFSET, td);
803	}
804	msgrcv_end = td->td_ru.ru_msgrcv;
805	msgsnd_end = td->td_ru.ru_msgsnd;
806	inblock_end = td->td_ru.ru_inblock;
807	oublock_end = td->td_ru.ru_oublock;
808
809	job->msgrcv = msgrcv_end - msgrcv_st;
810	job->msgsnd = msgsnd_end - msgsnd_st;
811	job->inblock = inblock_end - inblock_st;
812	job->outblock = oublock_end - oublock_st;
813
814	if (error != 0 && job->uiop->uio_resid != cnt) {
815		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
816			error = 0;
817		if (error == EPIPE && (opcode & LIO_WRITE)) {
818			PROC_LOCK(job->userproc);
819			kern_psignal(job->userproc, SIGPIPE);
820			PROC_UNLOCK(job->userproc);
821		}
822	}
823
824	cnt -= job->uiop->uio_resid;
825	td->td_ucred = td_savedcred;
826	if (error)
827		aio_complete(job, -1, error);
828	else
829		aio_complete(job, cnt, 0);
830}
831
832static void
833aio_process_sync(struct kaiocb *job)
834{
835	struct thread *td = curthread;
836	struct ucred *td_savedcred = td->td_ucred;
837	struct file *fp = job->fd_file;
838	int error = 0;
839
840	KASSERT(job->uaiocb.aio_lio_opcode & LIO_SYNC,
841	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
842
843	td->td_ucred = job->cred;
844	if (fp->f_vnode != NULL) {
845		error = aio_fsync_vnode(td, fp->f_vnode,
846		    job->uaiocb.aio_lio_opcode);
847	}
848	td->td_ucred = td_savedcred;
849	if (error)
850		aio_complete(job, -1, error);
851	else
852		aio_complete(job, 0, 0);
853}
854
855static void
856aio_process_mlock(struct kaiocb *job)
857{
858	struct aiocb *cb = &job->uaiocb;
859	int error;
860
861	KASSERT(job->uaiocb.aio_lio_opcode == LIO_MLOCK,
862	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
863
864	aio_switch_vmspace(job);
865	error = kern_mlock(job->userproc, job->cred,
866	    __DEVOLATILE(uintptr_t, cb->aio_buf), cb->aio_nbytes);
867	aio_complete(job, error != 0 ? -1 : 0, error);
868}
869
870static void
871aio_bio_done_notify(struct proc *userp, struct kaiocb *job)
872{
873	struct aioliojob *lj;
874	struct kaioinfo *ki;
875	struct kaiocb *sjob, *sjobn;
876	int lj_done;
877	bool schedule_fsync;
878
879	ki = userp->p_aioinfo;
880	AIO_LOCK_ASSERT(ki, MA_OWNED);
881	lj = job->lio;
882	lj_done = 0;
883	if (lj) {
884		lj->lioj_finished_count++;
885		if (lj->lioj_count == lj->lioj_finished_count)
886			lj_done = 1;
887	}
888	TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist);
889	MPASS(job->jobflags & KAIOCB_FINISHED);
890
891	if (ki->kaio_flags & KAIO_RUNDOWN)
892		goto notification_done;
893
894	if (job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
895	    job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
896		aio_sendsig(userp, &job->uaiocb.aio_sigevent, &job->ksi, true);
897
898	KNOTE_LOCKED(&job->klist, 1);
899
900	if (lj_done) {
901		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
902			lj->lioj_flags |= LIOJ_KEVENT_POSTED;
903			KNOTE_LOCKED(&lj->klist, 1);
904		}
905		if ((lj->lioj_flags & (LIOJ_SIGNAL | LIOJ_SIGNAL_POSTED))
906		    == LIOJ_SIGNAL &&
907		    (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
908		    lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
909			aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi,
910			    true);
911			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
912		}
913	}
914
915notification_done:
916	if (job->jobflags & KAIOCB_CHECKSYNC) {
917		schedule_fsync = false;
918		TAILQ_FOREACH_SAFE(sjob, &ki->kaio_syncqueue, list, sjobn) {
919			if (job->fd_file != sjob->fd_file ||
920			    job->seqno >= sjob->seqno)
921				continue;
922			if (--sjob->pending > 0)
923				continue;
924			TAILQ_REMOVE(&ki->kaio_syncqueue, sjob, list);
925			if (!aio_clear_cancel_function_locked(sjob))
926				continue;
927			TAILQ_INSERT_TAIL(&ki->kaio_syncready, sjob, list);
928			schedule_fsync = true;
929		}
930		if (schedule_fsync)
931			taskqueue_enqueue(taskqueue_aiod_kick,
932			    &ki->kaio_sync_task);
933	}
934	if (ki->kaio_flags & KAIO_WAKEUP) {
935		ki->kaio_flags &= ~KAIO_WAKEUP;
936		wakeup(&userp->p_aioinfo);
937	}
938}
939
940static void
941aio_schedule_fsync(void *context, int pending)
942{
943	struct kaioinfo *ki;
944	struct kaiocb *job;
945
946	ki = context;
947	AIO_LOCK(ki);
948	while (!TAILQ_EMPTY(&ki->kaio_syncready)) {
949		job = TAILQ_FIRST(&ki->kaio_syncready);
950		TAILQ_REMOVE(&ki->kaio_syncready, job, list);
951		AIO_UNLOCK(ki);
952		aio_schedule(job, aio_process_sync);
953		AIO_LOCK(ki);
954	}
955	AIO_UNLOCK(ki);
956}
957
958bool
959aio_cancel_cleared(struct kaiocb *job)
960{
961
962	/*
963	 * The caller should hold the same queue lock held when
964	 * aio_clear_cancel_function() was called and set this flag
965	 * ensuring this check sees an up-to-date value.  However,
966	 * there is no way to assert that.
967	 */
968	return ((job->jobflags & KAIOCB_CLEARED) != 0);
969}
970
971static bool
972aio_clear_cancel_function_locked(struct kaiocb *job)
973{
974
975	AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
976	MPASS(job->cancel_fn != NULL);
977	if (job->jobflags & KAIOCB_CANCELLING) {
978		job->jobflags |= KAIOCB_CLEARED;
979		return (false);
980	}
981	job->cancel_fn = NULL;
982	return (true);
983}
984
985bool
986aio_clear_cancel_function(struct kaiocb *job)
987{
988	struct kaioinfo *ki;
989	bool ret;
990
991	ki = job->userproc->p_aioinfo;
992	AIO_LOCK(ki);
993	ret = aio_clear_cancel_function_locked(job);
994	AIO_UNLOCK(ki);
995	return (ret);
996}
997
998static bool
999aio_set_cancel_function_locked(struct kaiocb *job, aio_cancel_fn_t *func)
1000{
1001
1002	AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
1003	if (job->jobflags & KAIOCB_CANCELLED)
1004		return (false);
1005	job->cancel_fn = func;
1006	return (true);
1007}
1008
1009bool
1010aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func)
1011{
1012	struct kaioinfo *ki;
1013	bool ret;
1014
1015	ki = job->userproc->p_aioinfo;
1016	AIO_LOCK(ki);
1017	ret = aio_set_cancel_function_locked(job, func);
1018	AIO_UNLOCK(ki);
1019	return (ret);
1020}
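
/*
 * Usage sketch (illustrative; my_cancel_fn is a hypothetical
 * aio_cancel_fn_t): a backend that hands a job off for asynchronous
 * processing first installs a cancel routine and bails out if the job
 * has already been cancelled, roughly as aio_schedule() does below:
 *
 *	if (!aio_set_cancel_function(job, my_cancel_fn)) {
 *		aio_cancel(job);
 *		return;
 *	}
 *	... hand the job off ...
 *
 * The cancel routine is cleared again with aio_clear_cancel_function()
 * before the job is actually processed (see aio_selectjob() above).
 */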
1021
1022void
1023aio_complete(struct kaiocb *job, long status, int error)
1024{
1025	struct kaioinfo *ki;
1026	struct proc *userp;
1027
1028	job->uaiocb._aiocb_private.error = error;
1029	job->uaiocb._aiocb_private.status = status;
1030
1031	userp = job->userproc;
1032	ki = userp->p_aioinfo;
1033
1034	AIO_LOCK(ki);
1035	KASSERT(!(job->jobflags & KAIOCB_FINISHED),
1036	    ("duplicate aio_complete"));
1037	job->jobflags |= KAIOCB_FINISHED;
1038	if ((job->jobflags & (KAIOCB_QUEUEING | KAIOCB_CANCELLING)) == 0) {
1039		TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
1040		aio_bio_done_notify(userp, job);
1041	}
1042	AIO_UNLOCK(ki);
1043}
1044
1045void
1046aio_cancel(struct kaiocb *job)
1047{
1048
1049	aio_complete(job, -1, ECANCELED);
1050}
1051
1052void
1053aio_switch_vmspace(struct kaiocb *job)
1054{
1055
1056	vmspace_switch_aio(job->userproc->p_vmspace);
1057}
1058
/*
 * The AIO daemon.  Most of the actual work is done in aio_process_*,
 * but the setup (and address space management) is done in this routine.
 */
1063static void
1064aio_daemon(void *_id)
1065{
1066	struct kaiocb *job;
1067	struct aioproc *aiop;
1068	struct kaioinfo *ki;
1069	struct proc *p;
1070	struct vmspace *myvm;
1071	struct thread *td = curthread;
1072	int id = (intptr_t)_id;
1073
1074	/*
1075	 * Grab an extra reference on the daemon's vmspace so that it
1076	 * doesn't get freed by jobs that switch to a different
1077	 * vmspace.
1078	 */
1079	p = td->td_proc;
1080	myvm = vmspace_acquire_ref(p);
1081
1082	KASSERT(p->p_textvp == NULL, ("kthread has a textvp"));
1083
1084	/*
1085	 * Allocate and ready the aio control info.  There is one aiop structure
1086	 * per daemon.
1087	 */
1088	aiop = malloc(sizeof(*aiop), M_AIO, M_WAITOK);
1089	aiop->aioproc = p;
1090	aiop->aioprocflags = 0;
1091
	/*
	 * Wake up the parent process.  (The parent sleeps to keep from
	 * blasting away and creating too many daemons.)
	 */
1096	sema_post(&aio_newproc_sem);
1097
1098	mtx_lock(&aio_job_mtx);
1099	for (;;) {
		/*
		 * Take the daemon off the free queue.
		 */
1103		if (aiop->aioprocflags & AIOP_FREE) {
1104			TAILQ_REMOVE(&aio_freeproc, aiop, list);
1105			aiop->aioprocflags &= ~AIOP_FREE;
1106		}
1107
1108		/*
1109		 * Check for jobs.
1110		 */
1111		while ((job = aio_selectjob(aiop)) != NULL) {
1112			mtx_unlock(&aio_job_mtx);
1113
1114			ki = job->userproc->p_aioinfo;
1115			job->handle_fn(job);
1116
1117			mtx_lock(&aio_job_mtx);
1118			/* Decrement the active job count. */
1119			ki->kaio_active_count--;
1120		}
1121
1122		/*
1123		 * Disconnect from user address space.
1124		 */
1125		if (p->p_vmspace != myvm) {
1126			mtx_unlock(&aio_job_mtx);
1127			vmspace_switch_aio(myvm);
1128			mtx_lock(&aio_job_mtx);
			/*
			 * We have to restart to avoid a race; we only
			 * sleep if no job can be selected.
			 */
1133			continue;
1134		}
1135
1136		mtx_assert(&aio_job_mtx, MA_OWNED);
1137
1138		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
1139		aiop->aioprocflags |= AIOP_FREE;
1140
1141		/*
1142		 * If daemon is inactive for a long time, allow it to exit,
1143		 * thereby freeing resources.
1144		 */
1145		if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy",
1146		    aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) &&
1147		    (aiop->aioprocflags & AIOP_FREE) &&
1148		    num_aio_procs > target_aio_procs)
1149			break;
1150	}
1151	TAILQ_REMOVE(&aio_freeproc, aiop, list);
1152	num_aio_procs--;
1153	mtx_unlock(&aio_job_mtx);
1154	free(aiop, M_AIO);
1155	free_unr(aiod_unr, id);
1156	vmspace_free(myvm);
1157
1158	KASSERT(p->p_vmspace == myvm,
1159	    ("AIOD: bad vmspace for exiting daemon"));
1160	KASSERT(refcount_load(&myvm->vm_refcnt) > 1,
1161	    ("AIOD: bad vm refcnt for exiting daemon: %d",
1162	    refcount_load(&myvm->vm_refcnt)));
1163	kproc_exit(0);
1164}
1165
1166/*
1167 * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
1168 * AIO daemon modifies its environment itself.
1169 */
1170static int
1171aio_newproc(int *start)
1172{
1173	int error;
1174	struct proc *p;
1175	int id;
1176
1177	id = alloc_unr(aiod_unr);
1178	error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
1179		RFNOWAIT, 0, "aiod%d", id);
1180	if (error == 0) {
1181		/*
1182		 * Wait until daemon is started.
1183		 */
1184		sema_wait(&aio_newproc_sem);
1185		mtx_lock(&aio_job_mtx);
1186		num_aio_procs++;
1187		if (start != NULL)
1188			(*start)--;
1189		mtx_unlock(&aio_job_mtx);
1190	} else {
1191		free_unr(aiod_unr, id);
1192	}
1193	return (error);
1194}
1195
1196/*
1197 * Try the high-performance, low-overhead bio method for eligible
1198 * VCHR devices.  This method doesn't use an aio helper thread, and
1199 * thus has very low overhead.
1200 *
1201 * Assumes that the caller, aio_aqueue(), has incremented the file
1202 * structure's reference count, preventing its deallocation for the
1203 * duration of this call.
1204 */
1205static int
1206aio_qbio(struct proc *p, struct kaiocb *job)
1207{
1208	struct aiocb *cb;
1209	struct file *fp;
1210	struct buf *pbuf;
1211	struct vnode *vp;
1212	struct cdevsw *csw;
1213	struct cdev *dev;
1214	struct kaioinfo *ki;
1215	struct bio **bios = NULL;
1216	off_t offset;
1217	int bio_cmd, error, i, iovcnt, opcode, poff, ref;
1218	vm_prot_t prot;
1219	bool use_unmapped;
1220
1221	cb = &job->uaiocb;
1222	fp = job->fd_file;
1223	opcode = cb->aio_lio_opcode;
1224
1225	if (!(opcode == LIO_WRITE || opcode == LIO_WRITEV ||
1226	    opcode == LIO_READ || opcode == LIO_READV))
1227		return (-1);
1228	if (fp == NULL || fp->f_type != DTYPE_VNODE)
1229		return (-1);
1230
1231	vp = fp->f_vnode;
1232	if (vp->v_type != VCHR)
1233		return (-1);
1234	if (vp->v_bufobj.bo_bsize == 0)
1235		return (-1);
1236
1237	bio_cmd = (opcode & LIO_WRITE) ? BIO_WRITE : BIO_READ;
1238	iovcnt = job->uiop->uio_iovcnt;
1239	if (iovcnt > max_buf_aio)
1240		return (-1);
	for (i = 0; i < iovcnt; i++) {
		if (job->uiop->uio_iov[i].iov_len % vp->v_bufobj.bo_bsize != 0)
			return (-1);
		if (job->uiop->uio_iov[i].iov_len > maxphys)
			return (-1);
	}
1249	offset = cb->aio_offset;
1250
1251	ref = 0;
1252	csw = devvn_refthread(vp, &dev, &ref);
1253	if (csw == NULL)
1254		return (ENXIO);
1255
1256	if ((csw->d_flags & D_DISK) == 0) {
1257		error = -1;
1258		goto unref;
1259	}
1260	if (job->uiop->uio_resid > dev->si_iosize_max) {
1261		error = -1;
1262		goto unref;
1263	}
1264
1265	ki = p->p_aioinfo;
1266	job->error = 0;
1267
1268	use_unmapped = (dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed;
1269	if (!use_unmapped) {
1270		AIO_LOCK(ki);
1271		if (ki->kaio_buffer_count + iovcnt > max_buf_aio) {
1272			AIO_UNLOCK(ki);
1273			error = EAGAIN;
1274			goto unref;
1275		}
1276		ki->kaio_buffer_count += iovcnt;
1277		AIO_UNLOCK(ki);
1278	}
1279
1280	bios = malloc(sizeof(struct bio *) * iovcnt, M_TEMP, M_WAITOK);
1281	refcount_init(&job->nbio, iovcnt);
1282	for (i = 0; i < iovcnt; i++) {
1283		struct vm_page** pages;
1284		struct bio *bp;
1285		void *buf;
1286		size_t nbytes;
1287		int npages;
1288
1289		buf = job->uiop->uio_iov[i].iov_base;
1290		nbytes = job->uiop->uio_iov[i].iov_len;
1291
1292		bios[i] = g_alloc_bio();
1293		bp = bios[i];
1294
1295		poff = (vm_offset_t)buf & PAGE_MASK;
1296		if (use_unmapped) {
1297			pbuf = NULL;
1298			pages = malloc(sizeof(vm_page_t) * (atop(round_page(
1299			    nbytes)) + 1), M_TEMP, M_WAITOK | M_ZERO);
1300		} else {
1301			pbuf = uma_zalloc(pbuf_zone, M_WAITOK);
1302			BUF_KERNPROC(pbuf);
1303			pages = pbuf->b_pages;
1304		}
1305
1306		bp->bio_length = nbytes;
1307		bp->bio_bcount = nbytes;
1308		bp->bio_done = aio_biowakeup;
1309		bp->bio_offset = offset;
1310		bp->bio_cmd = bio_cmd;
1311		bp->bio_dev = dev;
1312		bp->bio_caller1 = job;
1313		bp->bio_caller2 = pbuf;
1314
1315		prot = VM_PROT_READ;
1316		if (opcode == LIO_READ || opcode == LIO_READV)
1317			prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
1318		npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
1319		    (vm_offset_t)buf, bp->bio_length, prot, pages,
1320		    atop(maxphys) + 1);
1321		if (npages < 0) {
1322			if (pbuf != NULL)
1323				uma_zfree(pbuf_zone, pbuf);
1324			else
1325				free(pages, M_TEMP);
1326			error = EFAULT;
1327			g_destroy_bio(bp);
1328			i--;
1329			goto destroy_bios;
1330		}
1331		if (pbuf != NULL) {
1332			pmap_qenter((vm_offset_t)pbuf->b_data, pages, npages);
1333			bp->bio_data = pbuf->b_data + poff;
1334			pbuf->b_npages = npages;
1335			atomic_add_int(&num_buf_aio, 1);
1336		} else {
1337			bp->bio_ma = pages;
1338			bp->bio_ma_n = npages;
1339			bp->bio_ma_offset = poff;
1340			bp->bio_data = unmapped_buf;
1341			bp->bio_flags |= BIO_UNMAPPED;
1342			atomic_add_int(&num_unmapped_aio, 1);
1343		}
1344
1345		offset += nbytes;
1346	}
1347
1348	/* Perform transfer. */
1349	for (i = 0; i < iovcnt; i++)
1350		csw->d_strategy(bios[i]);
1351	free(bios, M_TEMP);
1352
1353	dev_relthread(dev, ref);
1354	return (0);
1355
1356destroy_bios:
1357	for (; i >= 0; i--)
1358		aio_biocleanup(bios[i]);
1359	free(bios, M_TEMP);
1360unref:
1361	dev_relthread(dev, ref);
1362	return (error);
1363}
1364
1365#ifdef COMPAT_FREEBSD6
1366static int
1367convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
1368{
1369
1370	/*
1371	 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
1372	 * supported by AIO with the old sigevent structure.
1373	 */
1374	nsig->sigev_notify = osig->sigev_notify;
1375	switch (nsig->sigev_notify) {
1376	case SIGEV_NONE:
1377		break;
1378	case SIGEV_SIGNAL:
1379		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
1380		break;
1381	case SIGEV_KEVENT:
1382		nsig->sigev_notify_kqueue =
1383		    osig->__sigev_u.__sigev_notify_kqueue;
1384		nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
1385		break;
1386	default:
1387		return (EINVAL);
1388	}
1389	return (0);
1390}
1391
1392static int
1393aiocb_copyin_old_sigevent(struct aiocb *ujob, struct kaiocb *kjob,
1394    int type __unused)
1395{
1396	struct oaiocb *ojob;
1397	struct aiocb *kcb = &kjob->uaiocb;
1398	int error;
1399
1400	bzero(kcb, sizeof(struct aiocb));
1401	error = copyin(ujob, kcb, sizeof(struct oaiocb));
1402	if (error)
1403		return (error);
1404	/* No need to copyin aio_iov, because it did not exist in FreeBSD 6 */
1405	ojob = (struct oaiocb *)kcb;
1406	return (convert_old_sigevent(&ojob->aio_sigevent, &kcb->aio_sigevent));
1407}
1408#endif
1409
1410static int
1411aiocb_copyin(struct aiocb *ujob, struct kaiocb *kjob, int type)
1412{
1413	struct aiocb *kcb = &kjob->uaiocb;
1414	int error;
1415
1416	error = copyin(ujob, kcb, sizeof(struct aiocb));
1417	if (error)
1418		return (error);
1419	if (type == LIO_NOP)
1420		type = kcb->aio_lio_opcode;
1421	if (type & LIO_VECTORED) {
1422		/* malloc a uio and copy in the iovec */
1423		error = copyinuio(__DEVOLATILE(struct iovec*, kcb->aio_iov),
1424		    kcb->aio_iovcnt, &kjob->uiop);
1425	}
1426
1427	return (error);
1428}
1429
1430static long
1431aiocb_fetch_status(struct aiocb *ujob)
1432{
1433
1434	return (fuword(&ujob->_aiocb_private.status));
1435}
1436
1437static long
1438aiocb_fetch_error(struct aiocb *ujob)
1439{
1440
1441	return (fuword(&ujob->_aiocb_private.error));
1442}
1443
1444static int
1445aiocb_store_status(struct aiocb *ujob, long status)
1446{
1447
1448	return (suword(&ujob->_aiocb_private.status, status));
1449}
1450
1451static int
1452aiocb_store_error(struct aiocb *ujob, long error)
1453{
1454
1455	return (suword(&ujob->_aiocb_private.error, error));
1456}
1457
1458static int
1459aiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
1460{
1461
1462	return (suword(&ujob->_aiocb_private.kernelinfo, jobref));
1463}
1464
1465static int
1466aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
1467{
1468
1469	return (suword(ujobp, (long)ujob));
1470}
1471
1472static struct aiocb_ops aiocb_ops = {
1473	.aio_copyin = aiocb_copyin,
1474	.fetch_status = aiocb_fetch_status,
1475	.fetch_error = aiocb_fetch_error,
1476	.store_status = aiocb_store_status,
1477	.store_error = aiocb_store_error,
1478	.store_kernelinfo = aiocb_store_kernelinfo,
1479	.store_aiocb = aiocb_store_aiocb,
1480};
1481
1482#ifdef COMPAT_FREEBSD6
1483static struct aiocb_ops aiocb_ops_osigevent = {
1484	.aio_copyin = aiocb_copyin_old_sigevent,
1485	.fetch_status = aiocb_fetch_status,
1486	.fetch_error = aiocb_fetch_error,
1487	.store_status = aiocb_store_status,
1488	.store_error = aiocb_store_error,
1489	.store_kernelinfo = aiocb_store_kernelinfo,
1490	.store_aiocb = aiocb_store_aiocb,
1491};
1492#endif
1493
/*
 * Queue a new AIO request.  The choice between the threaded technique and
 * the direct bio (VCHR) technique is made in this code.
 */
1498int
1499aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,
1500    int type, struct aiocb_ops *ops)
1501{
1502	struct proc *p = td->td_proc;
1503	struct file *fp = NULL;
1504	struct kaiocb *job;
1505	struct kaioinfo *ki;
1506	struct kevent kev;
1507	int opcode;
1508	int error;
1509	int fd, kqfd;
1510	int jid;
1511	u_short evflags;
1512
1513	if (p->p_aioinfo == NULL)
1514		aio_init_aioinfo(p);
1515
1516	ki = p->p_aioinfo;
1517
1518	ops->store_status(ujob, -1);
1519	ops->store_error(ujob, 0);
1520	ops->store_kernelinfo(ujob, -1);
1521
1522	if (num_queue_count >= max_queue_count ||
1523	    ki->kaio_count >= max_aio_queue_per_proc) {
1524		error = EAGAIN;
1525		goto err1;
1526	}
1527
1528	job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
1529	knlist_init_mtx(&job->klist, AIO_MTX(ki));
1530
1531	error = ops->aio_copyin(ujob, job, type);
1532	if (error)
1533		goto err2;
1534
1535	if (job->uaiocb.aio_nbytes > IOSIZE_MAX) {
1536		error = EINVAL;
1537		goto err2;
1538	}
1539
1540	if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
1541	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
1542	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
1543	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
1544		error = EINVAL;
1545		goto err2;
1546	}
1547
1548	if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
1549	     job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
1550		!_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) {
1551		error = EINVAL;
1552		goto err2;
1553	}
1554
1555	/* Get the opcode. */
1556	if (type == LIO_NOP) {
1557		switch (job->uaiocb.aio_lio_opcode & ~LIO_FOFFSET) {
1558		case LIO_WRITE:
1559		case LIO_WRITEV:
1560		case LIO_NOP:
1561		case LIO_READ:
1562		case LIO_READV:
1563			opcode = job->uaiocb.aio_lio_opcode & ~LIO_FOFFSET;
1564			if ((job->uaiocb.aio_lio_opcode & LIO_FOFFSET) != 0)
1565				job->ioflags |= KAIOCB_IO_FOFFSET;
1566			break;
1567		default:
1568			error = EINVAL;
1569			goto err2;
1570		}
1571	} else
1572		opcode = job->uaiocb.aio_lio_opcode = type;
1573
1574	ksiginfo_init(&job->ksi);
1575
1576	/* Save userspace address of the job info. */
1577	job->ujob = ujob;
1578
1579	/*
1580	 * Validate the opcode and fetch the file object for the specified
1581	 * file descriptor.
1582	 *
1583	 * XXXRW: Moved the opcode validation up here so that we don't
	 * retrieve a file descriptor without knowing what the capability
1585	 * should be.
1586	 */
1587	fd = job->uaiocb.aio_fildes;
1588	switch (opcode) {
1589	case LIO_WRITE:
1590	case LIO_WRITEV:
1591		error = fget_write(td, fd, &cap_pwrite_rights, &fp);
1592		break;
1593	case LIO_READ:
1594	case LIO_READV:
1595		error = fget_read(td, fd, &cap_pread_rights, &fp);
1596		break;
1597	case LIO_SYNC:
1598	case LIO_DSYNC:
1599		error = fget(td, fd, &cap_fsync_rights, &fp);
1600		break;
1601	case LIO_MLOCK:
1602		break;
1603	case LIO_NOP:
1604		error = fget(td, fd, &cap_no_rights, &fp);
1605		break;
1606	default:
1607		error = EINVAL;
1608	}
1609	if (error)
1610		goto err3;
1611
1612	if ((opcode & LIO_SYNC) && fp->f_vnode == NULL) {
1613		error = EINVAL;
1614		goto err3;
1615	}
1616
1617	if ((opcode == LIO_READ || opcode == LIO_READV ||
1618	    opcode == LIO_WRITE || opcode == LIO_WRITEV) &&
1619	    job->uaiocb.aio_offset < 0 &&
1620	    (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) {
1621		error = EINVAL;
1622		goto err3;
1623	}
1624
1625	if (fp != NULL && fp->f_ops == &path_fileops) {
1626		error = EBADF;
1627		goto err3;
1628	}
1629
1630	job->fd_file = fp;
1631
1632	mtx_lock(&aio_job_mtx);
1633	jid = jobrefid++;
1634	job->seqno = jobseqno++;
1635	mtx_unlock(&aio_job_mtx);
1636	error = ops->store_kernelinfo(ujob, jid);
1637	if (error) {
1638		error = EINVAL;
1639		goto err3;
1640	}
1641	job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
1642
1643	if (opcode == LIO_NOP) {
1644		fdrop(fp, td);
1645		MPASS(job->uiop == &job->uio || job->uiop == NULL);
1646		uma_zfree(aiocb_zone, job);
1647		return (0);
1648	}
1649
1650	if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
1651		goto no_kqueue;
1652	evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags;
1653	if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) {
1654		error = EINVAL;
1655		goto err3;
1656	}
1657	kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue;
1658	memset(&kev, 0, sizeof(kev));
1659	kev.ident = (uintptr_t)job->ujob;
1660	kev.filter = EVFILT_AIO;
1661	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags;
1662	kev.data = (intptr_t)job;
1663	kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr;
1664	error = kqfd_register(kqfd, &kev, td, M_WAITOK);
1665	if (error)
1666		goto err3;
1667
1668no_kqueue:
1669
1670	ops->store_error(ujob, EINPROGRESS);
1671	job->uaiocb._aiocb_private.error = EINPROGRESS;
1672	job->userproc = p;
1673	job->cred = crhold(td->td_ucred);
1674	job->jobflags = KAIOCB_QUEUEING;
1675	job->lio = lj;
1676
1677	if (opcode & LIO_VECTORED) {
1678		/* Use the uio copied in by aio_copyin */
1679		MPASS(job->uiop != &job->uio && job->uiop != NULL);
1680	} else {
1681		/* Setup the inline uio */
1682		job->iov[0].iov_base = (void *)(uintptr_t)job->uaiocb.aio_buf;
1683		job->iov[0].iov_len = job->uaiocb.aio_nbytes;
1684		job->uio.uio_iov = job->iov;
1685		job->uio.uio_iovcnt = 1;
1686		job->uio.uio_resid = job->uaiocb.aio_nbytes;
1687		job->uio.uio_segflg = UIO_USERSPACE;
1688		job->uiop = &job->uio;
1689	}
1690	switch (opcode & (LIO_READ | LIO_WRITE)) {
1691	case LIO_READ:
1692		job->uiop->uio_rw = UIO_READ;
1693		break;
1694	case LIO_WRITE:
1695		job->uiop->uio_rw = UIO_WRITE;
1696		break;
1697	}
1698	job->uiop->uio_offset = job->uaiocb.aio_offset;
1699	job->uiop->uio_td = td;
1700
1701	if (opcode == LIO_MLOCK) {
1702		aio_schedule(job, aio_process_mlock);
1703		error = 0;
1704	} else if (fp->f_ops->fo_aio_queue == NULL)
1705		error = aio_queue_file(fp, job);
1706	else
1707		error = fo_aio_queue(fp, job);
1708	if (error)
1709		goto err4;
1710
1711	AIO_LOCK(ki);
1712	job->jobflags &= ~KAIOCB_QUEUEING;
1713	TAILQ_INSERT_TAIL(&ki->kaio_all, job, allist);
1714	ki->kaio_count++;
1715	if (lj)
1716		lj->lioj_count++;
1717	atomic_add_int(&num_queue_count, 1);
1718	if (job->jobflags & KAIOCB_FINISHED) {
1719		/*
1720		 * The queue callback completed the request synchronously.
1721		 * The bulk of the completion is deferred in that case
1722		 * until this point.
1723		 */
1724		aio_bio_done_notify(p, job);
1725	} else
1726		TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist);
1727	AIO_UNLOCK(ki);
1728	return (0);
1729
1730err4:
1731	crfree(job->cred);
1732err3:
1733	if (fp)
1734		fdrop(fp, td);
1735	knlist_delete(&job->klist, curthread, 0);
1736err2:
1737	if (job->uiop != &job->uio)
1738		freeuio(job->uiop);
1739	uma_zfree(aiocb_zone, job);
1740err1:
1741	ops->store_error(ujob, error);
1742	return (error);
1743}
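
/*
 * Userland usage sketch (illustrative only): completion of a request
 * queued above can be collected via kqueue by asking for SIGEV_KEVENT
 * notification.  "cb" is a struct aiocb prepared as in the sketch near
 * the top of this file and "kq" a descriptor obtained from kqueue(2):
 *
 *	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
 *	cb.aio_sigevent.sigev_notify_kqueue = kq;
 *	cb.aio_sigevent.sigev_value.sival_ptr = &cb;
 *	aio_read(&cb);
 *	...
 *	struct kevent ev;
 *	kevent(kq, NULL, 0, &ev, 1, NULL);
 *	aio_return((struct aiocb *)ev.udata);	// udata is sival_ptr
 *
 * which matches the kevent filled in by aio_aqueue() above (ident is the
 * userland aiocb pointer, udata is sigev_value.sival_ptr).
 */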
1744
1745static void
1746aio_cancel_daemon_job(struct kaiocb *job)
1747{
1748
1749	mtx_lock(&aio_job_mtx);
1750	if (!aio_cancel_cleared(job))
1751		TAILQ_REMOVE(&aio_jobs, job, list);
1752	mtx_unlock(&aio_job_mtx);
1753	aio_cancel(job);
1754}
1755
1756void
1757aio_schedule(struct kaiocb *job, aio_handle_fn_t *func)
1758{
1759
1760	mtx_lock(&aio_job_mtx);
1761	if (!aio_set_cancel_function(job, aio_cancel_daemon_job)) {
1762		mtx_unlock(&aio_job_mtx);
1763		aio_cancel(job);
1764		return;
1765	}
1766	job->handle_fn = func;
1767	TAILQ_INSERT_TAIL(&aio_jobs, job, list);
1768	aio_kick_nowait(job->userproc);
1769	mtx_unlock(&aio_job_mtx);
1770}
1771
1772static void
1773aio_cancel_sync(struct kaiocb *job)
1774{
1775	struct kaioinfo *ki;
1776
1777	ki = job->userproc->p_aioinfo;
1778	AIO_LOCK(ki);
1779	if (!aio_cancel_cleared(job))
1780		TAILQ_REMOVE(&ki->kaio_syncqueue, job, list);
1781	AIO_UNLOCK(ki);
1782	aio_cancel(job);
1783}
1784
1785int
1786aio_queue_file(struct file *fp, struct kaiocb *job)
1787{
1788	struct kaioinfo *ki;
1789	struct kaiocb *job2;
1790	struct vnode *vp;
1791	struct mount *mp;
1792	int error;
1793	bool safe;
1794
1795	ki = job->userproc->p_aioinfo;
1796	error = aio_qbio(job->userproc, job);
1797	if (error >= 0)
1798		return (error);
1799	safe = false;
1800	if (fp->f_type == DTYPE_VNODE) {
1801		vp = fp->f_vnode;
1802		if (vp->v_type == VREG || vp->v_type == VDIR) {
1803			mp = fp->f_vnode->v_mount;
1804			if (mp == NULL || (mp->mnt_flag & MNT_LOCAL) != 0)
1805				safe = true;
1806		}
1807	}
1808	if (!(safe || enable_aio_unsafe)) {
1809		counted_warning(&unsafe_warningcnt,
1810		    "is attempting to use unsafe AIO requests");
1811		return (EOPNOTSUPP);
1812	}
1813
1814	if (job->uaiocb.aio_lio_opcode & (LIO_WRITE | LIO_READ)) {
1815		aio_schedule(job, aio_process_rw);
1816		error = 0;
1817	} else if (job->uaiocb.aio_lio_opcode & LIO_SYNC) {
1818		AIO_LOCK(ki);
1819		TAILQ_FOREACH(job2, &ki->kaio_jobqueue, plist) {
1820			if (job2->fd_file == job->fd_file &&
1821			    ((job2->uaiocb.aio_lio_opcode & LIO_SYNC) == 0) &&
1822			    job2->seqno < job->seqno) {
1823				job2->jobflags |= KAIOCB_CHECKSYNC;
1824				job->pending++;
1825			}
1826		}
1827		if (job->pending != 0) {
1828			if (!aio_set_cancel_function_locked(job,
1829				aio_cancel_sync)) {
1830				AIO_UNLOCK(ki);
1831				aio_cancel(job);
1832				return (0);
1833			}
1834			TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, job, list);
1835			AIO_UNLOCK(ki);
1836			return (0);
1837		}
1838		AIO_UNLOCK(ki);
1839		aio_schedule(job, aio_process_sync);
1840		error = 0;
1841	} else {
1842		error = EINVAL;
1843	}
1844	return (error);
1845}
1846
1847static void
1848aio_kick_nowait(struct proc *userp)
1849{
1850	struct kaioinfo *ki = userp->p_aioinfo;
1851	struct aioproc *aiop;
1852
1853	mtx_assert(&aio_job_mtx, MA_OWNED);
1854	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1855		TAILQ_REMOVE(&aio_freeproc, aiop, list);
1856		aiop->aioprocflags &= ~AIOP_FREE;
1857		wakeup(aiop->aioproc);
1858	} else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
1859	    ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) {
1860		taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task);
1861	}
1862}
1863
1864static int
1865aio_kick(struct proc *userp)
1866{
1867	struct kaioinfo *ki = userp->p_aioinfo;
1868	struct aioproc *aiop;
1869	int error, ret = 0;
1870
1871	mtx_assert(&aio_job_mtx, MA_OWNED);
1872retryproc:
1873	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1874		TAILQ_REMOVE(&aio_freeproc, aiop, list);
1875		aiop->aioprocflags &= ~AIOP_FREE;
1876		wakeup(aiop->aioproc);
1877	} else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
1878	    ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) {
1879		num_aio_resv_start++;
1880		mtx_unlock(&aio_job_mtx);
1881		error = aio_newproc(&num_aio_resv_start);
1882		mtx_lock(&aio_job_mtx);
1883		if (error) {
1884			num_aio_resv_start--;
1885			goto retryproc;
1886		}
1887	} else {
1888		ret = -1;
1889	}
1890	return (ret);
1891}
1892
1893static void
1894aio_kick_helper(void *context, int pending)
1895{
1896	struct proc *userp = context;
1897
1898	mtx_lock(&aio_job_mtx);
1899	while (--pending >= 0) {
1900		if (aio_kick(userp))
1901			break;
1902	}
1903	mtx_unlock(&aio_job_mtx);
1904}
1905
/*
 * Support the aio_return system call.  As a side effect, kernel resources
 * are released.
 */
1910static int
1911kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
1912{
1913	struct proc *p = td->td_proc;
1914	struct kaiocb *job;
1915	struct kaioinfo *ki;
1916	long status, error;
1917
1918	ki = p->p_aioinfo;
1919	if (ki == NULL)
1920		return (EINVAL);
1921	AIO_LOCK(ki);
1922	TAILQ_FOREACH(job, &ki->kaio_done, plist) {
1923		if (job->ujob == ujob)
1924			break;
1925	}
1926	if (job != NULL) {
1927		MPASS(job->jobflags & KAIOCB_FINISHED);
1928		status = job->uaiocb._aiocb_private.status;
1929		error = job->uaiocb._aiocb_private.error;
1930		td->td_retval[0] = status;
1931		td->td_ru.ru_oublock += job->outblock;
1932		td->td_ru.ru_inblock += job->inblock;
1933		td->td_ru.ru_msgsnd += job->msgsnd;
1934		td->td_ru.ru_msgrcv += job->msgrcv;
1935		aio_free_entry(job);
1936		AIO_UNLOCK(ki);
1937		ops->store_error(ujob, error);
1938		ops->store_status(ujob, status);
1939	} else {
1940		error = EINVAL;
1941		AIO_UNLOCK(ki);
1942	}
1943	return (error);
1944}
1945
1946int
1947sys_aio_return(struct thread *td, struct aio_return_args *uap)
1948{
1949
1950	return (kern_aio_return(td, uap->aiocbp, &aiocb_ops));
1951}
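
/*
 * Illustrative userland usage (a sketch only, not kernel code): once a
 * request has been queued with aio_read(2) or aio_write(2), the caller
 * typically polls aio_error(2) until it stops returning EINPROGRESS and
 * then reaps the result exactly once with aio_return(2), which is what
 * triggers the aio_free_entry() cleanup above.
 *
 *	struct aiocb *iocb;	(a previously queued control block)
 *	ssize_t done;
 *	int error;
 *
 *	while ((error = aio_error(iocb)) == EINPROGRESS)
 *		usleep(1000);
 *	if (error == 0)
 *		done = aio_return(iocb);
 *	else
 *		warnc(error, "aio request failed");
 */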
1952
1953/*
1954 * Allow a process to wake up when any of its I/O requests completes.
1955 */
1956static int
1957kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
1958    struct timespec *ts)
1959{
1960	struct proc *p = td->td_proc;
1961	struct timeval atv;
1962	struct kaioinfo *ki;
1963	struct kaiocb *firstjob, *job;
1964	int error, i, timo;
1965
1966	timo = 0;
1967	if (ts) {
1968		if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
1969			return (EINVAL);
1970
1971		TIMESPEC_TO_TIMEVAL(&atv, ts);
1972		if (itimerfix(&atv))
1973			return (EINVAL);
1974		timo = tvtohz(&atv);
1975	}
1976
1977	ki = p->p_aioinfo;
1978	if (ki == NULL)
1979		return (EAGAIN);
1980
1981	if (njoblist == 0)
1982		return (0);
1983
1984	AIO_LOCK(ki);
1985	for (;;) {
1986		firstjob = NULL;
1987		error = 0;
1988		TAILQ_FOREACH(job, &ki->kaio_all, allist) {
1989			for (i = 0; i < njoblist; i++) {
1990				if (job->ujob == ujoblist[i]) {
1991					if (firstjob == NULL)
1992						firstjob = job;
1993					if (job->jobflags & KAIOCB_FINISHED)
1994						goto RETURN;
1995				}
1996			}
1997		}
1998		/* All of the requested jobs have finished. */
1999		if (firstjob == NULL)
2000			break;
2001
2002		ki->kaio_flags |= KAIO_WAKEUP;
2003		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
2004		    "aiospn", timo);
2005		if (error == ERESTART)
2006			error = EINTR;
2007		if (error)
2008			break;
2009	}
2010RETURN:
2011	AIO_UNLOCK(ki);
2012	return (error);
2013}
2014
2015int
2016sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap)
2017{
2018	struct timespec ts, *tsp;
2019	struct aiocb **ujoblist;
2020	int error;
2021
2022	if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc)
2023		return (EINVAL);
2024
2025	if (uap->timeout) {
2026		/* Get timespec struct. */
2027		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
2028			return (error);
2029		tsp = &ts;
2030	} else
2031		tsp = NULL;
2032
2033	ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIO, M_WAITOK);
2034	error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
2035	if (error == 0)
2036		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
2037	free(ujoblist, M_AIO);
2038	return (error);
2039}
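
/*
 * Illustrative userland usage (a sketch only): aio_suspend(2) blocks
 * until at least one of the listed requests completes or the optional
 * timeout expires, mirroring kern_aio_suspend() above.  iocb0 and iocb1
 * here stand for previously queued control blocks.
 *
 *	const struct aiocb *list[2] = { &iocb0, &iocb1 };
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *
 *	if (aio_suspend(list, 2, &ts) == -1 && errno == EAGAIN)
 *		warnx("no request finished within one second");
 */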
2040
2041/*
2042 * aio_cancel cancels any non-bio aio operations not currently in progress.
2043 */
2044int
2045sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap)
2046{
2047	struct proc *p = td->td_proc;
2048	struct kaioinfo *ki;
2049	struct kaiocb *job, *jobn;
2050	struct file *fp;
2051	int error;
2052	int cancelled = 0;
2053	int notcancelled = 0;
2054	struct vnode *vp;
2055
2056	/* Lookup file object. */
2057	error = fget(td, uap->fd, &cap_no_rights, &fp);
2058	if (error)
2059		return (error);
2060
2061	ki = p->p_aioinfo;
2062	if (ki == NULL)
2063		goto done;
2064
2065	if (fp->f_type == DTYPE_VNODE) {
2066		vp = fp->f_vnode;
2067		if (vn_isdisk(vp)) {
2068			fdrop(fp, td);
2069			td->td_retval[0] = AIO_NOTCANCELED;
2070			return (0);
2071		}
2072	}
2073
2074	AIO_LOCK(ki);
2075	TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) {
2076		if ((uap->fd == job->uaiocb.aio_fildes) &&
2077		    ((uap->aiocbp == NULL) ||
2078		     (uap->aiocbp == job->ujob))) {
2079			if (aio_cancel_job(p, ki, job)) {
2080				cancelled++;
2081			} else {
2082				notcancelled++;
2083			}
2084			if (uap->aiocbp != NULL)
2085				break;
2086		}
2087	}
2088	AIO_UNLOCK(ki);
2089
2090done:
2091	fdrop(fp, td);
2092
2093	if (uap->aiocbp != NULL) {
2094		if (cancelled) {
2095			td->td_retval[0] = AIO_CANCELED;
2096			return (0);
2097		}
2098	}
2099
2100	if (notcancelled) {
2101		td->td_retval[0] = AIO_NOTCANCELED;
2102		return (0);
2103	}
2104
2105	if (cancelled) {
2106		td->td_retval[0] = AIO_CANCELED;
2107		return (0);
2108	}
2109
2110	td->td_retval[0] = AIO_ALLDONE;
2111
2112	return (0);
2113}
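
/*
 * Illustrative userland usage (a sketch only): aio_cancel(2) may be
 * passed a specific control block, or NULL to target every outstanding
 * request on the descriptor; the return value distinguishes requests
 * that were cancelled, could not be cancelled, or had already finished,
 * matching the AIO_CANCELED/AIO_NOTCANCELED/AIO_ALLDONE logic above.
 *
 *	switch (aio_cancel(fd, NULL)) {		(fd: an open descriptor)
 *	case AIO_CANCELED:
 *	case AIO_ALLDONE:
 *		break;
 *	case AIO_NOTCANCELED:
 *		warnx("a request is still in progress");
 *		break;
 *	default:
 *		warn("aio_cancel");
 *	}
 */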
2114
2115/*
2116 * aio_error is implemented at the kernel level for compatibility purposes
2117 * only.  For a user-mode async implementation, it would be best to do this
2118 * in a userland subroutine.
2119 */
2120static int
2121kern_aio_error(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
2122{
2123	struct proc *p = td->td_proc;
2124	struct kaiocb *job;
2125	struct kaioinfo *ki;
2126	int status;
2127
2128	ki = p->p_aioinfo;
2129	if (ki == NULL) {
2130		td->td_retval[0] = EINVAL;
2131		return (0);
2132	}
2133
2134	AIO_LOCK(ki);
2135	TAILQ_FOREACH(job, &ki->kaio_all, allist) {
2136		if (job->ujob == ujob) {
2137			if (job->jobflags & KAIOCB_FINISHED)
2138				td->td_retval[0] =
2139					job->uaiocb._aiocb_private.error;
2140			else
2141				td->td_retval[0] = EINPROGRESS;
2142			AIO_UNLOCK(ki);
2143			return (0);
2144		}
2145	}
2146	AIO_UNLOCK(ki);
2147
2148	/*
2149	 * Hack for failure of aio_aqueue.
2150	 */
2151	status = ops->fetch_status(ujob);
2152	if (status == -1) {
2153		td->td_retval[0] = ops->fetch_error(ujob);
2154		return (0);
2155	}
2156
2157	td->td_retval[0] = EINVAL;
2158	return (0);
2159}
2160
2161int
2162sys_aio_error(struct thread *td, struct aio_error_args *uap)
2163{
2164
2165	return (kern_aio_error(td, uap->aiocbp, &aiocb_ops));
2166}
2167
2168/* syscall - asynchronous read from a file (REALTIME) */
2169#ifdef COMPAT_FREEBSD6
2170int
2171freebsd6_aio_read(struct thread *td, struct freebsd6_aio_read_args *uap)
2172{
2173
2174	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
2175	    &aiocb_ops_osigevent));
2176}
2177#endif
2178
2179int
2180sys_aio_read(struct thread *td, struct aio_read_args *uap)
2181{
2182
2183	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops));
2184}
2185
2186int
2187sys_aio_readv(struct thread *td, struct aio_readv_args *uap)
2188{
2189
2190	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READV, &aiocb_ops));
2191}
2192
2193/* syscall - asynchronous write to a file (REALTIME) */
2194#ifdef COMPAT_FREEBSD6
2195int
2196freebsd6_aio_write(struct thread *td, struct freebsd6_aio_write_args *uap)
2197{
2198
2199	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
2200	    &aiocb_ops_osigevent));
2201}
2202#endif
2203
2204int
2205sys_aio_write(struct thread *td, struct aio_write_args *uap)
2206{
2207
2208	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops));
2209}
2210
2211int
2212sys_aio_writev(struct thread *td, struct aio_writev_args *uap)
2213{
2214
2215	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITEV, &aiocb_ops));
2216}
2217
2218int
2219sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap)
2220{
2221
2222	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops));
2223}
2224
2225static int
2226kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
2227    struct aiocb **acb_list, int nent, struct sigevent *sig,
2228    struct aiocb_ops *ops)
2229{
2230	struct proc *p = td->td_proc;
2231	struct aiocb *job;
2232	struct kaioinfo *ki;
2233	struct aioliojob *lj;
2234	struct kevent kev;
2235	int error;
2236	int nagain, nerror;
2237	int i;
2238
2239	if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT))
2240		return (EINVAL);
2241
2242	if (nent < 0 || nent > max_aio_queue_per_proc)
2243		return (EINVAL);
2244
2245	if (p->p_aioinfo == NULL)
2246		aio_init_aioinfo(p);
2247
2248	ki = p->p_aioinfo;
2249
2250	lj = uma_zalloc(aiolio_zone, M_WAITOK);
2251	lj->lioj_flags = 0;
2252	lj->lioj_count = 0;
2253	lj->lioj_finished_count = 0;
2254	lj->lioj_signal.sigev_notify = SIGEV_NONE;
2255	knlist_init_mtx(&lj->klist, AIO_MTX(ki));
2256	ksiginfo_init(&lj->lioj_ksi);
2257
2258	/*
2259	 * Set up the completion notification, if one was requested.
2260	 */
2261	if (sig && (mode == LIO_NOWAIT)) {
2262		bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal));
2263		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
2264			/* Assume only new style KEVENT */
2265			memset(&kev, 0, sizeof(kev));
2266			kev.filter = EVFILT_LIO;
2267			kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
2268			kev.ident = (uintptr_t)uacb_list; /* something unique */
2269			kev.data = (intptr_t)lj;
2270			/* pass user defined sigval data */
2271			kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
2272			error = kqfd_register(
2273			    lj->lioj_signal.sigev_notify_kqueue, &kev, td,
2274			    M_WAITOK);
2275			if (error) {
2276				uma_zfree(aiolio_zone, lj);
2277				return (error);
2278			}
2279		} else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
2280			;
2281		} else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
2282			   lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
2283			if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
2284				uma_zfree(aiolio_zone, lj);
2285				return (EINVAL);
2286			}
2287			lj->lioj_flags |= LIOJ_SIGNAL;
2288		} else {
2289			uma_zfree(aiolio_zone, lj);
2290			return (EINVAL);
2291		}
2292	}
2293
2294	AIO_LOCK(ki);
2295	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
2296	/*
2297	 * Hold an extra reference on the lio (lioj_count) so that other
2298	 * threads doing aio_waitcomplete or aio_return cannot free it, and
2299	 * so that the completion event is not sent until we have queued
2300	 * all tasks.
2301	 */
2302	lj->lioj_count = 1;
2303	AIO_UNLOCK(ki);
2304
2305	/*
2306	 * Queue up each of the I/O requests.
2307	 */
2308	nagain = 0;
2309	nerror = 0;
2310	for (i = 0; i < nent; i++) {
2311		job = acb_list[i];
2312		if (job != NULL) {
2313			error = aio_aqueue(td, job, lj, LIO_NOP, ops);
2314			if (error == EAGAIN)
2315				nagain++;
2316			else if (error != 0)
2317				nerror++;
2318		}
2319	}
2320
2321	error = 0;
2322	AIO_LOCK(ki);
2323	if (mode == LIO_WAIT) {
2324		while (lj->lioj_count - 1 != lj->lioj_finished_count) {
2325			ki->kaio_flags |= KAIO_WAKEUP;
2326			error = msleep(&p->p_aioinfo, AIO_MTX(ki),
2327			    PRIBIO | PCATCH, "aiospn", 0);
2328			if (error == ERESTART)
2329				error = EINTR;
2330			if (error)
2331				break;
2332		}
2333	} else {
2334		if (lj->lioj_count - 1 == lj->lioj_finished_count) {
2335			if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
2336				lj->lioj_flags |= LIOJ_KEVENT_POSTED;
2337				KNOTE_LOCKED(&lj->klist, 1);
2338			}
2339			if ((lj->lioj_flags & (LIOJ_SIGNAL |
2340			    LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL &&
2341			    (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
2342			    lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
2343				aio_sendsig(p, &lj->lioj_signal, &lj->lioj_ksi,
2344				    lj->lioj_count != 1);
2345				lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2346			}
2347		}
2348	}
2349	lj->lioj_count--;
2350	if (lj->lioj_count == 0) {
2351		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
2352		knlist_delete(&lj->klist, curthread, 1);
2353		PROC_LOCK(p);
2354		sigqueue_take(&lj->lioj_ksi);
2355		PROC_UNLOCK(p);
2356		AIO_UNLOCK(ki);
2357		uma_zfree(aiolio_zone, lj);
2358	} else
2359		AIO_UNLOCK(ki);
2360
2361	if (nerror)
2362		return (EIO);
2363	else if (nagain)
2364		return (EAGAIN);
2365	else
2366		return (error);
2367}
2368
2369/* syscall - list directed I/O (REALTIME) */
2370#ifdef COMPAT_FREEBSD6
2371int
2372freebsd6_lio_listio(struct thread *td, struct freebsd6_lio_listio_args *uap)
2373{
2374	struct aiocb **acb_list;
2375	struct sigevent *sigp, sig;
2376	struct osigevent osig;
2377	int error, nent;
2378
2379	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2380		return (EINVAL);
2381
2382	nent = uap->nent;
2383	if (nent < 0 || nent > max_aio_queue_per_proc)
2384		return (EINVAL);
2385
2386	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2387		error = copyin(uap->sig, &osig, sizeof(osig));
2388		if (error)
2389			return (error);
2390		error = convert_old_sigevent(&osig, &sig);
2391		if (error)
2392			return (error);
2393		sigp = &sig;
2394	} else
2395		sigp = NULL;
2396
2397	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
2398	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
2399	if (error == 0)
2400		error = kern_lio_listio(td, uap->mode,
2401		    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
2402		    &aiocb_ops_osigevent);
2403	free(acb_list, M_LIO);
2404	return (error);
2405}
2406#endif
2407
2408/* syscall - list directed I/O (REALTIME) */
2409int
2410sys_lio_listio(struct thread *td, struct lio_listio_args *uap)
2411{
2412	struct aiocb **acb_list;
2413	struct sigevent *sigp, sig;
2414	int error, nent;
2415
2416	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2417		return (EINVAL);
2418
2419	nent = uap->nent;
2420	if (nent < 0 || nent > max_aio_queue_per_proc)
2421		return (EINVAL);
2422
2423	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2424		error = copyin(uap->sig, &sig, sizeof(sig));
2425		if (error)
2426			return (error);
2427		sigp = &sig;
2428	} else
2429		sigp = NULL;
2430
2431	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
2432	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
2433	if (error == 0)
2434		error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list,
2435		    nent, sigp, &aiocb_ops);
2436	free(acb_list, M_LIO);
2437	return (error);
2438}
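
/*
 * Illustrative userland usage (a sketch only): lio_listio(2) submits an
 * array of control blocks in a single call.  LIO_WAIT corresponds to
 * the LIO_WAIT branch of kern_lio_listio() above and only returns once
 * every listed request has finished.  rd and wr stand for control
 * blocks whose descriptor and buffer fields have already been filled in.
 *
 *	struct aiocb *list[2] = { &rd, &wr };
 *
 *	rd.aio_lio_opcode = LIO_READ;
 *	wr.aio_lio_opcode = LIO_WRITE;
 *	if (lio_listio(LIO_WAIT, list, 2, NULL) == -1)
 *		err(1, "lio_listio");
 */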
2439
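/*
 * Release the resources tied to a completed bio: unmap it from kernel
 * space and return the pbuf when one was borrowed, unhold the wired
 * user pages, drop the buffer accounting, and free the bio itself.
 */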
2440static void
2441aio_biocleanup(struct bio *bp)
2442{
2443	struct kaiocb *job = (struct kaiocb *)bp->bio_caller1;
2444	struct kaioinfo *ki;
2445	struct buf *pbuf = (struct buf *)bp->bio_caller2;
2446
2447	/* Release mapping into kernel space. */
2448	if (pbuf != NULL) {
2449		MPASS(pbuf->b_npages <= atop(maxphys) + 1);
2450		pmap_qremove((vm_offset_t)pbuf->b_data, pbuf->b_npages);
2451		vm_page_unhold_pages(pbuf->b_pages, pbuf->b_npages);
2452		uma_zfree(pbuf_zone, pbuf);
2453		atomic_subtract_int(&num_buf_aio, 1);
2454		ki = job->userproc->p_aioinfo;
2455		AIO_LOCK(ki);
2456		ki->kaio_buffer_count--;
2457		AIO_UNLOCK(ki);
2458	} else {
2459		MPASS(bp->bio_ma_n <= atop(maxphys) + 1);
2460		vm_page_unhold_pages(bp->bio_ma, bp->bio_ma_n);
2461		free(bp->bio_ma, M_TEMP);
2462		atomic_subtract_int(&num_unmapped_aio, 1);
2463	}
2464	g_destroy_bio(bp);
2465}
2466
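/*
 * Completion callback for the bios generated for a direct-to-device aio
 * request: fold this bio's byte count, block counts, and any error into
 * the job, and complete the job once the last outstanding bio has been
 * accounted for.
 */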
2467static void
2468aio_biowakeup(struct bio *bp)
2469{
2470	struct kaiocb *job = (struct kaiocb *)bp->bio_caller1;
2471	size_t nbytes;
2472	long bcount = bp->bio_bcount;
2473	long resid = bp->bio_resid;
2474	int opcode, nblks;
2475	int bio_error = bp->bio_error;
2476	uint16_t flags = bp->bio_flags;
2477
2478	opcode = job->uaiocb.aio_lio_opcode;
2479
2480	aio_biocleanup(bp);
2481
2482	nbytes = bcount - resid;
2483	atomic_add_acq_long(&job->nbytes, nbytes);
2484	nblks = btodb(nbytes);
2485
2486	/*
2487	 * If multiple bios experienced an error, the job will reflect the
2488	 * error of whichever failed bio completed last.
2489	 */
2490	if (flags & BIO_ERROR)
2491		atomic_store_int(&job->error, bio_error);
2492	if (opcode & LIO_WRITE)
2493		atomic_add_int(&job->outblock, nblks);
2494	else
2495		atomic_add_int(&job->inblock, nblks);
2496
2497	if (refcount_release(&job->nbio)) {
2498		bio_error = atomic_load_int(&job->error);
2499		if (bio_error != 0)
2500			aio_complete(job, -1, bio_error);
2501		else
2502			aio_complete(job, atomic_load_long(&job->nbytes), 0);
2503	}
2504}
2505
2506/* syscall - wait for the next completion of an aio request */
2507static int
2508kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp,
2509    struct timespec *ts, struct aiocb_ops *ops)
2510{
2511	struct proc *p = td->td_proc;
2512	struct timeval atv;
2513	struct kaioinfo *ki;
2514	struct kaiocb *job;
2515	struct aiocb *ujob;
2516	long error, status;
2517	int timo;
2518
2519	ops->store_aiocb(ujobp, NULL);
2520
2521	if (ts == NULL) {
2522		timo = 0;
2523	} else if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
2524		timo = -1;
2525	} else {
2526		if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000))
2527			return (EINVAL);
2528
2529		TIMESPEC_TO_TIMEVAL(&atv, ts);
2530		if (itimerfix(&atv))
2531			return (EINVAL);
2532		timo = tvtohz(&atv);
2533	}
2534
2535	if (p->p_aioinfo == NULL)
2536		aio_init_aioinfo(p);
2537	ki = p->p_aioinfo;
2538
2539	error = 0;
2540	job = NULL;
2541	AIO_LOCK(ki);
2542	while ((job = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
2543		if (timo == -1) {
2544			error = EWOULDBLOCK;
2545			break;
2546		}
2547		ki->kaio_flags |= KAIO_WAKEUP;
2548		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
2549		    "aiowc", timo);
2550		if (timo && error == ERESTART)
2551			error = EINTR;
2552		if (error)
2553			break;
2554	}
2555
2556	if (job != NULL) {
2557		MPASS(job->jobflags & KAIOCB_FINISHED);
2558		ujob = job->ujob;
2559		status = job->uaiocb._aiocb_private.status;
2560		error = job->uaiocb._aiocb_private.error;
2561		td->td_retval[0] = status;
2562		td->td_ru.ru_oublock += job->outblock;
2563		td->td_ru.ru_inblock += job->inblock;
2564		td->td_ru.ru_msgsnd += job->msgsnd;
2565		td->td_ru.ru_msgrcv += job->msgrcv;
2566		aio_free_entry(job);
2567		AIO_UNLOCK(ki);
2568		ops->store_aiocb(ujobp, ujob);
2569		ops->store_error(ujob, error);
2570		ops->store_status(ujob, status);
2571	} else
2572		AIO_UNLOCK(ki);
2573
2574	return (error);
2575}
2576
2577int
2578sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
2579{
2580	struct timespec ts, *tsp;
2581	int error;
2582
2583	if (uap->timeout) {
2584		/* Get timespec struct. */
2585		error = copyin(uap->timeout, &ts, sizeof(ts));
2586		if (error)
2587			return (error);
2588		tsp = &ts;
2589	} else
2590		tsp = NULL;
2591
2592	return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops));
2593}
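
/*
 * Illustrative userland usage (a sketch only): aio_waitcomplete(2) is
 * the FreeBSD-specific "reap whatever finishes next" interface.  It
 * returns the completed request's status and hands back a pointer to
 * the aiocb that finished, so the caller does not have to poll each
 * control block individually.
 *
 *	struct aiocb *done;
 *	ssize_t n;
 *
 *	n = aio_waitcomplete(&done, NULL);	(NULL: wait indefinitely)
 *	if (n == -1)
 *		err(1, "aio_waitcomplete");
 */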
2594
2595static int
2596kern_aio_fsync(struct thread *td, int op, struct aiocb *ujob,
2597    struct aiocb_ops *ops)
2598{
2599	int listop;
2600
2601	switch (op) {
2602	case O_SYNC:
2603		listop = LIO_SYNC;
2604		break;
2605	case O_DSYNC:
2606		listop = LIO_DSYNC;
2607		break;
2608	default:
2609		return (EINVAL);
2610	}
2611
2612	return (aio_aqueue(td, ujob, NULL, listop, ops));
2613}
2614
2615int
2616sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap)
2617{
2618
2619	return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops));
2620}
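
/*
 * Illustrative userland usage (a sketch only): aio_fsync(2) queues an
 * asynchronous fsync that is held back until the requests queued before
 * it on the same file have completed, as arranged by the LIO_SYNC
 * sequencing earlier in this file.  sync_cb's descriptor field is
 * assumed to refer to an open file.
 *
 *	struct aiocb sync_cb = { .aio_fildes = fd };
 *
 *	if (aio_fsync(O_SYNC, &sync_cb) == -1)
 *		err(1, "aio_fsync");
 */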
2621
2622/* kqueue attach function */
2623static int
2624filt_aioattach(struct knote *kn)
2625{
2626	struct kaiocb *job;
2627
2628	job = (struct kaiocb *)(uintptr_t)kn->kn_sdata;
2629
2630	/*
2631	 * The job pointer must be validated before using it, so
2632	 * registration is restricted to the kernel; the user cannot
2633	 * set EV_FLAG1.
2634	 */
2635	if ((kn->kn_flags & EV_FLAG1) == 0)
2636		return (EPERM);
2637	kn->kn_ptr.p_aio = job;
2638	kn->kn_flags &= ~EV_FLAG1;
2639
2640	knlist_add(&job->klist, kn, 0);
2641
2642	return (0);
2643}
2644
2645/* kqueue detach function */
2646static void
2647filt_aiodetach(struct knote *kn)
2648{
2649	struct knlist *knl;
2650
2651	knl = &kn->kn_ptr.p_aio->klist;
2652	knl->kl_lock(knl->kl_lockarg);
2653	if (!knlist_empty(knl))
2654		knlist_remove(knl, kn, 1);
2655	knl->kl_unlock(knl->kl_lockarg);
2656}
2657
2658/* kqueue filter function */
2659/*ARGSUSED*/
2660static int
2661filt_aio(struct knote *kn, long hint)
2662{
2663	struct kaiocb *job = kn->kn_ptr.p_aio;
2664
2665	kn->kn_data = job->uaiocb._aiocb_private.error;
2666	if (!(job->jobflags & KAIOCB_FINISHED))
2667		return (0);
2668	kn->kn_flags |= EV_EOF;
2669	return (1);
2670}
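
/*
 * Illustrative userland usage (a sketch only): instead of polling, a
 * request can ask for kevent-based completion notification.  The
 * resulting EVFILT_AIO event carries the aiocb pointer as its ident,
 * which is what filt_aio() above reports once the job has finished.
 * Buffer setup for the read is omitted here, and kq and fd are assumed
 * to be a valid kqueue and descriptor.
 *
 *	struct aiocb cb = { .aio_fildes = fd };
 *	struct kevent ev;
 *
 *	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
 *	cb.aio_sigevent.sigev_notify_kqueue = kq;
 *	aio_read(&cb);
 *	kevent(kq, NULL, 0, &ev, 1, NULL);	(ev.ident == (uintptr_t)&cb)
 */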
2671
2672/* kqueue attach function */
2673static int
2674filt_lioattach(struct knote *kn)
2675{
2676	struct aioliojob *lj;
2677
2678	lj = (struct aioliojob *)(uintptr_t)kn->kn_sdata;
2679
2680	/*
2681	 * The aioliojob pointer must be validated before using it, so
2682	 * registration is restricted to the kernel; the user cannot
2683	 * set EV_FLAG1.
2684	 */
2685	if ((kn->kn_flags & EV_FLAG1) == 0)
2686		return (EPERM);
2687	kn->kn_ptr.p_lio = lj;
2688	kn->kn_flags &= ~EV_FLAG1;
2689
2690	knlist_add(&lj->klist, kn, 0);
2691
2692	return (0);
2693}
2694
2695/* kqueue detach function */
2696static void
2697filt_liodetach(struct knote *kn)
2698{
2699	struct knlist *knl;
2700
2701	knl = &kn->kn_ptr.p_lio->klist;
2702	knl->kl_lock(knl->kl_lockarg);
2703	if (!knlist_empty(knl))
2704		knlist_remove(knl, kn, 1);
2705	knl->kl_unlock(knl->kl_lockarg);
2706}
2707
2708/* kqueue filter function */
2709/*ARGSUSED*/
2710static int
2711filt_lio(struct knote *kn, long hint)
2712{
2713	struct aioliojob *lj = kn->kn_ptr.p_lio;
2714
2715	return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
2716}
2717
2718#ifdef COMPAT_FREEBSD32
2719#include <sys/mount.h>
2720#include <sys/socket.h>
2721#include <sys/sysent.h>
2722#include <compat/freebsd32/freebsd32.h>
2723#include <compat/freebsd32/freebsd32_proto.h>
2724#include <compat/freebsd32/freebsd32_signal.h>
2725#include <compat/freebsd32/freebsd32_syscall.h>
2726#include <compat/freebsd32/freebsd32_util.h>
2727
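/*
 * 32-bit compatibility shims: the structures below mirror the native
 * aiocb layouts with 32-bit pointers, and the aiocb32_* routines
 * translate them to and from the kernel's struct aiocb with CP() and
 * PTRIN().
 */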
2728struct __aiocb_private32 {
2729	int32_t	status;
2730	int32_t	error;
2731	uint32_t kernelinfo;
2732};
2733
2734#ifdef COMPAT_FREEBSD6
2735typedef struct oaiocb32 {
2736	int	aio_fildes;		/* File descriptor */
2737	uint64_t aio_offset __packed;	/* File offset for I/O */
2738	uint32_t aio_buf;		/* I/O buffer in process space */
2739	uint32_t aio_nbytes;		/* Number of bytes for I/O */
2740	struct	osigevent32 aio_sigevent; /* Signal to deliver */
2741	int	aio_lio_opcode;		/* LIO opcode */
2742	int	aio_reqprio;		/* Request priority -- ignored */
2743	struct	__aiocb_private32 _aiocb_private;
2744} oaiocb32_t;
2745#endif
2746
2747typedef struct aiocb32 {
2748	int32_t	aio_fildes;		/* File descriptor */
2749	uint64_t aio_offset __packed;	/* File offset for I/O */
2750	uint32_t aio_buf;	/* I/O buffer in process space */
2751	uint32_t aio_nbytes;	/* Number of bytes for I/O */
2752	int	__spare__[2];
2753	uint32_t __spare2__;
2754	int	aio_lio_opcode;		/* LIO opcode */
2755	int	aio_reqprio;		/* Request priority -- ignored */
2756	struct	__aiocb_private32 _aiocb_private;
2757	struct	sigevent32 aio_sigevent;	/* Signal to deliver */
2758} aiocb32_t;
2759
2760#ifdef COMPAT_FREEBSD6
2761static int
2762convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig)
2763{
2764
2765	/*
2766	 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
2767	 * supported by AIO with the old sigevent structure.
2768	 */
2769	CP(*osig, *nsig, sigev_notify);
2770	switch (nsig->sigev_notify) {
2771	case SIGEV_NONE:
2772		break;
2773	case SIGEV_SIGNAL:
2774		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
2775		break;
2776	case SIGEV_KEVENT:
2777		nsig->sigev_notify_kqueue =
2778		    osig->__sigev_u.__sigev_notify_kqueue;
2779		PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr);
2780		break;
2781	default:
2782		return (EINVAL);
2783	}
2784	return (0);
2785}
2786
2787static int
2788aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct kaiocb *kjob,
2789    int type __unused)
2790{
2791	struct oaiocb32 job32;
2792	struct aiocb *kcb = &kjob->uaiocb;
2793	int error;
2794
2795	bzero(kcb, sizeof(struct aiocb));
2796	error = copyin(ujob, &job32, sizeof(job32));
2797	if (error)
2798		return (error);
2799
2800	/* No need to copyin aio_iov, because it did not exist in FreeBSD 6 */
2801
2802	CP(job32, *kcb, aio_fildes);
2803	CP(job32, *kcb, aio_offset);
2804	PTRIN_CP(job32, *kcb, aio_buf);
2805	CP(job32, *kcb, aio_nbytes);
2806	CP(job32, *kcb, aio_lio_opcode);
2807	CP(job32, *kcb, aio_reqprio);
2808	CP(job32, *kcb, _aiocb_private.status);
2809	CP(job32, *kcb, _aiocb_private.error);
2810	PTRIN_CP(job32, *kcb, _aiocb_private.kernelinfo);
2811	return (convert_old_sigevent32(&job32.aio_sigevent,
2812	    &kcb->aio_sigevent));
2813}
2814#endif
2815
2816static int
2817aiocb32_copyin(struct aiocb *ujob, struct kaiocb *kjob, int type)
2818{
2819	struct aiocb32 job32;
2820	struct aiocb *kcb = &kjob->uaiocb;
2821	struct iovec32 *iov32;
2822	int error;
2823
2824	error = copyin(ujob, &job32, sizeof(job32));
2825	if (error)
2826		return (error);
2827	CP(job32, *kcb, aio_fildes);
2828	CP(job32, *kcb, aio_offset);
2829	CP(job32, *kcb, aio_lio_opcode);
2830	if (type == LIO_NOP)
2831		type = kcb->aio_lio_opcode;
2832	if (type & LIO_VECTORED) {
2833		iov32 = PTRIN(job32.aio_iov);
2834		CP(job32, *kcb, aio_iovcnt);
2835		/* malloc a uio and copy in the iovec */
2836		error = freebsd32_copyinuio(iov32,
2837		    kcb->aio_iovcnt, &kjob->uiop);
2838		if (error)
2839			return (error);
2840	} else {
2841		PTRIN_CP(job32, *kcb, aio_buf);
2842		CP(job32, *kcb, aio_nbytes);
2843	}
2844	CP(job32, *kcb, aio_reqprio);
2845	CP(job32, *kcb, _aiocb_private.status);
2846	CP(job32, *kcb, _aiocb_private.error);
2847	PTRIN_CP(job32, *kcb, _aiocb_private.kernelinfo);
2848	error = convert_sigevent32(&job32.aio_sigevent, &kcb->aio_sigevent);
2849
2850	return (error);
2851}
2852
2853static long
2854aiocb32_fetch_status(struct aiocb *ujob)
2855{
2856	struct aiocb32 *ujob32;
2857
2858	ujob32 = (struct aiocb32 *)ujob;
2859	return (fuword32(&ujob32->_aiocb_private.status));
2860}
2861
2862static long
2863aiocb32_fetch_error(struct aiocb *ujob)
2864{
2865	struct aiocb32 *ujob32;
2866
2867	ujob32 = (struct aiocb32 *)ujob;
2868	return (fuword32(&ujob32->_aiocb_private.error));
2869}
2870
2871static int
2872aiocb32_store_status(struct aiocb *ujob, long status)
2873{
2874	struct aiocb32 *ujob32;
2875
2876	ujob32 = (struct aiocb32 *)ujob;
2877	return (suword32(&ujob32->_aiocb_private.status, status));
2878}
2879
2880static int
2881aiocb32_store_error(struct aiocb *ujob, long error)
2882{
2883	struct aiocb32 *ujob32;
2884
2885	ujob32 = (struct aiocb32 *)ujob;
2886	return (suword32(&ujob32->_aiocb_private.error, error));
2887}
2888
2889static int
2890aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref)
2891{
2892	struct aiocb32 *ujob32;
2893
2894	ujob32 = (struct aiocb32 *)ujob;
2895	return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref));
2896}
2897
2898static int
2899aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
2900{
2901
2902	return (suword32(ujobp, (long)ujob));
2903}
2904
2905static struct aiocb_ops aiocb32_ops = {
2906	.aio_copyin = aiocb32_copyin,
2907	.fetch_status = aiocb32_fetch_status,
2908	.fetch_error = aiocb32_fetch_error,
2909	.store_status = aiocb32_store_status,
2910	.store_error = aiocb32_store_error,
2911	.store_kernelinfo = aiocb32_store_kernelinfo,
2912	.store_aiocb = aiocb32_store_aiocb,
2913};
2914
2915#ifdef COMPAT_FREEBSD6
2916static struct aiocb_ops aiocb32_ops_osigevent = {
2917	.aio_copyin = aiocb32_copyin_old_sigevent,
2918	.fetch_status = aiocb32_fetch_status,
2919	.fetch_error = aiocb32_fetch_error,
2920	.store_status = aiocb32_store_status,
2921	.store_error = aiocb32_store_error,
2922	.store_kernelinfo = aiocb32_store_kernelinfo,
2923	.store_aiocb = aiocb32_store_aiocb,
2924};
2925#endif
2926
2927int
2928freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap)
2929{
2930
2931	return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
2932}
2933
2934int
2935freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap)
2936{
2937	struct timespec32 ts32;
2938	struct timespec ts, *tsp;
2939	struct aiocb **ujoblist;
2940	uint32_t *ujoblist32;
2941	int error, i;
2942
2943	if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc)
2944		return (EINVAL);
2945
2946	if (uap->timeout) {
2947		/* Get timespec struct. */
2948		if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0)
2949			return (error);
2950		CP(ts32, ts, tv_sec);
2951		CP(ts32, ts, tv_nsec);
2952		tsp = &ts;
2953	} else
2954		tsp = NULL;
2955
2956	ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIO, M_WAITOK);
2957	ujoblist32 = (uint32_t *)ujoblist;
2958	error = copyin(uap->aiocbp, ujoblist32, uap->nent *
2959	    sizeof(ujoblist32[0]));
2960	if (error == 0) {
2961		for (i = uap->nent - 1; i >= 0; i--)
2962			ujoblist[i] = PTRIN(ujoblist32[i]);
2963
2964		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
2965	}
2966	free(ujoblist, M_AIO);
2967	return (error);
2968}
2969
2970int
2971freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap)
2972{
2973
2974	return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
2975}
2976
2977#ifdef COMPAT_FREEBSD6
2978int
2979freebsd6_freebsd32_aio_read(struct thread *td,
2980    struct freebsd6_freebsd32_aio_read_args *uap)
2981{
2982
2983	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
2984	    &aiocb32_ops_osigevent));
2985}
2986#endif
2987
2988int
2989freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)
2990{
2991
2992	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
2993	    &aiocb32_ops));
2994}
2995
2996int
2997freebsd32_aio_readv(struct thread *td, struct freebsd32_aio_readv_args *uap)
2998{
2999
3000	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READV,
3001	    &aiocb32_ops));
3002}
3003
3004#ifdef COMPAT_FREEBSD6
3005int
3006freebsd6_freebsd32_aio_write(struct thread *td,
3007    struct freebsd6_freebsd32_aio_write_args *uap)
3008{
3009
3010	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
3011	    &aiocb32_ops_osigevent));
3012}
3013#endif
3014
3015int
3016freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)
3017{
3018
3019	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
3020	    &aiocb32_ops));
3021}
3022
3023int
3024freebsd32_aio_writev(struct thread *td, struct freebsd32_aio_writev_args *uap)
3025{
3026
3027	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITEV,
3028	    &aiocb32_ops));
3029}
3030
3031int
3032freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap)
3033{
3034
3035	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK,
3036	    &aiocb32_ops));
3037}
3038
3039int
3040freebsd32_aio_waitcomplete(struct thread *td,
3041    struct freebsd32_aio_waitcomplete_args *uap)
3042{
3043	struct timespec32 ts32;
3044	struct timespec ts, *tsp;
3045	int error;
3046
3047	if (uap->timeout) {
3048		/* Get timespec struct. */
3049		error = copyin(uap->timeout, &ts32, sizeof(ts32));
3050		if (error)
3051			return (error);
3052		CP(ts32, ts, tv_sec);
3053		CP(ts32, ts, tv_nsec);
3054		tsp = &ts;
3055	} else
3056		tsp = NULL;
3057
3058	return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp,
3059	    &aiocb32_ops));
3060}
3061
3062int
3063freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap)
3064{
3065
3066	return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp,
3067	    &aiocb32_ops));
3068}
3069
3070#ifdef COMPAT_FREEBSD6
3071int
3072freebsd6_freebsd32_lio_listio(struct thread *td,
3073    struct freebsd6_freebsd32_lio_listio_args *uap)
3074{
3075	struct aiocb **acb_list;
3076	struct sigevent *sigp, sig;
3077	struct osigevent32 osig;
3078	uint32_t *acb_list32;
3079	int error, i, nent;
3080
3081	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
3082		return (EINVAL);
3083
3084	nent = uap->nent;
3085	if (nent < 0 || nent > max_aio_queue_per_proc)
3086		return (EINVAL);
3087
3088	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
3089		error = copyin(uap->sig, &osig, sizeof(osig));
3090		if (error)
3091			return (error);
3092		error = convert_old_sigevent32(&osig, &sig);
3093		if (error)
3094			return (error);
3095		sigp = &sig;
3096	} else
3097		sigp = NULL;
3098
3099	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
3100	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
3101	if (error) {
3102		free(acb_list32, M_LIO);
3103		return (error);
3104	}
3105	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
3106	for (i = 0; i < nent; i++)
3107		acb_list[i] = PTRIN(acb_list32[i]);
3108	free(acb_list32, M_LIO);
3109
3110	error = kern_lio_listio(td, uap->mode,
3111	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
3112	    &aiocb32_ops_osigevent);
3113	free(acb_list, M_LIO);
3114	return (error);
3115}
3116#endif
3117
3118int
3119freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap)
3120{
3121	struct aiocb **acb_list;
3122	struct sigevent *sigp, sig;
3123	struct sigevent32 sig32;
3124	uint32_t *acb_list32;
3125	int error, i, nent;
3126
3127	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
3128		return (EINVAL);
3129
3130	nent = uap->nent;
3131	if (nent < 0 || nent > max_aio_queue_per_proc)
3132		return (EINVAL);
3133
3134	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
3135		error = copyin(uap->sig, &sig32, sizeof(sig32));
3136		if (error)
3137			return (error);
3138		error = convert_sigevent32(&sig32, &sig);
3139		if (error)
3140			return (error);
3141		sigp = &sig;
3142	} else
3143		sigp = NULL;
3144
3145	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
3146	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
3147	if (error) {
3148		free(acb_list32, M_LIO);
3149		return (error);
3150	}
3151	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
3152	for (i = 0; i < nent; i++)
3153		acb_list[i] = PTRIN(acb_list32[i]);
3154	free(acb_list32, M_LIO);
3155
3156	error = kern_lio_listio(td, uap->mode,
3157	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
3158	    &aiocb32_ops);
3159	free(acb_list, M_LIO);
3160	return (error);
3161}
3162
3163#endif
3164