sys_pipe.c revision 133741
1/*
2 * Copyright (c) 1996 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice immediately at the beginning of the file, without modification,
10 *    this list of conditions, and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. Absolutely no warranty of function or purpose is made by the author
15 *    John S. Dyson.
16 * 4. Modifications may be freely made to this file if the above conditions
17 *    are met.
18 */
19
20/*
21 * This file contains a high-performance replacement for the socket-based
22 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
23 * all features of sockets, but does do everything that pipes normally
24 * do.
25 */
26
27/*
28 * This code has two modes of operation, a small write mode and a large
29 * write mode.  The small write mode acts like conventional pipes with
30 * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
31 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
32 * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
33 * the receiving process can copy it directly from the pages in the sending
34 * process.
35 *
36 * If the sending process receives a signal, it is possible that it will
37 * go away, and certainly its address space can change, because control
38 * is returned to the user-mode side.  In that case, the pipe code
39 * arranges to copy the buffer supplied by the user process, to a pageable
40 * kernel buffer, and the receiving process will grab the data from the
41 * pageable kernel buffer.  Since signals don't happen all that often,
42 * the copy operation is normally eliminated.
43 *
44 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
45 * happen for small transfers so that the system will not spend all of
46 * its time context switching.
47 *
48 * To limit the resource use of pipes, the following sysctl exists:
49 *
50 * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
51 * address space available to us in pipe_map.  Whenever the amount in use
52 * exceeds half of this value, all new pipes will be created with size
53 * SMALL_PIPE_SIZE, rather than PIPE_SIZE.  Big pipe creation will be limited
54 * as well.  This value is loader tunable only.
55 *
56 * These values are autotuned in subr_param.c.
57 *
58 * Memory usage may be monitored through the sysctls
59 * kern.ipc.pipes and kern.ipc.pipekva.
60 *
61 *
62 * Locking rules:  There are two locks present here:  A mutex, used via
63 * PIPE_LOCK, and a flag, used via pipelock().  All locking is done via
64 * the flag, as mutexes cannot be held across uiomove().  The mutex
65 * exists only to guard access to the flag, and is not in itself a
66 * locking mechanism.
67 *
68 * As pipelock() may have to sleep before it can acquire the flag, it
69 * is important to reread all data after a call to pipelock(); everything
70 * in the structure may have changed.
71 */
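/*
 * The sketch below is illustrative only and not part of the original file.
 * It shows the shape of an I/O path under the locking rules above, using
 * the PIPE_LOCK()/PIPE_UNLOCK() macros and the pipelock()/pipeunlock()
 * helpers defined later in this file; the function name is hypothetical.
 * pipe_read() and pipe_write() below follow this same pattern.
 */
#if 0
static int
pipe_io_sketch(struct pipe *cpipe, struct uio *uio)
{
	int error;

	PIPE_LOCK(cpipe);			/* mutex guards the flag */
	error = pipelock(cpipe, 1);		/* long-term flag; may sleep */
	if (error == 0) {
		/*
		 * The mutex cannot be held across uiomove(), so drop it
		 * while the flag keeps other I/O out; any state read
		 * before the drop must be reread after relocking.
		 */
		PIPE_UNLOCK(cpipe);
		error = uiomove(cpipe->pipe_buffer.buffer,
		    cpipe->pipe_buffer.cnt, uio);
		PIPE_LOCK(cpipe);
		pipeunlock(cpipe);
	}
	PIPE_UNLOCK(cpipe);
	return (error);
}
#endif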
72
73#include <sys/cdefs.h>
74__FBSDID("$FreeBSD: head/sys/kern/sys_pipe.c 133741 2004-08-15 06:24:42Z jmg $");
75
76#include "opt_mac.h"
77
78#include <sys/param.h>
79#include <sys/systm.h>
80#include <sys/fcntl.h>
81#include <sys/file.h>
82#include <sys/filedesc.h>
83#include <sys/filio.h>
84#include <sys/kernel.h>
85#include <sys/lock.h>
86#include <sys/mac.h>
87#include <sys/mutex.h>
88#include <sys/ttycom.h>
89#include <sys/stat.h>
90#include <sys/malloc.h>
91#include <sys/poll.h>
92#include <sys/selinfo.h>
93#include <sys/signalvar.h>
94#include <sys/sysctl.h>
95#include <sys/sysproto.h>
96#include <sys/pipe.h>
97#include <sys/proc.h>
98#include <sys/vnode.h>
99#include <sys/uio.h>
100#include <sys/event.h>
101
102#include <vm/vm.h>
103#include <vm/vm_param.h>
104#include <vm/vm_object.h>
105#include <vm/vm_kern.h>
106#include <vm/vm_extern.h>
107#include <vm/pmap.h>
108#include <vm/vm_map.h>
109#include <vm/vm_page.h>
110#include <vm/uma.h>
111
112/*
113 * Use this define if you want to disable *fancy* VM things.  Expect an
114 * approx 30% decrease in transfer rate.  This could be useful for
115 * NetBSD or OpenBSD.
116 */
117/* #define PIPE_NODIRECT */
118
119/*
120 * interfaces to the outside world
121 */
122static fo_rdwr_t	pipe_read;
123static fo_rdwr_t	pipe_write;
124static fo_ioctl_t	pipe_ioctl;
125static fo_poll_t	pipe_poll;
126static fo_kqfilter_t	pipe_kqfilter;
127static fo_stat_t	pipe_stat;
128static fo_close_t	pipe_close;
129
130static struct fileops pipeops = {
131	.fo_read = pipe_read,
132	.fo_write = pipe_write,
133	.fo_ioctl = pipe_ioctl,
134	.fo_poll = pipe_poll,
135	.fo_kqfilter = pipe_kqfilter,
136	.fo_stat = pipe_stat,
137	.fo_close = pipe_close,
138	.fo_flags = DFLAG_PASSABLE
139};
140
141static void	filt_pipedetach(struct knote *kn);
142static int	filt_piperead(struct knote *kn, long hint);
143static int	filt_pipewrite(struct knote *kn, long hint);
144
145static struct filterops pipe_rfiltops =
146	{ 1, NULL, filt_pipedetach, filt_piperead };
147static struct filterops pipe_wfiltops =
148	{ 1, NULL, filt_pipedetach, filt_pipewrite };
149
150/*
151 * Default pipe buffer sizes; these can be fairly large now because pipe
152 * space is pageable.  The pipe code will try to maintain locality of
153 * reference for performance reasons, so small amounts of outstanding I/O
154 * will not wipe the cache.
155 */
156#define MINPIPESIZE (PIPE_SIZE/3)
157#define MAXPIPESIZE (2*PIPE_SIZE/3)
158
159/*
160 * Limit the number of "big" pipes
161 */
162#define LIMITBIGPIPES	32
163static int nbigpipe;
164
165static int amountpipes;
166static int amountpipekva;
167
168SYSCTL_DECL(_kern_ipc);
169
170SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
171	   &maxpipekva, 0, "Pipe KVA limit");
172SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD,
173	   &amountpipes, 0, "Current # of pipes");
174SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD,
175	   &nbigpipe, 0, "Current # of big pipes");
176SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
177	   &amountpipekva, 0, "Pipe KVA usage");
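/*
 * Illustrative userland sketch (not part of the original file, and kept
 * under #if 0 since it is not kernel code): reading the monitoring
 * sysctls declared above with sysctlbyname(3).
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int pipes, kva;
	size_t len;

	len = sizeof(pipes);
	if (sysctlbyname("kern.ipc.pipes", &pipes, &len, NULL, 0) == -1)
		return (1);
	len = sizeof(kva);
	if (sysctlbyname("kern.ipc.pipekva", &kva, &len, NULL, 0) == -1)
		return (1);
	printf("%d pipes, %d bytes of pipe KVA in use\n", pipes, kva);
	return (0);
}
#endif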
178
179static void pipeinit(void *dummy __unused);
180static void pipeclose(struct pipe *cpipe);
181static void pipe_free_kmem(struct pipe *cpipe);
182static int pipe_create(struct pipe *pipe);
183static __inline int pipelock(struct pipe *cpipe, int catch);
184static __inline void pipeunlock(struct pipe *cpipe);
185static __inline void pipeselwakeup(struct pipe *cpipe);
186#ifndef PIPE_NODIRECT
187static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
188static void pipe_destroy_write_buffer(struct pipe *wpipe);
189static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
190static void pipe_clone_write_buffer(struct pipe *wpipe);
191#endif
192static int pipespace(struct pipe *cpipe, int size);
193static int pipespace_new(struct pipe *cpipe, int size);
194
195static int	pipe_zone_ctor(void *mem, int size, void *arg, int flags);
196static void	pipe_zone_dtor(void *mem, int size, void *arg);
197static int	pipe_zone_init(void *mem, int size, int flags);
198static void	pipe_zone_fini(void *mem, int size);
199
200static uma_zone_t pipe_zone;
201
202SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
203
204static void
205pipeinit(void *dummy __unused)
206{
207
208	pipe_zone = uma_zcreate("PIPE", sizeof(struct pipepair),
209	    pipe_zone_ctor, pipe_zone_dtor, pipe_zone_init, pipe_zone_fini,
210	    UMA_ALIGN_PTR, 0);
211	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
212}
213
214static int
215pipe_zone_ctor(void *mem, int size, void *arg, int flags)
216{
217	struct pipepair *pp;
218	struct pipe *rpipe, *wpipe;
219
220	KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));
221
222	pp = (struct pipepair *)mem;
223
224	/*
225	 * We zero both pipe endpoints to make sure all the kmem pointers
226	 * are NULL, flag fields are zero'd, etc.  We timestamp both
227	 * endpoints with the same time.
228	 */
229	rpipe = &pp->pp_rpipe;
230	bzero(rpipe, sizeof(*rpipe));
231	vfs_timestamp(&rpipe->pipe_ctime);
232	rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;
233
234	wpipe = &pp->pp_wpipe;
235	bzero(wpipe, sizeof(*wpipe));
236	wpipe->pipe_ctime = rpipe->pipe_ctime;
237	wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;
238
239	rpipe->pipe_peer = wpipe;
240	rpipe->pipe_pair = pp;
241	wpipe->pipe_peer = rpipe;
242	wpipe->pipe_pair = pp;
243
244	/*
245	 * Mark both endpoints as present; they will later get free'd
246	 * one at a time.  When both are free'd, then the whole pair
247	 * is released.
248	 */
249	rpipe->pipe_present = 1;
250	wpipe->pipe_present = 1;
251
252	/*
253	 * Eventually, the MAC Framework may initialize the label
254 * in ctor or init, but for now we do it elsewhere to avoid
255	 * blocking in ctor or init.
256	 */
257	pp->pp_label = NULL;
258
259	atomic_add_int(&amountpipes, 2);
260	return (0);
261}
262
263static void
264pipe_zone_dtor(void *mem, int size, void *arg)
265{
266	struct pipepair *pp;
267
268	KASSERT(size == sizeof(*pp), ("pipe_zone_dtor: wrong size"));
269
270	pp = (struct pipepair *)mem;
271
272	atomic_subtract_int(&amountpipes, 2);
273}
274
275static int
276pipe_zone_init(void *mem, int size, int flags)
277{
278	struct pipepair *pp;
279
280	KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));
281
282	pp = (struct pipepair *)mem;
283
284	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
285	return (0);
286}
287
288static void
289pipe_zone_fini(void *mem, int size)
290{
291	struct pipepair *pp;
292
293	KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));
294
295	pp = (struct pipepair *)mem;
296
297	mtx_destroy(&pp->pp_mtx);
298}
299
300/*
301 * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail,
302 * let the zone pick up the pieces via pipeclose().
303 */
304
305/* ARGSUSED */
306int
307pipe(td, uap)
308	struct thread *td;
309	struct pipe_args /* {
310		int	dummy;
311	} */ *uap;
312{
313	struct filedesc *fdp = td->td_proc->p_fd;
314	struct file *rf, *wf;
315	struct pipepair *pp;
316	struct pipe *rpipe, *wpipe;
317	int fd, error;
318
319	pp = uma_zalloc(pipe_zone, M_WAITOK);
320#ifdef MAC
321	/*
322	 * The MAC label is shared between the connected endpoints.  As a
323	 * result mac_init_pipe() and mac_create_pipe() are called once
324	 * for the pair, and not on the endpoints.
325	 */
326	mac_init_pipe(pp);
327	mac_create_pipe(td->td_ucred, pp);
328#endif
329	rpipe = &pp->pp_rpipe;
330	wpipe = &pp->pp_wpipe;
331
332	if (pipe_create(rpipe) || pipe_create(wpipe)) {
333		pipeclose(rpipe);
334		pipeclose(wpipe);
335		return (ENFILE);
336	}
337
338	rpipe->pipe_state |= PIPE_DIRECTOK;
339	wpipe->pipe_state |= PIPE_DIRECTOK;
340
341	error = falloc(td, &rf, &fd);
342	if (error) {
343		pipeclose(rpipe);
344		pipeclose(wpipe);
345		return (error);
346	}
347	/* An extra reference on `rf' has been held for us by falloc(). */
348	td->td_retval[0] = fd;
349
350	/*
351	 * Warning: once we've gotten past allocation of the fd for the
352	 * read-side, we can only drop the read side via fdrop() in order
353	 * to avoid races against processes which manage to dup() the read
354	 * side while we are blocked trying to allocate the write side.
355	 */
356	FILE_LOCK(rf);
357	rf->f_flag = FREAD | FWRITE;
358	rf->f_type = DTYPE_PIPE;
359	rf->f_data = rpipe;
360	rf->f_ops = &pipeops;
361	FILE_UNLOCK(rf);
362	error = falloc(td, &wf, &fd);
363	if (error) {
364		FILEDESC_LOCK(fdp);
365		if (fdp->fd_ofiles[td->td_retval[0]] == rf) {
366			fdp->fd_ofiles[td->td_retval[0]] = NULL;
367			fdunused(fdp, td->td_retval[0]);
368			FILEDESC_UNLOCK(fdp);
369			fdrop(rf, td);
370		} else {
371			FILEDESC_UNLOCK(fdp);
372		}
373		fdrop(rf, td);
374		/* rpipe has been closed by fdrop(). */
375		pipeclose(wpipe);
376		return (error);
377	}
378	/* An extra reference on `wf' has been held for us by falloc(). */
379	FILE_LOCK(wf);
380	wf->f_flag = FREAD | FWRITE;
381	wf->f_type = DTYPE_PIPE;
382	wf->f_data = wpipe;
383	wf->f_ops = &pipeops;
384	FILE_UNLOCK(wf);
385	fdrop(wf, td);
386	td->td_retval[1] = fd;
387	fdrop(rf, td);
388
389	return (0);
390}
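/*
 * Illustrative userland sketch (not part of the original file): the
 * conventional use of the pipe(2) call implemented above.  fd[0] is the
 * descriptor installed for rpipe and fd[1] the one for wpipe, though on
 * FreeBSD both endpoints are opened FREAD | FWRITE as seen above.
 */
#if 0
#include <sys/wait.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[64];
	ssize_t n;
	int fd[2];

	if (pipe(fd) == -1)
		return (1);
	if (fork() == 0) {
		/* Child: write to fd[1] (the wpipe endpoint) and exit. */
		close(fd[0]);
		write(fd[1], "hello", 5);
		_exit(0);
	}
	/* Parent: read from fd[0] (the rpipe endpoint). */
	close(fd[1]);
	n = read(fd[0], buf, sizeof(buf));
	if (n > 0)
		printf("%.*s\n", (int)n, buf);
	wait(NULL);
	return (0);
}
#endif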
391
392/*
393 * Allocate kva for the pipe circular buffer; the space is pageable.
394 * This routine will 'realloc' the size of a pipe safely: if it fails,
395 * it will retain the old buffer and return ENOMEM.
397 */
398static int
399pipespace_new(cpipe, size)
400	struct pipe *cpipe;
401	int size;
402{
403	caddr_t buffer;
404	int error;
405	static int curfail = 0;
406	static struct timeval lastfail;
407
408	KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
409
410	size = round_page(size);
411	/*
412	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
413	 */
414	buffer = (caddr_t) vm_map_min(pipe_map);
415
416	/*
417	 * The map entry is, by default, pageable.
418	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
419	 */
420	error = vm_map_find(pipe_map, NULL, 0,
421		(vm_offset_t *) &buffer, size, 1,
422		VM_PROT_ALL, VM_PROT_ALL, 0);
423	if (error != KERN_SUCCESS) {
424		if (ppsratecheck(&lastfail, &curfail, 1))
425			printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
426		return (ENOMEM);
427	}
428
429	/* free old resources if we're resizing */
430	pipe_free_kmem(cpipe);
431	cpipe->pipe_buffer.buffer = buffer;
432	cpipe->pipe_buffer.size = size;
433	cpipe->pipe_buffer.in = 0;
434	cpipe->pipe_buffer.out = 0;
435	cpipe->pipe_buffer.cnt = 0;
436	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);
437	return (0);
438}
439
440/*
441 * Wrapper for pipespace_new() that performs locking assertions.
442 */
443static int
444pipespace(cpipe, size)
445	struct pipe *cpipe;
446	int size;
447{
448
449	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
450		("Unlocked pipe passed to pipespace"));
451	return (pipespace_new(cpipe, size));
452}
453
454/*
455 * lock a pipe for I/O, blocking other access
456 */
457static __inline int
458pipelock(cpipe, catch)
459	struct pipe *cpipe;
460	int catch;
461{
462	int error;
463
464	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
465	while (cpipe->pipe_state & PIPE_LOCKFL) {
466		cpipe->pipe_state |= PIPE_LWANT;
467		error = msleep(cpipe, PIPE_MTX(cpipe),
468		    catch ? (PRIBIO | PCATCH) : PRIBIO,
469		    "pipelk", 0);
470		if (error != 0)
471			return (error);
472	}
473	cpipe->pipe_state |= PIPE_LOCKFL;
474	return (0);
475}
476
477/*
478 * unlock a pipe I/O lock
479 */
480static __inline void
481pipeunlock(cpipe)
482	struct pipe *cpipe;
483{
484
485	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
486	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
487		("Unlocked pipe passed to pipeunlock"));
488	cpipe->pipe_state &= ~PIPE_LOCKFL;
489	if (cpipe->pipe_state & PIPE_LWANT) {
490		cpipe->pipe_state &= ~PIPE_LWANT;
491		wakeup(cpipe);
492	}
493}
494
495static __inline void
496pipeselwakeup(cpipe)
497	struct pipe *cpipe;
498{
499
500	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
501	if (cpipe->pipe_state & PIPE_SEL) {
502		cpipe->pipe_state &= ~PIPE_SEL;
503		selwakeuppri(&cpipe->pipe_sel, PSOCK);
504	}
505	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
506		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
507	KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0);
508}
509
510/*
511 * Initialize and allocate VM and memory for pipe.  The structure
512 * will start out zero'd from the ctor, so we just manage the kmem.
513 */
514static int
515pipe_create(pipe)
516	struct pipe *pipe;
517{
518	int error;
519
520	/*
521	 * Reduce to 1/4th pipe size if we're over our global max.
522	 */
523	if (amountpipekva > maxpipekva / 2)
524		error = pipespace_new(pipe, SMALL_PIPE_SIZE);
525	else
526		error = pipespace_new(pipe, PIPE_SIZE);
527	knlist_init(&pipe->pipe_sel.si_note, PIPE_MTX(pipe));
528	return (error);
529}
530
531/* ARGSUSED */
532static int
533pipe_read(fp, uio, active_cred, flags, td)
534	struct file *fp;
535	struct uio *uio;
536	struct ucred *active_cred;
537	struct thread *td;
538	int flags;
539{
540	struct pipe *rpipe = fp->f_data;
541	int error;
542	int nread = 0;
543	u_int size;
544
545	PIPE_LOCK(rpipe);
546	++rpipe->pipe_busy;
547	error = pipelock(rpipe, 1);
548	if (error)
549		goto unlocked_error;
550
551#ifdef MAC
552	error = mac_check_pipe_read(active_cred, rpipe->pipe_pair);
553	if (error)
554		goto locked_error;
555#endif
556
557	while (uio->uio_resid) {
558		/*
559		 * normal pipe buffer receive
560		 */
561		if (rpipe->pipe_buffer.cnt > 0) {
562			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
563			if (size > rpipe->pipe_buffer.cnt)
564				size = rpipe->pipe_buffer.cnt;
565			if (size > (u_int) uio->uio_resid)
566				size = (u_int) uio->uio_resid;
567
568			PIPE_UNLOCK(rpipe);
569			error = uiomove(
570			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
571			    size, uio);
572			PIPE_LOCK(rpipe);
573			if (error)
574				break;
575
576			rpipe->pipe_buffer.out += size;
577			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
578				rpipe->pipe_buffer.out = 0;
579
580			rpipe->pipe_buffer.cnt -= size;
581
582			/*
583			 * If there is no more to read in the pipe, reset
584			 * its pointers to the beginning.  This improves
585			 * cache hit stats.
586			 */
587			if (rpipe->pipe_buffer.cnt == 0) {
588				rpipe->pipe_buffer.in = 0;
589				rpipe->pipe_buffer.out = 0;
590			}
591			nread += size;
592#ifndef PIPE_NODIRECT
593		/*
594		 * Direct copy, bypassing a kernel buffer.
595		 */
596		} else if ((size = rpipe->pipe_map.cnt) &&
597			   (rpipe->pipe_state & PIPE_DIRECTW)) {
598			if (size > (u_int) uio->uio_resid)
599				size = (u_int) uio->uio_resid;
600
601			PIPE_UNLOCK(rpipe);
602			error = uiomove_fromphys(rpipe->pipe_map.ms,
603			    rpipe->pipe_map.pos, size, uio);
604			PIPE_LOCK(rpipe);
605			if (error)
606				break;
607			nread += size;
608			rpipe->pipe_map.pos += size;
609			rpipe->pipe_map.cnt -= size;
610			if (rpipe->pipe_map.cnt == 0) {
611				rpipe->pipe_state &= ~PIPE_DIRECTW;
612				wakeup(rpipe);
613			}
614#endif
615		} else {
616			/*
617			 * detect EOF condition
618			 * read returns 0 on EOF, no need to set error
619			 */
620			if (rpipe->pipe_state & PIPE_EOF)
621				break;
622
623			/*
624			 * If the "write-side" has been blocked, wake it up now.
625			 */
626			if (rpipe->pipe_state & PIPE_WANTW) {
627				rpipe->pipe_state &= ~PIPE_WANTW;
628				wakeup(rpipe);
629			}
630
631			/*
632			 * Break if some data was read.
633			 */
634			if (nread > 0)
635				break;
636
637			/*
638			 * Unlock the pipe buffer for our remaining processing.
639			 * We will either break out with an error or we will
640			 * sleep and relock to loop.
641			 */
642			pipeunlock(rpipe);
643
644			/*
645			 * Handle non-blocking mode operation or
646			 * wait for more data.
647			 */
648			if (fp->f_flag & FNONBLOCK) {
649				error = EAGAIN;
650			} else {
651				rpipe->pipe_state |= PIPE_WANTR;
652				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
653				    PRIBIO | PCATCH,
654				    "piperd", 0)) == 0)
655					error = pipelock(rpipe, 1);
656			}
657			if (error)
658				goto unlocked_error;
659		}
660	}
661#ifdef MAC
662locked_error:
663#endif
664	pipeunlock(rpipe);
665
666	/* XXX: should probably do this before getting any locks. */
667	if (error == 0)
668		vfs_timestamp(&rpipe->pipe_atime);
669unlocked_error:
670	--rpipe->pipe_busy;
671
672	/*
673	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
674	 */
675	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
676		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
677		wakeup(rpipe);
678	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
679		/*
680		 * Handle write blocking hysteresis.
681		 */
682		if (rpipe->pipe_state & PIPE_WANTW) {
683			rpipe->pipe_state &= ~PIPE_WANTW;
684			wakeup(rpipe);
685		}
686	}
687
688	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
689		pipeselwakeup(rpipe);
690
691	PIPE_UNLOCK(rpipe);
692	return (error);
693}
694
695#ifndef PIPE_NODIRECT
696/*
697 * Map the sending process's buffer into kernel space and wire it.
698 * This is similar to a physical write operation.
699 */
700static int
701pipe_build_write_buffer(wpipe, uio)
702	struct pipe *wpipe;
703	struct uio *uio;
704{
705	pmap_t pmap;
706	u_int size;
707	int i, j;
708	vm_offset_t addr, endaddr;
709
710	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
711
712	size = (u_int) uio->uio_iov->iov_len;
713	if (size > wpipe->pipe_buffer.size)
714		size = wpipe->pipe_buffer.size;
715
716	pmap = vmspace_pmap(curproc->p_vmspace);
717	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
718	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
719	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
720		/*
721		 * vm_fault_quick() can sleep.  Consequently,
722		 * vm_page_lock_queues() and vm_page_unlock_queues()
723		 * should not be performed outside of this loop.
724		 */
725	race:
726		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) {
727			vm_page_lock_queues();
728			for (j = 0; j < i; j++)
729				vm_page_unhold(wpipe->pipe_map.ms[j]);
730			vm_page_unlock_queues();
731			return (EFAULT);
732		}
733		wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr,
734		    VM_PROT_READ);
735		if (wpipe->pipe_map.ms[i] == NULL)
736			goto race;
737	}
738
739/*
740 * set up the control block
741 */
742	wpipe->pipe_map.npages = i;
743	wpipe->pipe_map.pos =
744	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
745	wpipe->pipe_map.cnt = size;
746
747/*
748 * and update the uio data
749 */
750
751	uio->uio_iov->iov_len -= size;
752	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
753	if (uio->uio_iov->iov_len == 0)
754		uio->uio_iov++;
755	uio->uio_resid -= size;
756	uio->uio_offset += size;
757	return (0);
758}
759
760/*
761 * unmap and unwire the process buffer
762 */
763static void
764pipe_destroy_write_buffer(wpipe)
765	struct pipe *wpipe;
766{
767	int i;
768
769	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
770	vm_page_lock_queues();
771	for (i = 0; i < wpipe->pipe_map.npages; i++) {
772		vm_page_unhold(wpipe->pipe_map.ms[i]);
773	}
774	vm_page_unlock_queues();
775	wpipe->pipe_map.npages = 0;
776}
777
778/*
779 * In the case of a signal, the writing process might go away.  This
780 * code copies the data into the circular buffer so that the source
781 * pages can be freed without loss of data.
782 */
783static void
784pipe_clone_write_buffer(wpipe)
785	struct pipe *wpipe;
786{
787	struct uio uio;
788	struct iovec iov;
789	int size;
790	int pos;
791
792	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
793	size = wpipe->pipe_map.cnt;
794	pos = wpipe->pipe_map.pos;
795
796	wpipe->pipe_buffer.in = size;
797	wpipe->pipe_buffer.out = 0;
798	wpipe->pipe_buffer.cnt = size;
799	wpipe->pipe_state &= ~PIPE_DIRECTW;
800
801	PIPE_UNLOCK(wpipe);
802	iov.iov_base = wpipe->pipe_buffer.buffer;
803	iov.iov_len = size;
804	uio.uio_iov = &iov;
805	uio.uio_iovcnt = 1;
806	uio.uio_offset = 0;
807	uio.uio_resid = size;
808	uio.uio_segflg = UIO_SYSSPACE;
809	uio.uio_rw = UIO_READ;
810	uio.uio_td = curthread;
811	uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio);
812	PIPE_LOCK(wpipe);
813	pipe_destroy_write_buffer(wpipe);
814}
815
816/*
817 * This implements the pipe buffer write mechanism.  Note that only
818 * a direct write OR a normal pipe write can be pending at any given time.
819 * If there are any characters in the pipe buffer, the direct write will
820 * be deferred until the receiving process grabs all of the bytes from
821 * the pipe buffer.  Then the direct mapping write is set up.
822 */
823static int
824pipe_direct_write(wpipe, uio)
825	struct pipe *wpipe;
826	struct uio *uio;
827{
828	int error;
829
830retry:
831	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
832	error = pipelock(wpipe, 1);
833	if (wpipe->pipe_state & PIPE_EOF)
834		error = EPIPE;
835	if (error) {
836		pipeunlock(wpipe);
837		goto error1;
838	}
839	while (wpipe->pipe_state & PIPE_DIRECTW) {
840		if (wpipe->pipe_state & PIPE_WANTR) {
841			wpipe->pipe_state &= ~PIPE_WANTR;
842			wakeup(wpipe);
843		}
844		wpipe->pipe_state |= PIPE_WANTW;
845		pipeunlock(wpipe);
846		error = msleep(wpipe, PIPE_MTX(wpipe),
847		    PRIBIO | PCATCH, "pipdww", 0);
848		if (error)
849			goto error1;
850		else
851			goto retry;
852	}
853	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
854	if (wpipe->pipe_buffer.cnt > 0) {
855		if (wpipe->pipe_state & PIPE_WANTR) {
856			wpipe->pipe_state &= ~PIPE_WANTR;
857			wakeup(wpipe);
858		}
859		wpipe->pipe_state |= PIPE_WANTW;
860		pipeunlock(wpipe);
861		error = msleep(wpipe, PIPE_MTX(wpipe),
862		    PRIBIO | PCATCH, "pipdwc", 0);
863		if (error)
864			goto error1;
865		else
866			goto retry;
867	}
868
869	wpipe->pipe_state |= PIPE_DIRECTW;
870
871	PIPE_UNLOCK(wpipe);
872	error = pipe_build_write_buffer(wpipe, uio);
873	PIPE_LOCK(wpipe);
874	if (error) {
875		wpipe->pipe_state &= ~PIPE_DIRECTW;
876		pipeunlock(wpipe);
877		goto error1;
878	}
879
880	error = 0;
881	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
882		if (wpipe->pipe_state & PIPE_EOF) {
883			pipe_destroy_write_buffer(wpipe);
884			pipeselwakeup(wpipe);
885			pipeunlock(wpipe);
886			error = EPIPE;
887			goto error1;
888		}
889		if (wpipe->pipe_state & PIPE_WANTR) {
890			wpipe->pipe_state &= ~PIPE_WANTR;
891			wakeup(wpipe);
892		}
893		pipeselwakeup(wpipe);
894		pipeunlock(wpipe);
895		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
896		    "pipdwt", 0);
897		pipelock(wpipe, 0);
898	}
899
900	if (wpipe->pipe_state & PIPE_EOF)
901		error = EPIPE;
902	if (wpipe->pipe_state & PIPE_DIRECTW) {
903		/*
904		 * this bit of trickery substitutes a kernel buffer for
905		 * the process that might be going away.
906		 */
907		pipe_clone_write_buffer(wpipe);
908	} else {
909		pipe_destroy_write_buffer(wpipe);
910	}
911	pipeunlock(wpipe);
912	return (error);
913
914error1:
915	wakeup(wpipe);
916	return (error);
917}
918#endif
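/*
 * Illustrative userland sketch (not part of the original file): only a
 * sufficiently large blocking write is a candidate for the direct path
 * above; the 64 KiB figure is just an example, the real threshold being
 * PIPE_MINDIRECT from <sys/pipe.h>.  Small writes always go through the
 * kernel buffer.
 */
#if 0
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static void
writer(int fd)
{
	char small[16];
	char *big;

	big = malloc(64 * 1024);
	if (big == NULL)
		return;
	memset(big, 'A', 64 * 1024);
	(void)write(fd, big, 64 * 1024);	/* large: may use direct write */
	memset(small, 'B', sizeof(small));
	(void)write(fd, small, sizeof(small));	/* small: buffered copy */
	free(big);
}
#endif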
919
920static int
921pipe_write(fp, uio, active_cred, flags, td)
922	struct file *fp;
923	struct uio *uio;
924	struct ucred *active_cred;
925	struct thread *td;
926	int flags;
927{
928	int error = 0;
929	int orig_resid;
930	struct pipe *wpipe, *rpipe;
931
932	rpipe = fp->f_data;
933	wpipe = rpipe->pipe_peer;
934
935	PIPE_LOCK(rpipe);
936	error = pipelock(wpipe, 1);
937	if (error) {
938		PIPE_UNLOCK(rpipe);
939		return (error);
940	}
941	/*
942	 * detect loss of pipe read side, issue SIGPIPE if lost.
943	 */
944	if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
945		pipeunlock(wpipe);
946		PIPE_UNLOCK(rpipe);
947		return (EPIPE);
948	}
949#ifdef MAC
950	error = mac_check_pipe_write(active_cred, wpipe->pipe_pair);
951	if (error) {
952		pipeunlock(wpipe);
953		PIPE_UNLOCK(rpipe);
954		return (error);
955	}
956#endif
957	++wpipe->pipe_busy;
958
959	/*
960	 * If it is advantageous to resize the pipe buffer, do
961	 * so.
962	 */
963	if ((uio->uio_resid > PIPE_SIZE) &&
964		(amountpipekva < maxpipekva / 2) &&
965		(nbigpipe < LIMITBIGPIPES) &&
966		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
967		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
968		(wpipe->pipe_buffer.cnt == 0)) {
969
970		PIPE_UNLOCK(wpipe);
971		if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
972			atomic_add_int(&nbigpipe, 1);
973		PIPE_LOCK(wpipe);
974	}
975
976	pipeunlock(wpipe);
977
978	orig_resid = uio->uio_resid;
979
980	while (uio->uio_resid) {
981		int space;
982
983		pipelock(wpipe, 0);
984		if (wpipe->pipe_state & PIPE_EOF) {
985			pipeunlock(wpipe);
986			error = EPIPE;
987			break;
988		}
989#ifndef PIPE_NODIRECT
990		/*
991		 * If the transfer is large, we can gain performance if
992		 * we do process-to-process copies directly.
993		 * If the write is non-blocking, we don't use the
994		 * direct write mechanism.
995		 *
996		 * The direct write mechanism will detect the reader going
997		 * away on us.
998		 */
999		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
1000		    (fp->f_flag & FNONBLOCK) == 0) {
1001			pipeunlock(wpipe);
1002			error = pipe_direct_write(wpipe, uio);
1003			if (error)
1004				break;
1005			continue;
1006		}
1007#endif
1008
1009		/*
1010		 * Pipe buffered writes cannot be coincident with
1011		 * direct writes.  We wait until the currently executing
1012		 * direct write is completed before we start filling the
1013		 * pipe buffer.  We break out if a signal occurs or the
1014		 * reader goes away.
1015		 */
1016		if (wpipe->pipe_state & PIPE_DIRECTW) {
1017			if (wpipe->pipe_state & PIPE_WANTR) {
1018				wpipe->pipe_state &= ~PIPE_WANTR;
1019				wakeup(wpipe);
1020			}
1021			pipeunlock(wpipe);
1022			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
1023			    "pipbww", 0);
1024			if (error)
1025				break;
1026			else
1027				continue;
1028		}
1029
1030		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1031
1032		/* Writes of size <= PIPE_BUF must be atomic. */
1033		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
1034			space = 0;
1035
1036		if (space > 0) {
1037			int size;	/* Transfer size */
1038			int segsize;	/* first segment to transfer */
1039
1040			/*
1041			 * Transfer size is minimum of uio transfer
1042			 * and free space in pipe buffer.
1043			 */
1044			if (space > uio->uio_resid)
1045				size = uio->uio_resid;
1046			else
1047				size = space;
1048			/*
1049			 * First segment to transfer is minimum of
1050			 * transfer size and contiguous space in
1051			 * pipe buffer.  If first segment to transfer
1052			 * is less than the transfer size, we've got
1053			 * a wraparound in the buffer.
1054			 */
1055			segsize = wpipe->pipe_buffer.size -
1056				wpipe->pipe_buffer.in;
1057			if (segsize > size)
1058				segsize = size;
1059
1060			/* Transfer first segment */
1061
1062			PIPE_UNLOCK(rpipe);
1063			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
1064					segsize, uio);
1065			PIPE_LOCK(rpipe);
1066
1067			if (error == 0 && segsize < size) {
1068				KASSERT(wpipe->pipe_buffer.in + segsize ==
1069					wpipe->pipe_buffer.size,
1070					("Pipe buffer wraparound disappeared"));
1071				/*
1072				 * Transfer remaining part now, to
1073				 * support atomic writes.  Wraparound
1074				 * happened.
1075				 */
1076
1077				PIPE_UNLOCK(rpipe);
1078				error = uiomove(
1079				    &wpipe->pipe_buffer.buffer[0],
1080				    size - segsize, uio);
1081				PIPE_LOCK(rpipe);
1082			}
1083			if (error == 0) {
1084				wpipe->pipe_buffer.in += size;
1085				if (wpipe->pipe_buffer.in >=
1086				    wpipe->pipe_buffer.size) {
1087					KASSERT(wpipe->pipe_buffer.in ==
1088						size - segsize +
1089						wpipe->pipe_buffer.size,
1090						("Expected wraparound bad"));
1091					wpipe->pipe_buffer.in = size - segsize;
1092				}
1093
1094				wpipe->pipe_buffer.cnt += size;
1095				KASSERT(wpipe->pipe_buffer.cnt <=
1096					wpipe->pipe_buffer.size,
1097					("Pipe buffer overflow"));
1098			}
1099			pipeunlock(wpipe);
1100		} else {
1101			/*
1102			 * If the "read-side" has been blocked, wake it up now.
1103			 */
1104			if (wpipe->pipe_state & PIPE_WANTR) {
1105				wpipe->pipe_state &= ~PIPE_WANTR;
1106				wakeup(wpipe);
1107			}
1108
1109			/*
1110			 * don't block on non-blocking I/O
1111			 */
1112			if (fp->f_flag & FNONBLOCK) {
1113				error = EAGAIN;
1114				pipeunlock(wpipe);
1115				break;
1116			}
1117
1118			/*
1119			 * We have no more space and have something to offer,
1120			 * wake up select/poll.
1121			 */
1122			pipeselwakeup(wpipe);
1123
1124			wpipe->pipe_state |= PIPE_WANTW;
1125			pipeunlock(wpipe);
1126			error = msleep(wpipe, PIPE_MTX(rpipe),
1127			    PRIBIO | PCATCH, "pipewr", 0);
1128			if (error != 0)
1129				break;
1130		}
1131	}
1132
1133	pipelock(wpipe, 0);
1134	--wpipe->pipe_busy;
1135
1136	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
1137		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
1138		wakeup(wpipe);
1139	} else if (wpipe->pipe_buffer.cnt > 0) {
1140		/*
1141		 * If we have put any characters in the buffer, we wake up
1142		 * the reader.
1143		 */
1144		if (wpipe->pipe_state & PIPE_WANTR) {
1145			wpipe->pipe_state &= ~PIPE_WANTR;
1146			wakeup(wpipe);
1147		}
1148	}
1149
1150	/*
1151	 * Don't return EPIPE if I/O was successful
1152	 */
1153	if ((wpipe->pipe_buffer.cnt == 0) &&
1154	    (uio->uio_resid == 0) &&
1155	    (error == EPIPE)) {
1156		error = 0;
1157	}
1158
1159	if (error == 0)
1160		vfs_timestamp(&wpipe->pipe_mtime);
1161
1162	/*
1163	 * We have something to offer,
1164	 * wake up select/poll.
1165	 */
1166	if (wpipe->pipe_buffer.cnt)
1167		pipeselwakeup(wpipe);
1168
1169	pipeunlock(wpipe);
1170	PIPE_UNLOCK(rpipe);
1171	return (error);
1172}
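/*
 * Illustrative userland sketch (not part of the original file): the
 * "space = 0" logic above makes writes of at most PIPE_BUF bytes atomic,
 * so fixed-size records from concurrent writers are never interleaved.
 * write_record() is a hypothetical helper, not an interface of this file.
 */
#if 0
#include <limits.h>		/* PIPE_BUF */
#include <string.h>
#include <unistd.h>

static void
write_record(int fd, const char *msg)
{
	char rec[PIPE_BUF];
	size_t len;

	len = strlen(msg);
	if (len > sizeof(rec))
		len = sizeof(rec);
	memcpy(rec, msg, len);
	/* At most PIPE_BUF bytes: delivered as one contiguous chunk. */
	(void)write(fd, rec, len);
}
#endif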
1173
1174/*
1175 * we implement a very minimal set of ioctls for compatibility with sockets.
1176 */
1177static int
1178pipe_ioctl(fp, cmd, data, active_cred, td)
1179	struct file *fp;
1180	u_long cmd;
1181	void *data;
1182	struct ucred *active_cred;
1183	struct thread *td;
1184{
1185	struct pipe *mpipe = fp->f_data;
1186#ifdef MAC
1187	int error;
1188#endif
1189
1190	PIPE_LOCK(mpipe);
1191
1192#ifdef MAC
1193	error = mac_check_pipe_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
1194	if (error) {
1195		PIPE_UNLOCK(mpipe);
1196		return (error);
1197	}
1198#endif
1199
1200	switch (cmd) {
1201
1202	case FIONBIO:
1203		PIPE_UNLOCK(mpipe);
1204		return (0);
1205
1206	case FIOASYNC:
1207		if (*(int *)data) {
1208			mpipe->pipe_state |= PIPE_ASYNC;
1209		} else {
1210			mpipe->pipe_state &= ~PIPE_ASYNC;
1211		}
1212		PIPE_UNLOCK(mpipe);
1213		return (0);
1214
1215	case FIONREAD:
1216		if (mpipe->pipe_state & PIPE_DIRECTW)
1217			*(int *)data = mpipe->pipe_map.cnt;
1218		else
1219			*(int *)data = mpipe->pipe_buffer.cnt;
1220		PIPE_UNLOCK(mpipe);
1221		return (0);
1222
1223	case FIOSETOWN:
1224		PIPE_UNLOCK(mpipe);
1225		return (fsetown(*(int *)data, &mpipe->pipe_sigio));
1226
1227	case FIOGETOWN:
1228		PIPE_UNLOCK(mpipe);
1229		*(int *)data = fgetown(&mpipe->pipe_sigio);
1230		return (0);
1231
1232	/* This is deprecated, FIOSETOWN should be used instead. */
1233	case TIOCSPGRP:
1234		PIPE_UNLOCK(mpipe);
1235		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
1236
1237	/* This is deprecated, FIOGETOWN should be used instead. */
1238	case TIOCGPGRP:
1239		PIPE_UNLOCK(mpipe);
1240		*(int *)data = -fgetown(&mpipe->pipe_sigio);
1241		return (0);
1242
1243	}
1244	PIPE_UNLOCK(mpipe);
1245	return (ENOTTY);
1246}
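/*
 * Illustrative userland sketch (not part of the original file): two of
 * the ioctls handled by pipe_ioctl() above.
 */
#if 0
#include <sys/ioctl.h>
#include <sys/filio.h>
#include <stdio.h>

static void
pipe_ioctl_demo(int fd)
{
	int nread, on = 1;

	if (ioctl(fd, FIONREAD, &nread) == 0)	/* bytes waiting to be read */
		printf("%d bytes pending\n", nread);
	(void)ioctl(fd, FIOASYNC, &on);		/* request SIGIO delivery */
}
#endif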
1247
1248static int
1249pipe_poll(fp, events, active_cred, td)
1250	struct file *fp;
1251	int events;
1252	struct ucred *active_cred;
1253	struct thread *td;
1254{
1255	struct pipe *rpipe = fp->f_data;
1256	struct pipe *wpipe;
1257	int revents = 0;
1258#ifdef MAC
1259	int error;
1260#endif
1261
1262	wpipe = rpipe->pipe_peer;
1263	PIPE_LOCK(rpipe);
1264#ifdef MAC
1265	error = mac_check_pipe_poll(active_cred, rpipe->pipe_pair);
1266	if (error)
1267		goto locked_error;
1268#endif
1269	if (events & (POLLIN | POLLRDNORM))
1270		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
1271		    (rpipe->pipe_buffer.cnt > 0) ||
1272		    (rpipe->pipe_state & PIPE_EOF))
1273			revents |= events & (POLLIN | POLLRDNORM);
1274
1275	if (events & (POLLOUT | POLLWRNORM))
1276		if (!wpipe->pipe_present || (wpipe->pipe_state & PIPE_EOF) ||
1277		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
1278		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
1279			revents |= events & (POLLOUT | POLLWRNORM);
1280
1281	if ((rpipe->pipe_state & PIPE_EOF) ||
1282	    (!wpipe->pipe_present) ||
1283	    (wpipe->pipe_state & PIPE_EOF))
1284		revents |= POLLHUP;
1285
1286	if (revents == 0) {
1287		if (events & (POLLIN | POLLRDNORM)) {
1288			selrecord(td, &rpipe->pipe_sel);
1289			rpipe->pipe_state |= PIPE_SEL;
1290		}
1291
1292		if (events & (POLLOUT | POLLWRNORM)) {
1293			selrecord(td, &wpipe->pipe_sel);
1294			wpipe->pipe_state |= PIPE_SEL;
1295		}
1296	}
1297#ifdef MAC
1298locked_error:
1299#endif
1300	PIPE_UNLOCK(rpipe);
1301
1302	return (revents);
1303}
1304
1305/*
1306 * We shouldn't need locks here as we're doing a read and this should
1307 * be a natural race.
1308 */
1309static int
1310pipe_stat(fp, ub, active_cred, td)
1311	struct file *fp;
1312	struct stat *ub;
1313	struct ucred *active_cred;
1314	struct thread *td;
1315{
1316	struct pipe *pipe = fp->f_data;
1317#ifdef MAC
1318	int error;
1319
1320	PIPE_LOCK(pipe);
1321	error = mac_check_pipe_stat(active_cred, pipe->pipe_pair);
1322	PIPE_UNLOCK(pipe);
1323	if (error)
1324		return (error);
1325#endif
1326	bzero(ub, sizeof(*ub));
1327	ub->st_mode = S_IFIFO;
1328	ub->st_blksize = pipe->pipe_buffer.size;
1329	if (pipe->pipe_state & PIPE_DIRECTW)
1330		ub->st_size = pipe->pipe_map.cnt;
1331	else
1332		ub->st_size = pipe->pipe_buffer.cnt;
1333	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
1334	ub->st_atimespec = pipe->pipe_atime;
1335	ub->st_mtimespec = pipe->pipe_mtime;
1336	ub->st_ctimespec = pipe->pipe_ctime;
1337	ub->st_uid = fp->f_cred->cr_uid;
1338	ub->st_gid = fp->f_cred->cr_gid;
1339	/*
1340	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
1341	 * XXX (st_dev, st_ino) should be unique.
1342	 */
1343	return (0);
1344}
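/*
 * Illustrative userland sketch (not part of the original file): fstat(2)
 * on a pipe descriptor reports the values filled in by pipe_stat() above,
 * notably st_size (unread bytes) and st_blksize (buffer size).
 */
#if 0
#include <sys/stat.h>
#include <stdint.h>
#include <stdio.h>

static void
pipe_stat_demo(int fd)
{
	struct stat sb;

	if (fstat(fd, &sb) == 0 && S_ISFIFO(sb.st_mode))
		printf("%jd bytes unread, %jd byte buffer\n",
		    (intmax_t)sb.st_size, (intmax_t)sb.st_blksize);
}
#endif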
1345
1346/* ARGSUSED */
1347static int
1348pipe_close(fp, td)
1349	struct file *fp;
1350	struct thread *td;
1351{
1352	struct pipe *cpipe = fp->f_data;
1353
1354	fp->f_ops = &badfileops;
1355	fp->f_data = NULL;
1356	funsetown(&cpipe->pipe_sigio);
1357	pipeclose(cpipe);
1358	return (0);
1359}
1360
1361static void
1362pipe_free_kmem(cpipe)
1363	struct pipe *cpipe;
1364{
1365
1366	KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
1367	    ("pipe_free_kmem: pipe mutex locked"));
1368
1369	if (cpipe->pipe_buffer.buffer != NULL) {
1370		if (cpipe->pipe_buffer.size > PIPE_SIZE)
1371			atomic_subtract_int(&nbigpipe, 1);
1372		atomic_subtract_int(&amountpipekva, cpipe->pipe_buffer.size);
1373		vm_map_remove(pipe_map,
1374		    (vm_offset_t)cpipe->pipe_buffer.buffer,
1375		    (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
1376		cpipe->pipe_buffer.buffer = NULL;
1377	}
1378#ifndef PIPE_NODIRECT
1379	{
1380		cpipe->pipe_map.cnt = 0;
1381		cpipe->pipe_map.pos = 0;
1382		cpipe->pipe_map.npages = 0;
1383	}
1384#endif
1385}
1386
1387/*
1388 * shut down the pipe
1389 */
1390static void
1391pipeclose(cpipe)
1392	struct pipe *cpipe;
1393{
1394	struct pipepair *pp;
1395	struct pipe *ppipe;
1396
1397	KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));
1398
1399	PIPE_LOCK(cpipe);
1400	pipelock(cpipe, 0);
1401	pp = cpipe->pipe_pair;
1402
1403	pipeselwakeup(cpipe);
1404
1405	/*
1406	 * If the other side is blocked, wake it up saying that
1407	 * we want to close it down.
1408	 */
1409	cpipe->pipe_state |= PIPE_EOF;
1410	while (cpipe->pipe_busy) {
1411		wakeup(cpipe);
1412		cpipe->pipe_state |= PIPE_WANT;
1413		pipeunlock(cpipe);
1414		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
1415		pipelock(cpipe, 0);
1416	}
1417
1418
1419	/*
1420	 * Disconnect from peer, if any.
1421	 */
1422	ppipe = cpipe->pipe_peer;
1423	if (ppipe->pipe_present != 0) {
1424		pipeselwakeup(ppipe);
1425
1426		ppipe->pipe_state |= PIPE_EOF;
1427		wakeup(ppipe);
1428		KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0);
1429	}
1430
1431	/*
1432	 * Mark this endpoint as free.  Release kmem resources.  We
1433	 * don't mark this endpoint as unused until we've finished
1434	 * doing that, or the pipe might disappear out from under
1435	 * us.
1436	 */
1437	PIPE_UNLOCK(cpipe);
1438	pipe_free_kmem(cpipe);
1439	PIPE_LOCK(cpipe);
1440	cpipe->pipe_present = 0;
1441	pipeunlock(cpipe);
1442	knlist_clear(&cpipe->pipe_sel.si_note, 1);
1443	knlist_destroy(&cpipe->pipe_sel.si_note);
1444
1445	/*
1446	 * If both endpoints are now closed, release the memory for the
1447	 * pipe pair.  If not, unlock.
1448	 */
1449	if (ppipe->pipe_present == 0) {
1450		PIPE_UNLOCK(cpipe);
1451#ifdef MAC
1452		mac_destroy_pipe(pp);
1453#endif
1454		uma_zfree(pipe_zone, cpipe->pipe_pair);
1455	} else
1456		PIPE_UNLOCK(cpipe);
1457}
1458
1459/*ARGSUSED*/
1460static int
1461pipe_kqfilter(struct file *fp, struct knote *kn)
1462{
1463	struct pipe *cpipe;
1464
1465	cpipe = kn->kn_fp->f_data;
1466	PIPE_LOCK(cpipe);
1467	switch (kn->kn_filter) {
1468	case EVFILT_READ:
1469		kn->kn_fop = &pipe_rfiltops;
1470		break;
1471	case EVFILT_WRITE:
1472		kn->kn_fop = &pipe_wfiltops;
1473		if (!cpipe->pipe_peer->pipe_present) {
1474			/* other end of pipe has been closed */
1475			PIPE_UNLOCK(cpipe);
1476			return (EPIPE);
1477		}
1478		cpipe = cpipe->pipe_peer;
1479		break;
1480	default:
1481		PIPE_UNLOCK(cpipe);
1482		return (EINVAL);
1483	}
1484
1485	knlist_add(&cpipe->pipe_sel.si_note, kn, 1);
1486	PIPE_UNLOCK(cpipe);
1487	return (0);
1488}
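/*
 * Illustrative userland sketch (not part of the original file): attaching
 * a kqueue read filter to a pipe descriptor; the event is served by
 * filt_piperead() above, and kev.data mirrors kn_data (bytes readable).
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <unistd.h>

static int
wait_for_pipe_data(int fd)
{
	struct kevent kev;
	int kq;

	kq = kqueue();
	if (kq == -1)
		return (-1);
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &kev, 1, &kev, 1, NULL) == -1) {
		close(kq);
		return (-1);
	}
	close(kq);
	return ((int)kev.data);
}
#endif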
1489
1490static void
1491filt_pipedetach(struct knote *kn)
1492{
1493	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1494
1495	PIPE_LOCK(cpipe);
1496	if (kn->kn_filter == EVFILT_WRITE) {
1497		if (!cpipe->pipe_peer->pipe_present) {
1498			PIPE_UNLOCK(cpipe);
1499			return;
1500		}
1501		cpipe = cpipe->pipe_peer;
1502	}
1503	knlist_remove(&cpipe->pipe_sel.si_note, kn, 1);
1504	PIPE_UNLOCK(cpipe);
1505}
1506
1507/*ARGSUSED*/
1508static int
1509filt_piperead(struct knote *kn, long hint)
1510{
1511	struct pipe *rpipe = kn->kn_fp->f_data;
1512	struct pipe *wpipe = rpipe->pipe_peer;
1513	int ret;
1514
1515	PIPE_LOCK(rpipe);
1516	kn->kn_data = rpipe->pipe_buffer.cnt;
1517	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
1518		kn->kn_data = rpipe->pipe_map.cnt;
1519
1520	if ((rpipe->pipe_state & PIPE_EOF) ||
1521	    (!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
1522		kn->kn_flags |= EV_EOF;
1523		PIPE_UNLOCK(rpipe);
1524		return (1);
1525	}
1526	ret = kn->kn_data > 0;
1527	PIPE_UNLOCK(rpipe);
1528	return (ret);
1529}
1530
1531/*ARGSUSED*/
1532static int
1533filt_pipewrite(struct knote *kn, long hint)
1534{
1535	struct pipe *rpipe = kn->kn_fp->f_data;
1536	struct pipe *wpipe = rpipe->pipe_peer;
1537
1538	PIPE_LOCK(rpipe);
1539	if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
1540		kn->kn_data = 0;
1541		kn->kn_flags |= EV_EOF;
1542		PIPE_UNLOCK(rpipe);
1543		return (1);
1544	}
1545	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1546	if (wpipe->pipe_state & PIPE_DIRECTW)
1547		kn->kn_data = 0;
1548
1549	PIPE_UNLOCK(rpipe);
1550	return (kn->kn_data >= PIPE_BUF);
1551}
1552