sys_pipe.c revision 137764
1/*
2 * Copyright (c) 1996 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice immediately at the beginning of the file, without modification,
10 *    this list of conditions, and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. Absolutely no warranty of function or purpose is made by the author
15 *    John S. Dyson.
16 * 4. Modifications may be freely made to this file if the above conditions
17 *    are met.
18 */
19
20/*
21 * This file contains a high-performance replacement for the socket-based
22 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
23 * all features of sockets, but does do everything that pipes normally
24 * do.
25 */
26
27/*
28 * This code has two modes of operation, a small write mode and a large
29 * write mode.  The small write mode acts like conventional pipes with
30 * a kernel buffer.  If the write is smaller than PIPE_MINDIRECT, then the
31 * "normal" pipe buffering is done.  If the write is between PIPE_MINDIRECT
32 * and PIPE_SIZE in size, the buffer is fully mapped and wired into the
33 * kernel, and the receiving process can copy the data directly from the
34 * pages of the sending process.
35 *
36 * If the sending process receives a signal, it is possible that it will
37 * go away, and certainly its address space can change, because control
38 * is returned to the user-mode side.  In that case, the pipe code
39 * arranges to copy the buffer supplied by the user process to a pageable
40 * kernel buffer, and the receiving process will grab the data from the
41 * pageable kernel buffer.  Since signals don't happen all that often,
42 * the copy operation is normally eliminated.
43 *
44 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
45 * happen for small transfers so that the system will not spend all of
46 * its time context switching.
47 *
48 * In order to limit the resource use of pipes, two sysctls exist:
49 *
50 * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
51 * address space available to us in pipe_map. This value is normally
52 * autotuned, but may also be loader tuned.
53 *
54 * kern.ipc.pipekva - This read-only sysctl tracks the current amount of
55 * memory in use by pipes.
56 *
57 * Based on how large pipekva is relative to maxpipekva, the following
58 * will happen:
59 *
60 * 0% - 50%:
61 *     New pipes are given 16K of memory backing, pipes may dynamically
62 *     grow to as large as 64K where needed.
63 * 50% - 75%:
64 *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
65 *     existing pipes may NOT grow.
66 * 75% - 100%:
67 *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
68 *     existing pipes will be shrunk down to 4K whenever possible.
69 *
70 * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0.  If
71 * resizing is disabled, the only resize that will occur is the
72 * 0 -> SMALL_PIPE_SIZE resize, which MUST occur for reverse-direction
73 * pipes when they are first used.
74 *
75 * Additional information about the current state of pipes may be obtained
76 * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail,
77 * and kern.ipc.piperesizefail.
78 *
79 * Locking rules:  There are two locks present here:  a mutex, used via
80 * PIPE_LOCK, and a flag, used via pipelock().  All locking is done via
81 * the flag, as a mutex cannot be held across uiomove.  The mutex
82 * exists only to guard access to the flag, and is not in itself a
83 * locking mechanism.  Also note that there is only a single mutex for
84 * both directions of a pipe.
85 *
86 * As pipelock() may have to sleep before it can acquire the flag, it
87 * is important to reread all data after a call to pipelock(); everything
88 * in the structure may have changed.
89 */
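/*
 * Illustrative sketch (not part of the original file): the locking
 * pattern described above, as used by pipe_read() and pipe_write()
 * below.  The mutex is dropped around uiomove(), and all cached pipe
 * state must be re-read after pipelock() returns, since the pipe may
 * have changed while we slept.  Schematic only; "fp", "uio" and "size"
 * stand for caller-supplied context.
 */
#if 0	/* illustrative example, not compiled */
	struct pipe *cpipe = fp->f_data;
	int error;

	PIPE_LOCK(cpipe);
	error = pipelock(cpipe, 1);	/* may sleep; PCATCH honors signals */
	if (error == 0) {
		/* Re-read pipe state here; it may have changed during the sleep. */
		PIPE_UNLOCK(cpipe);	/* never hold the mutex across uiomove */
		error = uiomove(cpipe->pipe_buffer.buffer, size, uio);
		PIPE_LOCK(cpipe);
		pipeunlock(cpipe);	/* wakes any PIPE_LWANT sleeper */
	}
	PIPE_UNLOCK(cpipe);
#endif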
90
91#include <sys/cdefs.h>
92__FBSDID("$FreeBSD: head/sys/kern/sys_pipe.c 137764 2004-11-16 06:57:52Z phk $");
93
94#include "opt_mac.h"
95
96#include <sys/param.h>
97#include <sys/systm.h>
98#include <sys/fcntl.h>
99#include <sys/file.h>
100#include <sys/filedesc.h>
101#include <sys/filio.h>
102#include <sys/kernel.h>
103#include <sys/lock.h>
104#include <sys/mac.h>
105#include <sys/mutex.h>
106#include <sys/ttycom.h>
107#include <sys/stat.h>
108#include <sys/malloc.h>
109#include <sys/poll.h>
110#include <sys/selinfo.h>
111#include <sys/signalvar.h>
112#include <sys/sysctl.h>
113#include <sys/sysproto.h>
114#include <sys/pipe.h>
115#include <sys/proc.h>
116#include <sys/vnode.h>
117#include <sys/uio.h>
118#include <sys/event.h>
119
120#include <vm/vm.h>
121#include <vm/vm_param.h>
122#include <vm/vm_object.h>
123#include <vm/vm_kern.h>
124#include <vm/vm_extern.h>
125#include <vm/pmap.h>
126#include <vm/vm_map.h>
127#include <vm/vm_page.h>
128#include <vm/uma.h>
129
130/*
131 * Use this define if you want to disable *fancy* VM things.  Expect an
132 * approx 30% decrease in transfer rate.  This could be useful for
133 * NetBSD or OpenBSD.
134 */
135/* #define PIPE_NODIRECT */
136
137/*
138 * interfaces to the outside world
139 */
140static fo_rdwr_t	pipe_read;
141static fo_rdwr_t	pipe_write;
142static fo_ioctl_t	pipe_ioctl;
143static fo_poll_t	pipe_poll;
144static fo_kqfilter_t	pipe_kqfilter;
145static fo_stat_t	pipe_stat;
146static fo_close_t	pipe_close;
147
148static struct fileops pipeops = {
149	.fo_read = pipe_read,
150	.fo_write = pipe_write,
151	.fo_ioctl = pipe_ioctl,
152	.fo_poll = pipe_poll,
153	.fo_kqfilter = pipe_kqfilter,
154	.fo_stat = pipe_stat,
155	.fo_close = pipe_close,
156	.fo_flags = DFLAG_PASSABLE
157};
158
159static void	filt_pipedetach(struct knote *kn);
160static int	filt_piperead(struct knote *kn, long hint);
161static int	filt_pipewrite(struct knote *kn, long hint);
162
163static struct filterops pipe_rfiltops =
164	{ 1, NULL, filt_pipedetach, filt_piperead };
165static struct filterops pipe_wfiltops =
166	{ 1, NULL, filt_pipedetach, filt_pipewrite };
167
168/*
169 * Default pipe buffer size(s); these can be fairly large now because pipe
170 * space is pageable.  The pipe code will try to maintain locality of
171 * reference for performance reasons, so small amounts of outstanding I/O
172 * will not wipe the cache.
173 */
174#define MINPIPESIZE (PIPE_SIZE/3)
175#define MAXPIPESIZE (2*PIPE_SIZE/3)
176
177static int amountpipes;
178static int amountpipekva;
179static int pipefragretry;
180static int pipeallocfail;
181static int piperesizefail;
182static int piperesizeallowed = 1;
183
184SYSCTL_DECL(_kern_ipc);
185
186SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
187	   &maxpipekva, 0, "Pipe KVA limit");
188SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD,
189	   &amountpipes, 0, "Current # of pipes");
190SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
191	   &amountpipekva, 0, "Pipe KVA usage");
192SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD,
193	  &pipefragretry, 0, "Pipe allocation retries due to fragmentation");
194SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD,
195	  &pipeallocfail, 0, "Pipe allocation failures");
196SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD,
197	  &piperesizefail, 0, "Pipe resize failures");
198SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW,
199	  &piperesizeallowed, 0, "Pipe resizing allowed");
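/*
 * Hypothetical user-space sketch (not part of the original file):
 * reading two of the kern.ipc pipe sysctls declared above via
 * sysctlbyname(3).  Both are plain ints in this revision.
 */
#if 0	/* illustrative example, not compiled */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int maxkva, kva;
	size_t len;

	len = sizeof(maxkva);
	if (sysctlbyname("kern.ipc.maxpipekva", &maxkva, &len, NULL, 0) == -1)
		return (1);
	len = sizeof(kva);
	if (sysctlbyname("kern.ipc.pipekva", &kva, &len, NULL, 0) == -1)
		return (1);
	printf("pipe KVA in use: %d of %d bytes\n", kva, maxkva);
	return (0);
}
#endif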
200
201static void pipeinit(void *dummy __unused);
202static void pipeclose(struct pipe *cpipe);
203static void pipe_free_kmem(struct pipe *cpipe);
204static int pipe_create(struct pipe *pipe, int backing);
205static __inline int pipelock(struct pipe *cpipe, int catch);
206static __inline void pipeunlock(struct pipe *cpipe);
207static __inline void pipeselwakeup(struct pipe *cpipe);
208#ifndef PIPE_NODIRECT
209static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
210static void pipe_destroy_write_buffer(struct pipe *wpipe);
211static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
212static void pipe_clone_write_buffer(struct pipe *wpipe);
213#endif
214static int pipespace(struct pipe *cpipe, int size);
215static int pipespace_new(struct pipe *cpipe, int size);
216
217static int	pipe_zone_ctor(void *mem, int size, void *arg, int flags);
218static void	pipe_zone_dtor(void *mem, int size, void *arg);
219static int	pipe_zone_init(void *mem, int size, int flags);
220static void	pipe_zone_fini(void *mem, int size);
221
222static uma_zone_t pipe_zone;
223
224SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
225
226static void
227pipeinit(void *dummy __unused)
228{
229
230	pipe_zone = uma_zcreate("PIPE", sizeof(struct pipepair),
231	    pipe_zone_ctor, pipe_zone_dtor, pipe_zone_init, pipe_zone_fini,
232	    UMA_ALIGN_PTR, 0);
233	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
234}
235
236static int
237pipe_zone_ctor(void *mem, int size, void *arg, int flags)
238{
239	struct pipepair *pp;
240	struct pipe *rpipe, *wpipe;
241
242	KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));
243
244	pp = (struct pipepair *)mem;
245
246	/*
247	 * We zero both pipe endpoints to make sure all the kmem pointers
248	 * are NULL, flag fields are zero'd, etc.  We timestamp both
249	 * endpoints with the same time.
250	 */
251	rpipe = &pp->pp_rpipe;
252	bzero(rpipe, sizeof(*rpipe));
253	vfs_timestamp(&rpipe->pipe_ctime);
254	rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;
255
256	wpipe = &pp->pp_wpipe;
257	bzero(wpipe, sizeof(*wpipe));
258	wpipe->pipe_ctime = rpipe->pipe_ctime;
259	wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;
260
261	rpipe->pipe_peer = wpipe;
262	rpipe->pipe_pair = pp;
263	wpipe->pipe_peer = rpipe;
264	wpipe->pipe_pair = pp;
265
266	/*
267	 * Mark both endpoints as present; they will later get free'd
268	 * one at a time.  When both are free'd, then the whole pair
269	 * is released.
270	 */
271	rpipe->pipe_present = 1;
272	wpipe->pipe_present = 1;
273
274	/*
275	 * Eventually, the MAC Framework may initialize the label
276	 * in ctor or init, but for now we do it elsewhere to avoid
277	 * blocking in ctor or init.
278	 */
279	pp->pp_label = NULL;
280
281	atomic_add_int(&amountpipes, 2);
282	return (0);
283}
284
285static void
286pipe_zone_dtor(void *mem, int size, void *arg)
287{
288	struct pipepair *pp;
289
290	KASSERT(size == sizeof(*pp), ("pipe_zone_dtor: wrong size"));
291
292	pp = (struct pipepair *)mem;
293
294	atomic_subtract_int(&amountpipes, 2);
295}
296
297static int
298pipe_zone_init(void *mem, int size, int flags)
299{
300	struct pipepair *pp;
301
302	KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));
303
304	pp = (struct pipepair *)mem;
305
306	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
307	return (0);
308}
309
310static void
311pipe_zone_fini(void *mem, int size)
312{
313	struct pipepair *pp;
314
315	KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));
316
317	pp = (struct pipepair *)mem;
318
319	mtx_destroy(&pp->pp_mtx);
320}
321
322/*
323 * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail,
324 * let the zone pick up the pieces via pipeclose().
325 */
326
327/* ARGSUSED */
328int
329pipe(td, uap)
330	struct thread *td;
331	struct pipe_args /* {
332		int	dummy;
333	} */ *uap;
334{
335	struct filedesc *fdp = td->td_proc->p_fd;
336	struct file *rf, *wf;
337	struct pipepair *pp;
338	struct pipe *rpipe, *wpipe;
339	int fd, error;
340
341	pp = uma_zalloc(pipe_zone, M_WAITOK);
342#ifdef MAC
343	/*
344	 * The MAC label is shared between the connected endpoints.  As a
345	 * result mac_init_pipe() and mac_create_pipe() are called once
346	 * for the pair, and not on the endpoints.
347	 */
348	mac_init_pipe(pp);
349	mac_create_pipe(td->td_ucred, pp);
350#endif
351	rpipe = &pp->pp_rpipe;
352	wpipe = &pp->pp_wpipe;
353
354	/* Only the forward direction pipe is backed by default */
355	if (pipe_create(rpipe, 1) || pipe_create(wpipe, 0)) {
356		pipeclose(rpipe);
357		pipeclose(wpipe);
358		return (ENFILE);
359	}
360
361	rpipe->pipe_state |= PIPE_DIRECTOK;
362	wpipe->pipe_state |= PIPE_DIRECTOK;
363
364	error = falloc(td, &rf, &fd);
365	if (error) {
366		pipeclose(rpipe);
367		pipeclose(wpipe);
368		return (error);
369	}
370	/* An extra reference on `rf' has been held for us by falloc(). */
371	td->td_retval[0] = fd;
372
373	/*
374	 * Warning: once we've gotten past allocation of the fd for the
375	 * read-side, we can only drop the read side via fdrop() in order
376	 * to avoid races against processes which manage to dup() the read
377	 * side while we are blocked trying to allocate the write side.
378	 */
379	FILE_LOCK(rf);
380	rf->f_flag = FREAD | FWRITE;
381	rf->f_type = DTYPE_PIPE;
382	rf->f_data = rpipe;
383	rf->f_ops = &pipeops;
384	FILE_UNLOCK(rf);
385	error = falloc(td, &wf, &fd);
386	if (error) {
387		fdclose(fdp, rf, td->td_retval[0], td);
388		fdrop(rf, td);
389		/* rpipe has been closed by fdrop(). */
390		pipeclose(wpipe);
391		return (error);
392	}
393	/* An extra reference on `wf' has been held for us by falloc(). */
394	FILE_LOCK(wf);
395	wf->f_flag = FREAD | FWRITE;
396	wf->f_type = DTYPE_PIPE;
397	wf->f_data = wpipe;
398	wf->f_ops = &pipeops;
399	FILE_UNLOCK(wf);
400	fdrop(wf, td);
401	td->td_retval[1] = fd;
402	fdrop(rf, td);
403
404	return (0);
405}
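/*
 * Hypothetical user-space sketch (not part of the original file): basic
 * use of the pipe(2) system call implemented above.  fd[0] is the read
 * side and fd[1] the write side; this implementation opens both files
 * FREAD|FWRITE, but portable code treats them as unidirectional.
 */
#if 0	/* illustrative example, not compiled */
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd[2];
	char buf[32];
	ssize_t n;

	if (pipe(fd) == -1)
		return (1);
	if (write(fd[1], "hello", 5) != 5)
		return (1);
	n = read(fd[0], buf, sizeof(buf));
	if (n > 0)
		printf("read %zd bytes: %.*s\n", n, (int)n, buf);
	close(fd[0]);
	close(fd[1]);
	return (0);
}
#endif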
406
407/*
408 * Allocate KVA for the pipe's circular buffer; the space is pageable.
409 * This routine will 'realloc' the size of a pipe safely; if it fails,
410 * it will retain the old buffer.
411 * On failure it returns ENOMEM.
412 */
413static int
414pipespace_new(cpipe, size)
415	struct pipe *cpipe;
416	int size;
417{
418	caddr_t buffer;
419	int error, cnt, firstseg;
420	static int curfail = 0;
421	static struct timeval lastfail;
422
423	KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
424	KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW),
425		("pipespace: resize of direct writes not allowed"));
426retry:
427	cnt = cpipe->pipe_buffer.cnt;
428	if (cnt > size)
429		size = cnt;
430
431	size = round_page(size);
432	buffer = (caddr_t) vm_map_min(pipe_map);
433
434	error = vm_map_find(pipe_map, NULL, 0,
435		(vm_offset_t *) &buffer, size, 1,
436		VM_PROT_ALL, VM_PROT_ALL, 0);
437	if (error != KERN_SUCCESS) {
438		if ((cpipe->pipe_buffer.buffer == NULL) &&
439			(size > SMALL_PIPE_SIZE)) {
440			size = SMALL_PIPE_SIZE;
441			pipefragretry++;
442			goto retry;
443		}
444		if (cpipe->pipe_buffer.buffer == NULL) {
445			pipeallocfail++;
446			if (ppsratecheck(&lastfail, &curfail, 1))
447				printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
448		} else {
449			piperesizefail++;
450		}
451		return (ENOMEM);
452	}
453
454	/* copy data, then free old resources if we're resizing */
455	if (cnt > 0) {
456		if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) {
457			firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out;
458			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
459				buffer, firstseg);
460			if ((cnt - firstseg) > 0)
461				bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg],
462					cpipe->pipe_buffer.in);
463		} else {
464			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
465				buffer, cnt);
466		}
467	}
468	pipe_free_kmem(cpipe);
469	cpipe->pipe_buffer.buffer = buffer;
470	cpipe->pipe_buffer.size = size;
471	cpipe->pipe_buffer.in = cnt;
472	cpipe->pipe_buffer.out = 0;
473	cpipe->pipe_buffer.cnt = cnt;
474	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);
475	return (0);
476}
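/*
 * Hypothetical user-space analogue (not part of the original file) of
 * the copy performed above when a resized pipe's ring buffer has
 * wrapped: the tail segment [out, size) is copied first, then the head
 * segment [0, in), leaving the data linear at offset 0 of the new
 * buffer.
 */
#if 0	/* illustrative example, not compiled */
#include <assert.h>
#include <string.h>

static void
ring_linearize(const char *old, int size, int in, int out, int cnt, char *dst)
{
	int firstseg;

	if (in <= out && cnt > 0) {		/* data wraps past the end */
		firstseg = size - out;
		memcpy(dst, old + out, firstseg);
		if (cnt - firstseg > 0)
			memcpy(dst + firstseg, old, in);
	} else {
		memcpy(dst, old + out, cnt);
	}
}

int
main(void)
{
	/* in = out = 4, cnt = 8: the buffer is full and wrapped. */
	char old[8] = { 'E', 'F', 'G', 'H', 'A', 'B', 'C', 'D' };
	char dst[8];

	ring_linearize(old, 8, 4, 4, 8, dst);
	assert(memcmp(dst, "ABCDEFGH", 8) == 0);
	return (0);
}
#endif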
477
478/*
479 * Wrapper for pipespace_new() that performs locking assertions.
480 */
481static int
482pipespace(cpipe, size)
483	struct pipe *cpipe;
484	int size;
485{
486
487	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
488		("Unlocked pipe passed to pipespace"));
489	return (pipespace_new(cpipe, size));
490}
491
492/*
493 * lock a pipe for I/O, blocking other access
494 */
495static __inline int
496pipelock(cpipe, catch)
497	struct pipe *cpipe;
498	int catch;
499{
500	int error;
501
502	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
503	while (cpipe->pipe_state & PIPE_LOCKFL) {
504		cpipe->pipe_state |= PIPE_LWANT;
505		error = msleep(cpipe, PIPE_MTX(cpipe),
506		    catch ? (PRIBIO | PCATCH) : PRIBIO,
507		    "pipelk", 0);
508		if (error != 0)
509			return (error);
510	}
511	cpipe->pipe_state |= PIPE_LOCKFL;
512	return (0);
513}
514
515/*
516 * unlock a pipe I/O lock
517 */
518static __inline void
519pipeunlock(cpipe)
520	struct pipe *cpipe;
521{
522
523	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
524	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
525		("Unlocked pipe passed to pipeunlock"));
526	cpipe->pipe_state &= ~PIPE_LOCKFL;
527	if (cpipe->pipe_state & PIPE_LWANT) {
528		cpipe->pipe_state &= ~PIPE_LWANT;
529		wakeup(cpipe);
530	}
531}
532
533static __inline void
534pipeselwakeup(cpipe)
535	struct pipe *cpipe;
536{
537
538	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
539	if (cpipe->pipe_state & PIPE_SEL) {
540		cpipe->pipe_state &= ~PIPE_SEL;
541		selwakeuppri(&cpipe->pipe_sel, PSOCK);
542	}
543	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
544		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
545	KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0);
546}
547
548/*
549 * Initialize and allocate VM and memory for pipe.  The structure
550 * will start out zero'd from the ctor, so we just manage the kmem.
551 */
552static int
553pipe_create(pipe, backing)
554	struct pipe *pipe;
555	int backing;
556{
557	int error;
558
559	if (backing) {
560		if (amountpipekva > maxpipekva / 2)
561			error = pipespace_new(pipe, SMALL_PIPE_SIZE);
562		else
563			error = pipespace_new(pipe, PIPE_SIZE);
564	} else {
565		/* If we're not backing this pipe, no need to do anything. */
566		error = 0;
567	}
568	knlist_init(&pipe->pipe_sel.si_note, PIPE_MTX(pipe));
569	return (error);
570}
571
572/* ARGSUSED */
573static int
574pipe_read(fp, uio, active_cred, flags, td)
575	struct file *fp;
576	struct uio *uio;
577	struct ucred *active_cred;
578	struct thread *td;
579	int flags;
580{
581	struct pipe *rpipe = fp->f_data;
582	int error;
583	int nread = 0;
584	u_int size;
585
586	PIPE_LOCK(rpipe);
587	++rpipe->pipe_busy;
588	error = pipelock(rpipe, 1);
589	if (error)
590		goto unlocked_error;
591
592#ifdef MAC
593	error = mac_check_pipe_read(active_cred, rpipe->pipe_pair);
594	if (error)
595		goto locked_error;
596#endif
597	if (amountpipekva > (3 * maxpipekva) / 4) {
598		if (!(rpipe->pipe_state & PIPE_DIRECTW) &&
599			(rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
600			(rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
601			(piperesizeallowed == 1)) {
602			PIPE_UNLOCK(rpipe);
603			pipespace(rpipe, SMALL_PIPE_SIZE);
604			PIPE_LOCK(rpipe);
605		}
606	}
607
608	while (uio->uio_resid) {
609		/*
610		 * normal pipe buffer receive
611		 */
612		if (rpipe->pipe_buffer.cnt > 0) {
613			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
614			if (size > rpipe->pipe_buffer.cnt)
615				size = rpipe->pipe_buffer.cnt;
616			if (size > (u_int) uio->uio_resid)
617				size = (u_int) uio->uio_resid;
618
619			PIPE_UNLOCK(rpipe);
620			error = uiomove(
621			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
622			    size, uio);
623			PIPE_LOCK(rpipe);
624			if (error)
625				break;
626
627			rpipe->pipe_buffer.out += size;
628			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
629				rpipe->pipe_buffer.out = 0;
630
631			rpipe->pipe_buffer.cnt -= size;
632
633			/*
634			 * If there is no more to read in the pipe, reset
635			 * its pointers to the beginning.  This improves
636			 * cache hit stats.
637			 */
638			if (rpipe->pipe_buffer.cnt == 0) {
639				rpipe->pipe_buffer.in = 0;
640				rpipe->pipe_buffer.out = 0;
641			}
642			nread += size;
643#ifndef PIPE_NODIRECT
644		/*
645		 * Direct copy, bypassing a kernel buffer.
646		 */
647		} else if ((size = rpipe->pipe_map.cnt) &&
648			   (rpipe->pipe_state & PIPE_DIRECTW)) {
649			if (size > (u_int) uio->uio_resid)
650				size = (u_int) uio->uio_resid;
651
652			PIPE_UNLOCK(rpipe);
653			error = uiomove_fromphys(rpipe->pipe_map.ms,
654			    rpipe->pipe_map.pos, size, uio);
655			PIPE_LOCK(rpipe);
656			if (error)
657				break;
658			nread += size;
659			rpipe->pipe_map.pos += size;
660			rpipe->pipe_map.cnt -= size;
661			if (rpipe->pipe_map.cnt == 0) {
662				rpipe->pipe_state &= ~PIPE_DIRECTW;
663				wakeup(rpipe);
664			}
665#endif
666		} else {
667			/*
668			 * detect EOF condition
669			 * read returns 0 on EOF, no need to set error
670			 */
671			if (rpipe->pipe_state & PIPE_EOF)
672				break;
673
674			/*
675			 * If the "write-side" has been blocked, wake it up now.
676			 */
677			if (rpipe->pipe_state & PIPE_WANTW) {
678				rpipe->pipe_state &= ~PIPE_WANTW;
679				wakeup(rpipe);
680			}
681
682			/*
683			 * Break if some data was read.
684			 */
685			if (nread > 0)
686				break;
687
688			/*
689			 * Unlock the pipe buffer for our remaining processing.
690			 * We will either break out with an error or we will
691			 * sleep and relock to loop.
692			 */
693			pipeunlock(rpipe);
694
695			/*
696			 * Handle non-blocking mode operation or
697			 * wait for more data.
698			 */
699			if (fp->f_flag & FNONBLOCK) {
700				error = EAGAIN;
701			} else {
702				rpipe->pipe_state |= PIPE_WANTR;
703				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
704				    PRIBIO | PCATCH,
705				    "piperd", 0)) == 0)
706					error = pipelock(rpipe, 1);
707			}
708			if (error)
709				goto unlocked_error;
710		}
711	}
712#ifdef MAC
713locked_error:
714#endif
715	pipeunlock(rpipe);
716
717	/* XXX: should probably do this before getting any locks. */
718	if (error == 0)
719		vfs_timestamp(&rpipe->pipe_atime);
720unlocked_error:
721	--rpipe->pipe_busy;
722
723	/*
724	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
725	 */
726	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
727		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
728		wakeup(rpipe);
729	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
730		/*
731		 * Handle write blocking hysteresis.
732		 */
733		if (rpipe->pipe_state & PIPE_WANTW) {
734			rpipe->pipe_state &= ~PIPE_WANTW;
735			wakeup(rpipe);
736		}
737	}
738
739	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
740		pipeselwakeup(rpipe);
741
742	PIPE_UNLOCK(rpipe);
743	return (error);
744}
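/*
 * Hypothetical user-space sketch (not part of the original file): the
 * reader-visible behaviour of pipe_read() above.  With O_NONBLOCK set
 * and nothing buffered, read(2) fails with EAGAIN; once the write side
 * is closed and the buffer drained, read(2) returns 0 (EOF).
 */
#if 0	/* illustrative example, not compiled */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd[2];
	char c;
	ssize_t n;

	if (pipe(fd) == -1)
		return (1);
	fcntl(fd[0], F_SETFL, O_NONBLOCK);
	n = read(fd[0], &c, 1);
	printf("empty pipe: n=%zd, EAGAIN=%d\n", n, errno == EAGAIN);
	close(fd[1]);				/* drop the write side */
	n = read(fd[0], &c, 1);
	printf("writer gone: n=%zd (EOF)\n", n);
	close(fd[0]);
	return (0);
}
#endif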
745
746#ifndef PIPE_NODIRECT
747/*
748 * Map the sending process's buffer into kernel space and wire it.
749 * This is similar to a physical write operation.
750 */
751static int
752pipe_build_write_buffer(wpipe, uio)
753	struct pipe *wpipe;
754	struct uio *uio;
755{
756	pmap_t pmap;
757	u_int size;
758	int i, j;
759	vm_offset_t addr, endaddr;
760
761	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
762	KASSERT(wpipe->pipe_state & PIPE_DIRECTW,
763		("Clone attempt on non-direct write pipe!"));
764
765	size = (u_int) uio->uio_iov->iov_len;
766	if (size > wpipe->pipe_buffer.size)
767		size = wpipe->pipe_buffer.size;
768
769	pmap = vmspace_pmap(curproc->p_vmspace);
770	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
771	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
772	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
773		/*
774		 * vm_fault_quick() can sleep.  Consequently,
775		 * vm_page_lock_queue() and vm_page_unlock_queue()
776		 * should not be performed outside of this loop.
777		 */
778	race:
779		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) {
780			vm_page_lock_queues();
781			for (j = 0; j < i; j++)
782				vm_page_unhold(wpipe->pipe_map.ms[j]);
783			vm_page_unlock_queues();
784			return (EFAULT);
785		}
786		wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr,
787		    VM_PROT_READ);
788		if (wpipe->pipe_map.ms[i] == NULL)
789			goto race;
790	}
791
792/*
793 * set up the control block
794 */
795	wpipe->pipe_map.npages = i;
796	wpipe->pipe_map.pos =
797	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
798	wpipe->pipe_map.cnt = size;
799
800/*
801 * and update the uio data
802 */
803
804	uio->uio_iov->iov_len -= size;
805	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
806	if (uio->uio_iov->iov_len == 0)
807		uio->uio_iov++;
808	uio->uio_resid -= size;
809	uio->uio_offset += size;
810	return (0);
811}
812
813/*
814 * unmap and unwire the process buffer
815 */
816static void
817pipe_destroy_write_buffer(wpipe)
818	struct pipe *wpipe;
819{
820	int i;
821
822	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
823	vm_page_lock_queues();
824	for (i = 0; i < wpipe->pipe_map.npages; i++) {
825		vm_page_unhold(wpipe->pipe_map.ms[i]);
826	}
827	vm_page_unlock_queues();
828	wpipe->pipe_map.npages = 0;
829}
830
831/*
832 * In the case of a signal, the writing process might go away.  This
833 * code copies the data into the circular buffer so that the source
834 * pages can be freed without loss of data.
835 */
836static void
837pipe_clone_write_buffer(wpipe)
838	struct pipe *wpipe;
839{
840	struct uio uio;
841	struct iovec iov;
842	int size;
843	int pos;
844
845	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
846	size = wpipe->pipe_map.cnt;
847	pos = wpipe->pipe_map.pos;
848
849	wpipe->pipe_buffer.in = size;
850	wpipe->pipe_buffer.out = 0;
851	wpipe->pipe_buffer.cnt = size;
852	wpipe->pipe_state &= ~PIPE_DIRECTW;
853
854	PIPE_UNLOCK(wpipe);
855	iov.iov_base = wpipe->pipe_buffer.buffer;
856	iov.iov_len = size;
857	uio.uio_iov = &iov;
858	uio.uio_iovcnt = 1;
859	uio.uio_offset = 0;
860	uio.uio_resid = size;
861	uio.uio_segflg = UIO_SYSSPACE;
862	uio.uio_rw = UIO_READ;
863	uio.uio_td = curthread;
864	uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio);
865	PIPE_LOCK(wpipe);
866	pipe_destroy_write_buffer(wpipe);
867}
868
869/*
870 * This implements the pipe buffer write mechanism.  Note that only
871 * a direct write OR a normal pipe write can be pending at any given time.
872 * If there are any characters in the pipe buffer, the direct write will
873 * be deferred until the receiving process grabs all of the bytes from
874 * the pipe buffer.  Then the direct mapping write is set up.
875 */
876static int
877pipe_direct_write(wpipe, uio)
878	struct pipe *wpipe;
879	struct uio *uio;
880{
881	int error;
882
883retry:
884	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
885	error = pipelock(wpipe, 1);
886	if (wpipe->pipe_state & PIPE_EOF)
887		error = EPIPE;
888	if (error) {
889		pipeunlock(wpipe);
890		goto error1;
891	}
892	while (wpipe->pipe_state & PIPE_DIRECTW) {
893		if (wpipe->pipe_state & PIPE_WANTR) {
894			wpipe->pipe_state &= ~PIPE_WANTR;
895			wakeup(wpipe);
896		}
897		wpipe->pipe_state |= PIPE_WANTW;
898		pipeunlock(wpipe);
899		error = msleep(wpipe, PIPE_MTX(wpipe),
900		    PRIBIO | PCATCH, "pipdww", 0);
901		if (error)
902			goto error1;
903		else
904			goto retry;
905	}
906	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
907	if (wpipe->pipe_buffer.cnt > 0) {
908		if (wpipe->pipe_state & PIPE_WANTR) {
909			wpipe->pipe_state &= ~PIPE_WANTR;
910			wakeup(wpipe);
911		}
912		wpipe->pipe_state |= PIPE_WANTW;
913		pipeunlock(wpipe);
914		error = msleep(wpipe, PIPE_MTX(wpipe),
915		    PRIBIO | PCATCH, "pipdwc", 0);
916		if (error)
917			goto error1;
918		else
919			goto retry;
920	}
921
922	wpipe->pipe_state |= PIPE_DIRECTW;
923
924	PIPE_UNLOCK(wpipe);
925	error = pipe_build_write_buffer(wpipe, uio);
926	PIPE_LOCK(wpipe);
927	if (error) {
928		wpipe->pipe_state &= ~PIPE_DIRECTW;
929		pipeunlock(wpipe);
930		goto error1;
931	}
932
933	error = 0;
934	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
935		if (wpipe->pipe_state & PIPE_EOF) {
936			pipe_destroy_write_buffer(wpipe);
937			pipeselwakeup(wpipe);
938			pipeunlock(wpipe);
939			error = EPIPE;
940			goto error1;
941		}
942		if (wpipe->pipe_state & PIPE_WANTR) {
943			wpipe->pipe_state &= ~PIPE_WANTR;
944			wakeup(wpipe);
945		}
946		pipeselwakeup(wpipe);
947		pipeunlock(wpipe);
948		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
949		    "pipdwt", 0);
950		pipelock(wpipe, 0);
951	}
952
953	if (wpipe->pipe_state & PIPE_EOF)
954		error = EPIPE;
955	if (wpipe->pipe_state & PIPE_DIRECTW) {
956		/*
957		 * this bit of trickery substitutes a kernel buffer for
958		 * the process that might be going away.
959		 */
960		pipe_clone_write_buffer(wpipe);
961	} else {
962		pipe_destroy_write_buffer(wpipe);
963	}
964	pipeunlock(wpipe);
965	return (error);
966
967error1:
968	wakeup(wpipe);
969	return (error);
970}
971#endif
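/*
 * Hypothetical user-space sketch (not part of the original file): a
 * large blocking write that is a candidate for the direct path above.
 * When the iovec is at least PIPE_MINDIRECT bytes (the exact threshold
 * lives in <sys/pipe.h>) and the descriptor is not non-blocking, the
 * reader may copy straight from the writer's wired pages; the program
 * cannot observe which path was taken.
 */
#if 0	/* illustrative example, not compiled */
#include <sys/types.h>
#include <sys/wait.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	static char out[64 * 1024];		/* comfortably above PIPE_MINDIRECT */
	char in[8192];
	int fd[2];
	ssize_t n;
	size_t total = 0;
	pid_t pid;

	if (pipe(fd) == -1)
		return (1);
	memset(out, 'x', sizeof(out));
	pid = fork();
	if (pid == 0) {				/* child: blocking writer */
		close(fd[0]);
		write(fd[1], out, sizeof(out));
		close(fd[1]);
		_exit(0);
	}
	close(fd[1]);
	while ((n = read(fd[0], in, sizeof(in))) > 0)
		total += (size_t)n;
	printf("read %zu bytes\n", total);
	close(fd[0]);
	waitpid(pid, NULL, 0);
	return (0);
}
#endif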
972
973static int
974pipe_write(fp, uio, active_cred, flags, td)
975	struct file *fp;
976	struct uio *uio;
977	struct ucred *active_cred;
978	struct thread *td;
979	int flags;
980{
981	int error = 0;
982	int desiredsize, orig_resid;
983	struct pipe *wpipe, *rpipe;
984
985	rpipe = fp->f_data;
986	wpipe = rpipe->pipe_peer;
987
988	PIPE_LOCK(rpipe);
989	error = pipelock(wpipe, 1);
990	if (error) {
991		PIPE_UNLOCK(rpipe);
992		return (error);
993	}
994	/*
995	 * Detect loss of the read side; return EPIPE so the caller raises SIGPIPE.
996	 */
997	if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
998		pipeunlock(wpipe);
999		PIPE_UNLOCK(rpipe);
1000		return (EPIPE);
1001	}
1002#ifdef MAC
1003	error = mac_check_pipe_write(active_cred, wpipe->pipe_pair);
1004	if (error) {
1005		pipeunlock(wpipe);
1006		PIPE_UNLOCK(rpipe);
1007		return (error);
1008	}
1009#endif
1010	++wpipe->pipe_busy;
1011
1012	/* Choose a larger size if it's advantageous */
1013	desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size);
1014	while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) {
1015		if (piperesizeallowed != 1)
1016			break;
1017		if (amountpipekva > maxpipekva / 2)
1018			break;
1019		if (desiredsize == BIG_PIPE_SIZE)
1020			break;
1021		desiredsize = desiredsize * 2;
1022	}
1023
1024	/* Choose a smaller size if we're in an OOM situation */
1025	if ((amountpipekva > (3 * maxpipekva) / 4) &&
1026		(wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
1027		(wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
1028		(piperesizeallowed == 1))
1029		desiredsize = SMALL_PIPE_SIZE;
1030
1031	/* Resize if the above determined that a new size was necessary */
1032	if ((desiredsize != wpipe->pipe_buffer.size) &&
1033		((wpipe->pipe_state & PIPE_DIRECTW) == 0)) {
1034		PIPE_UNLOCK(wpipe);
1035		pipespace(wpipe, desiredsize);
1036		PIPE_LOCK(wpipe);
1037	}
1038	if (wpipe->pipe_buffer.size == 0) {
1039		/*
1040		 * This can only happen for reverse direction use of pipes
1041		 * in a complete OOM situation.
1042		 */
1043		error = ENOMEM;
1044		--wpipe->pipe_busy;
1045		pipeunlock(wpipe);
1046		PIPE_UNLOCK(wpipe);
1047		return (error);
1048	}
1049
1050	pipeunlock(wpipe);
1051
1052	orig_resid = uio->uio_resid;
1053
1054	while (uio->uio_resid) {
1055		int space;
1056
1057		pipelock(wpipe, 0);
1058		if (wpipe->pipe_state & PIPE_EOF) {
1059			pipeunlock(wpipe);
1060			error = EPIPE;
1061			break;
1062		}
1063#ifndef PIPE_NODIRECT
1064		/*
1065		 * If the transfer is large, we can gain performance if
1066		 * we do process-to-process copies directly.
1067		 * If the write is non-blocking, we don't use the
1068		 * direct write mechanism.
1069		 *
1070		 * The direct write mechanism will detect the reader going
1071		 * away on us.
1072		 */
1073		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
1074		    (wpipe->pipe_buffer.size >= PIPE_MINDIRECT) &&
1075		    (fp->f_flag & FNONBLOCK) == 0) {
1076			pipeunlock(wpipe);
1077			error = pipe_direct_write(wpipe, uio);
1078			if (error)
1079				break;
1080			continue;
1081		}
1082#endif
1083
1084		/*
1085		 * Pipe buffered writes cannot be coincident with
1086		 * direct writes.  We wait until the currently executing
1087		 * direct write is completed before we start filling the
1088		 * pipe buffer.  We break out if a signal occurs or the
1089		 * reader goes away.
1090		 */
1091		if (wpipe->pipe_state & PIPE_DIRECTW) {
1092			if (wpipe->pipe_state & PIPE_WANTR) {
1093				wpipe->pipe_state &= ~PIPE_WANTR;
1094				wakeup(wpipe);
1095			}
1096			pipeunlock(wpipe);
1097			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
1098			    "pipbww", 0);
1099			if (error)
1100				break;
1101			else
1102				continue;
1103		}
1104
1105		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1106
1107		/* Writes of size <= PIPE_BUF must be atomic. */
1108		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
1109			space = 0;
1110
1111		if (space > 0) {
1112			int size;	/* Transfer size */
1113			int segsize;	/* first segment to transfer */
1114
1115			/*
1116			 * Transfer size is minimum of uio transfer
1117			 * and free space in pipe buffer.
1118			 */
1119			if (space > uio->uio_resid)
1120				size = uio->uio_resid;
1121			else
1122				size = space;
1123			/*
1124			 * First segment to transfer is minimum of
1125			 * transfer size and contiguous space in
1126			 * pipe buffer.  If first segment to transfer
1127			 * is less than the transfer size, we've got
1128			 * a wraparound in the buffer.
1129			 */
1130			segsize = wpipe->pipe_buffer.size -
1131				wpipe->pipe_buffer.in;
1132			if (segsize > size)
1133				segsize = size;
1134
1135			/* Transfer first segment */
1136
1137			PIPE_UNLOCK(rpipe);
1138			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
1139					segsize, uio);
1140			PIPE_LOCK(rpipe);
1141
1142			if (error == 0 && segsize < size) {
1143				KASSERT(wpipe->pipe_buffer.in + segsize ==
1144					wpipe->pipe_buffer.size,
1145					("Pipe buffer wraparound disappeared"));
1146				/*
1147				 * Transfer remaining part now, to
1148				 * support atomic writes.  Wraparound
1149				 * happened.
1150				 */
1151
1152				PIPE_UNLOCK(rpipe);
1153				error = uiomove(
1154				    &wpipe->pipe_buffer.buffer[0],
1155				    size - segsize, uio);
1156				PIPE_LOCK(rpipe);
1157			}
1158			if (error == 0) {
1159				wpipe->pipe_buffer.in += size;
1160				if (wpipe->pipe_buffer.in >=
1161				    wpipe->pipe_buffer.size) {
1162					KASSERT(wpipe->pipe_buffer.in ==
1163						size - segsize +
1164						wpipe->pipe_buffer.size,
1165						("Expected wraparound bad"));
1166					wpipe->pipe_buffer.in = size - segsize;
1167				}
1168
1169				wpipe->pipe_buffer.cnt += size;
1170				KASSERT(wpipe->pipe_buffer.cnt <=
1171					wpipe->pipe_buffer.size,
1172					("Pipe buffer overflow"));
1173			}
1174			pipeunlock(wpipe);
1175		} else {
1176			/*
1177			 * If the "read-side" has been blocked, wake it up now.
1178			 */
1179			if (wpipe->pipe_state & PIPE_WANTR) {
1180				wpipe->pipe_state &= ~PIPE_WANTR;
1181				wakeup(wpipe);
1182			}
1183
1184			/*
1185			 * don't block on non-blocking I/O
1186			 */
1187			if (fp->f_flag & FNONBLOCK) {
1188				error = EAGAIN;
1189				pipeunlock(wpipe);
1190				break;
1191			}
1192
1193			/*
1194			 * We have no more space and have something to offer,
1195			 * wake up select/poll.
1196			 */
1197			pipeselwakeup(wpipe);
1198
1199			wpipe->pipe_state |= PIPE_WANTW;
1200			pipeunlock(wpipe);
1201			error = msleep(wpipe, PIPE_MTX(rpipe),
1202			    PRIBIO | PCATCH, "pipewr", 0);
1203			if (error != 0)
1204				break;
1205		}
1206	}
1207
1208	pipelock(wpipe, 0);
1209	--wpipe->pipe_busy;
1210
1211	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
1212		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
1213		wakeup(wpipe);
1214	} else if (wpipe->pipe_buffer.cnt > 0) {
1215		/*
1216		 * If we have put any characters in the buffer, we wake up
1217		 * the reader.
1218		 */
1219		if (wpipe->pipe_state & PIPE_WANTR) {
1220			wpipe->pipe_state &= ~PIPE_WANTR;
1221			wakeup(wpipe);
1222		}
1223	}
1224
1225	/*
1226	 * Don't return EPIPE if I/O was successful
1227	 */
1228	if ((wpipe->pipe_buffer.cnt == 0) &&
1229	    (uio->uio_resid == 0) &&
1230	    (error == EPIPE)) {
1231		error = 0;
1232	}
1233
1234	if (error == 0)
1235		vfs_timestamp(&wpipe->pipe_mtime);
1236
1237	/*
1238	 * We have something to offer,
1239	 * wake up select/poll.
1240	 */
1241	if (wpipe->pipe_buffer.cnt)
1242		pipeselwakeup(wpipe);
1243
1244	pipeunlock(wpipe);
1245	PIPE_UNLOCK(rpipe);
1246	return (error);
1247}
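/*
 * Hypothetical user-space sketch (not part of the original file): the
 * writer-visible rules enforced by pipe_write() above.  Writes of at
 * most PIPE_BUF bytes are atomic, and with O_NONBLOCK set a write that
 * cannot make progress fails with EAGAIN instead of sleeping in
 * "pipewr".
 */
#if 0	/* illustrative example, not compiled */
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	char chunk[PIPE_BUF];
	int fd[2];
	ssize_t n;
	size_t total = 0;

	if (pipe(fd) == -1)
		return (1);
	memset(chunk, 'x', sizeof(chunk));
	fcntl(fd[1], F_SETFL, O_NONBLOCK);
	for (;;) {
		n = write(fd[1], chunk, sizeof(chunk));	/* PIPE_BUF-sized, atomic */
		if (n == -1) {
			if (errno == EAGAIN)
				printf("pipe full after %zu bytes\n", total);
			break;
		}
		total += (size_t)n;
	}
	close(fd[0]);
	close(fd[1]);
	return (0);
}
#endif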
1248
1249/*
1250 * we implement a very minimal set of ioctls for compatibility with sockets.
1251 */
1252static int
1253pipe_ioctl(fp, cmd, data, active_cred, td)
1254	struct file *fp;
1255	u_long cmd;
1256	void *data;
1257	struct ucred *active_cred;
1258	struct thread *td;
1259{
1260	struct pipe *mpipe = fp->f_data;
1261	int error;
1262
1263	PIPE_LOCK(mpipe);
1264
1265#ifdef MAC
1266	error = mac_check_pipe_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
1267	if (error) {
1268		PIPE_UNLOCK(mpipe);
1269		return (error);
1270	}
1271#endif
1272
1273	error = 0;
1274	switch (cmd) {
1275
1276	case FIONBIO:
1277		break;
1278
1279	case FIOASYNC:
1280		if (*(int *)data) {
1281			mpipe->pipe_state |= PIPE_ASYNC;
1282		} else {
1283			mpipe->pipe_state &= ~PIPE_ASYNC;
1284		}
1285		break;
1286
1287	case FIONREAD:
1288		if (mpipe->pipe_state & PIPE_DIRECTW)
1289			*(int *)data = mpipe->pipe_map.cnt;
1290		else
1291			*(int *)data = mpipe->pipe_buffer.cnt;
1292		break;
1293
1294	case FIOSETOWN:
1295		error = fsetown(*(int *)data, &mpipe->pipe_sigio);
1296		break;
1297
1298	case FIOGETOWN:
1299		*(int *)data = fgetown(&mpipe->pipe_sigio);
1300		break;
1301
1302	/* This is deprecated, FIOSETOWN should be used instead. */
1303	case TIOCSPGRP:
1304		error = fsetown(-(*(int *)data), &mpipe->pipe_sigio);
1305		break;
1306
1307	/* This is deprecated, FIOGETOWN should be used instead. */
1308	case TIOCGPGRP:
1309		*(int *)data = -fgetown(&mpipe->pipe_sigio);
1310		break;
1311
1312	default:
1313		error = ENOTTY;
1314		break;
1315	}
1316	PIPE_UNLOCK(mpipe);
1317	return (error);
1318}
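/*
 * Hypothetical user-space sketch (not part of the original file):
 * FIONREAD, one of the socket-compatible ioctls handled above, reports
 * the number of bytes currently readable from the pipe.
 */
#if 0	/* illustrative example, not compiled */
#include <sys/ioctl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd[2], nready;

	if (pipe(fd) == -1)
		return (1);
	write(fd[1], "abc", 3);
	if (ioctl(fd[0], FIONREAD, &nready) == -1)
		return (1);
	printf("%d bytes ready\n", nready);	/* prints 3 */
	close(fd[0]);
	close(fd[1]);
	return (0);
}
#endif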
1319
1320static int
1321pipe_poll(fp, events, active_cred, td)
1322	struct file *fp;
1323	int events;
1324	struct ucred *active_cred;
1325	struct thread *td;
1326{
1327	struct pipe *rpipe = fp->f_data;
1328	struct pipe *wpipe;
1329	int revents = 0;
1330#ifdef MAC
1331	int error;
1332#endif
1333
1334	wpipe = rpipe->pipe_peer;
1335	PIPE_LOCK(rpipe);
1336#ifdef MAC
1337	error = mac_check_pipe_poll(active_cred, rpipe->pipe_pair);
1338	if (error)
1339		goto locked_error;
1340#endif
1341	if (events & (POLLIN | POLLRDNORM))
1342		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
1343		    (rpipe->pipe_buffer.cnt > 0) ||
1344		    (rpipe->pipe_state & PIPE_EOF))
1345			revents |= events & (POLLIN | POLLRDNORM);
1346
1347	if (events & (POLLOUT | POLLWRNORM))
1348		if (!wpipe->pipe_present || (wpipe->pipe_state & PIPE_EOF) ||
1349		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
1350		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
1351			revents |= events & (POLLOUT | POLLWRNORM);
1352
1353	if ((rpipe->pipe_state & PIPE_EOF) ||
1354	    (!wpipe->pipe_present) ||
1355	    (wpipe->pipe_state & PIPE_EOF))
1356		revents |= POLLHUP;
1357
1358	if (revents == 0) {
1359		if (events & (POLLIN | POLLRDNORM)) {
1360			selrecord(td, &rpipe->pipe_sel);
1361			rpipe->pipe_state |= PIPE_SEL;
1362		}
1363
1364		if (events & (POLLOUT | POLLWRNORM)) {
1365			selrecord(td, &wpipe->pipe_sel);
1366			wpipe->pipe_state |= PIPE_SEL;
1367		}
1368	}
1369#ifdef MAC
1370locked_error:
1371#endif
1372	PIPE_UNLOCK(rpipe);
1373
1374	return (revents);
1375}
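/*
 * Hypothetical user-space sketch (not part of the original file):
 * poll(2) against a pipe, exercising the revents logic above.  An
 * empty pipe with the write side still open reports nothing; once data
 * is buffered, POLLIN is set.
 */
#if 0	/* illustrative example, not compiled */
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct pollfd pfd;
	int fd[2];

	if (pipe(fd) == -1)
		return (1);
	pfd.fd = fd[0];
	pfd.events = POLLIN;
	printf("empty: %d descriptor(s) ready\n", poll(&pfd, 1, 0));
	write(fd[1], "x", 1);
	printf("with data: %d ready, POLLIN=%d\n",
	    poll(&pfd, 1, 0), (pfd.revents & POLLIN) != 0);
	close(fd[0]);
	close(fd[1]);
	return (0);
}
#endif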
1376
1377/*
1378 * We shouldn't need locks here as we're doing a read and this should
1379 * be a natural race.
1380 */
1381static int
1382pipe_stat(fp, ub, active_cred, td)
1383	struct file *fp;
1384	struct stat *ub;
1385	struct ucred *active_cred;
1386	struct thread *td;
1387{
1388	struct pipe *pipe = fp->f_data;
1389#ifdef MAC
1390	int error;
1391
1392	PIPE_LOCK(pipe);
1393	error = mac_check_pipe_stat(active_cred, pipe->pipe_pair);
1394	PIPE_UNLOCK(pipe);
1395	if (error)
1396		return (error);
1397#endif
1398	bzero(ub, sizeof(*ub));
1399	ub->st_mode = S_IFIFO;
1400	ub->st_blksize = PAGE_SIZE;
1401	if (pipe->pipe_state & PIPE_DIRECTW)
1402		ub->st_size = pipe->pipe_map.cnt;
1403	else
1404		ub->st_size = pipe->pipe_buffer.cnt;
1405	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
1406	ub->st_atimespec = pipe->pipe_atime;
1407	ub->st_mtimespec = pipe->pipe_mtime;
1408	ub->st_ctimespec = pipe->pipe_ctime;
1409	ub->st_uid = fp->f_cred->cr_uid;
1410	ub->st_gid = fp->f_cred->cr_gid;
1411	/*
1412	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
1413	 * XXX (st_dev, st_ino) should be unique.
1414	 */
1415	return (0);
1416}
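/*
 * Hypothetical user-space sketch (not part of the original file):
 * fstat(2) on a pipe as filled in above; st_mode reports a FIFO and
 * st_size the number of unread bytes.
 */
#if 0	/* illustrative example, not compiled */
#include <sys/stat.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct stat sb;
	int fd[2];

	if (pipe(fd) == -1)
		return (1);
	write(fd[1], "abcd", 4);
	if (fstat(fd[0], &sb) == -1)
		return (1);
	printf("fifo=%d size=%jd\n", S_ISFIFO(sb.st_mode) != 0,
	    (intmax_t)sb.st_size);		/* fifo=1 size=4 */
	close(fd[0]);
	close(fd[1]);
	return (0);
}
#endif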
1417
1418/* ARGSUSED */
1419static int
1420pipe_close(fp, td)
1421	struct file *fp;
1422	struct thread *td;
1423{
1424	struct pipe *cpipe = fp->f_data;
1425
1426	fp->f_ops = &badfileops;
1427	fp->f_data = NULL;
1428	funsetown(&cpipe->pipe_sigio);
1429	pipeclose(cpipe);
1430	return (0);
1431}
1432
1433static void
1434pipe_free_kmem(cpipe)
1435	struct pipe *cpipe;
1436{
1437
1438	KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
1439	    ("pipe_free_kmem: pipe mutex locked"));
1440
1441	if (cpipe->pipe_buffer.buffer != NULL) {
1442		atomic_subtract_int(&amountpipekva, cpipe->pipe_buffer.size);
1443		vm_map_remove(pipe_map,
1444		    (vm_offset_t)cpipe->pipe_buffer.buffer,
1445		    (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
1446		cpipe->pipe_buffer.buffer = NULL;
1447	}
1448#ifndef PIPE_NODIRECT
1449	{
1450		cpipe->pipe_map.cnt = 0;
1451		cpipe->pipe_map.pos = 0;
1452		cpipe->pipe_map.npages = 0;
1453	}
1454#endif
1455}
1456
1457/*
1458 * shut down the pipe
1459 */
1460static void
1461pipeclose(cpipe)
1462	struct pipe *cpipe;
1463{
1464	struct pipepair *pp;
1465	struct pipe *ppipe;
1466
1467	KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));
1468
1469	PIPE_LOCK(cpipe);
1470	pipelock(cpipe, 0);
1471	pp = cpipe->pipe_pair;
1472
1473	pipeselwakeup(cpipe);
1474
1475	/*
1476	 * If the other side is blocked, wake it up saying that
1477	 * we want to close it down.
1478	 */
1479	cpipe->pipe_state |= PIPE_EOF;
1480	while (cpipe->pipe_busy) {
1481		wakeup(cpipe);
1482		cpipe->pipe_state |= PIPE_WANT;
1483		pipeunlock(cpipe);
1484		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
1485		pipelock(cpipe, 0);
1486	}
1487
1488
1489	/*
1490	 * Disconnect from peer, if any.
1491	 */
1492	ppipe = cpipe->pipe_peer;
1493	if (ppipe->pipe_present != 0) {
1494		pipeselwakeup(ppipe);
1495
1496		ppipe->pipe_state |= PIPE_EOF;
1497		wakeup(ppipe);
1498		KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0);
1499	}
1500
1501	/*
1502	 * Mark this endpoint as free.  Release kmem resources.  We
1503	 * don't mark this endpoint as unused until we've finished
1504	 * doing that, or the pipe might disappear out from under
1505	 * us.
1506	 */
1507	PIPE_UNLOCK(cpipe);
1508	pipe_free_kmem(cpipe);
1509	PIPE_LOCK(cpipe);
1510	cpipe->pipe_present = 0;
1511	pipeunlock(cpipe);
1512	knlist_clear(&cpipe->pipe_sel.si_note, 1);
1513	knlist_destroy(&cpipe->pipe_sel.si_note);
1514
1515	/*
1516	 * If both endpoints are now closed, release the memory for the
1517	 * pipe pair.  If not, unlock.
1518	 */
1519	if (ppipe->pipe_present == 0) {
1520		PIPE_UNLOCK(cpipe);
1521#ifdef MAC
1522		mac_destroy_pipe(pp);
1523#endif
1524		uma_zfree(pipe_zone, cpipe->pipe_pair);
1525	} else
1526		PIPE_UNLOCK(cpipe);
1527}
1528
1529/*ARGSUSED*/
1530static int
1531pipe_kqfilter(struct file *fp, struct knote *kn)
1532{
1533	struct pipe *cpipe;
1534
1535	cpipe = kn->kn_fp->f_data;
1536	PIPE_LOCK(cpipe);
1537	switch (kn->kn_filter) {
1538	case EVFILT_READ:
1539		kn->kn_fop = &pipe_rfiltops;
1540		break;
1541	case EVFILT_WRITE:
1542		kn->kn_fop = &pipe_wfiltops;
1543		if (!cpipe->pipe_peer->pipe_present) {
1544			/* other end of pipe has been closed */
1545			PIPE_UNLOCK(cpipe);
1546			return (EPIPE);
1547		}
1548		cpipe = cpipe->pipe_peer;
1549		break;
1550	default:
1551		PIPE_UNLOCK(cpipe);
1552		return (EINVAL);
1553	}
1554
1555	knlist_add(&cpipe->pipe_sel.si_note, kn, 1);
1556	PIPE_UNLOCK(cpipe);
1557	return (0);
1558}
1559
1560static void
1561filt_pipedetach(struct knote *kn)
1562{
1563	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1564
1565	PIPE_LOCK(cpipe);
1566	if (kn->kn_filter == EVFILT_WRITE) {
1567		if (!cpipe->pipe_peer->pipe_present) {
1568			PIPE_UNLOCK(cpipe);
1569			return;
1570		}
1571		cpipe = cpipe->pipe_peer;
1572	}
1573	knlist_remove(&cpipe->pipe_sel.si_note, kn, 1);
1574	PIPE_UNLOCK(cpipe);
1575}
1576
1577/*ARGSUSED*/
1578static int
1579filt_piperead(struct knote *kn, long hint)
1580{
1581	struct pipe *rpipe = kn->kn_fp->f_data;
1582	struct pipe *wpipe = rpipe->pipe_peer;
1583	int ret;
1584
1585	PIPE_LOCK(rpipe);
1586	kn->kn_data = rpipe->pipe_buffer.cnt;
1587	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
1588		kn->kn_data = rpipe->pipe_map.cnt;
1589
1590	if ((rpipe->pipe_state & PIPE_EOF) ||
1591	    (!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
1592		kn->kn_flags |= EV_EOF;
1593		PIPE_UNLOCK(rpipe);
1594		return (1);
1595	}
1596	ret = kn->kn_data > 0;
1597	PIPE_UNLOCK(rpipe);
1598	return ret;
1599}
1600
1601/*ARGSUSED*/
1602static int
1603filt_pipewrite(struct knote *kn, long hint)
1604{
1605	struct pipe *rpipe = kn->kn_fp->f_data;
1606	struct pipe *wpipe = rpipe->pipe_peer;
1607
1608	PIPE_LOCK(rpipe);
1609	if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
1610		kn->kn_data = 0;
1611		kn->kn_flags |= EV_EOF;
1612		PIPE_UNLOCK(rpipe);
1613		return (1);
1614	}
1615	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1616	if (wpipe->pipe_state & PIPE_DIRECTW)
1617		kn->kn_data = 0;
1618
1619	PIPE_UNLOCK(rpipe);
1620	return (kn->kn_data >= PIPE_BUF);
1621}
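/*
 * Hypothetical user-space sketch (not part of the original file):
 * kqueue(2) monitoring of a pipe through the filters above.
 * EVFILT_READ fires once data is buffered, with the pending byte count
 * returned in kevent.data; EV_EOF is set when the peer goes away.
 */
#if 0	/* illustrative example, not compiled */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kevent kev;
	struct timespec ts = { 0, 0 };
	int fd[2], kq;

	if (pipe(fd) == -1 || (kq = kqueue()) == -1)
		return (1);
	EV_SET(&kev, fd[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		return (1);
	write(fd[1], "hi", 2);
	if (kevent(kq, NULL, 0, &kev, 1, &ts) == 1)
		printf("readable, %jd bytes pending\n", (intmax_t)kev.data);
	close(fd[0]);
	close(fd[1]);
	close(kq);
	return (0);
}
#endif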
1622