sys_pipe.c revision 125293
1/*
2 * Copyright (c) 1996 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice immediately at the beginning of the file, without modification,
10 *    this list of conditions, and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. Absolutely no warranty of function or purpose is made by the author
15 *    John S. Dyson.
16 * 4. Modifications may be freely made to this file if the above conditions
17 *    are met.
18 */
19
20/*
21 * This file contains a high-performance replacement for the socket-based
22 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
23 * all features of sockets, but does do everything that pipes normally
24 * do.
25 */
26
27/*
28 * This code has two modes of operation, a small write mode and a large
29 * write mode.  The small write mode acts like conventional pipes with
30 * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
31 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
32 * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
33 * the receiving process can copy it directly from the pages in the sending
34 * process.
35 *
36 * If the sending process receives a signal, it is possible that it will
37 * go away, and certainly its address space can change, because control
38 * is returned to the user-mode side.  In that case, the pipe code
39 * arranges to copy the buffer supplied by the user process to a pageable
40 * kernel buffer, and the receiving process will grab the data from the
41 * pageable kernel buffer.  Since signals don't happen all that often,
42 * the copy operation is normally eliminated.
43 *
44 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
45 * happen for small transfers so that the system will not spend all of
46 * its time context switching.
47 *
48 * In order to limit the resource use of pipes, two sysctls exist:
49 *
50 * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
51 * address space available to us in pipe_map.  Whenever the amount in use
52 * exceeds half of this value, all new pipes will be created with size
53 * SMALL_PIPE_SIZE, rather than PIPE_SIZE.  Big pipe creation will be limited
54 * as well.  This value is loader tunable only.
55 *
56 * kern.ipc.maxpipekvawired - This value limits the amount of memory that may
57 * be wired in order to facilitate direct copies using page flipping.
58 * Whenever this value is exceeded, pipes will fall back to using regular
59 * copies.  This value is sysctl controllable at all times.
60 *
61 * These values are autotuned in subr_param.c.
62 *
63 * Memory usage may be monitored through the sysctls
64 * kern.ipc.pipes, kern.ipc.pipekva and kern.ipc.pipekvawired.
65 *
66 */
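
/*
 * A minimal userland sketch (illustrative only, not part of this file) of
 * reading the monitoring sysctls above with sysctlbyname(3); the variable
 * names are hypothetical:
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int pipes, kva, kvawired;
 *		size_t len;
 *
 *		len = sizeof(pipes);
 *		if (sysctlbyname("kern.ipc.pipes", &pipes, &len, NULL, 0) == -1)
 *			return (1);
 *		len = sizeof(kva);
 *		if (sysctlbyname("kern.ipc.pipekva", &kva, &len, NULL, 0) == -1)
 *			return (1);
 *		len = sizeof(kvawired);
 *		if (sysctlbyname("kern.ipc.pipekvawired", &kvawired, &len,
 *		    NULL, 0) == -1)
 *			return (1);
 *		printf("pipes %d, pipekva %d, pipekvawired %d\n",
 *		    pipes, kva, kvawired);
 *		return (0);
 *	}
 */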
67
68#include <sys/cdefs.h>
69__FBSDID("$FreeBSD: head/sys/kern/sys_pipe.c 125293 2004-02-01 05:56:51Z rwatson $");
70
71#include "opt_mac.h"
72
73#include <sys/param.h>
74#include <sys/systm.h>
75#include <sys/fcntl.h>
76#include <sys/file.h>
77#include <sys/filedesc.h>
78#include <sys/filio.h>
79#include <sys/kernel.h>
80#include <sys/lock.h>
81#include <sys/mac.h>
82#include <sys/mutex.h>
83#include <sys/ttycom.h>
84#include <sys/stat.h>
85#include <sys/malloc.h>
86#include <sys/poll.h>
87#include <sys/selinfo.h>
88#include <sys/signalvar.h>
89#include <sys/sysctl.h>
90#include <sys/sysproto.h>
91#include <sys/pipe.h>
92#include <sys/proc.h>
93#include <sys/vnode.h>
94#include <sys/uio.h>
95#include <sys/event.h>
96
97#include <vm/vm.h>
98#include <vm/vm_param.h>
99#include <vm/vm_object.h>
100#include <vm/vm_kern.h>
101#include <vm/vm_extern.h>
102#include <vm/pmap.h>
103#include <vm/vm_map.h>
104#include <vm/vm_page.h>
105#include <vm/uma.h>
106
107/*
108 * Use this define if you want to disable *fancy* VM things.  Expect an
109 * approx 30% decrease in transfer rate.  This could be useful for
110 * NetBSD or OpenBSD.
111 */
112/* #define PIPE_NODIRECT */
113
114/*
115 * interfaces to the outside world
116 */
117static fo_rdwr_t	pipe_read;
118static fo_rdwr_t	pipe_write;
119static fo_ioctl_t	pipe_ioctl;
120static fo_poll_t	pipe_poll;
121static fo_kqfilter_t	pipe_kqfilter;
122static fo_stat_t	pipe_stat;
123static fo_close_t	pipe_close;
124
125static struct fileops pipeops = {
126	.fo_read = pipe_read,
127	.fo_write = pipe_write,
128	.fo_ioctl = pipe_ioctl,
129	.fo_poll = pipe_poll,
130	.fo_kqfilter = pipe_kqfilter,
131	.fo_stat = pipe_stat,
132	.fo_close = pipe_close,
133	.fo_flags = DFLAG_PASSABLE
134};
135
136static void	filt_pipedetach(struct knote *kn);
137static int	filt_piperead(struct knote *kn, long hint);
138static int	filt_pipewrite(struct knote *kn, long hint);
139
140static struct filterops pipe_rfiltops =
141	{ 1, NULL, filt_pipedetach, filt_piperead };
142static struct filterops pipe_wfiltops =
143	{ 1, NULL, filt_pipedetach, filt_pipewrite };
144
145/*
146 * Default pipe buffer size(s); these can be fairly large now because
147 * pipe space is pageable.  The pipe code will try to maintain locality of
148 * reference for performance reasons, so small amounts of outstanding I/O
149 * will not wipe the cache.
150 */
151#define MINPIPESIZE (PIPE_SIZE/3)
152#define MAXPIPESIZE (2*PIPE_SIZE/3)
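
/*
 * For illustration, assuming the customary PIPE_SIZE of 16384 bytes from
 * sys/pipe.h, MINPIPESIZE works out to 5461 bytes and MAXPIPESIZE to
 * 10922 bytes; MINPIPESIZE is the low-water mark below which pipe_read()
 * wakes up a writer blocked on a full buffer.
 */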
153
154/*
155 * Limit the number of "big" pipes
156 */
157#define LIMITBIGPIPES	32
158static int nbigpipe;
159
160static int amountpipes;
161static int amountpipekva;
162static int amountpipekvawired;
163
164SYSCTL_DECL(_kern_ipc);
165
166SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
167	   &maxpipekva, 0, "Pipe KVA limit");
168SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekvawired, CTLFLAG_RW,
169	   &maxpipekvawired, 0, "Pipe KVA wired limit");
170SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD,
171	   &amountpipes, 0, "Current # of pipes");
172SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD,
173	   &nbigpipe, 0, "Current # of big pipes");
174SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
175	   &amountpipekva, 0, "Pipe KVA usage");
176SYSCTL_INT(_kern_ipc, OID_AUTO, pipekvawired, CTLFLAG_RD,
177	   &amountpipekvawired, 0, "Pipe wired KVA usage");
178
179static void pipeinit(void *dummy __unused);
180static void pipeclose(struct pipe *cpipe);
181static void pipe_free_kmem(struct pipe *cpipe);
182static int pipe_create(struct pipe *pipe);
183static __inline int pipelock(struct pipe *cpipe, int catch);
184static __inline void pipeunlock(struct pipe *cpipe);
185static __inline void pipeselwakeup(struct pipe *cpipe);
186#ifndef PIPE_NODIRECT
187static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
188static void pipe_destroy_write_buffer(struct pipe *wpipe);
189static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
190static void pipe_clone_write_buffer(struct pipe *wpipe);
191#endif
192static int pipespace(struct pipe *cpipe, int size);
193
194static void	pipe_zone_ctor(void *mem, int size, void *arg);
195static void	pipe_zone_dtor(void *mem, int size, void *arg);
196static void	pipe_zone_init(void *mem, int size);
197static void	pipe_zone_fini(void *mem, int size);
198
199static uma_zone_t pipe_zone;
200
201SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
202
203static void
204pipeinit(void *dummy __unused)
205{
206
207	pipe_zone = uma_zcreate("PIPE", sizeof(struct pipepair),
208	    pipe_zone_ctor, pipe_zone_dtor, pipe_zone_init, pipe_zone_fini,
209	    UMA_ALIGN_PTR, 0);
210	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
211}
212
213static void
214pipe_zone_ctor(void *mem, int size, void *arg)
215{
216	struct pipepair *pp;
217	struct pipe *rpipe, *wpipe;
218
219	KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));
220
221	pp = (struct pipepair *)mem;
222
223	/*
224	 * We zero both pipe endpoints to make sure all the kmem pointers
225	 * are NULL, flag fields are zero'd, etc.  We timestamp both
226	 * endpoints with the same time.
227	 */
228	rpipe = &pp->pp_rpipe;
229	bzero(rpipe, sizeof(*rpipe));
230	vfs_timestamp(&rpipe->pipe_ctime);
231	rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;
232
233	wpipe = &pp->pp_wpipe;
234	bzero(wpipe, sizeof(*wpipe));
235	wpipe->pipe_ctime = rpipe->pipe_ctime;
236	wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;
237
238	rpipe->pipe_peer = wpipe;
239	rpipe->pipe_pair = pp;
240	wpipe->pipe_peer = rpipe;
241	wpipe->pipe_pair = pp;
242
243	/*
244	 * Mark both endpoints as present; they will later get free'd
245	 * one at a time.  When both are free'd, then the whole pair
246	 * is released.
247	 */
248	rpipe->pipe_present = 1;
249	wpipe->pipe_present = 1;
250
251	/*
252	 * Eventually, the MAC Framework may initialize the label
253	 * in ctor or init, but for now we do it elsewhere to avoid
254	 * blocking in ctor or init.
255	 */
256	pp->pp_label = NULL;
257
258}
259
260static void
261pipe_zone_dtor(void *mem, int size, void *arg)
262{
263	struct pipepair *pp;
264
265	KASSERT(size == sizeof(*pp), ("pipe_zone_dtor: wrong size"));
266
267	pp = (struct pipepair *)mem;
268}
269
270static void
271pipe_zone_init(void *mem, int size)
272{
273	struct pipepair *pp;
274
275	KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));
276
277	pp = (struct pipepair *)mem;
278
279	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
280}
281
282static void
283pipe_zone_fini(void *mem, int size)
284{
285	struct pipepair *pp;
286
287	KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));
288
289	pp = (struct pipepair *)mem;
290
291	mtx_destroy(&pp->pp_mtx);
292}
293
294/*
295 * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail,
296 * let the zone pick up the pieces via pipeclose().
297 */
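
/*
 * A small userland sketch (illustrative only, not part of this file) of the
 * interface provided here: td_retval[0] receives the read-side descriptor
 * and td_retval[1] the write side.  fd[] and buf are hypothetical names:
 *
 *	#include <sys/types.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fd[2];
 *		char buf[6];
 *
 *		if (pipe(fd) == -1)
 *			return (1);
 *		if (fork() == 0) {
 *			(void)close(fd[1]);
 *			(void)read(fd[0], buf, sizeof(buf));
 *			_exit(0);
 *		}
 *		(void)close(fd[0]);
 *		(void)write(fd[1], "hello", sizeof(buf));
 *		return (0);
 *	}
 */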
298
299/* ARGSUSED */
300int
301pipe(td, uap)
302	struct thread *td;
303	struct pipe_args /* {
304		int	dummy;
305	} */ *uap;
306{
307	struct filedesc *fdp = td->td_proc->p_fd;
308	struct file *rf, *wf;
309	struct pipepair *pp;
310	struct pipe *rpipe, *wpipe;
311	int fd, error;
312
313	pp = uma_zalloc(pipe_zone, M_WAITOK);
314#ifdef MAC
315	/*
316	 * struct pipe represents a pipe endpoint.  The MAC label is shared
317	 * between the connected endpoints.  As a result mac_init_pipe() and
318	 * mac_create_pipe() should only be called on one of the endpoints
319	 * after they have been connected.
320	 */
321	mac_init_pipe(pp);
322	mac_create_pipe(td->td_ucred, pp);
323#endif
324	rpipe = &pp->pp_rpipe;
325	wpipe = &pp->pp_wpipe;
326
327	if (pipe_create(rpipe) || pipe_create(wpipe)) {
328		pipeclose(rpipe);
329		pipeclose(wpipe);
330		return (ENFILE);
331	}
332
333	rpipe->pipe_state |= PIPE_DIRECTOK;
334	wpipe->pipe_state |= PIPE_DIRECTOK;
335
336	error = falloc(td, &rf, &fd);
337	if (error) {
338		pipeclose(rpipe);
339		pipeclose(wpipe);
340		return (error);
341	}
342	/* An extra reference on `rf' has been held for us by falloc(). */
343	td->td_retval[0] = fd;
344
345	/*
346	 * Warning: once we've gotten past allocation of the fd for the
347	 * read-side, we can only drop the read side via fdrop() in order
348	 * to avoid races against processes which manage to dup() the read
349	 * side while we are blocked trying to allocate the write side.
350	 */
351	FILE_LOCK(rf);
352	rf->f_flag = FREAD | FWRITE;
353	rf->f_type = DTYPE_PIPE;
354	rf->f_data = rpipe;
355	rf->f_ops = &pipeops;
356	FILE_UNLOCK(rf);
357	error = falloc(td, &wf, &fd);
358	if (error) {
359		FILEDESC_LOCK(fdp);
360		if (fdp->fd_ofiles[td->td_retval[0]] == rf) {
361			fdp->fd_ofiles[td->td_retval[0]] = NULL;
362			fdunused(fdp, td->td_retval[0]);
363			FILEDESC_UNLOCK(fdp);
364			fdrop(rf, td);
365		} else {
366			FILEDESC_UNLOCK(fdp);
367		}
368		fdrop(rf, td);
369		/* rpipe has been closed by fdrop(). */
370		pipeclose(wpipe);
371		return (error);
372	}
373	/* An extra reference on `wf' has been held for us by falloc(). */
374	FILE_LOCK(wf);
375	wf->f_flag = FREAD | FWRITE;
376	wf->f_type = DTYPE_PIPE;
377	wf->f_data = wpipe;
378	wf->f_ops = &pipeops;
379	FILE_UNLOCK(wf);
380	fdrop(wf, td);
381	td->td_retval[1] = fd;
382	fdrop(rf, td);
383
384	return (0);
385}
386
387/*
388 * Allocate kva for the pipe circular buffer; the space is pageable.
389 * This routine will 'realloc' the size of a pipe safely: if the
390 * allocation fails, it retains the old buffer and returns ENOMEM
391 * to the caller.
392 */
393static int
394pipespace(cpipe, size)
395	struct pipe *cpipe;
396	int size;
397{
398	caddr_t buffer;
399	int error;
400	static int curfail = 0;
401	static struct timeval lastfail;
402
403	KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
404
405	size = round_page(size);
406	/*
407	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
408	 */
409	buffer = (caddr_t) vm_map_min(pipe_map);
410
411	/*
412	 * The map entry is, by default, pageable.
413	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
414	 */
415	error = vm_map_find(pipe_map, NULL, 0,
416		(vm_offset_t *) &buffer, size, 1,
417		VM_PROT_ALL, VM_PROT_ALL, 0);
418	if (error != KERN_SUCCESS) {
419		if (ppsratecheck(&lastfail, &curfail, 1))
420			printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
421		return (ENOMEM);
422	}
423
424	/* free old resources if we're resizing */
425	pipe_free_kmem(cpipe);
426	cpipe->pipe_buffer.buffer = buffer;
427	cpipe->pipe_buffer.size = size;
428	cpipe->pipe_buffer.in = 0;
429	cpipe->pipe_buffer.out = 0;
430	cpipe->pipe_buffer.cnt = 0;
431	atomic_add_int(&amountpipes, 1);
432	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);
433	return (0);
434}
435
436/*
437 * Initialize and allocate VM and memory for pipe.  The structure
438 * will start out zero'd from the ctor, so we just manage the kmem.
439 */
440static int
441pipe_create(pipe)
442	struct pipe *pipe;
443{
444	int error;
445
446	/*
447	 * Reduce to 1/4th pipe size if we're over our global max.
448	 */
449	if (amountpipekva > maxpipekva / 2)
450		error = pipespace(pipe, SMALL_PIPE_SIZE);
451	else
452		error = pipespace(pipe, PIPE_SIZE);
453	if (error)
454		return (error);
455
456	return (0);
457}
458
459/*
460 * lock a pipe for I/O, blocking other access
461 */
462static __inline int
463pipelock(cpipe, catch)
464	struct pipe *cpipe;
465	int catch;
466{
467	int error;
468
469	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
470	while (cpipe->pipe_state & PIPE_LOCKFL) {
471		cpipe->pipe_state |= PIPE_LWANT;
472		error = msleep(cpipe, PIPE_MTX(cpipe),
473		    catch ? (PRIBIO | PCATCH) : PRIBIO,
474		    "pipelk", 0);
475		if (error != 0)
476			return (error);
477	}
478	cpipe->pipe_state |= PIPE_LOCKFL;
479	return (0);
480}
481
482/*
483 * unlock a pipe I/O lock
484 */
485static __inline void
486pipeunlock(cpipe)
487	struct pipe *cpipe;
488{
489
490	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
491	cpipe->pipe_state &= ~PIPE_LOCKFL;
492	if (cpipe->pipe_state & PIPE_LWANT) {
493		cpipe->pipe_state &= ~PIPE_LWANT;
494		wakeup(cpipe);
495	}
496}
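
/*
 * Sketch of how the long-term I/O lock combines with the pipe mutex in
 * pipe_read() and pipe_write() (error handling omitted; this restates the
 * pattern used below rather than adding a new one):
 *
 *	PIPE_LOCK(cpipe);
 *	error = pipelock(cpipe, 1);	may sleep, honours PCATCH
 *	...
 *	PIPE_UNLOCK(cpipe);		drop the mutex around uiomove()
 *	error = uiomove(..., uio);
 *	PIPE_LOCK(cpipe);
 *	...
 *	pipeunlock(cpipe);		wakes any PIPE_LWANT waiters
 *	PIPE_UNLOCK(cpipe);
 */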
497
498static __inline void
499pipeselwakeup(cpipe)
500	struct pipe *cpipe;
501{
502
503	if (cpipe->pipe_state & PIPE_SEL) {
504		cpipe->pipe_state &= ~PIPE_SEL;
505		selwakeuppri(&cpipe->pipe_sel, PSOCK);
506	}
507	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
508		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
509	KNOTE(&cpipe->pipe_sel.si_note, 0);
510}
511
512/* ARGSUSED */
513static int
514pipe_read(fp, uio, active_cred, flags, td)
515	struct file *fp;
516	struct uio *uio;
517	struct ucred *active_cred;
518	struct thread *td;
519	int flags;
520{
521	struct pipe *rpipe = fp->f_data;
522	int error;
523	int nread = 0;
524	u_int size;
525
526	PIPE_LOCK(rpipe);
527	++rpipe->pipe_busy;
528	error = pipelock(rpipe, 1);
529	if (error)
530		goto unlocked_error;
531
532#ifdef MAC
533	error = mac_check_pipe_read(active_cred, rpipe->pipe_pair);
534	if (error)
535		goto locked_error;
536#endif
537
538	while (uio->uio_resid) {
539		/*
540		 * normal pipe buffer receive
541		 */
542		if (rpipe->pipe_buffer.cnt > 0) {
543			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
544			if (size > rpipe->pipe_buffer.cnt)
545				size = rpipe->pipe_buffer.cnt;
546			if (size > (u_int) uio->uio_resid)
547				size = (u_int) uio->uio_resid;
548
549			PIPE_UNLOCK(rpipe);
550			error = uiomove(
551			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
552			    size, uio);
553			PIPE_LOCK(rpipe);
554			if (error)
555				break;
556
557			rpipe->pipe_buffer.out += size;
558			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
559				rpipe->pipe_buffer.out = 0;
560
561			rpipe->pipe_buffer.cnt -= size;
562
563			/*
564			 * If there is no more to read in the pipe, reset
565			 * its pointers to the beginning.  This improves
566			 * cache hit stats.
567			 */
568			if (rpipe->pipe_buffer.cnt == 0) {
569				rpipe->pipe_buffer.in = 0;
570				rpipe->pipe_buffer.out = 0;
571			}
572			nread += size;
573#ifndef PIPE_NODIRECT
574		/*
575		 * Direct copy, bypassing a kernel buffer.
576		 */
577		} else if ((size = rpipe->pipe_map.cnt) &&
578			   (rpipe->pipe_state & PIPE_DIRECTW)) {
579			caddr_t	va;
580			if (size > (u_int) uio->uio_resid)
581				size = (u_int) uio->uio_resid;
582
583			va = (caddr_t) rpipe->pipe_map.kva +
584			    rpipe->pipe_map.pos;
585			PIPE_UNLOCK(rpipe);
586			error = uiomove(va, size, uio);
587			PIPE_LOCK(rpipe);
588			if (error)
589				break;
590			nread += size;
591			rpipe->pipe_map.pos += size;
592			rpipe->pipe_map.cnt -= size;
593			if (rpipe->pipe_map.cnt == 0) {
594				rpipe->pipe_state &= ~PIPE_DIRECTW;
595				wakeup(rpipe);
596			}
597#endif
598		} else {
599			/*
600			 * detect EOF condition
601			 * read returns 0 on EOF, no need to set error
602			 */
603			if (rpipe->pipe_state & PIPE_EOF)
604				break;
605
606			/*
607			 * If the "write-side" has been blocked, wake it up now.
608			 */
609			if (rpipe->pipe_state & PIPE_WANTW) {
610				rpipe->pipe_state &= ~PIPE_WANTW;
611				wakeup(rpipe);
612			}
613
614			/*
615			 * Break if some data was read.
616			 */
617			if (nread > 0)
618				break;
619
620			/*
621			 * Unlock the pipe buffer for our remaining processing.
622			 * We will either break out with an error or we will
623			 * sleep and relock to loop.
624			 */
625			pipeunlock(rpipe);
626
627			/*
628			 * Handle non-blocking mode operation or
629			 * wait for more data.
630			 */
631			if (fp->f_flag & FNONBLOCK) {
632				error = EAGAIN;
633			} else {
634				rpipe->pipe_state |= PIPE_WANTR;
635				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
636				    PRIBIO | PCATCH,
637				    "piperd", 0)) == 0)
638					error = pipelock(rpipe, 1);
639			}
640			if (error)
641				goto unlocked_error;
642		}
643	}
644#ifdef MAC
645locked_error:
646#endif
647	pipeunlock(rpipe);
648
649	/* XXX: should probably do this before getting any locks. */
650	if (error == 0)
651		vfs_timestamp(&rpipe->pipe_atime);
652unlocked_error:
653	--rpipe->pipe_busy;
654
655	/*
656	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
657	 */
658	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
659		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
660		wakeup(rpipe);
661	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
662		/*
663		 * Handle write blocking hysteresis.
664		 */
665		if (rpipe->pipe_state & PIPE_WANTW) {
666			rpipe->pipe_state &= ~PIPE_WANTW;
667			wakeup(rpipe);
668		}
669	}
670
671	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
672		pipeselwakeup(rpipe);
673
674	PIPE_UNLOCK(rpipe);
675	return (error);
676}
677
678#ifndef PIPE_NODIRECT
679/*
680 * Map the sending process's buffer into kernel space and wire it.
681 * This is similar to a physical write operation.
682 */
683static int
684pipe_build_write_buffer(wpipe, uio)
685	struct pipe *wpipe;
686	struct uio *uio;
687{
688	pmap_t pmap;
689	u_int size;
690	int i, j;
691	vm_offset_t addr, endaddr;
692
693	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
694
695	size = (u_int) uio->uio_iov->iov_len;
696	if (size > wpipe->pipe_buffer.size)
697		size = wpipe->pipe_buffer.size;
698
699	pmap = vmspace_pmap(curproc->p_vmspace);
700	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
701	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
702	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
703		/*
704		 * vm_fault_quick() can sleep.  Consequently,
705		 * vm_page_lock_queue() and vm_page_unlock_queue()
706		 * should not be performed outside of this loop.
707		 */
708	race:
709		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) {
710			vm_page_lock_queues();
711			for (j = 0; j < i; j++)
712				vm_page_unhold(wpipe->pipe_map.ms[j]);
713			vm_page_unlock_queues();
714			return (EFAULT);
715		}
716		wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr,
717		    VM_PROT_READ);
718		if (wpipe->pipe_map.ms[i] == NULL)
719			goto race;
720	}
721
722/*
723 * set up the control block
724 */
725	wpipe->pipe_map.npages = i;
726	wpipe->pipe_map.pos =
727	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
728	wpipe->pipe_map.cnt = size;
729
730/*
731 * and map the buffer
732 */
733	if (wpipe->pipe_map.kva == 0) {
734		/*
735		 * We need to allocate space for an extra page because the
736		 * address range might (will) span pages at times.
737		 */
738		wpipe->pipe_map.kva = kmem_alloc_nofault(kernel_map,
739			wpipe->pipe_buffer.size + PAGE_SIZE);
740		atomic_add_int(&amountpipekvawired,
741		    wpipe->pipe_buffer.size + PAGE_SIZE);
742	}
743	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
744		wpipe->pipe_map.npages);
745
746/*
747 * and update the uio data
748 */
749
750	uio->uio_iov->iov_len -= size;
751	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
752	if (uio->uio_iov->iov_len == 0)
753		uio->uio_iov++;
754	uio->uio_resid -= size;
755	uio->uio_offset += size;
756	return (0);
757}
758
759/*
760 * unmap and unwire the process buffer
761 */
762static void
763pipe_destroy_write_buffer(wpipe)
764	struct pipe *wpipe;
765{
766	int i;
767
768	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
769	if (wpipe->pipe_map.kva) {
770		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
771
772		if (amountpipekvawired > maxpipekvawired / 2) {
773			/* Conserve address space */
774			vm_offset_t kva = wpipe->pipe_map.kva;
775			wpipe->pipe_map.kva = 0;
776			kmem_free(kernel_map, kva,
777			    wpipe->pipe_buffer.size + PAGE_SIZE);
778			atomic_subtract_int(&amountpipekvawired,
779			    wpipe->pipe_buffer.size + PAGE_SIZE);
780		}
781	}
782	vm_page_lock_queues();
783	for (i = 0; i < wpipe->pipe_map.npages; i++) {
784		vm_page_unhold(wpipe->pipe_map.ms[i]);
785	}
786	vm_page_unlock_queues();
787	wpipe->pipe_map.npages = 0;
788}
789
790/*
791 * In the case of a signal, the writing process might go away.  This
792 * code copies the data into the circular buffer so that the source
793 * pages can be freed without loss of data.
794 */
795static void
796pipe_clone_write_buffer(wpipe)
797	struct pipe *wpipe;
798{
799	int size;
800	int pos;
801
802	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
803	size = wpipe->pipe_map.cnt;
804	pos = wpipe->pipe_map.pos;
805
806	wpipe->pipe_buffer.in = size;
807	wpipe->pipe_buffer.out = 0;
808	wpipe->pipe_buffer.cnt = size;
809	wpipe->pipe_state &= ~PIPE_DIRECTW;
810
811	PIPE_UNLOCK(wpipe);
812	bcopy((caddr_t) wpipe->pipe_map.kva + pos,
813	    wpipe->pipe_buffer.buffer, size);
814	pipe_destroy_write_buffer(wpipe);
815	PIPE_LOCK(wpipe);
816}
817
818/*
819 * This implements the pipe buffer write mechanism.  Note that only
820 * a direct write OR a normal pipe write can be pending at any given time.
821 * If there are any characters in the pipe buffer, the direct write will
822 * be deferred until the receiving process grabs all of the bytes from
823 * the pipe buffer.  Then the direct mapping write is set-up.
824 */
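
/*
 * In outline (a restatement of the code below, not an extra mechanism):
 *
 *	1. Wait until no other direct write is pending and the pipe buffer
 *	   has drained.
 *	2. Set PIPE_DIRECTW, then wire and map the user pages with
 *	   pipe_build_write_buffer().
 *	3. Sleep until the reader has consumed pipe_map.cnt bytes and
 *	   cleared PIPE_DIRECTW, or until a signal or EOF intervenes.
 *	4. If PIPE_DIRECTW is still set at the end (the signal case),
 *	   pipe_clone_write_buffer() copies the remainder into the pipe
 *	   buffer so the user pages can be released; otherwise
 *	   pipe_destroy_write_buffer() simply unwires them.
 */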
825static int
826pipe_direct_write(wpipe, uio)
827	struct pipe *wpipe;
828	struct uio *uio;
829{
830	int error;
831
832retry:
833	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
834	while (wpipe->pipe_state & PIPE_DIRECTW) {
835		if (wpipe->pipe_state & PIPE_WANTR) {
836			wpipe->pipe_state &= ~PIPE_WANTR;
837			wakeup(wpipe);
838		}
839		wpipe->pipe_state |= PIPE_WANTW;
840		error = msleep(wpipe, PIPE_MTX(wpipe),
841		    PRIBIO | PCATCH, "pipdww", 0);
842		if (error)
843			goto error1;
844		if (wpipe->pipe_state & PIPE_EOF) {
845			error = EPIPE;
846			goto error1;
847		}
848	}
849	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
850	if (wpipe->pipe_buffer.cnt > 0) {
851		if (wpipe->pipe_state & PIPE_WANTR) {
852			wpipe->pipe_state &= ~PIPE_WANTR;
853			wakeup(wpipe);
854		}
855
856		wpipe->pipe_state |= PIPE_WANTW;
857		error = msleep(wpipe, PIPE_MTX(wpipe),
858		    PRIBIO | PCATCH, "pipdwc", 0);
859		if (error)
860			goto error1;
861		if (wpipe->pipe_state & PIPE_EOF) {
862			error = EPIPE;
863			goto error1;
864		}
865		goto retry;
866	}
867
868	wpipe->pipe_state |= PIPE_DIRECTW;
869
870	pipelock(wpipe, 0);
871	PIPE_UNLOCK(wpipe);
872	error = pipe_build_write_buffer(wpipe, uio);
873	PIPE_LOCK(wpipe);
874	pipeunlock(wpipe);
875	if (error) {
876		wpipe->pipe_state &= ~PIPE_DIRECTW;
877		goto error1;
878	}
879
880	error = 0;
881	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
882		if (wpipe->pipe_state & PIPE_EOF) {
883			pipelock(wpipe, 0);
884			PIPE_UNLOCK(wpipe);
885			pipe_destroy_write_buffer(wpipe);
886			PIPE_LOCK(wpipe);
887			pipeselwakeup(wpipe);
888			pipeunlock(wpipe);
889			error = EPIPE;
890			goto error1;
891		}
892		if (wpipe->pipe_state & PIPE_WANTR) {
893			wpipe->pipe_state &= ~PIPE_WANTR;
894			wakeup(wpipe);
895		}
896		pipeselwakeup(wpipe);
897		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
898		    "pipdwt", 0);
899	}
900
901	pipelock(wpipe,0);
902	if (wpipe->pipe_state & PIPE_DIRECTW) {
903		/*
904		 * this bit of trickery substitutes a kernel buffer for
905		 * the process that might be going away.
906		 */
907		pipe_clone_write_buffer(wpipe);
908	} else {
909		PIPE_UNLOCK(wpipe);
910		pipe_destroy_write_buffer(wpipe);
911		PIPE_LOCK(wpipe);
912	}
913	pipeunlock(wpipe);
914	return (error);
915
916error1:
917	wakeup(wpipe);
918	return (error);
919}
920#endif
921
922static int
923pipe_write(fp, uio, active_cred, flags, td)
924	struct file *fp;
925	struct uio *uio;
926	struct ucred *active_cred;
927	struct thread *td;
928	int flags;
929{
930	int error = 0;
931	int orig_resid;
932	struct pipe *wpipe, *rpipe;
933
934	rpipe = fp->f_data;
935	wpipe = rpipe->pipe_peer;
936
937	PIPE_LOCK(rpipe);
938	/*
939	 * detect loss of pipe read side, issue SIGPIPE if lost.
940	 */
941	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
942		PIPE_UNLOCK(rpipe);
943		return (EPIPE);
944	}
945#ifdef MAC
946	error = mac_check_pipe_write(active_cred, wpipe->pipe_pair);
947	if (error) {
948		PIPE_UNLOCK(rpipe);
949		return (error);
950	}
951#endif
952	++wpipe->pipe_busy;
953
954	/*
955	 * If it is advantageous to resize the pipe buffer, do
956	 * so.
957	 */
958	if ((uio->uio_resid > PIPE_SIZE) &&
959		(amountpipekva < maxpipekva / 2) &&
960		(nbigpipe < LIMITBIGPIPES) &&
961		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
962		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
963		(wpipe->pipe_buffer.cnt == 0)) {
964
965		if ((error = pipelock(wpipe, 1)) == 0) {
966			PIPE_UNLOCK(wpipe);
967			if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
968				atomic_add_int(&nbigpipe, 1);
969			PIPE_LOCK(wpipe);
970			pipeunlock(wpipe);
971		}
972	}
973
974	/*
975	 * If an early error occurred, unbusy and return, waking up any pending
976	 * readers.
977	 */
978	if (error) {
979		--wpipe->pipe_busy;
980		if ((wpipe->pipe_busy == 0) &&
981		    (wpipe->pipe_state & PIPE_WANT)) {
982			wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
983			wakeup(wpipe);
984		}
985		PIPE_UNLOCK(rpipe);
986		return(error);
987	}
988
989	orig_resid = uio->uio_resid;
990
991	while (uio->uio_resid) {
992		int space;
993
994#ifndef PIPE_NODIRECT
995		/*
996		 * If the transfer is large, we can gain performance if
997		 * we do process-to-process copies directly.
998		 * If the write is non-blocking, we don't use the
999		 * direct write mechanism.
1000		 *
1001		 * The direct write mechanism will detect the reader going
1002		 * away on us.
1003		 */
1004		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
1005		    (fp->f_flag & FNONBLOCK) == 0 &&
1006		    amountpipekvawired + uio->uio_resid < maxpipekvawired) {
1007			error = pipe_direct_write(wpipe, uio);
1008			if (error)
1009				break;
1010			continue;
1011		}
1012#endif
1013
1014		/*
1015		 * Pipe buffered writes cannot be coincident with
1016		 * direct writes.  We wait until the currently executing
1017		 * direct write is completed before we start filling the
1018		 * pipe buffer.  We break out if a signal occurs or the
1019		 * reader goes away.
1020		 */
1021	retrywrite:
1022		while (wpipe->pipe_state & PIPE_DIRECTW) {
1023			if (wpipe->pipe_state & PIPE_WANTR) {
1024				wpipe->pipe_state &= ~PIPE_WANTR;
1025				wakeup(wpipe);
1026			}
1027			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
1028			    "pipbww", 0);
1029			if (wpipe->pipe_state & PIPE_EOF)
1030				break;
1031			if (error)
1032				break;
1033		}
1034		if (wpipe->pipe_state & PIPE_EOF) {
1035			error = EPIPE;
1036			break;
1037		}
1038
1039		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1040
1041		/* Writes of size <= PIPE_BUF must be atomic. */
1042		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
1043			space = 0;
1044
1045		if (space > 0) {
1046			if ((error = pipelock(wpipe,1)) == 0) {
1047				int size;	/* Transfer size */
1048				int segsize;	/* first segment to transfer */
1049
1050				/*
1051				 * It is possible for a direct write to
1052				 * slip in on us... handle it here...
1053				 */
1054				if (wpipe->pipe_state & PIPE_DIRECTW) {
1055					pipeunlock(wpipe);
1056					goto retrywrite;
1057				}
1058				/*
1059				 * If a process blocked in uiomove, our
1060				 * value for space might be bad.
1061				 *
1062				 * XXX will we be ok if the reader has gone
1063				 * away here?
1064				 */
1065				if (space > wpipe->pipe_buffer.size -
1066				    wpipe->pipe_buffer.cnt) {
1067					pipeunlock(wpipe);
1068					goto retrywrite;
1069				}
1070
1071				/*
1072				 * Transfer size is minimum of uio transfer
1073				 * and free space in pipe buffer.
1074				 */
1075				if (space > uio->uio_resid)
1076					size = uio->uio_resid;
1077				else
1078					size = space;
1079				/*
1080				 * First segment to transfer is minimum of
1081				 * transfer size and contiguous space in
1082				 * pipe buffer.  If first segment to transfer
1083				 * is less than the transfer size, we've got
1084				 * a wraparound in the buffer.
1085				 */
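				/*
				 * Worked example with illustrative numbers:
				 * with a 16384-byte buffer, in == 16000 and
				 * a 1000-byte transfer, segsize is clipped
				 * to 384, so the first uiomove() below fills
				 * the tail of the buffer and the second one
				 * copies the remaining 616 bytes to offset 0.
				 */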
1086				segsize = wpipe->pipe_buffer.size -
1087					wpipe->pipe_buffer.in;
1088				if (segsize > size)
1089					segsize = size;
1090
1091				/* Transfer first segment */
1092
1093				PIPE_UNLOCK(rpipe);
1094				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
1095						segsize, uio);
1096				PIPE_LOCK(rpipe);
1097
1098				if (error == 0 && segsize < size) {
1099					/*
1100					 * Transfer remaining part now, to
1101					 * support atomic writes.  Wraparound
1102					 * happened.
1103					 */
1104					if (wpipe->pipe_buffer.in + segsize !=
1105					    wpipe->pipe_buffer.size)
1106						panic("Expected pipe buffer "
1107						    "wraparound disappeared");
1108
1109					PIPE_UNLOCK(rpipe);
1110					error = uiomove(
1111					    &wpipe->pipe_buffer.buffer[0],
1112					    size - segsize, uio);
1113					PIPE_LOCK(rpipe);
1114				}
1115				if (error == 0) {
1116					wpipe->pipe_buffer.in += size;
1117					if (wpipe->pipe_buffer.in >=
1118					    wpipe->pipe_buffer.size) {
1119						if (wpipe->pipe_buffer.in !=
1120						    size - segsize +
1121						    wpipe->pipe_buffer.size)
1122							panic("Expected "
1123							    "wraparound bad");
1124						wpipe->pipe_buffer.in = size -
1125						    segsize;
1126					}
1127
1128					wpipe->pipe_buffer.cnt += size;
1129					if (wpipe->pipe_buffer.cnt >
1130					    wpipe->pipe_buffer.size)
1131						panic("Pipe buffer overflow");
1132
1133				}
1134				pipeunlock(wpipe);
1135			}
1136			if (error)
1137				break;
1138
1139		} else {
1140			/*
1141			 * If the "read-side" has been blocked, wake it up now.
1142			 */
1143			if (wpipe->pipe_state & PIPE_WANTR) {
1144				wpipe->pipe_state &= ~PIPE_WANTR;
1145				wakeup(wpipe);
1146			}
1147
1148			/*
1149			 * don't block on non-blocking I/O
1150			 */
1151			if (fp->f_flag & FNONBLOCK) {
1152				error = EAGAIN;
1153				break;
1154			}
1155
1156			/*
1157			 * We have no more space and have something to offer,
1158			 * wake up select/poll.
1159			 */
1160			pipeselwakeup(wpipe);
1161
1162			wpipe->pipe_state |= PIPE_WANTW;
1163			error = msleep(wpipe, PIPE_MTX(rpipe),
1164			    PRIBIO | PCATCH, "pipewr", 0);
1165			if (error != 0)
1166				break;
1167			/*
1168			 * If the read side wants to go away, we just issue a signal
1169			 * to ourselves.
1170			 */
1171			if (wpipe->pipe_state & PIPE_EOF) {
1172				error = EPIPE;
1173				break;
1174			}
1175		}
1176	}
1177
1178	--wpipe->pipe_busy;
1179
1180	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
1181		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
1182		wakeup(wpipe);
1183	} else if (wpipe->pipe_buffer.cnt > 0) {
1184		/*
1185		 * If we have put any characters in the buffer, we wake up
1186		 * the reader.
1187		 */
1188		if (wpipe->pipe_state & PIPE_WANTR) {
1189			wpipe->pipe_state &= ~PIPE_WANTR;
1190			wakeup(wpipe);
1191		}
1192	}
1193
1194	/*
1195	 * Don't return EPIPE if I/O was successful
1196	 */
1197	if ((wpipe->pipe_buffer.cnt == 0) &&
1198	    (uio->uio_resid == 0) &&
1199	    (error == EPIPE)) {
1200		error = 0;
1201	}
1202
1203	if (error == 0)
1204		vfs_timestamp(&wpipe->pipe_mtime);
1205
1206	/*
1207	 * We have something to offer,
1208	 * wake up select/poll.
1209	 */
1210	if (wpipe->pipe_buffer.cnt)
1211		pipeselwakeup(wpipe);
1212
1213	PIPE_UNLOCK(rpipe);
1214	return (error);
1215}
1216
1217/*
1218 * we implement a very minimal set of ioctls for compatibility with sockets.
1219 */
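
/*
 * For example (userland, illustrative only), FIONREAD below is what
 * services this kind of query; bytes_pending() and pfd are hypothetical
 * names:
 *
 *	#include <sys/ioctl.h>
 *
 *	int
 *	bytes_pending(int pfd)
 *	{
 *		int n;
 *
 *		if (ioctl(pfd, FIONREAD, &n) == -1)
 *			return (-1);
 *		return (n);
 *	}
 */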
1220static int
1221pipe_ioctl(fp, cmd, data, active_cred, td)
1222	struct file *fp;
1223	u_long cmd;
1224	void *data;
1225	struct ucred *active_cred;
1226	struct thread *td;
1227{
1228	struct pipe *mpipe = fp->f_data;
1229#ifdef MAC
1230	int error;
1231#endif
1232
1233	PIPE_LOCK(mpipe);
1234
1235#ifdef MAC
1236	error = mac_check_pipe_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
1237	if (error) {
1238		PIPE_UNLOCK(mpipe);
1239		return (error);
1240	}
1241#endif
1242
1243	switch (cmd) {
1244
1245	case FIONBIO:
1246		PIPE_UNLOCK(mpipe);
1247		return (0);
1248
1249	case FIOASYNC:
1250		if (*(int *)data) {
1251			mpipe->pipe_state |= PIPE_ASYNC;
1252		} else {
1253			mpipe->pipe_state &= ~PIPE_ASYNC;
1254		}
1255		PIPE_UNLOCK(mpipe);
1256		return (0);
1257
1258	case FIONREAD:
1259		if (mpipe->pipe_state & PIPE_DIRECTW)
1260			*(int *)data = mpipe->pipe_map.cnt;
1261		else
1262			*(int *)data = mpipe->pipe_buffer.cnt;
1263		PIPE_UNLOCK(mpipe);
1264		return (0);
1265
1266	case FIOSETOWN:
1267		PIPE_UNLOCK(mpipe);
1268		return (fsetown(*(int *)data, &mpipe->pipe_sigio));
1269
1270	case FIOGETOWN:
1271		PIPE_UNLOCK(mpipe);
1272		*(int *)data = fgetown(&mpipe->pipe_sigio);
1273		return (0);
1274
1275	/* This is deprecated, FIOSETOWN should be used instead. */
1276	case TIOCSPGRP:
1277		PIPE_UNLOCK(mpipe);
1278		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
1279
1280	/* This is deprecated, FIOGETOWN should be used instead. */
1281	case TIOCGPGRP:
1282		PIPE_UNLOCK(mpipe);
1283		*(int *)data = -fgetown(&mpipe->pipe_sigio);
1284		return (0);
1285
1286	}
1287	PIPE_UNLOCK(mpipe);
1288	return (ENOTTY);
1289}
1290
1291static int
1292pipe_poll(fp, events, active_cred, td)
1293	struct file *fp;
1294	int events;
1295	struct ucred *active_cred;
1296	struct thread *td;
1297{
1298	struct pipe *rpipe = fp->f_data;
1299	struct pipe *wpipe;
1300	int revents = 0;
1301#ifdef MAC
1302	int error;
1303#endif
1304
1305	wpipe = rpipe->pipe_peer;
1306	PIPE_LOCK(rpipe);
1307#ifdef MAC
1308	error = mac_check_pipe_poll(active_cred, rpipe->pipe_pair);
1309	if (error)
1310		goto locked_error;
1311#endif
1312	if (events & (POLLIN | POLLRDNORM))
1313		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
1314		    (rpipe->pipe_buffer.cnt > 0) ||
1315		    (rpipe->pipe_state & PIPE_EOF))
1316			revents |= events & (POLLIN | POLLRDNORM);
1317
1318	if (events & (POLLOUT | POLLWRNORM))
1319		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
1320		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
1321		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
1322			revents |= events & (POLLOUT | POLLWRNORM);
1323
1324	if ((rpipe->pipe_state & PIPE_EOF) ||
1325	    (wpipe == NULL) ||
1326	    (wpipe->pipe_state & PIPE_EOF))
1327		revents |= POLLHUP;
1328
1329	if (revents == 0) {
1330		if (events & (POLLIN | POLLRDNORM)) {
1331			selrecord(td, &rpipe->pipe_sel);
1332			rpipe->pipe_state |= PIPE_SEL;
1333		}
1334
1335		if (events & (POLLOUT | POLLWRNORM)) {
1336			selrecord(td, &wpipe->pipe_sel);
1337			wpipe->pipe_state |= PIPE_SEL;
1338		}
1339	}
1340#ifdef MAC
1341locked_error:
1342#endif
1343	PIPE_UNLOCK(rpipe);
1344
1345	return (revents);
1346}
1347
1348/*
1349 * We shouldn't need locks here as we're doing a read and this should
1350 * be a natural race.
1351 */
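
/*
 * From userland this is reached through fstat(2): st_size reports the
 * number of bytes currently queued in the pipe.  A sketch, with
 * pipe_bytes_queued() and pfd as hypothetical names:
 *
 *	#include <sys/types.h>
 *	#include <sys/stat.h>
 *
 *	off_t
 *	pipe_bytes_queued(int pfd)
 *	{
 *		struct stat sb;
 *
 *		if (fstat(pfd, &sb) == -1)
 *			return (-1);
 *		return (sb.st_size);
 *	}
 */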
1352static int
1353pipe_stat(fp, ub, active_cred, td)
1354	struct file *fp;
1355	struct stat *ub;
1356	struct ucred *active_cred;
1357	struct thread *td;
1358{
1359	struct pipe *pipe = fp->f_data;
1360#ifdef MAC
1361	int error;
1362
1363	PIPE_LOCK(pipe);
1364	error = mac_check_pipe_stat(active_cred, pipe->pipe_pair);
1365	PIPE_UNLOCK(pipe);
1366	if (error)
1367		return (error);
1368#endif
1369	bzero(ub, sizeof(*ub));
1370	ub->st_mode = S_IFIFO;
1371	ub->st_blksize = pipe->pipe_buffer.size;
1372	ub->st_size = pipe->pipe_buffer.cnt;
1373	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
1374	ub->st_atimespec = pipe->pipe_atime;
1375	ub->st_mtimespec = pipe->pipe_mtime;
1376	ub->st_ctimespec = pipe->pipe_ctime;
1377	ub->st_uid = fp->f_cred->cr_uid;
1378	ub->st_gid = fp->f_cred->cr_gid;
1379	/*
1380	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
1381	 * XXX (st_dev, st_ino) should be unique.
1382	 */
1383	return (0);
1384}
1385
1386/* ARGSUSED */
1387static int
1388pipe_close(fp, td)
1389	struct file *fp;
1390	struct thread *td;
1391{
1392	struct pipe *cpipe = fp->f_data;
1393
1394	fp->f_ops = &badfileops;
1395	fp->f_data = NULL;
1396	funsetown(&cpipe->pipe_sigio);
1397	pipeclose(cpipe);
1398	return (0);
1399}
1400
1401static void
1402pipe_free_kmem(cpipe)
1403	struct pipe *cpipe;
1404{
1405
1406	KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
1407	    ("pipe_free_kmem: pipe mutex locked"));
1408
1409	if (cpipe->pipe_buffer.buffer != NULL) {
1410		if (cpipe->pipe_buffer.size > PIPE_SIZE)
1411			atomic_subtract_int(&nbigpipe, 1);
1412		atomic_subtract_int(&amountpipekva, cpipe->pipe_buffer.size);
1413		atomic_subtract_int(&amountpipes, 1);
1414		vm_map_remove(pipe_map,
1415		    (vm_offset_t)cpipe->pipe_buffer.buffer,
1416		    (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
1417		cpipe->pipe_buffer.buffer = NULL;
1418	}
1419#ifndef PIPE_NODIRECT
1420	if (cpipe->pipe_map.kva != 0) {
1421		atomic_subtract_int(&amountpipekvawired,
1422		    cpipe->pipe_buffer.size + PAGE_SIZE);
1423		kmem_free(kernel_map,
1424			cpipe->pipe_map.kva,
1425			cpipe->pipe_buffer.size + PAGE_SIZE);
1426		cpipe->pipe_map.cnt = 0;
1427		cpipe->pipe_map.kva = 0;
1428		cpipe->pipe_map.pos = 0;
1429		cpipe->pipe_map.npages = 0;
1430	}
1431#endif
1432}
1433
1434/*
1435 * shutdown the pipe
1436 */
1437static void
1438pipeclose(cpipe)
1439	struct pipe *cpipe;
1440{
1441	struct pipepair *pp;
1442	struct pipe *ppipe;
1443	int hadpeer;
1444
1445	KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));
1446
1447	hadpeer = 0;
1448	PIPE_LOCK(cpipe);
1449	pp = cpipe->pipe_pair;
1450
1451	pipeselwakeup(cpipe);
1452
1453	/*
1454	 * If the other side is blocked, wake it up saying that
1455	 * we want to close it down.
1456	 */
1457	while (cpipe->pipe_busy) {
1458		wakeup(cpipe);
1459		cpipe->pipe_state |= PIPE_WANT | PIPE_EOF;
1460		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
1461	}
1462
1463
1464	/*
1465	 * Disconnect from peer, if any.
1466	 */
1467	ppipe = cpipe->pipe_peer;
1468	if (ppipe->pipe_present != 0) {
1469		hadpeer++;
1470		pipeselwakeup(ppipe);
1471
1472		ppipe->pipe_state |= PIPE_EOF;
1473		wakeup(ppipe);
1474		KNOTE(&ppipe->pipe_sel.si_note, 0);
1475	}
1476
1477	/*
1478	 * Mark this endpoint as free.  Release kmem resources.  We
1479	 * don't mark this endpoint as unused until we've finished
1480	 * doing that, or the pipe might disappear out from under
1481	 * us.
1482	 */
1483	PIPE_UNLOCK(cpipe);
1484	pipe_free_kmem(cpipe);
1485	PIPE_LOCK(cpipe);
1486	cpipe->pipe_present = 0;
1487
1488	/*
1489	 * If both endpoints are now closed, release the memory for the
1490	 * pipe pair.  If not, unlock.
1491	 */
1492	if (ppipe->pipe_present == 0) {
1493		PIPE_UNLOCK(cpipe);
1494#ifdef MAC
1495		mac_destroy_pipe(pp);
1496#endif
1497		uma_zfree(pipe_zone, cpipe->pipe_pair);
1498	} else
1499		PIPE_UNLOCK(cpipe);
1500}
1501
1502/*ARGSUSED*/
1503static int
1504pipe_kqfilter(struct file *fp, struct knote *kn)
1505{
1506	struct pipe *cpipe;
1507
1508	cpipe = kn->kn_fp->f_data;
1509	switch (kn->kn_filter) {
1510	case EVFILT_READ:
1511		kn->kn_fop = &pipe_rfiltops;
1512		break;
1513	case EVFILT_WRITE:
1514		kn->kn_fop = &pipe_wfiltops;
1515		cpipe = cpipe->pipe_peer;
1516		if (cpipe == NULL)
1517			/* other end of pipe has been closed */
1518			return (EPIPE);
1519		break;
1520	default:
1521		return (1);
1522	}
1523
1524	PIPE_LOCK(cpipe);
1525	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
1526	PIPE_UNLOCK(cpipe);
1527	return (0);
1528}
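
/*
 * A userland sketch (illustrative only) of registering for the read filter
 * attached above; wait_readable() and pfd are hypothetical names, and the
 * return value is the byte count reported by filt_piperead():
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *	#include <sys/time.h>
 *	#include <unistd.h>
 *
 *	int
 *	wait_readable(int pfd)
 *	{
 *		struct kevent kev, ev;
 *		int kq;
 *
 *		if ((kq = kqueue()) == -1)
 *			return (-1);
 *		EV_SET(&kev, pfd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *		if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1 ||
 *		    kevent(kq, NULL, 0, &ev, 1, NULL) != 1) {
 *			(void)close(kq);
 *			return (-1);
 *		}
 *		(void)close(kq);
 *		return ((int)ev.data);
 *	}
 */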
1529
1530static void
1531filt_pipedetach(struct knote *kn)
1532{
1533	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1534
1535	if (kn->kn_filter == EVFILT_WRITE) {
1536		if (cpipe->pipe_peer == NULL)
1537			return;
1538		cpipe = cpipe->pipe_peer;
1539	}
1540
1541	PIPE_LOCK(cpipe);
1542	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
1543	PIPE_UNLOCK(cpipe);
1544}
1545
1546/*ARGSUSED*/
1547static int
1548filt_piperead(struct knote *kn, long hint)
1549{
1550	struct pipe *rpipe = kn->kn_fp->f_data;
1551	struct pipe *wpipe = rpipe->pipe_peer;
1552
1553	PIPE_LOCK(rpipe);
1554	kn->kn_data = rpipe->pipe_buffer.cnt;
1555	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
1556		kn->kn_data = rpipe->pipe_map.cnt;
1557
1558	if ((rpipe->pipe_state & PIPE_EOF) ||
1559	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1560		kn->kn_flags |= EV_EOF;
1561		PIPE_UNLOCK(rpipe);
1562		return (1);
1563	}
1564	PIPE_UNLOCK(rpipe);
1565	return (kn->kn_data > 0);
1566}
1567
1568/*ARGSUSED*/
1569static int
1570filt_pipewrite(struct knote *kn, long hint)
1571{
1572	struct pipe *rpipe = kn->kn_fp->f_data;
1573	struct pipe *wpipe = rpipe->pipe_peer;
1574
1575	PIPE_LOCK(rpipe);
1576	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1577		kn->kn_data = 0;
1578		kn->kn_flags |= EV_EOF;
1579		PIPE_UNLOCK(rpipe);
1580		return (1);
1581	}
1582	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1583	if (wpipe->pipe_state & PIPE_DIRECTW)
1584		kn->kn_data = 0;
1585
1586	PIPE_UNLOCK(rpipe);
1587	return (kn->kn_data >= PIPE_BUF);
1588}
1589