sys_pipe.c revision 76754
/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $FreeBSD: head/sys/kern/sys_pipe.c 76754 2001-05-17 17:59:28Z alfred $
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but it does everything that pipes normally
 * do.
 */

/*
 * This code has two modes of operation, a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the amount of data being written is less than
 * PIPE_MINDIRECT, the "normal" kernel-buffered path is used.  If the
 * write is between PIPE_MINDIRECT and PIPE_SIZE in size, the writer's
 * buffer is fully mapped and wired into the kernel, and the receiving
 * process can copy the data directly from the pages of the sending
 * process.
 *
 * If the sending process receives a signal, it is possible that it will
 * go away, and certainly its address space can change, because control
 * is returned back to the user-mode side.  In that case, the pipe code
 * arranges to copy the buffer supplied by the user process to a pageable
 * kernel buffer, and the receiving process will grab the data from the
 * pageable kernel buffer.  Since signals don't happen all that often,
 * the copy operation is normally eliminated.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.  PIPE_SIZE is constrained by the
 * amount of kernel virtual memory.
 */
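
/*
 * Illustrative userland view of the two modes (a sketch, not part of the
 * kernel interface; buffer sizes and fd names here are arbitrary):
 *
 *	int fds[2];
 *	char big[64 * 1024], small[64];
 *
 *	pipe(fds);
 *	write(fds[1], big, sizeof(big));	// >= PIPE_MINDIRECT: eligible
 *						// for the direct (page-wired)
 *						// path on a blocking fd
 *	write(fds[1], small, sizeof(small));	// < PIPE_MINDIRECT: copied
 *						// through the kernel buffer
 */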

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/lock.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/selinfo.h>
#include <sys/signalvar.h>
#include <sys/sysproto.h>
#include <sys/pipe.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/event.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_zone.h>

/*
 * Use this define if you want to disable *fancy* VM things.  Expect an
 * approx 30% decrease in transfer rate.  This could be useful for
 * NetBSD or OpenBSD.
 */
/* #define PIPE_NODIRECT */

/*
 * interfaces to the outside world
 */
static int pipe_read __P((struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct proc *p));
static int pipe_write __P((struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct proc *p));
static int pipe_close __P((struct file *fp, struct proc *p));
static int pipe_poll __P((struct file *fp, int events, struct ucred *cred,
		struct proc *p));
static int pipe_kqfilter __P((struct file *fp, struct knote *kn));
static int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data,
		struct proc *p));

static struct fileops pipeops = {
	pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter,
	pipe_stat, pipe_close
};

static void	filt_pipedetach(struct knote *kn);
static int	filt_piperead(struct knote *kn, long hint);
static int	filt_pipewrite(struct knote *kn, long hint);

static struct filterops pipe_rfiltops =
	{ 1, NULL, filt_pipedetach, filt_piperead };
static struct filterops pipe_wfiltops =
	{ 1, NULL, filt_pipedetach, filt_pipewrite };

/*
 * Default pipe buffer size(s); this can be kind of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

/*
 * Maximum amount of kva for pipes -- this is kind of a soft limit, but
 * it is there so that we don't exhaust kva on large systems.
 */
#define MAXPIPEKVA (8*1024*1024)

/*
 * Limit for direct transfers; we cannot, of course, limit the total
 * amount of kva for pipes in general, though.
 */
#define LIMITPIPEKVA (16*1024*1024)

/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES	32
static int nbigpipe;

static int amountpipekva;
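
/*
 * Accounting notes (derived from the uses below): amountpipekva counts
 * both circular-buffer kva (pipespace()/pipe_free_kmem()) and cached
 * direct-write kva (pipe_build_write_buffer()).  LIMITPIPEKVA stops new
 * direct-write mappings from being set up, while exceeding MAXPIPEKVA
 * causes pipe_destroy_write_buffer() to release a pipe's cached kva
 * instead of keeping it for reuse.
 */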

static void pipeclose __P((struct pipe *cpipe));
static void pipe_free_kmem __P((struct pipe *cpipe));
static int pipe_create __P((struct pipe **cpipep));
static __inline int pipelock __P((struct pipe *cpipe, int catch));
static __inline void pipeunlock __P((struct pipe *cpipe));
static __inline void pipeselwakeup __P((struct pipe *cpipe));
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
static void pipe_destroy_write_buffer __P((struct pipe *wpipe));
static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
static void pipe_clone_write_buffer __P((struct pipe *wpipe));
#endif
static int pipespace __P((struct pipe *cpipe, int size));

static vm_zone_t pipe_zone;

/*
 * The pipe system call for the DTYPE_PIPE type of pipes
 */

/* ARGSUSED */
int
pipe(p, uap)
	struct proc *p;
	struct pipe_args /* {
		int	dummy;
	} */ *uap;
{
	struct filedesc *fdp = p->p_fd;
	struct file *rf, *wf;
	struct pipe *rpipe, *wpipe;
	int fd, error;

	if (pipe_zone == NULL)
		pipe_zone = zinit("PIPE", sizeof (struct pipe), 0, 0, 4);

	if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (ENFILE);
	}

	rpipe->pipe_state |= PIPE_DIRECTOK;
	wpipe->pipe_state |= PIPE_DIRECTOK;

	error = falloc(p, &rf, &fd);
	if (error) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (error);
	}
	fhold(rf);
	p->p_retval[0] = fd;

	/*
	 * Warning: once we've gotten past allocation of the fd for the
	 * read-side, we can only drop the read side via fdrop() in order
	 * to avoid races against processes which manage to dup() the read
	 * side while we are blocked trying to allocate the write side.
	 */
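	/*
	 * Note: both file descriptors are set up FREAD|FWRITE below, so
	 * these pipes are bidirectional at this layer; the usual userland
	 * convention of fds[0] as the read end and fds[1] as the write
	 * end is just that, a convention.
	 */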
	rf->f_flag = FREAD | FWRITE;
	rf->f_type = DTYPE_PIPE;
	rf->f_data = (caddr_t)rpipe;
	rf->f_ops = &pipeops;
	error = falloc(p, &wf, &fd);
	if (error) {
		if (fdp->fd_ofiles[p->p_retval[0]] == rf) {
			fdp->fd_ofiles[p->p_retval[0]] = NULL;
			fdrop(rf, p);
		}
		fdrop(rf, p);
		/* rpipe has been closed by fdrop(). */
		pipeclose(wpipe);
		return (error);
	}
	wf->f_flag = FREAD | FWRITE;
	wf->f_type = DTYPE_PIPE;
	wf->f_data = (caddr_t)wpipe;
	wf->f_ops = &pipeops;
	p->p_retval[1] = fd;

	rpipe->pipe_peer = wpipe;
	wpipe->pipe_peer = rpipe;
	fdrop(rf, p);

	return (0);
}

/*
 * Allocate kva for the pipe circular buffer; the space is pageable.
 * This routine will 'realloc' the size of a pipe safely: if it fails,
 * it will retain the old buffer and return ENOMEM.
 */
static int
pipespace(cpipe, size)
	struct pipe *cpipe;
	int size;
{
	struct vm_object *object;
	caddr_t buffer;
	int npages, error;

	npages = round_page(size)/PAGE_SIZE;
	/*
	 * Create an object, I don't like the idea of paging to/from
	 * kernel_object.
	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
	 */
	object = vm_object_allocate(OBJT_DEFAULT, npages);
	buffer = (caddr_t) vm_map_min(kernel_map);

	/*
	 * Insert the object into the kernel map, and allocate kva for it.
	 * The map entry is, by default, pageable.
	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
	 */
	error = vm_map_find(kernel_map, object, 0,
		(vm_offset_t *) &buffer, size, 1,
		VM_PROT_ALL, VM_PROT_ALL, 0);

	if (error != KERN_SUCCESS) {
		vm_object_deallocate(object);
		return (ENOMEM);
	}

	/* free old resources if we're resizing */
	pipe_free_kmem(cpipe);
	cpipe->pipe_buffer.object = object;
	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = 0;
	amountpipekva += cpipe->pipe_buffer.size;
	return (0);
}

/*
 * initialize and allocate VM and memory for pipe
 */
static int
pipe_create(cpipep)
	struct pipe **cpipep;
{
	struct pipe *cpipe;
	int error;

	*cpipep = zalloc(pipe_zone);
	if (*cpipep == NULL)
		return (ENOMEM);

	cpipe = *cpipep;

	/* so pipespace()->pipe_free_kmem() doesn't follow junk pointer */
	cpipe->pipe_buffer.object = NULL;
#ifndef PIPE_NODIRECT
	cpipe->pipe_map.kva = 0;
#endif
	/*
	 * protect so pipeclose() doesn't follow a junk pointer
	 * if pipespace() fails.
	 */
	bzero(&cpipe->pipe_sel, sizeof(cpipe->pipe_sel));
	cpipe->pipe_state = 0;
	cpipe->pipe_peer = NULL;
	cpipe->pipe_busy = 0;

#ifndef PIPE_NODIRECT
	/*
	 * pipe data structure initializations to support direct pipe I/O
	 */
	cpipe->pipe_map.cnt = 0;
	cpipe->pipe_map.kva = 0;
	cpipe->pipe_map.pos = 0;
	cpipe->pipe_map.npages = 0;
	/* cpipe->pipe_map.ms[] = invalid */
#endif

	error = pipespace(cpipe, PIPE_SIZE);
	if (error)
		return (error);

	vfs_timestamp(&cpipe->pipe_ctime);
	cpipe->pipe_atime = cpipe->pipe_ctime;
	cpipe->pipe_mtime = cpipe->pipe_ctime;

	return (0);
}

/*
 * lock a pipe for I/O, blocking other access
 */
static __inline int
pipelock(cpipe, catch)
	struct pipe *cpipe;
	int catch;
{
	int error;

	while (cpipe->pipe_state & PIPE_LOCK) {
		cpipe->pipe_state |= PIPE_LWANT;
		error = tsleep(cpipe, catch ? (PRIBIO | PCATCH) : PRIBIO,
		    "pipelk", 0);
		if (error != 0)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCK;
	return (0);
}
358
359/*
360 * unlock a pipe I/O lock
361 */
362static __inline void
363pipeunlock(cpipe)
364	struct pipe *cpipe;
365{
366
367	cpipe->pipe_state &= ~PIPE_LOCK;
368	if (cpipe->pipe_state & PIPE_LWANT) {
369		cpipe->pipe_state &= ~PIPE_LWANT;
370		wakeup(cpipe);
371	}
372}
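
/*
 * Typical usage of the I/O lock (a summary of the pattern used by
 * pipe_read() and pipe_write() below, not a new interface):
 *
 *	if ((error = pipelock(cpipe, 1)) == 0) {
 *		... manipulate pipe_buffer ...
 *		pipeunlock(cpipe);
 *	}
 *
 * The lock is dropped before sleeping for the peer; pipe_read(), for
 * instance, unlocks before its "piperd" tsleep() and relocks after.
 */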

static __inline void
pipeselwakeup(cpipe)
	struct pipe *cpipe;
{

	if (cpipe->pipe_state & PIPE_SEL) {
		cpipe->pipe_state &= ~PIPE_SEL;
		selwakeup(&cpipe->pipe_sel);
	}
	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
		pgsigio(cpipe->pipe_sigio, SIGIO, 0);
	KNOTE(&cpipe->pipe_sel.si_note, 0);
}

/* ARGSUSED */
static int
pipe_read(fp, uio, cred, flags, p)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
	int flags;
	struct proc *p;
{
	struct pipe *rpipe = (struct pipe *) fp->f_data;
	int error;
	int nread = 0;
	u_int size;

	++rpipe->pipe_busy;
	error = pipelock(rpipe, 1);
	if (error)
		goto unlocked_error;

	while (uio->uio_resid) {
		/*
		 * normal pipe buffer receive
		 */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
					size, uio);
			if (error)
				break;
			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
#ifndef PIPE_NODIRECT
		/*
		 * Direct copy, bypassing a kernel buffer.
		 */
		} else if ((size = rpipe->pipe_map.cnt) &&
			   (rpipe->pipe_state & PIPE_DIRECTW)) {
			caddr_t	va;

			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			va = (caddr_t) rpipe->pipe_map.kva + rpipe->pipe_map.pos;
			error = uiomove(va, size, uio);
			if (error)
				break;
			nread += size;
			rpipe->pipe_map.pos += size;
			rpipe->pipe_map.cnt -= size;
			if (rpipe->pipe_map.cnt == 0) {
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				wakeup(rpipe);
			}
#endif
		} else {
			/*
			 * detect EOF condition
			 */
			if (rpipe->pipe_state & PIPE_EOF) {
				/* XXX error = ? */
				break;
			}

			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/*
			 * Break if some data was read.
			 */
			if (nread > 0)
				break;

			/*
			 * Unlock the pipe buffer for our remaining processing.
			 * We will either break out with an error or we will
			 * sleep and relock to loop.
			 */
			pipeunlock(rpipe);

			/*
			 * Handle non-blocking mode operation or
			 * wait for more data.
			 */
			if (fp->f_flag & FNONBLOCK)
				error = EAGAIN;
			else {
				rpipe->pipe_state |= PIPE_WANTR;
				if ((error = tsleep(rpipe, PRIBIO | PCATCH,
				    "piperd", 0)) == 0)
					error = pipelock(rpipe, 1);
			}
			if (error)
				goto unlocked_error;
		}
	}
	pipeunlock(rpipe);

	if (error == 0)
		vfs_timestamp(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	/*
	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
	 */
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * Handle write blocking hysteresis.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
		pipeselwakeup(rpipe);

	return (error);
}

#ifndef PIPE_NODIRECT
/*
 * Map the sending process's buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 */
static int
pipe_build_write_buffer(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	u_int size;
	int i;
	vm_offset_t addr, endaddr, paddr;

	size = (u_int) uio->uio_iov->iov_len;
	if (size > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;

	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
	for (i = 0, addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
	     addr < endaddr;
	     addr += PAGE_SIZE, i++) {

		vm_page_t m;

		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
		    (paddr = pmap_kextract(addr)) == 0) {
			int j;

			for (j = 0; j < i; j++)
				vm_page_unwire(wpipe->pipe_map.ms[j], 1);
			return (EFAULT);
		}

		m = PHYS_TO_VM_PAGE(paddr);
		vm_page_wire(m);
		wpipe->pipe_map.ms[i] = m;
	}

	/*
	 * set up the control block
	 */
	wpipe->pipe_map.npages = i;
	wpipe->pipe_map.pos =
	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
	wpipe->pipe_map.cnt = size;

	/*
	 * and map the buffer
	 */
	if (wpipe->pipe_map.kva == 0) {
		/*
		 * We need to allocate space for an extra page because the
		 * address range might (will) span pages at times.
		 */
		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
			wpipe->pipe_buffer.size + PAGE_SIZE);
		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
	}
	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
		wpipe->pipe_map.npages);

	/*
	 * and update the uio data
	 */
	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base += size;
	if (uio->uio_iov->iov_len == 0)
		uio->uio_iov++;
	uio->uio_resid -= size;
	uio->uio_offset += size;
	return (0);
}

/*
 * unmap and unwire the process buffer
 */
static void
pipe_destroy_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int i;

	if (wpipe->pipe_map.kva) {
		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);

		if (amountpipekva > MAXPIPEKVA) {
			vm_offset_t kva = wpipe->pipe_map.kva;

			wpipe->pipe_map.kva = 0;
			kmem_free(kernel_map, kva,
				wpipe->pipe_buffer.size + PAGE_SIZE);
			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
		}
	}
	for (i = 0; i < wpipe->pipe_map.npages; i++)
		vm_page_unwire(wpipe->pipe_map.ms[i], 1);
}

/*
 * In the case of a signal, the writing process might go away.  This
 * code copies the data into the circular buffer so that the source
 * pages can be freed without loss of data.
 */
static void
pipe_clone_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int size;
	int pos;

	size = wpipe->pipe_map.cnt;
	pos = wpipe->pipe_map.pos;
	bcopy((caddr_t) wpipe->pipe_map.kva + pos,
	    (caddr_t) wpipe->pipe_buffer.buffer, size);

	wpipe->pipe_buffer.in = size;
	wpipe->pipe_buffer.out = 0;
	wpipe->pipe_buffer.cnt = size;
	wpipe->pipe_state &= ~PIPE_DIRECTW;

	pipe_destroy_write_buffer(wpipe);
}

/*
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set up.
 */
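/*
 * Lifecycle of PIPE_DIRECTW, as implemented below: the writer sets it
 * and then wires and maps the source pages; the reader clears it (and
 * wakes the writer) once pipe_map.cnt drains to zero.  If the writer is
 * interrupted by a signal while data remains, pipe_clone_write_buffer()
 * clears it after copying the remainder into the circular buffer; on
 * reader EOF the mapping is simply torn down.
 */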
static int
pipe_direct_write(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	int error;

retry:
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	if (wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}

		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
		goto retry;
	}

	wpipe->pipe_state |= PIPE_DIRECTW;

	error = pipe_build_write_buffer(wpipe, uio);
	if (error) {
		wpipe->pipe_state &= ~PIPE_DIRECTW;
		goto error1;
	}

	error = 0;
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			pipelock(wpipe, 0);
			pipe_destroy_write_buffer(wpipe);
			pipeunlock(wpipe);
			pipeselwakeup(wpipe);
			error = EPIPE;
			goto error1;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe);
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0);
	}

	pipelock(wpipe, 0);
	if (wpipe->pipe_state & PIPE_DIRECTW) {
		/*
		 * this bit of trickery substitutes a kernel buffer for
		 * the process that might be going away.
		 */
		pipe_clone_write_buffer(wpipe);
	} else {
		pipe_destroy_write_buffer(wpipe);
	}
	pipeunlock(wpipe);
	return (error);

error1:
	wakeup(wpipe);
	return (error);
}
#endif

static int
pipe_write(fp, uio, cred, flags, p)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
	int flags;
	struct proc *p;
{
	int error = 0;
	int orig_resid;
	struct pipe *wpipe, *rpipe;

	rpipe = (struct pipe *) fp->f_data;
	wpipe = rpipe->pipe_peer;

	/*
	 * detect loss of pipe read side, issue SIGPIPE if lost.
	 */
	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		return (EPIPE);
	}

	/*
	 * If it is advantageous to resize the pipe buffer, do
	 * so.
	 */
	if ((uio->uio_resid > PIPE_SIZE) &&
		(nbigpipe < LIMITBIGPIPES) &&
		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
		(wpipe->pipe_buffer.cnt == 0)) {

		if ((error = pipelock(wpipe, 1)) == 0) {
			if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
				nbigpipe++;
			pipeunlock(wpipe);
		} else {
			return (error);
		}
	}

	KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone"));

	++wpipe->pipe_busy;
	orig_resid = uio->uio_resid;
	while (uio->uio_resid) {
		int space;
#ifndef PIPE_NODIRECT
		/*
		 * If the transfer is large, we can gain performance if
		 * we do process-to-process copies directly.
		 * If the write is non-blocking, we don't use the
		 * direct write mechanism.
		 *
		 * The direct write mechanism will detect the reader going
		 * away on us.
		 */
		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
		    (fp->f_flag & FNONBLOCK) == 0 &&
		    (wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA))) {
			error = pipe_direct_write(wpipe, uio);
			if (error)
				break;
			continue;
		}
#endif

		/*
		 * Pipe buffered writes cannot be coincidental with
		 * direct writes.  We wait until the currently executing
		 * direct write is completed before we start filling the
		 * pipe buffer.  We break out if a signal occurs or the
		 * reader goes away.
		 */
	retrywrite:
		while (wpipe->pipe_state & PIPE_DIRECTW) {
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}
			error = tsleep(wpipe, PRIBIO | PCATCH, "pipbww", 0);
			if (wpipe->pipe_state & PIPE_EOF)
				break;
			if (error)
				break;
		}
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			break;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
			space = 0;

		if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) {
			if ((error = pipelock(wpipe, 1)) == 0) {
				int size;	/* Transfer size */
				int segsize;	/* first segment to transfer */

				/*
				 * It is possible for a direct write to
				 * slip in on us... handle it here...
				 */
				if (wpipe->pipe_state & PIPE_DIRECTW) {
					pipeunlock(wpipe);
					goto retrywrite;
				}
				/*
				 * If a process blocked in uiomove, our
				 * value for space might be bad.
				 *
				 * XXX will we be ok if the reader has gone
				 * away here?
				 */
				if (space > wpipe->pipe_buffer.size -
				    wpipe->pipe_buffer.cnt) {
					pipeunlock(wpipe);
					goto retrywrite;
				}

				/*
				 * Transfer size is minimum of uio transfer
				 * and free space in pipe buffer.
				 */
				if (space > uio->uio_resid)
					size = uio->uio_resid;
				else
					size = space;
				/*
				 * First segment to transfer is minimum of
				 * transfer size and contiguous space in
				 * pipe buffer.  If first segment to transfer
				 * is less than the transfer size, we've got
				 * a wraparound in the buffer.
				 */
				segsize = wpipe->pipe_buffer.size -
					wpipe->pipe_buffer.in;
				if (segsize > size)
					segsize = size;

				/* Transfer first segment */
				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
						segsize, uio);

				if (error == 0 && segsize < size) {
					/*
					 * Transfer remaining part now, to
					 * support atomic writes.  Wraparound
					 * happened.
					 */
					if (wpipe->pipe_buffer.in + segsize !=
					    wpipe->pipe_buffer.size)
						panic("Expected pipe buffer wraparound disappeared");

					error = uiomove(&wpipe->pipe_buffer.buffer[0],
							size - segsize, uio);
				}
				if (error == 0) {
					wpipe->pipe_buffer.in += size;
					if (wpipe->pipe_buffer.in >=
					    wpipe->pipe_buffer.size) {
						if (wpipe->pipe_buffer.in !=
						    size - segsize +
						    wpipe->pipe_buffer.size)
							panic("Expected wraparound bad");
						wpipe->pipe_buffer.in =
						    size - segsize;
					}

					wpipe->pipe_buffer.cnt += size;
					if (wpipe->pipe_buffer.cnt >
					    wpipe->pipe_buffer.size)
						panic("Pipe buffer overflow");

				}
				pipeunlock(wpipe);
			}
			if (error)
				break;

		} else {
			/*
			 * If the "read-side" has been blocked, wake it up now.
			 */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/*
			 * don't block on non-blocking I/O
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			error = tsleep(wpipe, (PRIBIO + 1) | PCATCH,
			    "pipewr", 0);
			if (error != 0)
				break;
			/*
			 * If read side wants to go away, we just issue a
			 * signal to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}

	--wpipe->pipe_busy;
	if ((wpipe->pipe_busy == 0) &&
		(wpipe->pipe_state & PIPE_WANT)) {
		wpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTR);
		wakeup(wpipe);
	} else if (wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/*
	 * Don't return EPIPE if I/O was successful
	 */
	if ((wpipe->pipe_buffer.cnt == 0) &&
		(uio->uio_resid == 0) &&
		(error == EPIPE))
		error = 0;

	if (error == 0)
		vfs_timestamp(&wpipe->pipe_mtime);

	/*
	 * We have something to offer,
	 * wake up select/poll.
	 */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);

	return (error);
}

/*
 * We implement a very minimal set of ioctls for compatibility with sockets.
 */
static int
pipe_ioctl(fp, cmd, data, p)
	struct file *fp;
	u_long cmd;
	caddr_t data;
	struct proc *p;
{
	struct pipe *mpipe = (struct pipe *)fp->f_data;

	switch (cmd) {

	case FIONBIO:
		return (0);

	case FIOASYNC:
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		return (0);

	case FIONREAD:
		if (mpipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = mpipe->pipe_map.cnt;
		else
			*(int *)data = mpipe->pipe_buffer.cnt;
		return (0);

	case FIOSETOWN:
		return (fsetown(*(int *)data, &mpipe->pipe_sigio));

	case FIOGETOWN:
		*(int *)data = fgetown(mpipe->pipe_sigio);
		return (0);

	/* This is deprecated, FIOSETOWN should be used instead. */
	case TIOCSPGRP:
		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));

	/* This is deprecated, FIOGETOWN should be used instead. */
	case TIOCGPGRP:
		*(int *)data = -fgetown(mpipe->pipe_sigio);
		return (0);

	}
	return (ENOTTY);
}
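
/*
 * Illustrative userland use of FIONREAD on a pipe (a sketch; variable
 * names are arbitrary).  Note that during a direct write the count
 * reported above comes from pipe_map.cnt rather than the circular
 * buffer:
 *
 *	int nbytes;
 *
 *	if (ioctl(fds[0], FIONREAD, &nbytes) == 0)
 *		printf("%d bytes ready\n", nbytes);
 */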

static int
pipe_poll(fp, events, cred, p)
	struct file *fp;
	int events;
	struct ucred *cred;
	struct proc *p;
{
	struct pipe *rpipe = (struct pipe *)fp->f_data;
	struct pipe *wpipe;
	int revents = 0;

	wpipe = rpipe->pipe_peer;
	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
		    (rpipe->pipe_buffer.cnt > 0) ||
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
			revents |= events & (POLLOUT | POLLWRNORM);

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) ||
	    (wpipe->pipe_state & PIPE_EOF))
		revents |= POLLHUP;

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(p, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(p, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}

	return (revents);
}
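
/*
 * Illustrative userland poll(2) on the read end (a sketch; names are
 * arbitrary).  POLLHUP is reported once the peer has gone away:
 *
 *	struct pollfd pfd;
 *
 *	pfd.fd = fds[0];
 *	pfd.events = POLLIN;
 *	if (poll(&pfd, 1, INFTIM) > 0 && (pfd.revents & POLLIN))
 *		... data (or EOF) is available to read ...
 */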

static int
pipe_stat(fp, ub, p)
	struct file *fp;
	struct stat *ub;
	struct proc *p;
{
	struct pipe *pipe = (struct pipe *)fp->f_data;

	bzero((caddr_t)ub, sizeof (*ub));
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atimespec = pipe->pipe_atime;
	ub->st_mtimespec = pipe->pipe_mtime;
	ub->st_ctimespec = pipe->pipe_ctime;
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return (0);
}

/* ARGSUSED */
static int
pipe_close(fp, p)
	struct file *fp;
	struct proc *p;
{
	struct pipe *cpipe = (struct pipe *)fp->f_data;

	fp->f_ops = &badfileops;
	fp->f_data = NULL;
	funsetown(cpipe->pipe_sigio);
	pipeclose(cpipe);
	return (0);
}

static void
pipe_free_kmem(cpipe)
	struct pipe *cpipe;
{

	if (cpipe->pipe_buffer.buffer != NULL) {
		if (cpipe->pipe_buffer.size > PIPE_SIZE)
			--nbigpipe;
		amountpipekva -= cpipe->pipe_buffer.size;
		kmem_free(kernel_map,
			(vm_offset_t)cpipe->pipe_buffer.buffer,
			cpipe->pipe_buffer.size);
		cpipe->pipe_buffer.buffer = NULL;
	}
#ifndef PIPE_NODIRECT
	if (cpipe->pipe_map.kva != 0) {
		amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
		kmem_free(kernel_map,
			cpipe->pipe_map.kva,
			cpipe->pipe_buffer.size + PAGE_SIZE);
		cpipe->pipe_map.cnt = 0;
		cpipe->pipe_map.kva = 0;
		cpipe->pipe_map.pos = 0;
		cpipe->pipe_map.npages = 0;
	}
#endif
}

/*
 * shutdown the pipe
 */
static void
pipeclose(cpipe)
	struct pipe *cpipe;
{
	struct pipe *ppipe;

	if (cpipe) {

		pipeselwakeup(cpipe);

		/*
		 * If the other side is blocked, wake it up saying that
		 * we want to close it down.
		 */
		while (cpipe->pipe_busy) {
			wakeup(cpipe);
			cpipe->pipe_state |= PIPE_WANT | PIPE_EOF;
			tsleep(cpipe, PRIBIO, "pipecl", 0);
		}

		/*
		 * Disconnect from peer
		 */
		if ((ppipe = cpipe->pipe_peer) != NULL) {
			pipeselwakeup(ppipe);

			ppipe->pipe_state |= PIPE_EOF;
			wakeup(ppipe);
			ppipe->pipe_peer = NULL;
		}

		/*
		 * free resources
		 */
		pipe_free_kmem(cpipe);
		zfree(pipe_zone, cpipe);
	}
}

/*ARGSUSED*/
static int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &pipe_wfiltops;
		break;
	default:
		return (1);
	}

	SLIST_INSERT_HEAD(&rpipe->pipe_sel.si_note, kn, kn_selnext);
	return (0);
}
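
/*
 * Illustrative userland registration of a read filter on a pipe (a
 * sketch; names are arbitrary).  The knote is attached to the pipe's
 * selinfo list above and fired from pipeselwakeup():
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *
 *	EV_SET(&kev, fds[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 */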

static void
filt_pipedetach(struct knote *kn)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;

	SLIST_REMOVE(&rpipe->pipe_sel.si_note, kn, knote, kn_selnext);
}

/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	kn->kn_data = rpipe->pipe_buffer.cnt;
	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
		kn->kn_data = rpipe->pipe_map.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	return (kn->kn_data > 0);
}

/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
	if (wpipe->pipe_state & PIPE_DIRECTW)
		kn->kn_data = 0;

	return (kn->kn_data >= PIPE_BUF);
}
