/* sys_pipe.c revision 13913 */
118316Swollman/*
218316Swollman * Copyright (c) 1996 John S. Dyson
318316Swollman * All rights reserved.
418316Swollman *
518316Swollman * Redistribution and use in source and binary forms, with or without
618316Swollman * modification, are permitted provided that the following conditions
718316Swollman * are met:
818316Swollman * 1. Redistributions of source code must retain the above copyright
918316Swollman *    notice immediately at the beginning of the file, without modification,
1018316Swollman *    this list of conditions, and the following disclaimer.
1118316Swollman * 2. Redistributions in binary form must reproduce the above copyright
1218316Swollman *    notice, this list of conditions and the following disclaimer in the
1318316Swollman *    documentation and/or other materials provided with the distribution.
1446303Smarkm * 3. Absolutely no warranty of function or purpose is made by the author
1518316Swollman *    John S. Dyson.
1618316Swollman * 4. This work was done expressly for inclusion into FreeBSD.  Other use
1718316Swollman *    is allowed if this notation is included.
1818316Swollman * 5. Modifications may be freely made to this file if the above conditions
1918316Swollman *    are met.
2018316Swollman *
2118316Swollman * $Id: sys_pipe.c,v 1.6 1996/02/04 22:09:05 dyson Exp $
2218316Swollman */
2318316Swollman
2418316Swollman#ifndef OLD_PIPE
2518316Swollman
2618316Swollman/*
2718316Swollman * This file contains a high-performance replacement for the socket-based
2818316Swollman * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
2918316Swollman * all features of sockets, but does do everything that pipes normally
3018316Swollman * do.
3118316Swollman */
3246303Smarkm
3350476Speter/*
3418316Swollman * This code has two modes of operation, a small write mode and a large
3518316Swollman * write mode.  The small write mode acts like conventional pipes with
3618316Swollman * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
3746303Smarkm * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
3818316Swollman * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
3918316Swollman * the receiving process can copy it directly from the pages in the sending
4018316Swollman * process.
4146303Smarkm *
4218316Swollman * If the sending process receives a signal, it is possible that it will
4346303Smarkm * go away, and certainly its address space can change, because control
4418316Swollman * is returned back to the user-mode side.  In that case, the pipe code
4546303Smarkm * arranges to copy the buffer supplied by the user process, to a pageable
4646303Smarkm * kernel buffer, and the receiving process will grab the data from the
4746303Smarkm * pageable kernel buffer.  Since signals don't happen all that often,
4846303Smarkm * the copy operation is normally eliminated.
4946303Smarkm *
5046303Smarkm * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
5146303Smarkm * happen for small transfers so that the system will not spend all of
5246303Smarkm * its time context switching.  PIPE_SIZE is constrained by the
5346303Smarkm * amount of kernel virtual memory.
5446303Smarkm */
5550969Speter
5646303Smarkm#include <sys/param.h>
5746303Smarkm#include <sys/systm.h>
5818316Swollman#include <sys/proc.h>
5918316Swollman#include <sys/file.h>
6018316Swollman#include <sys/protosw.h>
6118316Swollman#include <sys/stat.h>
6218316Swollman#include <sys/filedesc.h>
6346303Smarkm#include <sys/malloc.h>
6446303Smarkm#include <sys/ioctl.h>
6518316Swollman#include <sys/stat.h>
6618316Swollman#include <sys/select.h>
6718316Swollman#include <sys/signalvar.h>
6818316Swollman#include <sys/errno.h>
6918316Swollman#include <sys/queue.h>
7018316Swollman#include <sys/vmmeter.h>
7118316Swollman#include <sys/kernel.h>
7218316Swollman#include <sys/sysproto.h>
7337908Scharnier#include <sys/pipe.h>
7418316Swollman
7518316Swollman#include <vm/vm.h>
7618316Swollman#include <vm/vm_prot.h>
7718316Swollman#include <vm/vm_param.h>
7846303Smarkm#include <vm/lock.h>
7918316Swollman#include <vm/vm_object.h>
8018316Swollman#include <vm/vm_kern.h>
8118316Swollman#include <vm/vm_extern.h>
8218316Swollman#include <vm/pmap.h>
8318316Swollman#include <vm/vm_map.h>
8418316Swollman#include <vm/vm_page.h>
8564131Ssheldonh
/* fileops entry points for DTYPE_PIPE descriptors */
static int pipe_read __P((struct file *fp, struct uio *uio,
		struct ucred *cred));
static int pipe_write __P((struct file *fp, struct uio *uio,
		struct ucred *cred));
static int pipe_close __P((struct file *fp, struct proc *p));
static int pipe_select __P((struct file *fp, int which, struct proc *p));
static int pipe_ioctl __P((struct file *fp, int cmd, caddr_t data, struct proc *p));

/* operations vector installed in f_ops by pipe() */
static struct fileops pipeops =
    { pipe_read, pipe_write, pipe_ioctl, pipe_select, pipe_close };

/*
 * Default pipe buffer size(s), this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

/*
 * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
 * is there so that on large systems, we don't exhaust it.
 */
#define MAXPIPEKVA (8*1024*1024)

/*
 * Limit for direct transfers, we cannot, of course limit
 * the amount of kva for pipes in general though.
 */
#define LIMITPIPEKVA (16*1024*1024)
/* running total of kva consumed by pipe buffers and direct-write maps */
int amountpipekva;

/* forward declarations of file-local helpers */
static void pipeclose __P((struct pipe *cpipe));
static void pipebufferinit __P((struct pipe *cpipe));
static void pipeinit __P((struct pipe *cpipe));
static __inline int pipelock __P((struct pipe *cpipe, int catch));
static __inline void pipeunlock __P((struct pipe *cpipe));
static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
static void pipe_destroy_write_buffer __P((struct pipe *wpipe));
static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
static void pipe_clone_write_buffer __P((struct pipe *wpipe));
static void pipe_mark_pages_clean __P((struct pipe *cpipe));
static int pipewrite __P((struct pipe *wpipe, struct uio *uio, int nbio));
static void pipespace __P((struct pipe *cpipe));
13118316Swollman
/*
 * The pipe system call for the DTYPE_PIPE type of pipes
 *
 * Allocates two cross-linked struct pipes and two descriptors: the read
 * end goes in retval[0], the write end in retval[1].  On a falloc()
 * failure the already-constructed state is unwound through the free*
 * labels (which deliberately fall through into one another) and the
 * error is returned.
 */

/* ARGSUSED */
int
pipe(p, uap, retval)
	struct proc *p;
	struct pipe_args /* {
		int	dummy;
	} */ *uap;
	int retval[];
{
	register struct filedesc *fdp = p->p_fd;
	struct file *rf, *wf;
	struct pipe *rpipe, *wpipe;
	int fd, error;

	/* Both halves start out eligible for the direct-write path. */
	rpipe = malloc( sizeof (*rpipe), M_TEMP, M_WAITOK);
	pipeinit(rpipe);
	rpipe->pipe_state |= PIPE_DIRECTOK;
	wpipe = malloc( sizeof (*wpipe), M_TEMP, M_WAITOK);
	pipeinit(wpipe);
	wpipe->pipe_state |= PIPE_DIRECTOK;

	/* Read-side descriptor. */
	error = falloc(p, &rf, &fd);
	if (error)
		goto free2;
	retval[0] = fd;
	rf->f_flag = FREAD | FWRITE;
	rf->f_type = DTYPE_PIPE;
	rf->f_ops = &pipeops;
	rf->f_data = (caddr_t)rpipe;
	/* Write-side descriptor. */
	error = falloc(p, &wf, &fd);
	if (error)
		goto free3;
	wf->f_flag = FREAD | FWRITE;
	wf->f_type = DTYPE_PIPE;
	wf->f_ops = &pipeops;
	wf->f_data = (caddr_t)wpipe;
	retval[1] = fd;

	/* Link the two halves so each side can find its peer. */
	rpipe->pipe_peer = wpipe;
	wpipe->pipe_peer = rpipe;

	return (0);
free3:
	/* Undo the read-side descriptor, then fall through. */
	ffree(rf);
	fdp->fd_ofiles[retval[0]] = 0;
free2:
	(void)pipeclose(wpipe);
free1:
	(void)pipeclose(rpipe);
	return (error);
}
18718316Swollman
/*
 * Allocate kva for pipe circular buffer, the space is pageable
 *
 * Fills in cpipe->pipe_buffer.object and .buffer; panics if the kernel
 * map has no kva left.  Updates the global amountpipekva accounting.
 */
static void
pipespace(cpipe)
	struct pipe *cpipe;
{
	int npages, error;

	npages = round_page(cpipe->pipe_buffer.size)/PAGE_SIZE;
	/*
	 * Create an object, I don't like the idea of paging to/from
	 * kernel_object.
	 */
	cpipe->pipe_buffer.object = vm_object_allocate(OBJT_DEFAULT, npages);
	cpipe->pipe_buffer.buffer = (caddr_t) vm_map_min(kernel_map);

	/*
	 * Insert the object into the kernel map, and allocate kva for it.
	 * The map entry is, by default, pageable.
	 */
	error = vm_map_find(kernel_map, cpipe->pipe_buffer.object, 0,
		(vm_offset_t *) &cpipe->pipe_buffer.buffer,
		cpipe->pipe_buffer.size, 1,
		VM_PROT_ALL, VM_PROT_ALL, 0);

	if (error != KERN_SUCCESS)
		panic("pipeinit: cannot allocate pipe -- out of kvm -- code = %d", error);
	/* charge this buffer against the global pipe kva total */
	amountpipekva += cpipe->pipe_buffer.size;
}
21846303Smarkm
21946303Smarkm/*
22046303Smarkm * initialize and allocate VM and memory for pipe
22118316Swollman */
22218316Swollmanstatic void
22346303Smarkmpipeinit(cpipe)
22446303Smarkm	struct pipe *cpipe;
22546303Smarkm{
22664483Ssheldonh	int s;
22746303Smarkm
22846303Smarkm	cpipe->pipe_buffer.in = 0;
22918316Swollman	cpipe->pipe_buffer.out = 0;
23018316Swollman	cpipe->pipe_buffer.cnt = 0;
23118316Swollman	cpipe->pipe_buffer.size = PIPE_SIZE;
23218316Swollman	/* Buffer kva gets dynamically allocated */
23318316Swollman	cpipe->pipe_buffer.buffer = NULL;
23418316Swollman
23518316Swollman	cpipe->pipe_state = 0;
23618316Swollman	cpipe->pipe_peer = NULL;
23718316Swollman	cpipe->pipe_busy = 0;
23818316Swollman	s = splhigh();
23918316Swollman	cpipe->pipe_ctime = time;
24020342Swollman	cpipe->pipe_atime = time;
24120342Swollman	cpipe->pipe_mtime = time;
24218316Swollman	splx(s);
24318316Swollman	bzero(&cpipe->pipe_sel, sizeof cpipe->pipe_sel);
24446303Smarkm
24546303Smarkm	/*
24618316Swollman	 * pipe data structure initializations to support direct pipe I/O
24746303Smarkm	 */
24846303Smarkm	cpipe->pipe_map.cnt = 0;
24946303Smarkm	cpipe->pipe_map.kva = 0;
25018316Swollman	cpipe->pipe_map.pos = 0;
25146303Smarkm	cpipe->pipe_map.npages = 0;
25218316Swollman}
25318316Swollman
25418316Swollman
25518316Swollman/*
25618316Swollman * lock a pipe for I/O, blocking other access
25718316Swollman */
25818316Swollmanstatic __inline int
25918316Swollmanpipelock(cpipe, catch)
26018316Swollman	struct pipe *cpipe;
26118316Swollman	int catch;
26218316Swollman{
26318316Swollman	int error;
26418316Swollman	while (cpipe->pipe_state & PIPE_LOCK) {
26518316Swollman		cpipe->pipe_state |= PIPE_LWANT;
26618316Swollman		if (error = tsleep( &cpipe->pipe_state,
26718316Swollman			catch?(PRIBIO|PCATCH):PRIBIO, "pipelk", 0)) {
26818316Swollman			return error;
26918316Swollman		}
27018316Swollman	}
27118316Swollman	cpipe->pipe_state |= PIPE_LOCK;
27218316Swollman	return 0;
27318316Swollman}
27418316Swollman
27518316Swollman/*
27618316Swollman * unlock a pipe I/O lock
27718316Swollman */
27818316Swollmanstatic __inline void
27918316Swollmanpipeunlock(cpipe)
28018316Swollman	struct pipe *cpipe;
28118316Swollman{
28218316Swollman	cpipe->pipe_state &= ~PIPE_LOCK;
28318316Swollman	if (cpipe->pipe_state & PIPE_LWANT) {
28418316Swollman		cpipe->pipe_state &= ~PIPE_LWANT;
28518316Swollman		wakeup(&cpipe->pipe_state);
28618316Swollman	}
28718316Swollman	return;
28818316Swollman}
28918316Swollman
#if 0
/*
 * Disabled helper (not compiled in): clears the dirty state of every
 * resident, non-busy page backing the pipe buffer object -- presumably
 * to spare the pageout path needless writes; confirm before enabling.
 */
static void
pipe_mark_pages_clean(cpipe)
	struct pipe *cpipe;
{
	vm_size_t off;
	vm_page_t m;

	for(off = 0; off < cpipe->pipe_buffer.object->size; off += 1) {
		m = vm_page_lookup(cpipe->pipe_buffer.object, off);
		if ((m != NULL) && (m->busy == 0) && (m->flags & PG_BUSY) == 0) {
			/* clear both the machine-independent and pmap dirty bits */
			m->dirty = 0;
			pmap_clear_modify(VM_PAGE_TO_PHYS(m));
		}
	}
}
#endif
30718316Swollman
/*
 * pipe_read: fileops read entry point for pipes.
 *
 * Moves data into "uio" from, in order of preference:
 *   1. the kernel circular buffer (pipe_buffer.cnt > 0), or
 *   2. a pending direct write's mapped pages (PIPE_DIRECTW / pipe_map);
 * otherwise handles EOF, wakes a blocked writer, and sleeps ("piperd")
 * until data arrives.  Returns 0 or an errno; EAGAIN when the pipe is
 * in non-blocking mode (PIPE_NBIO) and no data is available.
 */
/* ARGSUSED */
static int
pipe_read(fp, uio, cred)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
{

	struct pipe *rpipe = (struct pipe *) fp->f_data;
	int error = 0;
	int nread = 0;
	int size;

	/* pipe_busy guards against teardown while we are inside. */
	++rpipe->pipe_busy;
	while (uio->uio_resid) {
		/*
		 * normal pipe buffer receive
		 */
		if (rpipe->pipe_buffer.cnt > 0) {
			/* Copy one contiguous run: clamp to end-of-ring,
			 * to bytes available, and to the caller's residual. */
			int size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > uio->uio_resid)
				size = uio->uio_resid;
			if ((error = pipelock(rpipe,1)) == 0) {
				error = uiomove( &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
					size, uio);
				pipeunlock(rpipe);
			}
			if (error) {
				break;
			}
			/* Advance the read index, wrapping at the ring end. */
			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;
			nread += size;
		/*
		 * Direct copy, bypassing a kernel buffer.
		 */
		} else if ((size = rpipe->pipe_map.cnt) &&
			(rpipe->pipe_state & PIPE_DIRECTW)) {
			caddr_t va;
			if (size > uio->uio_resid)
				size = uio->uio_resid;
			if ((error = pipelock(rpipe,1)) == 0) {
				/* read straight out of the writer's mapped pages */
				va = (caddr_t) rpipe->pipe_map.kva + rpipe->pipe_map.pos;
				error = uiomove(va, size, uio);
				pipeunlock(rpipe);
			}
			if (error)
				break;
			nread += size;
			rpipe->pipe_map.pos += size;
			rpipe->pipe_map.cnt -= size;
			if (rpipe->pipe_map.cnt == 0) {
				/* direct transfer fully consumed; release the writer */
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				wakeup(rpipe);
			}
		} else {
			/*
			 * detect EOF condition
			 */
			if (rpipe->pipe_state & PIPE_EOF) {
				break;
			}
			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}
			/* Return a short read rather than sleeping. */
			if (nread > 0)
				break;
			if (rpipe->pipe_state & PIPE_NBIO) {
				error = EAGAIN;
				break;
			}

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */

			if ((error = pipelock(rpipe,1)) == 0) {
				if (rpipe->pipe_buffer.cnt == 0) {
					rpipe->pipe_buffer.in = 0;
					rpipe->pipe_buffer.out = 0;
				}
				pipeunlock(rpipe);
			} else {
				break;
			}
			/* Sleep until a writer produces data or EOF. */
			rpipe->pipe_state |= PIPE_WANTR;
			if (error = tsleep(rpipe, PRIBIO|PCATCH, "piperd", 0)) {
				break;
			}
		}
	}

	/* Successful reads update the access time. */
	if (error == 0) {
		int s = splhigh();
		rpipe->pipe_atime = time;
		splx(s);
	}

	--rpipe->pipe_busy;
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
		/* someone (pipeclose) is waiting for us to drain out */
		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * If there is no more to read in the pipe, reset
		 * its pointers to the beginning.  This improves
		 * cache hit stats.
		 */
		if ((error == 0) && (error = pipelock(rpipe,1)) == 0) {
			if (rpipe->pipe_buffer.cnt == 0) {
#if 0
				pipe_mark_pages_clean(rpipe);
#endif
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			pipeunlock(rpipe);
		}

		/*
		 * If the "write-side" has been blocked, wake it up now.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}
	/* notify select()ers that pipe state changed */
	if (rpipe->pipe_state & PIPE_SEL) {
		rpipe->pipe_state &= ~PIPE_SEL;
		selwakeup(&rpipe->pipe_sel);
	}
	return error;
}
45218316Swollman
/*
 * Map the sending processes' buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 *
 * On success fills in wpipe->pipe_map (ms[], npages, pos, cnt), maps
 * the wired pages at pipe_map.kva, advances the uio past "size" bytes,
 * and returns 0.  Returns EFAULT (after unwiring anything already
 * wired) if a user page cannot be resolved.
 */
static int
pipe_build_write_buffer(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	int size;
	int i;
	vm_offset_t addr, endaddr, paddr;

	/* Clamp the transfer to at most one pipe buffer's worth. */
	size = uio->uio_iov->iov_len;
	if (size > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;

	/* Fault in and wire each user page covering the iovec. */
	endaddr = round_page(uio->uio_iov->iov_base + size);
	for(i = 0, addr = trunc_page(uio->uio_iov->iov_base);
		addr < endaddr;
		addr += PAGE_SIZE, i+=1) {

		vm_page_t m;

		vm_fault_quick( (caddr_t) addr, VM_PROT_READ);
		paddr = pmap_kextract(addr);
		if (!paddr) {
			/* page unresolvable: undo the wiring done so far */
			int j;
			for(j=0;j<i;j++)
				vm_page_unwire(wpipe->pipe_map.ms[j]);
			return EFAULT;
		}

		m = PHYS_TO_VM_PAGE(paddr);
		vm_page_wire(m);
		wpipe->pipe_map.ms[i] = m;
	}

/*
 * set up the control block
 */
	wpipe->pipe_map.npages = i;
	/* pos is the offset of the data within the first page */
	wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
	wpipe->pipe_map.cnt = size;

/*
 * and map the buffer
 */
	if (wpipe->pipe_map.kva == 0) {
		/*
		 * We need to allocate space for an extra page because the
		 * address range might (will) span pages at times.
		 */
		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
			wpipe->pipe_buffer.size + PAGE_SIZE);
		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
	}
	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
		wpipe->pipe_map.npages);

/*
 * and update the uio data
 */

	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base += size;
	if (uio->uio_iov->iov_len == 0)
		uio->uio_iov++;
	uio->uio_resid -= size;
	uio->uio_offset += size;
	return 0;
}
52518316Swollman
/*
 * unmap and unwire the process buffer
 *
 * Reverses pipe_build_write_buffer(): removes the kernel mappings,
 * optionally releases the cached kva when the global soft limit is
 * exceeded, and unwires every page recorded in pipe_map.ms[].
 */
static void
pipe_destroy_write_buffer(wpipe)
struct pipe *wpipe;
{
	int i;
	pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);

	/*
	 * Free the kva only when over MAXPIPEKVA; otherwise keep it
	 * cached in pipe_map.kva for the next direct write.
	 */
	if (wpipe->pipe_map.kva) {
		if (amountpipekva > MAXPIPEKVA) {
			vm_offset_t kva = wpipe->pipe_map.kva;
			wpipe->pipe_map.kva = 0;
			kmem_free(kernel_map, kva,
				wpipe->pipe_buffer.size + PAGE_SIZE);
			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
		}
	}
	/* drop the wiring taken in pipe_build_write_buffer() */
	for (i=0;i<wpipe->pipe_map.npages;i++)
		vm_page_unwire(wpipe->pipe_map.ms[i]);
}
54818316Swollman
/*
 * In the case of a signal, the writing process might go away.  This
 * code copies the data into the circular buffer so that the source
 * pages can be freed without loss of data.
 *
 * Caller holds the pipe lock (see pipe_direct_write).  After the copy
 * the direct-write state is cleared and the mapped pages are released.
 */
static void
pipe_clone_write_buffer(wpipe)
struct pipe *wpipe;
{
	int size;
	int pos;

	/* copy the unconsumed portion of the direct transfer */
	size = wpipe->pipe_map.cnt;
	pos = wpipe->pipe_map.pos;
	bcopy((caddr_t) wpipe->pipe_map.kva+pos,
			(caddr_t) wpipe->pipe_buffer.buffer,
			size);

	/* the circular buffer now holds exactly the cloned bytes */
	wpipe->pipe_buffer.in = size;
	wpipe->pipe_buffer.out = 0;
	wpipe->pipe_buffer.cnt = size;
	wpipe->pipe_state &= ~PIPE_DIRECTW;

	pipe_destroy_write_buffer(wpipe);
}
57418316Swollman
/*
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set-up.
 *
 * Returns 0 once the reader has consumed the mapped data, EPIPE on
 * reader-side EOF, or a tsleep() error if interrupted while waiting.
 */
static int
pipe_direct_write(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	int error;

	/* wait for any earlier direct write to finish ("pipdww") */
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		error = tsleep(wpipe,
				PRIBIO|PCATCH, "pipdww", 0);
		if (error || (wpipe->pipe_state & PIPE_EOF))
			goto error1;
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	wpipe->pipe_state |= PIPE_DIRECTW;
	/* buffered bytes must drain before the direct transfer starts */
	while (wpipe->pipe_buffer.cnt > 0) {
		error = tsleep(wpipe,
				PRIBIO|PCATCH, "pipdwc", 0);
		if (error || (wpipe->pipe_state & PIPE_EOF)) {
			wpipe->pipe_state &= ~PIPE_DIRECTW;
			if (error == 0)
				error = EPIPE;
			goto error1;
		}
	}

	/* wire/map the user pages; fills pipe_map and advances uio */
	error = pipe_build_write_buffer(wpipe, uio);
	if (error) {
		wpipe->pipe_state &= ~PIPE_DIRECTW;
		goto error1;
	}

	/* data is now visible: wake a reader blocked in pipe_read */
	if (wpipe->pipe_state & PIPE_WANTR) {
		wpipe->pipe_state &= ~PIPE_WANTR;
		wakeup(wpipe);
	}

	error = 0;
	/* wait ("pipdwt") until the reader clears PIPE_DIRECTW */
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			pipelock(wpipe, 0);
			pipe_destroy_write_buffer(wpipe);
			pipeunlock(wpipe);
			wakeup(wpipe);
			return EPIPE;
		}
		error = tsleep(wpipe, PRIBIO|PCATCH, "pipdwt", 0);
	}

	pipelock(wpipe,0);
	if (wpipe->pipe_state & PIPE_DIRECTW) {
		/*
		 * this bit of trickery substitutes a kernel buffer for
		 * the process that might be going away.
		 */
		pipe_clone_write_buffer(wpipe);
	} else {
		pipe_destroy_write_buffer(wpipe);
	}
	pipeunlock(wpipe);
	return error;

error1:
	/* wake anyone waiting on this pipe before bailing out */
	wakeup(wpipe);
	return error;
}
64718316Swollman
64818316Swollmanstatic __inline int
64918316Swollmanpipewrite(wpipe, uio, nbio)
65018316Swollman	struct pipe *wpipe;
65118316Swollman	struct uio *uio;
65218316Swollman	int nbio;
65318316Swollman{
65418316Swollman	int error = 0;
65546303Smarkm	int orig_resid;
65618316Swollman
65718316Swollman	/*
65818316Swollman	 * detect loss of pipe read side, issue SIGPIPE if lost.
65918316Swollman	 */
66018316Swollman	if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF)) {
66118316Swollman		return EPIPE;
66246303Smarkm	}
66318316Swollman
66418316Swollman	if( wpipe->pipe_buffer.buffer == NULL) {
66518316Swollman		if ((error = pipelock(wpipe,1)) == 0) {
66618316Swollman			pipespace(wpipe);
66718316Swollman			pipeunlock(wpipe);
66818316Swollman		} else {
66918316Swollman			return error;
67018316Swollman		}
67118316Swollman	}
67218316Swollman
67318316Swollman	++wpipe->pipe_busy;
67418316Swollman	orig_resid = uio->uio_resid;
67518316Swollman	while (uio->uio_resid) {
67618316Swollman		int space;
67718316Swollman		/*
67818316Swollman		 * If the transfer is large, we can gain performance if
67918316Swollman		 * we do process-to-process copies directly.
68018316Swollman		 */
68118316Swollman		if ((amountpipekva < LIMITPIPEKVA) &&
68218316Swollman			(uio->uio_iov->iov_len >= PIPE_MINDIRECT)) {
68346303Smarkm			error = pipe_direct_write( wpipe, uio);
68418316Swollman			if (error) {
68518316Swollman				break;
68618316Swollman			}
68719896Swollman			continue;
68818316Swollman		}
68918316Swollman
69018316Swollman		/*
69118316Swollman		 * Pipe buffered writes cannot be coincidental with
69218316Swollman		 * direct writes.  We wait until the currently executing
69318316Swollman		 * direct write is completed before we start filling the
69418316Swollman		 * pipe buffer.
69519896Swollman		 */
69619896Swollman	retrywrite:
69719896Swollman		while (wpipe->pipe_state & PIPE_DIRECTW) {
69818316Swollman			error = tsleep(wpipe,
69918316Swollman					PRIBIO|PCATCH, "pipbww", 0);
70018316Swollman			if (error)
70118316Swollman				break;
70218316Swollman		}
70318316Swollman
70418316Swollman		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
70518316Swollman		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
70618316Swollman			space = 0;
70718316Swollman
70818316Swollman		/*
70918316Swollman		 * We must afford contiguous writes on buffers of size
71018316Swollman		 * PIPE_BUF or less.
71118316Swollman		 */
71218316Swollman		if (space > 0) {
71318316Swollman			int size = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in;
71418316Swollman			if (size > space)
71518316Swollman				size = space;
71618316Swollman			if (size > uio->uio_resid)
71718316Swollman				size = uio->uio_resid;
71818316Swollman			if ((error = pipelock(wpipe,1)) == 0) {
71918316Swollman				/*
72018316Swollman				 * It is possible for a direct write to
72118316Swollman				 * slip in on us... handle it here...
72218316Swollman				 */
72318316Swollman				if (wpipe->pipe_state & PIPE_DIRECTW) {
72418316Swollman					pipeunlock(wpipe);
72518316Swollman					goto retrywrite;
72618316Swollman				}
72718316Swollman				error = uiomove( &wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
72818316Swollman					size, uio);
72918316Swollman				pipeunlock(wpipe);
73018316Swollman			}
73118316Swollman			if (error)
73218316Swollman				break;
73318316Swollman
73418316Swollman			wpipe->pipe_buffer.in += size;
73518316Swollman			if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size)
73618316Swollman				wpipe->pipe_buffer.in = 0;
73718316Swollman
73818316Swollman			wpipe->pipe_buffer.cnt += size;
73918316Swollman		} else {
74018316Swollman			/*
74118316Swollman			 * If the "read-side" has been blocked, wake it up now.
74218316Swollman			 */
74318316Swollman			if (wpipe->pipe_state & PIPE_WANTR) {
74418316Swollman				wpipe->pipe_state &= ~PIPE_WANTR;
74518316Swollman				wakeup(wpipe);
74618316Swollman			}
74718316Swollman			/*
74818316Swollman			 * don't block on non-blocking I/O
74918316Swollman			 */
75019896Swollman			if (nbio) {
75119896Swollman				error = EAGAIN;
75246303Smarkm				break;
75346303Smarkm			}
75418316Swollman
75546303Smarkm			wpipe->pipe_state |= PIPE_WANTW;
75619896Swollman			if (error = tsleep(wpipe, (PRIBIO+1)|PCATCH, "pipewr", 0)) {
75718316Swollman				break;
75818316Swollman			}
75918316Swollman			/*
76018316Swollman			 * If read side wants to go away, we just issue a signal
76118316Swollman			 * to ourselves.
76218316Swollman			 */
76318316Swollman			if (wpipe->pipe_state & PIPE_EOF) {
76418316Swollman				error = EPIPE;
76518316Swollman				break;
76618316Swollman			}
76718316Swollman		}
76818316Swollman	}
76918316Swollman
77018316Swollman	if ((wpipe->pipe_busy == 0) &&
77118316Swollman		(wpipe->pipe_state & PIPE_WANT)) {
77218316Swollman		wpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTR);
77318316Swollman		wakeup(wpipe);
77418316Swollman	} else if (wpipe->pipe_buffer.cnt > 0) {
77518316Swollman		/*
77618316Swollman		 * If we have put any characters in the buffer, we wake up
77718316Swollman		 * the reader.
77818316Swollman		 */
77919896Swollman		if (wpipe->pipe_state & PIPE_WANTR) {
78018316Swollman			wpipe->pipe_state &= ~PIPE_WANTR;
78118316Swollman			wakeup(wpipe);
78218316Swollman		}
78318316Swollman	}
78418316Swollman
78519896Swollman	/*
78619896Swollman	 * Don't return EPIPE if I/O was successful
78718316Swollman	 */
78818316Swollman	if ((wpipe->pipe_buffer.cnt == 0) &&
78918316Swollman		(uio->uio_resid == 0) &&
79018316Swollman		(error == EPIPE))
79119896Swollman		error = 0;
79218316Swollman
79319896Swollman	if (error = 0) {
79418316Swollman		int s = splhigh();
79518316Swollman		wpipe->pipe_mtime = time;
79618316Swollman		splx(s);
79718316Swollman	}
79818316Swollman
79918316Swollman	if (wpipe->pipe_state & PIPE_SEL) {
80018316Swollman		wpipe->pipe_state &= ~PIPE_SEL;
80146303Smarkm		selwakeup(&wpipe->pipe_sel);
80218316Swollman	}
80318316Swollman
80446303Smarkm	--wpipe->pipe_busy;
80546303Smarkm	return error;
80618316Swollman}
80718316Swollman
80818316Swollman/* ARGSUSED */
80918316Swollmanstatic int
81018316Swollmanpipe_write(fp, uio, cred)
81118316Swollman	struct file *fp;
81218316Swollman	struct uio *uio;
81318316Swollman	struct ucred *cred;
81418316Swollman{
81518316Swollman	struct pipe *rpipe = (struct pipe *) fp->f_data;
81618316Swollman	struct pipe *wpipe = rpipe->pipe_peer;
81718316Swollman	return pipewrite(wpipe, uio, (rpipe->pipe_state & PIPE_NBIO)?1:0);
81818316Swollman}
81918316Swollman
82018316Swollman/*
82118316Swollman * we implement a very minimal set of ioctls for compatibility with sockets.
82218316Swollman */
82318316Swollmanint
82418316Swollmanpipe_ioctl(fp, cmd, data, p)
82518316Swollman	struct file *fp;
82618316Swollman	int cmd;
82718316Swollman	register caddr_t data;
82818316Swollman	struct proc *p;
82918316Swollman{
83037908Scharnier	register struct pipe *mpipe = (struct pipe *)fp->f_data;
83118316Swollman
83218316Swollman	switch (cmd) {
83318316Swollman
83418316Swollman	case FIONBIO:
83518316Swollman		if (*(int *)data)
83618316Swollman			mpipe->pipe_state |= PIPE_NBIO;
83718316Swollman		else
83818316Swollman			mpipe->pipe_state &= ~PIPE_NBIO;
83918316Swollman		return (0);
84018316Swollman
84118316Swollman	case FIOASYNC:
84218316Swollman		if (*(int *)data) {
84318316Swollman			mpipe->pipe_state |= PIPE_ASYNC;
84418316Swollman		} else {
84518316Swollman			mpipe->pipe_state &= ~PIPE_ASYNC;
84618316Swollman		}
84718316Swollman		return (0);
84818316Swollman
84918316Swollman	case FIONREAD:
85018316Swollman		*(int *)data = mpipe->pipe_buffer.cnt;
85118316Swollman		return (0);
85219896Swollman
85319896Swollman	case SIOCSPGRP:
85418316Swollman		mpipe->pipe_pgid = *(int *)data;
85546303Smarkm		return (0);
85618316Swollman
85718316Swollman	case SIOCGPGRP:
85818316Swollman		*(int *)data = mpipe->pipe_pgid;
85918316Swollman		return (0);
86018316Swollman
86118316Swollman	}
86218316Swollman	return ENOSYS;
86318316Swollman}
86418316Swollman
86518316Swollmanint
86618316Swollmanpipe_select(fp, which, p)
86718316Swollman	struct file *fp;
86818316Swollman	int which;
86918316Swollman	struct proc *p;
87018316Swollman{
87118316Swollman	register struct pipe *rpipe = (struct pipe *)fp->f_data;
87218316Swollman	struct pipe *wpipe;
87320342Swollman
87419896Swollman	wpipe = rpipe->pipe_peer;
87520342Swollman	switch (which) {
87620342Swollman
87720342Swollman	case FREAD:
87820342Swollman		if (rpipe->pipe_buffer.cnt > 0 ||
87919896Swollman			(rpipe->pipe_state & PIPE_EOF)) {
88018316Swollman			return (1);
88146303Smarkm		}
88219896Swollman		selrecord(p, &rpipe->pipe_sel);
88319896Swollman		rpipe->pipe_state |= PIPE_SEL;
88420342Swollman		break;
88520342Swollman
88646303Smarkm	case FWRITE:
88719896Swollman		if ((wpipe == NULL) ||
88819896Swollman			(wpipe->pipe_state & PIPE_EOF) ||
88919896Swollman			((wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) {
89020342Swollman			return (1);
89120342Swollman		}
89220342Swollman		selrecord(p, &wpipe->pipe_sel);
89320342Swollman		wpipe->pipe_state |= PIPE_SEL;
89420342Swollman		break;
89520342Swollman
89620342Swollman	case 0:
89720342Swollman		if ((rpipe->pipe_state & PIPE_EOF) ||
89820342Swollman			(wpipe == NULL) ||
89920342Swollman			(wpipe->pipe_state & PIPE_EOF)) {
90020342Swollman			return (1);
90120342Swollman		}
90220342Swollman
90320342Swollman		selrecord(p, &rpipe->pipe_sel);
90420342Swollman		rpipe->pipe_state |= PIPE_SEL;
90520342Swollman		break;
90620342Swollman	}
90720342Swollman	return (0);
90820342Swollman}
90920342Swollman
91020342Swollmanint
91120342Swollmanpipe_stat(pipe, ub)
91220342Swollman	register struct pipe *pipe;
91320342Swollman	register struct stat *ub;
91420342Swollman{
91520342Swollman	bzero((caddr_t)ub, sizeof (*ub));
91620342Swollman	ub->st_mode = S_IFSOCK;
91720342Swollman	ub->st_blksize = pipe->pipe_buffer.size;
91820342Swollman	ub->st_size = pipe->pipe_buffer.cnt;
91920342Swollman	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
92019896Swollman	TIMEVAL_TO_TIMESPEC(&pipe->pipe_atime, &ub->st_atimespec);
92119896Swollman	TIMEVAL_TO_TIMESPEC(&pipe->pipe_mtime, &ub->st_mtimespec);
92219896Swollman	TIMEVAL_TO_TIMESPEC(&pipe->pipe_ctime, &ub->st_ctimespec);
92319896Swollman	return 0;
92419896Swollman}
92519896Swollman
92619896Swollman/* ARGSUSED */
92720342Swollmanstatic int
92819896Swollmanpipe_close(fp, p)
92919896Swollman	struct file *fp;
93019896Swollman	struct proc *p;
93119896Swollman{
93219896Swollman	int error = 0;
93319896Swollman	struct pipe *cpipe = (struct pipe *)fp->f_data;
93419896Swollman	pipeclose(cpipe);
93519896Swollman	fp->f_data = NULL;
93646303Smarkm	return 0;
93718316Swollman}
93818316Swollman
93918316Swollman/*
94018316Swollman * shutdown the pipe
94118316Swollman */
94218316Swollmanstatic void
94318316Swollmanpipeclose(cpipe)
94418316Swollman	struct pipe *cpipe;
94518316Swollman{
94618316Swollman	struct pipe *ppipe;
94718316Swollman	if (cpipe) {
94818316Swollman
94918316Swollman		if (cpipe->pipe_state & PIPE_SEL) {
95018316Swollman			cpipe->pipe_state &= ~PIPE_SEL;
95118316Swollman			selwakeup(&cpipe->pipe_sel);
95218316Swollman		}
95318316Swollman
954		/*
955		 * If the other side is blocked, wake it up saying that
956		 * we want to close it down.
957		 */
958		while (cpipe->pipe_busy) {
959			wakeup(cpipe);
960			cpipe->pipe_state |= PIPE_WANT|PIPE_EOF;
961			tsleep(cpipe, PRIBIO, "pipecl", 0);
962		}
963
964		/*
965		 * Disconnect from peer
966		 */
967		if (ppipe = cpipe->pipe_peer) {
968			if (ppipe->pipe_state & PIPE_SEL) {
969				ppipe->pipe_state &= ~PIPE_SEL;
970				selwakeup(&ppipe->pipe_sel);
971			}
972
973			ppipe->pipe_state |= PIPE_EOF;
974			wakeup(ppipe);
975			ppipe->pipe_peer = NULL;
976		}
977
978		/*
979		 * free resources
980		 */
981		if (cpipe->pipe_buffer.buffer) {
982			amountpipekva -= cpipe->pipe_buffer.size;
983			kmem_free(kernel_map,
984				(vm_offset_t)cpipe->pipe_buffer.buffer,
985				cpipe->pipe_buffer.size);
986		}
987		if (cpipe->pipe_map.kva) {
988			amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
989			kmem_free(kernel_map,
990				cpipe->pipe_map.kva,
991				cpipe->pipe_buffer.size + PAGE_SIZE);
992		}
993		free(cpipe, M_TEMP);
994	}
995}
996#endif
997