/* sys_pipe.c -- revision 13913 */
118316Swollman/* 218316Swollman * Copyright (c) 1996 John S. Dyson 318316Swollman * All rights reserved. 418316Swollman * 518316Swollman * Redistribution and use in source and binary forms, with or without 618316Swollman * modification, are permitted provided that the following conditions 718316Swollman * are met: 818316Swollman * 1. Redistributions of source code must retain the above copyright 918316Swollman * notice immediately at the beginning of the file, without modification, 1018316Swollman * this list of conditions, and the following disclaimer. 1118316Swollman * 2. Redistributions in binary form must reproduce the above copyright 1218316Swollman * notice, this list of conditions and the following disclaimer in the 1318316Swollman * documentation and/or other materials provided with the distribution. 1446303Smarkm * 3. Absolutely no warranty of function or purpose is made by the author 1518316Swollman * John S. Dyson. 1618316Swollman * 4. This work was done expressly for inclusion into FreeBSD. Other use 1718316Swollman * is allowed if this notation is included. 1818316Swollman * 5. Modifications may be freely made to this file if the above conditions 1918316Swollman * are met. 2018316Swollman * 2118316Swollman * $Id: sys_pipe.c,v 1.6 1996/02/04 22:09:05 dyson Exp $ 2218316Swollman */ 2318316Swollman 2418316Swollman#ifndef OLD_PIPE 2518316Swollman 2618316Swollman/* 2718316Swollman * This file contains a high-performance replacement for the socket-based 2818316Swollman * pipes scheme originally used in FreeBSD/4.4Lite. It does not support 2918316Swollman * all features of sockets, but does do everything that pipes normally 3018316Swollman * do. 3118316Swollman */ 3246303Smarkm 3350476Speter/* 3418316Swollman * This code has two modes of operation, a small write mode and a large 3518316Swollman * write mode. The small write mode acts like conventional pipes with 3618316Swollman * a kernel buffer. 
If the buffer is less than PIPE_MINDIRECT, then the 3746303Smarkm * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT 3818316Swollman * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and 3918316Swollman * the receiving process can copy it directly from the pages in the sending 4018316Swollman * process. 4146303Smarkm * 4218316Swollman * If the sending process receives a signal, it is possible that it will 4346303Smarkm * go away, and certainly its address space can change, because control 4418316Swollman * is returned back to the user-mode side. In that case, the pipe code 4546303Smarkm * arranges to copy the buffer supplied by the user process, to a pageable 4646303Smarkm * kernel buffer, and the receiving process will grab the data from the 4746303Smarkm * pageable kernel buffer. Since signals don't happen all that often, 4846303Smarkm * the copy operation is normally eliminated. 4946303Smarkm * 5046303Smarkm * The constant PIPE_MINDIRECT is chosen to make sure that buffering will 5146303Smarkm * happen for small transfers so that the system will not spend all of 5246303Smarkm * its time context switching. PIPE_SIZE is constrained by the 5346303Smarkm * amount of kernel virtual memory. 
5446303Smarkm */ 5550969Speter 5646303Smarkm#include <sys/param.h> 5746303Smarkm#include <sys/systm.h> 5818316Swollman#include <sys/proc.h> 5918316Swollman#include <sys/file.h> 6018316Swollman#include <sys/protosw.h> 6118316Swollman#include <sys/stat.h> 6218316Swollman#include <sys/filedesc.h> 6346303Smarkm#include <sys/malloc.h> 6446303Smarkm#include <sys/ioctl.h> 6518316Swollman#include <sys/stat.h> 6618316Swollman#include <sys/select.h> 6718316Swollman#include <sys/signalvar.h> 6818316Swollman#include <sys/errno.h> 6918316Swollman#include <sys/queue.h> 7018316Swollman#include <sys/vmmeter.h> 7118316Swollman#include <sys/kernel.h> 7218316Swollman#include <sys/sysproto.h> 7337908Scharnier#include <sys/pipe.h> 7418316Swollman 7518316Swollman#include <vm/vm.h> 7618316Swollman#include <vm/vm_prot.h> 7718316Swollman#include <vm/vm_param.h> 7846303Smarkm#include <vm/lock.h> 7918316Swollman#include <vm/vm_object.h> 8018316Swollman#include <vm/vm_kern.h> 8118316Swollman#include <vm/vm_extern.h> 8218316Swollman#include <vm/pmap.h> 8318316Swollman#include <vm/vm_map.h> 8418316Swollman#include <vm/vm_page.h> 8564131Ssheldonh 8664131Ssheldonhstatic int pipe_read __P((struct file *fp, struct uio *uio, 8764131Ssheldonh struct ucred *cred)); 8818316Swollmanstatic int pipe_write __P((struct file *fp, struct uio *uio, 8946303Smarkm struct ucred *cred)); 9046303Smarkmstatic int pipe_close __P((struct file *fp, struct proc *p)); 9118316Swollmanstatic int pipe_select __P((struct file *fp, int which, struct proc *p)); 9218316Swollmanstatic int pipe_ioctl __P((struct file *fp, int cmd, caddr_t data, struct proc *p)); 9318316Swollman 9418316Swollmanstatic struct fileops pipeops = 9518316Swollman { pipe_read, pipe_write, pipe_ioctl, pipe_select, pipe_close }; 9618316Swollman 9718316Swollman/* 9818316Swollman * Default pipe buffer size(s), this can be kind-of large now because pipe 9918316Swollman * space is pageable. 
The pipe code will try to maintain locality of 10018316Swollman * reference for performance reasons, so small amounts of outstanding I/O 10118316Swollman * will not wipe the cache. 10218316Swollman */ 10318316Swollman#define MINPIPESIZE (PIPE_SIZE/3) 10418316Swollman#define MAXPIPESIZE (2*PIPE_SIZE/3) 10518316Swollman 10618316Swollman/* 10718316Swollman * Maximum amount of kva for pipes -- this is kind-of a soft limit, but 10818316Swollman * is there so that on large systems, we don't exhaust it. 10946303Smarkm */ 11018316Swollman#define MAXPIPEKVA (8*1024*1024) 11118316Swollman 11218316Swollman/* 11319896Swollman * Limit for direct transfers, we cannot, of course limit 11418316Swollman * the amount of kva for pipes in general though. 11518316Swollman */ 11618316Swollman#define LIMITPIPEKVA (16*1024*1024) 11718316Swollmanint amountpipekva; 11818316Swollman 11919896Swollmanstatic void pipeclose __P((struct pipe *cpipe)); 12019896Swollmanstatic void pipebufferinit __P((struct pipe *cpipe)); 12119896Swollmanstatic void pipeinit __P((struct pipe *cpipe)); 12219896Swollmanstatic __inline int pipelock __P((struct pipe *cpipe, int catch)); 12319896Swollmanstatic __inline void pipeunlock __P((struct pipe *cpipe)); 12418316Swollmanstatic int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio)); 12518316Swollmanstatic void pipe_destroy_write_buffer __P((struct pipe *wpipe)); 12618316Swollmanstatic int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio)); 12718316Swollmanstatic void pipe_clone_write_buffer __P((struct pipe *wpipe)); 12818316Swollmanstatic void pipe_mark_pages_clean __P((struct pipe *cpipe)); 12918316Swollmanstatic int pipewrite __P((struct pipe *wpipe, struct uio *uio, int nbio)); 13018316Swollmanstatic void pipespace __P((struct pipe *cpipe)); 13118316Swollman 13218316Swollman/* 13318316Swollman * The pipe system call for the DTYPE_PIPE type of pipes 13418316Swollman */ 13518316Swollman 13618316Swollman/* ARGSUSED */ 13718316Swollmanint 
13818316Swollmanpipe(p, uap, retval) 13918316Swollman struct proc *p; 14046303Smarkm struct pipe_args /* { 14118316Swollman int dummy; 14218316Swollman } */ *uap; 14318316Swollman int retval[]; 14418316Swollman{ 14518316Swollman register struct filedesc *fdp = p->p_fd; 14618316Swollman struct file *rf, *wf; 14718316Swollman struct pipe *rpipe, *wpipe; 14818316Swollman int fd, error; 14918316Swollman 15018316Swollman rpipe = malloc( sizeof (*rpipe), M_TEMP, M_WAITOK); 15118316Swollman pipeinit(rpipe); 15218316Swollman rpipe->pipe_state |= PIPE_DIRECTOK; 15318316Swollman wpipe = malloc( sizeof (*wpipe), M_TEMP, M_WAITOK); 15418316Swollman pipeinit(wpipe); 15518316Swollman wpipe->pipe_state |= PIPE_DIRECTOK; 15618316Swollman 15746303Smarkm error = falloc(p, &rf, &fd); 15818316Swollman if (error) 15946303Smarkm goto free2; 16046303Smarkm retval[0] = fd; 16146303Smarkm rf->f_flag = FREAD | FWRITE; 16218316Swollman rf->f_type = DTYPE_PIPE; 16318316Swollman rf->f_ops = &pipeops; 16418316Swollman rf->f_data = (caddr_t)rpipe; 16518316Swollman error = falloc(p, &wf, &fd); 16618316Swollman if (error) 16718316Swollman goto free3; 16818316Swollman wf->f_flag = FREAD | FWRITE; 16918316Swollman wf->f_type = DTYPE_PIPE; 17018316Swollman wf->f_ops = &pipeops; 17118316Swollman wf->f_data = (caddr_t)wpipe; 17218316Swollman retval[1] = fd; 17318316Swollman 17418316Swollman rpipe->pipe_peer = wpipe; 17518316Swollman wpipe->pipe_peer = rpipe; 17618316Swollman 17718316Swollman return (0); 17818316Swollmanfree3: 17918316Swollman ffree(rf); 18018316Swollman fdp->fd_ofiles[retval[0]] = 0; 18118316Swollmanfree2: 18218316Swollman (void)pipeclose(wpipe); 18318316Swollmanfree1: 18418316Swollman (void)pipeclose(rpipe); 18518316Swollman return (error); 18618316Swollman} 18718316Swollman 18818316Swollman/* 18918316Swollman * Allocate kva for pipe circular buffer, the space is pageable 19019896Swollman */ 19118316Swollmanstatic void 19218316Swollmanpipespace(cpipe) 19318316Swollman struct pipe 
*cpipe; 19418316Swollman{ 19518316Swollman int npages, error; 19618316Swollman 19718316Swollman npages = round_page(cpipe->pipe_buffer.size)/PAGE_SIZE; 19818316Swollman /* 19919896Swollman * Create an object, I don't like the idea of paging to/from 20018316Swollman * kernel_object. 20118316Swollman */ 20218316Swollman cpipe->pipe_buffer.object = vm_object_allocate(OBJT_DEFAULT, npages); 20318316Swollman cpipe->pipe_buffer.buffer = (caddr_t) vm_map_min(kernel_map); 20446303Smarkm 20519896Swollman /* 20618316Swollman * Insert the object into the kernel map, and allocate kva for it. 20718316Swollman * The map entry is, by default, pageable. 20846303Smarkm */ 20946303Smarkm error = vm_map_find(kernel_map, cpipe->pipe_buffer.object, 0, 21046303Smarkm (vm_offset_t *) &cpipe->pipe_buffer.buffer, 21118316Swollman cpipe->pipe_buffer.size, 1, 21218316Swollman VM_PROT_ALL, VM_PROT_ALL, 0); 21318316Swollman 21446303Smarkm if (error != KERN_SUCCESS) 21518316Swollman panic("pipeinit: cannot allocate pipe -- out of kvm -- code = %d", error); 21646303Smarkm amountpipekva += cpipe->pipe_buffer.size; 21746303Smarkm} 21846303Smarkm 21946303Smarkm/* 22046303Smarkm * initialize and allocate VM and memory for pipe 22118316Swollman */ 22218316Swollmanstatic void 22346303Smarkmpipeinit(cpipe) 22446303Smarkm struct pipe *cpipe; 22546303Smarkm{ 22664483Ssheldonh int s; 22746303Smarkm 22846303Smarkm cpipe->pipe_buffer.in = 0; 22918316Swollman cpipe->pipe_buffer.out = 0; 23018316Swollman cpipe->pipe_buffer.cnt = 0; 23118316Swollman cpipe->pipe_buffer.size = PIPE_SIZE; 23218316Swollman /* Buffer kva gets dynamically allocated */ 23318316Swollman cpipe->pipe_buffer.buffer = NULL; 23418316Swollman 23518316Swollman cpipe->pipe_state = 0; 23618316Swollman cpipe->pipe_peer = NULL; 23718316Swollman cpipe->pipe_busy = 0; 23818316Swollman s = splhigh(); 23918316Swollman cpipe->pipe_ctime = time; 24020342Swollman cpipe->pipe_atime = time; 24120342Swollman cpipe->pipe_mtime = time; 24218316Swollman 
splx(s); 24318316Swollman bzero(&cpipe->pipe_sel, sizeof cpipe->pipe_sel); 24446303Smarkm 24546303Smarkm /* 24618316Swollman * pipe data structure initializations to support direct pipe I/O 24746303Smarkm */ 24846303Smarkm cpipe->pipe_map.cnt = 0; 24946303Smarkm cpipe->pipe_map.kva = 0; 25018316Swollman cpipe->pipe_map.pos = 0; 25146303Smarkm cpipe->pipe_map.npages = 0; 25218316Swollman} 25318316Swollman 25418316Swollman 25518316Swollman/* 25618316Swollman * lock a pipe for I/O, blocking other access 25718316Swollman */ 25818316Swollmanstatic __inline int 25918316Swollmanpipelock(cpipe, catch) 26018316Swollman struct pipe *cpipe; 26118316Swollman int catch; 26218316Swollman{ 26318316Swollman int error; 26418316Swollman while (cpipe->pipe_state & PIPE_LOCK) { 26518316Swollman cpipe->pipe_state |= PIPE_LWANT; 26618316Swollman if (error = tsleep( &cpipe->pipe_state, 26718316Swollman catch?(PRIBIO|PCATCH):PRIBIO, "pipelk", 0)) { 26818316Swollman return error; 26918316Swollman } 27018316Swollman } 27118316Swollman cpipe->pipe_state |= PIPE_LOCK; 27218316Swollman return 0; 27318316Swollman} 27418316Swollman 27518316Swollman/* 27618316Swollman * unlock a pipe I/O lock 27718316Swollman */ 27818316Swollmanstatic __inline void 27918316Swollmanpipeunlock(cpipe) 28018316Swollman struct pipe *cpipe; 28118316Swollman{ 28218316Swollman cpipe->pipe_state &= ~PIPE_LOCK; 28318316Swollman if (cpipe->pipe_state & PIPE_LWANT) { 28418316Swollman cpipe->pipe_state &= ~PIPE_LWANT; 28518316Swollman wakeup(&cpipe->pipe_state); 28618316Swollman } 28718316Swollman return; 28818316Swollman} 28918316Swollman 29018316Swollman#if 0 29120342Swollmanstatic void 29246303Smarkmpipe_mark_pages_clean(cpipe) 29320342Swollman struct pipe *cpipe; 29418316Swollman{ 29546303Smarkm vm_size_t off; 29620342Swollman vm_page_t m; 29718316Swollman 29818316Swollman for(off = 0; off < cpipe->pipe_buffer.object->size; off += 1) { 29918316Swollman m = vm_page_lookup(cpipe->pipe_buffer.object, off); 300110670Sache if 
((m != NULL) && (m->busy == 0) && (m->flags & PG_BUSY) == 0) { 301110670Sache m->dirty = 0; 302110670Sache pmap_clear_modify(VM_PAGE_TO_PHYS(m)); 30318316Swollman } 304110670Sache } 30518316Swollman} 30618316Swollman#endif 30718316Swollman 30818316Swollman/* ARGSUSED */ 30918316Swollmanstatic int 31018316Swollmanpipe_read(fp, uio, cred) 31118316Swollman struct file *fp; 31218316Swollman struct uio *uio; 31318316Swollman struct ucred *cred; 31418316Swollman{ 31518316Swollman 31618316Swollman struct pipe *rpipe = (struct pipe *) fp->f_data; 31718316Swollman int error = 0; 31818316Swollman int nread = 0; 31918316Swollman int size; 32018316Swollman 32118316Swollman ++rpipe->pipe_busy; 32220342Swollman while (uio->uio_resid) { 32346303Smarkm /* 32420342Swollman * normal pipe buffer receive 32546303Smarkm */ 32618316Swollman if (rpipe->pipe_buffer.cnt > 0) { 32718316Swollman int size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; 32819896Swollman if (size > rpipe->pipe_buffer.cnt) 32919896Swollman size = rpipe->pipe_buffer.cnt; 33018316Swollman if (size > uio->uio_resid) 33118316Swollman size = uio->uio_resid; 33218316Swollman if ((error = pipelock(rpipe,1)) == 0) { 33318316Swollman error = uiomove( &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], 33418316Swollman size, uio); 33518316Swollman pipeunlock(rpipe); 33618316Swollman } 33718316Swollman if (error) { 33818316Swollman break; 33918316Swollman } 34018316Swollman rpipe->pipe_buffer.out += size; 34118316Swollman if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) 34218316Swollman rpipe->pipe_buffer.out = 0; 34318316Swollman 34418316Swollman rpipe->pipe_buffer.cnt -= size; 34518316Swollman nread += size; 34618316Swollman /* 34718316Swollman * Direct copy, bypassing a kernel buffer. 
34818316Swollman */ 34918316Swollman } else if ((size = rpipe->pipe_map.cnt) && 35018316Swollman (rpipe->pipe_state & PIPE_DIRECTW)) { 35118316Swollman caddr_t va; 35218316Swollman if (size > uio->uio_resid) 35318316Swollman size = uio->uio_resid; 35419896Swollman if ((error = pipelock(rpipe,1)) == 0) { 35518316Swollman va = (caddr_t) rpipe->pipe_map.kva + rpipe->pipe_map.pos; 35646303Smarkm error = uiomove(va, size, uio); 35746303Smarkm pipeunlock(rpipe); 35846303Smarkm } 35946303Smarkm if (error) 36018316Swollman break; 36118316Swollman nread += size; 36218316Swollman rpipe->pipe_map.pos += size; 36318316Swollman rpipe->pipe_map.cnt -= size; 36418316Swollman if (rpipe->pipe_map.cnt == 0) { 36546303Smarkm rpipe->pipe_state &= ~PIPE_DIRECTW; 36646303Smarkm wakeup(rpipe); 36746303Smarkm } 36846303Smarkm } else { 36946303Smarkm /* 37046303Smarkm * detect EOF condition 37146303Smarkm */ 37218316Swollman if (rpipe->pipe_state & PIPE_EOF) { 37346303Smarkm break; 37446303Smarkm } 37546303Smarkm /* 37646303Smarkm * If the "write-side" has been blocked, wake it up now. 37746303Smarkm */ 37846303Smarkm if (rpipe->pipe_state & PIPE_WANTW) { 37946303Smarkm rpipe->pipe_state &= ~PIPE_WANTW; 38046303Smarkm wakeup(rpipe); 38146303Smarkm } 38246303Smarkm if (nread > 0) 38346303Smarkm break; 38446303Smarkm if (rpipe->pipe_state & PIPE_NBIO) { 38546303Smarkm error = EAGAIN; 38646303Smarkm break; 38746303Smarkm } 38846303Smarkm 38946303Smarkm /* 39018316Swollman * If there is no more to read in the pipe, reset 39118316Swollman * its pointers to the beginning. This improves 39218316Swollman * cache hit stats. 
39318316Swollman */ 39418316Swollman 39518316Swollman if ((error = pipelock(rpipe,1)) == 0) { 39620342Swollman if (rpipe->pipe_buffer.cnt == 0) { 39720342Swollman rpipe->pipe_buffer.in = 0; 39818316Swollman rpipe->pipe_buffer.out = 0; 39918316Swollman } 40019896Swollman pipeunlock(rpipe); 40119896Swollman } else { 40246303Smarkm break; 40318316Swollman } 40418316Swollman rpipe->pipe_state |= PIPE_WANTR; 40518316Swollman if (error = tsleep(rpipe, PRIBIO|PCATCH, "piperd", 0)) { 40618316Swollman break; 40718316Swollman } 40818316Swollman } 40918316Swollman } 41018316Swollman 41118316Swollman if (error == 0) { 41218316Swollman int s = splhigh(); 41318316Swollman rpipe->pipe_atime = time; 41418316Swollman splx(s); 41546303Smarkm } 41646303Smarkm 41746303Smarkm --rpipe->pipe_busy; 41846303Smarkm if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { 41946303Smarkm rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); 42046303Smarkm wakeup(rpipe); 42146303Smarkm } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { 42246303Smarkm /* 42346303Smarkm * If there is no more to read in the pipe, reset 42446303Smarkm * its pointers to the beginning. This improves 42546303Smarkm * cache hit stats. 42646303Smarkm */ 42746303Smarkm if ((error == 0) && (error = pipelock(rpipe,1)) == 0) { 42818316Swollman if (rpipe->pipe_buffer.cnt == 0) { 42918316Swollman#if 0 43018316Swollman pipe_mark_pages_clean(rpipe); 43118316Swollman#endif 43218316Swollman rpipe->pipe_buffer.in = 0; 43318316Swollman rpipe->pipe_buffer.out = 0; 43418316Swollman } 43518316Swollman pipeunlock(rpipe); 43618316Swollman } 43718316Swollman 43818316Swollman /* 43918316Swollman * If the "write-side" has been blocked, wake it up now. 
44018316Swollman */ 44118316Swollman if (rpipe->pipe_state & PIPE_WANTW) { 44218316Swollman rpipe->pipe_state &= ~PIPE_WANTW; 44329314Sdanny wakeup(rpipe); 44418316Swollman } 44518316Swollman } 44618316Swollman if (rpipe->pipe_state & PIPE_SEL) { 44718316Swollman rpipe->pipe_state &= ~PIPE_SEL; 44818316Swollman selwakeup(&rpipe->pipe_sel); 44918316Swollman } 45018316Swollman return error; 45118316Swollman} 45218316Swollman 45318316Swollman/* 45418316Swollman * Map the sending processes' buffer into kernel space and wire it. 45518316Swollman * This is similar to a physical write operation. 45618316Swollman */ 45718316Swollmanstatic int 45818316Swollmanpipe_build_write_buffer(wpipe, uio) 45918316Swollman struct pipe *wpipe; 46018316Swollman struct uio *uio; 46118316Swollman{ 46218316Swollman int size; 46318316Swollman int i; 46418316Swollman vm_offset_t addr, endaddr, paddr; 46518316Swollman 46618316Swollman size = uio->uio_iov->iov_len; 46718316Swollman if (size > wpipe->pipe_buffer.size) 46818316Swollman size = wpipe->pipe_buffer.size; 46918316Swollman 47018316Swollman endaddr = round_page(uio->uio_iov->iov_base + size); 47118316Swollman for(i = 0, addr = trunc_page(uio->uio_iov->iov_base); 47218316Swollman addr < endaddr; 47318316Swollman addr += PAGE_SIZE, i+=1) { 47418316Swollman 47518316Swollman vm_page_t m; 47618316Swollman 47718316Swollman vm_fault_quick( (caddr_t) addr, VM_PROT_READ); 47818316Swollman paddr = pmap_kextract(addr); 47918316Swollman if (!paddr) { 48018316Swollman int j; 48118316Swollman for(j=0;j<i;j++) 48218316Swollman vm_page_unwire(wpipe->pipe_map.ms[j]); 48318316Swollman return EFAULT; 48418316Swollman } 48518316Swollman 48618316Swollman m = PHYS_TO_VM_PAGE(paddr); 48718316Swollman vm_page_wire(m); 48818316Swollman wpipe->pipe_map.ms[i] = m; 48918316Swollman } 49018316Swollman 49118316Swollman/* 49218316Swollman * set up the control block 49318316Swollman */ 49418316Swollman wpipe->pipe_map.npages = i; 49518316Swollman wpipe->pipe_map.pos = 
((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; 49618316Swollman wpipe->pipe_map.cnt = size; 49718316Swollman 49846303Smarkm/* 49918316Swollman * and map the buffer 50018316Swollman */ 50118316Swollman if (wpipe->pipe_map.kva == 0) { 50218316Swollman /* 50318316Swollman * We need to allocate space for an extra page because the 50418316Swollman * address range might (will) span pages at times. 50518316Swollman */ 50618316Swollman wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map, 50718316Swollman wpipe->pipe_buffer.size + PAGE_SIZE); 50818316Swollman amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE; 50918316Swollman } 51018316Swollman pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms, 51118316Swollman wpipe->pipe_map.npages); 51218316Swollman 51318316Swollman/* 51418316Swollman * and update the uio data 51518316Swollman */ 51618316Swollman 51718316Swollman uio->uio_iov->iov_len -= size; 51818316Swollman uio->uio_iov->iov_base += size; 51918316Swollman if (uio->uio_iov->iov_len == 0) 52018316Swollman uio->uio_iov++; 52118316Swollman uio->uio_resid -= size; 52218316Swollman uio->uio_offset += size; 52318316Swollman return 0; 52418316Swollman} 52518316Swollman 52618316Swollman/* 52718316Swollman * unmap and unwire the process buffer 52818316Swollman */ 52918316Swollmanstatic void 53018316Swollmanpipe_destroy_write_buffer(wpipe) 53118316Swollmanstruct pipe *wpipe; 53218316Swollman{ 53318316Swollman int i; 53418316Swollman pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages); 53518316Swollman 53618316Swollman if (wpipe->pipe_map.kva) { 53718316Swollman if (amountpipekva > MAXPIPEKVA) { 53818316Swollman vm_offset_t kva = wpipe->pipe_map.kva; 53918316Swollman wpipe->pipe_map.kva = 0; 54018316Swollman kmem_free(kernel_map, kva, 54118316Swollman wpipe->pipe_buffer.size + PAGE_SIZE); 54218316Swollman amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE; 54318316Swollman } 54418316Swollman } 54518316Swollman for (i=0;i<wpipe->pipe_map.npages;i++) 
54646303Smarkm vm_page_unwire(wpipe->pipe_map.ms[i]); 54718316Swollman} 54818316Swollman 54918316Swollman/* 55018316Swollman * In the case of a signal, the writing process might go away. This 55118316Swollman * code copies the data into the circular buffer so that the source 55219896Swollman * pages can be freed without loss of data. 55318316Swollman */ 55418316Swollmanstatic void 55518316Swollmanpipe_clone_write_buffer(wpipe) 55618316Swollmanstruct pipe *wpipe; 55718316Swollman{ 55818316Swollman int size; 55918316Swollman int pos; 56018316Swollman 56118316Swollman size = wpipe->pipe_map.cnt; 56218316Swollman pos = wpipe->pipe_map.pos; 56318316Swollman bcopy((caddr_t) wpipe->pipe_map.kva+pos, 56418316Swollman (caddr_t) wpipe->pipe_buffer.buffer, 56518316Swollman size); 56618316Swollman 56718316Swollman wpipe->pipe_buffer.in = size; 56818316Swollman wpipe->pipe_buffer.out = 0; 56918316Swollman wpipe->pipe_buffer.cnt = size; 57018316Swollman wpipe->pipe_state &= ~PIPE_DIRECTW; 57118316Swollman 57218316Swollman pipe_destroy_write_buffer(wpipe); 57318316Swollman} 57418316Swollman 57518316Swollman/* 57618316Swollman * This implements the pipe buffer write mechanism. Note that only 57718316Swollman * a direct write OR a normal pipe write can be pending at any given time. 57818316Swollman * If there are any characters in the pipe buffer, the direct write will 57918316Swollman * be deferred until the receiving process grabs all of the bytes from 58018316Swollman * the pipe buffer. Then the direct mapping write is set-up. 
58118316Swollman */ 58218316Swollmanstatic int 58318316Swollmanpipe_direct_write(wpipe, uio) 58418316Swollman struct pipe *wpipe; 58518316Swollman struct uio *uio; 58618316Swollman{ 58718316Swollman int error; 58818316Swollman while (wpipe->pipe_state & PIPE_DIRECTW) { 58918316Swollman error = tsleep(wpipe, 59018316Swollman PRIBIO|PCATCH, "pipdww", 0); 59118316Swollman if (error || (wpipe->pipe_state & PIPE_EOF)) 59218316Swollman goto error1; 59318316Swollman } 59418316Swollman wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ 59518316Swollman wpipe->pipe_state |= PIPE_DIRECTW; 59618316Swollman while (wpipe->pipe_buffer.cnt > 0) { 59718316Swollman error = tsleep(wpipe, 59818316Swollman PRIBIO|PCATCH, "pipdwc", 0); 59946303Smarkm if (error || (wpipe->pipe_state & PIPE_EOF)) { 60018316Swollman wpipe->pipe_state &= ~PIPE_DIRECTW; 60118316Swollman if (error == 0) 60218316Swollman error = EPIPE; 60318316Swollman goto error1; 60418316Swollman } 60518316Swollman } 60618316Swollman 60718316Swollman error = pipe_build_write_buffer(wpipe, uio); 60818316Swollman if (error) { 60919896Swollman wpipe->pipe_state &= ~PIPE_DIRECTW; 61018316Swollman goto error1; 61118316Swollman } 61219896Swollman 61319896Swollman if (wpipe->pipe_state & PIPE_WANTR) { 61419896Swollman wpipe->pipe_state &= ~PIPE_WANTR; 61519896Swollman wakeup(wpipe); 61619896Swollman } 61719896Swollman 61819896Swollman error = 0; 61918316Swollman while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { 62018316Swollman if (wpipe->pipe_state & PIPE_EOF) { 62118316Swollman pipelock(wpipe, 0); 62218316Swollman pipe_destroy_write_buffer(wpipe); 62318316Swollman pipeunlock(wpipe); 62418316Swollman wakeup(wpipe); 62518316Swollman return EPIPE; 62618316Swollman } 62718316Swollman error = tsleep(wpipe, PRIBIO|PCATCH, "pipdwt", 0); 62819896Swollman } 62918316Swollman 63018316Swollman pipelock(wpipe,0); 63118316Swollman if (wpipe->pipe_state & PIPE_DIRECTW) { 63218316Swollman /* 63318316Swollman * this bit of trickery 
substitutes a kernel buffer for 63418316Swollman * the process that might be going away. 63518316Swollman */ 63618316Swollman pipe_clone_write_buffer(wpipe); 63718316Swollman } else { 63818316Swollman pipe_destroy_write_buffer(wpipe); 63918316Swollman } 64018316Swollman pipeunlock(wpipe); 64118316Swollman return error; 64218316Swollman 64318316Swollmanerror1: 64418316Swollman wakeup(wpipe); 64518316Swollman return error; 64618316Swollman} 64718316Swollman 64818316Swollmanstatic __inline int 64918316Swollmanpipewrite(wpipe, uio, nbio) 65018316Swollman struct pipe *wpipe; 65118316Swollman struct uio *uio; 65218316Swollman int nbio; 65318316Swollman{ 65418316Swollman int error = 0; 65546303Smarkm int orig_resid; 65618316Swollman 65718316Swollman /* 65818316Swollman * detect loss of pipe read side, issue SIGPIPE if lost. 65918316Swollman */ 66018316Swollman if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF)) { 66118316Swollman return EPIPE; 66246303Smarkm } 66318316Swollman 66418316Swollman if( wpipe->pipe_buffer.buffer == NULL) { 66518316Swollman if ((error = pipelock(wpipe,1)) == 0) { 66618316Swollman pipespace(wpipe); 66718316Swollman pipeunlock(wpipe); 66818316Swollman } else { 66918316Swollman return error; 67018316Swollman } 67118316Swollman } 67218316Swollman 67318316Swollman ++wpipe->pipe_busy; 67418316Swollman orig_resid = uio->uio_resid; 67518316Swollman while (uio->uio_resid) { 67618316Swollman int space; 67718316Swollman /* 67818316Swollman * If the transfer is large, we can gain performance if 67918316Swollman * we do process-to-process copies directly. 
68018316Swollman */ 68118316Swollman if ((amountpipekva < LIMITPIPEKVA) && 68218316Swollman (uio->uio_iov->iov_len >= PIPE_MINDIRECT)) { 68346303Smarkm error = pipe_direct_write( wpipe, uio); 68418316Swollman if (error) { 68518316Swollman break; 68618316Swollman } 68719896Swollman continue; 68818316Swollman } 68918316Swollman 69018316Swollman /* 69118316Swollman * Pipe buffered writes cannot be coincidental with 69218316Swollman * direct writes. We wait until the currently executing 69318316Swollman * direct write is completed before we start filling the 69418316Swollman * pipe buffer. 69519896Swollman */ 69619896Swollman retrywrite: 69719896Swollman while (wpipe->pipe_state & PIPE_DIRECTW) { 69818316Swollman error = tsleep(wpipe, 69918316Swollman PRIBIO|PCATCH, "pipbww", 0); 70018316Swollman if (error) 70118316Swollman break; 70218316Swollman } 70318316Swollman 70418316Swollman space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; 70518316Swollman if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) 70618316Swollman space = 0; 70718316Swollman 70818316Swollman /* 70918316Swollman * We must afford contiguous writes on buffers of size 71018316Swollman * PIPE_BUF or less. 71118316Swollman */ 71218316Swollman if (space > 0) { 71318316Swollman int size = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in; 71418316Swollman if (size > space) 71518316Swollman size = space; 71618316Swollman if (size > uio->uio_resid) 71718316Swollman size = uio->uio_resid; 71818316Swollman if ((error = pipelock(wpipe,1)) == 0) { 71918316Swollman /* 72018316Swollman * It is possible for a direct write to 72118316Swollman * slip in on us... handle it here... 
72218316Swollman */ 72318316Swollman if (wpipe->pipe_state & PIPE_DIRECTW) { 72418316Swollman pipeunlock(wpipe); 72518316Swollman goto retrywrite; 72618316Swollman } 72718316Swollman error = uiomove( &wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], 72818316Swollman size, uio); 72918316Swollman pipeunlock(wpipe); 73018316Swollman } 73118316Swollman if (error) 73218316Swollman break; 73318316Swollman 73418316Swollman wpipe->pipe_buffer.in += size; 73518316Swollman if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size) 73618316Swollman wpipe->pipe_buffer.in = 0; 73718316Swollman 73818316Swollman wpipe->pipe_buffer.cnt += size; 73918316Swollman } else { 74018316Swollman /* 74118316Swollman * If the "read-side" has been blocked, wake it up now. 74218316Swollman */ 74318316Swollman if (wpipe->pipe_state & PIPE_WANTR) { 74418316Swollman wpipe->pipe_state &= ~PIPE_WANTR; 74518316Swollman wakeup(wpipe); 74618316Swollman } 74718316Swollman /* 74818316Swollman * don't block on non-blocking I/O 74918316Swollman */ 75019896Swollman if (nbio) { 75119896Swollman error = EAGAIN; 75246303Smarkm break; 75346303Smarkm } 75418316Swollman 75546303Smarkm wpipe->pipe_state |= PIPE_WANTW; 75619896Swollman if (error = tsleep(wpipe, (PRIBIO+1)|PCATCH, "pipewr", 0)) { 75718316Swollman break; 75818316Swollman } 75918316Swollman /* 76018316Swollman * If read side wants to go away, we just issue a signal 76118316Swollman * to ourselves. 
76218316Swollman */ 76318316Swollman if (wpipe->pipe_state & PIPE_EOF) { 76418316Swollman error = EPIPE; 76518316Swollman break; 76618316Swollman } 76718316Swollman } 76818316Swollman } 76918316Swollman 77018316Swollman if ((wpipe->pipe_busy == 0) && 77118316Swollman (wpipe->pipe_state & PIPE_WANT)) { 77218316Swollman wpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTR); 77318316Swollman wakeup(wpipe); 77418316Swollman } else if (wpipe->pipe_buffer.cnt > 0) { 77518316Swollman /* 77618316Swollman * If we have put any characters in the buffer, we wake up 77718316Swollman * the reader. 77818316Swollman */ 77919896Swollman if (wpipe->pipe_state & PIPE_WANTR) { 78018316Swollman wpipe->pipe_state &= ~PIPE_WANTR; 78118316Swollman wakeup(wpipe); 78218316Swollman } 78318316Swollman } 78418316Swollman 78519896Swollman /* 78619896Swollman * Don't return EPIPE if I/O was successful 78718316Swollman */ 78818316Swollman if ((wpipe->pipe_buffer.cnt == 0) && 78918316Swollman (uio->uio_resid == 0) && 79018316Swollman (error == EPIPE)) 79119896Swollman error = 0; 79218316Swollman 79319896Swollman if (error = 0) { 79418316Swollman int s = splhigh(); 79518316Swollman wpipe->pipe_mtime = time; 79618316Swollman splx(s); 79718316Swollman } 79818316Swollman 79918316Swollman if (wpipe->pipe_state & PIPE_SEL) { 80018316Swollman wpipe->pipe_state &= ~PIPE_SEL; 80146303Smarkm selwakeup(&wpipe->pipe_sel); 80218316Swollman } 80318316Swollman 80446303Smarkm --wpipe->pipe_busy; 80546303Smarkm return error; 80618316Swollman} 80718316Swollman 80818316Swollman/* ARGSUSED */ 80918316Swollmanstatic int 81018316Swollmanpipe_write(fp, uio, cred) 81118316Swollman struct file *fp; 81218316Swollman struct uio *uio; 81318316Swollman struct ucred *cred; 81418316Swollman{ 81518316Swollman struct pipe *rpipe = (struct pipe *) fp->f_data; 81618316Swollman struct pipe *wpipe = rpipe->pipe_peer; 81718316Swollman return pipewrite(wpipe, uio, (rpipe->pipe_state & PIPE_NBIO)?1:0); 81818316Swollman} 81918316Swollman 
82018316Swollman/* 82118316Swollman * we implement a very minimal set of ioctls for compatibility with sockets. 82218316Swollman */ 82318316Swollmanint 82418316Swollmanpipe_ioctl(fp, cmd, data, p) 82518316Swollman struct file *fp; 82618316Swollman int cmd; 82718316Swollman register caddr_t data; 82818316Swollman struct proc *p; 82918316Swollman{ 83037908Scharnier register struct pipe *mpipe = (struct pipe *)fp->f_data; 83118316Swollman 83218316Swollman switch (cmd) { 83318316Swollman 83418316Swollman case FIONBIO: 83518316Swollman if (*(int *)data) 83618316Swollman mpipe->pipe_state |= PIPE_NBIO; 83718316Swollman else 83818316Swollman mpipe->pipe_state &= ~PIPE_NBIO; 83918316Swollman return (0); 84018316Swollman 84118316Swollman case FIOASYNC: 84218316Swollman if (*(int *)data) { 84318316Swollman mpipe->pipe_state |= PIPE_ASYNC; 84418316Swollman } else { 84518316Swollman mpipe->pipe_state &= ~PIPE_ASYNC; 84618316Swollman } 84718316Swollman return (0); 84818316Swollman 84918316Swollman case FIONREAD: 85018316Swollman *(int *)data = mpipe->pipe_buffer.cnt; 85118316Swollman return (0); 85219896Swollman 85319896Swollman case SIOCSPGRP: 85418316Swollman mpipe->pipe_pgid = *(int *)data; 85546303Smarkm return (0); 85618316Swollman 85718316Swollman case SIOCGPGRP: 85818316Swollman *(int *)data = mpipe->pipe_pgid; 85918316Swollman return (0); 86018316Swollman 86118316Swollman } 86218316Swollman return ENOSYS; 86318316Swollman} 86418316Swollman 86518316Swollmanint 86618316Swollmanpipe_select(fp, which, p) 86718316Swollman struct file *fp; 86818316Swollman int which; 86918316Swollman struct proc *p; 87018316Swollman{ 87118316Swollman register struct pipe *rpipe = (struct pipe *)fp->f_data; 87218316Swollman struct pipe *wpipe; 87320342Swollman 87419896Swollman wpipe = rpipe->pipe_peer; 87520342Swollman switch (which) { 87620342Swollman 87720342Swollman case FREAD: 87820342Swollman if (rpipe->pipe_buffer.cnt > 0 || 87919896Swollman (rpipe->pipe_state & PIPE_EOF)) { 
88018316Swollman return (1); 88146303Smarkm } 88219896Swollman selrecord(p, &rpipe->pipe_sel); 88319896Swollman rpipe->pipe_state |= PIPE_SEL; 88420342Swollman break; 88520342Swollman 88646303Smarkm case FWRITE: 88719896Swollman if ((wpipe == NULL) || 88819896Swollman (wpipe->pipe_state & PIPE_EOF) || 88919896Swollman ((wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) { 89020342Swollman return (1); 89120342Swollman } 89220342Swollman selrecord(p, &wpipe->pipe_sel); 89320342Swollman wpipe->pipe_state |= PIPE_SEL; 89420342Swollman break; 89520342Swollman 89620342Swollman case 0: 89720342Swollman if ((rpipe->pipe_state & PIPE_EOF) || 89820342Swollman (wpipe == NULL) || 89920342Swollman (wpipe->pipe_state & PIPE_EOF)) { 90020342Swollman return (1); 90120342Swollman } 90220342Swollman 90320342Swollman selrecord(p, &rpipe->pipe_sel); 90420342Swollman rpipe->pipe_state |= PIPE_SEL; 90520342Swollman break; 90620342Swollman } 90720342Swollman return (0); 90820342Swollman} 90920342Swollman 91020342Swollmanint 91120342Swollmanpipe_stat(pipe, ub) 91220342Swollman register struct pipe *pipe; 91320342Swollman register struct stat *ub; 91420342Swollman{ 91520342Swollman bzero((caddr_t)ub, sizeof (*ub)); 91620342Swollman ub->st_mode = S_IFSOCK; 91720342Swollman ub->st_blksize = pipe->pipe_buffer.size; 91820342Swollman ub->st_size = pipe->pipe_buffer.cnt; 91920342Swollman ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize; 92019896Swollman TIMEVAL_TO_TIMESPEC(&pipe->pipe_atime, &ub->st_atimespec); 92119896Swollman TIMEVAL_TO_TIMESPEC(&pipe->pipe_mtime, &ub->st_mtimespec); 92219896Swollman TIMEVAL_TO_TIMESPEC(&pipe->pipe_ctime, &ub->st_ctimespec); 92319896Swollman return 0; 92419896Swollman} 92519896Swollman 92619896Swollman/* ARGSUSED */ 92720342Swollmanstatic int 92819896Swollmanpipe_close(fp, p) 92919896Swollman struct file *fp; 93019896Swollman struct proc *p; 93119896Swollman{ 93219896Swollman int error = 0; 93319896Swollman struct pipe *cpipe = 
(struct pipe *)fp->f_data; 93419896Swollman pipeclose(cpipe); 93519896Swollman fp->f_data = NULL; 93646303Smarkm return 0; 93718316Swollman} 93818316Swollman 93918316Swollman/* 94018316Swollman * shutdown the pipe 94118316Swollman */ 94218316Swollmanstatic void 94318316Swollmanpipeclose(cpipe) 94418316Swollman struct pipe *cpipe; 94518316Swollman{ 94618316Swollman struct pipe *ppipe; 94718316Swollman if (cpipe) { 94818316Swollman 94918316Swollman if (cpipe->pipe_state & PIPE_SEL) { 95018316Swollman cpipe->pipe_state &= ~PIPE_SEL; 95118316Swollman selwakeup(&cpipe->pipe_sel); 95218316Swollman } 95318316Swollman 954 /* 955 * If the other side is blocked, wake it up saying that 956 * we want to close it down. 957 */ 958 while (cpipe->pipe_busy) { 959 wakeup(cpipe); 960 cpipe->pipe_state |= PIPE_WANT|PIPE_EOF; 961 tsleep(cpipe, PRIBIO, "pipecl", 0); 962 } 963 964 /* 965 * Disconnect from peer 966 */ 967 if (ppipe = cpipe->pipe_peer) { 968 if (ppipe->pipe_state & PIPE_SEL) { 969 ppipe->pipe_state &= ~PIPE_SEL; 970 selwakeup(&ppipe->pipe_sel); 971 } 972 973 ppipe->pipe_state |= PIPE_EOF; 974 wakeup(ppipe); 975 ppipe->pipe_peer = NULL; 976 } 977 978 /* 979 * free resources 980 */ 981 if (cpipe->pipe_buffer.buffer) { 982 amountpipekva -= cpipe->pipe_buffer.size; 983 kmem_free(kernel_map, 984 (vm_offset_t)cpipe->pipe_buffer.buffer, 985 cpipe->pipe_buffer.size); 986 } 987 if (cpipe->pipe_map.kva) { 988 amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE; 989 kmem_free(kernel_map, 990 cpipe->pipe_map.kva, 991 cpipe->pipe_buffer.size + PAGE_SIZE); 992 } 993 free(cpipe, M_TEMP); 994 } 995} 996#endif 997