/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/kern/sys_generic.c 360332 2020-04-26 08:35:32Z hselasky $");

#include "opt_capsicum.h"
#include "opt_compat.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/capsicum.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/sleepqueue.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <security/audit/audit.h>

/*
 * The following macro defines how many bytes will be allocated from
 * the stack instead of dynamically allocated memory when passing the
 * IOCTL data structures between userspace and the kernel.  Some IOCTLs
 * with small data structures are used very frequently, and this small
 * buffer on the stack gives a significant speedup for those requests.
 * The value of this define should be greater than or equal to 64
 * bytes and should also be a power of two.  The data structure is
 * currently hard-aligned to an 8-byte boundary on the stack.  This
 * should currently be sufficient for all supported platforms.
 */
#define	SYS_IOCTL_SMALL_SIZE	128	/* bytes */
#define	SYS_IOCTL_SMALL_ALIGN	8	/* bytes */
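/*
 * For example, a request such as FIONREAD carries only an int and fits
 * comfortably in the small on-stack buffer, while a hypothetical ioctl
 * whose argument structure exceeded SYS_IOCTL_SMALL_SIZE bytes would
 * fall back to a malloc(9) allocation in sys_ioctl() below.
 */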
int iosize_max_clamp = 1;
SYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW,
    &iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX");
int devfs_iosize_max_clamp = 1;
SYSCTL_INT(_debug, OID_AUTO, devfs_iosize_max_clamp, CTLFLAG_RW,
    &devfs_iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX for devices");

/*
 * Assert that the return value of read(2) and write(2) syscalls fits
 * into a register.  If not, an architecture will need to provide the
 * usermode wrappers to reconstruct the result.
 */
CTASSERT(sizeof(register_t) >= sizeof(size_t));

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int	pollout(struct thread *, struct pollfd *, struct pollfd *,
		    u_int);
static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	pollrescan(struct thread *);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	selrescan(struct thread *, fd_mask **, fd_mask **);
static void	selfdalloc(struct thread *, void *);
static void	selfdfree(struct seltd *, struct selfd *);
static int	dofileread(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static void	doselwakeup(struct selinfo *, int);
static void	seltdinit(struct thread *);
static int	seltdwait(struct thread *, sbintime_t, sbintime_t);
static void	seltdclear(struct thread *);

/*
 * One seltd per-thread allocated on demand as needed.
 *
 *	t - protected by st_mtx
 *	k - Only accessed by curthread or read-only
 */
struct seltd {
	STAILQ_HEAD(, selfd)	st_selq;	/* (k) List of selfds. */
	struct selfd		*st_free1;	/* (k) free fd for read set. */
	struct selfd		*st_free2;	/* (k) free fd for write set. */
	struct mtx		st_mtx;		/* Protects struct seltd */
	struct cv		st_wait;	/* (t) Wait channel. */
	int			st_flags;	/* (t) SELTD_ flags. */
};

#define	SELTD_PENDING	0x0001			/* We have pending events. */
#define	SELTD_RESCAN	0x0002			/* Doing a rescan. */

/*
 * One selfd allocated per-thread per-file-descriptor.
 *	f - protected by sf_mtx
 */
struct selfd {
	STAILQ_ENTRY(selfd)	sf_link;	/* (k) fds owned by this td. */
	TAILQ_ENTRY(selfd)	sf_threads;	/* (f) fds on this selinfo. */
	struct selinfo		*sf_si;		/* (f) selinfo when linked. */
	struct mtx		*sf_mtx;	/* Pointer to selinfo mtx. */
	struct seltd		*sf_td;		/* (k) owning seltd. */
	void			*sf_cookie;	/* (k) fd or pollfd. */
};

static uma_zone_t selfd_zone;
static struct mtx_pool *mtxpool_select;
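/*
 * A brief sketch of how these pieces fit together: selfdalloc()
 * preallocates selfds for a scan, selrecord() links a selfd onto a
 * backend's selinfo while the thread's seltd collects all of them on
 * st_selq, doselwakeup() detaches fired selfds and signals st_wait,
 * and selfdfree()/seltdclear() release whatever remains.
 */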
#ifndef _SYS_SYSPROTO_H_
struct read_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
};
#endif
int
sys_read(td, uap)
	struct thread *td;
	struct read_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_readv(td, uap->fd, &auio);
	return(error);
}

/*
 * Positioned read system call
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
sys_pread(td, uap)
	struct thread *td;
	struct pread_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_preadv(td, uap->fd, &auio, uap->offset);
	return(error);
}

int
freebsd6_pread(td, uap)
	struct thread *td;
	struct freebsd6_pread_args *uap;
{
	struct pread_args oargs;

	oargs.fd = uap->fd;
	oargs.buf = uap->buf;
	oargs.nbyte = uap->nbyte;
	oargs.offset = uap->offset;
	return (sys_pread(td, &oargs));
}

/*
 * Scatter read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
};
#endif
int
sys_readv(struct thread *td, struct readv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_readv(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

int
kern_readv(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	cap_rights_t rights;
	int error;

	error = fget_read(td, fd, cap_rights_init(&rights, CAP_READ), &fp);
	if (error)
		return (error);
	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}
/*
 * Scatter positioned read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct preadv_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
sys_preadv(struct thread *td, struct preadv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_preadv(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

int
kern_preadv(td, fd, auio, offset)
	struct thread *td;
	int fd;
	struct uio *auio;
	off_t offset;
{
	struct file *fp;
	cap_rights_t rights;
	int error;

	error = fget_read(td, fd, cap_rights_init(&rights, CAP_PREAD), &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 &&
	    (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR))
		error = EINVAL;
	else
		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}

/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 */
static int
dofileread(td, fd, fp, auio, offset, flags)
	struct thread *td;
	int fd;
	struct file *fp;
	struct uio *auio;
	off_t offset;
	int flags;
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	/* Finish zero length reads right here */
	if (auio->uio_resid == 0) {
		td->td_retval[0] = 0;
		return(0);
	}
	auio->uio_rw = UIO_READ;
	auio->uio_offset = offset;
	auio->uio_td = td;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
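	/*
	 * If fo_read() is interrupted after some bytes have already been
	 * transferred (uio_resid changed), the code below reports the
	 * partial transfer to the caller instead of the error, matching
	 * the usual POSIX short-read behaviour.
	 */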
	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct write_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
};
#endif
int
sys_write(td, uap)
	struct thread *td;
	struct write_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_writev(td, uap->fd, &auio);
	return(error);
}

/*
 * Positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwrite_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
sys_pwrite(td, uap)
	struct thread *td;
	struct pwrite_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > IOSIZE_MAX)
		return (EINVAL);
	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_pwritev(td, uap->fd, &auio, uap->offset);
	return(error);
}

int
freebsd6_pwrite(td, uap)
	struct thread *td;
	struct freebsd6_pwrite_args *uap;
{
	struct pwrite_args oargs;

	oargs.fd = uap->fd;
	oargs.buf = uap->buf;
	oargs.nbyte = uap->nbyte;
	oargs.offset = uap->offset;
	return (sys_pwrite(td, &oargs));
}

/*
 * Gather write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct writev_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
};
#endif
int
sys_writev(struct thread *td, struct writev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_writev(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

int
kern_writev(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	cap_rights_t rights;
	int error;

	error = fget_write(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
	if (error)
		return (error);
	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}

/*
 * Gather positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwritev_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
sys_pwritev(struct thread *td, struct pwritev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_pwritev(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

int
kern_pwritev(td, fd, auio, offset)
	struct thread *td;
	struct uio *auio;
	int fd;
	off_t offset;
{
	struct file *fp;
	cap_rights_t rights;
	int error;

	error = fget_write(td, fd, cap_rights_init(&rights, CAP_PWRITE), &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 &&
	    (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR))
		error = EINVAL;
	else
		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}
/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 */
static int
dofilewrite(td, fd, fp, auio, offset, flags)
	struct thread *td;
	int fd;
	struct file *fp;
	struct uio *auio;
	off_t offset;
	int flags;
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	auio->uio_rw = UIO_WRITE;
	auio->uio_td = td;
	auio->uio_offset = offset;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if (fp->f_type == DTYPE_VNODE &&
	    (fp->f_vnread_flags & FDEVFS_VNODE) == 0)
		bwillwrite();
	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

/*
 * Truncate a file given a file descriptor.
 *
 * Can't use fget_write() here, since we must return EINVAL and not
 * EBADF if the descriptor isn't writable.
 */
int
kern_ftruncate(td, fd, length)
	struct thread *td;
	int fd;
	off_t length;
{
	struct file *fp;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_FD(fd);
	if (length < 0)
		return (EINVAL);
	error = fget(td, fd, cap_rights_init(&rights, CAP_FTRUNCATE), &fp);
	if (error)
		return (error);
	AUDIT_ARG_FILE(td->td_proc, fp);
	if (!(fp->f_flag & FWRITE)) {
		fdrop(fp, td);
		return (EINVAL);
	}
	error = fo_truncate(fp, length, td->td_ucred, td);
	fdrop(fp, td);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct ftruncate_args {
	int	fd;
	int	pad;
	off_t	length;
};
#endif
int
sys_ftruncate(td, uap)
	struct thread *td;
	struct ftruncate_args *uap;
{

	return (kern_ftruncate(td, uap->fd, uap->length));
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct oftruncate_args {
	int	fd;
	long	length;
};
#endif
int
oftruncate(td, uap)
	struct thread *td;
	struct oftruncate_args *uap;
{

	return (kern_ftruncate(td, uap->fd, uap->length));
}
#endif /* COMPAT_43 */
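/*
 * An illustrative note on the ioctl request encoding (see
 * <sys/ioccom.h>): the low word holds the group and command number,
 * while the high word holds the IOC_VOID/IOC_IN/IOC_OUT direction bits
 * plus the argument size that IOCPARM_LEN() extracts below.  For
 * example, FIONREAD is _IOR('f', 127, int), i.e. IOC_OUT with an
 * int-sized payload.
 */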
#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/* ARGSUSED */
int
sys_ioctl(struct thread *td, struct ioctl_args *uap)
{
	u_char smalldata[SYS_IOCTL_SMALL_SIZE] __aligned(SYS_IOCTL_SMALL_ALIGN);
	uint32_t com;
	int arg, error;
	u_int size;
	caddr_t data;

#ifdef INVARIANTS
	if (uap->com > 0xffffffff) {
		printf(
		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
		    td->td_proc->p_pid, td->td_name, uap->com);
	}
#endif
	com = (uint32_t)uap->com;

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if ((size > IOCPARM_MAX) ||
	    ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) ||
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
	    ((com & IOC_OUT) && size == 0) ||
#else
	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
#endif
	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
		return (ENOTTY);

	if (size > 0) {
		if (com & IOC_VOID) {
			/* Integer argument. */
			arg = (intptr_t)uap->data;
			data = (void *)&arg;
			size = 0;
		} else {
			if (size > SYS_IOCTL_SMALL_SIZE)
				data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
			else
				data = smalldata;
		}
	} else
		data = (void *)&uap->data;
	if (com & IOC_IN) {
		error = copyin(uap->data, data, (u_int)size);
		if (error != 0)
			goto out;
	} else if (com & IOC_OUT) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	}

	error = kern_ioctl(td, uap->fd, com, data);

	if (error == 0 && (com & IOC_OUT))
		error = copyout(data, uap->data, (u_int)size);

out:
	if (size > SYS_IOCTL_SMALL_SIZE)
		free(data, M_IOCTLOPS);
	return (error);
}
int
kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
{
	struct file *fp;
	struct filedesc *fdp;
#ifndef CAPABILITIES
	cap_rights_t rights;
#endif
	int error, tmp, locked;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_CMD(com);

	fdp = td->td_proc->p_fd;

	switch (com) {
	case FIONCLEX:
	case FIOCLEX:
		FILEDESC_XLOCK(fdp);
		locked = LA_XLOCKED;
		break;
	default:
#ifdef CAPABILITIES
		FILEDESC_SLOCK(fdp);
		locked = LA_SLOCKED;
#else
		locked = LA_UNLOCKED;
#endif
		break;
	}

#ifdef CAPABILITIES
	if ((fp = fget_locked(fdp, fd)) == NULL) {
		error = EBADF;
		goto out;
	}
	if ((error = cap_ioctl_check(fdp, fd, com)) != 0) {
		fp = NULL;	/* fhold() was not called yet */
		goto out;
	}
	fhold(fp);
	if (locked == LA_SLOCKED) {
		FILEDESC_SUNLOCK(fdp);
		locked = LA_UNLOCKED;
	}
#else
	error = fget(td, fd, cap_rights_init(&rights, CAP_IOCTL), &fp);
	if (error != 0) {
		fp = NULL;
		goto out;
	}
#endif
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		goto out;
	}

	switch (com) {
	case FIONCLEX:
		fdp->fd_ofiles[fd].fde_flags &= ~UF_EXCLOSE;
		goto out;
	case FIOCLEX:
		fdp->fd_ofiles[fd].fde_flags |= UF_EXCLOSE;
		goto out;
	case FIONBIO:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clear_int(&fp->f_flag, FNONBLOCK);
		data = (void *)&tmp;
		break;
	case FIOASYNC:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FASYNC);
		else
			atomic_clear_int(&fp->f_flag, FASYNC);
		data = (void *)&tmp;
		break;
	}

	error = fo_ioctl(fp, com, data, td->td_ucred, td);
out:
	switch (locked) {
	case LA_XLOCKED:
		FILEDESC_XUNLOCK(fdp);
		break;
#ifdef CAPABILITIES
	case LA_SLOCKED:
		FILEDESC_SUNLOCK(fdp);
		break;
#endif
	default:
		FILEDESC_UNLOCK_ASSERT(fdp);
		break;
	}
	if (fp != NULL)
		fdrop(fp, td);
	return (error);
}

int
poll_no_poll(int events)
{
	/*
	 * Return true for read/write.  If the user asked for something
	 * special, return POLLNVAL, so that clients have a way of
	 * determining reliably whether or not the extended
	 * functionality is present without hard-coding knowledge
	 * of specific filesystem implementations.
	 */
	if (events & ~POLLSTANDARD)
		return (POLLNVAL);

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

int
sys_pselect(struct thread *td, struct pselect_args *uap)
{
	struct timespec ts;
	struct timeval tv, *tvp;
	sigset_t set, *uset;
	int error;

	if (uap->ts != NULL) {
		error = copyin(uap->ts, &ts, sizeof(ts));
		if (error != 0)
			return (error);
		TIMESPEC_TO_TIMEVAL(&tv, &ts);
		tvp = &tv;
	} else
		tvp = NULL;
	if (uap->sm != NULL) {
		error = copyin(uap->sm, &set, sizeof(set));
		if (error != 0)
			return (error);
		uset = &set;
	} else
		uset = NULL;
	return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
	    uset, NFDBITS));
}

int
kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex,
    struct timeval *tvp, sigset_t *uset, int abi_nfdbits)
{
	int error;

	if (uset != NULL) {
		error = kern_sigprocmask(td, SIG_SETMASK, uset,
		    &td->td_oldsigmask, 0);
		if (error != 0)
			return (error);
		td->td_pflags |= TDP_OLDMASK;
		/*
		 * Make sure that ast() is called on return to
		 * usermode and TDP_OLDMASK is cleared, restoring old
		 * sigmask.
		 */
		thread_lock(td);
		td->td_flags |= TDF_ASTPENDING;
		thread_unlock(td);
	}
	error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct	timeval *tv;
};
#endif
int
sys_select(struct thread *td, struct select_args *uap)
{
	struct timeval tv, *tvp;
	int error;

	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
	    NFDBITS));
}

/*
 * In the unlikely case when the user specified nd greater than the
 * last open file descriptor, check that no bits are set after the
 * last valid fd.  We must return EBADF if any is set.
 *
 * There are applications that rely on this behaviour.
 *
 * nd is fd_lastfile + 1.
 */
static int
select_check_badfd(fd_set *fd_in, int nd, int ndu, int abi_nfdbits)
{
	char *addr, *oaddr;
	int b, i, res;
	uint8_t bits;

	if (nd >= ndu || fd_in == NULL)
		return (0);

	oaddr = NULL;
	bits = 0; /* silence gcc */
	for (i = nd; i < ndu; i++) {
		b = i / NBBY;
#if BYTE_ORDER == LITTLE_ENDIAN
		addr = (char *)fd_in + b;
#else
		addr = (char *)fd_in;
		if (abi_nfdbits == NFDBITS) {
			addr += rounddown(b, sizeof(fd_mask)) +
			    sizeof(fd_mask) - 1 - b % sizeof(fd_mask);
		} else {
			addr += rounddown(b, sizeof(uint32_t)) +
			    sizeof(uint32_t) - 1 - b % sizeof(uint32_t);
		}
#endif
		if (addr != oaddr) {
			res = fubyte(addr);
			if (res == -1)
				return (EFAULT);
			oaddr = addr;
			bits = res;
		}
		if ((bits & (1 << (i % NBBY))) != 0)
			return (EBADF);
	}
	return (0);
}
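/*
 * For example, with fd_lastfile == 3 a call such as select(64, ...)
 * yields nd == 4 and ndu == 64 above; if the caller left a stray bit
 * set for, say, descriptor 40 in any of its sets, the scan returns
 * EBADF rather than silently ignoring the bogus descriptor.
 */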
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval rtv;
	sbintime_t asbt, precision, rsbt;
	u_int nbufbytes, ncpbytes, ncpubytes, nfdbits;
	int error, lf, ndu;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	ndu = nd;
	lf = fdp->fd_lastfile;
	if (nd > lf + 1)
		nd = lf + 1;

	error = select_check_badfd(fd_in, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);
	error = select_check_badfd(fd_ou, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);
	error = select_check_badfd(fd_ex, nd, ndu, abi_nfdbits);
	if (error != 0)
		return (error);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	ncpubytes = roundup(nd, abi_nfdbits) / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL) {					\
			ibits[x] = NULL;				\
			obits[x] = NULL;				\
		} else {						\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpubytes);	\
			if (error != 0)					\
				goto done;				\
			bzero((char *)ibits[x] + ncpubytes,		\
			    ncpbytes - ncpubytes);			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
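	/*
	 * At this point selbits holds the output sets back to back in
	 * its lower half and the corresponding copied-in input sets in
	 * its upper half; e.g. with only fd_in and fd_ou given, obits[0]
	 * and obits[1] occupy the low half while ibits[0] and ibits[1]
	 * mirror them nbufbytes / 2 bytes further up.
	 */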
#if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__)
	/*
	 * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS,
	 * we are running under 32-bit emulation. This should be more
	 * generic.
	 */
#define	swizzle_fdset(bits)						\
	if (abi_nfdbits != NFDBITS && bits != NULL) {			\
		int i;							\
		for (i = 0; i < ncpbytes / sizeof *sbp; i++)		\
			bits[i] = (bits[i] >> 32) | (bits[i] << 32);	\
	}
#else
#define	swizzle_fdset(bits)
#endif

	/* Make sure the bit order makes it through an ABI transition */
	swizzle_fdset(ibits[0]);
	swizzle_fdset(ibits[1]);
	swizzle_fdset(ibits[2]);

	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	precision = 0;
	if (tvp != NULL) {
		rtv = *tvp;
		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
		    rtv.tv_usec >= 1000000) {
			error = EINVAL;
			goto done;
		}
		if (!timevalisset(&rtv))
			asbt = 0;
		else if (rtv.tv_sec <= INT32_MAX) {
			rsbt = tvtosbt(rtv);
			precision = rsbt;
			precision >>= tc_precexp;
			if (TIMESEL(&asbt, rsbt))
				asbt += tc_tick_sbt;
			if (asbt <= SBT_MAX - rsbt)
				asbt += rsbt;
			else
				asbt = -1;
		} else
			asbt = -1;
	} else
		asbt = -1;
	seltdinit(td);
	/* Iterate until the timeout expires or descriptors become ready. */
	for (;;) {
		error = selscan(td, ibits, obits, nd);
		if (error || td->td_retval[0] != 0)
			break;
		error = seltdwait(td, asbt, precision);
		if (error)
			break;
		error = selrescan(td, ibits, obits);
		if (error || td->td_retval[0] != 0)
			break;
	}
	seltdclear(td);

done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;

	/* swizzle bit order back, if necessary */
	swizzle_fdset(obits[0]);
	swizzle_fdset(obits[1]);
	swizzle_fdset(obits[2]);
#undef swizzle_fdset

#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpubytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}
/*
 * Convert a select bit set to poll flags.
 *
 * The backend always returns POLLHUP/POLLERR if appropriate and we
 * return this as a set bit in any set.
 */
static int select_flags[3] = {
    POLLRDNORM | POLLHUP | POLLERR,
    POLLWRNORM | POLLHUP | POLLERR,
    POLLRDBAND | POLLERR
};

/*
 * Compute the fo_poll flags required for a fd given by the index and
 * bit position in the fd_mask array.
 */
static __inline int
selflags(fd_mask **ibits, int idx, fd_mask bit)
{
	int flags;
	int msk;

	flags = 0;
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		if ((ibits[msk][idx] & bit) == 0)
			continue;
		flags |= select_flags[msk];
	}
	return (flags);
}
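/*
 * For instance, a descriptor present only in the read set polls with
 * POLLRDNORM | POLLHUP | POLLERR; one in both the read and write sets
 * polls with the union of the first two select_flags entries, so a
 * hangup or error reported by the backend surfaces in every set the
 * caller asked about.
 */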
/*
 * Set the appropriate output bits given a mask of fired events and the
 * input bits originally requested.
 */
static __inline int
selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events)
{
	int msk;
	int n;

	n = 0;
	for (msk = 0; msk < 3; msk++) {
		if ((events & select_flags[msk]) == 0)
			continue;
		if (ibits[msk] == NULL)
			continue;
		if ((ibits[msk][idx] & bit) == 0)
			continue;
		/*
		 * XXX Check for a duplicate set.  This can occur because a
		 * socket calls selrecord() twice for each poll() call
		 * resulting in two selfds per real fd.  selrescan() will
		 * call selsetbits twice as a result.
		 */
		if ((obits[msk][idx] & bit) != 0)
			continue;
		obits[msk][idx] |= bit;
		n++;
	}

	return (n);
}

static __inline int
getselfd_cap(struct filedesc *fdp, int fd, struct file **fpp)
{
	cap_rights_t rights;

	cap_rights_init(&rights, CAP_EVENT);

	return (fget_unlocked(fdp, fd, &rights, 0, fpp, NULL));
}

/*
 * Traverse the list of fds attached to this thread's seltd and check for
 * completion.
 */
static int
selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits)
{
	struct filedesc *fdp;
	struct selinfo *si;
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;
	struct file *fp;
	fd_mask bit;
	int fd, ev, n, idx;
	int error;

	fdp = td->td_proc->p_fd;
	stp = td->td_sel;
	n = 0;
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
		fd = (int)(uintptr_t)sfp->sf_cookie;
		si = sfp->sf_si;
		selfdfree(stp, sfp);
		/* If the selinfo wasn't cleared the event didn't fire. */
		if (si != NULL)
			continue;
		error = getselfd_cap(fdp, fd, &fp);
		if (error)
			return (error);
		idx = fd / NFDBITS;
		bit = (fd_mask)1 << (fd % NFDBITS);
		ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td);
		fdrop(fp, td);
		if (ev != 0)
			n += selsetbits(ibits, obits, idx, bit, ev);
	}
	stp->st_flags = 0;
	td->td_retval[0] = n;
	return (0);
}
/*
 * Perform the initial filedescriptor scan and register ourselves with
 * each selinfo.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	struct filedesc *fdp;
	struct file *fp;
	fd_mask bit;
	int ev, flags, end, fd;
	int n, idx;
	int error;

	fdp = td->td_proc->p_fd;
	n = 0;
	for (idx = 0, fd = 0; fd < nfd; idx++) {
		end = imin(fd + NFDBITS, nfd);
		for (bit = 1; fd < end; bit <<= 1, fd++) {
			/* Compute the list of events we're interested in. */
			flags = selflags(ibits, idx, bit);
			if (flags == 0)
				continue;
			error = getselfd_cap(fdp, fd, &fp);
			if (error)
				return (error);
			selfdalloc(td, (void *)(uintptr_t)fd);
			ev = fo_poll(fp, flags, td->td_ucred, td);
			fdrop(fp, td);
			if (ev != 0)
				n += selsetbits(ibits, obits, idx, bit, ev);
		}
	}

	td->td_retval[0] = n;
	return (0);
}

int
sys_poll(struct thread *td, struct poll_args *uap)
{
	struct timespec ts, *tsp;

	if (uap->timeout != INFTIM) {
		if (uap->timeout < 0)
			return (EINVAL);
		ts.tv_sec = uap->timeout / 1000;
		ts.tv_nsec = (uap->timeout % 1000) * 1000000;
		tsp = &ts;
	} else
		tsp = NULL;

	return (kern_poll(td, uap->fds, uap->nfds, tsp, NULL));
}
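/*
 * For example, a poll(2) timeout of 1500 ms becomes the timespec
 * { .tv_sec = 1, .tv_nsec = 500000000 } above, while INFTIM (-1)
 * passes a NULL timespec to kern_poll() to block indefinitely.
 */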
int
kern_poll(struct thread *td, struct pollfd *fds, u_int nfds,
    struct timespec *tsp, sigset_t *uset)
{
	struct pollfd *bits;
	struct pollfd smallbits[32];
	sbintime_t sbt, precision, tmp;
	time_t over;
	struct timespec ts;
	int error;
	size_t ni;

	precision = 0;
	if (tsp != NULL) {
		if (tsp->tv_sec < 0)
			return (EINVAL);
		if (tsp->tv_nsec < 0 || tsp->tv_nsec >= 1000000000)
			return (EINVAL);
		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
			sbt = 0;
		else {
			ts = *tsp;
			if (ts.tv_sec > INT32_MAX / 2) {
				over = ts.tv_sec - INT32_MAX / 2;
				ts.tv_sec -= over;
			} else
				over = 0;
			tmp = tstosbt(ts);
			precision = tmp;
			precision >>= tc_precexp;
			if (TIMESEL(&sbt, tmp))
				sbt += tc_tick_sbt;
			sbt += tmp;
		}
	} else
		sbt = -1;

	if (nfds > maxfilesperproc && nfds > FD_SETSIZE)
		return (EINVAL);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(fds, bits, ni);
	if (error)
		goto done;

	if (uset != NULL) {
		error = kern_sigprocmask(td, SIG_SETMASK, uset,
		    &td->td_oldsigmask, 0);
		if (error)
			goto done;
		td->td_pflags |= TDP_OLDMASK;
		/*
		 * Make sure that ast() is called on return to
		 * usermode and TDP_OLDMASK is cleared, restoring old
		 * sigmask.
		 */
		thread_lock(td);
		td->td_flags |= TDF_ASTPENDING;
		thread_unlock(td);
	}

	seltdinit(td);
	/* Iterate until the timeout expires or descriptors become ready. */
	for (;;) {
		error = pollscan(td, bits, nfds);
		if (error || td->td_retval[0] != 0)
			break;
		error = seltdwait(td, sbt, precision);
		if (error)
			break;
		error = pollrescan(td);
		if (error || td->td_retval[0] != 0)
			break;
	}
	seltdclear(td);

done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = pollout(td, bits, fds, nfds);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
	return (error);
}

int
sys_ppoll(struct thread *td, struct ppoll_args *uap)
{
	struct timespec ts, *tsp;
	sigset_t set, *ssp;
	int error;

	if (uap->ts != NULL) {
		error = copyin(uap->ts, &ts, sizeof(ts));
		if (error)
			return (error);
		tsp = &ts;
	} else
		tsp = NULL;
	if (uap->set != NULL) {
		error = copyin(uap->set, &set, sizeof(set));
		if (error)
			return (error);
		ssp = &set;
	} else
		ssp = NULL;
	/*
	 * fds is still a pointer to user space.  kern_poll() will
	 * take care of copying that array into kernel space.
	 */

	return (kern_poll(td, uap->fds, uap->nfds, tsp, ssp));
}
static int
pollrescan(struct thread *td)
{
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;
	struct selinfo *si;
	struct filedesc *fdp;
	struct file *fp;
	struct pollfd *fd;
#ifdef CAPABILITIES
	cap_rights_t rights;
#endif
	int n;

	n = 0;
	fdp = td->td_proc->p_fd;
	stp = td->td_sel;
	FILEDESC_SLOCK(fdp);
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
		fd = (struct pollfd *)sfp->sf_cookie;
		si = sfp->sf_si;
		selfdfree(stp, sfp);
		/* If the selinfo wasn't cleared the event didn't fire. */
		if (si != NULL)
			continue;
		fp = fdp->fd_ofiles[fd->fd].fde_file;
#ifdef CAPABILITIES
		if (fp == NULL ||
		    cap_check(cap_rights(fdp, fd->fd),
		    cap_rights_init(&rights, CAP_EVENT)) != 0)
#else
		if (fp == NULL)
#endif
		{
			fd->revents = POLLNVAL;
			n++;
			continue;
		}

		/*
		 * Note: backend also returns POLLHUP and
		 * POLLERR if appropriate.
		 */
		fd->revents = fo_poll(fp, fd->events, td->td_ucred, td);
		if (fd->revents != 0)
			n++;
	}
	FILEDESC_SUNLOCK(fdp);
	stp->st_flags = 0;
	td->td_retval[0] = n;
	return (0);
}

static int
pollout(td, fds, ufds, nfd)
	struct thread *td;
	struct pollfd *fds;
	struct pollfd *ufds;
	u_int nfd;
{
	int error = 0;
	u_int i = 0;
	u_int n = 0;

	for (i = 0; i < nfd; i++) {
		error = copyout(&fds->revents, &ufds->revents,
		    sizeof(ufds->revents));
		if (error)
			return (error);
		if (fds->revents != 0)
			n++;
		fds++;
		ufds++;
	}
	td->td_retval[0] = n;
	return (0);
}

static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct file *fp;
#ifdef CAPABILITIES
	cap_rights_t rights;
#endif
	int i, n = 0;

	FILEDESC_SLOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd > fdp->fd_lastfile) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd].fde_file;
#ifdef CAPABILITIES
			if (fp == NULL ||
			    cap_check(cap_rights(fdp, fds->fd),
			    cap_rights_init(&rights, CAP_EVENT)) != 0)
#else
			if (fp == NULL)
#endif
			{
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				selfdalloc(td, fds);
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				/*
				 * POSIX requires POLLOUT to never be
				 * set simultaneously with POLLHUP.
				 */
				if ((fds->revents & POLLHUP) != 0)
					fds->revents &= ~POLLOUT;

				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_SUNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 *
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
int
sys_openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	return (sys_poll(td, (struct poll_args *)uap));
}

/*
 * XXX This was created specifically to support netncp and netsmb.  This
 * allows the caller to specify a socket to wait for events on.  It returns
 * 0 if any events matched and an error otherwise.  There is no way to
 * determine which events fired.
 */
int
selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
{
	struct timeval rtv;
	sbintime_t asbt, precision, rsbt;
	int error;

	precision = 0;	/* stupid gcc! */
	if (tvp != NULL) {
		rtv = *tvp;
		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
		    rtv.tv_usec >= 1000000)
			return (EINVAL);
		if (!timevalisset(&rtv))
			asbt = 0;
		else if (rtv.tv_sec <= INT32_MAX) {
			rsbt = tvtosbt(rtv);
			precision = rsbt;
			precision >>= tc_precexp;
			if (TIMESEL(&asbt, rsbt))
				asbt += tc_tick_sbt;
			if (asbt <= SBT_MAX - rsbt)
				asbt += rsbt;
			else
				asbt = -1;
		} else
			asbt = -1;
	} else
		asbt = -1;
	seltdinit(td);
	/*
	 * Iterate until the timeout expires or the socket becomes ready.
	 */
	for (;;) {
		selfdalloc(td, NULL);
		error = sopoll(so, events, NULL, td);
		/* error here is actually the ready events. */
		if (error)
			return (0);
		error = seltdwait(td, asbt, precision);
		if (error)
			break;
	}
	seltdclear(td);
	/* XXX Duplicates ncp/smb behavior. */
	if (error == ERESTART)
		error = 0;
	return (error);
}
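/*
 * A minimal usage sketch (assuming a connected struct socket *so and
 * the calling thread td): waiting up to one second for readability
 * would look like
 *
 *	struct timeval tv = { 1, 0 };
 *	error = selsocket(so, POLLIN, &tv, td);
 *
 * where a return of 0 means some requested event fired and EWOULDBLOCK
 * means the timeout expired first.
 */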
/*
 * Preallocate two selfds associated with 'cookie'.  Some fo_poll routines
 * have two select sets, one for read and another for write.
 */
static void
selfdalloc(struct thread *td, void *cookie)
{
	struct seltd *stp;

	stp = td->td_sel;
	if (stp->st_free1 == NULL)
		stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
	stp->st_free1->sf_td = stp;
	stp->st_free1->sf_cookie = cookie;
	if (stp->st_free2 == NULL)
		stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
	stp->st_free2->sf_td = stp;
	stp->st_free2->sf_cookie = cookie;
}

static void
selfdfree(struct seltd *stp, struct selfd *sfp)
{
	STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link);
	mtx_lock(sfp->sf_mtx);
	if (sfp->sf_si)
		TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads);
	mtx_unlock(sfp->sf_mtx);
	uma_zfree(selfd_zone, sfp);
}

/* Drain the waiters tied to all the selfd belonging to the specified selinfo. */
void
seldrain(sip)
	struct selinfo *sip;
{

	/*
	 * This feature is already provided by doselwakeup(), thus it is
	 * enough to go for it.
	 * Eventually, the context should take care to avoid races
	 * between thread calling select()/poll() and file descriptor
	 * detaching, but, again, the races are just the same as
	 * selwakeup().
	 */
	doselwakeup(sip, -1);
}

/*
 * Record a select request.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{
	struct selfd *sfp;
	struct seltd *stp;
	struct mtx *mtxp;

	stp = selector->td_sel;
	/*
	 * Don't record when doing a rescan.
	 */
	if (stp->st_flags & SELTD_RESCAN)
		return;
	/*
	 * Grab one of the preallocated descriptors.
	 */
	sfp = NULL;
	if ((sfp = stp->st_free1) != NULL)
		stp->st_free1 = NULL;
	else if ((sfp = stp->st_free2) != NULL)
		stp->st_free2 = NULL;
	else
		panic("selrecord: No free selfd on selq");
	mtxp = sip->si_mtx;
	if (mtxp == NULL)
		mtxp = mtx_pool_find(mtxpool_select, sip);
	/*
	 * Initialize the sfp and queue it in the thread.
	 */
	sfp->sf_si = sip;
	sfp->sf_mtx = mtxp;
	STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link);
	/*
	 * Now that we've locked the sip, check for initialization.
	 */
	mtx_lock(mtxp);
	if (sip->si_mtx == NULL) {
		sip->si_mtx = mtxp;
		TAILQ_INIT(&sip->si_tdlist);
	}
	/*
	 * Add this thread to the list of selfds listening on this selinfo.
	 */
	TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads);
	mtx_unlock(sip->si_mtx);
}

/* Wake up a selecting thread. */
void
selwakeup(sip)
	struct selinfo *sip;
{
	doselwakeup(sip, -1);
}

/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(sip, pri)
	struct selinfo *sip;
	int pri;
{
	doselwakeup(sip, pri);
}
/*
 * Do a wakeup when a selectable event occurs.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct selfd *sfp;
	struct selfd *sfn;
	struct seltd *stp;

	/* If it's not initialized there can't be any waiters. */
	if (sip->si_mtx == NULL)
		return;
	/*
	 * Locking the selinfo locks all selfds associated with it.
	 */
	mtx_lock(sip->si_mtx);
	TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) {
		/*
		 * Once we remove this sfp from the list and clear
		 * sf_si, seltdclear will know to ignore this si.
		 */
		TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads);
		sfp->sf_si = NULL;
		stp = sfp->sf_td;
		mtx_lock(&stp->st_mtx);
		stp->st_flags |= SELTD_PENDING;
		cv_broadcastpri(&stp->st_wait, pri);
		mtx_unlock(&stp->st_mtx);
	}
	mtx_unlock(sip->si_mtx);
}

static void
seltdinit(struct thread *td)
{
	struct seltd *stp;

	if ((stp = td->td_sel) != NULL)
		goto out;
	td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO);
	mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF);
	cv_init(&stp->st_wait, "select");
out:
	stp->st_flags = 0;
	STAILQ_INIT(&stp->st_selq);
}

static int
seltdwait(struct thread *td, sbintime_t sbt, sbintime_t precision)
{
	struct seltd *stp;
	int error;

	stp = td->td_sel;
	/*
	 * An event of interest may occur while we do not hold the seltd
	 * locked so check the pending flag before we sleep.
	 */
	mtx_lock(&stp->st_mtx);
	/*
	 * Any further calls to selrecord will be a rescan.
	 */
	stp->st_flags |= SELTD_RESCAN;
	if (stp->st_flags & SELTD_PENDING) {
		mtx_unlock(&stp->st_mtx);
		return (0);
	}
	if (sbt == 0)
		error = EWOULDBLOCK;
	else if (sbt != -1)
		error = cv_timedwait_sig_sbt(&stp->st_wait, &stp->st_mtx,
		    sbt, precision, C_ABSOLUTE);
	else
		error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
	mtx_unlock(&stp->st_mtx);

	return (error);
}

void
seltdfini(struct thread *td)
{
	struct seltd *stp;

	stp = td->td_sel;
	if (stp == NULL)
		return;
	if (stp->st_free1)
		uma_zfree(selfd_zone, stp->st_free1);
	if (stp->st_free2)
		uma_zfree(selfd_zone, stp->st_free2);
	td->td_sel = NULL;
	free(stp, M_SELECT);
}

/*
 * Remove the references to the thread from all of the objects we were
 * polling.
 */
static void
seltdclear(struct thread *td)
{
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;

	stp = td->td_sel;
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn)
		selfdfree(stp, sfp);
	stp->st_flags = 0;
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL);
static void
selectinit(void *dummy __unused)
{

	selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF);
}