/*-
 * Copyright (c) 2007 Seccuris Inc.
 * All rights reserved.
 *
 * This software was developed by Robert N. M. Watson under contract to
 * Seccuris Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_bpf.h"

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sf_buf.h>
#include <sys/socket.h>
#include <sys/uio.h>

#include <machine/atomic.h>

#include <net/if.h>
#include <net/bpf.h>
#include <net/bpf_zerocopy.h>
#include <net/bpfdesc.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>

/*
 * Zero-copy buffer scheme for BPF: user space "donates" two buffers, which
 * are mapped into the kernel address space using sf_bufs and used directly
 * by BPF.  Memory is wired since page faults cannot be tolerated in the
 * contexts where the buffers are copied to (locks held, interrupt context,
 * etc).  Access to shared memory buffers is synchronized using a header on
 * each buffer, allowing the number of system calls to go to zero as BPF
 * reaches saturation (buffers filled as fast as they can be drained by the
 * user process).  Full details of the protocol for communicating between the
 * user process and BPF may be found in bpf(4).
 */

/*
 * Maximum number of pages per buffer.  Since all BPF devices use two, the
 * maximum per device is 2*BPF_MAX_PAGES.
Resource limits on the number of 74177548Scsjp * sf_bufs may be an issue, so do not set this too high. On older systems, 75177548Scsjp * kernel address space limits may also be an issue. 76177548Scsjp */ 77177548Scsjp#define BPF_MAX_PAGES 512 78177548Scsjp 79177548Scsjp/* 80177548Scsjp * struct zbuf describes a memory buffer loaned by a user process to the 81177548Scsjp * kernel. We represent this as a series of pages managed using an array of 82177548Scsjp * sf_bufs. Even though the memory is contiguous in user space, it may not 83177548Scsjp * be mapped contiguously in the kernel (i.e., a set of physically 84177548Scsjp * non-contiguous pages in the direct map region) so we must implement 85177548Scsjp * scatter-gather copying. One significant mitigating factor is that on 86177548Scsjp * systems with a direct memory map, we can avoid TLB misses. 87177548Scsjp * 88177966Srwatson * At the front of the shared memory region is a bpf_zbuf_header, which 89177548Scsjp * contains shared control data to allow user space and the kernel to 90177548Scsjp * synchronize; this is included in zb_size, but not bpf_bufsize, so that BPF 91177548Scsjp * knows that the space is not available. 92177548Scsjp */ 93177548Scsjpstruct zbuf { 94189489Srwatson vm_offset_t zb_uaddr; /* User address at time of setup. */ 95177548Scsjp size_t zb_size; /* Size of buffer, incl. header. */ 96177548Scsjp u_int zb_numpages; /* Number of pages. */ 97177966Srwatson int zb_flags; /* Flags on zbuf. */ 98177548Scsjp struct sf_buf **zb_pages; /* Pages themselves. */ 99177548Scsjp struct bpf_zbuf_header *zb_header; /* Shared header. */ 100177548Scsjp}; 101177548Scsjp 102177548Scsjp/* 103177966Srwatson * When a buffer has been assigned to userspace, flag it as such, as the 104177966Srwatson * buffer may remain in the store position as a result of the user process 105177966Srwatson * not yet having acknowledged the buffer in the hold position yet. 
106177966Srwatson */ 107189489Srwatson#define ZBUF_FLAG_ASSIGNED 0x00000001 /* Set when owned by user. */ 108177966Srwatson 109177966Srwatson/* 110177548Scsjp * Release a page we've previously wired. 111177548Scsjp */ 112177548Scsjpstatic void 113177548Scsjpzbuf_page_free(vm_page_t pp) 114177548Scsjp{ 115177548Scsjp 116207548Salc vm_page_lock(pp); 117177548Scsjp vm_page_unwire(pp, 0); 118177548Scsjp if (pp->wire_count == 0 && pp->object == NULL) 119177548Scsjp vm_page_free(pp); 120207548Salc vm_page_unlock(pp); 121177548Scsjp} 122177548Scsjp 123177548Scsjp/* 124177548Scsjp * Free an sf_buf with attached page. 125177548Scsjp */ 126177548Scsjpstatic void 127177548Scsjpzbuf_sfbuf_free(struct sf_buf *sf) 128177548Scsjp{ 129177548Scsjp vm_page_t pp; 130177548Scsjp 131177548Scsjp pp = sf_buf_page(sf); 132177548Scsjp sf_buf_free(sf); 133177548Scsjp zbuf_page_free(pp); 134177548Scsjp} 135177548Scsjp 136177548Scsjp/* 137177548Scsjp * Free a zbuf, including its page array, sbufs, and pages. Allow partially 138177548Scsjp * allocated zbufs to be freed so that it may be used even during a zbuf 139177548Scsjp * setup. 140177548Scsjp */ 141177548Scsjpstatic void 142177548Scsjpzbuf_free(struct zbuf *zb) 143177548Scsjp{ 144177548Scsjp int i; 145177548Scsjp 146177548Scsjp for (i = 0; i < zb->zb_numpages; i++) { 147177548Scsjp if (zb->zb_pages[i] != NULL) 148177548Scsjp zbuf_sfbuf_free(zb->zb_pages[i]); 149177548Scsjp } 150177548Scsjp free(zb->zb_pages, M_BPF); 151177548Scsjp free(zb, M_BPF); 152177548Scsjp} 153177548Scsjp 154177548Scsjp/* 155177548Scsjp * Given a user pointer to a page of user memory, return an sf_buf for the 156177548Scsjp * page. Because we may be requesting quite a few sf_bufs, prefer failure to 157177548Scsjp * deadlock and use SFB_NOWAIT. 
158177548Scsjp */ 159177548Scsjpstatic struct sf_buf * 160177548Scsjpzbuf_sfbuf_get(struct vm_map *map, vm_offset_t uaddr) 161177548Scsjp{ 162177548Scsjp struct sf_buf *sf; 163177548Scsjp vm_page_t pp; 164177548Scsjp 165216699Salc if (vm_fault_quick_hold_pages(map, uaddr, PAGE_SIZE, VM_PROT_READ | 166216699Salc VM_PROT_WRITE, &pp, 1) < 0) 167177548Scsjp return (NULL); 168207410Skmacy vm_page_lock(pp); 169177548Scsjp vm_page_wire(pp); 170177548Scsjp vm_page_unhold(pp); 171207410Skmacy vm_page_unlock(pp); 172177548Scsjp sf = sf_buf_alloc(pp, SFB_NOWAIT); 173177548Scsjp if (sf == NULL) { 174177548Scsjp zbuf_page_free(pp); 175177548Scsjp return (NULL); 176177548Scsjp } 177177548Scsjp return (sf); 178177548Scsjp} 179177548Scsjp 180177548Scsjp/* 181177548Scsjp * Create a zbuf describing a range of user address space memory. Validate 182177548Scsjp * page alignment, size requirements, etc. 183177548Scsjp */ 184177548Scsjpstatic int 185177548Scsjpzbuf_setup(struct thread *td, vm_offset_t uaddr, size_t len, 186177548Scsjp struct zbuf **zbp) 187177548Scsjp{ 188177548Scsjp struct zbuf *zb; 189177548Scsjp struct vm_map *map; 190177548Scsjp int error, i; 191177548Scsjp 192177548Scsjp *zbp = NULL; 193177548Scsjp 194177548Scsjp /* 195177548Scsjp * User address must be page-aligned. 196177548Scsjp */ 197177548Scsjp if (uaddr & PAGE_MASK) 198177548Scsjp return (EINVAL); 199177548Scsjp 200177548Scsjp /* 201177548Scsjp * Length must be an integer number of full pages. 202177548Scsjp */ 203177548Scsjp if (len & PAGE_MASK) 204177548Scsjp return (EINVAL); 205177548Scsjp 206177548Scsjp /* 207177548Scsjp * Length must not exceed per-buffer resource limit. 208177548Scsjp */ 209177548Scsjp if ((len / PAGE_SIZE) > BPF_MAX_PAGES) 210177548Scsjp return (EINVAL); 211177548Scsjp 212177548Scsjp /* 213177548Scsjp * Allocate the buffer and set up each page with is own sf_buf. 
214177548Scsjp */ 215177548Scsjp error = 0; 216177548Scsjp zb = malloc(sizeof(*zb), M_BPF, M_ZERO | M_WAITOK); 217177548Scsjp zb->zb_uaddr = uaddr; 218177548Scsjp zb->zb_size = len; 219177548Scsjp zb->zb_numpages = len / PAGE_SIZE; 220177548Scsjp zb->zb_pages = malloc(sizeof(struct sf_buf *) * 221177548Scsjp zb->zb_numpages, M_BPF, M_ZERO | M_WAITOK); 222177548Scsjp map = &td->td_proc->p_vmspace->vm_map; 223177548Scsjp for (i = 0; i < zb->zb_numpages; i++) { 224177548Scsjp zb->zb_pages[i] = zbuf_sfbuf_get(map, 225177548Scsjp uaddr + (i * PAGE_SIZE)); 226177548Scsjp if (zb->zb_pages[i] == NULL) { 227177548Scsjp error = EFAULT; 228177548Scsjp goto error; 229177548Scsjp } 230177548Scsjp } 231177548Scsjp zb->zb_header = 232177548Scsjp (struct bpf_zbuf_header *)sf_buf_kva(zb->zb_pages[0]); 233177548Scsjp bzero(zb->zb_header, sizeof(*zb->zb_header)); 234177548Scsjp *zbp = zb; 235177548Scsjp return (0); 236177548Scsjp 237177548Scsjperror: 238177548Scsjp zbuf_free(zb); 239177548Scsjp return (error); 240177548Scsjp} 241177548Scsjp 242177548Scsjp/* 243177548Scsjp * Copy bytes from a source into the specified zbuf. The caller is 244177548Scsjp * responsible for performing bounds checking, etc. 
245177548Scsjp */ 246177548Scsjpvoid 247177548Scsjpbpf_zerocopy_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, 248177548Scsjp void *src, u_int len) 249177548Scsjp{ 250177548Scsjp u_int count, page, poffset; 251177548Scsjp u_char *src_bytes; 252177548Scsjp struct zbuf *zb; 253177548Scsjp 254177548Scsjp KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF, 255177548Scsjp ("bpf_zerocopy_append_bytes: not in zbuf mode")); 256177548Scsjp KASSERT(buf != NULL, ("bpf_zerocopy_append_bytes: NULL buf")); 257177548Scsjp 258177548Scsjp src_bytes = (u_char *)src; 259177548Scsjp zb = (struct zbuf *)buf; 260177548Scsjp 261189489Srwatson KASSERT((zb->zb_flags & ZBUF_FLAG_ASSIGNED) == 0, 262189489Srwatson ("bpf_zerocopy_append_bytes: ZBUF_FLAG_ASSIGNED")); 263177966Srwatson 264177548Scsjp /* 265177548Scsjp * Scatter-gather copy to user pages mapped into kernel address space 266177548Scsjp * using sf_bufs: copy up to a page at a time. 267177548Scsjp */ 268177548Scsjp offset += sizeof(struct bpf_zbuf_header); 269177548Scsjp page = offset / PAGE_SIZE; 270177548Scsjp poffset = offset % PAGE_SIZE; 271177548Scsjp while (len > 0) { 272177548Scsjp KASSERT(page < zb->zb_numpages, ("bpf_zerocopy_append_bytes:" 273177548Scsjp " page overflow (%d p %d np)\n", page, zb->zb_numpages)); 274177548Scsjp 275177548Scsjp count = min(len, PAGE_SIZE - poffset); 276177548Scsjp bcopy(src_bytes, ((u_char *)sf_buf_kva(zb->zb_pages[page])) + 277177548Scsjp poffset, count); 278177548Scsjp poffset += count; 279177548Scsjp if (poffset == PAGE_SIZE) { 280177548Scsjp poffset = 0; 281177548Scsjp page++; 282177548Scsjp } 283177548Scsjp KASSERT(poffset < PAGE_SIZE, 284177548Scsjp ("bpf_zerocopy_append_bytes: page offset overflow (%d)", 285177548Scsjp poffset)); 286177548Scsjp len -= count; 287177548Scsjp src_bytes += count; 288177548Scsjp } 289177548Scsjp} 290177548Scsjp 291177548Scsjp/* 292177548Scsjp * Copy bytes from an mbuf chain to the specified zbuf: copying will be 293177548Scsjp * scatter-gather both from 
mbufs, which may be fragmented over memory, and 294177548Scsjp * to pages, which may not be contiguously mapped in kernel address space. 295177548Scsjp * As with bpf_zerocopy_append_bytes(), the caller is responsible for 296177548Scsjp * checking that this will not exceed the buffer limit. 297177548Scsjp */ 298177548Scsjpvoid 299177548Scsjpbpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, 300177548Scsjp void *src, u_int len) 301177548Scsjp{ 302177548Scsjp u_int count, moffset, page, poffset; 303177548Scsjp const struct mbuf *m; 304177548Scsjp struct zbuf *zb; 305177548Scsjp 306177548Scsjp KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF, 307177548Scsjp ("bpf_zerocopy_append_mbuf not in zbuf mode")); 308177548Scsjp KASSERT(buf != NULL, ("bpf_zerocopy_append_mbuf: NULL buf")); 309177548Scsjp 310177548Scsjp m = (struct mbuf *)src; 311177548Scsjp zb = (struct zbuf *)buf; 312177548Scsjp 313189489Srwatson KASSERT((zb->zb_flags & ZBUF_FLAG_ASSIGNED) == 0, 314189489Srwatson ("bpf_zerocopy_append_mbuf: ZBUF_FLAG_ASSIGNED")); 315177966Srwatson 316177548Scsjp /* 317177548Scsjp * Scatter gather both from an mbuf chain and to a user page set 318177548Scsjp * mapped into kernel address space using sf_bufs. If we're lucky, 319177548Scsjp * each mbuf requires one copy operation, but if page alignment and 320177548Scsjp * mbuf alignment work out less well, we'll be doing two copies per 321177548Scsjp * mbuf. 
322177548Scsjp */ 323177548Scsjp offset += sizeof(struct bpf_zbuf_header); 324177548Scsjp page = offset / PAGE_SIZE; 325177548Scsjp poffset = offset % PAGE_SIZE; 326177548Scsjp moffset = 0; 327177548Scsjp while (len > 0) { 328177548Scsjp KASSERT(page < zb->zb_numpages, 329177548Scsjp ("bpf_zerocopy_append_mbuf: page overflow (%d p %d " 330177548Scsjp "np)\n", page, zb->zb_numpages)); 331177548Scsjp KASSERT(m != NULL, 332177548Scsjp ("bpf_zerocopy_append_mbuf: end of mbuf chain")); 333177548Scsjp 334177548Scsjp count = min(m->m_len - moffset, len); 335177548Scsjp count = min(count, PAGE_SIZE - poffset); 336177548Scsjp bcopy(mtod(m, u_char *) + moffset, 337177548Scsjp ((u_char *)sf_buf_kva(zb->zb_pages[page])) + poffset, 338177548Scsjp count); 339177548Scsjp poffset += count; 340177548Scsjp if (poffset == PAGE_SIZE) { 341177548Scsjp poffset = 0; 342177548Scsjp page++; 343177548Scsjp } 344177548Scsjp KASSERT(poffset < PAGE_SIZE, 345177548Scsjp ("bpf_zerocopy_append_mbuf: page offset overflow (%d)", 346177548Scsjp poffset)); 347177548Scsjp moffset += count; 348177548Scsjp if (moffset == m->m_len) { 349177548Scsjp m = m->m_next; 350177548Scsjp moffset = 0; 351177548Scsjp } 352177548Scsjp len -= count; 353177548Scsjp } 354177548Scsjp} 355177548Scsjp 356177548Scsjp/* 357177966Srwatson * Notification from the BPF framework that a buffer in the store position is 358177966Srwatson * rejecting packets and may be considered full. We mark the buffer as 359177966Srwatson * immutable and assign to userspace so that it is immediately available for 360177966Srwatson * the user process to access. 
361177966Srwatson */ 362177966Srwatsonvoid 363177966Srwatsonbpf_zerocopy_buffull(struct bpf_d *d) 364177966Srwatson{ 365177966Srwatson struct zbuf *zb; 366177966Srwatson 367177966Srwatson KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF, 368177966Srwatson ("bpf_zerocopy_buffull: not in zbuf mode")); 369177966Srwatson 370177966Srwatson zb = (struct zbuf *)d->bd_sbuf; 371177966Srwatson KASSERT(zb != NULL, ("bpf_zerocopy_buffull: zb == NULL")); 372177966Srwatson 373189489Srwatson if ((zb->zb_flags & ZBUF_FLAG_ASSIGNED) == 0) { 374189489Srwatson zb->zb_flags |= ZBUF_FLAG_ASSIGNED; 375177966Srwatson zb->zb_header->bzh_kernel_len = d->bd_slen; 376177966Srwatson atomic_add_rel_int(&zb->zb_header->bzh_kernel_gen, 1); 377177966Srwatson } 378177966Srwatson} 379177966Srwatson 380177966Srwatson/* 381177548Scsjp * Notification from the BPF framework that a buffer has moved into the held 382177548Scsjp * slot on a descriptor. Zero-copy BPF will update the shared page to let 383189489Srwatson * the user process know and flag the buffer as assigned if it hasn't already 384189489Srwatson * been marked assigned due to filling while it was in the store position. 385177966Srwatson * 386177966Srwatson * Note: identical logic as in bpf_zerocopy_buffull(), except that we operate 387177966Srwatson * on bd_hbuf and bd_hlen. 
388177548Scsjp */ 389177548Scsjpvoid 390177548Scsjpbpf_zerocopy_bufheld(struct bpf_d *d) 391177548Scsjp{ 392177548Scsjp struct zbuf *zb; 393177548Scsjp 394177548Scsjp KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF, 395177548Scsjp ("bpf_zerocopy_bufheld: not in zbuf mode")); 396177548Scsjp 397177548Scsjp zb = (struct zbuf *)d->bd_hbuf; 398177548Scsjp KASSERT(zb != NULL, ("bpf_zerocopy_bufheld: zb == NULL")); 399177966Srwatson 400189489Srwatson if ((zb->zb_flags & ZBUF_FLAG_ASSIGNED) == 0) { 401189489Srwatson zb->zb_flags |= ZBUF_FLAG_ASSIGNED; 402177966Srwatson zb->zb_header->bzh_kernel_len = d->bd_hlen; 403177966Srwatson atomic_add_rel_int(&zb->zb_header->bzh_kernel_gen, 1); 404177966Srwatson } 405177548Scsjp} 406177548Scsjp 407177548Scsjp/* 408180310Scsjp * Notification from the BPF framework that the free buffer has been been 409189489Srwatson * rotated out of the held position to the free position. This happens when 410189489Srwatson * the user acknowledges the held buffer. 411180310Scsjp */ 412180310Scsjpvoid 413180310Scsjpbpf_zerocopy_buf_reclaimed(struct bpf_d *d) 414180310Scsjp{ 415180310Scsjp struct zbuf *zb; 416180310Scsjp 417180310Scsjp KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF, 418180310Scsjp ("bpf_zerocopy_reclaim_buf: not in zbuf mode")); 419180310Scsjp 420180310Scsjp KASSERT(d->bd_fbuf != NULL, 421189489Srwatson ("bpf_zerocopy_buf_reclaimed: NULL free buf")); 422180310Scsjp zb = (struct zbuf *)d->bd_fbuf; 423189489Srwatson zb->zb_flags &= ~ZBUF_FLAG_ASSIGNED; 424180310Scsjp} 425180310Scsjp 426180310Scsjp/* 427177548Scsjp * Query from the BPF framework regarding whether the buffer currently in the 428177548Scsjp * held position can be moved to the free position, which can be indicated by 429177548Scsjp * the user process making their generation number equal to the kernel 430177548Scsjp * generation number. 
431177548Scsjp */ 432177548Scsjpint 433177548Scsjpbpf_zerocopy_canfreebuf(struct bpf_d *d) 434177548Scsjp{ 435177548Scsjp struct zbuf *zb; 436177548Scsjp 437177548Scsjp KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF, 438177548Scsjp ("bpf_zerocopy_canfreebuf: not in zbuf mode")); 439177548Scsjp 440177548Scsjp zb = (struct zbuf *)d->bd_hbuf; 441177548Scsjp if (zb == NULL) 442177548Scsjp return (0); 443177548Scsjp if (zb->zb_header->bzh_kernel_gen == 444177548Scsjp atomic_load_acq_int(&zb->zb_header->bzh_user_gen)) 445177548Scsjp return (1); 446177548Scsjp return (0); 447177548Scsjp} 448177548Scsjp 449177548Scsjp/* 450177966Srwatson * Query from the BPF framework as to whether or not the buffer current in 451177966Srwatson * the store position can actually be written to. This may return false if 452177966Srwatson * the store buffer is assigned to userspace before the hold buffer is 453177966Srwatson * acknowledged. 454177966Srwatson */ 455177966Srwatsonint 456177966Srwatsonbpf_zerocopy_canwritebuf(struct bpf_d *d) 457177966Srwatson{ 458177966Srwatson struct zbuf *zb; 459177966Srwatson 460177966Srwatson KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF, 461177966Srwatson ("bpf_zerocopy_canwritebuf: not in zbuf mode")); 462177966Srwatson 463177966Srwatson zb = (struct zbuf *)d->bd_sbuf; 464177966Srwatson KASSERT(zb != NULL, ("bpf_zerocopy_canwritebuf: bd_sbuf NULL")); 465177966Srwatson 466189489Srwatson if (zb->zb_flags & ZBUF_FLAG_ASSIGNED) 467177966Srwatson return (0); 468177966Srwatson return (1); 469177966Srwatson} 470177966Srwatson 471177966Srwatson/* 472177548Scsjp * Free zero copy buffers at request of descriptor. 
473177548Scsjp */ 474177548Scsjpvoid 475177548Scsjpbpf_zerocopy_free(struct bpf_d *d) 476177548Scsjp{ 477177548Scsjp struct zbuf *zb; 478177548Scsjp 479177548Scsjp KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF, 480177548Scsjp ("bpf_zerocopy_free: not in zbuf mode")); 481177548Scsjp 482177548Scsjp zb = (struct zbuf *)d->bd_sbuf; 483177548Scsjp if (zb != NULL) 484177548Scsjp zbuf_free(zb); 485177548Scsjp zb = (struct zbuf *)d->bd_hbuf; 486177548Scsjp if (zb != NULL) 487177548Scsjp zbuf_free(zb); 488177548Scsjp zb = (struct zbuf *)d->bd_fbuf; 489177548Scsjp if (zb != NULL) 490177548Scsjp zbuf_free(zb); 491177548Scsjp} 492177548Scsjp 493177548Scsjp/* 494177548Scsjp * Ioctl to return the maximum buffer size. 495177548Scsjp */ 496177548Scsjpint 497177548Scsjpbpf_zerocopy_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i) 498177548Scsjp{ 499177548Scsjp 500177548Scsjp KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF, 501177548Scsjp ("bpf_zerocopy_ioctl_getzmax: not in zbuf mode")); 502177548Scsjp 503177548Scsjp *i = BPF_MAX_PAGES * PAGE_SIZE; 504177548Scsjp return (0); 505177548Scsjp} 506177548Scsjp 507177548Scsjp/* 508177548Scsjp * Ioctl to force rotation of the two buffers, if there's any data available. 509189489Srwatson * This can be used by user space to implement timeouts when waiting for a 510177548Scsjp * buffer to fill. 
511177548Scsjp */ 512177548Scsjpint 513177548Scsjpbpf_zerocopy_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, 514177548Scsjp struct bpf_zbuf *bz) 515177548Scsjp{ 516177548Scsjp struct zbuf *bzh; 517177548Scsjp 518177548Scsjp bzero(bz, sizeof(*bz)); 519177548Scsjp BPFD_LOCK(d); 520177548Scsjp if (d->bd_hbuf == NULL && d->bd_slen != 0) { 521177548Scsjp ROTATE_BUFFERS(d); 522177548Scsjp bzh = (struct zbuf *)d->bd_hbuf; 523177548Scsjp bz->bz_bufa = (void *)bzh->zb_uaddr; 524177548Scsjp bz->bz_buflen = d->bd_hlen; 525177548Scsjp } 526177548Scsjp BPFD_UNLOCK(d); 527177548Scsjp return (0); 528177548Scsjp} 529177548Scsjp 530177548Scsjp/* 531177548Scsjp * Ioctl to configure zero-copy buffers -- may be done only once. 532177548Scsjp */ 533177548Scsjpint 534177548Scsjpbpf_zerocopy_ioctl_setzbuf(struct thread *td, struct bpf_d *d, 535177548Scsjp struct bpf_zbuf *bz) 536177548Scsjp{ 537177548Scsjp struct zbuf *zba, *zbb; 538177548Scsjp int error; 539177548Scsjp 540177548Scsjp KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF, 541177548Scsjp ("bpf_zerocopy_ioctl_setzbuf: not in zbuf mode")); 542177548Scsjp 543177548Scsjp /* 544177548Scsjp * Must set both buffers. Cannot clear them. 545177548Scsjp */ 546177548Scsjp if (bz->bz_bufa == NULL || bz->bz_bufb == NULL) 547177548Scsjp return (EINVAL); 548177548Scsjp 549177548Scsjp /* 550177548Scsjp * Buffers must have a size greater than 0. Alignment and other size 551177548Scsjp * validity checking is done in zbuf_setup(). 552177548Scsjp */ 553177548Scsjp if (bz->bz_buflen == 0) 554177548Scsjp return (EINVAL); 555177548Scsjp 556177548Scsjp /* 557177548Scsjp * Allocate new buffers. 
558177548Scsjp */ 559177548Scsjp error = zbuf_setup(td, (vm_offset_t)bz->bz_bufa, bz->bz_buflen, 560177548Scsjp &zba); 561177548Scsjp if (error) 562177548Scsjp return (error); 563177548Scsjp error = zbuf_setup(td, (vm_offset_t)bz->bz_bufb, bz->bz_buflen, 564177548Scsjp &zbb); 565177548Scsjp if (error) { 566177548Scsjp zbuf_free(zba); 567177548Scsjp return (error); 568177548Scsjp } 569177548Scsjp 570177548Scsjp /* 571177548Scsjp * We only allow buffers to be installed once, so atomically check 572177548Scsjp * that no buffers are currently installed and install new buffers. 573177548Scsjp */ 574177548Scsjp BPFD_LOCK(d); 575177548Scsjp if (d->bd_hbuf != NULL || d->bd_sbuf != NULL || d->bd_fbuf != NULL || 576177548Scsjp d->bd_bif != NULL) { 577177548Scsjp BPFD_UNLOCK(d); 578177548Scsjp zbuf_free(zba); 579177548Scsjp zbuf_free(zbb); 580177548Scsjp return (EINVAL); 581177548Scsjp } 582177647Srwatson 583177647Srwatson /* 584177647Srwatson * Point BPF descriptor at buffers; initialize sbuf as zba so that 585177647Srwatson * it is always filled first in the sequence, per bpf(4). 586177647Srwatson */ 587177548Scsjp d->bd_fbuf = (caddr_t)zbb; 588177548Scsjp d->bd_sbuf = (caddr_t)zba; 589177548Scsjp d->bd_slen = 0; 590177548Scsjp d->bd_hlen = 0; 591177548Scsjp 592177548Scsjp /* 593177548Scsjp * We expose only the space left in the buffer after the size of the 594177548Scsjp * shared management region. 595177548Scsjp */ 596177548Scsjp d->bd_bufsize = bz->bz_buflen - sizeof(struct bpf_zbuf_header); 597177548Scsjp BPFD_UNLOCK(d); 598177548Scsjp return (0); 599177548Scsjp} 600