/*-
 * Copyright (c) 2007 Seccuris Inc.
 * All rights reserved.
 *
 * This software was developed by Robert N. M. Watson under contract to
 * Seccuris Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_bpf.h"

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sf_buf.h>
#include <sys/socket.h>
#include <sys/uio.h>

#include <machine/atomic.h>

#include <net/if.h>
#include <net/bpf.h>
#include <net/bpf_zerocopy.h>
#include <net/bpfdesc.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>

/*
 * Zero-copy buffer scheme for BPF: user space "donates" two buffers, which
 * are mapped into the kernel address space using sf_bufs and used directly
 * by BPF.  Memory is wired since page faults cannot be tolerated in the
 * contexts where the buffers are copied to (locks held, interrupt context,
 * etc).  Access to shared memory buffers is synchronized using a header on
 * each buffer, allowing the number of system calls to go to zero as BPF
 * reaches saturation (buffers filled as fast as they can be drained by the
 * user process).  Full details of the protocol for communicating between the
 * user process and BPF may be found in bpf(4).
 */
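
/*
 * For illustration only: a minimal sketch of how a user process might set up
 * zero-copy buffering, assuming the ioctl names documented in bpf(4)
 * (BIOCSETBUFMODE, BIOCGETZMAX, BIOCSETZBUF).  "fd", "bufa", "bufb", and
 * "buflen" are hypothetical; both buffers must be page-aligned and a
 * multiple of PAGE_SIZE in length, as enforced by zbuf_setup() below.
 *
 *	u_int bufmode = BPF_BUFMODE_ZBUF;
 *	size_t zmax;
 *	struct bpf_zbuf bz;
 *
 *	ioctl(fd, BIOCSETBUFMODE, &bufmode);	// select zero-copy mode
 *	ioctl(fd, BIOCGETZMAX, &zmax);		// largest allowed bz_buflen
 *	bz.bz_bufa = bufa;			// first donated buffer
 *	bz.bz_bufb = bufb;			// second donated buffer
 *	bz.bz_buflen = buflen;			// size of each buffer
 *	ioctl(fd, BIOCSETZBUF, &bz);		// donate both buffers
 */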

/*
 * Maximum number of pages per buffer.  Since all BPF devices use two, the
 * maximum per device is 2*BPF_MAX_PAGES.  Resource limits on the number of
 * sf_bufs may be an issue, so do not set this too high.  On older systems,
 * kernel address space limits may also be an issue.
 */
#define	BPF_MAX_PAGES	512

/*
 * struct zbuf describes a memory buffer loaned by a user process to the
 * kernel.  We represent this as a series of pages managed using an array of
 * sf_bufs.  Even though the memory is contiguous in user space, it may not
 * be mapped contiguously in the kernel (i.e., a set of physically
 * non-contiguous pages in the direct map region) so we must implement
 * scatter-gather copying.  One significant mitigating factor is that on
 * systems with a direct memory map, we can avoid TLB misses.
 *
 * At the front of the shared memory region is a bpf_zbuf_header, which
 * contains shared control data to allow user space and the kernel to
 * synchronize; this is included in zb_size, but not bd_bufsize, so that BPF
 * knows that the space is not available.
 */
struct zbuf {
	vm_offset_t	 zb_uaddr;	/* User address at time of setup. */
	size_t		 zb_size;	/* Size of buffer, incl. header. */
	u_int		 zb_numpages;	/* Number of pages. */
	int		 zb_flags;	/* Flags on zbuf. */
	struct sf_buf	**zb_pages;	/* Pages themselves. */
	struct bpf_zbuf_header	*zb_header;	/* Shared header. */
};
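
/*
 * Illustrative layout of a single donated buffer: the shared header occupies
 * the front of the region, and only the remainder is reported to BPF as
 * usable packet space (bd_bufsize).
 *
 *	zb_uaddr                                       zb_uaddr + zb_size
 *	|<------------------------- zb_size --------------------------->|
 *	+-------------------------+-------------------------------------+
 *	| struct bpf_zbuf_header  |  packet data (bd_bufsize bytes)     |
 *	+-------------------------+-------------------------------------+
 */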

/*
 * When a buffer has been assigned to userspace, flag it as such, as the
 * buffer may remain in the store position as a result of the user process
 * not yet having acknowledged the buffer in the hold position.
 */
#define	ZBUF_FLAG_ASSIGNED	0x00000001	/* Set when owned by user. */

/*
 * Release a page we've previously wired.
 */
static void
zbuf_page_free(vm_page_t pp)
{

	vm_page_lock(pp);
	vm_page_unwire(pp, 0);
	if (pp->wire_count == 0 && pp->object == NULL)
		vm_page_free(pp);
	vm_page_unlock(pp);
}

/*
 * Free an sf_buf with attached page.
 */
static void
zbuf_sfbuf_free(struct sf_buf *sf)
{
	vm_page_t pp;

	pp = sf_buf_page(sf);
	sf_buf_free(sf);
	zbuf_page_free(pp);
}

/*
 * Free a zbuf, including its page array, sf_bufs, and pages.  Allow
 * partially allocated zbufs to be freed so that this function can be used
 * for cleanup even during zbuf setup.
 */
static void
zbuf_free(struct zbuf *zb)
{
	int i;

	for (i = 0; i < zb->zb_numpages; i++) {
		if (zb->zb_pages[i] != NULL)
			zbuf_sfbuf_free(zb->zb_pages[i]);
	}
	free(zb->zb_pages, M_BPF);
	free(zb, M_BPF);
}

/*
 * Given a user pointer to a page of user memory, return an sf_buf for the
 * page.  Because we may be requesting quite a few sf_bufs, prefer failure to
 * deadlock and use SFB_NOWAIT.
 */
static struct sf_buf *
zbuf_sfbuf_get(struct vm_map *map, vm_offset_t uaddr)
{
	struct sf_buf *sf;
	vm_page_t pp;

	if (vm_fault_quick_hold_pages(map, uaddr, PAGE_SIZE, VM_PROT_READ |
	    VM_PROT_WRITE, &pp, 1) < 0)
		return (NULL);
	vm_page_lock(pp);
	vm_page_wire(pp);
	vm_page_unhold(pp);
	vm_page_unlock(pp);
	sf = sf_buf_alloc(pp, SFB_NOWAIT);
	if (sf == NULL) {
		zbuf_page_free(pp);
		return (NULL);
	}
	return (sf);
}

/*
 * Create a zbuf describing a range of user address space memory.  Validate
 * page alignment, size requirements, etc.
 */
static int
zbuf_setup(struct thread *td, vm_offset_t uaddr, size_t len,
    struct zbuf **zbp)
{
	struct zbuf *zb;
	struct vm_map *map;
	int error, i;

	*zbp = NULL;

	/*
	 * User address must be page-aligned.
	 */
	if (uaddr & PAGE_MASK)
		return (EINVAL);

	/*
	 * Length must be an integer number of full pages.
	 */
	if (len & PAGE_MASK)
		return (EINVAL);

	/*
	 * Length must not exceed per-buffer resource limit.
	 */
	if ((len / PAGE_SIZE) > BPF_MAX_PAGES)
		return (EINVAL);

	/*
	 * Allocate the buffer and set up each page with its own sf_buf.
	 */
	error = 0;
	zb = malloc(sizeof(*zb), M_BPF, M_ZERO | M_WAITOK);
	zb->zb_uaddr = uaddr;
	zb->zb_size = len;
	zb->zb_numpages = len / PAGE_SIZE;
	zb->zb_pages = malloc(sizeof(struct sf_buf *) *
	    zb->zb_numpages, M_BPF, M_ZERO | M_WAITOK);
	map = &td->td_proc->p_vmspace->vm_map;
	for (i = 0; i < zb->zb_numpages; i++) {
		zb->zb_pages[i] = zbuf_sfbuf_get(map,
		    uaddr + (i * PAGE_SIZE));
		if (zb->zb_pages[i] == NULL) {
			error = EFAULT;
			goto error;
		}
	}
	zb->zb_header =
	    (struct bpf_zbuf_header *)sf_buf_kva(zb->zb_pages[0]);
	bzero(zb->zb_header, sizeof(*zb->zb_header));
	*zbp = zb;
	return (0);

error:
	zbuf_free(zb);
	return (error);
}

/*
 * Copy bytes from a source into the specified zbuf.  The caller is
 * responsible for performing bounds checking, etc.
 */
void
bpf_zerocopy_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
    void *src, u_int len)
{
	u_int count, page, poffset;
	u_char *src_bytes;
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_append_bytes: not in zbuf mode"));
	KASSERT(buf != NULL, ("bpf_zerocopy_append_bytes: NULL buf"));

	src_bytes = (u_char *)src;
	zb = (struct zbuf *)buf;

	KASSERT((zb->zb_flags & ZBUF_FLAG_ASSIGNED) == 0,
	    ("bpf_zerocopy_append_bytes: ZBUF_FLAG_ASSIGNED"));

	/*
	 * Scatter-gather copy to user pages mapped into kernel address space
	 * using sf_bufs: copy up to a page at a time.
	 */
	offset += sizeof(struct bpf_zbuf_header);
	page = offset / PAGE_SIZE;
	poffset = offset % PAGE_SIZE;
	while (len > 0) {
		KASSERT(page < zb->zb_numpages, ("bpf_zerocopy_append_bytes:"
		    " page overflow (%d p %d np)\n", page, zb->zb_numpages));

		count = min(len, PAGE_SIZE - poffset);
		bcopy(src_bytes, ((u_char *)sf_buf_kva(zb->zb_pages[page])) +
		    poffset, count);
		poffset += count;
		if (poffset == PAGE_SIZE) {
			poffset = 0;
			page++;
		}
		KASSERT(poffset < PAGE_SIZE,
		    ("bpf_zerocopy_append_bytes: page offset overflow (%d)",
		    poffset));
		len -= count;
		src_bytes += count;
	}
}
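
/*
 * Worked example of the page arithmetic above: if poffset starts at
 * PAGE_SIZE - 8 (i.e., offset plus the header lands 8 bytes before a page
 * boundary) and len is 24, the first iteration copies min(24, 8) = 8 bytes
 * to the tail of the current page, then advances to the next page with
 * poffset reset to 0, and the second iteration copies the remaining 16
 * bytes to the start of that page.
 */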

/*
 * Copy bytes from an mbuf chain to the specified zbuf: copying will be
 * scatter-gather both from mbufs, which may be fragmented over memory, and
 * to pages, which may not be contiguously mapped in kernel address space.
 * As with bpf_zerocopy_append_bytes(), the caller is responsible for
 * checking that this will not exceed the buffer limit.
 */
void
bpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
    void *src, u_int len)
{
	u_int count, moffset, page, poffset;
	const struct mbuf *m;
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_append_mbuf not in zbuf mode"));
	KASSERT(buf != NULL, ("bpf_zerocopy_append_mbuf: NULL buf"));

	m = (struct mbuf *)src;
	zb = (struct zbuf *)buf;

	KASSERT((zb->zb_flags & ZBUF_FLAG_ASSIGNED) == 0,
	    ("bpf_zerocopy_append_mbuf: ZBUF_FLAG_ASSIGNED"));

	/*
	 * Scatter-gather both from an mbuf chain and to a user page set
	 * mapped into kernel address space using sf_bufs.  If we're lucky,
	 * each mbuf requires one copy operation, but if page alignment and
	 * mbuf alignment work out less well, we'll be doing two copies per
	 * mbuf.
	 */
	offset += sizeof(struct bpf_zbuf_header);
	page = offset / PAGE_SIZE;
	poffset = offset % PAGE_SIZE;
	moffset = 0;
	while (len > 0) {
		KASSERT(page < zb->zb_numpages,
		    ("bpf_zerocopy_append_mbuf: page overflow (%d p %d "
		    "np)\n", page, zb->zb_numpages));
		KASSERT(m != NULL,
		    ("bpf_zerocopy_append_mbuf: end of mbuf chain"));

		count = min(m->m_len - moffset, len);
		count = min(count, PAGE_SIZE - poffset);
		bcopy(mtod(m, u_char *) + moffset,
		    ((u_char *)sf_buf_kva(zb->zb_pages[page])) + poffset,
		    count);
		poffset += count;
		if (poffset == PAGE_SIZE) {
			poffset = 0;
			page++;
		}
		KASSERT(poffset < PAGE_SIZE,
		    ("bpf_zerocopy_append_mbuf: page offset overflow (%d)",
		    poffset));
		moffset += count;
		if (moffset == m->m_len) {
			m = m->m_next;
			moffset = 0;
		}
		len -= count;
	}
}

/*
 * Notification from the BPF framework that a buffer in the store position is
 * rejecting packets and may be considered full.  We mark the buffer as
 * immutable and assign it to userspace so that it is immediately available
 * for the user process to access.
 */
void
bpf_zerocopy_buffull(struct bpf_d *d)
{
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_buffull: not in zbuf mode"));

	zb = (struct zbuf *)d->bd_sbuf;
	KASSERT(zb != NULL, ("bpf_zerocopy_buffull: zb == NULL"));

	if ((zb->zb_flags & ZBUF_FLAG_ASSIGNED) == 0) {
		zb->zb_flags |= ZBUF_FLAG_ASSIGNED;
		zb->zb_header->bzh_kernel_len = d->bd_slen;
		atomic_add_rel_int(&zb->zb_header->bzh_kernel_gen, 1);
	}
}

/*
 * Notification from the BPF framework that a buffer has moved into the held
 * slot on a descriptor.  Zero-copy BPF will update the shared page to let
 * the user process know and flag the buffer as assigned if it hasn't already
 * been marked assigned due to filling while it was in the store position.
 *
 * Note: identical logic as in bpf_zerocopy_buffull(), except that we operate
 * on bd_hbuf and bd_hlen.
 */
void
bpf_zerocopy_bufheld(struct bpf_d *d)
{
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_bufheld: not in zbuf mode"));

	zb = (struct zbuf *)d->bd_hbuf;
	KASSERT(zb != NULL, ("bpf_zerocopy_bufheld: zb == NULL"));

	if ((zb->zb_flags & ZBUF_FLAG_ASSIGNED) == 0) {
		zb->zb_flags |= ZBUF_FLAG_ASSIGNED;
		zb->zb_header->bzh_kernel_len = d->bd_hlen;
		atomic_add_rel_int(&zb->zb_header->bzh_kernel_gen, 1);
	}
}

/*
 * Notification from the BPF framework that a buffer has been rotated out of
 * the held position into the free position.  This happens when the user
 * acknowledges the held buffer.
 */
void
bpf_zerocopy_buf_reclaimed(struct bpf_d *d)
{
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_buf_reclaimed: not in zbuf mode"));

	KASSERT(d->bd_fbuf != NULL,
	    ("bpf_zerocopy_buf_reclaimed: NULL free buf"));
	zb = (struct zbuf *)d->bd_fbuf;
	zb->zb_flags &= ~ZBUF_FLAG_ASSIGNED;
}

/*
 * Query from the BPF framework regarding whether the buffer currently in the
 * held position can be moved to the free position, which can be indicated by
 * the user process making its generation number equal to the kernel
 * generation number.
 */
int
bpf_zerocopy_canfreebuf(struct bpf_d *d)
{
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_canfreebuf: not in zbuf mode"));

	zb = (struct zbuf *)d->bd_hbuf;
	if (zb == NULL)
		return (0);
	if (zb->zb_header->bzh_kernel_gen ==
	    atomic_load_acq_int(&zb->zb_header->bzh_user_gen))
		return (1);
	return (0);
}
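
/*
 * A sketch of the user-space side of this handshake, per the protocol
 * described in bpf(4): after consuming bzh_kernel_len bytes of packet data,
 * the process acknowledges the buffer by copying the kernel generation
 * number into its own generation field, which makes the test above succeed.
 * "buf" is hypothetical, and the memory-barrier requirements described in
 * bpf(4) are omitted here.
 *
 *	struct bpf_zbuf_header *bzh = buf;	// start of the held buffer
 *	u_int gen = bzh->bzh_kernel_gen;	// snapshot kernel generation
 *	... process bzh->bzh_kernel_len bytes following the header ...
 *	bzh->bzh_user_gen = gen;		// buffer may now be reused
 */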

/*
 * Query from the BPF framework as to whether or not the buffer currently in
 * the store position can actually be written to.  This may return false if
 * the store buffer is assigned to userspace before the hold buffer is
 * acknowledged.
 */
int
bpf_zerocopy_canwritebuf(struct bpf_d *d)
{
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_canwritebuf: not in zbuf mode"));

	zb = (struct zbuf *)d->bd_sbuf;
	KASSERT(zb != NULL, ("bpf_zerocopy_canwritebuf: bd_sbuf NULL"));

	if (zb->zb_flags & ZBUF_FLAG_ASSIGNED)
		return (0);
	return (1);
}

/*
 * Free zero-copy buffers at request of descriptor.
 */
void
bpf_zerocopy_free(struct bpf_d *d)
{
	struct zbuf *zb;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_free: not in zbuf mode"));

	zb = (struct zbuf *)d->bd_sbuf;
	if (zb != NULL)
		zbuf_free(zb);
	zb = (struct zbuf *)d->bd_hbuf;
	if (zb != NULL)
		zbuf_free(zb);
	zb = (struct zbuf *)d->bd_fbuf;
	if (zb != NULL)
		zbuf_free(zb);
}

/*
 * Ioctl to return the maximum buffer size.
 */
int
bpf_zerocopy_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
{

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_ioctl_getzmax: not in zbuf mode"));

	*i = BPF_MAX_PAGES * PAGE_SIZE;
	return (0);
}

/*
 * Ioctl to force rotation of the two buffers, if there's any data available.
 * This can be used by user space to implement timeouts when waiting for a
 * buffer to fill.
 */
int
bpf_zerocopy_ioctl_rotzbuf(struct thread *td, struct bpf_d *d,
    struct bpf_zbuf *bz)
{
	struct zbuf *bzh;

	bzero(bz, sizeof(*bz));
	BPFD_LOCK(d);
	if (d->bd_hbuf == NULL && d->bd_slen != 0) {
		ROTATE_BUFFERS(d);
		bzh = (struct zbuf *)d->bd_hbuf;
		bz->bz_bufa = (void *)bzh->zb_uaddr;
		bz->bz_buflen = d->bd_hlen;
	}
	BPFD_UNLOCK(d);
	return (0);
}
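
/*
 * For example (hypothetical user code, assuming the BIOCROTZBUF ioctl name
 * documented in bpf(4)): a process that times out waiting for a held buffer
 * can request rotation itself; if a non-empty store buffer was rotated into
 * the hold position, the returned structure describes it, otherwise it is
 * left zeroed.
 *
 *	struct bpf_zbuf bz;
 *
 *	if (ioctl(fd, BIOCROTZBUF, &bz) == 0 && bz.bz_buflen != 0) {
 *		// bz.bz_buflen bytes are ready in the held buffer,
 *		// which starts at user address bz.bz_bufa.
 *	}
 */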

/*
 * Ioctl to configure zero-copy buffers -- may be done only once.
 */
int
bpf_zerocopy_ioctl_setzbuf(struct thread *td, struct bpf_d *d,
    struct bpf_zbuf *bz)
{
	struct zbuf *zba, *zbb;
	int error;

	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
	    ("bpf_zerocopy_ioctl_setzbuf: not in zbuf mode"));

	/*
	 * Must set both buffers.  Cannot clear them.
	 */
	if (bz->bz_bufa == NULL || bz->bz_bufb == NULL)
		return (EINVAL);

	/*
	 * Buffers must have a size greater than 0.  Alignment and other size
	 * validity checking is done in zbuf_setup().
	 */
	if (bz->bz_buflen == 0)
		return (EINVAL);

	/*
	 * Allocate new buffers.
	 */
	error = zbuf_setup(td, (vm_offset_t)bz->bz_bufa, bz->bz_buflen,
	    &zba);
	if (error)
		return (error);
	error = zbuf_setup(td, (vm_offset_t)bz->bz_bufb, bz->bz_buflen,
	    &zbb);
	if (error) {
		zbuf_free(zba);
		return (error);
	}

	/*
	 * We only allow buffers to be installed once, so atomically check
	 * that no buffers are currently installed and install new buffers.
	 */
	BPFD_LOCK(d);
	if (d->bd_hbuf != NULL || d->bd_sbuf != NULL || d->bd_fbuf != NULL ||
	    d->bd_bif != NULL) {
		BPFD_UNLOCK(d);
		zbuf_free(zba);
		zbuf_free(zbb);
		return (EINVAL);
	}

	/*
	 * Point BPF descriptor at buffers; initialize sbuf as zba so that
	 * it is always filled first in the sequence, per bpf(4).
	 */
	d->bd_fbuf = (caddr_t)zbb;
	d->bd_sbuf = (caddr_t)zba;
	d->bd_slen = 0;
	d->bd_hlen = 0;

	/*
	 * We expose only the space left in the buffer after the size of the
	 * shared management region.
	 */
	d->bd_bufsize = bz->bz_buflen - sizeof(struct bpf_zbuf_header);
	BPFD_UNLOCK(d);
	return (0);
}