1/*
2 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *   1. Redistributions of source code must retain the above copyright
8 *      notice, this list of conditions and the following disclaimer.
9 *   2. Redistributions in binary form must reproduce the above copyright
10 *      notice, this list of conditions and the following disclaimer in the
11 *      documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 */
25
26
27/*
28 * $FreeBSD$
29 *
30 * This module supports memory mapped access to network devices,
31 * see netmap(4).
32 *
33 * The module uses a large memory pool allocated by the kernel
34 * and accessible as mmapped memory by multiple userspace threads/processes.
35 * The memory pool contains packet buffers and "netmap rings",
36 * i.e. user-accessible copies of the interface's queues.
37 *
38 * Access to the network card works like this:
39 * 1. a process/thread issues one or more open() on /dev/netmap, to create
40 *    select()able file descriptors on which events are reported.
41 * 2. on each descriptor, the process issues an ioctl() to identify
42 *    the interface that should report events to the file descriptor.
43 * 3. on each descriptor, the process issues an mmap() request to
44 *    map the shared memory region within the process' address space.
45 *    The list of interesting queues is indicated by a location in
46 *    the shared memory region.
47 * 4. using the functions in the netmap(4) userspace API, a process
48 *    can look up the occupation state of a queue, access memory buffers,
49 *    and retrieve received packets or enqueue packets to transmit.
50 * 5. using some ioctl()s the process can synchronize the userspace view
51 *    of the queue with the actual status in the kernel. This includes both
52 *    receiving the notification of new packets, and transmitting new
53 *    packets on the output interface.
54 * 6. select() or poll() can be used to wait for events on individual
55 *    transmit or receive queues (or all queues for a given interface).
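 *
 *    A minimal userspace sketch of steps 1-6 above (error handling omitted;
 *    it assumes the nmreq-based API described here, see net/netmap.h and
 *    net/netmap_user.h, and a NIC called "em0"):
 *
 *	struct nmreq nmr = { .nr_version = NETMAP_API };
 *	struct pollfd pfd;
 *	void *mem;
 *	int fd;
 *
 *	strncpy(nmr.nr_name, "em0", sizeof(nmr.nr_name));
 *	fd = open("/dev/netmap", O_RDWR);			// step 1
 *	ioctl(fd, NIOCREGIF, &nmr);				// step 2
 *	mem = mmap(0, nmr.nr_memsize, PROT_READ | PROT_WRITE,
 *		MAP_SHARED, fd, 0);				// step 3
 *	struct netmap_if *nifp = NETMAP_IF(mem, nmr.nr_offset);
 *	struct netmap_ring *txr = NETMAP_TXRING(nifp, 0);	// step 4
 *	// ... fill slots between txr->head and txr->tail,
 *	//     then advance txr->head and txr->cur ...
 *	ioctl(fd, NIOCTXSYNC, NULL);				// step 5
 *	pfd.fd = fd; pfd.events = POLLOUT;
 *	poll(&pfd, 1, -1);					// step 6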
56 *
57
58		SYNCHRONIZATION (USER)
59
60The netmap rings and data structures may be shared among multiple
61user threads or even independent processes.
62Any synchronization among those threads/processes is delegated
63to the threads themselves. Only one thread at a time can be in
64a system call on the same netmap ring. The OS does not enforce
65this and only guarantees against system crashes in case of
66invalid usage.
67
68		LOCKING (INTERNAL)
69
70Within the kernel, access to the netmap rings is protected as follows:
71
72- a spinlock on each ring, to handle producer/consumer races on
73  RX rings attached to the host stack (against multiple host
74  threads writing from the host stack to the same ring),
75  and on 'destination' rings attached to a VALE switch
76  (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
77  protecting multiple active senders for the same destination.
78
79- an atomic variable to guarantee that there is at most one
80  instance of *_*xsync() on the ring at any time.
81  For rings connected to user file
82  descriptors, an atomic_test_and_set() protects this, and the
83  lock on the ring is not actually used.
84  For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
85  is also used to prevent multiple executions (the driver might indeed
86  already guarantee this).
87  For NIC TX rings connected to a VALE switch, the lock arbitrates
88  access to the queue (both when allocating buffers and when pushing
89  them out).
90
91- *xsync() should be protected against initializations of the card.
92  On FreeBSD most devices have the reset routine protected by
93  a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
94  the RING protection on rx_reset(); this should be added.
95
96  On linux there is an external lock on the tx path, which probably
97  also arbitrates access to the reset routine. XXX to be revised
98
99- a per-interface core_lock protecting access from the host stack
100  while interfaces may be detached from netmap mode.
101  XXX there should be no need for this lock if we detach the interfaces
102  only while they are down.
103
104
105--- VALE SWITCH ---
106
107NMG_LOCK() serializes all modifications to switches and ports.
108A switch cannot be deleted until all ports are gone.
109
110For each switch, an SX lock (RWlock on linux) protects
111deletion of ports. When configuring a new port or deleting an existing one, the
112lock is acquired in exclusive mode (after holding NMG_LOCK).
113When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
114The lock is held throughout the entire forwarding cycle,
115during which the thread may incur a page fault.
116Hence it is important that sleepable shared locks are used.
117
118On the rx ring, the per-port lock is grabbed initially to reserve
119a number of slots in the ring, then the lock is released,
120packets are copied from source to destination, and then
121the lock is acquired again and the receive ring is updated.
122(A similar thing is done on the tx ring for NIC and host stack
123ports attached to the switch)
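
A sketch of this pattern (the helper names below are illustrative only,
not the actual functions used in netmap_vale.c):

	lock(dst_kring);
	lease = reserve_slots(dst_kring, n);	// claim n destination slots
	unlock(dst_kring);
	copy_packets(src, dst_kring, lease, n);	// no lock held, may page fault
	lock(dst_kring);
	complete_lease(dst_kring, lease);	// publish: advance ring pointers
	unlock(dst_kring);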
124
125 */
126
127
128/* --- internals ----
129 *
130 * Roadmap to the code that implements the above.
131 *
132 * > 1. a process/thread issues one or more open() on /dev/netmap, to create
133 * >    select()able file descriptors on which events are reported.
134 *
135 *  	Internally, we allocate a netmap_priv_d structure, that will be
136 *  	initialized on ioctl(NIOCREGIF).
137 *
138 *      os-specific:
139 *  	    FreeBSD: netmap_open (netmap_freebsd.c). The priv is
140 *  		     per-thread.
141 *  	    linux:   linux_netmap_open (netmap_linux.c). The priv is
142 *  		     per-open.
143 *
144 * > 2. on each descriptor, the process issues an ioctl() to identify
145 * >    the interface that should report events to the file descriptor.
146 *
147 * 	Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
148 * 	Most important things happen in netmap_get_na() and
149 * 	netmap_do_regif(), called from there. Additional details can be
150 * 	found in the comments above those functions.
151 *
152 * 	In all cases, this action creates/takes-a-reference-to a
153 * 	netmap_*_adapter describing the port, and allocates a netmap_if
154 * 	and all necessary netmap rings, filling them with netmap buffers.
155 *
156 *      In this phase, the sync callbacks for each ring are set (these are used
157 *      in steps 5 and 6 below).  The callbacks depend on the type of adapter.
158 *      The adapter creation/initialization code puts them in the
159 * 	netmap_adapter (fields na->nm_txsync and na->nm_rxsync).  Then, they
160 * 	are copied from there to the netmap_kring's during netmap_do_regif(), by
161 * 	the nm_krings_create() callback.  All the nm_krings_create callbacks
162 * 	actually call netmap_krings_create() to perform this and the other
163 * 	common stuff. netmap_krings_create() also takes care of the host rings,
164 * 	if needed, by setting their sync callbacks appropriately.
165 *
166 * 	Additional actions depend on the kind of netmap_adapter that has been
167 * 	registered:
168 *
169 * 	- netmap_hw_adapter:  	     [netmap.c]
170 * 	     This is a system netdev/ifp with native netmap support.
171 * 	     The ifp is detached from the host stack by redirecting:
172 * 	       - transmissions (from the network stack) to netmap_transmit()
173 * 	       - receive notifications to the nm_notify() callback for
174 * 	         this adapter. The callback is normally netmap_notify(), unless
175 * 	         the ifp is attached to a bridge using bwrap, in which case it
176 * 	         is netmap_bwrap_intr_notify().
177 *
178 * 	- netmap_generic_adapter:      [netmap_generic.c]
179 * 	      A system netdev/ifp without native netmap support.
180 *
181 * 	(the decision about native/non native support is taken in
182 * 	 netmap_get_hw_na(), called by netmap_get_na())
183 *
184 * 	- netmap_vp_adapter 		[netmap_vale.c]
185 * 	      Returned by netmap_get_bdg_na().
186 * 	      This is a persistent or ephemeral VALE port. Ephemeral ports
187 * 	      are created on the fly if they don't already exist, and are
188 * 	      always attached to a bridge.
189 * 	      Persistent VALE ports must be created separately, and
190 * 	      then attached like normal NICs. The NIOCREGIF we are examining
191 * 	      will find them only if they had previously been created and
192 * 	      attached (see VALE_CTL below).
193 *
194 * 	- netmap_pipe_adapter 	      [netmap_pipe.c]
195 * 	      Returned by netmap_get_pipe_na().
196 * 	      Both pipe ends are created, if they didn't already exist.
197 *
198 * 	- netmap_monitor_adapter      [netmap_monitor.c]
199 * 	      Returned by netmap_get_monitor_na().
200 * 	      If successful, the nm_sync callbacks of the monitored adapter
201 * 	      will be intercepted by the returned monitor.
202 *
203 * 	- netmap_bwrap_adapter	      [netmap_vale.c]
204 * 	      Cannot be obtained in this way, see VALE_CTL below
205 *
206 *
207 * 	os-specific:
208 * 	    linux: we first go through linux_netmap_ioctl() to
209 * 	           adapt the FreeBSD interface to the linux one.
210 *
211 *
212 * > 3. on each descriptor, the process issues an mmap() request to
213 * >    map the shared memory region within the process' address space.
214 * >    The list of interesting queues is indicated by a location in
215 * >    the shared memory region.
216 *
217 *      os-specific:
218 *  	    FreeBSD: netmap_mmap_single (netmap_freebsd.c).
219 *  	    linux:   linux_netmap_mmap (netmap_linux.c).
220 *
221 * > 4. using the functions in the netmap(4) userspace API, a process
222 * >    can look up the occupation state of a queue, access memory buffers,
223 * >    and retrieve received packets or enqueue packets to transmit.
224 *
225 * 	these actions do not involve the kernel.
226 *
227 * > 5. using some ioctl()s the process can synchronize the userspace view
228 * >    of the queue with the actual status in the kernel. This includes both
229 * >    receiving the notification of new packets, and transmitting new
230 * >    packets on the output interface.
231 *
232 * 	These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
233 * 	cases. They invoke the nm_sync callbacks on the netmap_kring
234 * 	structures, as initialized in step 2 and maybe later modified
235 * 	by a monitor. Monitors, however, will always call the original
236 * 	callback before doing anything else.
237 *
238 *
239 * > 6. select() or poll() can be used to wait for events on individual
240 * >    transmit or receive queues (or all queues for a given interface).
241 *
242 * 	Implemented in netmap_poll(). This will call the same nm_sync()
243 * 	callbacks as in step 5 above.
244 *
245 * 	os-specific:
246 * 		linux: we first go through linux_netmap_poll() to adapt
247 * 		       the FreeBSD interface to the linux one.
248 *
249 *
250 *  ----  VALE_CTL -----
251 *
252 *  VALE switches are controlled by issuing a NIOCREGIF with a non-null
253 *  nr_cmd in the nmreq structure. These subcommands are handled by
254 *  netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
255 *  and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
256 *  subcommands, respectively.
257 *
258 *  Any network interface known to the system (including a persistent VALE
259 *  port) can be attached to a VALE switch by issuing the
260 *  NETMAP_BDG_ATTACH subcommand. After the attachment, persistent VALE ports
261 *  look exactly like ephemeral VALE ports (as created in step 2 above).  The
262 *  attachment of other interfaces, instead, requires the creation of a
263 *  netmap_bwrap_adapter.  Moreover, the attached interface must be put in
264 *  netmap mode. This may require the creation of a netmap_generic_adapter if
265 *  we have no native support for the interface, or if generic adapters have
266 *  been forced by sysctl.
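 *
 *  For instance, a sketch of attaching an existing interface (assumed here
 *  to be "em0") to the switch "vale0", with error handling omitted:
 *
 *	struct nmreq nmr = { .nr_version = NETMAP_API };
 *
 *	strncpy(nmr.nr_name, "vale0:em0", sizeof(nmr.nr_name));
 *	nmr.nr_cmd = NETMAP_BDG_ATTACH;
 *	ioctl(fd, NIOCREGIF, &nmr);	// fd is an open /dev/netmap descriptor
 *
 *  NETMAP_BDG_DETACH is used in the same way to undo the attachment.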
267 *
268 *  Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
269 *  called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
270 *  callback.  In the case of the bwrap, the callback creates the
271 *  netmap_bwrap_adapter.  The initialization of the bwrap is then
272 *  completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
273 *  callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
274 *  A generic adapter for the wrapped ifp will be created if needed, when
275 *  netmap_get_bdg_na() calls netmap_get_hw_na().
276 *
277 *
278 *  ---- DATAPATHS -----
279 *
280 *              -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
281 *
282 *    na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
283 *
284 *    - tx from netmap userspace:
285 *	 concurrently:
286 *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
287 *                kring->nm_sync() == DEVICE_netmap_txsync()
288 *           2) device interrupt handler
289 *                na->nm_notify()  == netmap_notify()
290 *    - rx from netmap userspace:
291 *       concurrently:
292 *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
293 *                kring->nm_sync() == DEVICE_netmap_rxsync()
294 *           2) device interrupt handler
295 *                na->nm_notify()  == netmap_notify()
296 *    - tx from host stack
297 *       concurrently:
298 *           1) host stack
299 *                netmap_transmit()
300 *                  na->nm_notify  == netmap_notify()
301 *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
302 *                kring->nm_sync() == netmap_rxsync_from_host_compat
303 *                  netmap_rxsync_from_host(na, NULL, NULL)
304 *    - tx to host stack
305 *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
306 *             kring->nm_sync() == netmap_txsync_to_host_compat
307 *               netmap_txsync_to_host(na)
308 *                 NM_SEND_UP()
309 *                   FreeBSD: na->if_input() == ?? XXX
310 *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
311 *
312 *
313 *
314 *               -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
315 *
316 *
317 *
318 *                           -= VALE PORT =-
319 *
320 *
321 *
322 *                           -= NETMAP PIPE =-
323 *
324 *
325 *
326 *  -= SYSTEM DEVICE WITH NATIVE SUPPORT, CONNECTED TO VALE, NO HOST RINGS =-
327 *
328 *
329 *
330 *  -= SYSTEM DEVICE WITH NATIVE SUPPORT, CONNECTED TO VALE, WITH HOST RINGS =-
331 *
332 *
333 *
334 *  -= SYSTEM DEVICE WITH GENERIC SUPPORT, CONNECTED TO VALE, NO HOST RINGS =-
335 *
336 *
337 *
338 *  -= SYSTEM DEVICE WITH GENERIC SUPPORT, CONNECTED TO VALE, WITH HOST RINGS =-
339 *
340 *
341 *
342 */
343
344/*
345 * OS-specific code that is used only within this file.
346 * Other OS-specific code that must be accessed by drivers
347 * is present in netmap_kern.h
348 */
349
350#if defined(__FreeBSD__)
351#include <sys/cdefs.h> /* prerequisite */
352#include <sys/types.h>
353#include <sys/errno.h>
354#include <sys/param.h>	/* defines used in kernel.h */
355#include <sys/kernel.h>	/* types used in module initialization */
356#include <sys/conf.h>	/* cdevsw struct, UID, GID */
357#include <sys/filio.h>	/* FIONBIO */
358#include <sys/sockio.h>
359#include <sys/socketvar.h>	/* struct socket */
360#include <sys/malloc.h>
361#include <sys/poll.h>
362#include <sys/rwlock.h>
363#include <sys/socket.h> /* sockaddrs */
364#include <sys/selinfo.h>
365#include <sys/sysctl.h>
366#include <sys/jail.h>
367#include <net/vnet.h>
368#include <net/if.h>
369#include <net/if_var.h>
370#include <net/bpf.h>		/* BIOCIMMEDIATE */
371#include <machine/bus.h>	/* bus_dmamap_* */
372#include <sys/endian.h>
373#include <sys/refcount.h>
374
375
376/* reduce conditional code */
377// linux API, used for the knlist in FreeBSD
378#define init_waitqueue_head(x)	knlist_init_mtx(&(x)->si_note, NULL)
379
380void freebsd_selwakeup(struct selinfo *si, int pri);
381#define OS_selwakeup(a, b)	freebsd_selwakeup(a, b)
382
383#elif defined(linux)
384
385#include "bsd_glue.h"
386
387
388
389#elif defined(__APPLE__)
390
391#warning OSX support is only partial
392#include "osx_glue.h"
393
394#else
395
396#error	Unsupported platform
397
398#endif /* unsupported */
399
400/*
401 * common headers
402 */
403#include <net/netmap.h>
404#include <dev/netmap/netmap_kern.h>
405#include <dev/netmap/netmap_mem2.h>
406
407
408MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
409
410/*
411 * The following variables are used by the drivers and replicate
412 * fields in the global memory pool. They only refer to buffers
413 * used by physical interfaces.
414 */
415u_int netmap_total_buffers;
416u_int netmap_buf_size;
417char *netmap_buffer_base;	/* also address of an invalid buffer */
418
419/* user-controlled variables */
420int netmap_verbose;
421
422static int netmap_no_timestamp; /* don't timestamp on rxsync */
423
424SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
425SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
426    CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
427SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
428    CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
429int netmap_mitigate = 1;
430SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
431int netmap_no_pendintr = 1;
432SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
433    CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
434int netmap_txsync_retry = 2;
435SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
436    &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
437
438int netmap_adaptive_io = 0;
439SYSCTL_INT(_dev_netmap, OID_AUTO, adaptive_io, CTLFLAG_RW,
440    &netmap_adaptive_io, 0 , "Adaptive I/O on paravirt");
441
442int netmap_flags = 0;	/* debug flags */
443int netmap_fwd = 0;	/* force transparent mode */
444int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */
445
446/*
447 * netmap_admode selects the netmap mode to use.
448 * Invalid values are reset to NETMAP_ADMODE_BEST
449 */
450enum { NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
451	NETMAP_ADMODE_NATIVE,	/* either native or none */
452	NETMAP_ADMODE_GENERIC,	/* force generic */
453	NETMAP_ADMODE_LAST };
454static int netmap_admode = NETMAP_ADMODE_BEST;
455
456int netmap_generic_mit = 100*1000;   /* Generic mitigation interval in nanoseconds. */
457int netmap_generic_ringsize = 1024;   /* Generic ringsize. */
458int netmap_generic_rings = 1;   /* number of queues in generic. */
459
460SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
461SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
462SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");
463SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
464SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
465SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
466SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , "");
467
468NMG_LOCK_T	netmap_global_lock;
469
470
471static void
472nm_kr_get(struct netmap_kring *kr)
473{
474	while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
475		tsleep(kr, 0, "NM_KR_GET", 4);
476}
477
478
479/*
480 * mark the ring as stopped, and run through the locks
481 * to make sure other users get to see it.
482 */
483static void
484netmap_disable_ring(struct netmap_kring *kr)
485{
486	kr->nkr_stopped = 1;
487	nm_kr_get(kr);
488	mtx_lock(&kr->q_lock);
489	mtx_unlock(&kr->q_lock);
490	nm_kr_put(kr);
491}
492
493/* stop or enable a single tx ring */
494void
495netmap_set_txring(struct netmap_adapter *na, u_int ring_id, int stopped)
496{
497	if (stopped)
498		netmap_disable_ring(na->tx_rings + ring_id);
499	else
500		na->tx_rings[ring_id].nkr_stopped = 0;
501	/* notify that the stopped state has changed. This is currently
502	 * only used by bwrap to propagate the state to its own krings.
503	 * (see netmap_bwrap_intr_notify).
504	 */
505	na->nm_notify(na, ring_id, NR_TX, NAF_DISABLE_NOTIFY);
506}
507
508/* stop or enable a single rx ring */
509void
510netmap_set_rxring(struct netmap_adapter *na, u_int ring_id, int stopped)
511{
512	if (stopped)
513		netmap_disable_ring(na->rx_rings + ring_id);
514	else
515		na->rx_rings[ring_id].nkr_stopped = 0;
516	/* notify that the stopped state has changed. This is currently
517	 * only used by bwrap to propagate the state to its own krings.
518	 * (see netmap_bwrap_intr_notify).
519	 */
520	na->nm_notify(na, ring_id, NR_RX, NAF_DISABLE_NOTIFY);
521}
522
523
524/* stop or enable all the rings of na */
525void
526netmap_set_all_rings(struct netmap_adapter *na, int stopped)
527{
528	int i;
529	u_int ntx, nrx;
530
531	if (!nm_netmap_on(na))
532		return;
533
534	ntx = netmap_real_tx_rings(na);
535	nrx = netmap_real_rx_rings(na);
536
537	for (i = 0; i < ntx; i++) {
538		netmap_set_txring(na, i, stopped);
539	}
540
541	for (i = 0; i < nrx; i++) {
542		netmap_set_rxring(na, i, stopped);
543	}
544}
545
546/*
547 * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
548 * to finish and prevents any new one from starting.  Call this before turning
549 * netmap mode off, or before removing the hardware rings (e.g., on module
550 * unload).  As a rule of thumb for linux drivers, this should be placed near
551 * each napi_disable().
552 */
553void
554netmap_disable_all_rings(struct ifnet *ifp)
555{
556	netmap_set_all_rings(NA(ifp), 1 /* stopped */);
557}
558
559/*
560 * Convenience function used in drivers.  Re-enables rxsync and txsync on the
561 * adapter's rings.  In linux drivers, this should be placed near each
562 * napi_enable().
563 */
564void
565netmap_enable_all_rings(struct ifnet *ifp)
566{
567	netmap_set_all_rings(NA(ifp), 0 /* enabled */);
568}
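
/*
 * A sketch of how a driver might use the two helpers above around a
 * hardware reset (illustrative only, the surrounding driver code is
 * assumed):
 *
 *	netmap_disable_all_rings(ifp);	// wait for pending *_sync() to drain
 *	// ... stop the NIC, reprogram the rings, restart the NIC ...
 *	netmap_enable_all_rings(ifp);	// let new txsync()/rxsync() through
 */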
569
570
571/*
572 * generic bounds-checking function
573 */
574u_int
575nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
576{
577	u_int oldv = *v;
578	const char *op = NULL;
579
580	if (dflt < lo)
581		dflt = lo;
582	if (dflt > hi)
583		dflt = hi;
584	if (oldv < lo) {
585		*v = dflt;
586		op = "Bump";
587	} else if (oldv > hi) {
588		*v = hi;
589		op = "Clamp";
590	}
591	if (op && msg)
592		printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
593	return *v;
594}
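
/*
 * Example use of nm_bound_var() (a sketch with a made-up variable):
 * force a user-settable ring size into [64, 16384], falling back to
 * the default 1024 if the requested value is too small:
 *
 *	u_int want_ringsize = ...;	// e.g. taken from a sysctl
 *	nm_bound_var(&want_ringsize, 1024, 64, 16384, "ring size");
 */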
595
596
597/*
598 * packet-dump function, writing into a user-supplied or a static buffer.
599 * The destination buffer must be at least 30+4*len bytes.
600 */
601const char *
602nm_dump_buf(char *p, int len, int lim, char *dst)
603{
604	static char _dst[8192];
605	int i, j, i0;
606	static char hex[] ="0123456789abcdef";
607	char *o;	/* output position */
608
609#define P_HI(x)	hex[((x) & 0xf0)>>4]
610#define P_LO(x)	hex[((x) & 0xf)]
611#define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
612	if (!dst)
613		dst = _dst;
614	if (lim <= 0 || lim > len)
615		lim = len;
616	o = dst;
617	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
618	o += strlen(o);
619	/* hexdump routine */
620	for (i = 0; i < lim; ) {
621		sprintf(o, "%5d: ", i);
622		o += strlen(o);
623		memset(o, ' ', 48);
624		i0 = i;
625		for (j=0; j < 16 && i < lim; i++, j++) {
626			o[j*3] = P_HI(p[i]);
627			o[j*3+1] = P_LO(p[i]);
628		}
629		i = i0;
630		for (j=0; j < 16 && i < lim; i++, j++)
631			o[j + 48] = P_C(p[i]);
632		o[j+48] = '\n';
633		o += j+49;
634	}
635	*o = '\0';
636#undef P_HI
637#undef P_LO
638#undef P_C
639	return dst;
640}
641
642
643/*
644 * Fetch configuration from the device, to cope with dynamic
645 * reconfigurations after loading the module.
646 */
647/* call with NMG_LOCK held */
648int
649netmap_update_config(struct netmap_adapter *na)
650{
651	u_int txr, txd, rxr, rxd;
652
653	txr = txd = rxr = rxd = 0;
654	if (na->nm_config) {
655		na->nm_config(na, &txr, &txd, &rxr, &rxd);
656	} else {
657		/* take whatever we had at init time */
658		txr = na->num_tx_rings;
659		txd = na->num_tx_desc;
660		rxr = na->num_rx_rings;
661		rxd = na->num_rx_desc;
662	}
663
664	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
665	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
666		return 0; /* nothing changed */
667	if (netmap_verbose || na->active_fds > 0) {
668		D("stored config %s: txring %d x %d, rxring %d x %d",
669			na->name,
670			na->num_tx_rings, na->num_tx_desc,
671			na->num_rx_rings, na->num_rx_desc);
672		D("new config %s: txring %d x %d, rxring %d x %d",
673			na->name, txr, txd, rxr, rxd);
674	}
675	if (na->active_fds == 0) {
676		D("configuration changed (but fine)");
677		na->num_tx_rings = txr;
678		na->num_tx_desc = txd;
679		na->num_rx_rings = rxr;
680		na->num_rx_desc = rxd;
681		return 0;
682	}
683	D("configuration changed while active, this is bad...");
684	return 1;
685}
686
687/* kring->nm_sync callback for the host tx ring */
688static int
689netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags)
690{
691	(void)flags; /* unused */
692	netmap_txsync_to_host(kring->na);
693	return 0;
694}
695
696/* kring->nm_sync callback for the host rx ring */
697static int
698netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags)
699{
700	(void)flags; /* unused */
701	netmap_rxsync_from_host(kring->na, NULL, NULL);
702	return 0;
703}
704
705
706
707/* create the krings array and initialize the fields common to all adapters.
708 * The array layout is this:
709 *
710 *                    +----------+
711 * na->tx_rings ----->|          | \
712 *                    |          |  } na->num_tx_rings
713 *                    |          | /
714 *                    +----------+
715 *                    |          |    host tx kring
716 * na->rx_rings ----> +----------+
717 *                    |          | \
718 *                    |          |  } na->num_rx_rings
719 *                    |          | /
720 *                    +----------+
721 *                    |          |    host rx kring
722 *                    +----------+
723 * na->tailroom ----->|          | \
724 *                    |          |  } tailroom bytes
725 *                    |          | /
726 *                    +----------+
727 *
728 * Note: for compatibility, host krings are created even when not needed.
729 * The tailroom space is currently used by vale ports for allocating leases.
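 * (With this layout the host krings, when present, are simply
 * na->tx_rings[na->num_tx_rings] and na->rx_rings[na->num_rx_rings],
 * as used by the host-ring routines further down in this file.)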
730 */
731/* call with NMG_LOCK held */
732int
733netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
734{
735	u_int i, len, ndesc;
736	struct netmap_kring *kring;
737	u_int ntx, nrx;
738
739	/* account for the (possibly fake) host rings */
740	ntx = na->num_tx_rings + 1;
741	nrx = na->num_rx_rings + 1;
742
743	len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom;
744
745	na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
746	if (na->tx_rings == NULL) {
747		D("Cannot allocate krings");
748		return ENOMEM;
749	}
750	na->rx_rings = na->tx_rings + ntx;
751
752	/*
753	 * All fields in krings are 0 except the ones initialized below,
754	 * but it is better to be explicit on important kring fields.
755	 */
756	ndesc = na->num_tx_desc;
757	for (i = 0; i < ntx; i++) { /* Transmit rings */
758		kring = &na->tx_rings[i];
759		bzero(kring, sizeof(*kring));
760		kring->na = na;
761		kring->ring_id = i;
762		kring->nkr_num_slots = ndesc;
763		if (i < na->num_tx_rings) {
764			kring->nm_sync = na->nm_txsync;
765		} else if (i == na->num_tx_rings) {
766			kring->nm_sync = netmap_txsync_to_host_compat;
767		}
768		/*
769		 * IMPORTANT: Always keep one slot empty.
770		 */
771		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
772		kring->rtail = kring->nr_hwtail = ndesc - 1;
773		snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", na->name, i);
774		ND("ktx %s h %d c %d t %d",
775			kring->name, kring->rhead, kring->rcur, kring->rtail);
776		mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF);
777		init_waitqueue_head(&kring->si);
778	}
779
780	ndesc = na->num_rx_desc;
781	for (i = 0; i < nrx; i++) { /* Receive rings */
782		kring = &na->rx_rings[i];
783		bzero(kring, sizeof(*kring));
784		kring->na = na;
785		kring->ring_id = i;
786		kring->nkr_num_slots = ndesc;
787		if (i < na->num_rx_rings) {
788			kring->nm_sync = na->nm_rxsync;
789		} else if (i == na->num_rx_rings) {
790			kring->nm_sync = netmap_rxsync_from_host_compat;
791		}
792		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
793		kring->rtail = kring->nr_hwtail = 0;
794		snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", na->name, i);
795		ND("krx %s h %d c %d t %d",
796			kring->name, kring->rhead, kring->rcur, kring->rtail);
797		mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF);
798		init_waitqueue_head(&kring->si);
799	}
800	init_waitqueue_head(&na->tx_si);
801	init_waitqueue_head(&na->rx_si);
802
803	na->tailroom = na->rx_rings + nrx;
804
805	return 0;
806}
807
808
809/* undo the actions performed by netmap_krings_create */
810/* call with NMG_LOCK held */
811void
812netmap_krings_delete(struct netmap_adapter *na)
813{
814	struct netmap_kring *kring = na->tx_rings;
815
816	/* we rely on the krings layout described above */
817	for ( ; kring != na->tailroom; kring++) {
818		mtx_destroy(&kring->q_lock);
819	}
820	free(na->tx_rings, M_DEVBUF);
821	na->tx_rings = na->rx_rings = na->tailroom = NULL;
822}
823
824
825/*
826 * Destructor for NIC ports. They also have an mbuf queue
827 * on the rings connected to the host so we need to purge
828 * them first.
829 */
830/* call with NMG_LOCK held */
831static void
832netmap_hw_krings_delete(struct netmap_adapter *na)
833{
834	struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue;
835
836	ND("destroy sw mbq with len %d", mbq_len(q));
837	mbq_purge(q);
838	mbq_safe_destroy(q);
839	netmap_krings_delete(na);
840}
841
842
843/* create a new netmap_if for a newly registered fd.
844 * If this is the first registration of the adapter,
845 * also create the netmap rings and their in-kernel view,
846 * the netmap krings.
847 */
848/* call with NMG_LOCK held */
849static struct netmap_if*
850netmap_if_new(struct netmap_adapter *na)
851{
852	struct netmap_if *nifp;
853
854	if (netmap_update_config(na)) {
855		/* configuration mismatch, report and fail */
856		return NULL;
857	}
858
859	if (na->active_fds)	/* already registered */
860		goto final;
861
862	/* create and init the krings arrays.
863	 * Depending on the adapter, this may also create
864	 * the netmap rings themselves
865	 */
866	if (na->nm_krings_create(na))
867		return NULL;
868
869	/* create all missing netmap rings */
870	if (netmap_mem_rings_create(na))
871		goto cleanup;
872
873final:
874
875	/* in all cases, create a new netmap if */
876	nifp = netmap_mem_if_new(na);
877	if (nifp == NULL)
878		goto cleanup;
879
880	return (nifp);
881
882cleanup:
883
884	if (na->active_fds == 0) {
885		netmap_mem_rings_delete(na);
886		na->nm_krings_delete(na);
887	}
888
889	return NULL;
890}
891
892
893/* grab a reference to the memory allocator, if we don't have one already.  The
894 * reference is taken from the netmap_adapter registered with the priv.
895 */
896/* call with NMG_LOCK held */
897static int
898netmap_get_memory_locked(struct netmap_priv_d* p)
899{
900	struct netmap_mem_d *nmd;
901	int error = 0;
902
903	if (p->np_na == NULL) {
904		if (!netmap_mmap_unreg)
905			return ENODEV;
906		/* for compatibility with older versions of the API
907 		 * we use the global allocator when no interface has been
908 		 * registered
909 		 */
910		nmd = &nm_mem;
911	} else {
912		nmd = p->np_na->nm_mem;
913	}
914	if (p->np_mref == NULL) {
915		error = netmap_mem_finalize(nmd, p->np_na);
916		if (!error)
917			p->np_mref = nmd;
918	} else if (p->np_mref != nmd) {
919		/* a virtual port has been registered, but previous
920 		 * syscalls already used the global allocator.
921 		 * We cannot continue
922 		 */
923		error = ENODEV;
924	}
925	return error;
926}
927
928
929/* call with NMG_LOCK *not* held */
930int
931netmap_get_memory(struct netmap_priv_d* p)
932{
933	int error;
934	NMG_LOCK();
935	error = netmap_get_memory_locked(p);
936	NMG_UNLOCK();
937	return error;
938}
939
940
941/* call with NMG_LOCK held */
942static int
943netmap_have_memory_locked(struct netmap_priv_d* p)
944{
945	return p->np_mref != NULL;
946}
947
948
949/* call with NMG_LOCK held */
950static void
951netmap_drop_memory_locked(struct netmap_priv_d* p)
952{
953	if (p->np_mref) {
954		netmap_mem_deref(p->np_mref, p->np_na);
955		p->np_mref = NULL;
956	}
957}
958
959
960/*
961 * Call nm_register(ifp,0) to stop netmap mode on the interface and
962 * revert to normal operation.
963 * The second argument is the nifp to work on. In some cases it is
964 * not attached yet to the netmap_priv_d so we need to pass it as
965 * a separate argument.
966 */
967/* call with NMG_LOCK held */
968static void
969netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
970{
971	struct netmap_adapter *na = priv->np_na;
972
973	NMG_LOCK_ASSERT();
974	na->active_fds--;
975	if (na->active_fds <= 0) {	/* last instance */
976
977		if (netmap_verbose)
978			D("deleting last instance for %s", na->name);
979		/*
980		 * (TO CHECK) This function is only called
981		 * when the last reference to this file descriptor goes
982		 * away. This means we cannot have any pending poll()
983		 * or interrupt routine operating on the structure.
984		 * XXX The file may be closed in a thread while
985		 * another thread is using it.
986		 * Linux keeps the file opened until the last reference
987		 * by any outstanding ioctl/poll or mmap is gone.
988		 * FreeBSD does not track mmap()s (but we do) and
989		 * wakes up any sleeping poll(). Need to check what
990		 * happens if the close() occurs while a concurrent
991		 * syscall is running.
992		 */
993		na->nm_register(na, 0); /* off, clear flags */
994		/* Wake up any sleeping threads. netmap_poll will
995		 * then return POLLERR
996		 * XXX The wake up now must happen during *_down(), when
997		 * we order all activities to stop. -gl
998		 */
999		/* XXX kqueue(9) needed; these will mirror knlist_init. */
1000		/* knlist_destroy(&na->tx_si.si_note); */
1001		/* knlist_destroy(&na->rx_si.si_note); */
1002
1003		/* delete rings and buffers */
1004		netmap_mem_rings_delete(na);
1005		na->nm_krings_delete(na);
1006	}
1007	/* delete the nifp */
1008	netmap_mem_if_delete(na, nifp);
1009}
1010
1011/* call with NMG_LOCK held */
1012static __inline int
1013nm_tx_si_user(struct netmap_priv_d *priv)
1014{
1015	return (priv->np_na != NULL &&
1016		(priv->np_txqlast - priv->np_txqfirst > 1));
1017}
1018
1019/* call with NMG_LOCK held */
1020static __inline int
1021nm_rx_si_user(struct netmap_priv_d *priv)
1022{
1023	return (priv->np_na != NULL &&
1024		(priv->np_rxqlast - priv->np_rxqfirst > 1));
1025}
1026
1027
1028/*
1029 * Destructor of the netmap_priv_d, called when the fd has
1030 * no active open() and mmap(). Also called in error paths.
1031 *
1032 * returns 1 if this is the last instance and we can free priv
1033 */
1034/* call with NMG_LOCK held */
1035int
1036netmap_dtor_locked(struct netmap_priv_d *priv)
1037{
1038	struct netmap_adapter *na = priv->np_na;
1039
1040#ifdef __FreeBSD__
1041	/*
1042	 * np_refcount is the number of active mmaps on
1043	 * this file descriptor
1044	 */
1045	if (--priv->np_refcount > 0) {
1046		return 0;
1047	}
1048#endif /* __FreeBSD__ */
1049	if (!na) {
1050	    return 1; //XXX is it correct?
1051	}
1052	netmap_do_unregif(priv, priv->np_nifp);
1053	priv->np_nifp = NULL;
1054	netmap_drop_memory_locked(priv);
1055	if (priv->np_na) {
1056		if (nm_tx_si_user(priv))
1057			na->tx_si_users--;
1058		if (nm_rx_si_user(priv))
1059			na->rx_si_users--;
1060		netmap_adapter_put(na);
1061		priv->np_na = NULL;
1062	}
1063	return 1;
1064}
1065
1066
1067/* call with NMG_LOCK *not* held */
1068void
1069netmap_dtor(void *data)
1070{
1071	struct netmap_priv_d *priv = data;
1072	int last_instance;
1073
1074	NMG_LOCK();
1075	last_instance = netmap_dtor_locked(priv);
1076	NMG_UNLOCK();
1077	if (last_instance) {
1078		bzero(priv, sizeof(*priv));	/* for safety */
1079		free(priv, M_DEVBUF);
1080	}
1081}
1082
1083
1084
1085
1086/*
1087 * Handlers for synchronization of the queues from/to the host.
1088 * Netmap has two operating modes:
1089 * - in the default mode, the rings connected to the host stack are
1090 *   just another ring pair managed by userspace;
1091 * - in transparent mode (XXX to be defined) incoming packets
1092 *   (from the host or the NIC) are marked as NS_FORWARD upon
1093 *   arrival, and the user application has a chance to reset the
1094 *   flag for packets that should be dropped.
1095 *   On the RXSYNC or poll(), packets in RX rings between
1096 *   kring->nr_hwcur and ring->cur with NS_FORWARD still set are moved
1097 *   to the other side.
1098 * The transfer NIC --> host is relatively easy, just encapsulate
1099 * into mbufs and we are done. The host --> NIC side is slightly
1100 * harder because there might not be room in the tx ring so it
1101 * might take a while before releasing the buffer.
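 *
 * A sketch of the userspace side of transparent mode (assuming the
 * application has set NR_FORWARD in ring->flags, or dev.netmap.fwd is
 * enabled): after an rxsync, clear NS_FORWARD on the packets that should
 * NOT be passed to the other side, then release the slots:
 *
 *	for (i = ring->head; i != ring->tail; i = nm_ring_next(ring, i)) {
 *		struct netmap_slot *slot = &ring->slot[i];
 *
 *		if (!wanted(slot))	// wanted() is the application's filter
 *			slot->flags &= ~NS_FORWARD;
 *	}
 *	ring->head = ring->cur = ring->tail;
 *	ioctl(fd, NIOCRXSYNC, NULL);	// forwarding happens here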
1102 */
1103
1104
1105/*
1106 * pass a chain of buffers to the host stack as coming from 'dst'
1107 * We do not need to lock because the queue is private.
1108 */
1109static void
1110netmap_send_up(struct ifnet *dst, struct mbq *q)
1111{
1112	struct mbuf *m;
1113
1114	/* send packets up, outside the lock */
1115	while ((m = mbq_dequeue(q)) != NULL) {
1116		if (netmap_verbose & NM_VERB_HOST)
1117			D("sending up pkt %p size %d", m, MBUF_LEN(m));
1118		NM_SEND_UP(dst, m);
1119	}
1120	mbq_destroy(q);
1121}
1122
1123
1124/*
1125 * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
1126 * Take packets from hwcur to ring->head marked NS_FORWARD (or forced)
1127 * and pass them up. Drop remaining packets in the unlikely event
1128 * of an mbuf shortage.
1129 */
1130static void
1131netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
1132{
1133	u_int const lim = kring->nkr_num_slots - 1;
1134	u_int const head = kring->ring->head;
1135	u_int n;
1136	struct netmap_adapter *na = kring->na;
1137
1138	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
1139		struct mbuf *m;
1140		struct netmap_slot *slot = &kring->ring->slot[n];
1141
1142		if ((slot->flags & NS_FORWARD) == 0 && !force)
1143			continue;
1144		if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
1145			RD(5, "bad pkt at %d len %d", n, slot->len);
1146			continue;
1147		}
1148		slot->flags &= ~NS_FORWARD; // XXX needed ?
1149		/* XXX TODO: adapt to the case of a multisegment packet */
1150		m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);
1151
1152		if (m == NULL)
1153			break;
1154		mbq_enqueue(q, m);
1155	}
1156}
1157
1158
1159/*
1160 * Send to the NIC rings packets marked NS_FORWARD between
1161 * kring->nr_hwcur and kring->rhead
1162 * Called under kring->rx_queue.lock on the sw rx ring,
1163 */
1164static u_int
1165netmap_sw_to_nic(struct netmap_adapter *na)
1166{
1167	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1168	struct netmap_slot *rxslot = kring->ring->slot;
1169	u_int i, rxcur = kring->nr_hwcur;
1170	u_int const head = kring->rhead;
1171	u_int const src_lim = kring->nkr_num_slots - 1;
1172	u_int sent = 0;
1173
1174	/* scan rings to find space, then fill as much as possible */
1175	for (i = 0; i < na->num_tx_rings; i++) {
1176		struct netmap_kring *kdst = &na->tx_rings[i];
1177		struct netmap_ring *rdst = kdst->ring;
1178		u_int const dst_lim = kdst->nkr_num_slots - 1;
1179
1180		/* XXX do we trust ring or kring->rcur,rtail ? */
1181		for (; rxcur != head && !nm_ring_empty(rdst);
1182		     rxcur = nm_next(rxcur, src_lim) ) {
1183			struct netmap_slot *src, *dst, tmp;
1184			u_int dst_cur = rdst->cur;
1185
1186			src = &rxslot[rxcur];
1187			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
1188				continue;
1189
1190			sent++;
1191
1192			dst = &rdst->slot[dst_cur];
1193
1194			tmp = *src;
1195
1196			src->buf_idx = dst->buf_idx;
1197			src->flags = NS_BUF_CHANGED;
1198
1199			dst->buf_idx = tmp.buf_idx;
1200			dst->len = tmp.len;
1201			dst->flags = NS_BUF_CHANGED;
1202
1203			rdst->cur = nm_next(dst_cur, dst_lim);
1204		}
1205		/* if (sent) XXX txsync ? */
1206	}
1207	return sent;
1208}
1209
1210
1211/*
1212 * netmap_txsync_to_host() passes packets up. We are called from a
1213 * system call in user process context, and the only contention
1214 * can be among multiple user threads erroneously calling
1215 * this routine concurrently.
1216 */
1217void
1218netmap_txsync_to_host(struct netmap_adapter *na)
1219{
1220	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
1221	struct netmap_ring *ring = kring->ring;
1222	u_int const lim = kring->nkr_num_slots - 1;
1223	u_int const head = kring->rhead;
1224	struct mbq q;
1225
1226	/* Take packets from hwcur to head and pass them up.
1227	 * force head = cur since netmap_grab_packets() stops at head
1228	 * In case of no buffers we give up. At the end of the loop,
1229	 * the queue is drained in all cases.
1230	 */
1231	mbq_init(&q);
1232	ring->cur = head;
1233	netmap_grab_packets(kring, &q, 1 /* force */);
1234	ND("have %d pkts in queue", mbq_len(&q));
1235	kring->nr_hwcur = head;
1236	kring->nr_hwtail = head + lim;
1237	if (kring->nr_hwtail > lim)
1238		kring->nr_hwtail -= lim + 1;
1239	nm_txsync_finalize(kring);
1240
1241	netmap_send_up(na->ifp, &q);
1242}
1243
1244
1245/*
1246 * rxsync backend for packets coming from the host stack.
1247 * They have been put in kring->rx_queue by netmap_transmit().
1248 * We protect access to the kring using kring->rx_queue.lock
1249 *
1250 * This routine also does the selrecord if called from the poll handler
1251 * (we know because td != NULL).
1252 *
1253 * NOTE: on linux, selrecord() is defined as a macro and uses pwait
1254 *     as an additional hidden argument.
1255 * returns the number of packets delivered to tx queues in
1256 * transparent mode, or a negative value if error
1257 */
1258int
1259netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
1260{
1261	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1262	struct netmap_ring *ring = kring->ring;
1263	u_int nm_i, n;
1264	u_int const lim = kring->nkr_num_slots - 1;
1265	u_int const head = kring->rhead;
1266	int ret = 0;
1267	struct mbq *q = &kring->rx_queue;
1268
1269	(void)pwait;	/* disable unused warnings */
1270	(void)td;
1271
1272	mbq_lock(q);
1273
1274	/* First part: import newly received packets */
1275	n = mbq_len(q);
1276	if (n) { /* grab packets from the queue */
1277		struct mbuf *m;
1278		uint32_t stop_i;
1279
1280		nm_i = kring->nr_hwtail;
1281		stop_i = nm_prev(nm_i, lim);
1282		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1283			int len = MBUF_LEN(m);
1284			struct netmap_slot *slot = &ring->slot[nm_i];
1285
1286			m_copydata(m, 0, len, NMB(na, slot));
1287			ND("nm %d len %d", nm_i, len);
1288			if (netmap_verbose)
1289                                D("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));
1290
1291			slot->len = len;
1292			slot->flags = kring->nkr_slot_flags;
1293			nm_i = nm_next(nm_i, lim);
1294			m_freem(m);
1295		}
1296		kring->nr_hwtail = nm_i;
1297	}
1298
1299	/*
1300	 * Second part: skip past packets that userspace has released.
1301	 */
1302	nm_i = kring->nr_hwcur;
1303	if (nm_i != head) { /* something was released */
1304		if (netmap_fwd || kring->ring->flags & NR_FORWARD)
1305			ret = netmap_sw_to_nic(na);
1306		kring->nr_hwcur = head;
1307	}
1308
1309	nm_rxsync_finalize(kring);
1310
1311	/* access copies of cur,tail in the kring */
1312	if (kring->rcur == kring->rtail && td) /* no bufs available */
1313		selrecord(td, &kring->si);
1314
1315	mbq_unlock(q);
1316	return ret;
1317}
1318
1319
1320/* Get a netmap adapter for the port.
1321 *
1322 * If it is possible to satisfy the request, return 0
1323 * with *na containing the netmap adapter found.
1324 * Otherwise return an error code, with *na containing NULL.
1325 *
1326 * When the port is attached to a bridge, we always return
1327 * EBUSY.
1328 * Otherwise, if the port is already bound to a file descriptor,
1329 * then we unconditionally return the existing adapter into *na.
1330 * In all the other cases, we return (into *na) either native,
1331 * generic or NULL, according to the following table:
1332 *
1333 *					native_support
1334 * active_fds   dev.netmap.admode         YES     NO
1335 * -------------------------------------------------------
1336 *    >0              *                 NA(ifp) NA(ifp)
1337 *
1338 *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1339 *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1340 *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1341 *
1342 */
1343
1344int
1345netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
1346{
1347	/* generic support */
1348	int i = netmap_admode;	/* Take a snapshot. */
1349	int error = 0;
1350	struct netmap_adapter *prev_na;
1351	struct netmap_generic_adapter *gna;
1352
1353	*na = NULL; /* default */
1354
1355	/* reset in case of invalid value */
1356	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1357		i = netmap_admode = NETMAP_ADMODE_BEST;
1358
1359	if (NETMAP_CAPABLE(ifp)) {
1360		prev_na = NA(ifp);
1361		/* If an adapter already exists, return it if
1362		 * there are active file descriptors or if
1363		 * netmap is not forced to use generic
1364		 * adapters.
1365		 */
1366		if (NETMAP_OWNED_BY_ANY(prev_na)
1367			|| i != NETMAP_ADMODE_GENERIC
1368			|| prev_na->na_flags & NAF_FORCE_NATIVE
1369#ifdef WITH_PIPES
1370			/* ugly, but we cannot allow an adapter switch
1371			 * if some pipe is referring to this one
1372			 */
1373			|| prev_na->na_next_pipe > 0
1374#endif
1375		) {
1376			*na = prev_na;
1377			return 0;
1378		}
1379	}
1380
1381	/* If there isn't native support and netmap is not allowed
1382	 * to use generic adapters, we cannot satisfy the request.
1383	 */
1384	if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
1385		return EOPNOTSUPP;
1386
1387	/* Otherwise, create a generic adapter and return it,
1388	 * saving the previously used netmap adapter, if any.
1389	 *
1390	 * Note that here 'prev_na', if not NULL, MUST be a
1391	 * native adapter, and CANNOT be a generic one. This is
1392	 * true because generic adapters are created on demand, and
1393	 * destroyed when not used anymore. Therefore, if the adapter
1394	 * currently attached to an interface 'ifp' is generic, it
1395	 * must be that
1396	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1397	 * Consequently, if NA(ifp) is generic, we will enter one of
1398	 * the branches above. This ensures that we never override
1399	 * a generic adapter with another generic adapter.
1400	 */
1401	prev_na = NA(ifp);
1402	error = generic_netmap_attach(ifp);
1403	if (error)
1404		return error;
1405
1406	*na = NA(ifp);
1407	gna = (struct netmap_generic_adapter*)NA(ifp);
1408	gna->prev = prev_na; /* save old na */
1409	if (prev_na != NULL) {
1410		ifunit_ref(ifp->if_xname);
1411		// XXX add a refcount ?
1412		netmap_adapter_get(prev_na);
1413	}
1414	ND("Created generic NA %p (prev %p)", gna, gna->prev);
1415
1416	return 0;
1417}
1418
1419
1420/*
1421 * MUST BE CALLED UNDER NMG_LOCK()
1422 *
1423 * Get a refcounted reference to a netmap adapter attached
1424 * to the interface specified by nmr.
1425 * This is always called in the execution of an ioctl().
1426 *
1427 * Return ENXIO if the interface specified by the request does
1428 * not exist, ENOTSUP if netmap is not supported by the interface,
1429 * EBUSY if the interface is already attached to a bridge,
1430 * EINVAL if parameters are invalid, ENOMEM if needed resources
1431 * could not be allocated.
1432 * If successful, hold a reference to the netmap adapter.
1433 *
1434 * No reference is kept on the real interface, which may then
1435 * disappear at any time.
1436 */
1437int
1438netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
1439{
1440	struct ifnet *ifp = NULL;
1441	int error = 0;
1442	struct netmap_adapter *ret = NULL;
1443
1444	*na = NULL;     /* default return value */
1445
1446	NMG_LOCK_ASSERT();
1447
1448	/* we cascade through all possible types of netmap adapter.
1449	 * All netmap_get_*_na() functions return an error and an na,
1450	 * with the following combinations:
1451	 *
1452	 * error    na
1453	 *   0	   NULL		type doesn't match
1454	 *  !0	   NULL		type matches, but na creation/lookup failed
1455	 *   0	  !NULL		type matches and na created/found
1456	 *  !0    !NULL		impossible
1457	 */
1458
1459	/* try to see if this is a monitor port */
1460	error = netmap_get_monitor_na(nmr, na, create);
1461	if (error || *na != NULL)
1462		return error;
1463
1464	/* try to see if this is a pipe port */
1465	error = netmap_get_pipe_na(nmr, na, create);
1466	if (error || *na != NULL)
1467		return error;
1468
1469	/* try to see if this is a bridge port */
1470	error = netmap_get_bdg_na(nmr, na, create);
1471	if (error)
1472		return error;
1473
1474	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
1475		goto pipes;
1476
1477	/*
1478	 * This must be a hardware na, lookup the name in the system.
1479	 * Note that by hardware we actually mean "it shows up in ifconfig".
1480	 * This may still be a tap, a veth/epair, or even a
1481	 * persistent VALE port.
1482	 */
1483	ifp = ifunit_ref(nmr->nr_name);
1484	if (ifp == NULL) {
1485	        return ENXIO;
1486	}
1487
1488	error = netmap_get_hw_na(ifp, &ret);
1489	if (error)
1490		goto out;
1491
1492	*na = ret;
1493	netmap_adapter_get(ret);
1494
1495pipes:
1496	/*
1497	 * If we are opening a pipe whose parent was not in netmap mode,
1498	 * we have to allocate the pipe array now.
1499	 * XXX get rid of this clumsiness (2014-03-15)
1500	 */
1501	error = netmap_pipe_alloc(*na, nmr);
1502
1503out:
1504	if (error && ret != NULL)
1505		netmap_adapter_put(ret);
1506
1507	if (ifp)
1508		if_rele(ifp); /* allow live unloading of drivers modules */
1509
1510	return error;
1511}
1512
1513
1514/*
1515 * validate parameters on entry for *_txsync()
1516 * Returns ring->head if ok, or something >= kring->nkr_num_slots
1517 * in case of error.
1518 *
1519 * rhead, rcur and rtail=hwtail are stored from previous round.
1520 * hwcur is the next packet to send to the ring.
1521 *
1522 * We want
1523 *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1524 *
1525 * hwcur, rhead, rtail and hwtail are reliable
1526 */
1527u_int
1528nm_txsync_prologue(struct netmap_kring *kring)
1529{
1530	struct netmap_ring *ring = kring->ring;
1531	u_int head = ring->head; /* read only once */
1532	u_int cur = ring->cur; /* read only once */
1533	u_int n = kring->nkr_num_slots;
1534
1535	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1536		kring->name,
1537		kring->nr_hwcur, kring->nr_hwtail,
1538		ring->head, ring->cur, ring->tail);
1539#if 1 /* kernel sanity checks; but we can trust the kring. */
1540	if (kring->nr_hwcur >= n || kring->rhead >= n ||
1541	    kring->rtail >= n ||  kring->nr_hwtail >= n)
1542		goto error;
1543#endif /* kernel sanity checks */
1544	/*
1545	 * user sanity checks. We only use 'cur',
1546	 * A, B, ... are possible positions for cur:
1547	 *
1548	 *  0    A  cur   B  tail  C  n-1
1549	 *  0    D  tail  E  cur   F  n-1
1550	 *
1551	 * B, F, D are valid. A, C, E are wrong
1552	 */
1553	if (kring->rtail >= kring->rhead) {
1554		/* want rhead <= head <= rtail */
1555		if (head < kring->rhead || head > kring->rtail)
1556			goto error;
1557		/* and also head <= cur <= rtail */
1558		if (cur < head || cur > kring->rtail)
1559			goto error;
1560	} else { /* here rtail < rhead */
1561		/* we need head outside rtail .. rhead */
1562		if (head > kring->rtail && head < kring->rhead)
1563			goto error;
1564
1565		/* two cases now: head <= rtail or head >= rhead  */
1566		if (head <= kring->rtail) {
1567			/* want head <= cur <= rtail */
1568			if (cur < head || cur > kring->rtail)
1569				goto error;
1570		} else { /* head >= rhead */
1571			/* cur must be outside rtail..head */
1572			if (cur > kring->rtail && cur < head)
1573				goto error;
1574		}
1575	}
1576	if (ring->tail != kring->rtail) {
1577		RD(5, "tail overwritten was %d need %d",
1578			ring->tail, kring->rtail);
1579		ring->tail = kring->rtail;
1580	}
1581	kring->rhead = head;
1582	kring->rcur = cur;
1583	return head;
1584
1585error:
1586	RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d",
1587		kring->name,
1588		kring->nr_hwcur,
1589		kring->rcur, kring->nr_hwtail,
1590		cur, ring->tail);
1591	return n;
1592}
1593
1594
1595/*
1596 * validate parameters on entry for *_rxsync()
1597 * Returns ring->head if ok, kring->nkr_num_slots on error.
1598 *
1599 * For a valid configuration,
1600 * hwcur <= head <= cur <= tail <= hwtail
1601 *
1602 * We only consider head and cur.
1603 * hwcur and hwtail are reliable.
1604 *
1605 */
1606u_int
1607nm_rxsync_prologue(struct netmap_kring *kring)
1608{
1609	struct netmap_ring *ring = kring->ring;
1610	uint32_t const n = kring->nkr_num_slots;
1611	uint32_t head, cur;
1612
1613	ND("%s kc %d kt %d h %d c %d t %d",
1614		kring->name,
1615		kring->nr_hwcur, kring->nr_hwtail,
1616		ring->head, ring->cur, ring->tail);
1617	/*
1618	 * Before storing the new values, we should check they do not
1619	 * move backwards. However:
1620	 * - head is not an issue because the previous value is hwcur;
1621	 * - cur could in principle go back, however it does not matter
1622	 *   because we are processing a brand new rxsync()
1623	 */
1624	cur = kring->rcur = ring->cur;	/* read only once */
1625	head = kring->rhead = ring->head;	/* read only once */
1626#if 1 /* kernel sanity checks */
1627	if (kring->nr_hwcur >= n || kring->nr_hwtail >= n)
1628		goto error;
1629#endif /* kernel sanity checks */
1630	/* user sanity checks */
1631	if (kring->nr_hwtail >= kring->nr_hwcur) {
1632		/* want hwcur <= rhead <= hwtail */
1633		if (head < kring->nr_hwcur || head > kring->nr_hwtail)
1634			goto error;
1635		/* and also rhead <= rcur <= hwtail */
1636		if (cur < head || cur > kring->nr_hwtail)
1637			goto error;
1638	} else {
1639		/* we need rhead outside hwtail..hwcur */
1640		if (head < kring->nr_hwcur && head > kring->nr_hwtail)
1641			goto error;
1642		/* two cases now: head <= hwtail or head >= hwcur  */
1643		if (head <= kring->nr_hwtail) {
1644			/* want head <= cur <= hwtail */
1645			if (cur < head || cur > kring->nr_hwtail)
1646				goto error;
1647		} else {
1648			/* cur must be outside hwtail..head */
1649			if (cur < head && cur > kring->nr_hwtail)
1650				goto error;
1651		}
1652	}
1653	if (ring->tail != kring->rtail) {
1654		RD(5, "%s tail overwritten was %d need %d",
1655			kring->name,
1656			ring->tail, kring->rtail);
1657		ring->tail = kring->rtail;
1658	}
1659	return head;
1660
1661error:
1662	RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d",
1663		kring->nr_hwcur,
1664		kring->rcur, kring->nr_hwtail,
1665		kring->rhead, kring->rcur, ring->tail);
1666	return n;
1667}
1668
1669
1670/*
1671 * Error routine called when txsync/rxsync detects an error.
1672 * Can't do much more than resetting head = cur = hwcur, tail = hwtail
1673 * Return 1 on reinit.
1674 *
1675 * This routine is only called by the upper half of the kernel.
1676 * It only reads hwcur (which is changed only by the upper half, too)
1677 * and hwtail (which may be changed by the lower half, but only on
1678 * a tx ring and only to increase it, so any error will be recovered
1679 * on the next call). For the above, we don't strictly need to call
1680 * it under lock.
1681 */
1682int
1683netmap_ring_reinit(struct netmap_kring *kring)
1684{
1685	struct netmap_ring *ring = kring->ring;
1686	u_int i, lim = kring->nkr_num_slots - 1;
1687	int errors = 0;
1688
1689	// XXX KASSERT nm_kr_tryget
1690	RD(10, "called for %s", kring->name);
1691	// XXX probably wrong to trust userspace
1692	kring->rhead = ring->head;
1693	kring->rcur  = ring->cur;
1694	kring->rtail = ring->tail;
1695
1696	if (ring->cur > lim)
1697		errors++;
1698	if (ring->head > lim)
1699		errors++;
1700	if (ring->tail > lim)
1701		errors++;
1702	for (i = 0; i <= lim; i++) {
1703		u_int idx = ring->slot[i].buf_idx;
1704		u_int len = ring->slot[i].len;
1705		if (idx < 2 || idx >= netmap_total_buffers) {
1706			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1707			ring->slot[i].buf_idx = 0;
1708			ring->slot[i].len = 0;
1709		} else if (len > NETMAP_BUF_SIZE(kring->na)) {
1710			ring->slot[i].len = 0;
1711			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
1712		}
1713	}
1714	if (errors) {
1715		RD(10, "total %d errors", errors);
1716		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
1717			kring->name,
1718			ring->cur, kring->nr_hwcur,
1719			ring->tail, kring->nr_hwtail);
1720		ring->head = kring->rhead = kring->nr_hwcur;
1721		ring->cur  = kring->rcur  = kring->nr_hwcur;
1722		ring->tail = kring->rtail = kring->nr_hwtail;
1723	}
1724	return (errors ? 1 : 0);
1725}
1726
1727/* interpret the ringid and flags fields of an nmreq, by translating them
1728 * into a pair of intervals of ring indices:
1729 *
1730 * [priv->np_txqfirst, priv->np_txqlast) and
1731 * [priv->np_rxqfirst, priv->np_rxqlast)
1732 *
1733 */
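/*
 * Illustrative mappings (assuming a NIC with 4 tx and 4 rx hardware
 * rings plus the host rings):
 *
 *	NR_REG_ALL_NIC			-> tx [0,4)  rx [0,4)
 *	NR_REG_ONE_NIC, ring id 2	-> tx [2,3)  rx [2,3)
 *	NR_REG_SW			-> tx [4,5)  rx [4,5)	(host rings only)
 *	NR_REG_NIC_SW			-> tx [0,5)  rx [0,5)	(hw + host rings)
 */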
1734int
1735netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1736{
1737	struct netmap_adapter *na = priv->np_na;
1738	u_int j, i = ringid & NETMAP_RING_MASK;
1739	u_int reg = flags & NR_REG_MASK;
1740
1741	if (reg == NR_REG_DEFAULT) {
1742		/* convert from old ringid to flags */
1743		if (ringid & NETMAP_SW_RING) {
1744			reg = NR_REG_SW;
1745		} else if (ringid & NETMAP_HW_RING) {
1746			reg = NR_REG_ONE_NIC;
1747		} else {
1748			reg = NR_REG_ALL_NIC;
1749		}
1750		D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg);
1751	}
1752	switch (reg) {
1753	case NR_REG_ALL_NIC:
1754	case NR_REG_PIPE_MASTER:
1755	case NR_REG_PIPE_SLAVE:
1756		priv->np_txqfirst = 0;
1757		priv->np_txqlast = na->num_tx_rings;
1758		priv->np_rxqfirst = 0;
1759		priv->np_rxqlast = na->num_rx_rings;
1760		ND("%s %d %d", "ALL/PIPE",
1761			priv->np_rxqfirst, priv->np_rxqlast);
1762		break;
1763	case NR_REG_SW:
1764	case NR_REG_NIC_SW:
1765		if (!(na->na_flags & NAF_HOST_RINGS)) {
1766			D("host rings not supported");
1767			return EINVAL;
1768		}
1769		priv->np_txqfirst = (reg == NR_REG_SW ?
1770			na->num_tx_rings : 0);
1771		priv->np_txqlast = na->num_tx_rings + 1;
1772		priv->np_rxqfirst = (reg == NR_REG_SW ?
1773			na->num_rx_rings : 0);
1774		priv->np_rxqlast = na->num_rx_rings + 1;
1775		ND("%s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW",
1776			priv->np_rxqfirst, priv->np_rxqlast);
1777		break;
1778	case NR_REG_ONE_NIC:
1779		if (i >= na->num_tx_rings && i >= na->num_rx_rings) {
1780			D("invalid ring id %d", i);
1781			return EINVAL;
1782		}
1783		/* if not enough rings, use the first one */
1784		j = i;
1785		if (j >= na->num_tx_rings)
1786			j = 0;
1787		priv->np_txqfirst = j;
1788		priv->np_txqlast = j + 1;
1789		j = i;
1790		if (j >= na->num_rx_rings)
1791			j = 0;
1792		priv->np_rxqfirst = j;
1793		priv->np_rxqlast = j + 1;
1794		break;
1795	default:
1796		D("invalid regif type %d", reg);
1797		return EINVAL;
1798	}
1799	priv->np_flags = (flags & ~NR_REG_MASK) | reg;
1800
1801	if (netmap_verbose) {
1802		D("%s: tx [%d,%d) rx [%d,%d) id %d",
1803			na->name,
1804			priv->np_txqfirst,
1805			priv->np_txqlast,
1806			priv->np_rxqfirst,
1807			priv->np_rxqlast,
1808			i);
1809	}
1810	return 0;
1811}
1812
1813
1814/*
1815 * Set the ring ID. For devices with a single queue, a request
1816 * for all rings is the same as a single ring.
1817 */
1818static int
1819netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1820{
1821	struct netmap_adapter *na = priv->np_na;
1822	int error;
1823
1824	error = netmap_interp_ringid(priv, ringid, flags);
1825	if (error) {
1826		return error;
1827	}
1828
1829	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1830
1831	/* optimization: count the users registered for more than
1832	 * one ring, which are the ones sleeping on the global queue.
1833	 * The default netmap_notify() callback will then
1834	 * avoid signaling the global queue if nobody is using it
1835	 */
1836	if (nm_tx_si_user(priv))
1837		na->tx_si_users++;
1838	if (nm_rx_si_user(priv))
1839		na->rx_si_users++;
1840	return 0;
1841}
1842
1843/*
1844 * Possibly move the interface to netmap mode.
1845 * On success it returns a pointer to the netmap_if, otherwise NULL.
1846 * This must be called with NMG_LOCK held.
1847 *
1848 * The following na callbacks are called in the process:
1849 *
1850 * na->nm_config()			[by netmap_update_config]
1851 * (get current number and size of rings)
1852 *
1853 *  	We have a generic one for linux (netmap_linux_config).
1854 *  	The bwrap has to override this, since it has to forward
1855 *  	the request to the wrapped adapter (netmap_bwrap_config).
1856 *
1857 *    	XXX netmap_if_new calls this again (2014-03-15)
1858 *
1859 * na->nm_krings_create()		[by netmap_if_new]
1860 * (create and init the krings array)
1861 *
1862 * 	One of the following:
1863 *
1864 *	* netmap_hw_krings_create, 			(hw ports)
1865 *		creates the standard layout for the krings
1866 * 		and adds the mbq (used for the host rings).
1867 *
1868 * 	* netmap_vp_krings_create			(VALE ports)
1869 * 		add leases and scratchpads
1870 *
1871 * 	* netmap_pipe_krings_create			(pipes)
1872 * 		create the krings and rings of both ends and
1873 * 		cross-link them
1874 *
1875 *      * netmap_monitor_krings_create 			(monitors)
1876 *      	avoid allocating the mbq
1877 *
1878 *      * netmap_bwrap_krings_create			(bwraps)
1879 *      	create the bwrap krings array,
1880 *      	the krings array of the wrapped adapter, and
1881 *      	(if needed) the fake array for the host adapter
1882 *
1883 * na->nm_register(, 1)
1884 * (put the adapter in netmap mode)
1885 *
1886 * 	This may be one of the following:
1887 * 	(XXX these should be either all *_register or all *_reg 2014-03-15)
1888 *
1889 * 	* netmap_hw_register				(hw ports)
1890 * 		checks that the ifp is still there, then calls
1891 * 		the hardware specific callback;
1892 *
1893 * 	* netmap_vp_reg					(VALE ports)
1894 *		If the port is connected to a bridge,
1895 *		set the NAF_NETMAP_ON flag under the
1896 *		bridge write lock.
1897 *
1898 *	* netmap_pipe_reg				(pipes)
1899 *		inform the other pipe end that it is no
1900 *		longer responsible for the lifetime of this
1901 *		pipe end
1902 *
1903 *	* netmap_monitor_reg				(monitors)
1904 *		intercept the sync callbacks of the monitored
1905 *		rings
1906 *
1907 *	* netmap_bwrap_register				(bwraps)
1908 *		cross-link the bwrap and hwna rings,
1909 *		forward the request to the hwna, override
1910 *		the hwna notify callback (to get the frames
1911 *		coming from outside go through the bridge).
1912 *
1913 * XXX maybe netmap_if_new() should be merged with this (2014-03-15).
1914 *
1915 */
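/*
 * Call sequence sketch (simplified; it mirrors the NIOCREGIF handler
 * further below, with error handling omitted):
 *
 *	NMG_LOCK();
 *	error = netmap_get_na(nmr, &na, 1);	// get a reference
 *	nifp = netmap_do_regif(priv, na, nmr->nr_ringid,
 *		nmr->nr_flags, &error);
 *	NMG_UNLOCK();
 */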
1916struct netmap_if *
1917netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
1918	uint16_t ringid, uint32_t flags, int *err)
1919{
1920	struct netmap_if *nifp = NULL;
1921	int error, need_mem = 0;
1922
1923	NMG_LOCK_ASSERT();
1924	/* ring configuration may have changed, fetch from the card */
1925	netmap_update_config(na);
1926	priv->np_na = na;     /* store the reference */
1927	error = netmap_set_ringid(priv, ringid, flags);
1928	if (error)
1929		goto out;
1930	/* ensure allocators are ready */
1931	need_mem = !netmap_have_memory_locked(priv);
1932	if (need_mem) {
1933		error = netmap_get_memory_locked(priv);
1934		ND("get_memory returned %d", error);
1935		if (error)
1936			goto out;
1937	}
1938	/* Allocate a netmap_if and, if necessary, all the netmap_ring's */
1939	nifp = netmap_if_new(na);
1940	if (nifp == NULL) { /* allocation failed */
1941		error = ENOMEM;
1942		goto out;
1943	}
1944	na->active_fds++;
1945	if (!nm_netmap_on(na)) {
1946		/* Netmap not active, set the card in netmap mode
1947		 * and make it use the shared buffers.
1948		 */
1949		/* cache the allocator info in the na */
1950		na->na_lut = netmap_mem_get_lut(na->nm_mem);
1951		ND("%p->na_lut == %p", na, na->na_lut);
1952		na->na_lut_objtotal = netmap_mem_get_buftotal(na->nm_mem);
1953		na->na_lut_objsize = netmap_mem_get_bufsize(na->nm_mem);
1954		error = na->nm_register(na, 1); /* mode on */
1955		if (error) {
1956			netmap_do_unregif(priv, nifp);
1957			nifp = NULL;
1958		}
1959	}
1960out:
1961	*err = error;
1962	if (error) {
1963		/* we should drop the allocator, but only
1964		 * if we were the ones who grabbed it
1965		 */
1966		if (need_mem)
1967			netmap_drop_memory_locked(priv);
1968		priv->np_na = NULL;
1969	}
1970	if (nifp != NULL) {
1971		/*
1972		 * advertise that the interface is ready by setting np_nifp.
1973		 * The barrier is needed because readers (poll and *SYNC)
1974		 * check for priv->np_nifp != NULL without locking
1975		 */
1976		wmb(); /* make sure previous writes are visible to all CPUs */
1977		priv->np_nifp = nifp;
1978	}
1979	return nifp;
1980}
1981
1982
1983
1984/*
1985 * ioctl(2) support for the "netmap" device.
1986 *
1987 * Following a list of accepted commands:
1988 * - NIOCGINFO
1989 * - SIOCGIFADDR	just for convenience
1990 * - NIOCREGIF
1991 * - NIOCTXSYNC
1992 * - NIOCRXSYNC
1993 *
1994 * Return 0 on success, errno otherwise.
1995 */
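/*
 * Typical userspace usage (a minimal sketch, assuming <net/netmap_user.h>
 * and an interface named "em0"; error checking omitted):
 *
 *	struct nmreq req;
 *	int fd = open("/dev/netmap", O_RDWR);
 *
 *	bzero(&req, sizeof(req));
 *	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	req.nr_version = NETMAP_API;
 *	ioctl(fd, NIOCREGIF, &req);		// bind fd to the interface
 *	void *mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
 *		MAP_SHARED, fd, 0);
 *	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
 *	ioctl(fd, NIOCTXSYNC, NULL);		// flush pending transmissions
 */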
1996int
1997netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
1998	int fflag, struct thread *td)
1999{
2000	struct netmap_priv_d *priv = NULL;
2001	struct nmreq *nmr = (struct nmreq *) data;
2002	struct netmap_adapter *na = NULL;
2003	int error;
2004	u_int i, qfirst, qlast;
2005	struct netmap_if *nifp;
2006	struct netmap_kring *krings;
2007
2008	(void)dev;	/* UNUSED */
2009	(void)fflag;	/* UNUSED */
2010
2011	if (cmd == NIOCGINFO || cmd == NIOCREGIF) {
2012		/* truncate name */
2013		nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';
2014		if (nmr->nr_version != NETMAP_API) {
2015			D("API mismatch for %s got %d need %d",
2016				nmr->nr_name,
2017				nmr->nr_version, NETMAP_API);
2018			nmr->nr_version = NETMAP_API;
2019		}
2020		if (nmr->nr_version < NETMAP_MIN_API ||
2021		    nmr->nr_version > NETMAP_MAX_API) {
2022			return EINVAL;
2023		}
2024	}
2025	CURVNET_SET(TD_TO_VNET(td));
2026
2027	error = devfs_get_cdevpriv((void **)&priv);
2028	if (error) {
2029		CURVNET_RESTORE();
2030		/* XXX ENOENT should be impossible, since the priv
2031		 * is now created in the open */
2032		return (error == ENOENT ? ENXIO : error);
2033	}
2034
2035	switch (cmd) {
2036	case NIOCGINFO:		/* return capabilities etc */
2037		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
2038			error = netmap_bdg_ctl(nmr, NULL);
2039			break;
2040		}
2041
2042		NMG_LOCK();
2043		do {
2044			/* memsize is always valid */
2045			struct netmap_mem_d *nmd = &nm_mem;
2046			u_int memflags;
2047
2048			if (nmr->nr_name[0] != '\0') {
2049				/* get a refcount */
2050				error = netmap_get_na(nmr, &na, 1 /* create */);
2051				if (error)
2052					break;
2053				nmd = na->nm_mem; /* get memory allocator */
2054			}
2055
2056			error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags,
2057				&nmr->nr_arg2);
2058			if (error)
2059				break;
2060			if (na == NULL) /* only memory info */
2061				break;
2062			nmr->nr_offset = 0;
2063			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
2064			netmap_update_config(na);
2065			nmr->nr_rx_rings = na->num_rx_rings;
2066			nmr->nr_tx_rings = na->num_tx_rings;
2067			nmr->nr_rx_slots = na->num_rx_desc;
2068			nmr->nr_tx_slots = na->num_tx_desc;
2069			netmap_adapter_put(na);
2070		} while (0);
2071		NMG_UNLOCK();
2072		break;
2073
2074	case NIOCREGIF:
2075		/* possibly attach/detach NIC and VALE switch */
2076		i = nmr->nr_cmd;
2077		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH
2078				|| i == NETMAP_BDG_VNET_HDR
2079				|| i == NETMAP_BDG_NEWIF
2080				|| i == NETMAP_BDG_DELIF) {
2081			error = netmap_bdg_ctl(nmr, NULL);
2082			break;
2083		} else if (i != 0) {
2084			D("nr_cmd must be 0 not %d", i);
2085			error = EINVAL;
2086			break;
2087		}
2088
2089		/* protect access to priv from concurrent NIOCREGIF */
2090		NMG_LOCK();
2091		do {
2092			u_int memflags;
2093
2094			if (priv->np_na != NULL) {	/* thread already registered */
2095				error = EBUSY;
2096				break;
2097			}
2098			/* find the interface and a reference */
2099			error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */
2100			if (error)
2101				break;
2102			if (NETMAP_OWNED_BY_KERN(na)) {
2103				netmap_adapter_put(na);
2104				error = EBUSY;
2105				break;
2106			}
2107			nifp = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags, &error);
2108			if (!nifp) {    /* reg. failed, release priv and ref */
2109				netmap_adapter_put(na);
2110				priv->np_nifp = NULL;
2111				break;
2112			}
2113			priv->np_td = td; // XXX kqueue, debugging only
2114
2115			/* return the offset of the netmap_if object */
2116			nmr->nr_rx_rings = na->num_rx_rings;
2117			nmr->nr_tx_rings = na->num_tx_rings;
2118			nmr->nr_rx_slots = na->num_rx_desc;
2119			nmr->nr_tx_slots = na->num_tx_desc;
2120			error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags,
2121				&nmr->nr_arg2);
2122			if (error) {
2123				netmap_adapter_put(na);
2124				break;
2125			}
2126			if (memflags & NETMAP_MEM_PRIVATE) {
2127				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2128			}
2129			priv->np_txsi = (priv->np_txqlast - priv->np_txqfirst > 1) ?
2130				&na->tx_si : &na->tx_rings[priv->np_txqfirst].si;
2131			priv->np_rxsi = (priv->np_rxqlast - priv->np_rxqfirst > 1) ?
2132				&na->rx_si : &na->rx_rings[priv->np_rxqfirst].si;
2133
2134			if (nmr->nr_arg3) {
2135				D("requested %d extra buffers", nmr->nr_arg3);
2136				nmr->nr_arg3 = netmap_extra_alloc(na,
2137					&nifp->ni_bufs_head, nmr->nr_arg3);
2138				D("got %d extra buffers", nmr->nr_arg3);
2139			}
2140			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2141		} while (0);
2142		NMG_UNLOCK();
2143		break;
2144
2145	case NIOCTXSYNC:
2146	case NIOCRXSYNC:
2147		nifp = priv->np_nifp;
2148
2149		if (nifp == NULL) {
2150			error = ENXIO;
2151			break;
2152		}
2153		rmb(); /* make sure following reads are not from cache */
2154
2155		na = priv->np_na;      /* we have a reference */
2156
2157		if (na == NULL) {
2158			D("Internal error: nifp != NULL && na == NULL");
2159			error = ENXIO;
2160			break;
2161		}
2162
2163		if (!nm_netmap_on(na)) {
2164			error = ENXIO;
2165			break;
2166		}
2167
2168		if (cmd == NIOCTXSYNC) {
2169			krings = na->tx_rings;
2170			qfirst = priv->np_txqfirst;
2171			qlast = priv->np_txqlast;
2172		} else {
2173			krings = na->rx_rings;
2174			qfirst = priv->np_rxqfirst;
2175			qlast = priv->np_rxqlast;
2176		}
2177
2178		for (i = qfirst; i < qlast; i++) {
2179			struct netmap_kring *kring = krings + i;
2180			if (nm_kr_tryget(kring)) {
2181				error = EBUSY;
2182				goto out;
2183			}
2184			if (cmd == NIOCTXSYNC) {
2185				if (netmap_verbose & NM_VERB_TXSYNC)
2186					D("pre txsync ring %d cur %d hwcur %d",
2187					    i, kring->ring->cur,
2188					    kring->nr_hwcur);
2189				if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
2190					netmap_ring_reinit(kring);
2191				} else {
2192					kring->nm_sync(kring, NAF_FORCE_RECLAIM);
2193				}
2194				if (netmap_verbose & NM_VERB_TXSYNC)
2195					D("post txsync ring %d cur %d hwcur %d",
2196					    i, kring->ring->cur,
2197					    kring->nr_hwcur);
2198			} else {
2199				kring->nm_sync(kring, NAF_FORCE_READ);
2200				microtime(&na->rx_rings[i].ring->ts);
2201			}
2202			nm_kr_put(kring);
2203		}
2204
2205		break;
2206
2207	case NIOCCONFIG:
2208		error = netmap_bdg_config(nmr);
2209		break;
2210#ifdef __FreeBSD__
2211	case FIONBIO:
2212	case FIOASYNC:
2213		ND("FIONBIO/FIOASYNC are no-ops");
2214		break;
2215
2216	case BIOCIMMEDIATE:
2217	case BIOCGHDRCMPLT:
2218	case BIOCSHDRCMPLT:
2219	case BIOCSSEESENT:
2220		D("ignore BIOCIMMEDIATE/BIOCGHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
2221		break;
2222
2223	default:	/* allow device-specific ioctls */
2224	    {
2225		struct socket so;
2226		struct ifnet *ifp;
2227
2228		bzero(&so, sizeof(so));
2229		NMG_LOCK();
2230		error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */
2231		if (error) {
2232			netmap_adapter_put(na);
2233			NMG_UNLOCK();
2234			break;
2235		}
2236		ifp = na->ifp;
2237		so.so_vnet = ifp->if_vnet;
2238		// so->so_proto not null.
2239		error = ifioctl(&so, cmd, data, td);
2240		netmap_adapter_put(na);
2241		NMG_UNLOCK();
2242		break;
2243	    }
2244
2245#else /* linux */
2246	default:
2247		error = EOPNOTSUPP;
2248#endif /* linux */
2249	}
2250out:
2251
2252	CURVNET_RESTORE();
2253	return (error);
2254}
2255
2256
2257/*
2258 * select(2) and poll(2) handlers for the "netmap" device.
2259 *
2260 * Can be called for one or more queues.
2261 * Return the event mask corresponding to ready events.
2262 * If there are no ready events, do a selrecord on either the individual
2263 * selinfo or on the global one.
2264 * Device-dependent parts (locking and sync of tx/rx rings)
2265 * are done through callbacks.
2266 *
2267 * On linux, the arguments are really pwait, the poll table, and 'td' is a struct file *.
2268 * The first one is remapped to pwait because selrecord() uses the name as a
2269 * hidden argument.
2270 */
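/*
 * Userspace counterpart (a minimal sketch using the netmap_user.h
 * helpers; 'fd' and 'nifp' are assumed to come from a previous
 * NIOCREGIF/mmap sequence as sketched above netmap_ioctl()):
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	struct netmap_ring *ring = NETMAP_RXRING(nifp, 0);
 *
 *	poll(&pfd, 1, -1);
 *	while (!nm_ring_empty(ring)) {
 *		uint32_t i = ring->cur;
 *		char *buf = NETMAP_BUF(ring, ring->slot[i].buf_idx);
 *		// ... process ring->slot[i].len bytes at buf ...
 *		ring->head = ring->cur = nm_ring_next(ring, i);
 *	}
 */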
2271int
2272netmap_poll(struct cdev *dev, int events, struct thread *td)
2273{
2274	struct netmap_priv_d *priv = NULL;
2275	struct netmap_adapter *na;
2276	struct netmap_kring *kring;
2277	u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0;
2278	struct mbq q;		/* packets from hw queues to host stack */
2279	void *pwait = dev;	/* linux compatibility */
2280	int is_kevent = 0;
2281
2282	/*
2283	 * In order to avoid nested locks, we need to "double check"
2284	 * txsync and rxsync if we decide to do a selrecord().
2285	 * retry_tx (and retry_rx, later) prevent looping forever.
2286	 */
2287	int retry_tx = 1, retry_rx = 1;
2288
2289	(void)pwait;
2290	mbq_init(&q);
2291
2292	/*
2293	 * XXX kevent has curthread->td_fpop == NULL,
2294	 * so devfs_get_cdevpriv() fails. We circumvent this by passing
2295	 * priv as the first argument, which is also useful to avoid
2296	 * the selrecord() which are not necessary in that case.
2297	 */
2298	if (devfs_get_cdevpriv((void **)&priv) != 0) {
2299		is_kevent = 1;
2300		if (netmap_verbose)
2301			D("called from kevent");
2302		priv = (struct netmap_priv_d *)dev;
2303	}
2304	if (priv == NULL)
2305		return POLLERR;
2306
2307	if (priv->np_nifp == NULL) {
2308		D("No if registered");
2309		return POLLERR;
2310	}
2311	rmb(); /* make sure following reads are not from cache */
2312
2313	na = priv->np_na;
2314
2315	if (!nm_netmap_on(na))
2316		return POLLERR;
2317
2318	if (netmap_verbose & 0x8000)
2319		D("device %s events 0x%x", na->name, events);
2320	want_tx = events & (POLLOUT | POLLWRNORM);
2321	want_rx = events & (POLLIN | POLLRDNORM);
2322
2323
2324	/*
2325	 * check_all_{tx|rx} are set if the card has more than one queue AND
2326	 * the file descriptor is bound to all of them. If so, we sleep on
2327	 * the "global" selinfo, otherwise we sleep on individual selinfo
2328	 * (FreeBSD only allows two selinfo's per file descriptor).
2329	 * The interrupt routine in the driver wakes one or the other
2330	 * (or both) depending on which clients are active.
2331	 *
2332	 * rxsync() is only called if we run out of buffers on a POLLIN.
2333	 * txsync() is called if we run out of buffers on POLLOUT, or
2334	 * there are pending packets to send. The latter can be disabled
2335	 * by passing NETMAP_NO_TX_POLL in the NIOCREGIF call.
2336	 */
2337	check_all_tx = nm_tx_si_user(priv);
2338	check_all_rx = nm_rx_si_user(priv);
2339
2340	/*
2341	 * We start with a lock free round which is cheap if we have
2342	 * slots available. If this fails, then lock and call the sync
2343	 * routines.
2344	 */
2345	for (i = priv->np_rxqfirst; want_rx && i < priv->np_rxqlast; i++) {
2346		kring = &na->rx_rings[i];
2347		/* XXX compare ring->cur and kring->tail */
2348		if (!nm_ring_empty(kring->ring)) {
2349			revents |= want_rx;
2350			want_rx = 0;	/* also breaks the loop */
2351		}
2352	}
2353	for (i = priv->np_txqfirst; want_tx && i < priv->np_txqlast; i++) {
2354		kring = &na->tx_rings[i];
2355		/* XXX compare ring->cur and kring->tail */
2356		if (!nm_ring_empty(kring->ring)) {
2357			revents |= want_tx;
2358			want_tx = 0;	/* also breaks the loop */
2359		}
2360	}
2361
2362	/*
2363	 * If we want to push packets out (priv->np_txpoll) or
2364	 * want_tx is still set, we must issue txsync calls
2365	 * (on all rings, to prevent the tx rings from stalling).
2366	 * XXX should also check cur != hwcur on the tx rings.
2367	 * Fortunately, normal tx mode has np_txpoll set.
2368	 */
2369	if (priv->np_txpoll || want_tx) {
2370		/*
2371		 * The first round checks if anyone is ready, if not
2372		 * do a selrecord and another round to handle races.
2373		 * want_tx goes to 0 if any space is found, and is
2374		 * used to skip rings with no pending transmissions.
2375		 */
2376flush_tx:
2377		for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) {
2378			int found = 0;
2379
2380			kring = &na->tx_rings[i];
2381			if (!want_tx && kring->ring->cur == kring->nr_hwcur)
2382				continue;
2383			/* only one thread does txsync */
2384			if (nm_kr_tryget(kring)) {
2385				/* either busy or stopped
2386				 * XXX if the ring is stopped, sleeping would
2387				 * be better. In current code, however, we only
2388				 * stop the rings for brief intervals (2014-03-14)
2389				 */
2390				if (netmap_verbose)
2391					RD(2, "%p lost race on txring %d, ok",
2392					    priv, i);
2393				continue;
2394			}
2395			if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
2396				netmap_ring_reinit(kring);
2397				revents |= POLLERR;
2398			} else {
2399				if (kring->nm_sync(kring, 0))
2400					revents |= POLLERR;
2401			}
2402
2403			/*
2404			 * If we found new slots, notify potential
2405			 * listeners on the same ring.
2406			 * Since we just did a txsync, look at the copies
2407			 * of cur,tail in the kring.
2408			 */
2409			found = kring->rcur != kring->rtail;
2410			nm_kr_put(kring);
2411			if (found) { /* notify other listeners */
2412				revents |= want_tx;
2413				want_tx = 0;
2414				na->nm_notify(na, i, NR_TX, 0);
2415			}
2416		}
2417		if (want_tx && retry_tx && !is_kevent) {
2418			selrecord(td, check_all_tx ?
2419			    &na->tx_si : &na->tx_rings[priv->np_txqfirst].si);
2420			retry_tx = 0;
2421			goto flush_tx;
2422		}
2423	}
2424
2425	/*
2426	 * If want_rx is still set scan receive rings.
2427	 * Do it on all rings because otherwise we starve.
2428	 */
2429	if (want_rx) {
2430		int send_down = 0; /* transparent mode */
2431		/* two rounds here for race avoidance */
2432do_retry_rx:
2433		for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) {
2434			int found = 0;
2435
2436			kring = &na->rx_rings[i];
2437
2438			if (nm_kr_tryget(kring)) {
2439				if (netmap_verbose)
2440					RD(2, "%p lost race on rxring %d, ok",
2441					    priv, i);
2442				continue;
2443			}
2444
2445			/*
2446			 * transparent mode support: collect packets
2447			 * from the rxring(s).
2448			 * XXX NR_FORWARD should only be read on
2449			 * physical or NIC ports
2450			 */
2451			if (netmap_fwd || (kring->ring->flags & NR_FORWARD)) {
2452				ND(10, "forwarding some buffers up %d to %d",
2453				    kring->nr_hwcur, kring->ring->cur);
2454				netmap_grab_packets(kring, &q, netmap_fwd);
2455			}
2456
2457			if (kring->nm_sync(kring, 0))
2458				revents |= POLLERR;
2459			if (netmap_no_timestamp == 0 ||
2460					kring->ring->flags & NR_TIMESTAMP) {
2461				microtime(&kring->ring->ts);
2462			}
2463			/* after an rxsync we can use kring->rcur, rtail */
2464			found = kring->rcur != kring->rtail;
2465			nm_kr_put(kring);
2466			if (found) {
2467				revents |= want_rx;
2468				retry_rx = 0;
2469				na->nm_notify(na, i, NR_RX, 0);
2470			}
2471		}
2472
2473		/* transparent mode XXX only during first pass ? */
2474		if (na->na_flags & NAF_HOST_RINGS) {
2475			kring = &na->rx_rings[na->num_rx_rings];
2476			if (check_all_rx
2477			    && (netmap_fwd || kring->ring->flags & NR_FORWARD)) {
2478				/* XXX fix to use kring fields */
2479				if (nm_ring_empty(kring->ring))
2480					send_down = netmap_rxsync_from_host(na, td, dev);
2481				if (!nm_ring_empty(kring->ring))
2482					revents |= want_rx;
2483			}
2484		}
2485
2486		if (retry_rx && !is_kevent)
2487			selrecord(td, check_all_rx ?
2488			    &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si);
2489		if (send_down > 0 || retry_rx) {
2490			retry_rx = 0;
2491			if (send_down)
2492				goto flush_tx; /* and retry_rx */
2493			else
2494				goto do_retry_rx;
2495		}
2496	}
2497
2498	/*
2499	 * Transparent mode: marked bufs on rx rings between
2500	 * kring->nr_hwcur and ring->head
2501	 * are passed to the other endpoint.
2502	 *
2503	 * In this mode we also scan the sw rxring, which in
2504	 * turn passes packets up.
2505	 *
2506	 * XXX Transparent mode at the moment requires binding all
2507	 * rings to a single file descriptor.
2508	 */
2509
2510	if (q.head && na->ifp != NULL)
2511		netmap_send_up(na->ifp, &q);
2512
2513	return (revents);
2514}
2515
2516
2517/*-------------------- driver support routines -------------------*/
2518
2519static int netmap_hw_krings_create(struct netmap_adapter *);
2520
2521/* default notify callback */
2522static int
2523netmap_notify(struct netmap_adapter *na, u_int n_ring,
2524	enum txrx tx, int flags)
2525{
2526	struct netmap_kring *kring;
2527
2528	if (tx == NR_TX) {
2529		kring = na->tx_rings + n_ring;
2530		OS_selwakeup(&kring->si, PI_NET);
2531		/* optimization: avoid a wake up on the global
2532		 * queue if nobody has registered for more
2533		 * than one ring
2534		 */
2535		if (na->tx_si_users > 0)
2536			OS_selwakeup(&na->tx_si, PI_NET);
2537	} else {
2538		kring = na->rx_rings + n_ring;
2539		OS_selwakeup(&kring->si, PI_NET);
2540		/* optimization: same as above */
2541		if (na->rx_si_users > 0)
2542			OS_selwakeup(&na->rx_si, PI_NET);
2543	}
2544	return 0;
2545}
2546
2547
2548/* called by all routines that create netmap_adapters.
2549 * Attach na to the ifp (if any) and provide defaults
2550 * for optional callbacks. Defaults assume that we
2551 * are creating a hardware netmap_adapter.
2552 */
2553int
2554netmap_attach_common(struct netmap_adapter *na)
2555{
2556	struct ifnet *ifp = na->ifp;
2557
2558	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
2559		D("%s: invalid rings tx %d rx %d",
2560			na->name, na->num_tx_rings, na->num_rx_rings);
2561		return EINVAL;
2562	}
2563	/* ifp is NULL for virtual adapters (bwrap, non-persistent VALE ports,
2564	 * pipes, monitors). For bwrap we actually have a non-null ifp for
2565	 * use by the external modules, but that is set after this
2566	 * function has been called.
2567	 * XXX this is ugly, maybe split this function in two (2014-03-14)
2568	 */
2569	if (ifp != NULL) {
2570		WNA(ifp) = na;
2571
2572	/* the following is only needed for na that use the host port.
2573	 * XXX do we have something similar for linux ?
2574	 */
2575#ifdef __FreeBSD__
2576		na->if_input = ifp->if_input; /* for netmap_send_up */
2577#endif /* __FreeBSD__ */
2578
2579		NETMAP_SET_CAPABLE(ifp);
2580	}
2581	if (na->nm_krings_create == NULL) {
2582		/* we assume that we have been called by a driver,
2583		 * since other port types all provide their own
2584		 * nm_krings_create
2585		 */
2586		na->nm_krings_create = netmap_hw_krings_create;
2587		na->nm_krings_delete = netmap_hw_krings_delete;
2588	}
2589	if (na->nm_notify == NULL)
2590		na->nm_notify = netmap_notify;
2591	na->active_fds = 0;
2592
2593	if (na->nm_mem == NULL)
2594		/* use the global allocator */
2595		na->nm_mem = &nm_mem;
2596	if (na->nm_bdg_attach == NULL)
2597		/* no special nm_bdg_attach callback. On VALE
2598		 * attach, we need to interpose a bwrap
2599		 */
2600		na->nm_bdg_attach = netmap_bwrap_attach;
2601	return 0;
2602}
2603
2604
2605/* standard cleanup, called by all destructors */
2606void
2607netmap_detach_common(struct netmap_adapter *na)
2608{
2609	if (na->ifp != NULL)
2610		WNA(na->ifp) = NULL; /* XXX do we need this? */
2611
2612	if (na->tx_rings) { /* XXX should not happen */
2613		D("freeing leftover tx_rings");
2614		na->nm_krings_delete(na);
2615	}
2616	netmap_pipe_dealloc(na);
2617	if (na->na_flags & NAF_MEM_OWNER)
2618		netmap_mem_private_delete(na->nm_mem);
2619	bzero(na, sizeof(*na));
2620	free(na, M_DEVBUF);
2621}
2622
2623/* Wrapper for the register callback provided by hardware drivers.
2624 * na->ifp == NULL means that the driver module has been
2625 * unloaded, so we cannot call into it.
2626 * Note that module unloading, in our patched linux drivers,
2627 * happens under NMG_LOCK and after having stopped all the
2628 * nic rings (see netmap_detach). This provides sufficient
2629 * protection for the other driver-provided callbacks
2630 * (i.e., nm_config and nm_*xsync), which therefore don't need
2631 * to be wrapped.
2632 */
2633static int
2634netmap_hw_register(struct netmap_adapter *na, int onoff)
2635{
2636	struct netmap_hw_adapter *hwna =
2637		(struct netmap_hw_adapter*)na;
2638
2639	if (na->ifp == NULL)
2640		return onoff ? ENXIO : 0;
2641
2642	return hwna->nm_hw_register(na, onoff);
2643}
2644
2645
2646/*
2647 * Initialize a ``netmap_adapter`` object created by driver on attach.
2648 * We allocate a block of memory with room for a struct netmap_adapter
2649 * plus two sets of N+2 struct netmap_kring (where N is the number
2650 * of hardware rings):
2651 * krings	0..N-1	are for the hardware queues.
2652 * kring	N	is for the host stack queue
2653 * kring	N+1	is only used for the selinfo for all queues. // XXX still true ?
2654 * Return 0 on success, ENOMEM otherwise.
2655 */
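/*
 * Attach sketch for a hypothetical driver "foo" (illustrative only;
 * the foo_netmap_* callbacks and the 'adapter' softc fields are
 * assumptions, not part of this module):
 *
 *	struct netmap_adapter na;
 *
 *	bzero(&na, sizeof(na));
 *	na.ifp = adapter->ifp;
 *	na.num_tx_desc = adapter->num_tx_desc;
 *	na.num_rx_desc = adapter->num_rx_desc;
 *	na.nm_txsync = foo_netmap_txsync;
 *	na.nm_rxsync = foo_netmap_rxsync;
 *	na.nm_register = foo_netmap_reg;
 *	na.num_tx_rings = na.num_rx_rings = adapter->num_queues;
 *	netmap_attach(&na);
 */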
2656int
2657netmap_attach(struct netmap_adapter *arg)
2658{
2659	struct netmap_hw_adapter *hwna = NULL;
2660	// XXX when is arg == NULL ?
2661	struct ifnet *ifp = arg ? arg->ifp : NULL;
2662
2663	if (arg == NULL || ifp == NULL)
2664		goto fail;
2665	hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
2666	if (hwna == NULL)
2667		goto fail;
2668	hwna->up = *arg;
2669	hwna->up.na_flags |= NAF_HOST_RINGS;
2670	strncpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
2671	hwna->nm_hw_register = hwna->up.nm_register;
2672	hwna->up.nm_register = netmap_hw_register;
2673	if (netmap_attach_common(&hwna->up)) {
2674		free(hwna, M_DEVBUF);
2675		goto fail;
2676	}
2677	netmap_adapter_get(&hwna->up);
2678
2679#ifdef linux
2680	if (ifp->netdev_ops) {
2681		/* prepare a clone of the netdev ops */
2682#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
2683		hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
2684#else
2685		hwna->nm_ndo = *ifp->netdev_ops;
2686#endif
2687	}
2688	hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
2689	if (ifp->ethtool_ops) {
2690		hwna->nm_eto = *ifp->ethtool_ops;
2691	}
2692	hwna->nm_eto.set_ringparam = linux_netmap_set_ringparam;
2693#ifdef ETHTOOL_SCHANNELS
2694	hwna->nm_eto.set_channels = linux_netmap_set_channels;
2695#endif
2696	if (arg->nm_config == NULL) {
2697		hwna->up.nm_config = netmap_linux_config;
2698	}
2699#endif /* linux */
2700
2701	D("success for %s tx %d/%d rx %d/%d queues/slots",
2702		hwna->up.name,
2703		hwna->up.num_tx_rings, hwna->up.num_tx_desc,
2704		hwna->up.num_rx_rings, hwna->up.num_rx_desc
2705		);
2706	return 0;
2707
2708fail:
2709	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
2710	if (ifp)
2711		netmap_detach(ifp);
2712	return (hwna ? EINVAL : ENOMEM);
2713}
2714
2715
2716void
2717NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
2718{
2719	if (!na) {
2720		return;
2721	}
2722
2723	refcount_acquire(&na->na_refcount);
2724}
2725
2726
2727/* returns 1 iff the netmap_adapter is destroyed */
2728int
2729NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
2730{
2731	if (!na)
2732		return 1;
2733
2734	if (!refcount_release(&na->na_refcount))
2735		return 0;
2736
2737	if (na->nm_dtor)
2738		na->nm_dtor(na);
2739
2740	netmap_detach_common(na);
2741
2742	return 1;
2743}
2744
2745/* nm_krings_create callback for all hardware native adapters */
2746int
2747netmap_hw_krings_create(struct netmap_adapter *na)
2748{
2749	int ret = netmap_krings_create(na, 0);
2750	if (ret == 0) {
2751		/* initialize the mbq for the sw rx ring */
2752		mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue);
2753		ND("initialized sw rx queue %d", na->num_rx_rings);
2754	}
2755	return ret;
2756}
2757
2758
2759
2760/*
2761 * Called on module unload by the netmap-enabled drivers
2762 */
2763void
2764netmap_detach(struct ifnet *ifp)
2765{
2766	struct netmap_adapter *na = NA(ifp);
2767
2768	if (!na)
2769		return;
2770
2771	NMG_LOCK();
2772	netmap_disable_all_rings(ifp);
2773	if (!netmap_adapter_put(na)) {
2774		/* someone is still using the adapter,
2775		 * tell them that the interface is gone
2776		 */
2777		na->ifp = NULL;
2778		// XXX also clear NAF_NATIVE_ON ?
2779		na->na_flags &= ~NAF_NETMAP_ON;
2780		/* give them a chance to notice */
2781		netmap_enable_all_rings(ifp);
2782	}
2783	NMG_UNLOCK();
2784}
2785
2786
2787/*
2788 * Intercept packets from the network stack and pass them
2789 * to netmap as incoming packets on the 'software' ring.
2790 *
2791 * We only store packets in a bounded mbq and then copy them
2792 * in the relevant rxsync routine.
2793 *
2794 * We rely on the OS to make sure that the ifp and na do not go
2795 * away (typically the caller checks for IFF_DRV_RUNNING or the like).
2796 * In nm_register() or whenever there is a reinitialization,
2797 * we make sure to make the mode change visible here.
2798 */
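/*
 * Illustrative sketch of how a transmit routine can divert host-stack
 * traffic here while the NIC is in netmap mode (hypothetical driver
 * "foo"; on most platforms the OS glue installs this hook itself):
 *
 *	int
 *	foo_transmit(struct ifnet *ifp, struct mbuf *m)
 *	{
 *		if (nm_netmap_on(NA(ifp)))
 *			return netmap_transmit(ifp, m);
 *		// ... regular transmit path ...
 *	}
 */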
2799int
2800netmap_transmit(struct ifnet *ifp, struct mbuf *m)
2801{
2802	struct netmap_adapter *na = NA(ifp);
2803	struct netmap_kring *kring;
2804	u_int len = MBUF_LEN(m);
2805	u_int error = ENOBUFS;
2806	struct mbq *q;
2807	int space;
2808
2809	// XXX [Linux] we do not need this lock
2810	// if we follow the down/configure/up protocol -gl
2811	// mtx_lock(&na->core_lock);
2812
2813	if (!nm_netmap_on(na)) {
2814		D("%s not in netmap mode anymore", na->name);
2815		error = ENXIO;
2816		goto done;
2817	}
2818
2819	kring = &na->rx_rings[na->num_rx_rings];
2820	q = &kring->rx_queue;
2821
2822	// XXX reconsider long packets if we handle fragments
2823	if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
2824		D("%s from_host, drop packet size %d > %d", na->name,
2825			len, NETMAP_BUF_SIZE(na));
2826		goto done;
2827	}
2828
2829	/* protect against rxsync_from_host(), netmap_sw_to_nic()
2830	 * and maybe other instances of netmap_transmit (the latter
2831	 * not possible on Linux).
2832	 * Also avoid overflowing the queue.
2833	 */
2834	mbq_lock(q);
2835
2836	space = kring->nr_hwtail - kring->nr_hwcur;
2837	if (space < 0)
2838		space += kring->nkr_num_slots;
2839	if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX
2840		RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p",
2841			na->name, kring->nr_hwcur, kring->nr_hwtail, mbq_len(q),
2842			len, m);
2843	} else {
2844		mbq_enqueue(q, m);
2845		ND(10, "%s %d bufs in queue len %d m %p",
2846			na->name, mbq_len(q), len, m);
2847		/* notify outside the lock */
2848		m = NULL;
2849		error = 0;
2850	}
2851	mbq_unlock(q);
2852
2853done:
2854	if (m)
2855		m_freem(m);
2856	/* unconditionally wake up listeners */
2857	na->nm_notify(na, na->num_rx_rings, NR_RX, 0);
2858	/* this is normally netmap_notify(), but for nics
2859	 * connected to a bridge it is netmap_bwrap_intr_notify(),
2860	 * that possibly forwards the frames through the switch
2861	 */
2862
2863	return (error);
2864}
2865
2866
2867/*
2868 * netmap_reset() is called by the driver routines when reinitializing
2869 * a ring. The driver is in charge of locking to protect the kring.
2870 * If native netmap mode is not set just return NULL.
2871 */
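/*
 * Typical use from a driver's ring initialization path (a sketch for
 * a hypothetical driver "foo"; 'ring_nr' is the hardware ring index):
 *
 *	struct netmap_slot *slot = netmap_reset(na, NR_TX, ring_nr, 0);
 *
 *	if (slot != NULL) {
 *		// ring is in netmap mode: point the NIC descriptors
 *		// at the netmap buffers, e.g. obtained via NMB(na, slot + j)
 *	}
 */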
2872struct netmap_slot *
2873netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
2874	u_int new_cur)
2875{
2876	struct netmap_kring *kring;
2877	int new_hwofs, lim;
2878
2879	if (!nm_native_on(na)) {
2880		ND("interface not in native netmap mode");
2881		return NULL;	/* nothing to reinitialize */
2882	}
2883
2884	/* XXX note: in the new scheme, we are not guaranteed to be
2885	 * under lock (e.g. when called on a device reset).
2886	 * In this case, we should set a flag and not trust the
2887	 * values too much. In practice: TODO
2888	 * - set a RESET flag somewhere in the kring
2889	 * - do the processing in a conservative way
2890	 * - let the *sync() fixup at the end.
2891	 */
2892	if (tx == NR_TX) {
2893		if (n >= na->num_tx_rings)
2894			return NULL;
2895		kring = na->tx_rings + n;
2896		// XXX check whether we should use hwcur or rcur
2897		new_hwofs = kring->nr_hwcur - new_cur;
2898	} else {
2899		if (n >= na->num_rx_rings)
2900			return NULL;
2901		kring = na->rx_rings + n;
2902		new_hwofs = kring->nr_hwtail - new_cur;
2903	}
2904	lim = kring->nkr_num_slots - 1;
2905	if (new_hwofs > lim)
2906		new_hwofs -= lim + 1;
2907
2908	/* Always set the new offset value and realign the ring. */
2909	if (netmap_verbose)
2910	    D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
2911		na->name,
2912		tx == NR_TX ? "TX" : "RX", n,
2913		kring->nkr_hwofs, new_hwofs,
2914		kring->nr_hwtail,
2915		tx == NR_TX ? lim : kring->nr_hwtail);
2916	kring->nkr_hwofs = new_hwofs;
2917	if (tx == NR_TX) {
2918		kring->nr_hwtail = kring->nr_hwcur + lim;
2919		if (kring->nr_hwtail > lim)
2920			kring->nr_hwtail -= lim + 1;
2921	}
2922
2923#if 0 // def linux
2924	/* XXX check that the mappings are correct */
2925	/* need ring_nr, adapter->pdev, direction */
2926	buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE);
2927	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
2928		D("error mapping rx netmap buffer %d", i);
2929		// XXX fix error handling
2930	}
2931
2932#endif /* linux */
2933	/*
2934	 * Wakeup on the individual and global selwait
2935	 * We do the wakeup here, but the ring is not yet reconfigured.
2936	 * However, we are under lock so there are no races.
2937	 */
2938	na->nm_notify(na, n, tx, 0);
2939	return kring->ring->slot;
2940}
2941
2942
2943/*
2944 * Dispatch rx/tx interrupts to the netmap rings.
2945 *
2946 * "work_done" is non-null on the RX path, NULL for the TX path.
2947 * We rely on the OS to make sure that there is only one active
2948 * instance per queue, and that there is appropriate locking.
2949 *
2950 * The 'notify' routine depends on what the ring is attached to.
2951 * - for a netmap file descriptor, do a selwakeup on the individual
2952 *   waitqueue, plus one on the global one if needed
2953 *   (see netmap_notify)
2954 * - for a nic connected to a switch, call the proper forwarding routine
2955 *   (see netmap_bwrap_intr_notify)
2956 */
2957void
2958netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2959{
2960	struct netmap_adapter *na = NA(ifp);
2961	struct netmap_kring *kring;
2962
2963	q &= NETMAP_RING_MASK;
2964
2965	if (netmap_verbose) {
2966		RD(5, "received %s queue %d", work_done ? "RX" : "TX", q);
2967	}
2968
2969	if (work_done) { /* RX path */
2970		if (q >= na->num_rx_rings)
2971			return;	// not a physical queue
2972		kring = na->rx_rings + q;
2973		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
2974		na->nm_notify(na, q, NR_RX, 0);
2975		*work_done = 1; /* do not fire napi again */
2976	} else { /* TX path */
2977		if (q >= na->num_tx_rings)
2978			return;	// not a physical queue
2979		kring = na->tx_rings + q;
2980		na->nm_notify(na, q, NR_TX, 0);
2981	}
2982}
2983
2984
2985/*
2986 * Default functions to handle rx/tx interrupts from a physical device.
2987 * "work_done" is non-null on the RX path, NULL for the TX path.
2988 *
2989 * If the card is not in netmap mode, simply return 0,
2990 * so that the caller proceeds with regular processing.
2991 * Otherwise call netmap_common_irq() and return 1.
2992 *
2993 * If the card is connected to a netmap file descriptor,
2994 * do a selwakeup on the individual queue, plus one on the global one
2995 * if needed (multiqueue card _and_ there are multiqueue listeners),
2996 * and return 1.
2997 *
2998 * Finally, if called on rx from an interface connected to a switch,
2999 * calls the proper forwarding routine, and return 1.
3000 */
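/*
 * Usage sketch from a driver RX interrupt handler (illustrative only;
 * on the TX path the same call is made with work_done == NULL):
 *
 *	if (netmap_rx_irq(ifp, ring_nr, &work_done))
 *		return;		// netmap consumed the interrupt
 */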
3001int
3002netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
3003{
3004	struct netmap_adapter *na = NA(ifp);
3005
3006	/*
3007	 * XXX emulated netmap mode sets NAF_SKIP_INTR so
3008	 * we still use the regular driver even though the previous
3009	 * check fails. It is unclear whether we should use
3010	 * nm_native_on() here.
3011	 */
3012	if (!nm_netmap_on(na))
3013		return 0;
3014
3015	if (na->na_flags & NAF_SKIP_INTR) {
3016		ND("use regular interrupt");
3017		return 0;
3018	}
3019
3020	netmap_common_irq(ifp, q, work_done);
3021	return 1;
3022}
3023
3024
3025/*
3026 * Module loader and unloader
3027 *
3028 * netmap_init() creates the /dev/netmap device and initializes
3029 * all global variables. Returns 0 on success, errno on failure
3030 * (though failure is unlikely in practice).
3031 *
3032 * netmap_fini() destroys everything.
3033 */
3034
3035static struct cdev *netmap_dev; /* /dev/netmap character device. */
3036extern struct cdevsw netmap_cdevsw;
3037
3038
3039void
3040netmap_fini(void)
3041{
3042	// XXX destroy_bridges() ?
3043	if (netmap_dev)
3044		destroy_dev(netmap_dev);
3045	netmap_mem_fini();
3046	NMG_LOCK_DESTROY();
3047	printf("netmap: unloaded module.\n");
3048}
3049
3050
3051int
3052netmap_init(void)
3053{
3054	int error;
3055
3056	NMG_LOCK_INIT();
3057
3058	error = netmap_mem_init();
3059	if (error != 0)
3060		goto fail;
3061	/* XXX could use make_dev_credv() to get error number */
3062	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
3063			      "netmap");
3064	if (!netmap_dev)
3065		goto fail;
3066
3067	netmap_init_bridges();
3068#ifdef __FreeBSD__
3069	nm_vi_init_index();
3070#endif
3071	printf("netmap: loaded module\n");
3072	return (0);
3073fail:
3074	netmap_fini();
3075	return (EINVAL); /* may be incorrect */
3076}
3077