1/*
2 * Copyright (C) 2018 jingle YANG. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 *   1. Redistributions of source code must retain the above copyright
9 *      notice, this list of conditions and the following disclaimer.
10 *   2. Redistributions in binary form must reproduce the above copyright
11 *      notice, this list of conditions and the following disclaimer in the
12 *      documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS''AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27/*
28Date: Dec 16, 2018
29
30Description:
311. Pcap-dpdk provides libpcap the ability to use DPDK with the device name as dpdk:{portid}, such as dpdk:0.
322. DPDK is a set of libraries and drivers for fast packet processing. (https://www.dpdk.org/)
333. The testprogs/capturetest provides 6.4Gbps/800,000 pps on Intel 10-Gigabit X540-AT2 with DPDK 18.11.
34
35Limitations:
1. DPDK support is enabled if DPDK is available. If DPDK was installed manually, pass its directory with --with-dpdk[=DIR] to ./configure or with -DDPDK_DIR[=DIR] to cmake.
2. Only dynamic linking against libdpdk.so is supported, because libdpdk.a does not work correctly.
3. Only the read operation is supported; packet injection is not supported yet.
39
40Usage:
411. Compile DPDK as shared library and install.(https://github.com/DPDK/dpdk.git)
42
43You shall modify the file $RTE_SDK/$RTE_TARGET/.config and set:
44CONFIG_RTE_BUILD_SHARED_LIB=y
45By the following command:
46sed -i 's/CONFIG_RTE_BUILD_SHARED_LIB=n/CONFIG_RTE_BUILD_SHARED_LIB=y/' $RTE_SDK/$RTE_TARGET/.config
47
482. Launch l2fwd that is one of DPDK examples correctly, and get device information.
49
50You shall learn how to bind nic with DPDK-compatible driver by $RTE_SDK/usertools/dpdk-devbind.py, such as igb_uio.
51And enable hugepages by dpdk-setup.sh
52
53Then launch the l2fwd with dynamic driver support. For example:
54$RTE_SDK/examples/l2fwd/$RTE_TARGET/l2fwd -dlibrte_pmd_e1000.so -dlibrte_pmd_ixgbe.so -dlibrte_mempool_ring.so -- -p 0x1
55
563. Compile libpcap with dpdk options.
57
If DPDK is not found automatically, export the DPDK environment variables that were used when compiling DPDK, and then pass $RTE_SDK/$RTE_TARGET to --with-dpdk or -DDPDK_DIR.
59
60export RTE_SDK={your DPDK base directory}
61export RTE_TARGET={your target name}
62
633.1 With configure
64
65./configure --with-dpdk=$RTE_SDK/$RTE_TARGET && make -s all && make -s testprogs && make install
66
673.2 With cmake
68
69mkdir -p build && cd build && cmake -DDPDK_DIR=$RTE_SDK/$RTE_TARGET ../ && make -s all && make -s testprogs && make install
70
714. Link your own program with libpcap, and use DPDK with the device name as dpdk:{portid}, such as dpdk:0.
You shall set the DPDK configuration options through the environment variable DPDK_CFG.
For example, testprogs/capturetest could be launched by:
74
75env DPDK_CFG="--log-level=debug -l0 -dlibrte_pmd_e1000.so -dlibrte_pmd_ixgbe.so -dlibrte_mempool_ring.so" ./capturetest -i dpdk:0
76*/
77
78#ifdef HAVE_CONFIG_H
79#include <config.h>
80#endif
81
82#include <errno.h>
83#include <netdb.h>
84#include <stdio.h>
85#include <stdlib.h>
86#include <string.h>
87#include <unistd.h>
88#include <limits.h> /* for INT_MAX */
89#include <time.h>
90
91#include <sys/time.h>
92
93//header for calling dpdk
94#include <rte_config.h>
95#include <rte_common.h>
96#include <rte_errno.h>
97#include <rte_log.h>
98#include <rte_malloc.h>
99#include <rte_memory.h>
100#include <rte_eal.h>
101#include <rte_launch.h>
102#include <rte_atomic.h>
103#include <rte_cycles.h>
104#include <rte_lcore.h>
105#include <rte_per_lcore.h>
106#include <rte_branch_prediction.h>
107#include <rte_interrupts.h>
108#include <rte_random.h>
109#include <rte_debug.h>
110#include <rte_ether.h>
111#include <rte_ethdev.h>
112#include <rte_mempool.h>
113#include <rte_mbuf.h>
114#include <rte_bus.h>
115
116#include "pcap-int.h"
117#include "pcap-dpdk.h"
118
/*
 * Deal with API changes that break source compatibility.
 *
 * ETHER_ADDR_TYPE is the Ethernet address structure of the DPDK release
 * being built against; it is selected by the configure-time check that
 * defines HAVE_STRUCT_RTE_ETHER_ADDR.
 */

#ifdef HAVE_STRUCT_RTE_ETHER_ADDR
#define ETHER_ADDR_TYPE	struct rte_ether_addr
#else
#define ETHER_ADDR_TYPE	struct ether_addr
#endif

// Global DPDK log level installed by dpdk_pre_init().
#define DPDK_DEF_LOG_LEV RTE_LOG_ERR
//
// This is set to 0 if we haven't initialized DPDK yet, 1 if we've
// successfully initialized it, a negative value, which is the negative
// of the rte_errno from rte_eal_init(), if we tried to initialize it
// and got an error.
//
static int is_dpdk_pre_inited=0;
// First token (the "program name") of the argument vector given to rte_eal_init().
#define DPDK_LIB_NAME "libpcap_dpdk"
// Leading text of the description of every device reported by pcap_dpdk_findalldevs().
#define DPDK_DESC "Data Plane Development Kit (DPDK) Interface"
// Message logged when EAL initialization fails with EACCES.
#define DPDK_ERR_PERM_MSG "permission denied, DPDK needs root permission"
// Capacity of the EAL argument vector, including the terminating NULL entry.
#define DPDK_ARGC_MAX 64
// Size of dpdk_cfg_buf, the buffer holding the EAL command line.
#define DPDK_CFG_MAX_LEN 1024
// Buffer sizes for a device name ("dpdk:N") and for a device description.
#define DPDK_DEV_NAME_MAX 32
#define DPDK_DEV_DESC_MAX 512
// Name of the environment variable carrying the EAL options.
#define DPDK_CFG_ENV_NAME "DPDK_CFG"
// Polling granularity, in milliseconds, used while waiting for packets and
// advertised as the required select timeout.
#define DPDK_DEF_MIN_SLEEP_MS 1
// Holds "<DPDK_LIB_NAME> <options>"; parse_dpdk_cfg() splits it in place,
// and the resulting argument pointers point into this buffer.
static char dpdk_cfg_buf[DPDK_CFG_MAX_LEN];
// Size of the buffers that hold a MAC address in text form.
#define DPDK_MAC_ADDR_SIZE 32
// Text used by eth_addr_str() when no address is supplied.
#define DPDK_DEF_MAC_ADDR "00:00:00:00:00:00"
// Size of the buffers that receive the port name (shown as "PCI:" in logs).
#define DPDK_PCI_ADDR_SIZE 16
// EAL options used when the DPDK_CFG environment variable is unset.
#define DPDK_DEF_CFG "--log-level=error -l0 -dlibrte_pmd_e1000.so -dlibrte_pmd_ixgbe.so -dlibrte_mempool_ring.so"
// Prefix that identifies a DPDK device name, e.g. "dpdk:0".
#define DPDK_PREFIX "dpdk:"
// Sentinel meaning "no valid port id"; valid port ids are strictly below it.
#define DPDK_PORTID_MAX 65535U
// Names handed to DPDK for the mbuf pool and the TX buffer allocation.
#define MBUF_POOL_NAME "mbuf_pool"
#define DPDK_TX_BUF_NAME "tx_buffer"
//The number of elements in the mbuf pool.
#define DPDK_NB_MBUFS 8192U
// Cache size argument passed to rte_pktmbuf_pool_create().
#define MEMPOOL_CACHE_SIZE 256
// Maximum number of packets fetched by one rte_eth_rx_burst() call.
#define MAX_PKT_BURST 32
// Configurable number of RX/TX ring descriptors
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 1024

// Passed by address to rte_eth_dev_adjust_nb_rx_tx_desc() during activation,
// so they may be modified from the defaults above; shared by all handles.
static uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT;
static uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;

// Size of the per-handle scratch buffer used to linearize multi-segment
// packets; the macro name of the jumbo frame length depends on the DPDK release.
#ifdef RTE_ETHER_MAX_JUMBO_FRAME_LEN
#define RTE_ETH_PCAP_SNAPLEN RTE_ETHER_MAX_JUMBO_FRAME_LEN
#else
#define RTE_ETH_PCAP_SNAPLEN ETHER_MAX_JUMBO_FRAME_LEN
#endif

// TX buffer allocated in pcap_dpdk_activate(); file-scope, so it is shared
// by (and overwritten for) every activated handle.
static struct rte_eth_dev_tx_buffer *tx_buffer;
173
// Reference point used to turn DPDK timer cycles into wall-clock timestamps.
// Filled in by dpdk_init_timer() and read by calculate_timestamp().
struct dpdk_ts_helper{
	struct timeval start_time; // gettimeofday() value taken at initialization
	uint64_t start_cycles; // rte_get_timer_cycles() value taken at initialization
	uint64_t hz; // timer frequency from rte_get_timer_hz(); 0 is rejected at initialization
};
// Per-handle private state of a DPDK capture (stored in pcap_t's priv pointer).
struct pcap_dpdk{
	pcap_t * orig; // back pointer to the owning pcap_t, set on activation
	uint16_t portid; // portid of DPDK
	int must_clear_promisc; // non-zero if activation enabled promiscuous mode; close disables it again
	uint64_t bpf_drop; // number of packets rejected by the BPF filter
	int nonblock; // non-zero when the handle is in non-blocking mode
	struct timeval required_select_timeout; // storage for the select timeout exported through pcap_t
	struct timeval prev_ts; // timestamp of the previous statistics sample
	struct rte_eth_stats prev_stats; // port counters of the previous statistics sample
	struct timeval curr_ts; // timestamp of the latest statistics sample
	struct rte_eth_stats curr_stats; // port counters of the latest statistics sample
	uint64_t pps; // packets per second between the last two samples
	uint64_t bps; // bits per second between the last two samples
	struct rte_mempool * pktmbuf_pool; // mbuf pool backing the RX queue
	struct dpdk_ts_helper ts_helper; // cycle-counter to wall-clock conversion state
	ETHER_ADDR_TYPE eth_addr; // MAC address of the port, binary form
	char mac_addr[DPDK_MAC_ADDR_SIZE]; // MAC address of the port, text form
	char pci_addr[DPDK_PCI_ADDR_SIZE]; // port name as returned by rte_eth_dev_get_name_by_port()
	unsigned char pcap_tmp_buf[RTE_ETH_PCAP_SNAPLEN]; // scratch buffer for linearizing multi-segment packets
};
199
// Template Ethernet port configuration. pcap_dpdk_activate() copies it into a
// local variable and may add TX offload flags to the copy before configuring
// the port; all members not named here are zero-initialized.
static struct rte_eth_conf port_conf = {
	.rxmode = {
		.split_hdr_size = 0,
	},
	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
};
208
209static void	dpdk_fmt_errmsg_for_rte_errno(char *, size_t, int,
210    PCAP_FORMAT_STRING(const char *), ...) PCAP_PRINTFLIKE(4, 5);
211
212/*
213 * Generate an error message based on a format, arguments, and an
214 * rte_errno, with a message for the rte_errno after the formatted output.
215 */
216static void dpdk_fmt_errmsg_for_rte_errno(char *errbuf, size_t errbuflen,
217    int errnum, const char *fmt, ...)
218{
219	va_list ap;
220	size_t msglen;
221	char *p;
222	size_t errbuflen_remaining;
223
224	va_start(ap, fmt);
225	vsnprintf(errbuf, errbuflen, fmt, ap);
226	va_end(ap);
227	msglen = strlen(errbuf);
228
229	/*
230	 * Do we have enough space to append ": "?
231	 * Including the terminating '\0', that's 3 bytes.
232	 */
233	if (msglen + 3 > errbuflen) {
234		/* No - just give them what we've produced. */
235		return;
236	}
237	p = errbuf + msglen;
238	errbuflen_remaining = errbuflen - msglen;
239	*p++ = ':';
240	*p++ = ' ';
241	*p = '\0';
242	msglen += 2;
243	errbuflen_remaining -= 2;
244
245	/*
246	 * Now append the string for the error code.
247	 * rte_strerror() is thread-safe, at least as of dpdk 18.11,
248	 * unlike strerror() - it uses strerror_r() rather than strerror()
249	 * for UN*X errno values, and prints to what I assume is a per-thread
250	 * buffer (based on the "PER_LCORE" in "RTE_DEFINE_PER_LCORE" used
251	 * to declare the buffers statically) for DPDK errors.
252	 */
253	snprintf(p, errbuflen_remaining, "%s", rte_strerror(errnum));
254}
255
256static int dpdk_init_timer(struct pcap_dpdk *pd){
257	gettimeofday(&(pd->ts_helper.start_time),NULL);
258	pd->ts_helper.start_cycles = rte_get_timer_cycles();
259	pd->ts_helper.hz = rte_get_timer_hz();
260	if (pd->ts_helper.hz == 0){
261		return -1;
262	}
263	return 0;
264}
265static inline void calculate_timestamp(struct dpdk_ts_helper *helper,struct timeval *ts)
266{
267	uint64_t cycles;
268	// delta
269	struct timeval cur_time;
270	cycles = rte_get_timer_cycles() - helper->start_cycles;
271	cur_time.tv_sec = (time_t)(cycles/helper->hz);
272	cur_time.tv_usec = (suseconds_t)((cycles%helper->hz)*1e6/helper->hz);
273	timeradd(&(helper->start_time), &cur_time, ts);
274}
275
276static uint32_t dpdk_gather_data(unsigned char *data, uint32_t len, struct rte_mbuf *mbuf)
277{
278	uint32_t total_len = 0;
279	while (mbuf && (total_len+mbuf->data_len) < len ){
280		rte_memcpy(data+total_len, rte_pktmbuf_mtod(mbuf,void *),mbuf->data_len);
281		total_len+=mbuf->data_len;
282		mbuf=mbuf->next;
283	}
284	return total_len;
285}
286
287
288static int dpdk_read_with_timeout(pcap_t *p, struct rte_mbuf **pkts_burst, const uint16_t burst_cnt){
289	struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv);
290	int nb_rx = 0;
291	int timeout_ms = p->opt.timeout;
292	int sleep_ms = 0;
293	if (pd->nonblock){
294		// In non-blocking mode, just read once, no matter how many packets are captured.
295		nb_rx = (int)rte_eth_rx_burst(pd->portid, 0, pkts_burst, burst_cnt);
296	}else{
297		// In blocking mode, read many times until packets are captured or timeout or break_loop is set.
298		// if timeout_ms == 0, it may be blocked forever.
299		while (timeout_ms == 0 || sleep_ms < timeout_ms){
300			nb_rx = (int)rte_eth_rx_burst(pd->portid, 0, pkts_burst, burst_cnt);
301			if (nb_rx){ // got packets within timeout_ms
302				break;
303			}else{ // no packet arrives at this round.
304				if (p->break_loop){
305					break;
306				}
307				// sleep for a very short while.
308				// block sleep is the only choice, since usleep() will impact performance dramatically.
309				rte_delay_us_block(DPDK_DEF_MIN_SLEEP_MS*1000);
310				sleep_ms += DPDK_DEF_MIN_SLEEP_MS;
311			}
312		}
313	}
314	return nb_rx;
315}
316
317static int pcap_dpdk_dispatch(pcap_t *p, int max_cnt, pcap_handler cb, u_char *cb_arg)
318{
319	struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv);
320	int burst_cnt = 0;
321	int nb_rx = 0;
322	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
323	struct rte_mbuf *m;
324	struct pcap_pkthdr pcap_header;
325	// In DPDK, pkt_len is sum of lengths for all segments. And data_len is for one segment
326	uint32_t pkt_len = 0;
327	uint32_t caplen = 0;
328	u_char *bp = NULL;
329	int i=0;
330	unsigned int gather_len =0;
331	int pkt_cnt = 0;
332	u_char *large_buffer=NULL;
333	int timeout_ms = p->opt.timeout;
334
335	/*
336	 * This can conceivably process more than INT_MAX packets,
337	 * which would overflow the packet count, causing it either
338	 * to look like a negative number, and thus cause us to
339	 * return a value that looks like an error, or overflow
340	 * back into positive territory, and thus cause us to
341	 * return a too-low count.
342	 *
343	 * Therefore, if the packet count is unlimited, we clip
344	 * it at INT_MAX; this routine is not expected to
345	 * process packets indefinitely, so that's not an issue.
346	 */
347	if (PACKET_COUNT_IS_UNLIMITED(max_cnt))
348		max_cnt = INT_MAX;
349
350	if (max_cnt < MAX_PKT_BURST){
351		burst_cnt = max_cnt;
352	}else{
353		burst_cnt = MAX_PKT_BURST;
354	}
355
356	while( pkt_cnt < max_cnt){
357		if (p->break_loop){
358			p->break_loop = 0;
359			return PCAP_ERROR_BREAK;
360		}
361		// read once in non-blocking mode, or try many times waiting for timeout_ms.
362		// if timeout_ms == 0, it will be blocked until one packet arrives or break_loop is set.
363		nb_rx = dpdk_read_with_timeout(p, pkts_burst, burst_cnt);
364		if (nb_rx == 0){
365			if (pd->nonblock){
366				RTE_LOG(DEBUG, USER1, "dpdk: no packets available in non-blocking mode.\n");
367			}else{
368				if (p->break_loop){
369					RTE_LOG(DEBUG, USER1, "dpdk: no packets available and break_loop is set in blocking mode.\n");
370					p->break_loop = 0;
371					return PCAP_ERROR_BREAK;
372
373				}
374				RTE_LOG(DEBUG, USER1, "dpdk: no packets available for timeout %d ms in blocking mode.\n", timeout_ms);
375			}
376			// break if dpdk reads 0 packet, no matter in blocking(timeout) or non-blocking mode.
377			break;
378		}
379		pkt_cnt += nb_rx;
380		for ( i = 0; i < nb_rx; i++) {
381			m = pkts_burst[i];
382			calculate_timestamp(&(pd->ts_helper),&(pcap_header.ts));
383			pkt_len = rte_pktmbuf_pkt_len(m);
384			// caplen = min(pkt_len, p->snapshot);
385			// caplen will not be changed, no matter how long the rte_pktmbuf
386			caplen = pkt_len < (uint32_t)p->snapshot ? pkt_len: (uint32_t)p->snapshot;
387			pcap_header.caplen = caplen;
388			pcap_header.len = pkt_len;
389			// volatile prefetch
390			rte_prefetch0(rte_pktmbuf_mtod(m, void *));
391			bp = NULL;
392			if (m->nb_segs == 1)
393			{
394				bp = rte_pktmbuf_mtod(m, u_char *);
395			}else{
396				// use fast buffer pcap_tmp_buf if pkt_len is small, no need to call malloc and free
397				if ( pkt_len <= RTE_ETH_PCAP_SNAPLEN)
398				{
399					gather_len = dpdk_gather_data(pd->pcap_tmp_buf, RTE_ETH_PCAP_SNAPLEN, m);
400					bp = pd->pcap_tmp_buf;
401				}else{
402					// need call free later
403					large_buffer = (u_char *)malloc(caplen*sizeof(u_char));
404					gather_len = dpdk_gather_data(large_buffer, caplen, m);
405					bp = large_buffer;
406				}
407
408			}
409			if (bp){
410				if (p->fcode.bf_insns==NULL || pcap_filter(p->fcode.bf_insns, bp, pcap_header.len, pcap_header.caplen)){
411					cb(cb_arg, &pcap_header, bp);
412				}else{
413					pd->bpf_drop++;
414				}
415			}
416			//free all pktmbuf
417			rte_pktmbuf_free(m);
418			if (large_buffer){
419				free(large_buffer);
420				large_buffer=NULL;
421			}
422		}
423	}
424	return pkt_cnt;
425}
426
427static int pcap_dpdk_inject(pcap_t *p, const void *buf _U_, int size _U_)
428{
429	//not implemented yet
430	pcap_strlcpy(p->errbuf,
431	    "dpdk error: Inject function has not been implemented yet",
432	    PCAP_ERRBUF_SIZE);
433	return PCAP_ERROR;
434}
435
436static void pcap_dpdk_close(pcap_t *p)
437{
438	struct pcap_dpdk *pd = p->priv;
439	if (pd==NULL)
440	{
441		return;
442	}
443	if (pd->must_clear_promisc)
444	{
445		rte_eth_promiscuous_disable(pd->portid);
446	}
447	rte_eth_dev_stop(pd->portid);
448	rte_eth_dev_close(pd->portid);
449	pcap_cleanup_live_common(p);
450}
451
452static void nic_stats_display(struct pcap_dpdk *pd)
453{
454	uint16_t portid = pd->portid;
455	struct rte_eth_stats stats;
456	rte_eth_stats_get(portid, &stats);
457	RTE_LOG(INFO,USER1, "portid:%d, RX-packets: %-10"PRIu64"  RX-errors:  %-10"PRIu64
458	       "  RX-bytes:  %-10"PRIu64"  RX-Imissed:  %-10"PRIu64"\n", portid, stats.ipackets, stats.ierrors,
459	       stats.ibytes,stats.imissed);
460	RTE_LOG(INFO,USER1, "portid:%d, RX-PPS: %-10"PRIu64" RX-Mbps: %.2lf\n", portid, pd->pps, pd->bps/1e6f );
461}
462
463static int pcap_dpdk_stats(pcap_t *p, struct pcap_stat *ps)
464{
465	struct pcap_dpdk *pd = p->priv;
466	calculate_timestamp(&(pd->ts_helper), &(pd->curr_ts));
467	rte_eth_stats_get(pd->portid,&(pd->curr_stats));
468	if (ps){
469		ps->ps_recv = pd->curr_stats.ipackets;
470		ps->ps_drop = pd->curr_stats.ierrors;
471		ps->ps_drop += pd->bpf_drop;
472		ps->ps_ifdrop = pd->curr_stats.imissed;
473	}
474	uint64_t delta_pkt = pd->curr_stats.ipackets - pd->prev_stats.ipackets;
475	struct timeval delta_tm;
476	timersub(&(pd->curr_ts),&(pd->prev_ts), &delta_tm);
477	uint64_t delta_usec = delta_tm.tv_sec*1e6+delta_tm.tv_usec;
478	uint64_t delta_bit = (pd->curr_stats.ibytes-pd->prev_stats.ibytes)*8;
479	RTE_LOG(DEBUG, USER1, "delta_usec: %-10"PRIu64" delta_pkt: %-10"PRIu64" delta_bit: %-10"PRIu64"\n", delta_usec, delta_pkt, delta_bit);
480	pd->pps = (uint64_t)(delta_pkt*1e6f/delta_usec);
481	pd->bps = (uint64_t)(delta_bit*1e6f/delta_usec);
482	nic_stats_display(pd);
483	pd->prev_stats = pd->curr_stats;
484	pd->prev_ts = pd->curr_ts;
485	return 0;
486}
487
488static int pcap_dpdk_setnonblock(pcap_t *p, int nonblock){
489	struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv);
490	pd->nonblock = nonblock;
491	return 0;
492}
493
494static int pcap_dpdk_getnonblock(pcap_t *p){
495	struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv);
496	return pd->nonblock;
497}
498static int check_link_status(uint16_t portid, struct rte_eth_link *plink)
499{
500	// wait up to 9 seconds to get link status
501	rte_eth_link_get(portid, plink);
502	return plink->link_status == ETH_LINK_UP;
503}
504static void eth_addr_str(ETHER_ADDR_TYPE *addrp, char* mac_str, int len)
505{
506	int offset=0;
507	if (addrp == NULL){
508		snprintf(mac_str, len-1, DPDK_DEF_MAC_ADDR);
509		return;
510	}
511	for (int i=0; i<6; i++)
512	{
513		if (offset >= len)
514		{ // buffer overflow
515			return;
516		}
517		if (i==0)
518		{
519			snprintf(mac_str+offset, len-1-offset, "%02X",addrp->addr_bytes[i]);
520			offset+=2; // FF
521		}else{
522			snprintf(mac_str+offset, len-1-offset, ":%02X", addrp->addr_bytes[i]);
523			offset+=3; // :FF
524		}
525	}
526	return;
527}
528// return portid by device name, otherwise return -1
529static uint16_t portid_by_device(char * device)
530{
531	uint16_t ret = DPDK_PORTID_MAX;
532	int len = strlen(device);
533	int prefix_len = strlen(DPDK_PREFIX);
534	unsigned long ret_ul = 0L;
535	char *pEnd;
536	if (len<=prefix_len || strncmp(device, DPDK_PREFIX, prefix_len)) // check prefix dpdk:
537	{
538		return ret;
539	}
540	//check all chars are digital
541	for (int i=prefix_len; device[i]; i++){
542		if (device[i]<'0' || device[i]>'9'){
543			return ret;
544		}
545	}
546	ret_ul = strtoul(&(device[prefix_len]), &pEnd, 10);
547	if (pEnd == &(device[prefix_len]) || *pEnd != '\0'){
548		return ret;
549	}
550	// too large for portid
551	if (ret_ul >= DPDK_PORTID_MAX){
552		return ret;
553	}
554	ret = (uint16_t)ret_ul;
555	return ret;
556}
557
558static int parse_dpdk_cfg(char* dpdk_cfg,char** dargv)
559{
560	int cnt=0;
561	memset(dargv,0,sizeof(dargv[0])*DPDK_ARGC_MAX);
562	//current process name
563	int skip_space = 1;
564	int i=0;
565	RTE_LOG(INFO, USER1,"dpdk cfg: %s\n",dpdk_cfg);
566	// find first non space char
567	// The last opt is NULL
568	for (i=0;dpdk_cfg[i] && cnt<DPDK_ARGC_MAX-1;i++){
569		if (skip_space && dpdk_cfg[i]!=' '){ // not space
570			skip_space=!skip_space; // skip normal char
571			dargv[cnt++] = dpdk_cfg+i;
572		}
573		if (!skip_space && dpdk_cfg[i]==' '){ // fint a space
574			dpdk_cfg[i]=0x00; // end of this opt
575			skip_space=!skip_space; // skip space char
576		}
577	}
578	dargv[cnt]=NULL;
579	return cnt;
580}
581
// Initialize the DPDK EAL. rte_eal_init() is called at most once per
// process; the outcome is cached in is_dpdk_pre_inited and later calls
// just report that cached outcome again.
//
// Parameters:
//
//    ebuf: buffer of at least PCAP_ERRBUF_SIZE bytes that receives an
//    error message when a PCAP_ERROR_ code is returned;
//
//    eaccess_not_fatal: see below.
//
// Returns:
//
//    1 on success;
//
//    0 if "the EAL cannot initialize on this system", which we treat as
//    meaning "DPDK isn't available";
//
//    a PCAP_ERROR_ code for other errors.
//
// If eaccess_not_fatal is non-zero, treat "a permissions issue" the way
// we treat "the EAL cannot initialize on this system".  We use that
// when trying to find DPDK devices, as we don't want to fail to return
// *any* devices just because we can't support DPDK; when we're trying
// to open a device, we need to return a permissions error in that case.
static int dpdk_pre_init(char * ebuf, int eaccess_not_fatal)
{
	int dargv_cnt=0;
	char *dargv[DPDK_ARGC_MAX];
	char *ptr_dpdk_cfg = NULL;
	int ret;
	// is_dpdk_pre_inited is a file-scope variable; non-zero means
	// rte_eal_init() has already been attempted.
	if (is_dpdk_pre_inited != 0)
	{
		// already inited; did that succeed?
		if (is_dpdk_pre_inited < 0)
		{
			// failed; report the saved error again
			goto error;
		}
		else
		{
			// succeeded
			return 1;
		}
	}
	// init EAL: the options come from the DPDK_CFG environment variable
	ptr_dpdk_cfg = getenv(DPDK_CFG_ENV_NAME);
	// set the global DPDK log level to the default, DPDK_DEF_LOG_LEV
	rte_log_set_global_level(DPDK_DEF_LOG_LEV);
	if (ptr_dpdk_cfg == NULL)
	{
		RTE_LOG(INFO,USER1,"env $DPDK_CFG is unset, so using default: %s\n",DPDK_DEF_CFG);
		ptr_dpdk_cfg = DPDK_DEF_CFG;
	}
	// Build "<program name> <options>" in the static buffer; options that
	// do not fit are silently truncated. parse_dpdk_cfg() then splits the
	// buffer in place, so dargv points into dpdk_cfg_buf.
	memset(dpdk_cfg_buf,0,sizeof(dpdk_cfg_buf));
	snprintf(dpdk_cfg_buf,DPDK_CFG_MAX_LEN-1,"%s %s",DPDK_LIB_NAME,ptr_dpdk_cfg);
	dargv_cnt = parse_dpdk_cfg(dpdk_cfg_buf,dargv);
	ret = rte_eal_init(dargv_cnt,dargv);
	if (ret == -1)
	{
		// Indicate that we've called rte_eal_init() by setting
		// is_dpdk_pre_inited to the negative of the error code,
		// and process the error.
		is_dpdk_pre_inited = -rte_errno;
		goto error;
	}
	// init succeeded, so we do not need to do it again later.
	is_dpdk_pre_inited = 1;
	return 1;

error:
	// Translate the saved rte_errno into a return value and, for fatal
	// errors, a message in ebuf. The quoted phrases below are the
	// descriptions of these error codes in the rte_eal_init()
	// documentation.
	switch (-is_dpdk_pre_inited)
	{
		case EACCES:
			// This "indicates a permissions issue.".
			RTE_LOG(ERR, USER1, "%s\n", DPDK_ERR_PERM_MSG);
			// If we were told to treat this as just meaning
			// DPDK isn't available, do so.
			if (eaccess_not_fatal)
				return 0;
			// Otherwise report a fatal error.
			snprintf(ebuf, PCAP_ERRBUF_SIZE,
			    "DPDK requires that it run as root");
			return PCAP_ERROR_PERM_DENIED;

		case EAGAIN:
			// This "indicates either a bus or system
			// resource was not available, setup may
			// be attempted again."
			// There's no such error in pcap, so I'm
			// not sure what we should do here.
			snprintf(ebuf, PCAP_ERRBUF_SIZE,
			    "Bus or system resource was not available");
			break;

		case EALREADY:
			// This "indicates that the rte_eal_init
			// function has already been called, and
			// cannot be called again."
			// That's not an error; set the "we've
			// been here before" flag and return
			// success.
			is_dpdk_pre_inited = 1;
			return 1;

		case EFAULT:
			// This "indicates the tailq configuration
			// name was not found in memory configuration."
			snprintf(ebuf, PCAP_ERRBUF_SIZE,
			    "The tailq configuration name was not found in the memory configuration");
			return PCAP_ERROR;

		case EINVAL:
			// This "indicates invalid parameters were
			// passed as argv/argc."  In this file the
			// parameters come from the DPDK_CFG environment
			// variable or from DPDK_DEF_CFG, even though
			// the message below speaks of a configuration
			// file.
			snprintf(ebuf, PCAP_ERRBUF_SIZE,
			    "The configuration file has invalid parameters");
			break;

		case ENOMEM:
			// This "indicates failure likely caused by
			// an out-of-memory condition."
			snprintf(ebuf, PCAP_ERRBUF_SIZE,
			    "Out of memory");
			break;

		case ENODEV:
			// This "indicates memory setup issues."
			snprintf(ebuf, PCAP_ERRBUF_SIZE,
			    "An error occurred setting up memory");
			break;

		case ENOTSUP:
			// This "indicates that the EAL cannot
			// initialize on this system."  We treat
			// that as meaning DPDK isn't available
			// on this machine, rather than as a
			// fatal error, and let our caller decide
			// whether that's a fatal error (if trying
			// to activate a DPDK device) or not (if
			// trying to enumerate devices).
			return 0;

		case EPROTO:
			// This "indicates that the PCI bus is
			// either not present, or is not readable
			// by the eal."  Does "the PCI bus is not
			// present" mean "this machine has no PCI
			// bus", which strikes me as a "not available"
			// case?  If so, should "is not readable by
			// the EAL" also be something we should treat
			// as a "not available" case?  If not, we
			// can't distinguish between the two, so
			// we're stuck.
			snprintf(ebuf, PCAP_ERRBUF_SIZE,
			    "PCI bus is not present or not readable by the EAL");
			break;

		case ENOEXEC:
			// This "indicates that a service core
			// failed to launch successfully."
			snprintf(ebuf, PCAP_ERRBUF_SIZE,
			    "A service core failed to launch successfully");
			break;

		default:
			//
			// That's not in the list of errors in
			// the documentation; let it be reported
			// as an error.
			//
			dpdk_fmt_errmsg_for_rte_errno(ebuf,
			    PCAP_ERRBUF_SIZE, -is_dpdk_pre_inited,
			    "dpdk error: dpdk_pre_init failed");
			break;
	}
	// Error.
	return PCAP_ERROR;
}
753
754static int pcap_dpdk_activate(pcap_t *p)
755{
756	struct pcap_dpdk *pd = p->priv;
757	pd->orig = p;
758	int ret = PCAP_ERROR;
759	uint16_t nb_ports=0;
760	uint16_t portid= DPDK_PORTID_MAX;
761	unsigned nb_mbufs = DPDK_NB_MBUFS;
762	struct rte_eth_rxconf rxq_conf;
763	struct rte_eth_txconf txq_conf;
764	struct rte_eth_conf local_port_conf = port_conf;
765	struct rte_eth_dev_info dev_info;
766	int is_port_up = 0;
767	struct rte_eth_link link;
768	do{
769		//init EAL; fail if we have insufficient permission
770		char dpdk_pre_init_errbuf[PCAP_ERRBUF_SIZE];
771		ret = dpdk_pre_init(dpdk_pre_init_errbuf, 0);
772		if (ret < 0)
773		{
774			// This returns a negative value on an error.
775			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
776			    "Can't open device %s: %s",
777			    p->opt.device, dpdk_pre_init_errbuf);
778			// ret is set to the correct error
779			break;
780		}
781		if (ret == 0)
782		{
783			// This means DPDK isn't available on this machine.
784			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
785			    "Can't open device %s: DPDK is not available on this machine",
786			    p->opt.device);
787			return PCAP_ERROR_NO_SUCH_DEVICE;
788		}
789
790		ret = dpdk_init_timer(pd);
791		if (ret<0)
792		{
793			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
794				"dpdk error: Init timer is zero with device %s",
795				p->opt.device);
796			ret = PCAP_ERROR;
797			break;
798		}
799
800		nb_ports = rte_eth_dev_count_avail();
801		if (nb_ports == 0)
802		{
803			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
804			    "dpdk error: No Ethernet ports");
805			ret = PCAP_ERROR;
806			break;
807		}
808
809		portid = portid_by_device(p->opt.device);
810		if (portid == DPDK_PORTID_MAX){
811			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
812			    "dpdk error: portid is invalid. device %s",
813			    p->opt.device);
814			ret = PCAP_ERROR_NO_SUCH_DEVICE;
815			break;
816		}
817
818		pd->portid = portid;
819
820		if (p->snapshot <= 0 || p->snapshot > MAXIMUM_SNAPLEN)
821		{
822			p->snapshot = MAXIMUM_SNAPLEN;
823		}
824		// create the mbuf pool
825		pd->pktmbuf_pool = rte_pktmbuf_pool_create(MBUF_POOL_NAME, nb_mbufs,
826			MEMPOOL_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE,
827			rte_socket_id());
828		if (pd->pktmbuf_pool == NULL)
829		{
830			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
831			    PCAP_ERRBUF_SIZE, rte_errno,
832			    "dpdk error: Cannot init mbuf pool");
833			ret = PCAP_ERROR;
834			break;
835		}
836		// config dev
837		rte_eth_dev_info_get(portid, &dev_info);
838		if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
839		{
840			local_port_conf.txmode.offloads |=DEV_TX_OFFLOAD_MBUF_FAST_FREE;
841		}
842		// only support 1 queue
843		ret = rte_eth_dev_configure(portid, 1, 1, &local_port_conf);
844		if (ret < 0)
845		{
846			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
847			    PCAP_ERRBUF_SIZE, -ret,
848			    "dpdk error: Cannot configure device: port=%u",
849			    portid);
850			ret = PCAP_ERROR;
851			break;
852		}
853		// adjust rx tx
854		ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, &nb_txd);
855		if (ret < 0)
856		{
857			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
858			    PCAP_ERRBUF_SIZE, -ret,
859			    "dpdk error: Cannot adjust number of descriptors: port=%u",
860			    portid);
861			ret = PCAP_ERROR;
862			break;
863		}
864		// get MAC addr
865		rte_eth_macaddr_get(portid, &(pd->eth_addr));
866		eth_addr_str(&(pd->eth_addr), pd->mac_addr, DPDK_MAC_ADDR_SIZE-1);
867
868		// init one RX queue
869		rxq_conf = dev_info.default_rxconf;
870		rxq_conf.offloads = local_port_conf.rxmode.offloads;
871		ret = rte_eth_rx_queue_setup(portid, 0, nb_rxd,
872					     rte_eth_dev_socket_id(portid),
873					     &rxq_conf,
874					     pd->pktmbuf_pool);
875		if (ret < 0)
876		{
877			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
878			    PCAP_ERRBUF_SIZE, -ret,
879			    "dpdk error: rte_eth_rx_queue_setup:port=%u",
880			    portid);
881			ret = PCAP_ERROR;
882			break;
883		}
884
885		// init one TX queue
886		txq_conf = dev_info.default_txconf;
887		txq_conf.offloads = local_port_conf.txmode.offloads;
888		ret = rte_eth_tx_queue_setup(portid, 0, nb_txd,
889				rte_eth_dev_socket_id(portid),
890				&txq_conf);
891		if (ret < 0)
892		{
893			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
894			    PCAP_ERRBUF_SIZE, -ret,
895			    "dpdk error: rte_eth_tx_queue_setup:port=%u",
896			    portid);
897			ret = PCAP_ERROR;
898			break;
899		}
900		// Initialize TX buffers
901		tx_buffer = rte_zmalloc_socket(DPDK_TX_BUF_NAME,
902				RTE_ETH_TX_BUFFER_SIZE(MAX_PKT_BURST), 0,
903				rte_eth_dev_socket_id(portid));
904		if (tx_buffer == NULL)
905		{
906			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
907			    "dpdk error: Cannot allocate buffer for tx on port %u", portid);
908			ret = PCAP_ERROR;
909			break;
910		}
911		rte_eth_tx_buffer_init(tx_buffer, MAX_PKT_BURST);
912		// Start device
913		ret = rte_eth_dev_start(portid);
914		if (ret < 0)
915		{
916			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
917			    PCAP_ERRBUF_SIZE, -ret,
918			    "dpdk error: rte_eth_dev_start:port=%u",
919			    portid);
920			ret = PCAP_ERROR;
921			break;
922		}
923		// set promiscuous mode
924		if (p->opt.promisc){
925			pd->must_clear_promisc=1;
926			rte_eth_promiscuous_enable(portid);
927		}
928		// check link status
929		is_port_up = check_link_status(portid, &link);
930		if (!is_port_up){
931			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
932			    "dpdk error: link is down, port=%u",portid);
933			ret = PCAP_ERROR_IFACE_NOT_UP;
934			break;
935		}
936		// reset statistics
937		rte_eth_stats_reset(pd->portid);
938		calculate_timestamp(&(pd->ts_helper), &(pd->prev_ts));
939		rte_eth_stats_get(pd->portid,&(pd->prev_stats));
940		// format pcap_t
941		pd->portid = portid;
942		p->fd = pd->portid;
943		if (p->snapshot <=0 || p->snapshot> MAXIMUM_SNAPLEN)
944		{
945			p->snapshot = MAXIMUM_SNAPLEN;
946		}
947		p->linktype = DLT_EN10MB; // Ethernet, the 10MB is historical.
948		p->selectable_fd = p->fd;
949		p->read_op = pcap_dpdk_dispatch;
950		p->inject_op = pcap_dpdk_inject;
951		// using pcap_filter currently, though DPDK provides their own BPF function. Because DPDK BPF needs load a ELF file as a filter.
952		p->setfilter_op = install_bpf_program;
953		p->setdirection_op = NULL;
954		p->set_datalink_op = NULL;
955		p->getnonblock_op = pcap_dpdk_getnonblock;
956		p->setnonblock_op = pcap_dpdk_setnonblock;
957		p->stats_op = pcap_dpdk_stats;
958		p->cleanup_op = pcap_dpdk_close;
959		p->breakloop_op = pcap_breakloop_common;
960		// set default timeout
961		pd->required_select_timeout.tv_sec = 0;
962		pd->required_select_timeout.tv_usec = DPDK_DEF_MIN_SLEEP_MS*1000;
963		p->required_select_timeout = &pd->required_select_timeout;
964		ret = 0; // OK
965	}while(0);
966
967	if (ret <= PCAP_ERROR) // all kinds of error code
968	{
969		pcap_cleanup_live_common(p);
970	}else{
971		rte_eth_dev_get_name_by_port(portid,pd->pci_addr);
972		RTE_LOG(INFO, USER1,"Port %d device: %s, MAC:%s, PCI:%s\n", portid, p->opt.device, pd->mac_addr, pd->pci_addr);
973		RTE_LOG(INFO, USER1,"Port %d Link Up. Speed %u Mbps - %s\n",
974							portid, link.link_speed,
975					(link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
976						("full-duplex") : ("half-duplex\n"));
977	}
978	return ret;
979}
980
981// device name for dpdk should be in the form as dpdk:number, such as dpdk:0
982pcap_t * pcap_dpdk_create(const char *device, char *ebuf, int *is_ours)
983{
984	pcap_t *p=NULL;
985	*is_ours = 0;
986
987	*is_ours = !strncmp(device, "dpdk:", 5);
988	if (! *is_ours)
989		return NULL;
990	//memset will happen
991	p = PCAP_CREATE_COMMON(ebuf, struct pcap_dpdk);
992
993	if (p == NULL)
994		return NULL;
995	p->activate_op = pcap_dpdk_activate;
996	return p;
997}
998
999int pcap_dpdk_findalldevs(pcap_if_list_t *devlistp, char *ebuf)
1000{
1001	int ret=0;
1002	unsigned int nb_ports = 0;
1003	char dpdk_name[DPDK_DEV_NAME_MAX];
1004	char dpdk_desc[DPDK_DEV_DESC_MAX];
1005	ETHER_ADDR_TYPE eth_addr;
1006	char mac_addr[DPDK_MAC_ADDR_SIZE];
1007	char pci_addr[DPDK_PCI_ADDR_SIZE];
1008	do{
1009		// init EAL; return "DPDK not available" if we
1010		// have insufficient permission
1011		char dpdk_pre_init_errbuf[PCAP_ERRBUF_SIZE];
1012		ret = dpdk_pre_init(dpdk_pre_init_errbuf, 1);
1013		if (ret < 0)
1014		{
1015			// This returns a negative value on an error.
1016			snprintf(ebuf, PCAP_ERRBUF_SIZE,
1017			    "Can't look for DPDK devices: %s",
1018			    dpdk_pre_init_errbuf);
1019			ret = PCAP_ERROR;
1020			break;
1021		}
1022		if (ret == 0)
1023		{
1024			// This means DPDK isn't available on this machine.
1025			// That just means "don't return any devices".
1026			break;
1027		}
1028		nb_ports = rte_eth_dev_count_avail();
1029		if (nb_ports == 0)
1030		{
1031			// That just means "don't return any devices".
1032			ret = 0;
1033			break;
1034		}
1035		for (unsigned int i=0; i<nb_ports; i++){
1036			snprintf(dpdk_name, DPDK_DEV_NAME_MAX-1,
1037			    "%s%u", DPDK_PREFIX, i);
1038			// mac addr
1039			rte_eth_macaddr_get(i, &eth_addr);
1040			eth_addr_str(&eth_addr,mac_addr,DPDK_MAC_ADDR_SIZE);
1041			// PCI addr
1042			rte_eth_dev_get_name_by_port(i,pci_addr);
1043			snprintf(dpdk_desc,DPDK_DEV_DESC_MAX-1,"%s %s, MAC:%s, PCI:%s", DPDK_DESC, dpdk_name, mac_addr, pci_addr);
1044			if (add_dev(devlistp, dpdk_name, 0, dpdk_desc, ebuf)==NULL){
1045				ret = PCAP_ERROR;
1046				break;
1047			}
1048		}
1049	}while(0);
1050	return ret;
1051}
1052
#ifdef DPDK_ONLY
/*
 * This libpcap build supports only DPDK, not regular network interfaces.
 * The functions below stand in for the platform-specific capture module.
 */

/*
 * There are no regular interfaces, just DPDK interfaces, so nothing is
 * added to the device list.  Always returns 0; both parameters are unused.
 */
int
pcap_platform_finddevs(pcap_if_list_t *devlistp _U_, char *errbuf _U_)
{
	return (0);
}

/*
 * Attempts to open a regular interface fail: an explanation is stored in
 * errbuf and NULL is returned.  The device name is not examined.
 */
pcap_t *
pcap_create_interface(const char *device _U_, char *errbuf)
{
	snprintf(errbuf, PCAP_ERRBUF_SIZE,
	    "This version of libpcap only supports DPDK");
	return NULL;
}

/*
 * Libpcap version string.
 */
const char *
pcap_lib_version(void)
{
	return (PCAP_VERSION_STRING " (DPDK-only)");
}
#endif
1087