/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2012-2024 Marko Zec
 * Copyright (c) 2005, 2018 University of Zagreb
 * Copyright (c) 2005 International Computer Science Institute
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup
 * structures and a trivial search procedure.  More significant bits of
 * the search key are used to directly index a two-stage trie, while the
 * remaining bits are used for finding the next hop in a sorted array.
 * More details in:
 *
 * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per
 * second in software, ACM SIGCOMM Computer Communication Review, September
 * 2012
 *
 * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing
 * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split
 */

#include <sys/cdefs.h>
#include "opt_inet.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/ctype.h>
#include <sys/epoch.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>

#include <vm/uma.h>

#include <netinet/in.h>
#include <netinet/in_fib.h>

#include <net/route.h>
#include <net/route/route_ctl.h>
#include <net/route/fib_algo.h>

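/*
 * DXR_TRIE_BITS is the number of most significant address bits used to
 * index the two-stage (direct + extension) trie; the remaining
 * DXR_RANGE_SHIFT bits of the key select an entry inside a sorted
 * per-chunk range array.
 */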
#define	DXR_TRIE_BITS		20

CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24);

#if DXR_TRIE_BITS > 16
#define	DXR_D			16
#else
#define	DXR_D			(DXR_TRIE_BITS - 1)
#endif

#define	D_TBL_SIZE		(1 << DXR_D)
#define	DIRECT_TBL_SIZE		(1 << DXR_TRIE_BITS)
#define	DXR_RANGE_MASK		(0xffffffffU >> DXR_TRIE_BITS)
#define	DXR_RANGE_SHIFT		(32 - DXR_TRIE_BITS)

#define	DESC_BASE_BITS		22
#define	DESC_FRAGMENTS_BITS	(32 - DESC_BASE_BITS)
#define	BASE_MAX		((1 << DESC_BASE_BITS) - 1)
#define	RTBL_SIZE_INCR		(BASE_MAX / 64)

#if DXR_TRIE_BITS < 24
#define	FRAGS_MASK_SHORT	((1 << (23 - DXR_TRIE_BITS)) - 1)
#else
#define	FRAGS_MASK_SHORT	0
#endif
#define	FRAGS_PREF_SHORT	(((1 << DESC_FRAGMENTS_BITS) - 1) & \
				 ~FRAGS_MASK_SHORT)
#define	FRAGS_MARK_XL		(FRAGS_PREF_SHORT - 1)
#define	FRAGS_MARK_HIT		(FRAGS_PREF_SHORT - 2)

#define	IS_SHORT_FORMAT(x)	((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT)
#define	IS_LONG_FORMAT(x)	((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT)
#define	IS_XL_FORMAT(x)		(x == FRAGS_MARK_XL)

#define	RE_SHORT_MAX_NH		((1 << (DXR_TRIE_BITS - 8)) - 1)

#define	CHUNK_HASH_BITS		16
#define	CHUNK_HASH_SIZE		(1 << CHUNK_HASH_BITS)
#define	CHUNK_HASH_MASK		(CHUNK_HASH_SIZE - 1)

#define	TRIE_HASH_BITS		16
#define	TRIE_HASH_SIZE		(1 << TRIE_HASH_BITS)
#define	TRIE_HASH_MASK		(TRIE_HASH_SIZE - 1)

#define	XTBL_SIZE_INCR		(DIRECT_TBL_SIZE / 16)

#define	UNUSED_BUCKETS		8

/* Lookup structure elements */

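/*
 * The "fragments" field of a direct_entry encodes how the chunk at
 * "base" is stored: FRAGS_MARK_HIT denotes a direct hit, with "base"
 * holding the nexthop index; FRAGS_MARK_XL means the fragment count is
 * kept in the first range table word; a value with all FRAGS_PREF_SHORT
 * bits set selects the 16-bit short entry format; any other value is a
 * plain long-format fragment count.
 */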
struct direct_entry {
	uint32_t		fragments: DESC_FRAGMENTS_BITS,
				base: DESC_BASE_BITS;
};

struct range_entry_long {
	uint32_t		start: DXR_RANGE_SHIFT,
				nexthop: DXR_TRIE_BITS;
};

#if DXR_TRIE_BITS < 24
struct range_entry_short {
	uint16_t		start: DXR_RANGE_SHIFT - 8,
				nexthop: DXR_TRIE_BITS - 8;
};
#endif

/* Auxiliary structures */

struct heap_entry {
	uint32_t		start;
	uint32_t		end;
	uint32_t		preflen;
	uint32_t		nexthop;
};

struct chunk_desc {
	LIST_ENTRY(chunk_desc)	cd_all_le;
	LIST_ENTRY(chunk_desc)	cd_hash_le;
	uint32_t		cd_hash;
	uint32_t		cd_refcnt;
	uint32_t		cd_base;
	uint32_t		cd_cur_size;
	uint32_t		cd_max_size;
};

struct trie_desc {
	LIST_ENTRY(trie_desc)	td_all_le;
	LIST_ENTRY(trie_desc)	td_hash_le;
	uint32_t		td_hash;
	uint32_t		td_index;
	uint32_t		td_refcnt;
};

struct dxr_aux {
	/* Glue to external state */
	struct fib_data		*fd;
	uint32_t		fibnum;
	int			refcnt;

	/* Auxiliary build-time tables */
	struct direct_entry	direct_tbl[DIRECT_TBL_SIZE];
	uint16_t		d_tbl[D_TBL_SIZE];
	struct direct_entry	*x_tbl;
	union {
		struct range_entry_long	re;
		uint32_t	fragments;
	}			*range_tbl;

	/* Auxiliary internal state */
	uint32_t		updates_mask[DIRECT_TBL_SIZE / 32];
	struct trie_desc	*trietbl[D_TBL_SIZE];
	LIST_HEAD(, chunk_desc)	chunk_hashtbl[CHUNK_HASH_SIZE];
	LIST_HEAD(, chunk_desc)	all_chunks;
	LIST_HEAD(, chunk_desc) unused_chunks[UNUSED_BUCKETS];
	LIST_HEAD(, trie_desc)	trie_hashtbl[TRIE_HASH_SIZE];
	LIST_HEAD(, trie_desc)	all_trie;
	LIST_HEAD(, trie_desc)	unused_trie; /* abuses hash link entry */
	struct sockaddr_in	dst;
	struct sockaddr_in	mask;
	struct heap_entry	heap[33];
	uint32_t		prefixes;
	uint32_t		updates_low;
	uint32_t		updates_high;
	uint32_t		unused_chunks_size;
	uint32_t		xtbl_size;
	uint32_t		all_trie_cnt;
	uint32_t		unused_trie_cnt;
	uint32_t		trie_rebuilt_prefixes;
	uint32_t		heap_index;
	uint32_t		d_bits;
	uint32_t		rtbl_size;
	uint32_t		rtbl_top;
	uint32_t		rtbl_work_frags;
	uint32_t		work_chunk;
};

/* Main lookup structure container */

struct dxr {
	/* Lookup tables */
	void			*d;
	void			*x;
	void			*r;
	struct nhop_object	**nh_tbl;

	/* Glue to external state */
	struct dxr_aux		*aux;
	struct fib_data		*fd;
	struct epoch_context	epoch_ctx;
	uint32_t		fibnum;
	uint16_t		d_shift;
	uint16_t		x_shift;
	uint32_t		x_mask;
};

static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM");
static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary");

uma_zone_t chunk_zone;
uma_zone_t trie_zone;

VNET_DEFINE_STATIC(int, frag_limit) = 100;
#define	V_frag_limit	VNET(frag_limit)

/* Binary search for a matching address range */
#define	DXR_LOOKUP_STAGE					\
	if (masked_dst < range[middle].start) {			\
		upperbound = middle;				\
		middle = (middle + lowerbound) / 2;		\
	} else if (masked_dst < range[middle + 1].start)	\
		return (range[middle].nexthop);			\
	else {							\
		lowerbound = middle + 1;			\
		middle = (upperbound + middle + 1) / 2;		\
	}							\
	if (upperbound == lowerbound)				\
		return (range[lowerbound].nexthop);

static int
range_lookup(struct range_entry_long *rt, struct direct_entry de, uint32_t dst)
{
	uint32_t base;
	uint32_t upperbound;
	uint32_t middle;
	uint32_t lowerbound;
	uint32_t masked_dst;

	base = de.base;
	lowerbound = 0;
	masked_dst = dst & DXR_RANGE_MASK;

#if DXR_TRIE_BITS < 24
	if (__predict_true(IS_SHORT_FORMAT(de.fragments))) {
		upperbound = de.fragments & FRAGS_MASK_SHORT;
		struct range_entry_short *range =
		    (struct range_entry_short *) &rt[base];

		masked_dst >>= 8;
		middle = upperbound;
		upperbound = upperbound * 2 + 1;

		for (;;) {
			DXR_LOOKUP_STAGE
			DXR_LOOKUP_STAGE
		}
	}
#endif

	upperbound = de.fragments;
	middle = upperbound / 2;
	struct range_entry_long *range = &rt[base];
	if (__predict_false(IS_XL_FORMAT(de.fragments))) {
		upperbound = *((uint32_t *) range);
		range++;
		middle = upperbound / 2;
	}

	for (;;) {
		DXR_LOOKUP_STAGE
		DXR_LOOKUP_STAGE
	}
}

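/*
 * Instantiate lookup functions specialized for a fixed number of bits (D)
 * in the first-stage table, so that the shift and mask amounts below
 * become compile-time constants.
 */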
#define DXR_LOOKUP_DEFINE(D)						\
	static int inline						\
	dxr_lookup_##D(struct dxr *dxr, uint32_t dst)			\
	{								\
		struct direct_entry de;					\
		uint16_t *dt = dxr->d;					\
		struct direct_entry *xt = dxr->x;			\
									\
		de = xt[(dt[dst >> (32 - (D))] << (DXR_TRIE_BITS - (D))) \
		    + ((dst >> DXR_RANGE_SHIFT) &			\
		    (0xffffffffU >> (32 - DXR_TRIE_BITS + (D))))];	\
		if (__predict_true(de.fragments == FRAGS_MARK_HIT))	\
			return (de.base);				\
		return (range_lookup(dxr->r, de, dst));			\
	}								\
									\
	static struct nhop_object *					\
	dxr_fib_lookup_##D(void *algo_data,				\
	    const struct flm_lookup_key key, uint32_t scopeid __unused)	\
	{								\
		struct dxr *dxr = algo_data;				\
									\
		return (dxr->nh_tbl[dxr_lookup_##D(dxr,			\
		    ntohl(key.addr4.s_addr))]);				\
	}

#if DXR_TRIE_BITS > 16
DXR_LOOKUP_DEFINE(16)
#endif
DXR_LOOKUP_DEFINE(15)
DXR_LOOKUP_DEFINE(14)
DXR_LOOKUP_DEFINE(13)
DXR_LOOKUP_DEFINE(12)
DXR_LOOKUP_DEFINE(11)
DXR_LOOKUP_DEFINE(10)
DXR_LOOKUP_DEFINE(9)

static int inline
dxr_lookup(struct dxr *dxr, uint32_t dst)
{
	struct direct_entry de;
	uint16_t *dt = dxr->d;
	struct direct_entry *xt = dxr->x;

	de = xt[(dt[dst >> dxr->d_shift] << dxr->x_shift) +
	    ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask)];
	if (__predict_true(de.fragments == FRAGS_MARK_HIT))
		return (de.base);
	return (range_lookup(dxr->r, de, dst));
}

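/*
 * Seed the heap with the most specific route covering address dst_u32,
 * falling back to an all-zero default entry spanning the entire address
 * space when no route matches.
 */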
static void
initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
{
	struct heap_entry *fhp = &da->heap[0];
	struct rtentry *rt;
	struct route_nhop_data rnd;

	da->heap_index = 0;
	da->dst.sin_addr.s_addr = htonl(dst_u32);
	rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
	    &rnd);
	if (rt != NULL) {
		struct in_addr addr;
		uint32_t scopeid;

		rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid);
		fhp->start = ntohl(addr.s_addr);
		fhp->end = fhp->start;
		if (fhp->preflen < 32)
			fhp->end |= (0xffffffffU >> fhp->preflen);
		fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop);
	} else {
		fhp->preflen = fhp->nexthop = fhp->start = 0;
		fhp->end = 0xffffffffU;
	}
}

static uint32_t
chunk_size(struct dxr_aux *da, struct direct_entry *fdesc)
{

	if (IS_SHORT_FORMAT(fdesc->fragments))
		return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1);
	else if (IS_XL_FORMAT(fdesc->fragments))
		return (da->range_tbl[fdesc->base].fragments + 2);
	else /* if (IS_LONG_FORMAT(fdesc->fragments)) */
		return (fdesc->fragments + 1);
}

static uint32_t
chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc)
{
	uint32_t size = chunk_size(da, fdesc);
	uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base];
	uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size];
	uint32_t hash = fdesc->fragments;

	for (; p < l; p++)
		hash = (hash << 7) + (hash >> 13) + *p;

	return (hash + (hash >> 16));
}

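/*
 * Take a reference on the range chunk just built at the top of the range
 * table heap: deduplicate it against an existing identical chunk, or move
 * it into a recycled unused slot, growing the range table if necessary.
 */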
static int
chunk_ref(struct dxr_aux *da, uint32_t chunk)
{
	struct direct_entry *fdesc = &da->direct_tbl[chunk];
	struct chunk_desc *cdp, *empty_cdp;
	uint32_t base = fdesc->base;
	uint32_t size = chunk_size(da, fdesc);
	uint32_t hash = chunk_hash(da, fdesc);
	int i;

	/* Find an existing descriptor */
	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
	    cd_hash_le) {
		if (cdp->cd_hash != hash || cdp->cd_cur_size != size ||
		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
		    sizeof(struct range_entry_long) * size))
			continue;
		da->rtbl_top = fdesc->base;
		fdesc->base = cdp->cd_base;
		cdp->cd_refcnt++;
		return (0);
	}

	/* No matching chunks found. Find an empty one to recycle. */
	for (cdp = NULL, i = size; cdp == NULL && i < UNUSED_BUCKETS; i++)
		cdp = LIST_FIRST(&da->unused_chunks[i]);

	if (cdp == NULL)
		LIST_FOREACH(empty_cdp, &da->unused_chunks[0], cd_hash_le)
			if (empty_cdp->cd_max_size >= size && (cdp == NULL ||
			    empty_cdp->cd_max_size < cdp->cd_max_size)) {
				cdp = empty_cdp;
				if (empty_cdp->cd_max_size == size)
					break;
			}

	if (cdp != NULL) {
		/* Copy from heap into the recycled chunk */
		bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base],
		    size * sizeof(struct range_entry_long));
		fdesc->base = cdp->cd_base;
		da->rtbl_top -= size;
		da->unused_chunks_size -= cdp->cd_max_size;
		if (cdp->cd_max_size > size) {
			/* Split the range in two, need a new descriptor */
			empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT);
			if (empty_cdp == NULL)
				return (1);
			LIST_INSERT_BEFORE(cdp, empty_cdp, cd_all_le);
			empty_cdp->cd_base = cdp->cd_base + size;
			empty_cdp->cd_cur_size = 0;
			empty_cdp->cd_max_size = cdp->cd_max_size - size;

			i = empty_cdp->cd_max_size;
			if (i >= UNUSED_BUCKETS)
				i = 0;
			LIST_INSERT_HEAD(&da->unused_chunks[i], empty_cdp,
			    cd_hash_le);

			da->unused_chunks_size += empty_cdp->cd_max_size;
			cdp->cd_max_size = size;
		}
		LIST_REMOVE(cdp, cd_hash_le);
	} else {
		/* Alloc a new descriptor at the top of the heap */
		cdp = uma_zalloc(chunk_zone, M_NOWAIT);
		if (cdp == NULL)
			return (1);
		cdp->cd_max_size = size;
		cdp->cd_base = fdesc->base;
		LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le);
		MPASS(cdp->cd_base + cdp->cd_max_size == da->rtbl_top);
	}

	cdp->cd_hash = hash;
	cdp->cd_refcnt = 1;
	cdp->cd_cur_size = size;
	LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp,
	    cd_hash_le);
	if (da->rtbl_top >= da->rtbl_size) {
		if (da->rtbl_top >= BASE_MAX) {
			FIB_PRINTF(LOG_ERR, da->fd,
			    "structural limit exceeded at %d "
			    "range table elements", da->rtbl_top);
			return (1);
		}
		da->rtbl_size += RTBL_SIZE_INCR;
		i = (BASE_MAX - da->rtbl_top) * LOG_DEBUG / BASE_MAX;
		FIB_PRINTF(i, da->fd, "range table at %d%% structural limit",
		    da->rtbl_top * 100 / BASE_MAX);
		da->range_tbl = realloc(da->range_tbl,
		    sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT,
		    M_DXRAUX, M_NOWAIT);
		if (da->range_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR range table");
			return (1);
		}
	}

	return (0);
}

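/*
 * Drop a reference on the range chunk owned by direct table entry
 * "chunk".  When the last reference goes away the chunk is marked unused
 * and merged with empty neighbours, or the range table heap is trimmed
 * if the chunk sits at its top.
 */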
static void
chunk_unref(struct dxr_aux *da, uint32_t chunk)
{
	struct direct_entry *fdesc = &da->direct_tbl[chunk];
	struct chunk_desc *cdp, *cdp2;
	uint32_t base = fdesc->base;
	uint32_t size = chunk_size(da, fdesc);
	uint32_t hash = chunk_hash(da, fdesc);
	int i;

	/* Find the corresponding descriptor */
	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
	    cd_hash_le)
		if (cdp->cd_hash == hash && cdp->cd_cur_size == size &&
		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
		    sizeof(struct range_entry_long) * size) == 0)
			break;

	MPASS(cdp != NULL);
	if (--cdp->cd_refcnt > 0)
		return;

	LIST_REMOVE(cdp, cd_hash_le);
	da->unused_chunks_size += cdp->cd_max_size;
	cdp->cd_cur_size = 0;

	/* Attempt to merge with the preceding chunk, if empty */
	cdp2 = LIST_NEXT(cdp, cd_all_le);
	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
		MPASS(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base);
		LIST_REMOVE(cdp, cd_all_le);
		LIST_REMOVE(cdp2, cd_hash_le);
		cdp2->cd_max_size += cdp->cd_max_size;
		uma_zfree(chunk_zone, cdp);
		cdp = cdp2;
	}

	/* Attempt to merge with the subsequent chunk, if empty */
	cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le);
	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
		MPASS(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base);
		LIST_REMOVE(cdp, cd_all_le);
		LIST_REMOVE(cdp2, cd_hash_le);
		cdp2->cd_max_size += cdp->cd_max_size;
		cdp2->cd_base = cdp->cd_base;
		uma_zfree(chunk_zone, cdp);
		cdp = cdp2;
	}

	if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) {
		/* Free the chunk on the top of the range heap, trim the heap */
		MPASS(cdp == LIST_FIRST(&da->all_chunks));
		da->rtbl_top -= cdp->cd_max_size;
		da->unused_chunks_size -= cdp->cd_max_size;
		LIST_REMOVE(cdp, cd_all_le);
		uma_zfree(chunk_zone, cdp);
		return;
	}

	i = cdp->cd_max_size;
	if (i >= UNUSED_BUCKETS)
		i = 0;
	LIST_INSERT_HEAD(&da->unused_chunks[i], cdp, cd_hash_le);
}

static uint32_t
trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index)
{
	uint32_t i, *val;
	uint32_t hash = 0;

	for (i = 0; i < (1 << dxr_x); i++) {
		hash = (hash << 3) ^ (hash >> 3);
		val = (uint32_t *)
		    (void *) &da->direct_tbl[(index << dxr_x) + i];
		hash += (*val << 5);
		hash += (*val >> 5);
	}

	return (hash + (hash >> 16));
}

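/*
 * Reference a block of 2^dxr_x direct table entries in the extension
 * table: reuse an identical existing block if one is found in the hash
 * table, otherwise copy the block into a recycled or newly allocated
 * slot.  Returns the x_tbl block index, or -1 on allocation failure.
 */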
static int
trie_ref(struct dxr_aux *da, uint32_t index)
{
	struct trie_desc *tp;
	uint32_t dxr_d = da->d_bits;
	uint32_t dxr_x = DXR_TRIE_BITS - dxr_d;
	uint32_t hash = trie_hash(da, dxr_x, index);

	/* Find an existing descriptor */
	LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le)
		if (tp->td_hash == hash &&
		    memcmp(&da->direct_tbl[index << dxr_x],
		    &da->x_tbl[tp->td_index << dxr_x],
		    sizeof(*da->x_tbl) << dxr_x) == 0) {
			tp->td_refcnt++;
			da->trietbl[index] = tp;
			return(tp->td_index);
		}

	tp = LIST_FIRST(&da->unused_trie);
	if (tp != NULL) {
		LIST_REMOVE(tp, td_hash_le);
		da->unused_trie_cnt--;
	} else {
		tp = uma_zalloc(trie_zone, M_NOWAIT);
		if (tp == NULL)
			return (-1);
		LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le);
		tp->td_index = da->all_trie_cnt++;
	}

	tp->td_hash = hash;
	tp->td_refcnt = 1;
	LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp,
	   td_hash_le);
	memcpy(&da->x_tbl[tp->td_index << dxr_x],
	    &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x);
	da->trietbl[index] = tp;
	if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) {
		da->xtbl_size += XTBL_SIZE_INCR;
		da->x_tbl = realloc(da->x_tbl,
		    sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT);
		if (da->x_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR extension table");
			return (-1);
		}
	}
	return(tp->td_index);
}

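/*
 * Release a reference on the extension table block backing direct table
 * index "index".  A block that becomes unused is queued for reuse, unless
 * it sits at the tail of x_tbl, in which case it (and any unused blocks
 * that then become the tail) is freed to keep x_tbl densely indexed.
 */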
static void
trie_unref(struct dxr_aux *da, uint32_t index)
{
	struct trie_desc *tp = da->trietbl[index];

	if (tp == NULL)
		return;
	da->trietbl[index] = NULL;
	if (--tp->td_refcnt > 0)
		return;

	LIST_REMOVE(tp, td_hash_le);
	da->unused_trie_cnt++;
	if (tp->td_index != da->all_trie_cnt - 1) {
		LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le);
		return;
	}

	do {
		da->all_trie_cnt--;
		da->unused_trie_cnt--;
		LIST_REMOVE(tp, td_all_le);
		uma_zfree(trie_zone, tp);
		LIST_FOREACH(tp, &da->unused_trie, td_hash_le)
			if (tp->td_index == da->all_trie_cnt - 1) {
				LIST_REMOVE(tp, td_hash_le);
				break;
			}
	} while (tp != NULL);
}

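/*
 * Insert a prefix into the heap of prefixes overlapping the address
 * currently being processed, keeping the heap ordered by ascending
 * prefix length so that the most specific prefix sits on top.
 */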
static void
heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen,
    uint32_t nh)
{
	struct heap_entry *fhp;
	int i;

	for (i = da->heap_index; i >= 0; i--) {
		if (preflen > da->heap[i].preflen)
			break;
		else if (preflen < da->heap[i].preflen)
			da->heap[i + 1] = da->heap[i];
		else
			return;
	}

	fhp = &da->heap[i + 1];
	fhp->preflen = preflen;
	fhp->start = start;
	fhp->end = end;
	fhp->nexthop = nh;
	da->heap_index++;
}

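/*
 * rib_walk_from() callback, invoked once for each prefix inside the
 * chunk currently being rebuilt.  Emits range table fragments for the
 * address space between the previously visited prefix and this one.
 */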
static int
dxr_walk(struct rtentry *rt, void *arg)
{
	struct dxr_aux *da = arg;
	uint32_t chunk = da->work_chunk;
	uint32_t first = chunk << DXR_RANGE_SHIFT;
	uint32_t last = first | DXR_RANGE_MASK;
	struct range_entry_long *fp =
	    &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
	struct heap_entry *fhp = &da->heap[da->heap_index];
	uint32_t preflen, nh, start, end, scopeid;
	struct in_addr addr;

	rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid);
	start = ntohl(addr.s_addr);
	if (start > last)
		return (-1);	/* Beyond chunk boundaries, we are done */
	if (start < first)
		return (0);	/* Skip this route */

	end = start;
	if (preflen < 32)
		end |= (0xffffffffU >> preflen);
	nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt));

	if (start == fhp->start)
		heap_inject(da, start, end, preflen, nh);
	else {
		/* start > fhp->start */
		while (start > fhp->end) {
			uint32_t oend = fhp->end;

			if (da->heap_index > 0) {
				fhp--;
				da->heap_index--;
			} else
				initheap(da, fhp->end + 1, chunk);
			if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
				fp++;
				da->rtbl_work_frags++;
				fp->start = (oend + 1) & DXR_RANGE_MASK;
				fp->nexthop = fhp->nexthop;
			}
		}
		if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) &&
		    nh != fp->nexthop) {
			fp++;
			da->rtbl_work_frags++;
			fp->start = start & DXR_RANGE_MASK;
		} else if (da->rtbl_work_frags) {
			if ((--fp)->nexthop == nh)
				da->rtbl_work_frags--;
			else
				fp++;
		}
		fp->nexthop = nh;
		heap_inject(da, start, end, preflen, nh);
	}

	return (0);
}

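/*
 * Rebuild the range fragments describing a single 2^DXR_RANGE_SHIFT
 * address block ("chunk"), then attach them to the corresponding direct
 * table entry, using the short / long / XL encoding that fits best.
 */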
static int
update_chunk(struct dxr_aux *da, uint32_t chunk)
{
	struct range_entry_long *fp;
#if DXR_TRIE_BITS < 24
	struct range_entry_short *fps;
	uint32_t start, nh, i;
#endif
	struct heap_entry *fhp;
	uint32_t first = chunk << DXR_RANGE_SHIFT;
	uint32_t last = first | DXR_RANGE_MASK;

	if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT)
		chunk_unref(da, chunk);

	initheap(da, first, chunk);

	fp = &da->range_tbl[da->rtbl_top].re;
	da->rtbl_work_frags = 0;
	fp->start = first & DXR_RANGE_MASK;
	fp->nexthop = da->heap[0].nexthop;

	da->dst.sin_addr.s_addr = htonl(first);
	da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK);

	da->work_chunk = chunk;
	rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED,
	    (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask,
	    dxr_walk, da);

	/* Flush any remaining objects on the heap */
	fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
	fhp = &da->heap[da->heap_index];
	while (fhp->preflen > DXR_TRIE_BITS) {
		uint32_t oend = fhp->end;

		if (da->heap_index > 0) {
			fhp--;
			da->heap_index--;
		} else
			initheap(da, fhp->end + 1, chunk);
		if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
			/* Have we crossed the upper chunk boundary? */
			if (oend >= last)
				break;
			fp++;
			da->rtbl_work_frags++;
			fp->start = (oend + 1) & DXR_RANGE_MASK;
			fp->nexthop = fhp->nexthop;
		}
	}

	/* Direct hit if the chunk contains only a single fragment */
	if (da->rtbl_work_frags == 0) {
		da->direct_tbl[chunk].base = fp->nexthop;
		da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT;
		return (0);
	}

	da->direct_tbl[chunk].base = da->rtbl_top;
	da->direct_tbl[chunk].fragments = da->rtbl_work_frags;

#if DXR_TRIE_BITS < 24
	/* Check whether the chunk can be more compactly encoded */
	fp = &da->range_tbl[da->rtbl_top].re;
	for (i = 0; i <= da->rtbl_work_frags; i++, fp++)
		if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH)
			break;
	if (i == da->rtbl_work_frags + 1) {
		fp = &da->range_tbl[da->rtbl_top].re;
		fps = (void *) fp;
		for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) {
			start = fp->start;
			nh = fp->nexthop;
			fps->start = start >> 8;
			fps->nexthop = nh;
		}
		fps->start = start >> 8;
		fps->nexthop = nh;
		da->rtbl_work_frags >>= 1;
		da->direct_tbl[chunk].fragments =
		    da->rtbl_work_frags | FRAGS_PREF_SHORT;
	} else
#endif
	if (da->rtbl_work_frags >= FRAGS_MARK_HIT) {
		da->direct_tbl[chunk].fragments = FRAGS_MARK_XL;
		memmove(&da->range_tbl[da->rtbl_top + 1],
		   &da->range_tbl[da->rtbl_top],
		   (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl));
		da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags;
		da->rtbl_work_frags++;
	}
	da->rtbl_top += (da->rtbl_work_frags + 1);
	return (chunk_ref(da, chunk));
}

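/*
 * (Re)build the lookup structures: refresh the affected range table
 * chunks, repopulate the d_tbl / x_tbl trie (searching for the most
 * compact D/X split on a full rebuild), and finally copy everything
 * into a single contiguous block for the datapath to use.
 */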
static void
dxr_build(struct dxr *dxr)
{
	struct dxr_aux *da = dxr->aux;
	struct chunk_desc *cdp;
	struct rib_rtable_info rinfo;
	struct timeval t0, t1, t2, t3;
	uint32_t r_size, dxr_tot_size;
	uint32_t i, m, range_rebuild = 0;
	uint32_t range_frag;
	struct trie_desc *tp;
	uint32_t d_tbl_size, dxr_x, d_size, x_size;
	int32_t ti;
	uint32_t trie_rebuild = 0, prev_size = 0;
	uint32_t trie_frag;

	MPASS(dxr->d == NULL);

	if (da == NULL) {
		da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT);
		if (da == NULL) {
			FIB_PRINTF(LOG_NOTICE, dxr->fd,
			    "Unable to allocate DXR aux struct");
			return;
		}
		dxr->aux = da;
		da->fibnum = dxr->fibnum;
		da->fd = dxr->fd;
		da->refcnt = 1;
		LIST_INIT(&da->all_chunks);
		LIST_INIT(&da->all_trie);
		da->rtbl_size = RTBL_SIZE_INCR;
		da->range_tbl = NULL;
		da->xtbl_size = XTBL_SIZE_INCR;
		da->x_tbl = NULL;
		bzero(&da->dst, sizeof(da->dst));
		bzero(&da->mask, sizeof(da->mask));
		da->dst.sin_len = sizeof(da->dst);
		da->mask.sin_len = sizeof(da->mask);
		da->dst.sin_family = AF_INET;
		da->mask.sin_family = AF_INET;
	}
	if (da->range_tbl == NULL) {
		da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size
		    + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT);
		if (da->range_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR range table");
			return;
		}
		range_rebuild = 1;
	}
	if (da->x_tbl == NULL) {
		da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size,
		    M_DXRAUX, M_NOWAIT);
		if (da->x_tbl == NULL) {
			FIB_PRINTF(LOG_NOTICE, da->fd,
			    "Unable to allocate DXR extension table");
			return;
		}
		trie_rebuild = 1;
	}

	microuptime(&t0);

	dxr->nh_tbl = fib_get_nhop_array(da->fd);
	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);

	if (da->updates_low > da->updates_high)
		range_rebuild = 1;

range_build:
	if (range_rebuild) {
		/* Bulk cleanup */
		bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl));
		while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
			LIST_REMOVE(cdp, cd_all_le);
			uma_zfree(chunk_zone, cdp);
		}
		for (i = 0; i < UNUSED_BUCKETS; i++)
			LIST_INIT(&da->unused_chunks[i]);
		da->unused_chunks_size = 0;
		da->rtbl_top = 0;
		da->updates_low = 0;
		da->updates_high = DIRECT_TBL_SIZE - 1;
		memset(da->updates_mask, 0xff, sizeof(da->updates_mask));
		for (i = 0; i < DIRECT_TBL_SIZE; i++) {
			da->direct_tbl[i].fragments = FRAGS_MARK_HIT;
			da->direct_tbl[i].base = 0;
		}
	}
	da->prefixes = rinfo.num_prefixes;

	/* DXR: construct direct & range table */
	for (i = da->updates_low; i <= da->updates_high; i++) {
		m = da->updates_mask[i >> 5] >> (i & 0x1f);
		if (m == 0)
			i |= 0x1f;
		else if (m & 1 && update_chunk(da, i) != 0)
			return;
	}

	range_frag = 0;
	if (da->rtbl_top)
		range_frag = da->unused_chunks_size * 10000ULL / da->rtbl_top;
	if (range_frag > V_frag_limit) {
		range_rebuild = 1;
		goto range_build;
	}

	r_size = sizeof(*da->range_tbl) * da->rtbl_top;
	microuptime(&t1);

	if (range_rebuild ||
	    abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1)
		trie_rebuild = 1;

trie_build:
	if (trie_rebuild) {
		da->trie_rebuilt_prefixes = da->prefixes;
		da->d_bits = DXR_D;
		da->updates_low = 0;
		da->updates_high = DIRECT_TBL_SIZE - 1;
		if (!range_rebuild)
			memset(da->updates_mask, 0xff,
			    sizeof(da->updates_mask));
	}

dxr2_try_squeeze:
	if (trie_rebuild) {
		/* Bulk cleanup */
		bzero(da->trietbl, sizeof(da->trietbl));
		bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl));
		while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
			LIST_REMOVE(tp, td_all_le);
			uma_zfree(trie_zone, tp);
		}
		LIST_INIT(&da->unused_trie);
		da->all_trie_cnt = da->unused_trie_cnt = 0;
	}

	/* Populate d_tbl, x_tbl */
	dxr_x = DXR_TRIE_BITS - da->d_bits;
	d_tbl_size = (1 << da->d_bits);

	for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x;
	    i++) {
		if (!trie_rebuild) {
			m = 0;
			for (int j = 0; j < (1 << dxr_x); j += 32)
				m |= da->updates_mask[((i << dxr_x) + j) >> 5];
			if (m == 0)
				continue;
			trie_unref(da, i);
		}
		ti = trie_ref(da, i);
		if (ti < 0)
			return;
		da->d_tbl[i] = ti;
	}

	trie_frag = 0;
	if (da->all_trie_cnt)
		trie_frag = da->unused_trie_cnt * 10000ULL / da->all_trie_cnt;
	if (trie_frag > V_frag_limit) {
		trie_rebuild = 1;
		goto trie_build;
	}

	d_size = sizeof(*da->d_tbl) * d_tbl_size;
	x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size
	    * da->all_trie_cnt;
	dxr_tot_size = d_size + x_size + r_size;

	if (trie_rebuild == 1) {
		/* Try to find a more compact D/X split */
		if (prev_size == 0 || dxr_tot_size <= prev_size)
			da->d_bits--;
		else {
			da->d_bits++;
			trie_rebuild = 2;
		}
		prev_size = dxr_tot_size;
		goto dxr2_try_squeeze;
	}
	microuptime(&t2);

	dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT);
	if (dxr->d == NULL) {
		FIB_PRINTF(LOG_NOTICE, da->fd,
		    "Unable to allocate DXR lookup table");
		return;
	}
	memcpy(dxr->d, da->d_tbl, d_size);
	dxr->x = ((char *) dxr->d) + d_size;
	memcpy(dxr->x, da->x_tbl, x_size);
	dxr->r = ((char *) dxr->x) + x_size;
	dxr->d_shift = 32 - da->d_bits;
	dxr->x_shift = dxr_x;
	dxr->x_mask = 0xffffffffU >> (32 - dxr_x);
	memcpy(dxr->r, da->range_tbl, r_size);

	if (da->updates_low <= da->updates_high)
		bzero(&da->updates_mask[da->updates_low / 32],
		    (da->updates_high - da->updates_low) / 8 + 1);
	da->updates_low = DIRECT_TBL_SIZE - 1;
	da->updates_high = 0;
	microuptime(&t3);

	FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)",
	    da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops);
	i = dxr_tot_size * 100;
	if (rinfo.num_prefixes)
		i /= rinfo.num_prefixes;
	FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix",
	    dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100,
	    i / 100, i % 100);
	FIB_PRINTF(LOG_INFO, da->fd,
	    "%d.%02d%% trie, %d.%02d%% range fragmentation",
	    trie_frag / 100, trie_frag % 100,
	    range_frag / 100, range_frag % 100);
	i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms",
	    range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
	i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms",
	    trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
	i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms",
	    i / 1000, i % 1000);
}

/*
 * Glue functions for attaching to the FIB_ALGO infrastructure.
 */

static struct nhop_object *
dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key,
    uint32_t scopeid)
{
	struct dxr *dxr = algo_data;

	return (dxr->nh_tbl[dxr_lookup(dxr, ntohl(key.addr4.s_addr))]);
}

static enum flm_op_result
dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data)
{
	struct dxr *old_dxr = old_data;
	struct dxr_aux *da = NULL;
	struct dxr *dxr;

	dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT);
	if (dxr == NULL) {
		FIB_PRINTF(LOG_NOTICE, fd,
		    "Unable to allocate DXR container struct");
		return (FLM_REBUILD);
	}

	/* Check whether we may reuse the old auxiliary structures */
	if (old_dxr != NULL && old_dxr->aux != NULL &&
	    old_dxr->aux->fd == fd) {
		da = old_dxr->aux;
		atomic_add_int(&da->refcnt, 1);
	}

	dxr->aux = da;
	dxr->d = NULL;
	dxr->fd = fd;
	dxr->fibnum = fibnum;
	*data = dxr;

	return (FLM_SUCCESS);
}

static void
dxr_destroy(void *data)
{
	struct dxr *dxr = data;
	struct dxr_aux *da = dxr->aux;
	struct chunk_desc *cdp;
	struct trie_desc *tp;

	free(dxr->d, M_DXRLPM);
	free(dxr, M_DXRAUX);

	if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1)
		return;

	/* Release auxiliary structures */
	while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
		LIST_REMOVE(cdp, cd_all_le);
		uma_zfree(chunk_zone, cdp);
	}
	while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
		LIST_REMOVE(tp, td_all_le);
		uma_zfree(trie_zone, tp);
	}
	free(da->range_tbl, M_DXRAUX);
	free(da->x_tbl, M_DXRAUX);
	free(da, M_DXRAUX);
}

static void
epoch_dxr_destroy(epoch_context_t ctx)
{
	struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx);

	dxr_destroy(dxr);
}

static void *
choose_lookup_fn(struct dxr_aux *da)
{

	switch (da->d_bits) {
#if DXR_TRIE_BITS > 16
	case 16:
		return (dxr_fib_lookup_16);
#endif
	case 15:
		return (dxr_fib_lookup_15);
	case 14:
		return (dxr_fib_lookup_14);
	case 13:
		return (dxr_fib_lookup_13);
	case 12:
		return (dxr_fib_lookup_12);
	case 11:
		return (dxr_fib_lookup_11);
	case 10:
		return (dxr_fib_lookup_10);
	case 9:
		return (dxr_fib_lookup_9);
	}
	return (dxr_fib_lookup);
}

static enum flm_op_result
dxr_dump_end(void *data, struct fib_dp *dp)
{
	struct dxr *dxr = data;
	struct dxr_aux *da;

	dxr_build(dxr);

	da = dxr->aux;
	if (da == NULL || dxr->d == NULL)
		return (FLM_REBUILD);

	if (da->rtbl_top >= BASE_MAX)
		return (FLM_ERROR);

	dp->f = choose_lookup_fn(da);
	dp->arg = dxr;

	return (FLM_SUCCESS);
}

static enum flm_op_result
dxr_dump_rib_item(struct rtentry *rt, void *data)
{

	return (FLM_SUCCESS);
}

static enum flm_op_result
dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *data)
{

	return (FLM_BATCH);
}

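/*
 * Batched RIB change callback: mark the direct table chunks covered by
 * each changed prefix in updates_mask, then fork a new lookup snapshot
 * by rebuilding only the marked chunks, switch the datapath pointer to
 * it, and retire the old snapshot via an epoch callback.
 */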
static enum flm_op_result
dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q,
    void *data)
{
	struct dxr *dxr = data;
	struct dxr *new_dxr;
	struct dxr_aux *da;
	struct fib_dp new_dp;
	enum flm_op_result res;
	uint32_t ip, plen, hmask, start, end, i, ui;
#ifdef INVARIANTS
	struct rib_rtable_info rinfo;
	int update_delta = 0;
#endif

	MPASS(data != NULL);
	MPASS(q != NULL);
	MPASS(q->count < q->size);

	da = dxr->aux;
	MPASS(da != NULL);
	MPASS(da->fd == dxr->fd);
	MPASS(da->refcnt > 0);

	FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count);
	for (ui = 0; ui < q->count; ui++) {
#ifdef INVARIANTS
		if (q->entries[ui].nh_new != NULL)
			update_delta++;
		if (q->entries[ui].nh_old != NULL)
			update_delta--;
#endif
		plen = q->entries[ui].plen;
		ip = ntohl(q->entries[ui].addr4.s_addr);
		if (plen < 32)
			hmask = 0xffffffffU >> plen;
		else
			hmask = 0;
		start = (ip & ~hmask) >> DXR_RANGE_SHIFT;
		end = (ip | hmask) >> DXR_RANGE_SHIFT;

		if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f)
			for (i = start >> 5; i <= end >> 5; i++)
				da->updates_mask[i] = 0xffffffffU;
		else
			for (i = start; i <= end; i++)
				da->updates_mask[i >> 5] |= (1 << (i & 0x1f));
		if (start < da->updates_low)
			da->updates_low = start;
		if (end > da->updates_high)
			da->updates_high = end;
	}

#ifdef INVARIANTS
	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
	MPASS(da->prefixes + update_delta == rinfo.num_prefixes);
#endif

	res = dxr_init(0, dxr->fd, data, (void **) &new_dxr);
	if (res != FLM_SUCCESS)
		return (res);

	dxr_build(new_dxr);

	/* Structural limit exceeded, hard error */
	if (da->rtbl_top >= BASE_MAX) {
		dxr_destroy(new_dxr);
		return (FLM_ERROR);
	}

	if (new_dxr->d == NULL) {
		dxr_destroy(new_dxr);
		return (FLM_REBUILD);
	}

	new_dp.f = choose_lookup_fn(da);
	new_dp.arg = new_dxr;
	if (fib_set_datapath_ptr(dxr->fd, &new_dp)) {
		fib_set_algo_ptr(dxr->fd, new_dxr);
		fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx);
		return (FLM_SUCCESS);
	}

	FIB_PRINTF(LOG_NOTICE, dxr->fd, "fib_set_datapath_ptr() failed");
	dxr_destroy(new_dxr);
	return (FLM_REBUILD);
}

static uint8_t
dxr_get_pref(const struct rib_rtable_info *rinfo)
{

	/* Below bsearch4 up to 10 prefixes. Always supersedes dpdk_lpm4. */
	return (251);
}

SYSCTL_DECL(_net_route_algo);
SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "DXR tunables");

static int
sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS)
{
	char buf[8];
	int error, new, i;

	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
	    V_frag_limit % 100);
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (!isdigit(*buf) && *buf != '.')
		return (EINVAL);
	for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++)
		new = new * 10 + buf[i] - '0';
	new *= 100;
	if (buf[i++] == '.') {
		if (!isdigit(buf[i]))
			return (EINVAL);
		new += (buf[i++] - '0') * 10;
		if (isdigit(buf[i]))
			new += buf[i++] - '0';
	}
	if (new > 1000)
		return (EINVAL);
	V_frag_limit = new;
	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
	    V_frag_limit % 100);
	return (0);
}

SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET,
    0, 0, sysctl_dxr_frag_limit, "A",
    "Fragmentation threshold to full rebuild");

static struct fib_lookup_module fib_dxr_mod = {
	.flm_name = "dxr",
	.flm_family = AF_INET,
	.flm_init_cb = dxr_init,
	.flm_destroy_cb = dxr_destroy,
	.flm_dump_rib_item_cb = dxr_dump_rib_item,
	.flm_dump_end_cb = dxr_dump_end,
	.flm_change_rib_item_cb = dxr_change_rib_item,
	.flm_change_rib_items_cb = dxr_change_rib_batch,
	.flm_get_pref = dxr_get_pref,
};

static int
dxr_modevent(module_t mod, int type, void *unused)
{
	int error;

	switch (type) {
	case MOD_LOAD:
		chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
		trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
		fib_module_register(&fib_dxr_mod);
		return(0);
	case MOD_UNLOAD:
		error = fib_module_unregister(&fib_dxr_mod);
		if (error)
			return (error);
		uma_zdestroy(chunk_zone);
		uma_zdestroy(trie_zone);
		return(0);
	default:
		return(EOPNOTSUPP);
	}
}

static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0};

DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(fib_dxr, 1);