/* flowtable.c — FreeBSD stable/10, SVN revision 262743 */
/*-
 * Copyright (c) 2014 Gleb Smirnoff <glebius@FreeBSD.org>
 * Copyright (c) 2008-2010, BitGravity Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Neither the name of the BitGravity Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "opt_route.h"
#include "opt_mpath.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/net/flowtable.c 262743 2014-03-04 15:14:47Z glebius $");

#include <sys/param.h>
#include <sys/types.h>
#include <sys/bitstring.h>
#include <sys/condvar.h>
#include <sys/callout.h>
#include <sys/hash.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <vm/uma.h>

#include <net/if.h>
#include <net/if_llatbl.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/flowtable.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#ifdef INET6
#include <netinet/ip6.h>
#endif
#ifdef FLOWTABLE_HASH_ALL
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/sctp.h>
#endif

#include <ddb/ddb.h>

/*
 * Flow key layout: with FLOWTABLE_HASH_ALL the key carries both addresses
 * plus the two 16-bit ports; otherwise only the destination address is
 * keyed.  KEY_ADDR_LEN is sized for the larger (IPv6) address when INET6
 * is compiled in, so both families share one flentry layout.
 */
#ifdef FLOWTABLE_HASH_ALL
#define	KEY_PORTS	(sizeof(uint16_t) * 2)
#define	KEY_ADDRS	2
#else
#define	KEY_PORTS	0
#define	KEY_ADDRS	1
#endif

#ifdef INET6
#define	KEY_ADDR_LEN	sizeof(struct in6_addr)
#else
#define	KEY_ADDR_LEN	sizeof(struct in_addr)
#endif

/* Key length in 32-bit words, as consumed by jenkins_hash32(). */
#define	KEYLEN	((KEY_ADDR_LEN * KEY_ADDRS + KEY_PORTS) / sizeof(uint32_t))

/* A single cached flow: key plus referenced route and link-layer entry. */
struct flentry {
	uint32_t		f_hash;		/* hash flowing forward */
	uint32_t		f_key[KEYLEN];	/* address(es and ports) */
	uint32_t		f_uptime;	/* uptime at last access */
	uint16_t		f_fibnum;	/* fib index */
#ifdef FLOWTABLE_HASH_ALL
	uint8_t			f_proto;	/* protocol */
	uint8_t			f_flags;	/* stale? */
#define	FL_STALE	1
#endif
	SLIST_ENTRY(flentry)	f_next;		/* pointer to collision entry */
	struct rtentry		*f_rt;		/* rtentry for flow */
	struct llentry		*f_lle;		/* llentry for flow */
};
#undef KEYLEN

SLIST_HEAD(flist, flentry);
/* Make sure we can use pcpu_zone_ptr for struct flist. */
CTASSERT(sizeof(struct flist) == sizeof(void *));

/* Per-vnet, per-family hash table of per-CPU flow lists. */
struct flowtable {
	counter_u64_t	*ft_stat;
	int		ft_size;
	/*
	 * ft_table is a malloc(9)ed array of pointers.  Pointers point to
	 * memory from UMA_ZONE_PCPU zone.
	 * ft_masks is per-cpu pointer itself.  Each instance points
	 * to a malloc(9)ed bitset, that is private to corresponding CPU.
	 */
	struct flist	**ft_table;
	bitstr_t	**ft_masks;
	bitstr_t	*ft_tmpmask;
};

#define	FLOWSTAT_ADD(ft, name, v)	\
	counter_u64_add((ft)->ft_stat[offsetof(struct flowtable_stat, name) / sizeof(uint64_t)], (v))
#define	FLOWSTAT_INC(ft, name)	FLOWSTAT_ADD(ft, name, 1)

static struct proc *flowcleanerproc;
static uint32_t flow_hashjitter;	/* random seed for the flow hash */

static struct cv	flowclean_f_cv;
static struct cv	flowclean_c_cv;
static struct mtx	flowclean_lock;
static uint32_t		flowclean_cycles;

/*
 * TODO:
 * - add sysctls to resize && flush flow tables
 * - Add per flowtable sysctls for statistics and configuring timeouts
 * - add saturation counter to rtentry to support per-packet load-balancing
 *   add flag to indicate round-robin flow, add list lookup from head
 *   for flows
 * - add sysctl / device node / syscall to support exporting and importing
 *   of flows with flag to indicate that a flow was imported so should
 *   not be considered for auto-cleaning
 * - support explicit connection state (currently only ad-hoc for DSR)
 * - idetach() cleanup for options VIMAGE builds.
 */
#ifdef INET
static VNET_DEFINE(struct flowtable, ip4_ft);
#define	V_ip4_ft	VNET(ip4_ft)
#endif
#ifdef INET6
static VNET_DEFINE(struct flowtable, ip6_ft);
#define	V_ip6_ft	VNET(ip6_ft)
#endif

static uma_zone_t	flow_zone;

static VNET_DEFINE(int, flowtable_enable) = 1;
#define	V_flowtable_enable	VNET(flowtable_enable)

static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL,
    "flowtable");
SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_RW,
    &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
SYSCTL_UMA_MAX(_net_flowtable, OID_AUTO, maxflows, CTLFLAG_RW,
    &flow_zone, "Maximum number of flows allowed");

static MALLOC_DEFINE(M_FTABLE, "flowtable", "flowtable hashes and bitstrings");

static struct flentry *
flowtable_lookup_common(struct flowtable *, uint32_t *, int, uint32_t);

#ifdef INET
/*
 * Build an IPv4 flow key from the packet in m and look the flow up,
 * filling in ro->ro_dst on success.  Returns the flow entry, or NULL
 * when the packet is not cacheable or no flow could be found/created.
 */
static struct flentry *
flowtable_lookup_ipv4(struct mbuf *m, struct route *ro)
{
	struct flentry *fle;
	struct sockaddr_in *sin;
	struct ip *ip;
	uint32_t fibnum;
#ifdef FLOWTABLE_HASH_ALL
	uint32_t key[3];
	int iphlen;
	uint16_t sport, dport;
	uint8_t proto;
#endif

	ip = mtod(m, struct ip *);

	/* Self-addressed and loopback-net traffic is never cached. */
	if (ip->ip_src.s_addr == ip->ip_dst.s_addr ||
	    (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
		return (NULL);

	fibnum = M_GETFIB(m);

#ifdef FLOWTABLE_HASH_ALL
	iphlen = ip->ip_hl << 2;
	proto = ip->ip_p;

	switch (proto) {
	case IPPROTO_TCP: {
		struct tcphdr *th;

		th = (struct tcphdr *)((char *)ip + iphlen);
		sport = th->th_sport;
		dport = th->th_dport;
		/* RST/FIN marks the flow stale via the top byte of fibnum. */
		if (th->th_flags & (TH_RST|TH_FIN))
			fibnum |= (FL_STALE << 24);
		break;
	}
	case IPPROTO_UDP: {
		struct udphdr *uh;

		uh = (struct udphdr *)((char *)ip + iphlen);
		sport = uh->uh_sport;
		dport = uh->uh_dport;
		break;
	}
	case IPPROTO_SCTP: {
		struct sctphdr *sh;

		sh = (struct sctphdr *)((char *)ip + iphlen);
		sport = sh->src_port;
		dport = sh->dest_port;
		/* XXXGL: handle stale? */
		break;
	}
	default:
		sport = dport = 0;
		break;
	}

	key[0] = ip->ip_dst.s_addr;
	key[1] = ip->ip_src.s_addr;
	key[2] = (dport << 16) | sport;
	/* The protocol travels in bits 16-23 of the fibnum argument. */
	fibnum |= proto << 16;

	fle = flowtable_lookup_common(&V_ip4_ft, key, 3 * sizeof(uint32_t),
	    fibnum);

#else	/* !FLOWTABLE_HASH_ALL */

	fle = flowtable_lookup_common(&V_ip4_ft, (uint32_t *)&ip->ip_dst,
	    sizeof(struct in_addr), fibnum);

#endif	/* FLOWTABLE_HASH_ALL */

	if (fle == NULL)
		return (NULL);

	sin = (struct sockaddr_in *)&ro->ro_dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr = ip->ip_dst;

	return (fle);
}
#endif /* INET */

#ifdef INET6
/*
 * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
 * then it sets p to point at the offset "len" in the mbuf. WARNING: the
 * pointer might become stale after other pullups (but we never use it
 * this way).
 */
#define	PULLUP_TO(_len, p, T)						\
do {									\
	int x = (_len) + sizeof(T);					\
	if ((m)->m_len < x)						\
		return (NULL);						\
	p = (mtod(m, char *) + (_len));					\
} while (0)

#define	TCP(p)		((struct tcphdr *)(p))
#define	SCTP(p)		((struct sctphdr *)(p))
#define	UDP(p)		((struct udphdr *)(p))

/*
 * IPv6 counterpart of flowtable_lookup_ipv4(): walk the extension header
 * chain to the upper-layer header, build the flow key and look it up,
 * filling in ro->ro_dst on success.
 */
static struct flentry *
flowtable_lookup_ipv6(struct mbuf *m, struct route *ro)
{
	struct flentry *fle;
	struct sockaddr_in6 *sin6;
	struct ip6_hdr *ip6;
	uint32_t fibnum;
#ifdef FLOWTABLE_HASH_ALL
	uint32_t key[9];
	void *ulp;
	int hlen;
	uint16_t sport, dport;
	u_short offset;
	uint8_t proto;
#else
	uint32_t key[4];
#endif

	ip6 = mtod(m, struct ip6_hdr *);
	if (in6_localaddr(&ip6->ip6_dst))
		return (NULL);

	fibnum = M_GETFIB(m);

#ifdef FLOWTABLE_HASH_ALL
	hlen = sizeof(struct ip6_hdr);
	proto = ip6->ip6_nxt;
	offset = sport = dport = 0;
	ulp = NULL;
	/* Walk the extension headers until an upper-layer header is found. */
	while (ulp == NULL) {
		switch (proto) {
		case IPPROTO_ICMPV6:
		case IPPROTO_OSPFIGP:
		case IPPROTO_PIM:
		case IPPROTO_CARP:
		case IPPROTO_ESP:
		case IPPROTO_NONE:
			ulp = ip6;
			break;
		case IPPROTO_TCP:
			PULLUP_TO(hlen, ulp, struct tcphdr);
			dport = TCP(ulp)->th_dport;
			sport = TCP(ulp)->th_sport;
			if (TCP(ulp)->th_flags & (TH_RST|TH_FIN))
				fibnum |= (FL_STALE << 24);
			break;
		case IPPROTO_SCTP:
			PULLUP_TO(hlen, ulp, struct sctphdr);
			/*
			 * NOTE(review): dport is taken from src_port and
			 * sport from dest_port here — the opposite of the
			 * IPv4 SCTP case; confirm whether this swap is
			 * intentional.
			 */
			dport = SCTP(ulp)->src_port;
			sport = SCTP(ulp)->dest_port;
			/* XXXGL: handle stale? */
			break;
		case IPPROTO_UDP:
			PULLUP_TO(hlen, ulp, struct udphdr);
			dport = UDP(ulp)->uh_dport;
			sport = UDP(ulp)->uh_sport;
			break;
		case IPPROTO_HOPOPTS:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_hbh);
			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
			ulp = NULL;
			break;
		case IPPROTO_ROUTING:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_rthdr);
			hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
			proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
			ulp = NULL;
			break;
		case IPPROTO_FRAGMENT:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_frag);
			hlen += sizeof (struct ip6_frag);
			proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
			offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
			    IP6F_OFF_MASK;
			ulp = NULL;
			break;
		case IPPROTO_DSTOPTS:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_hbh);
			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
			ulp = NULL;
			break;
		case IPPROTO_AH:	/* RFC 2402 */
			PULLUP_TO(hlen, ulp, struct ip6_ext);
			hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
			proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
			ulp = NULL;
			break;
		default:
			PULLUP_TO(hlen, ulp, struct ip6_ext);
			break;
		}
	}

	bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
	bcopy(&ip6->ip6_src, &key[4], sizeof(struct in6_addr));
	key[8] = (dport << 16) | sport;
	fibnum |= proto << 16;

	fle = flowtable_lookup_common(&V_ip6_ft, key, 9 * sizeof(uint32_t),
	    fibnum);
#else	/* !FLOWTABLE_HASH_ALL */
	bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
	fle = flowtable_lookup_common(&V_ip6_ft, key, sizeof(struct in6_addr),
	    fibnum);
#endif	/* FLOWTABLE_HASH_ALL */

	if (fle == NULL)
		return (NULL);

	sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
	sin6->sin6_family = AF_INET6;
	sin6->sin6_len = sizeof(*sin6);
	bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(struct in6_addr));

	return (fle);
}
#endif /* INET6 */

/* Return the current CPU's bucket-occupancy bitmask for ft. */
static bitstr_t *
flowtable_mask(struct flowtable *ft)
{

	/*
	 * flowtable_free_stale() calls w/o critical section, but
	 * with sched_bind(). Since pointer is stable throughout
	 * ft lifetime, it is safe, otherwise...
	 *
	 * CRITICAL_ASSERT(curthread);
	 */

	return (*(bitstr_t **)zpcpu_get(ft->ft_masks));
}

/* Return the current CPU's collision list for the bucket of hash. */
static struct flist *
flowtable_list(struct flowtable *ft, uint32_t hash)
{

	CRITICAL_ASSERT(curthread);
	return (zpcpu_get(ft->ft_table[hash % ft->ft_size]));
}

/*
 * Is this flow entry dead (route down/gone, link down, llentry invalid),
 * idle for longer than maxidle seconds, or explicitly marked stale?
 */
static int
flow_stale(struct flowtable *ft, struct flentry *fle, int maxidle)
{

	if (((fle->f_rt->rt_flags & RTF_HOST) &&
	    ((fle->f_rt->rt_flags & (RTF_UP)) != (RTF_UP))) ||
	    (fle->f_rt->rt_ifp == NULL) ||
	    !RT_LINK_IS_UP(fle->f_rt->rt_ifp) ||
	    (fle->f_lle->la_flags & LLE_VALID) == 0)
		return (1);

	if (time_uptime - fle->f_uptime > maxidle)
		return (1);

#ifdef FLOWTABLE_HASH_ALL
	if (fle->f_flags & FL_STALE)
		return (1);
#endif

	return (0);
}

/* Is the flow zone above 7/8 of its configured maximum? */
static int
flow_full(void)
{
	int count, max;

	count = uma_zone_get_cur(flow_zone);
	max = uma_zone_get_max(flow_zone);

	return (count > (max - (max >> 3)));
}

/*
 * Does fle match the given key/fib (and protocol, when hashing all
 * fields) and does it still reference a usable route and llentry?
 */
static int
flow_matches(struct flentry *fle, uint32_t *key, int keylen, uint32_t fibnum)
{
#ifdef FLOWTABLE_HASH_ALL
	uint8_t proto;

	proto = (fibnum >> 16) & 0xff;
	fibnum &= 0xffff;
#endif

	CRITICAL_ASSERT(curthread);

	/* Microoptimization for IPv4: don't use bcmp().
	 */
	/*
	 * NOTE(review): for the single-word (IPv4) fast path the first
	 * disjunct is true when the key words DIFFER; combined with '||'
	 * this accepts a mismatched key without the bcmp() check.  Verify
	 * against upstream — '==' looks like the intended comparison.
	 */
	if (((keylen == sizeof(uint32_t) && (fle->f_key[0] != key[0])) ||
	    (bcmp(fle->f_key, key, keylen) == 0)) &&
	    fibnum == fle->f_fibnum &&
#ifdef FLOWTABLE_HASH_ALL
	    proto == fle->f_proto &&
#endif
	    (fle->f_rt->rt_flags & RTF_UP) &&
	    fle->f_rt->rt_ifp != NULL &&
	    (fle->f_lle->la_flags & LLE_VALID))
		return (1);

	return (0);
}

/*
 * Allocate a new flow entry for key/fibnum0 (protocol and stale flag are
 * packed into the upper bits of fibnum0), resolve its route and llentry,
 * and insert it into the current CPU's bucket list.  Returns the new
 * entry — or an identical entry racing us on this CPU — with route and
 * llentry references held, or NULL on any failure.
 */
static struct flentry *
flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
    int keylen, uint32_t fibnum0)
{
#ifdef INET6
	struct route_in6 sro6;
#endif
#ifdef INET
	struct route sro;
#endif
	struct route *ro = NULL;
	struct rtentry *rt;
	struct lltable *lt = NULL;
	struct llentry *lle;
	struct sockaddr_storage *l3addr;
	struct ifnet *ifp;
	struct flist *flist;
	struct flentry *fle, *iter;
	bitstr_t *mask;
	uint16_t fibnum = fibnum0;
#ifdef FLOWTABLE_HASH_ALL
	uint8_t proto;

	proto = (fibnum0 >> 16) & 0xff;
	fibnum = fibnum0 & 0xffff;
#endif

	/*
	 * This bit of code ends up locking the
	 * same route 3 times (just like ip_output + ether_output)
	 * - at lookup
	 * - in rt_check when called by arpresolve
	 * - dropping the refcount for the rtentry
	 *
	 * This could be consolidated to one if we wrote a variant
	 * of arpresolve with an rt_check variant that expected to
	 * receive the route locked
	 */
#ifdef INET
	if (ft == &V_ip4_ft) {
		struct sockaddr_in *sin;

		ro = &sro;
		bzero(&sro.ro_dst, sizeof(sro.ro_dst));

		sin = (struct sockaddr_in *)&sro.ro_dst;
		sin->sin_family = AF_INET;
		sin->sin_len = sizeof(*sin);
		sin->sin_addr.s_addr = key[0];
	}
#endif
#ifdef INET6
	if (ft == &V_ip6_ft) {
		struct sockaddr_in6 *sin6;

		ro = (struct route *)&sro6;
		sin6 = &sro6.ro_dst;

		bzero(sin6, sizeof(*sin6));
		sin6->sin6_family = AF_INET6;
		sin6->sin6_len = sizeof(*sin6);
		bcopy(key, &sin6->sin6_addr, sizeof(struct in6_addr));
	}
#endif

	ro->ro_rt = NULL;
#ifdef RADIX_MPATH
	rtalloc_mpath_fib(ro, hash, fibnum);
#else
	rtalloc_ign_fib(ro, 0, fibnum);
#endif
	if (ro->ro_rt == NULL)
		return (NULL);

	rt = ro->ro_rt;
	ifp = rt->rt_ifp;

	/* The flowtable does not cache p2p or loopback destinations. */
	if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
		RTFREE(rt);
		return (NULL);
	}

#ifdef INET
	if (ft == &V_ip4_ft)
		lt = LLTABLE(ifp);
#endif
#ifdef INET6
	if (ft == &V_ip6_ft)
		lt = LLTABLE6(ifp);
#endif

	if (rt->rt_flags & RTF_GATEWAY)
		l3addr = (struct sockaddr_storage *)rt->rt_gateway;
	else
		l3addr = (struct sockaddr_storage *)&ro->ro_dst;
	lle = llentry_alloc(ifp, lt, l3addr);

	if (lle == NULL) {
		RTFREE(rt);
		return (NULL);
	}

	/* Don't insert the entry if the ARP hasn't yet finished resolving. */
	if ((lle->la_flags & LLE_VALID) == 0) {
		RTFREE(rt);
		LLE_FREE(lle);
		FLOWSTAT_INC(ft, ft_fail_lle_invalid);
		return (NULL);
	}

	fle = uma_zalloc(flow_zone, M_NOWAIT | M_ZERO);
	if (fle == NULL) {
		RTFREE(rt);
		LLE_FREE(lle);
		return (NULL);
	}

	fle->f_hash = hash;
	bcopy(key, &fle->f_key, keylen);
	fle->f_rt = rt;
	fle->f_lle = lle;
	fle->f_fibnum = fibnum;
	fle->f_uptime = time_uptime;
#ifdef FLOWTABLE_HASH_ALL
	fle->f_proto = proto;
	fle->f_flags = fibnum0 >> 24;
#endif

	critical_enter();
	mask = flowtable_mask(ft);
	flist = flowtable_list(ft, hash);

	if (SLIST_EMPTY(flist)) {
		bit_set(mask, (hash % ft->ft_size));
		SLIST_INSERT_HEAD(flist, fle, f_next);
		goto skip;
	}

	/*
	 * find end of list and make sure that we were not
	 * preempted by another thread handling this flow
	 */
	SLIST_FOREACH(iter, flist, f_next) {
		KASSERT(iter->f_hash % ft->ft_size == hash % ft->ft_size,
		    ("%s: wrong hash", __func__));
		if (flow_matches(iter, key, keylen, fibnum)) {
			/*
			 * We probably
			 * migrated to an other CPU after
			 * lookup in flowtable_lookup_common() failed.
			 * It appeared that this CPU already has flow
			 * entry.
			 */
			iter->f_uptime = time_uptime;
#ifdef FLOWTABLE_HASH_ALL
			iter->f_flags |= fibnum >> 24;
#endif
			critical_exit();
			FLOWSTAT_INC(ft, ft_collisions);
			uma_zfree(flow_zone, fle);
			return (iter);
		}
	}

	SLIST_INSERT_HEAD(flist, fle, f_next);
skip:
	critical_exit();
	FLOWSTAT_INC(ft, ft_inserts);

	return (fle);
}

/*
 * Public entry point: look up (or create) a flow for the packet in m and
 * fill ro with borrowed route/llentry references (RT_NORTREF).  Returns
 * 0 on success, ENXIO when the flowtable is disabled, EHOSTUNREACH when
 * no flow could be found or created.
 */
int
flowtable_lookup(sa_family_t sa, struct mbuf *m, struct route *ro)
{
	struct flentry *fle;

	if (V_flowtable_enable == 0)
		return (ENXIO);

	switch (sa) {
#ifdef INET
	case AF_INET:
		fle = flowtable_lookup_ipv4(m, ro);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		fle = flowtable_lookup_ipv6(m, ro);
		break;
#endif
	default:
		panic("%s: sa %d", __func__, sa);
	}

	if (fle == NULL)
		return (EHOSTUNREACH);

	/* Stamp the packet with the flow id if it doesn't have one yet. */
	if (!(m->m_flags & M_FLOWID)) {
		m->m_flags |= M_FLOWID;
		m->m_pkthdr.flowid = fle->f_hash;
	}

	ro->ro_rt = fle->f_rt;
	ro->ro_lle = fle->f_lle;
	ro->ro_flags |= RT_NORTREF;

	return (0);
}

/*
 * Hash the key and scan the current CPU's bucket; on a hit refresh the
 * entry's timestamp, on a miss fall through to flowtable_insert().
 */
static struct flentry *
flowtable_lookup_common(struct flowtable *ft, uint32_t *key, int keylen,
    uint32_t fibnum)
{
	struct flist *flist;
	struct flentry *fle;
	uint32_t hash;

	FLOWSTAT_INC(ft, ft_lookups);

	hash = jenkins_hash32(key, keylen / sizeof(uint32_t), flow_hashjitter);

	critical_enter();
	flist = flowtable_list(ft, hash);
	SLIST_FOREACH(fle, flist, f_next) {
		KASSERT(fle->f_hash % ft->ft_size == hash % ft->ft_size,
		    ("%s: wrong hash", __func__));
		if (flow_matches(fle, key, keylen, fibnum)) {
			fle->f_uptime = time_uptime;
#ifdef FLOWTABLE_HASH_ALL
			fle->f_flags |= fibnum >> 24;
#endif
			critical_exit();
			FLOWSTAT_INC(ft, ft_hits);
			return (fle);
		}
	}
	critical_exit();

	FLOWSTAT_INC(ft, ft_misses);

	return (flowtable_insert(ft, hash, key, keylen, fibnum));
}

/*
 * used by the bit_alloc macro
 */
#define	calloc(count, size) malloc((count)*(size), M_FTABLE, M_WAITOK | M_ZERO)
/* Allocate the bucket array, per-CPU bucket lists and per-CPU bitmasks. */
static void
flowtable_alloc(struct flowtable *ft)
{

	ft->ft_table = malloc(ft->ft_size * sizeof(struct flist),
	    M_FTABLE, M_WAITOK);
	for (int i = 0; i < ft->ft_size; i++)
		ft->ft_table[i] = uma_zalloc(pcpu_zone_ptr, M_WAITOK | M_ZERO);

	ft->ft_masks = uma_zalloc(pcpu_zone_ptr, M_WAITOK);
	for (int i = 0; i < mp_ncpus; i++) {
		bitstr_t **b;

		b = zpcpu_get_cpu(ft->ft_masks, i);
		*b = bit_alloc(ft->ft_size);
	}
	ft->ft_tmpmask = bit_alloc(ft->ft_size);
}
#undef calloc

/*
 * Walk the current CPU's occupied buckets and free entries that are
 * stale, or that reference rt when rt != NULL.  Must run bound to the
 * CPU whose lists are scanned (see flowtable_clean_vnet()).
 */
static void
flowtable_free_stale(struct flowtable *ft, struct rtentry *rt, int maxidle)
{
	struct flist *flist, freelist;
	struct flentry *fle, *fle1, *fleprev;
	bitstr_t *mask, *tmpmask;
	int curbit, tmpsize;

	SLIST_INIT(&freelist);
	mask = flowtable_mask(ft);
	tmpmask = ft->ft_tmpmask;
	tmpsize = ft->ft_size;
	memcpy(tmpmask, mask, ft->ft_size/8);
	curbit = 0;
	/*
	 * XXX Note to self, bit_ffs operates at the byte level
	 * and thus adds gratuitous overhead
	 */
	bit_ffs(tmpmask, ft->ft_size, &curbit);
	while (curbit != -1) {
		if (curbit >= ft->ft_size || curbit < -1) {
			log(LOG_ALERT,
			    "warning: bad curbit value %d \n",
			    curbit);
			break;
		}

		FLOWSTAT_INC(ft, ft_free_checks);

		critical_enter();
		flist = flowtable_list(ft, curbit);
#ifdef DIAGNOSTIC
		if (SLIST_EMPTY(flist) && curbit > 0) {
			log(LOG_ALERT,
			    "warning bit=%d set, but no fle found\n",
			    curbit);
		}
#endif
		/* Unlink stale entries; defer the actual free past the
		 * critical section by collecting them on freelist. */
		SLIST_FOREACH_SAFE(fle, flist, f_next, fle1) {
			if (rt != NULL && fle->f_rt != rt) {
				fleprev = fle;
				continue;
			}
			if (!flow_stale(ft, fle, maxidle)) {
				fleprev = fle;
				continue;
			}

			if (fle == SLIST_FIRST(flist))
				SLIST_REMOVE_HEAD(flist, f_next);
			else
				SLIST_REMOVE_AFTER(fleprev, f_next);
			SLIST_INSERT_HEAD(&freelist, fle, f_next);
		}
		if (SLIST_EMPTY(flist))
			bit_clear(mask, curbit);
		critical_exit();

		bit_clear(tmpmask, curbit);
		tmpmask += (curbit / 8);
		tmpsize -= (curbit / 8) * 8;
		bit_ffs(tmpmask, tmpsize, &curbit);
	}

	SLIST_FOREACH_SAFE(fle, &freelist, f_next, fle1) {
		FLOWSTAT_INC(ft, ft_frees);
		if (fle->f_rt != NULL)
			RTFREE(fle->f_rt);
		if (fle->f_lle != NULL)
			LLE_FREE(fle->f_lle);
		uma_zfree(flow_zone, fle);
	}
}

/*
 * Run flowtable_free_stale() once per CPU, binding the thread to each
 * CPU in turn so the per-CPU lists are accessed from their owner.
 */
static void
flowtable_clean_vnet(struct flowtable *ft, struct rtentry *rt, int maxidle)
{
	int i;

	CPU_FOREACH(i) {
		if (smp_started == 1) {
			thread_lock(curthread);
			sched_bind(curthread, i);
			thread_unlock(curthread);
		}

		flowtable_free_stale(ft, rt, maxidle);

		if (smp_started == 1) {
			thread_lock(curthread);
			sched_unbind(curthread);
			thread_unlock(curthread);
		}
	}
}

/* Drop every cached flow that references rt (called on route deletion). */
void
flowtable_route_flush(sa_family_t sa, struct rtentry *rt)
{
	struct flowtable *ft;

	switch (sa) {
#ifdef INET
	case AF_INET:
		ft = &V_ip4_ft;
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ft = &V_ip6_ft;
		break;
#endif
	default:
		panic("%s: sa %d", __func__, sa);
	}

	flowtable_clean_vnet(ft, rt, 0);
}

/* Kernel process: periodically sweep stale flows in every vnet. */
static void
flowtable_cleaner(void)
{
	VNET_ITERATOR_DECL(vnet_iter);
	struct thread *td;

	if (bootverbose)
		log(LOG_INFO, "flowtable cleaner started\n");
	td = curthread;
	while (1) {
		uint32_t flowclean_freq, maxidle;

		/*
		 * The maximum idle time, as well as frequency are arbitrary.
		 */
		if (flow_full())
			maxidle = 5;
		else
			maxidle = 30;

		VNET_LIST_RLOCK();
		VNET_FOREACH(vnet_iter) {
			CURVNET_SET(vnet_iter);
#ifdef INET
			flowtable_clean_vnet(&V_ip4_ft, NULL, maxidle);
#endif
#ifdef INET6
			flowtable_clean_vnet(&V_ip6_ft, NULL, maxidle);
#endif
			CURVNET_RESTORE();
		}
		VNET_LIST_RUNLOCK();

		/* Sweep more aggressively when the zone is nearly full. */
		if (flow_full())
			flowclean_freq = 4*hz;
		else
			flowclean_freq = 20*hz;
		mtx_lock(&flowclean_lock);
		thread_lock(td);
		sched_prio(td, PPAUSE);
		thread_unlock(td);
		flowclean_cycles++;
		/* Wake up any flowtable_flush() waiters. */
		cv_broadcast(&flowclean_f_cv);
		cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq);
		mtx_unlock(&flowclean_lock);
	}
}

/*
 * Kick the cleaner and wait until it has completed a full cycle
 * (registered below for interface departure events).
 */
static void
flowtable_flush(void *unused __unused)
{
	uint64_t start;

	mtx_lock(&flowclean_lock);
	start = flowclean_cycles;
	while (start == flowclean_cycles) {
		cv_broadcast(&flowclean_c_cv);
		cv_wait(&flowclean_f_cv, &flowclean_lock);
	}
	mtx_unlock(&flowclean_lock);
}

static struct kproc_desc flow_kp = {
	"flowcleaner",
	flowtable_cleaner,
	&flowcleanerproc
};
SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);

/*
 * Fetch the table size from the given tunable, clamping to a minimum of
 * 256 and forcing a power of two; default scales with maxusers.
 */
static int
flowtable_get_size(char *name)
{
	int size;

	if (TUNABLE_INT_FETCH(name, &size)) {
		if (size < 256)
			size = 256;
		if (!powerof2(size)) {
			printf("%s must be power of 2\n", name);
			size = 2048;
		}
	} else {
		/*
		 * round up to the next power of 2
		 */
		size = 1 << fls((1024 + maxusers * 64) - 1);
	}

	return (size);
}

/* Global (non-vnet) initialization: zone, seed, cleaner synchronization. */
static void
flowtable_init(const void *unused __unused)
{

	flow_hashjitter = arc4random();

	flow_zone = uma_zcreate("flows", sizeof(struct flentry),
	    NULL, NULL, NULL, NULL, (64-1), UMA_ZONE_MAXBUCKET);
	uma_zone_set_max(flow_zone, 1024 + maxusers * 64 * mp_ncpus);

	cv_init(&flowclean_c_cv, "c_flowcleanwait");
	cv_init(&flowclean_f_cv, "f_flowcleanwait");
	mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
	EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
	    EVENTHANDLER_PRI_ANY);
}
SYSINIT(flowtable_init, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
    flowtable_init, NULL);

#ifdef INET
static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip4, CTLFLAG_RD, NULL,
    "Flowtable for IPv4");

static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip4_ftstat);
VNET_PCPUSTAT_SYSINIT(ip4_ftstat);
VNET_PCPUSTAT_SYSUNINIT(ip4_ftstat);
SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip4, OID_AUTO, stat, struct flowtable_stat,
    ip4_ftstat, "Flowtable statistics for IPv4 "
    "(struct flowtable_stat, net/flowtable.h)");

/* Per-vnet IPv4 table setup. */
static void
flowtable_init_vnet_v4(const void *unused __unused)
{

	V_ip4_ft.ft_size = flowtable_get_size("net.flowtable.ip4.size");
	V_ip4_ft.ft_stat = VNET(ip4_ftstat);
	flowtable_alloc(&V_ip4_ft);
}
VNET_SYSINIT(ft_vnet_v4, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
    flowtable_init_vnet_v4, NULL);
#endif /* INET */

#ifdef INET6
static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip6, CTLFLAG_RD, NULL,
    "Flowtable for IPv6");

static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip6_ftstat);
VNET_PCPUSTAT_SYSINIT(ip6_ftstat);
VNET_PCPUSTAT_SYSUNINIT(ip6_ftstat);
SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip6, OID_AUTO, stat, struct flowtable_stat,
    ip6_ftstat, "Flowtable statistics for IPv6 "
    "(struct flowtable_stat, net/flowtable.h)");

/* Per-vnet IPv6 table setup. */
static void
flowtable_init_vnet_v6(const void *unused __unused)
{

	V_ip6_ft.ft_size = flowtable_get_size("net.flowtable.ip6.size");
	V_ip6_ft.ft_stat = VNET(ip6_ftstat);
	flowtable_alloc(&V_ip6_ft);
}
VNET_SYSINIT(flowtable_init_vnet_v6, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
    flowtable_init_vnet_v6, NULL);
#endif /* INET6 */

#ifdef DDB
/* DDB helper: fetch a specific CPU's occupancy mask. */
static bitstr_t *
flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
{

	return (zpcpu_get_cpu(*ft->ft_masks, cpuid));
}

/* DDB helper: fetch a specific CPU's bucket list for hash. */
static struct flist *
flowtable_list_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
{

	return (zpcpu_get_cpu(&ft->ft_table[hash % ft->ft_size], cpuid));
}

/* Print one flow entry to the debugger console. */
static void
flow_show(struct flowtable *ft, struct flentry *fle)
{
	int idle_time;
	int rt_valid, ifp_valid;
	volatile struct rtentry *rt;
	struct ifnet *ifp = NULL;
	uint32_t *hashkey = fle->f_key;

	idle_time = (int)(time_uptime - fle->f_uptime);
	rt = fle->f_rt;
	rt_valid = rt != NULL;
	if (rt_valid)
		ifp = rt->rt_ifp;
	ifp_valid = ifp != NULL;

#ifdef INET
	if (ft == &V_ip4_ft) {
		char daddr[4*sizeof "123"];
#ifdef FLOWTABLE_HASH_ALL
		char saddr[4*sizeof "123"];
		uint16_t sport, dport;
#endif

		inet_ntoa_r(*(struct in_addr *) &hashkey[0], daddr);
#ifdef FLOWTABLE_HASH_ALL
		inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
		dport = ntohs((uint16_t)(hashkey[2] >> 16));
		sport = ntohs((uint16_t)(hashkey[2] & 0xffff));
		db_printf("%s:%d->%s:%d", saddr, sport, daddr, dport);
#else
		db_printf("%s ", daddr);
#endif
	}
#endif /* INET */
#ifdef INET6
	if (ft == &V_ip6_ft) {
#ifdef FLOWTABLE_HASH_ALL
		db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
		    hashkey[0], hashkey[1], hashkey[2],
		    hashkey[3], hashkey[4], hashkey[5],
		    hashkey[6], hashkey[7], hashkey[8]);
#else
		db_printf("\n\tkey=%08x:%08x:%08x ",
		    hashkey[0], hashkey[1], hashkey[2]);
#endif
	}
#endif /* INET6 */

	db_printf("hash=%08x idle_time=%03d"
	    "\n\tfibnum=%02d rt=%p",
	    fle->f_hash, idle_time, fle->f_fibnum, fle->f_rt);

#ifdef FLOWTABLE_HASH_ALL
	if (fle->f_flags & FL_STALE)
		db_printf(" FL_STALE ");
#endif
	if (rt_valid) {
		if (rt->rt_flags & RTF_UP)
			db_printf(" RTF_UP ");
	}
	if (ifp_valid) {
		if (ifp->if_flags & IFF_LOOPBACK)
			db_printf(" IFF_LOOPBACK ");
		if (ifp->if_flags & IFF_UP)
			db_printf(" IFF_UP ");
		if (ifp->if_flags & IFF_POINTOPOINT)
			db_printf(" IFF_POINTOPOINT ");
	}
	db_printf("\n");
}

/* Dump every occupied bucket of ft for one CPU. */
static void
flowtable_show(struct flowtable *ft, int cpuid)
{
	int curbit = 0;
	bitstr_t *mask, *tmpmask;

	if (cpuid != -1)
		db_printf("cpu: %d\n", cpuid);
	mask = flowtable_mask_pcpu(ft, cpuid);
	tmpmask = ft->ft_tmpmask;
	memcpy(tmpmask, mask, ft->ft_size/8);
	/*
	 * XXX Note to self, bit_ffs operates at the byte level
	 * and thus adds gratuitous overhead
	 */
	bit_ffs(tmpmask, ft->ft_size, &curbit);
	while (curbit != -1) {
		struct flist *flist;
		struct flentry *fle;

		if (curbit >= ft->ft_size || curbit < -1) {
			db_printf("warning: bad curbit value %d \n",
			    curbit);
			break;
		}

		flist = flowtable_list_pcpu(ft, curbit, cpuid);

		SLIST_FOREACH(fle, flist, f_next)
			flow_show(ft, fle);
		bit_clear(tmpmask, curbit);
		bit_ffs(tmpmask, ft->ft_size, &curbit);
	}
}

/* Dump ft for every CPU of the current vnet. */
static void
flowtable_show_vnet(struct flowtable *ft)
{

	int i;

	CPU_FOREACH(i)
		flowtable_show(ft, i);
}

DB_SHOW_COMMAND(flowtables, db_show_flowtables)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
#ifdef VIMAGE
		db_printf("vnet %p\n", vnet_iter);
#endif
#ifdef INET
		printf("IPv4:\n");
		flowtable_show_vnet(&V_ip4_ft);
#endif
#ifdef INET6
		printf("IPv6:\n");
		flowtable_show_vnet(&V_ip6_ft);
#endif
		CURVNET_RESTORE();
	}
}
#endif