/* flowtable.c — FreeBSD stable/10, SVN revision 281955 */
1/*- 2 * Copyright (c) 2014 Gleb Smirnoff <glebius@FreeBSD.org> 3 * Copyright (c) 2008-2010, BitGravity Inc. 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, 10 * this list of conditions and the following disclaimer. 11 * 12 * 2. Neither the name of the BitGravity Corporation nor the names of its 13 * contributors may be used to endorse or promote products derived from 14 * this software without specific prior written permission. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 * POSSIBILITY OF SUCH DAMAGE. 
 */

#include "opt_route.h"
#include "opt_mpath.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/net/flowtable.c 281955 2015-04-24 23:26:44Z hiren $");

#include <sys/param.h>
#include <sys/types.h>
#include <sys/bitstring.h>
#include <sys/condvar.h>
#include <sys/callout.h>
#include <sys/hash.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <vm/uma.h>

#include <net/if.h>
#include <net/if_llatbl.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/flowtable.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#ifdef INET6
#include <netinet/ip6.h>
#endif
#ifdef FLOWTABLE_HASH_ALL
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/sctp.h>
#endif

#include <ddb/ddb.h>

/*
 * Flow key layout: with FLOWTABLE_HASH_ALL the key covers both addresses
 * and both ports; otherwise only the destination address is keyed.  Key
 * slots are sized for the larger (IPv6) address when INET6 is compiled in.
 */
#ifdef FLOWTABLE_HASH_ALL
#define	KEY_PORTS	(sizeof(uint16_t) * 2)
#define	KEY_ADDRS	2
#else
#define	KEY_PORTS	0
#define	KEY_ADDRS	1
#endif

#ifdef INET6
#define	KEY_ADDR_LEN	sizeof(struct in6_addr)
#else
#define	KEY_ADDR_LEN	sizeof(struct in_addr)
#endif

/* Number of uint32_t words in a flow key. */
#define	KEYLEN	((KEY_ADDR_LEN * KEY_ADDRS + KEY_PORTS) / sizeof(uint32_t))

/*
 * A single cached flow: the key it was hashed under, the cached routing
 * and link-layer state, and liveness data used by the cleaner.
 */
struct flentry {
	uint32_t		f_hash;		/* hash flowing forward */
	uint32_t		f_key[KEYLEN];	/* address(es and ports) */
	uint32_t		f_uptime;	/* uptime at last access */
	uint16_t		f_fibnum;	/* fib index */
#ifdef FLOWTABLE_HASH_ALL
	uint8_t			f_proto;	/* protocol */
	uint8_t			f_flags;	/* stale? */
#define	FL_STALE	1
#endif
	SLIST_ENTRY(flentry)	f_next;		/* pointer to collision entry */
	struct rtentry		*f_rt;		/* rtentry for flow */
	struct llentry		*f_lle;		/* llentry for flow */
};
#undef KEYLEN

SLIST_HEAD(flist, flentry);
/* Make sure we can use pcpu_zone_ptr for struct flist. */
CTASSERT(sizeof(struct flist) == sizeof(void *));

struct flowtable {
	counter_u64_t	*ft_stat;
	int		ft_size;
	/*
	 * ft_table is a malloc(9)ed array of pointers.  Pointers point to
	 * memory from UMA_ZONE_PCPU zone.
	 * ft_masks is per-cpu pointer itself.  Each instance points
	 * to a malloc(9)ed bitset, that is private to corresponding CPU.
	 */
	struct flist	**ft_table;
	bitstr_t	**ft_masks;
	bitstr_t	*ft_tmpmask;	/* scratch bitset shared by cleaner/DDB */
};

/* Bump a per-cpu counter in the flowtable_stat block by field name. */
#define	FLOWSTAT_ADD(ft, name, v)					\
	counter_u64_add((ft)->ft_stat[offsetof(struct flowtable_stat, name) / sizeof(uint64_t)], (v))
#define	FLOWSTAT_INC(ft, name)	FLOWSTAT_ADD(ft, name, 1)

static struct proc *flowcleanerproc;
static uint32_t flow_hashjitter;	/* random seed mixed into every hash */

static struct cv	flowclean_f_cv;
static struct cv	flowclean_c_cv;
static struct mtx	flowclean_lock;
static uint32_t		flowclean_cycles;

/*
 * TODO:
 * - add sysctls to resize && flush flow tables
 * - Add per flowtable sysctls for statistics and configuring timeouts
 * - add saturation counter to rtentry to support per-packet load-balancing
 *   add flag to indicate round-robin flow, add list lookup from head
 *   for flows
 * - add sysctl / device node / syscall to support exporting and importing
 *   of flows with flag to indicate that a flow was imported so should
 *   not be considered for auto-cleaning
 * - support explicit connection state (currently only ad-hoc for DSR)
 * - idetach() cleanup for options VIMAGE builds.
 */
#ifdef INET
static VNET_DEFINE(struct flowtable, ip4_ft);
#define	V_ip4_ft	VNET(ip4_ft)
#endif
#ifdef INET6
static VNET_DEFINE(struct flowtable, ip6_ft);
#define	V_ip6_ft	VNET(ip6_ft)
#endif

static uma_zone_t	flow_zone;

static VNET_DEFINE(int, flowtable_enable) = 1;
#define	V_flowtable_enable	VNET(flowtable_enable)

static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL,
    "flowtable");
SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_RW,
    &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
SYSCTL_UMA_MAX(_net_flowtable, OID_AUTO, maxflows, CTLFLAG_RW,
    &flow_zone, "Maximum number of flows allowed");

static MALLOC_DEFINE(M_FTABLE, "flowtable", "flowtable hashes and bitstrings");

static struct flentry *
flowtable_lookup_common(struct flowtable *, uint32_t *, int, uint32_t);

#ifdef INET
/*
 * Find (or create) the flow entry for an outbound IPv4 packet and fill in
 * ro->ro_dst on success.  Returns NULL for traffic that is never cached:
 * loopback addresses and packets whose source equals their destination.
 */
static struct flentry *
flowtable_lookup_ipv4(struct mbuf *m, struct route *ro)
{
	struct flentry *fle;
	struct sockaddr_in *sin;
	struct ip *ip;
	uint32_t fibnum;
#ifdef FLOWTABLE_HASH_ALL
	uint32_t key[3];
	int iphlen;
	uint16_t sport, dport;
	uint8_t proto;
#endif

	ip = mtod(m, struct ip *);

	if (ip->ip_src.s_addr == ip->ip_dst.s_addr ||
	    (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
		return (NULL);

	fibnum = M_GETFIB(m);

#ifdef FLOWTABLE_HASH_ALL
	iphlen = ip->ip_hl << 2;
	proto = ip->ip_p;

	switch (proto) {
	case IPPROTO_TCP: {
		struct tcphdr *th;

		th = (struct tcphdr *)((char *)ip + iphlen);
		sport = th->th_sport;
		dport = th->th_dport;
		/* A closing segment marks the flow stale for the cleaner. */
		if (th->th_flags & (TH_RST|TH_FIN))
			fibnum |= (FL_STALE << 24);
		break;
	}
	case IPPROTO_UDP: {
		struct udphdr *uh;

		uh = (struct udphdr *)((char *)ip + iphlen);
		sport = uh->uh_sport;
		dport = uh->uh_dport;
		break;
	}
	case IPPROTO_SCTP: {
		struct sctphdr *sh;

		sh = (struct sctphdr *)((char *)ip + iphlen);
		sport = sh->src_port;
		dport = sh->dest_port;
		/* XXXGL: handle stale? */
		break;
	}
	default:
		sport = dport = 0;
		break;
	}

	/*
	 * Ports stay in network byte order inside the key; the protocol
	 * rides in bits 16-23 of the fibnum word.
	 */
	key[0] = ip->ip_dst.s_addr;
	key[1] = ip->ip_src.s_addr;
	key[2] = (dport << 16) | sport;
	fibnum |= proto << 16;

	fle = flowtable_lookup_common(&V_ip4_ft, key, 3 * sizeof(uint32_t),
	    fibnum);

#else	/* !FLOWTABLE_HASH_ALL */

	/* Destination-only key. */
	fle = flowtable_lookup_common(&V_ip4_ft, (uint32_t *)&ip->ip_dst,
	    sizeof(struct in_addr), fibnum);

#endif	/* FLOWTABLE_HASH_ALL */

	if (fle == NULL)
		return (NULL);

	sin = (struct sockaddr_in *)&ro->ro_dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr = ip->ip_dst;

	return (fle);
}
#endif /* INET */

#ifdef INET6
/*
 * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
 * then it sets p to point at the offset "len" in the mbuf. WARNING: the
 * pointer might become stale after other pullups (but we never use it
 * this way).
278 */ 279#define PULLUP_TO(_len, p, T) \ 280do { \ 281 int x = (_len) + sizeof(T); \ 282 if ((m)->m_len < x) \ 283 return (NULL); \ 284 p = (mtod(m, char *) + (_len)); \ 285} while (0) 286 287#define TCP(p) ((struct tcphdr *)(p)) 288#define SCTP(p) ((struct sctphdr *)(p)) 289#define UDP(p) ((struct udphdr *)(p)) 290 291static struct flentry * 292flowtable_lookup_ipv6(struct mbuf *m, struct route *ro) 293{ 294 struct flentry *fle; 295 struct sockaddr_in6 *sin6; 296 struct ip6_hdr *ip6; 297 uint32_t fibnum; 298#ifdef FLOWTABLE_HASH_ALL 299 uint32_t key[9]; 300 void *ulp; 301 int hlen; 302 uint16_t sport, dport; 303 u_short offset; 304 uint8_t proto; 305#else 306 uint32_t key[4]; 307#endif 308 309 ip6 = mtod(m, struct ip6_hdr *); 310 if (in6_localaddr(&ip6->ip6_dst)) 311 return (NULL); 312 313 fibnum = M_GETFIB(m); 314 315#ifdef FLOWTABLE_HASH_ALL 316 hlen = sizeof(struct ip6_hdr); 317 proto = ip6->ip6_nxt; 318 offset = sport = dport = 0; 319 ulp = NULL; 320 while (ulp == NULL) { 321 switch (proto) { 322 case IPPROTO_ICMPV6: 323 case IPPROTO_OSPFIGP: 324 case IPPROTO_PIM: 325 case IPPROTO_CARP: 326 case IPPROTO_ESP: 327 case IPPROTO_NONE: 328 ulp = ip6; 329 break; 330 case IPPROTO_TCP: 331 PULLUP_TO(hlen, ulp, struct tcphdr); 332 dport = TCP(ulp)->th_dport; 333 sport = TCP(ulp)->th_sport; 334 if (TCP(ulp)->th_flags & (TH_RST|TH_FIN)) 335 fibnum |= (FL_STALE << 24); 336 break; 337 case IPPROTO_SCTP: 338 PULLUP_TO(hlen, ulp, struct sctphdr); 339 dport = SCTP(ulp)->src_port; 340 sport = SCTP(ulp)->dest_port; 341 /* XXXGL: handle stale? 
*/ 342 break; 343 case IPPROTO_UDP: 344 PULLUP_TO(hlen, ulp, struct udphdr); 345 dport = UDP(ulp)->uh_dport; 346 sport = UDP(ulp)->uh_sport; 347 break; 348 case IPPROTO_HOPOPTS: /* RFC 2460 */ 349 PULLUP_TO(hlen, ulp, struct ip6_hbh); 350 hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; 351 proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; 352 ulp = NULL; 353 break; 354 case IPPROTO_ROUTING: /* RFC 2460 */ 355 PULLUP_TO(hlen, ulp, struct ip6_rthdr); 356 hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; 357 proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; 358 ulp = NULL; 359 break; 360 case IPPROTO_FRAGMENT: /* RFC 2460 */ 361 PULLUP_TO(hlen, ulp, struct ip6_frag); 362 hlen += sizeof (struct ip6_frag); 363 proto = ((struct ip6_frag *)ulp)->ip6f_nxt; 364 offset = ((struct ip6_frag *)ulp)->ip6f_offlg & 365 IP6F_OFF_MASK; 366 ulp = NULL; 367 break; 368 case IPPROTO_DSTOPTS: /* RFC 2460 */ 369 PULLUP_TO(hlen, ulp, struct ip6_hbh); 370 hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; 371 proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; 372 ulp = NULL; 373 break; 374 case IPPROTO_AH: /* RFC 2402 */ 375 PULLUP_TO(hlen, ulp, struct ip6_ext); 376 hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; 377 proto = ((struct ip6_ext *)ulp)->ip6e_nxt; 378 ulp = NULL; 379 break; 380 default: 381 PULLUP_TO(hlen, ulp, struct ip6_ext); 382 break; 383 } 384 } 385 386 bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr)); 387 bcopy(&ip6->ip6_src, &key[4], sizeof(struct in6_addr)); 388 key[8] = (dport << 16) | sport; 389 fibnum |= proto << 16; 390 391 fle = flowtable_lookup_common(&V_ip6_ft, key, 9 * sizeof(uint32_t), 392 fibnum); 393#else /* !FLOWTABLE_HASH_ALL */ 394 bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr)); 395 fle = flowtable_lookup_common(&V_ip6_ft, key, sizeof(struct in6_addr), 396 fibnum); 397#endif /* FLOWTABLE_HASH_ALL */ 398 399 if (fle == NULL) 400 return (NULL); 401 402 sin6 = (struct sockaddr_in6 *)&ro->ro_dst; 403 sin6->sin6_family = AF_INET6; 404 
sin6->sin6_len = sizeof(*sin6); 405 bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(struct in6_addr)); 406 407 return (fle); 408} 409#endif /* INET6 */ 410 411static bitstr_t * 412flowtable_mask(struct flowtable *ft) 413{ 414 415 /* 416 * flowtable_free_stale() calls w/o critical section, but 417 * with sched_bind(). Since pointer is stable throughout 418 * ft lifetime, it is safe, otherwise... 419 * 420 * CRITICAL_ASSERT(curthread); 421 */ 422 423 return (*(bitstr_t **)zpcpu_get(ft->ft_masks)); 424} 425 426static struct flist * 427flowtable_list(struct flowtable *ft, uint32_t hash) 428{ 429 430 CRITICAL_ASSERT(curthread); 431 return (zpcpu_get(ft->ft_table[hash % ft->ft_size])); 432} 433 434static int 435flow_stale(struct flowtable *ft, struct flentry *fle, int maxidle) 436{ 437 438 if (((fle->f_rt->rt_flags & RTF_HOST) && 439 ((fle->f_rt->rt_flags & (RTF_UP)) != (RTF_UP))) || 440 (fle->f_rt->rt_ifp == NULL) || 441 !RT_LINK_IS_UP(fle->f_rt->rt_ifp) || 442 (fle->f_lle->la_flags & LLE_VALID) == 0) 443 return (1); 444 445 if (time_uptime - fle->f_uptime > maxidle) 446 return (1); 447 448#ifdef FLOWTABLE_HASH_ALL 449 if (fle->f_flags & FL_STALE) 450 return (1); 451#endif 452 453 return (0); 454} 455 456static int 457flow_full(void) 458{ 459 int count, max; 460 461 count = uma_zone_get_cur(flow_zone); 462 max = uma_zone_get_max(flow_zone); 463 464 return (count > (max - (max >> 3))); 465} 466 467static int 468flow_matches(struct flentry *fle, uint32_t *key, int keylen, uint32_t fibnum) 469{ 470#ifdef FLOWTABLE_HASH_ALL 471 uint8_t proto; 472 473 proto = (fibnum >> 16) & 0xff; 474 fibnum &= 0xffff; 475#endif 476 477 CRITICAL_ASSERT(curthread); 478 479 /* Microoptimization for IPv4: don't use bcmp(). 
*/ 480 if (((keylen == sizeof(uint32_t) && (fle->f_key[0] != key[0])) || 481 (bcmp(fle->f_key, key, keylen) == 0)) && 482 fibnum == fle->f_fibnum && 483#ifdef FLOWTABLE_HASH_ALL 484 proto == fle->f_proto && 485#endif 486 (fle->f_rt->rt_flags & RTF_UP) && 487 fle->f_rt->rt_ifp != NULL && 488 (fle->f_lle->la_flags & LLE_VALID)) 489 return (1); 490 491 return (0); 492} 493 494static struct flentry * 495flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key, 496 int keylen, uint32_t fibnum0) 497{ 498#ifdef INET6 499 struct route_in6 sro6; 500#endif 501#ifdef INET 502 struct route sro; 503#endif 504 struct route *ro = NULL; 505 struct rtentry *rt; 506 struct lltable *lt = NULL; 507 struct llentry *lle; 508 struct sockaddr_storage *l3addr; 509 struct ifnet *ifp; 510 struct flist *flist; 511 struct flentry *fle, *iter; 512 bitstr_t *mask; 513 uint16_t fibnum = fibnum0; 514#ifdef FLOWTABLE_HASH_ALL 515 uint8_t proto; 516 517 proto = (fibnum0 >> 16) & 0xff; 518 fibnum = fibnum0 & 0xffff; 519#endif 520 521 /* 522 * This bit of code ends up locking the 523 * same route 3 times (just like ip_output + ether_output) 524 * - at lookup 525 * - in rt_check when called by arpresolve 526 * - dropping the refcount for the rtentry 527 * 528 * This could be consolidated to one if we wrote a variant 529 * of arpresolve with an rt_check variant that expected to 530 * receive the route locked 531 */ 532#ifdef INET 533 if (ft == &V_ip4_ft) { 534 struct sockaddr_in *sin; 535 536 ro = &sro; 537 bzero(&sro.ro_dst, sizeof(sro.ro_dst)); 538 539 sin = (struct sockaddr_in *)&sro.ro_dst; 540 sin->sin_family = AF_INET; 541 sin->sin_len = sizeof(*sin); 542 sin->sin_addr.s_addr = key[0]; 543 } 544#endif 545#ifdef INET6 546 if (ft == &V_ip6_ft) { 547 struct sockaddr_in6 *sin6; 548 549 ro = (struct route *)&sro6; 550 sin6 = &sro6.ro_dst; 551 552 bzero(sin6, sizeof(*sin6)); 553 sin6->sin6_family = AF_INET6; 554 sin6->sin6_len = sizeof(*sin6); 555 bcopy(key, &sin6->sin6_addr, sizeof(struct 
in6_addr)); 556 } 557#endif 558 559 ro->ro_rt = NULL; 560#ifdef RADIX_MPATH 561 rtalloc_mpath_fib(ro, hash, fibnum); 562#else 563 rtalloc_ign_fib(ro, 0, fibnum); 564#endif 565 if (ro->ro_rt == NULL) 566 return (NULL); 567 568 rt = ro->ro_rt; 569 ifp = rt->rt_ifp; 570 571 if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) { 572 RTFREE(rt); 573 return (NULL); 574 } 575 576#ifdef INET 577 if (ft == &V_ip4_ft) 578 lt = LLTABLE(ifp); 579#endif 580#ifdef INET6 581 if (ft == &V_ip6_ft) 582 lt = LLTABLE6(ifp); 583#endif 584 585 if (rt->rt_flags & RTF_GATEWAY) 586 l3addr = (struct sockaddr_storage *)rt->rt_gateway; 587 else 588 l3addr = (struct sockaddr_storage *)&ro->ro_dst; 589 lle = llentry_alloc(ifp, lt, l3addr); 590 591 if (lle == NULL) { 592 RTFREE(rt); 593 return (NULL); 594 } 595 596 /* Don't insert the entry if the ARP hasn't yet finished resolving. */ 597 if ((lle->la_flags & LLE_VALID) == 0) { 598 RTFREE(rt); 599 LLE_FREE(lle); 600 FLOWSTAT_INC(ft, ft_fail_lle_invalid); 601 return (NULL); 602 } 603 604 fle = uma_zalloc(flow_zone, M_NOWAIT | M_ZERO); 605 if (fle == NULL) { 606 RTFREE(rt); 607 LLE_FREE(lle); 608 return (NULL); 609 } 610 611 fle->f_hash = hash; 612 bcopy(key, &fle->f_key, keylen); 613 fle->f_rt = rt; 614 fle->f_lle = lle; 615 fle->f_fibnum = fibnum; 616 fle->f_uptime = time_uptime; 617#ifdef FLOWTABLE_HASH_ALL 618 fle->f_proto = proto; 619 fle->f_flags = fibnum0 >> 24; 620#endif 621 622 critical_enter(); 623 mask = flowtable_mask(ft); 624 flist = flowtable_list(ft, hash); 625 626 if (SLIST_EMPTY(flist)) { 627 bit_set(mask, (hash % ft->ft_size)); 628 SLIST_INSERT_HEAD(flist, fle, f_next); 629 goto skip; 630 } 631 632 /* 633 * find end of list and make sure that we were not 634 * preempted by another thread handling this flow 635 */ 636 SLIST_FOREACH(iter, flist, f_next) { 637 KASSERT(iter->f_hash % ft->ft_size == hash % ft->ft_size, 638 ("%s: wrong hash", __func__)); 639 if (flow_matches(iter, key, keylen, fibnum)) { 640 /* 641 * We probably 
migrated to an other CPU after 642 * lookup in flowtable_lookup_common() failed. 643 * It appeared that this CPU already has flow 644 * entry. 645 */ 646 iter->f_uptime = time_uptime; 647#ifdef FLOWTABLE_HASH_ALL 648 iter->f_flags |= fibnum >> 24; 649#endif 650 critical_exit(); 651 FLOWSTAT_INC(ft, ft_collisions); 652 uma_zfree(flow_zone, fle); 653 return (iter); 654 } 655 } 656 657 SLIST_INSERT_HEAD(flist, fle, f_next); 658skip: 659 critical_exit(); 660 FLOWSTAT_INC(ft, ft_inserts); 661 662 return (fle); 663} 664 665int 666flowtable_lookup(sa_family_t sa, struct mbuf *m, struct route *ro) 667{ 668 struct flentry *fle; 669 670 if (V_flowtable_enable == 0) 671 return (ENXIO); 672 673 switch (sa) { 674#ifdef INET 675 case AF_INET: 676 fle = flowtable_lookup_ipv4(m, ro); 677 break; 678#endif 679#ifdef INET6 680 case AF_INET6: 681 fle = flowtable_lookup_ipv6(m, ro); 682 break; 683#endif 684 default: 685 panic("%s: sa %d", __func__, sa); 686 } 687 688 if (fle == NULL) 689 return (EHOSTUNREACH); 690 691 if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE) { 692 M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); 693 m->m_pkthdr.flowid = fle->f_hash; 694 } 695 696 ro->ro_rt = fle->f_rt; 697 ro->ro_lle = fle->f_lle; 698 ro->ro_flags |= RT_NORTREF; 699 700 return (0); 701} 702 703static struct flentry * 704flowtable_lookup_common(struct flowtable *ft, uint32_t *key, int keylen, 705 uint32_t fibnum) 706{ 707 struct flist *flist; 708 struct flentry *fle; 709 uint32_t hash; 710 711 FLOWSTAT_INC(ft, ft_lookups); 712 713 hash = jenkins_hash32(key, keylen / sizeof(uint32_t), flow_hashjitter); 714 715 critical_enter(); 716 flist = flowtable_list(ft, hash); 717 SLIST_FOREACH(fle, flist, f_next) { 718 KASSERT(fle->f_hash % ft->ft_size == hash % ft->ft_size, 719 ("%s: wrong hash", __func__)); 720 if (flow_matches(fle, key, keylen, fibnum)) { 721 fle->f_uptime = time_uptime; 722#ifdef FLOWTABLE_HASH_ALL 723 fle->f_flags |= fibnum >> 24; 724#endif 725 critical_exit(); 726 FLOWSTAT_INC(ft, ft_hits); 727 return 
(fle); 728 } 729 } 730 critical_exit(); 731 732 FLOWSTAT_INC(ft, ft_misses); 733 734 return (flowtable_insert(ft, hash, key, keylen, fibnum)); 735} 736 737/* 738 * used by the bit_alloc macro 739 */ 740#define calloc(count, size) malloc((count)*(size), M_FTABLE, M_WAITOK | M_ZERO) 741static void 742flowtable_alloc(struct flowtable *ft) 743{ 744 745 ft->ft_table = malloc(ft->ft_size * sizeof(struct flist), 746 M_FTABLE, M_WAITOK); 747 for (int i = 0; i < ft->ft_size; i++) 748 ft->ft_table[i] = uma_zalloc(pcpu_zone_ptr, M_WAITOK | M_ZERO); 749 750 ft->ft_masks = uma_zalloc(pcpu_zone_ptr, M_WAITOK); 751 for (int i = 0; i < mp_ncpus; i++) { 752 bitstr_t **b; 753 754 b = zpcpu_get_cpu(ft->ft_masks, i); 755 *b = bit_alloc(ft->ft_size); 756 } 757 ft->ft_tmpmask = bit_alloc(ft->ft_size); 758} 759#undef calloc 760 761static void 762flowtable_free_stale(struct flowtable *ft, struct rtentry *rt, int maxidle) 763{ 764 struct flist *flist, freelist; 765 struct flentry *fle, *fle1, *fleprev; 766 bitstr_t *mask, *tmpmask; 767 int curbit, tmpsize; 768 769 SLIST_INIT(&freelist); 770 mask = flowtable_mask(ft); 771 tmpmask = ft->ft_tmpmask; 772 tmpsize = ft->ft_size; 773 memcpy(tmpmask, mask, ft->ft_size/8); 774 curbit = 0; 775 fleprev = NULL; /* pacify gcc */ 776 /* 777 * XXX Note to self, bit_ffs operates at the byte level 778 * and thus adds gratuitous overhead 779 */ 780 bit_ffs(tmpmask, ft->ft_size, &curbit); 781 while (curbit != -1) { 782 if (curbit >= ft->ft_size || curbit < -1) { 783 log(LOG_ALERT, 784 "warning: bad curbit value %d \n", 785 curbit); 786 break; 787 } 788 789 FLOWSTAT_INC(ft, ft_free_checks); 790 791 critical_enter(); 792 flist = flowtable_list(ft, curbit); 793#ifdef DIAGNOSTIC 794 if (SLIST_EMPTY(flist) && curbit > 0) { 795 log(LOG_ALERT, 796 "warning bit=%d set, but no fle found\n", 797 curbit); 798 } 799#endif 800 SLIST_FOREACH_SAFE(fle, flist, f_next, fle1) { 801 if (rt != NULL && fle->f_rt != rt) { 802 fleprev = fle; 803 continue; 804 } 805 if 
(!flow_stale(ft, fle, maxidle)) { 806 fleprev = fle; 807 continue; 808 } 809 810 if (fle == SLIST_FIRST(flist)) 811 SLIST_REMOVE_HEAD(flist, f_next); 812 else 813 SLIST_REMOVE_AFTER(fleprev, f_next); 814 SLIST_INSERT_HEAD(&freelist, fle, f_next); 815 } 816 if (SLIST_EMPTY(flist)) 817 bit_clear(mask, curbit); 818 critical_exit(); 819 820 bit_clear(tmpmask, curbit); 821 tmpmask += (curbit / 8); 822 tmpsize -= (curbit / 8) * 8; 823 bit_ffs(tmpmask, tmpsize, &curbit); 824 } 825 826 SLIST_FOREACH_SAFE(fle, &freelist, f_next, fle1) { 827 FLOWSTAT_INC(ft, ft_frees); 828 if (fle->f_rt != NULL) 829 RTFREE(fle->f_rt); 830 if (fle->f_lle != NULL) 831 LLE_FREE(fle->f_lle); 832 uma_zfree(flow_zone, fle); 833 } 834} 835 836static void 837flowtable_clean_vnet(struct flowtable *ft, struct rtentry *rt, int maxidle) 838{ 839 int i; 840 841 CPU_FOREACH(i) { 842 if (smp_started == 1) { 843 thread_lock(curthread); 844 sched_bind(curthread, i); 845 thread_unlock(curthread); 846 } 847 848 flowtable_free_stale(ft, rt, maxidle); 849 850 if (smp_started == 1) { 851 thread_lock(curthread); 852 sched_unbind(curthread); 853 thread_unlock(curthread); 854 } 855 } 856} 857 858void 859flowtable_route_flush(sa_family_t sa, struct rtentry *rt) 860{ 861 struct flowtable *ft; 862 863 switch (sa) { 864#ifdef INET 865 case AF_INET: 866 ft = &V_ip4_ft; 867 break; 868#endif 869#ifdef INET6 870 case AF_INET6: 871 ft = &V_ip6_ft; 872 break; 873#endif 874 default: 875 panic("%s: sa %d", __func__, sa); 876 } 877 878 flowtable_clean_vnet(ft, rt, 0); 879} 880 881static void 882flowtable_cleaner(void) 883{ 884 VNET_ITERATOR_DECL(vnet_iter); 885 struct thread *td; 886 887 if (bootverbose) 888 log(LOG_INFO, "flowtable cleaner started\n"); 889 td = curthread; 890 while (1) { 891 uint32_t flowclean_freq, maxidle; 892 893 /* 894 * The maximum idle time, as well as frequency are arbitrary. 
		 */
		if (flow_full())
			maxidle = 5;
		else
			maxidle = 30;

		VNET_LIST_RLOCK();
		VNET_FOREACH(vnet_iter) {
			CURVNET_SET(vnet_iter);
#ifdef INET
			flowtable_clean_vnet(&V_ip4_ft, NULL, maxidle);
#endif
#ifdef INET6
			flowtable_clean_vnet(&V_ip6_ft, NULL, maxidle);
#endif
			CURVNET_RESTORE();
		}
		VNET_LIST_RUNLOCK();

		/* Sweep more often when the zone nears its flow limit. */
		if (flow_full())
			flowclean_freq = 4*hz;
		else
			flowclean_freq = 20*hz;
		mtx_lock(&flowclean_lock);
		thread_lock(td);
		sched_prio(td, PPAUSE);
		thread_unlock(td);
		flowclean_cycles++;
		/* Wake any flowtable_flush() waiters, then sleep. */
		cv_broadcast(&flowclean_f_cv);
		cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq);
		mtx_unlock(&flowclean_lock);
	}
}

/*
 * Kick the cleaner and block until it completes one full cycle
 * (registered as the ifnet departure eventhandler).
 */
static void
flowtable_flush(void *unused __unused)
{
	uint64_t start;

	mtx_lock(&flowclean_lock);
	start = flowclean_cycles;
	while (start == flowclean_cycles) {
		cv_broadcast(&flowclean_c_cv);
		cv_wait(&flowclean_f_cv, &flowclean_lock);
	}
	mtx_unlock(&flowclean_lock);
}

static struct kproc_desc flow_kp = {
	"flowcleaner",
	flowtable_cleaner,
	&flowcleanerproc
};
SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);

/*
 * Resolve the table size from the named tunable: clamp to at least 256
 * and require a power of 2 (fall back to 2048 otherwise).  Without a
 * tunable, auto-scale from maxusers and round up to a power of 2.
 */
static int
flowtable_get_size(char *name)
{
	int size;

	if (TUNABLE_INT_FETCH(name, &size)) {
		if (size < 256)
			size = 256;
		if (!powerof2(size)) {
			printf("%s must be power of 2\n", name);
			size = 2048;
		}
	} else {
		/*
		 * round up to the next power of 2
		 */
		size = 1 << fls((1024 + maxusers * 64) - 1);
	}

	return (size);
}

/* One-time global initialization: zone, seed, cleaner synchronization. */
static void
flowtable_init(const void *unused __unused)
{

	flow_hashjitter = arc4random();

	flow_zone = uma_zcreate("flows", sizeof(struct flentry),
	    NULL, NULL, NULL, NULL, (64-1), UMA_ZONE_MAXBUCKET);
	uma_zone_set_max(flow_zone, 1024 + maxusers * 64 * mp_ncpus);

	cv_init(&flowclean_c_cv, "c_flowcleanwait");
	cv_init(&flowclean_f_cv, "f_flowcleanwait");
	mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
	/* Flush all cached flows whenever an interface goes away. */
	EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
	    EVENTHANDLER_PRI_ANY);
}
SYSINIT(flowtable_init, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
    flowtable_init, NULL);

#ifdef INET
static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip4, CTLFLAG_RD, NULL,
    "Flowtable for IPv4");

static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip4_ftstat);
VNET_PCPUSTAT_SYSINIT(ip4_ftstat);
VNET_PCPUSTAT_SYSUNINIT(ip4_ftstat);
SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip4, OID_AUTO, stat, struct flowtable_stat,
    ip4_ftstat, "Flowtable statistics for IPv4 "
    "(struct flowtable_stat, net/flowtable.h)");

/* Per-vnet IPv4 table setup. */
static void
flowtable_init_vnet_v4(const void *unused __unused)
{

	V_ip4_ft.ft_size = flowtable_get_size("net.flowtable.ip4.size");
	V_ip4_ft.ft_stat = VNET(ip4_ftstat);
	flowtable_alloc(&V_ip4_ft);
}
VNET_SYSINIT(ft_vnet_v4, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
    flowtable_init_vnet_v4, NULL);
#endif /* INET */

#ifdef INET6
static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip6, CTLFLAG_RD, NULL,
    "Flowtable for IPv6");

static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip6_ftstat);
VNET_PCPUSTAT_SYSINIT(ip6_ftstat);
VNET_PCPUSTAT_SYSUNINIT(ip6_ftstat);
SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip6, OID_AUTO, stat, struct flowtable_stat,
    ip6_ftstat, "Flowtable statistics for IPv6 "
    "(struct flowtable_stat, net/flowtable.h)");

/* Per-vnet IPv6 table setup. */
static void
flowtable_init_vnet_v6(const void *unused __unused)
{

	V_ip6_ft.ft_size = flowtable_get_size("net.flowtable.ip6.size");
	V_ip6_ft.ft_stat = VNET(ip6_ftstat);
	flowtable_alloc(&V_ip6_ft);
}
VNET_SYSINIT(flowtable_init_vnet_v6, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
    flowtable_init_vnet_v6, NULL);
#endif /* INET6 */

#ifdef DDB
static
bitstr_t * 1039flowtable_mask_pcpu(struct flowtable *ft, int cpuid) 1040{ 1041 1042 return (zpcpu_get_cpu(*ft->ft_masks, cpuid)); 1043} 1044 1045static struct flist * 1046flowtable_list_pcpu(struct flowtable *ft, uint32_t hash, int cpuid) 1047{ 1048 1049 return (zpcpu_get_cpu(&ft->ft_table[hash % ft->ft_size], cpuid)); 1050} 1051 1052static void 1053flow_show(struct flowtable *ft, struct flentry *fle) 1054{ 1055 int idle_time; 1056 int rt_valid, ifp_valid; 1057 volatile struct rtentry *rt; 1058 struct ifnet *ifp = NULL; 1059 uint32_t *hashkey = fle->f_key; 1060 1061 idle_time = (int)(time_uptime - fle->f_uptime); 1062 rt = fle->f_rt; 1063 rt_valid = rt != NULL; 1064 if (rt_valid) 1065 ifp = rt->rt_ifp; 1066 ifp_valid = ifp != NULL; 1067 1068#ifdef INET 1069 if (ft == &V_ip4_ft) { 1070 char daddr[4*sizeof "123"]; 1071#ifdef FLOWTABLE_HASH_ALL 1072 char saddr[4*sizeof "123"]; 1073 uint16_t sport, dport; 1074#endif 1075 1076 inet_ntoa_r(*(struct in_addr *) &hashkey[0], daddr); 1077#ifdef FLOWTABLE_HASH_ALL 1078 inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr); 1079 dport = ntohs((uint16_t)(hashkey[2] >> 16)); 1080 sport = ntohs((uint16_t)(hashkey[2] & 0xffff)); 1081 db_printf("%s:%d->%s:%d", saddr, sport, daddr, dport); 1082#else 1083 db_printf("%s ", daddr); 1084#endif 1085 } 1086#endif /* INET */ 1087#ifdef INET6 1088 if (ft == &V_ip6_ft) { 1089#ifdef FLOWTABLE_HASH_ALL 1090 db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x", 1091 hashkey[0], hashkey[1], hashkey[2], 1092 hashkey[3], hashkey[4], hashkey[5], 1093 hashkey[6], hashkey[7], hashkey[8]); 1094#else 1095 db_printf("\n\tkey=%08x:%08x:%08x ", 1096 hashkey[0], hashkey[1], hashkey[2]); 1097#endif 1098 } 1099#endif /* INET6 */ 1100 1101 db_printf("hash=%08x idle_time=%03d" 1102 "\n\tfibnum=%02d rt=%p", 1103 fle->f_hash, idle_time, fle->f_fibnum, fle->f_rt); 1104 1105#ifdef FLOWTABLE_HASH_ALL 1106 if (fle->f_flags & FL_STALE) 1107 db_printf(" FL_STALE "); 1108#endif 1109 if (rt_valid) { 1110 if 
(rt->rt_flags & RTF_UP) 1111 db_printf(" RTF_UP "); 1112 } 1113 if (ifp_valid) { 1114 if (ifp->if_flags & IFF_LOOPBACK) 1115 db_printf(" IFF_LOOPBACK "); 1116 if (ifp->if_flags & IFF_UP) 1117 db_printf(" IFF_UP "); 1118 if (ifp->if_flags & IFF_POINTOPOINT) 1119 db_printf(" IFF_POINTOPOINT "); 1120 } 1121 db_printf("\n"); 1122} 1123 1124static void 1125flowtable_show(struct flowtable *ft, int cpuid) 1126{ 1127 int curbit = 0; 1128 bitstr_t *mask, *tmpmask; 1129 1130 if (cpuid != -1) 1131 db_printf("cpu: %d\n", cpuid); 1132 mask = flowtable_mask_pcpu(ft, cpuid); 1133 tmpmask = ft->ft_tmpmask; 1134 memcpy(tmpmask, mask, ft->ft_size/8); 1135 /* 1136 * XXX Note to self, bit_ffs operates at the byte level 1137 * and thus adds gratuitous overhead 1138 */ 1139 bit_ffs(tmpmask, ft->ft_size, &curbit); 1140 while (curbit != -1) { 1141 struct flist *flist; 1142 struct flentry *fle; 1143 1144 if (curbit >= ft->ft_size || curbit < -1) { 1145 db_printf("warning: bad curbit value %d \n", 1146 curbit); 1147 break; 1148 } 1149 1150 flist = flowtable_list_pcpu(ft, curbit, cpuid); 1151 1152 SLIST_FOREACH(fle, flist, f_next) 1153 flow_show(ft, fle); 1154 bit_clear(tmpmask, curbit); 1155 bit_ffs(tmpmask, ft->ft_size, &curbit); 1156 } 1157} 1158 1159static void 1160flowtable_show_vnet(struct flowtable *ft) 1161{ 1162 1163 int i; 1164 1165 CPU_FOREACH(i) 1166 flowtable_show(ft, i); 1167} 1168 1169DB_SHOW_COMMAND(flowtables, db_show_flowtables) 1170{ 1171 VNET_ITERATOR_DECL(vnet_iter); 1172 1173 VNET_FOREACH(vnet_iter) { 1174 CURVNET_SET(vnet_iter); 1175#ifdef VIMAGE 1176 db_printf("vnet %p\n", vnet_iter); 1177#endif 1178#ifdef INET 1179 printf("IPv4:\n"); 1180 flowtable_show_vnet(&V_ip4_ft); 1181#endif 1182#ifdef INET6 1183 printf("IPv6:\n"); 1184 flowtable_show_vnet(&V_ip6_ft); 1185#endif 1186 CURVNET_RESTORE(); 1187 } 1188} 1189#endif 1190