1124524Sume/* Copyright (c) 2017 Facebook 266776Skris * 355163Sshin * This program is free software; you can redistribute it and/or 455163Sshin * modify it under the terms of version 2 of the GNU General Public 5222732Shrs * License as published by the Free Software Foundation. 655163Sshin */ 762632Skris#include <stddef.h> 855163Sshin#include <stdbool.h> 955163Sshin#include <string.h> 1055163Sshin#include <linux/pkt_cls.h> 1155163Sshin#include <linux/bpf.h> 1255163Sshin#include <linux/in.h> 1355163Sshin#include <linux/if_ether.h> 1455163Sshin#include <linux/ip.h> 1555163Sshin#include <linux/ipv6.h> 1655163Sshin#include <linux/icmp.h> 1755163Sshin#include <linux/icmpv6.h> 1855163Sshin#include <linux/tcp.h> 1962632Skris#include <linux/udp.h> 2055163Sshin#include <bpf/bpf_helpers.h> 2155163Sshin#include "test_iptunnel_common.h" 2255163Sshin#include <bpf/bpf_endian.h> 2355163Sshin 2455163Sshinstatic inline __u32 rol32(__u32 word, unsigned int shift) 2555163Sshin{ 2655163Sshin return (word << shift) | (word >> ((-shift) & 31)); 2755163Sshin} 2855163Sshin 2955163Sshin/* copy paste of jhash from kernel sources to make sure llvm 3055163Sshin * can compile it into valid sequence of bpf instructions 3155163Sshin */ 3255163Sshin#define __jhash_mix(a, b, c) \ 3355163Sshin{ \ 3455163Sshin a -= c; a ^= rol32(c, 4); c += b; \ 3555163Sshin b -= a; b ^= rol32(a, 6); a += c; \ 3655163Sshin c -= b; c ^= rol32(b, 8); b += a; \ 3755163Sshin a -= c; a ^= rol32(c, 16); c += b; \ 3866776Skris b -= a; b ^= rol32(a, 19); a += c; \ 39118661Sume c -= b; c ^= rol32(b, 4); b += a; \ 40118661Sume} 4155163Sshin 4255163Sshin#define __jhash_final(a, b, c) \ 4355163Sshin{ \ 4455163Sshin c ^= b; c -= rol32(b, 14); \ 4555163Sshin a ^= c; a -= rol32(c, 11); \ 46222732Shrs b ^= a; b -= rol32(a, 25); \ 4755163Sshin c ^= b; c -= rol32(b, 16); \ 48222732Shrs a ^= c; a -= rol32(c, 4); \ 4955163Sshin b ^= a; b -= rol32(a, 14); \ 5055163Sshin c ^= b; c -= rol32(b, 24); \ 5155163Sshin} 5255163Sshin 5355163Sshin#define JHASH_INITVAL 0xdeadbeef 5455163Sshin 55222732Shrstypedef unsigned int u32; 5655163Sshin 57119026Sumestatic inline u32 jhash(const void *key, u32 length, u32 initval) 5855163Sshin{ 5955163Sshin u32 a, b, c; 60253970Shrs const unsigned char *k = key; 6155163Sshin 6255163Sshin a = b = c = JHASH_INITVAL + length + initval; 6355163Sshin 6455163Sshin while (length > 12) { 6555163Sshin a += *(u32 *)(k); 6655163Sshin b += *(u32 *)(k + 4); 6755163Sshin c += *(u32 *)(k + 8); 6855163Sshin __jhash_mix(a, b, c); 6955163Sshin length -= 12; 7055163Sshin k += 12; 7155163Sshin } 7255163Sshin switch (length) { 73119026Sume case 12: c += (u32)k[11]<<24; 7455163Sshin case 11: c += (u32)k[10]<<16; 7562632Skris case 10: c += (u32)k[9]<<8; 76225520Shrs case 9: c += k[8]; 77222732Shrs case 8: b += (u32)k[7]<<24; 78222732Shrs case 7: b += (u32)k[6]<<16; 7955163Sshin case 6: b += (u32)k[5]<<8; 80222732Shrs case 5: b += k[4]; 81204407Suqs case 4: a += (u32)k[3]<<24; 82204407Suqs case 3: a += (u32)k[2]<<16; 83222732Shrs case 2: a += (u32)k[1]<<8; 84204407Suqs case 1: a += k[0]; 8555163Sshin __jhash_final(a, b, c); 86225520Shrs case 0: /* Nothing left to add */ 87225520Shrs break; 88222732Shrs } 89173412Skevlo 90222861Shrs return c; 91225520Shrs} 92225520Shrs 93225520Shrsstatic inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) 94118661Sume{ 95222732Shrs a += initval; 96225520Shrs b += initval; 97225520Shrs c += initval; 98222732Shrs __jhash_final(a, b, c); 99222732Shrs return c; 100222732Shrs} 101222732Shrs 102222732Shrsstatic inline u32 jhash_2words(u32 a, u32 b, u32 initval) 103222732Shrs{ 104222732Shrs return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); 105222732Shrs} 106222732Shrs 107222732Shrs#define PCKT_FRAGMENTED 65343 108222732Shrs#define IPV4_HDR_LEN_NO_OPT 20 109222732Shrs#define IPV4_PLUS_ICMP_HDR 28 110222732Shrs#define IPV6_PLUS_ICMP_HDR 48 111222732Shrs#define RING_SIZE 2 112222732Shrs#define MAX_VIPS 12 113222732Shrs#define MAX_REALS 5 114222732Shrs#define CTL_MAP_SIZE 16 115222732Shrs#define CH_RINGS_SIZE (MAX_VIPS * RING_SIZE) 11655163Sshin#define F_IPV6 (1 << 0) 117124524Sume#define F_HASH_NO_SRC_PORT (1 << 0) 11855163Sshin#define F_ICMP (1 << 0) 119119026Sume#define F_SYN_SET (1 << 1) 120119026Sume 121119026Sumestruct packet_description { 12255163Sshin union { 12355163Sshin __be32 src; 12462632Skris __be32 srcv6[4]; 125118664Sume }; 12662632Skris union { 127118660Sume __be32 dst; 128118664Sume __be32 dstv6[4]; 129222732Shrs }; 13062632Skris union { 131118664Sume __u32 ports; 132118660Sume __u16 port16[2]; 133118664Sume }; 134222732Shrs __u8 proto; 13562632Skris __u8 flags; 13655163Sshin}; 137118660Sume 138222732Shrsstruct ctl_value { 13955163Sshin union { 14055163Sshin __u64 value; 14155163Sshin __u32 ifindex; 14255163Sshin __u8 mac[6]; 14362632Skris }; 144118664Sume}; 145118660Sume 146118664Sumestruct vip_meta { 14762632Skris __u32 flags; 14862632Skris __u32 vip_num; 14955163Sshin}; 150222732Shrs 15155163Sshinstruct real_definition { 15262632Skris union { 153118664Sume __be32 dst; 154118660Sume __be32 dstv6[4]; 155118664Sume }; 15662632Skris __u8 flags; 15762632Skris}; 15855163Sshin 15955163Sshinstruct vip_stats { 16055163Sshin __u64 bytes; 16155163Sshin __u64 pkts; 16255163Sshin}; 163118664Sume 164118660Sumestruct eth_hdr { 165118664Sume unsigned char eth_dest[ETH_ALEN]; 16655163Sshin unsigned char eth_source[ETH_ALEN]; 16755163Sshin unsigned short eth_proto; 16855163Sshin}; 16955163Sshin 17055163Sshinstruct { 17155163Sshin __uint(type, BPF_MAP_TYPE_HASH); 17255163Sshin __uint(max_entries, MAX_VIPS); 17355163Sshin __type(key, struct vip); 17455163Sshin __type(value, struct vip_meta); 17555163Sshin} vip_map SEC(".maps"); 17655163Sshin 17755163Sshinstruct { 17855163Sshin __uint(type, BPF_MAP_TYPE_ARRAY); 17955163Sshin __uint(max_entries, CH_RINGS_SIZE); 18055163Sshin __type(key, __u32); 18155163Sshin __type(value, __u32); 18262632Skris} ch_rings SEC(".maps"); 18355163Sshin 184222732Shrsstruct { 18555163Sshin __uint(type, BPF_MAP_TYPE_ARRAY); 18655163Sshin __uint(max_entries, MAX_REALS); 18755163Sshin __type(key, __u32); 188222732Shrs __type(value, struct real_definition); 18955163Sshin} reals SEC(".maps"); 190118664Sume 191118664Sumestruct { 192118664Sume __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 193204407Suqs __uint(max_entries, MAX_VIPS); 194119026Sume __type(key, __u32); 19555163Sshin __type(value, struct vip_stats); 196119026Sume} stats SEC(".maps"); 197222732Shrs 198119026Sumestruct { 199119026Sume __uint(type, BPF_MAP_TYPE_ARRAY); 200222732Shrs __uint(max_entries, CTL_MAP_SIZE); 201222732Shrs __type(key, __u32); 20255163Sshin __type(value, struct ctl_value); 20355163Sshin} ctl_array SEC(".maps"); 20455163Sshin 20555163Sshinstatic __always_inline __u32 get_packet_hash(struct packet_description *pckt, 20655163Sshin bool ipv6) 20755163Sshin{ 20855163Sshin if (ipv6) 20955163Sshin return jhash_2words(jhash(pckt->srcv6, 16, MAX_VIPS), 210222732Shrs pckt->ports, CH_RINGS_SIZE); 21155163Sshin else 21255163Sshin return jhash_2words(pckt->src, pckt->ports, CH_RINGS_SIZE); 213118664Sume} 214118664Sume 215118664Sumestatic __always_inline bool get_packet_dst(struct real_definition **real, 216118664Sume struct packet_description *pckt, 217118664Sume struct vip_meta *vip_info, 21855163Sshin bool is_ipv6) 219118664Sume{ 220118664Sume __u32 hash = get_packet_hash(pckt, is_ipv6) % RING_SIZE; 221222732Shrs __u32 key = RING_SIZE * vip_info->vip_num + hash; 22255163Sshin __u32 *real_pos; 223222732Shrs 22455163Sshin real_pos = bpf_map_lookup_elem(&ch_rings, &key); 22555163Sshin if (!real_pos) 22655163Sshin return false; 22755163Sshin key = *real_pos; 22855163Sshin *real = bpf_map_lookup_elem(&reals, &key); 229118660Sume if (!(*real)) 230222732Shrs return false; 23155163Sshin return true; 23255163Sshin} 23355163Sshin 234222732Shrsstatic __always_inline int parse_icmpv6(void *data, void *data_end, __u64 off, 23555163Sshin struct packet_description *pckt) 23655163Sshin{ 23755163Sshin struct icmp6hdr *icmp_hdr; 23855163Sshin struct ipv6hdr *ip6h; 23955163Sshin 240253376Skevlo icmp_hdr = data + off; 241222732Shrs if (icmp_hdr + 1 > data_end) 242222732Shrs return TC_ACT_SHOT; 24355163Sshin if (icmp_hdr->icmp6_type != ICMPV6_PKT_TOOBIG) 24455163Sshin return TC_ACT_OK; 245222732Shrs off += sizeof(struct icmp6hdr); 246119026Sume ip6h = data + off; 247118661Sume if (ip6h + 1 > data_end) 248119026Sume return TC_ACT_SHOT; 249222861Shrs pckt->proto = ip6h->nexthdr; 250222732Shrs pckt->flags |= F_ICMP; 251222732Shrs memcpy(pckt->srcv6, ip6h->daddr.s6_addr32, 16); 252222732Shrs memcpy(pckt->dstv6, ip6h->saddr.s6_addr32, 16); 253222732Shrs return TC_ACT_UNSPEC; 254222732Shrs} 255222732Shrs 256222732Shrsstatic __always_inline int parse_icmp(void *data, void *data_end, __u64 off, 257222732Shrs struct packet_description *pckt) 258222732Shrs{ 259253970Shrs struct icmphdr *icmp_hdr; 260253970Shrs struct iphdr *iph; 261222861Shrs 262222861Shrs icmp_hdr = data + off; 26355163Sshin if (icmp_hdr + 1 > data_end) 264119026Sume return TC_ACT_SHOT; 265119026Sume if (icmp_hdr->type != ICMP_DEST_UNREACH || 266119026Sume icmp_hdr->code != ICMP_FRAG_NEEDED) 267222732Shrs return TC_ACT_OK; 268118660Sume off += sizeof(struct icmphdr); 26955163Sshin iph = data + off; 27055163Sshin if (iph + 1 > data_end) 27155163Sshin return TC_ACT_SHOT; 27255163Sshin if (iph->ihl != 5) 273118664Sume return TC_ACT_SHOT; 274118664Sume pckt->proto = iph->protocol; 27555163Sshin pckt->flags |= F_ICMP; 27655163Sshin pckt->src = iph->daddr; 27755163Sshin pckt->dst = iph->saddr; 27855163Sshin return TC_ACT_UNSPEC; 27955163Sshin} 28055163Sshin 28155163Sshinstatic __always_inline bool parse_udp(void *data, __u64 off, void *data_end, 28255163Sshin struct packet_description *pckt) 28355163Sshin{ 28455163Sshin struct udphdr *udp; 28555163Sshin udp = data + off; 28655163Sshin 28755163Sshin if (udp + 1 > data_end) 288118664Sume return false; 289118664Sume 29055163Sshin if (!(pckt->flags & F_ICMP)) { 29155163Sshin pckt->port16[0] = udp->source; 29255163Sshin pckt->port16[1] = udp->dest; 293118664Sume } else { 294118664Sume pckt->port16[0] = udp->dest; 29555163Sshin pckt->port16[1] = udp->source; 29655163Sshin } 29755163Sshin return true; 298222732Shrs} 299118906Sume 300222732Shrsstatic __always_inline bool parse_tcp(void *data, __u64 off, void *data_end, 30155163Sshin struct packet_description *pckt) 30255163Sshin{ 30355163Sshin struct tcphdr *tcp; 30455163Sshin 30555163Sshin tcp = data + off; 30655163Sshin if (tcp + 1 > data_end) 307118906Sume return false; 308118906Sume 309118906Sume if (tcp->syn) 310118906Sume pckt->flags |= F_SYN_SET; 311118660Sume 312118664Sume if (!(pckt->flags & F_ICMP)) { 313118664Sume pckt->port16[0] = tcp->source; 314225520Shrs pckt->port16[1] = tcp->dest; 315118664Sume } else { 31655163Sshin pckt->port16[0] = tcp->dest; 31755163Sshin pckt->port16[1] = tcp->source; 31855163Sshin } 31955163Sshin return true; 320118906Sume} 321118664Sume 322118664Sumestatic __always_inline int process_packet(void *data, __u64 off, void *data_end, 323225520Shrs bool is_ipv6, struct __sk_buff *skb) 324118664Sume{ 32555163Sshin void *pkt_start = (void *)(long)skb->data; 32655163Sshin struct packet_description pckt = {}; 32755163Sshin struct eth_hdr *eth = pkt_start; 32855163Sshin struct bpf_tunnel_key tkey = {}; 329118906Sume struct vip_stats *data_stats; 330118664Sume struct real_definition *dst; 331118664Sume struct vip_meta *vip_info; 332118664Sume struct ctl_value *cval; 333225520Shrs __u32 v4_intf_pos = 1; 334118664Sume __u32 v6_intf_pos = 2; 33555163Sshin struct ipv6hdr *ip6h; 33655163Sshin struct vip vip = {}; 33755163Sshin struct iphdr *iph; 33855163Sshin int tun_flag = 0; 339118906Sume __u16 pkt_bytes; 340118664Sume __u64 iph_len; 341118664Sume __u32 ifindex; 342225520Shrs __u8 protocol; 343118664Sume __u32 vip_num; 34455163Sshin int action; 34555163Sshin 34655163Sshin tkey.tunnel_ttl = 64; 34755163Sshin if (is_ipv6) { 34855163Sshin ip6h = data + off; 34955163Sshin if (ip6h + 1 > data_end) 350118906Sume return TC_ACT_SHOT; 351118664Sume 352118664Sume iph_len = sizeof(struct ipv6hdr); 353225520Shrs protocol = ip6h->nexthdr; 354118664Sume pckt.proto = protocol; 35555163Sshin pkt_bytes = bpf_ntohs(ip6h->payload_len); 35655163Sshin off += iph_len; 35755163Sshin if (protocol == IPPROTO_FRAGMENT) { 358118660Sume return TC_ACT_SHOT; 359118664Sume } else if (protocol == IPPROTO_ICMPV6) { 360225520Shrs action = parse_icmpv6(data, data_end, off, &pckt); 361118664Sume if (action >= 0) 36255163Sshin return action; 363118661Sume off += IPV6_PLUS_ICMP_HDR; 364118661Sume } else { 365118661Sume memcpy(pckt.srcv6, ip6h->saddr.s6_addr32, 16); 366118661Sume memcpy(pckt.dstv6, ip6h->daddr.s6_addr32, 16); 367118661Sume } 368118661Sume } else { 369118661Sume iph = data + off; 370118661Sume if (iph + 1 > data_end) 371118661Sume return TC_ACT_SHOT; 372118661Sume if (iph->ihl != 5) 373118661Sume return TC_ACT_SHOT; 374118661Sume 375118661Sume protocol = iph->protocol; 376118661Sume pckt.proto = protocol; 377222732Shrs pkt_bytes = bpf_ntohs(iph->tot_len); 378118661Sume off += IPV4_HDR_LEN_NO_OPT; 379253970Shrs 380222861Shrs if (iph->frag_off & PCKT_FRAGMENTED) 381222861Shrs return TC_ACT_SHOT; 382222861Shrs if (protocol == IPPROTO_ICMP) { 383222861Shrs action = parse_icmp(data, data_end, off, &pckt); 384222861Shrs if (action >= 0) 385222861Shrs return action; 386225520Shrs off += IPV4_PLUS_ICMP_HDR; 387225520Shrs } else { 388222861Shrs pckt.src = iph->saddr; 389222861Shrs pckt.dst = iph->daddr; 390222861Shrs } 391222861Shrs } 392222732Shrs protocol = pckt.proto; 393222732Shrs 394222732Shrs if (protocol == IPPROTO_TCP) { 395222732Shrs if (!parse_tcp(data, off, data_end, &pckt)) 396222732Shrs return TC_ACT_SHOT; 397222732Shrs } else if (protocol == IPPROTO_UDP) { 398222732Shrs if (!parse_udp(data, off, data_end, &pckt)) 399222732Shrs return TC_ACT_SHOT; 400222732Shrs } else { 401222732Shrs return TC_ACT_SHOT; 402222732Shrs } 403222732Shrs 404222732Shrs if (is_ipv6) 405222732Shrs memcpy(vip.daddr.v6, pckt.dstv6, 16); 406222732Shrs else 407222732Shrs vip.daddr.v4 = pckt.dst; 408222732Shrs 409222732Shrs vip.dport = pckt.port16[1]; 410222732Shrs vip.protocol = pckt.proto; 411222732Shrs vip_info = bpf_map_lookup_elem(&vip_map, &vip); 412222732Shrs if (!vip_info) { 413222732Shrs vip.dport = 0; 414222732Shrs vip_info = bpf_map_lookup_elem(&vip_map, &vip); 415222732Shrs if (!vip_info) 416225520Shrs return TC_ACT_SHOT; 417222732Shrs pckt.port16[1] = 0; 418222732Shrs } 419222732Shrs 420222732Shrs if (vip_info->flags & F_HASH_NO_SRC_PORT) 421222732Shrs pckt.port16[0] = 0; 422222732Shrs 423225520Shrs if (!get_packet_dst(&dst, &pckt, vip_info, is_ipv6)) 424222732Shrs return TC_ACT_SHOT; 425222732Shrs 426222732Shrs if (dst->flags & F_IPV6) { 427222732Shrs cval = bpf_map_lookup_elem(&ctl_array, &v6_intf_pos); 428225520Shrs if (!cval) 429222732Shrs return TC_ACT_SHOT; 430222732Shrs ifindex = cval->ifindex; 431222732Shrs memcpy(tkey.remote_ipv6, dst->dstv6, 16); 432222732Shrs tun_flag = BPF_F_TUNINFO_IPV6; 433222732Shrs } else { 434222732Shrs cval = bpf_map_lookup_elem(&ctl_array, &v4_intf_pos); 435222732Shrs if (!cval) 436222732Shrs return TC_ACT_SHOT; 437222732Shrs ifindex = cval->ifindex; 438222732Shrs tkey.remote_ipv4 = dst->dst; 439222732Shrs } 440222732Shrs vip_num = vip_info->vip_num; 441222732Shrs data_stats = bpf_map_lookup_elem(&stats, &vip_num); 442222732Shrs if (!data_stats) 443222732Shrs return TC_ACT_SHOT; 444222732Shrs data_stats->pkts++; 445222732Shrs data_stats->bytes += pkt_bytes; 446222732Shrs bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), tun_flag); 447222732Shrs *(u32 *)eth->eth_dest = tkey.remote_ipv4; 448222732Shrs return bpf_redirect(ifindex, 0); 449222732Shrs} 450222732Shrs 451222861ShrsSEC("tc") 452222861Shrsint balancer_ingress(struct __sk_buff *ctx) 453222861Shrs{ 454222861Shrs void *data_end = (void *)(long)ctx->data_end; 455222861Shrs void *data = (void *)(long)ctx->data; 456222861Shrs struct eth_hdr *eth = data; 457222861Shrs __u32 eth_proto; 458222861Shrs __u32 nh_off; 459222861Shrs 460222861Shrs nh_off = sizeof(struct eth_hdr); 461222861Shrs if (data + nh_off > data_end) 462222861Shrs return TC_ACT_SHOT; 463222861Shrs eth_proto = eth->eth_proto; 464222861Shrs if (eth_proto == bpf_htons(ETH_P_IP)) 465222861Shrs return process_packet(data, nh_off, data_end, false, ctx); 466222861Shrs else if (eth_proto == bpf_htons(ETH_P_IPV6)) 467222861Shrs return process_packet(data, nh_off, data_end, true, ctx); 468222732Shrs else 469222732Shrs return TC_ACT_SHOT; 470222861Shrs} 471222861Shrschar _license[] SEC("license") = "GPL"; 472222732Shrs