// SPDX-License-Identifier: GPL-2.0
/*
 * IPVS         An implementation of the IP virtual server support for the
 *              LINUX operating system.  IPVS is now implemented as a module
 *              over the NetFilter framework. IPVS can be used to build a
 *              high-performance and highly available server based on a
 *              cluster of servers.
 *
 * Version 1,   is capable of handling both version 0 and 1 messages.
 *              Version 0 is the plain old format.
 *              Note Version 0 receivers will just drop Ver 1 messages.
 *              Version 1 is capable of handle IPv6, Persistence data,
 *              time-outs, and firewall marks.
 *              In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order.
 *              Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
 *
 * Definitions  Message: is a complete datagram
 *              Sync_conn: is a part of a Message
 *              Param Data is an option to a Sync_conn.
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *
 * ip_vs_sync:  sync connection info from master load balancer to backups
 *              through multicast
 *
 * Changes:
 *	Alexandre Cassen	:	Added master & backup support at a time.
 *	Alexandre Cassen	:	Added SyncID support for incoming sync
 *					messages filtering.
 *	Justin Ossevoort	:	Fix endian problem on sync message size.
 *	Hans Schillstrom	:	Added Version 1: i.e. IPv6,
 *					Persistence support, fwmark and time-out.
3346283Sdfr */ 3446283Sdfr 3546283Sdfr#define KMSG_COMPONENT "IPVS" 3646283Sdfr#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 3746283Sdfr 3846283Sdfr#include <linux/module.h> 3946283Sdfr#include <linux/slab.h> 4046283Sdfr#include <linux/inetdevice.h> 4146283Sdfr#include <linux/net.h> 4246283Sdfr#include <linux/completion.h> 4346283Sdfr#include <linux/delay.h> 4446283Sdfr#include <linux/skbuff.h> 4546283Sdfr#include <linux/in.h> 4646283Sdfr#include <linux/igmp.h> /* for ip_mc_join_group */ 4746283Sdfr#include <linux/udp.h> 4846283Sdfr#include <linux/err.h> 4946283Sdfr#include <linux/kthread.h> 5046283Sdfr#include <linux/wait.h> 5146283Sdfr#include <linux/kernel.h> 5246283Sdfr#include <linux/sched/signal.h> 5346283Sdfr 5446283Sdfr#include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */ 5546283Sdfr 5646283Sdfr#include <net/ip.h> 5746283Sdfr#include <net/sock.h> 5846283Sdfr 5946283Sdfr#include <net/ip_vs.h> 6046283Sdfr 6146283Sdfr#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ 6246283Sdfr#define IP_VS_SYNC_PORT 8848 /* multicast port */ 6346283Sdfr 6446283Sdfr#define SYNC_PROTO_VER 1 /* Protocol version in header */ 6546283Sdfr 6646283Sdfrstatic struct lock_class_key __ipvs_sync_key; 6746283Sdfr/* 6846283Sdfr * IPVS sync connection entry 6946283Sdfr * Version 0, i.e. original version. 
7046283Sdfr */ 7146283Sdfrstruct ip_vs_sync_conn_v0 { 7246283Sdfr __u8 reserved; 7346283Sdfr 7446283Sdfr /* Protocol, addresses and port numbers */ 7546283Sdfr __u8 protocol; /* Which protocol (TCP/UDP) */ 7646283Sdfr __be16 cport; 7746283Sdfr __be16 vport; 7846283Sdfr __be16 dport; 7946283Sdfr __be32 caddr; /* client address */ 8046283Sdfr __be32 vaddr; /* virtual address */ 8146283Sdfr __be32 daddr; /* destination address */ 8246283Sdfr 8346283Sdfr /* Flags and state transition */ 8446283Sdfr __be16 flags; /* status flags */ 8546283Sdfr __be16 state; /* state info */ 8646283Sdfr 8746283Sdfr /* The sequence options start here */ 8846283Sdfr}; 8946283Sdfr 9046283Sdfrstruct ip_vs_sync_conn_options { 9146283Sdfr struct ip_vs_seq in_seq; /* incoming seq. struct */ 9246283Sdfr struct ip_vs_seq out_seq; /* outgoing seq. struct */ 9346283Sdfr}; 9446283Sdfr 9546283Sdfr/* 9646283Sdfr Sync Connection format (sync_conn) 9746283Sdfr 9846283Sdfr 0 1 2 3 9946283Sdfr 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 10046283Sdfr +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 10146283Sdfr | Type | Protocol | Ver. | Size | 10246283Sdfr +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 10346283Sdfr | Flags | 10446283Sdfr +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 10546283Sdfr | State | cport | 10646283Sdfr +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 10746283Sdfr | vport | dport | 10846283Sdfr +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 10946283Sdfr | fwmark | 11046283Sdfr +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 11146283Sdfr | timeout (in sec.) | 11246283Sdfr +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 11346283Sdfr | ... | 11446283Sdfr | IP-Addresses (v4 or v6) | 11546283Sdfr | ... | 11646283Sdfr +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 11746283Sdfr Optional Parameters. 
11846283Sdfr +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 11946283Sdfr | Param. Type | Param. Length | Param. data | 12046283Sdfr +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 12146283Sdfr | ... | 12246283Sdfr | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 12346283Sdfr | | Param Type | Param. Length | 12446283Sdfr +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 12546283Sdfr | Param data | 12646283Sdfr | Last Param data should be padded for 32 bit alignment | 12746283Sdfr +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 12846283Sdfr*/ 12946283Sdfr 13046283Sdfr/* 13146283Sdfr * Type 0, IPv4 sync connection format 13246283Sdfr */ 13346283Sdfrstruct ip_vs_sync_v4 { 13446283Sdfr __u8 type; 13546283Sdfr __u8 protocol; /* Which protocol (TCP/UDP) */ 13646283Sdfr __be16 ver_size; /* Version msb 4 bits */ 13746283Sdfr /* Flags and state transition */ 13846283Sdfr __be32 flags; /* status flags */ 13946283Sdfr __be16 state; /* state info */ 14046283Sdfr /* Protocol, addresses and port numbers */ 14146283Sdfr __be16 cport; 14246283Sdfr __be16 vport; 14346283Sdfr __be16 dport; 14446283Sdfr __be32 fwmark; /* Firewall mark from skb */ 14546283Sdfr __be32 timeout; /* cp timeout */ 14646283Sdfr __be32 caddr; /* client address */ 14746283Sdfr __be32 vaddr; /* virtual address */ 14846283Sdfr __be32 daddr; /* destination address */ 14946283Sdfr /* The sequence options start here */ 15046283Sdfr /* PE data padded to 32bit alignment after seq. 
options */ 15146283Sdfr}; 15246283Sdfr/* 15346283Sdfr * Type 2 messages IPv6 15446283Sdfr */ 15546283Sdfrstruct ip_vs_sync_v6 { 15646283Sdfr __u8 type; 15746283Sdfr __u8 protocol; /* Which protocol (TCP/UDP) */ 15846283Sdfr __be16 ver_size; /* Version msb 4 bits */ 15946283Sdfr /* Flags and state transition */ 16046283Sdfr __be32 flags; /* status flags */ 16146283Sdfr __be16 state; /* state info */ 16246283Sdfr /* Protocol, addresses and port numbers */ 16346283Sdfr __be16 cport; 16446283Sdfr __be16 vport; 16546283Sdfr __be16 dport; 16646283Sdfr __be32 fwmark; /* Firewall mark from skb */ 16746283Sdfr __be32 timeout; /* cp timeout */ 16846283Sdfr struct in6_addr caddr; /* client address */ 16946283Sdfr struct in6_addr vaddr; /* virtual address */ 17046283Sdfr struct in6_addr daddr; /* destination address */ 17146283Sdfr /* The sequence options start here */ 17246283Sdfr /* PE data padded to 32bit alignment after seq. options */ 17346283Sdfr}; 17446283Sdfr 17546283Sdfrunion ip_vs_sync_conn { 17646283Sdfr struct ip_vs_sync_v4 v4; 17746283Sdfr struct ip_vs_sync_v6 v6; 17846283Sdfr}; 17946283Sdfr 18046283Sdfr/* Bits in Type field in above */ 18146283Sdfr#define STYPE_INET6 0 18246283Sdfr#define STYPE_F_INET6 (1 << STYPE_INET6) 18346283Sdfr 18446283Sdfr#define SVER_SHIFT 12 /* Shift to get version */ 18546283Sdfr#define SVER_MASK 0x0fff /* Mask to strip version */ 18646283Sdfr 18746283Sdfr#define IPVS_OPT_SEQ_DATA 1 18846283Sdfr#define IPVS_OPT_PE_DATA 2 18946283Sdfr#define IPVS_OPT_PE_NAME 3 19046283Sdfr#define IPVS_OPT_PARAM 7 19146283Sdfr 19246283Sdfr#define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1)) 19346283Sdfr#define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1)) 19446283Sdfr#define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1)) 19546283Sdfr#define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1)) 19646283Sdfr 19746283Sdfrstruct ip_vs_sync_thread_data { 19846283Sdfr struct task_struct *task; 19946283Sdfr struct netns_ipvs *ipvs; 20046283Sdfr struct socket *sock; 
20146283Sdfr char *buf; 20246283Sdfr int id; 20346283Sdfr}; 20446283Sdfr 20546283Sdfr/* Version 0 definition of packet sizes */ 20646283Sdfr#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0)) 20746283Sdfr#define FULL_CONN_SIZE \ 20846283Sdfr(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options)) 20946283Sdfr 21046283Sdfr 21146283Sdfr/* 21246283Sdfr The master mulitcasts messages (Datagrams) to the backup load balancers 21346283Sdfr in the following format. 21446283Sdfr 21546283Sdfr Version 1: 21646283Sdfr Note, first byte should be Zero, so ver 0 receivers will drop the packet. 21746283Sdfr 21846283Sdfr 0 1 2 3 21946283Sdfr 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 22046283Sdfr +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 22146283Sdfr | 0 | SyncID | Size | 22246283Sdfr +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 22346283Sdfr | Count Conns | Version | Reserved, set to Zero | 22446283Sdfr +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 22546283Sdfr | | 22646283Sdfr | IPVS Sync Connection (1) | 22746283Sdfr +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 22846283Sdfr | . | 22946283Sdfr ~ . ~ 23046283Sdfr | . 
| 23146283Sdfr +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 23246283Sdfr | | 23346283Sdfr | IPVS Sync Connection (n) | 23446283Sdfr +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 23546283Sdfr 23646283Sdfr Version 0 Header 23746283Sdfr 0 1 2 3 23846283Sdfr 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 23946283Sdfr +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 24046283Sdfr | Count Conns | SyncID | Size | 24146283Sdfr +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 24246283Sdfr | IPVS Sync Connection (1) | 24346283Sdfr*/ 24446283Sdfr 24546283Sdfr/* Version 0 header */ 24646283Sdfrstruct ip_vs_sync_mesg_v0 { 24746283Sdfr __u8 nr_conns; 24846283Sdfr __u8 syncid; 24946283Sdfr __be16 size; 25046283Sdfr 25146283Sdfr /* ip_vs_sync_conn entries start here */ 25246283Sdfr}; 25346283Sdfr 25446283Sdfr/* Version 1 header */ 25546283Sdfrstruct ip_vs_sync_mesg { 25646283Sdfr __u8 reserved; /* must be zero */ 25746283Sdfr __u8 syncid; 25846283Sdfr __be16 size; 25946283Sdfr __u8 nr_conns; 26046283Sdfr __s8 version; /* SYNC_PROTO_VER */ 26146283Sdfr __u16 spare; 26246283Sdfr /* ip_vs_sync_conn entries start here */ 26346283Sdfr}; 26446283Sdfr 26546283Sdfrunion ipvs_sockaddr { 26646283Sdfr struct sockaddr_in in; 26746283Sdfr struct sockaddr_in6 in6; 26846283Sdfr}; 26946283Sdfr 27046283Sdfrstruct ip_vs_sync_buff { 27146283Sdfr struct list_head list; 27246283Sdfr unsigned long firstuse; 27346283Sdfr 27446283Sdfr /* pointers for the message data */ 27546283Sdfr struct ip_vs_sync_mesg *mesg; 27646283Sdfr unsigned char *head; 27746283Sdfr unsigned char *end; 27846283Sdfr}; 27946283Sdfr 28046283Sdfr/* 28146283Sdfr * Copy of struct ip_vs_seq 28246283Sdfr * From unaligned network order to aligned host order 28346283Sdfr */ 28446283Sdfrstatic void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho) 28546283Sdfr{ 28646283Sdfr memset(ho, 0, sizeof(*ho)); 28746283Sdfr ho->init_seq = 
get_unaligned_be32(&no->init_seq); 28846283Sdfr ho->delta = get_unaligned_be32(&no->delta); 28946283Sdfr ho->previous_delta = get_unaligned_be32(&no->previous_delta); 29046283Sdfr} 29146283Sdfr 29246283Sdfr/* 29346283Sdfr * Copy of struct ip_vs_seq 29446283Sdfr * From Aligned host order to unaligned network order 29546283Sdfr */ 29646283Sdfrstatic void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no) 29746283Sdfr{ 29846283Sdfr put_unaligned_be32(ho->init_seq, &no->init_seq); 29946283Sdfr put_unaligned_be32(ho->delta, &no->delta); 30046283Sdfr put_unaligned_be32(ho->previous_delta, &no->previous_delta); 30146283Sdfr} 30246283Sdfr 30346283Sdfrstatic inline struct ip_vs_sync_buff * 30446283Sdfrsb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) 30546283Sdfr{ 30646283Sdfr struct ip_vs_sync_buff *sb; 30746283Sdfr 30846283Sdfr spin_lock_bh(&ipvs->sync_lock); 30946283Sdfr if (list_empty(&ms->sync_queue)) { 31046283Sdfr sb = NULL; 31146283Sdfr __set_current_state(TASK_INTERRUPTIBLE); 31246283Sdfr } else { 31346283Sdfr sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff, 31446283Sdfr list); 31546283Sdfr list_del(&sb->list); 31646283Sdfr ms->sync_queue_len--; 31746283Sdfr if (!ms->sync_queue_len) 31846283Sdfr ms->sync_queue_delay = 0; 31946283Sdfr } 32046283Sdfr spin_unlock_bh(&ipvs->sync_lock); 32146283Sdfr 32246283Sdfr return sb; 32346283Sdfr} 32446283Sdfr 32546283Sdfr/* 32646283Sdfr * Create a new sync buffer for Version 1 proto. 
32746283Sdfr */ 32846283Sdfrstatic inline struct ip_vs_sync_buff * 32946283Sdfrip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len) 33046283Sdfr{ 33146283Sdfr struct ip_vs_sync_buff *sb; 33246283Sdfr 33346283Sdfr if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) 33446283Sdfr return NULL; 33546283Sdfr 33646283Sdfr len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg), 33746283Sdfr ipvs->mcfg.sync_maxlen); 33846283Sdfr sb->mesg = kmalloc(len, GFP_ATOMIC); 33946283Sdfr if (!sb->mesg) { 34046283Sdfr kfree(sb); 34146283Sdfr return NULL; 34246283Sdfr } 34346283Sdfr sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */ 34446283Sdfr sb->mesg->version = SYNC_PROTO_VER; 34546283Sdfr sb->mesg->syncid = ipvs->mcfg.syncid; 34646283Sdfr sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg)); 34746283Sdfr sb->mesg->nr_conns = 0; 34846283Sdfr sb->mesg->spare = 0; 34946283Sdfr sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); 35046283Sdfr sb->end = (unsigned char *)sb->mesg + len; 35146283Sdfr 35246283Sdfr sb->firstuse = jiffies; 35346283Sdfr return sb; 35446283Sdfr} 35546283Sdfr 35646283Sdfrstatic inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) 35746283Sdfr{ 35846283Sdfr kfree(sb->mesg); 35946283Sdfr kfree(sb); 36046283Sdfr} 36146283Sdfr 36246283Sdfrstatic inline void sb_queue_tail(struct netns_ipvs *ipvs, 36346283Sdfr struct ipvs_master_sync_state *ms) 36446283Sdfr{ 36546283Sdfr struct ip_vs_sync_buff *sb = ms->sync_buff; 36646283Sdfr 36746283Sdfr spin_lock(&ipvs->sync_lock); 36846283Sdfr if (ipvs->sync_state & IP_VS_STATE_MASTER && 36946283Sdfr ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) { 37046283Sdfr if (!ms->sync_queue_len) 37146283Sdfr schedule_delayed_work(&ms->master_wakeup_work, 37246283Sdfr max(IPVS_SYNC_SEND_DELAY, 1)); 37346283Sdfr ms->sync_queue_len++; 37446283Sdfr list_add_tail(&sb->list, &ms->sync_queue); 37546283Sdfr if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) { 
37646283Sdfr int id = (int)(ms - ipvs->ms); 37746283Sdfr 37846283Sdfr wake_up_process(ipvs->master_tinfo[id].task); 37946283Sdfr } 38046283Sdfr } else 38146283Sdfr ip_vs_sync_buff_release(sb); 38246283Sdfr spin_unlock(&ipvs->sync_lock); 38346283Sdfr} 38446283Sdfr 38546283Sdfr/* 38646283Sdfr * Get the current sync buffer if it has been created for more 38746283Sdfr * than the specified time or the specified time is zero. 38846283Sdfr */ 38946283Sdfrstatic inline struct ip_vs_sync_buff * 39046283Sdfrget_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms, 39146283Sdfr unsigned long time) 39246283Sdfr{ 39346283Sdfr struct ip_vs_sync_buff *sb; 39446283Sdfr 39546283Sdfr spin_lock_bh(&ipvs->sync_buff_lock); 39646283Sdfr sb = ms->sync_buff; 39746283Sdfr if (sb && time_after_eq(jiffies - sb->firstuse, time)) { 39846283Sdfr ms->sync_buff = NULL; 39946283Sdfr __set_current_state(TASK_RUNNING); 40046283Sdfr } else 40146283Sdfr sb = NULL; 40246283Sdfr spin_unlock_bh(&ipvs->sync_buff_lock); 40346283Sdfr return sb; 40446283Sdfr} 40546283Sdfr 40646283Sdfrstatic inline int 40746283Sdfrselect_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp) 40846283Sdfr{ 40946283Sdfr return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask; 41046283Sdfr} 41146283Sdfr 41246283Sdfr/* 41346283Sdfr * Create a new sync buffer for Version 0 proto. 
41446283Sdfr */ 41546283Sdfrstatic inline struct ip_vs_sync_buff * 41646283Sdfrip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len) 41746283Sdfr{ 41846283Sdfr struct ip_vs_sync_buff *sb; 41946283Sdfr struct ip_vs_sync_mesg_v0 *mesg; 42046283Sdfr 42146283Sdfr if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) 42246283Sdfr return NULL; 42346283Sdfr 42446283Sdfr len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0), 42546283Sdfr ipvs->mcfg.sync_maxlen); 42646283Sdfr sb->mesg = kmalloc(len, GFP_ATOMIC); 42746283Sdfr if (!sb->mesg) { 42846283Sdfr kfree(sb); 42946283Sdfr return NULL; 43046283Sdfr } 43146283Sdfr mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; 43246283Sdfr mesg->nr_conns = 0; 43346283Sdfr mesg->syncid = ipvs->mcfg.syncid; 43446283Sdfr mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0)); 43546283Sdfr sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); 43646283Sdfr sb->end = (unsigned char *)mesg + len; 43746283Sdfr sb->firstuse = jiffies; 43846283Sdfr return sb; 43946283Sdfr} 44046283Sdfr 44146283Sdfr/* Check if connection is controlled by persistence */ 44246283Sdfrstatic inline bool in_persistence(struct ip_vs_conn *cp) 44346283Sdfr{ 44446283Sdfr for (cp = cp->control; cp; cp = cp->control) { 44546283Sdfr if (cp->flags & IP_VS_CONN_F_TEMPLATE) 44646283Sdfr return true; 44746283Sdfr } 44846283Sdfr return false; 44946283Sdfr} 45046283Sdfr 45146283Sdfr/* Check if conn should be synced. 45246283Sdfr * pkts: conn packets, use sysctl_sync_threshold to avoid packet check 45346283Sdfr * - (1) sync_refresh_period: reduce sync rate. 
Additionally, retry 45446283Sdfr * sync_retries times with period of sync_refresh_period/8 45546283Sdfr * - (2) if both sync_refresh_period and sync_period are 0 send sync only 45646283Sdfr * for state changes or only once when pkts matches sync_threshold 45746283Sdfr * - (3) templates: rate can be reduced only with sync_refresh_period or 45846283Sdfr * with (2) 45946283Sdfr */ 46046283Sdfrstatic int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs, 46146283Sdfr struct ip_vs_conn *cp, int pkts) 46246283Sdfr{ 46346283Sdfr unsigned long orig = READ_ONCE(cp->sync_endtime); 46446283Sdfr unsigned long now = jiffies; 46546283Sdfr unsigned long n = (now + cp->timeout) & ~3UL; 46646283Sdfr unsigned int sync_refresh_period; 46746283Sdfr int sync_period; 46846283Sdfr int force; 46946283Sdfr 47046283Sdfr /* Check if we sync in current state */ 47146283Sdfr if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE)) 47246283Sdfr force = 0; 47346283Sdfr else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp))) 47446283Sdfr return 0; 47546283Sdfr else if (likely(cp->protocol == IPPROTO_TCP)) { 47646283Sdfr if (!((1 << cp->state) & 47746283Sdfr ((1 << IP_VS_TCP_S_ESTABLISHED) | 47846283Sdfr (1 << IP_VS_TCP_S_FIN_WAIT) | 47946283Sdfr (1 << IP_VS_TCP_S_CLOSE) | 48046283Sdfr (1 << IP_VS_TCP_S_CLOSE_WAIT) | 48146283Sdfr (1 << IP_VS_TCP_S_TIME_WAIT)))) 48246283Sdfr return 0; 48346283Sdfr force = cp->state != cp->old_state; 48446283Sdfr if (force && cp->state != IP_VS_TCP_S_ESTABLISHED) 48546283Sdfr goto set; 48646283Sdfr } else if (unlikely(cp->protocol == IPPROTO_SCTP)) { 48746283Sdfr if (!((1 << cp->state) & 48846283Sdfr ((1 << IP_VS_SCTP_S_ESTABLISHED) | 48946283Sdfr (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) | 49046283Sdfr (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) | 49146283Sdfr (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) | 49246283Sdfr (1 << IP_VS_SCTP_S_CLOSED)))) 49346283Sdfr return 0; 49446283Sdfr force = cp->state != cp->old_state; 49546283Sdfr if (force && cp->state != 
IP_VS_SCTP_S_ESTABLISHED) 49646283Sdfr goto set; 49746283Sdfr } else { 49846283Sdfr /* UDP or another protocol with single state */ 49946283Sdfr force = 0; 50046283Sdfr } 50146283Sdfr 50246283Sdfr sync_refresh_period = sysctl_sync_refresh_period(ipvs); 50346283Sdfr if (sync_refresh_period > 0) { 50446283Sdfr long diff = n - orig; 50546283Sdfr long min_diff = max(cp->timeout >> 1, 10UL * HZ); 50646283Sdfr 50746283Sdfr /* Avoid sync if difference is below sync_refresh_period 50846283Sdfr * and below the half timeout. 50946283Sdfr */ 51046283Sdfr if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) { 51146283Sdfr int retries = orig & 3; 51246283Sdfr 51346283Sdfr if (retries >= sysctl_sync_retries(ipvs)) 51446283Sdfr return 0; 51546283Sdfr if (time_before(now, orig - cp->timeout + 51646283Sdfr (sync_refresh_period >> 3))) 51746283Sdfr return 0; 51846283Sdfr n |= retries + 1; 51946283Sdfr } 52046283Sdfr } 52146283Sdfr sync_period = sysctl_sync_period(ipvs); 52246283Sdfr if (sync_period > 0) { 52346283Sdfr if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) && 52446283Sdfr pkts % sync_period != sysctl_sync_threshold(ipvs)) 52546283Sdfr return 0; 52646283Sdfr } else if (!sync_refresh_period && 52746283Sdfr pkts != sysctl_sync_threshold(ipvs)) 52846283Sdfr return 0; 52946283Sdfr 53046283Sdfrset: 53146283Sdfr cp->old_state = cp->state; 53246283Sdfr n = cmpxchg(&cp->sync_endtime, orig, n); 53346283Sdfr return n == orig || force; 53446283Sdfr} 53546283Sdfr 53646283Sdfr/* 53746283Sdfr * Version 0 , could be switched in by sys_ctl. 53846283Sdfr * Add an ip_vs_conn information into the current sync_buff. 
53946283Sdfr */ 54046283Sdfrstatic void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, 54146283Sdfr int pkts) 54246283Sdfr{ 54346283Sdfr struct ip_vs_sync_mesg_v0 *m; 54446283Sdfr struct ip_vs_sync_conn_v0 *s; 54546283Sdfr struct ip_vs_sync_buff *buff; 54646283Sdfr struct ipvs_master_sync_state *ms; 54746283Sdfr int id; 54846283Sdfr unsigned int len; 54946283Sdfr 55046283Sdfr if (unlikely(cp->af != AF_INET)) 55146283Sdfr return; 55246283Sdfr /* Do not sync ONE PACKET */ 55346283Sdfr if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 55446283Sdfr return; 55546283Sdfr 55646283Sdfr if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) 55746283Sdfr return; 55846283Sdfr 55946283Sdfr spin_lock_bh(&ipvs->sync_buff_lock); 56046283Sdfr if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 56146283Sdfr spin_unlock_bh(&ipvs->sync_buff_lock); 56246283Sdfr return; 56346283Sdfr } 56446283Sdfr 56546283Sdfr id = select_master_thread_id(ipvs, cp); 56646283Sdfr ms = &ipvs->ms[id]; 56746283Sdfr buff = ms->sync_buff; 56846283Sdfr len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? 
FULL_CONN_SIZE : 56946283Sdfr SIMPLE_CONN_SIZE; 57046283Sdfr if (buff) { 57146283Sdfr m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; 57246283Sdfr /* Send buffer if it is for v1 */ 57346283Sdfr if (buff->head + len > buff->end || !m->nr_conns) { 57446283Sdfr sb_queue_tail(ipvs, ms); 57546283Sdfr ms->sync_buff = NULL; 57646283Sdfr buff = NULL; 57746283Sdfr } 57846283Sdfr } 57946283Sdfr if (!buff) { 58046283Sdfr buff = ip_vs_sync_buff_create_v0(ipvs, len); 58146283Sdfr if (!buff) { 58246283Sdfr spin_unlock_bh(&ipvs->sync_buff_lock); 58346283Sdfr pr_err("ip_vs_sync_buff_create failed.\n"); 58446283Sdfr return; 58546283Sdfr } 58646283Sdfr ms->sync_buff = buff; 58746283Sdfr } 58846283Sdfr 58946283Sdfr m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; 59046283Sdfr s = (struct ip_vs_sync_conn_v0 *) buff->head; 59146283Sdfr 59246283Sdfr /* copy members */ 59346283Sdfr s->reserved = 0; 59446283Sdfr s->protocol = cp->protocol; 59546283Sdfr s->cport = cp->cport; 59646283Sdfr s->vport = cp->vport; 59746283Sdfr s->dport = cp->dport; 59846283Sdfr s->caddr = cp->caddr.ip; 59946283Sdfr s->vaddr = cp->vaddr.ip; 60046283Sdfr s->daddr = cp->daddr.ip; 60146283Sdfr s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); 60246283Sdfr s->state = htons(cp->state); 60346283Sdfr if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { 60446283Sdfr struct ip_vs_sync_conn_options *opt = 60546283Sdfr (struct ip_vs_sync_conn_options *)&s[1]; 60646283Sdfr memcpy(opt, &cp->sync_conn_opt, sizeof(*opt)); 60746283Sdfr } 60846283Sdfr 60946283Sdfr m->nr_conns++; 61046283Sdfr m->size = htons(ntohs(m->size) + len); 61146283Sdfr buff->head += len; 61246283Sdfr spin_unlock_bh(&ipvs->sync_buff_lock); 61346283Sdfr 61446283Sdfr /* synchronize its controller if it has */ 61546283Sdfr cp = cp->control; 61646283Sdfr if (cp) { 61746283Sdfr if (cp->flags & IP_VS_CONN_F_TEMPLATE) 61846283Sdfr pkts = atomic_inc_return(&cp->in_pkts); 61946283Sdfr else 62046283Sdfr pkts = sysctl_sync_threshold(ipvs); 62146283Sdfr ip_vs_sync_conn(ipvs, cp, 
pkts); 62246283Sdfr } 62346283Sdfr} 62446283Sdfr 625/* 626 * Add an ip_vs_conn information into the current sync_buff. 627 * Called by ip_vs_in. 628 * Sending Version 1 messages 629 */ 630void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts) 631{ 632 struct ip_vs_sync_mesg *m; 633 union ip_vs_sync_conn *s; 634 struct ip_vs_sync_buff *buff; 635 struct ipvs_master_sync_state *ms; 636 int id; 637 __u8 *p; 638 unsigned int len, pe_name_len, pad; 639 640 /* Handle old version of the protocol */ 641 if (sysctl_sync_ver(ipvs) == 0) { 642 ip_vs_sync_conn_v0(ipvs, cp, pkts); 643 return; 644 } 645 /* Do not sync ONE PACKET */ 646 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 647 goto control; 648sloop: 649 if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) 650 goto control; 651 652 /* Sanity checks */ 653 pe_name_len = 0; 654 if (cp->pe_data_len) { 655 if (!cp->pe_data || !cp->dest) { 656 IP_VS_ERR_RL("SYNC, connection pe_data invalid\n"); 657 return; 658 } 659 pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN); 660 } 661 662 spin_lock_bh(&ipvs->sync_buff_lock); 663 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 664 spin_unlock_bh(&ipvs->sync_buff_lock); 665 return; 666 } 667 668 id = select_master_thread_id(ipvs, cp); 669 ms = &ipvs->ms[id]; 670 671#ifdef CONFIG_IP_VS_IPV6 672 if (cp->af == AF_INET6) 673 len = sizeof(struct ip_vs_sync_v6); 674 else 675#endif 676 len = sizeof(struct ip_vs_sync_v4); 677 678 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) 679 len += sizeof(struct ip_vs_sync_conn_options) + 2; 680 681 if (cp->pe_data_len) 682 len += cp->pe_data_len + 2; /* + Param hdr field */ 683 if (pe_name_len) 684 len += pe_name_len + 2; 685 686 /* check if there is a space for this one */ 687 pad = 0; 688 buff = ms->sync_buff; 689 if (buff) { 690 m = buff->mesg; 691 pad = (4 - (size_t) buff->head) & 3; 692 /* Send buffer if it is for v0 */ 693 if (buff->head + len + pad > buff->end || m->reserved) { 694 sb_queue_tail(ipvs, ms); 695 ms->sync_buff = NULL; 696 
buff = NULL; 697 pad = 0; 698 } 699 } 700 701 if (!buff) { 702 buff = ip_vs_sync_buff_create(ipvs, len); 703 if (!buff) { 704 spin_unlock_bh(&ipvs->sync_buff_lock); 705 pr_err("ip_vs_sync_buff_create failed.\n"); 706 return; 707 } 708 ms->sync_buff = buff; 709 m = buff->mesg; 710 } 711 712 p = buff->head; 713 buff->head += pad + len; 714 m->size = htons(ntohs(m->size) + pad + len); 715 /* Add ev. padding from prev. sync_conn */ 716 while (pad--) 717 *(p++) = 0; 718 719 s = (union ip_vs_sync_conn *)p; 720 721 /* Set message type & copy members */ 722 s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0); 723 s->v4.ver_size = htons(len & SVER_MASK); /* Version 0 */ 724 s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED); 725 s->v4.state = htons(cp->state); 726 s->v4.protocol = cp->protocol; 727 s->v4.cport = cp->cport; 728 s->v4.vport = cp->vport; 729 s->v4.dport = cp->dport; 730 s->v4.fwmark = htonl(cp->fwmark); 731 s->v4.timeout = htonl(cp->timeout / HZ); 732 m->nr_conns++; 733 734#ifdef CONFIG_IP_VS_IPV6 735 if (cp->af == AF_INET6) { 736 p += sizeof(struct ip_vs_sync_v6); 737 s->v6.caddr = cp->caddr.in6; 738 s->v6.vaddr = cp->vaddr.in6; 739 s->v6.daddr = cp->daddr.in6; 740 } else 741#endif 742 { 743 p += sizeof(struct ip_vs_sync_v4); /* options ptr */ 744 s->v4.caddr = cp->caddr.ip; 745 s->v4.vaddr = cp->vaddr.ip; 746 s->v4.daddr = cp->daddr.ip; 747 } 748 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { 749 *(p++) = IPVS_OPT_SEQ_DATA; 750 *(p++) = sizeof(struct ip_vs_sync_conn_options); 751 hton_seq((struct ip_vs_seq *)p, &cp->in_seq); 752 p += sizeof(struct ip_vs_seq); 753 hton_seq((struct ip_vs_seq *)p, &cp->out_seq); 754 p += sizeof(struct ip_vs_seq); 755 } 756 /* Handle pe data */ 757 if (cp->pe_data_len && cp->pe_data) { 758 *(p++) = IPVS_OPT_PE_DATA; 759 *(p++) = cp->pe_data_len; 760 memcpy(p, cp->pe_data, cp->pe_data_len); 761 p += cp->pe_data_len; 762 if (pe_name_len) { 763 /* Add PE_NAME */ 764 *(p++) = IPVS_OPT_PE_NAME; 765 *(p++) = pe_name_len; 766 memcpy(p, 
cp->pe->name, pe_name_len); 767 p += pe_name_len; 768 } 769 } 770 771 spin_unlock_bh(&ipvs->sync_buff_lock); 772 773control: 774 /* synchronize its controller if it has */ 775 cp = cp->control; 776 if (!cp) 777 return; 778 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 779 pkts = atomic_inc_return(&cp->in_pkts); 780 else 781 pkts = sysctl_sync_threshold(ipvs); 782 goto sloop; 783} 784 785/* 786 * fill_param used by version 1 787 */ 788static inline int 789ip_vs_conn_fill_param_sync(struct netns_ipvs *ipvs, int af, union ip_vs_sync_conn *sc, 790 struct ip_vs_conn_param *p, 791 __u8 *pe_data, unsigned int pe_data_len, 792 __u8 *pe_name, unsigned int pe_name_len) 793{ 794#ifdef CONFIG_IP_VS_IPV6 795 if (af == AF_INET6) 796 ip_vs_conn_fill_param(ipvs, af, sc->v6.protocol, 797 (const union nf_inet_addr *)&sc->v6.caddr, 798 sc->v6.cport, 799 (const union nf_inet_addr *)&sc->v6.vaddr, 800 sc->v6.vport, p); 801 else 802#endif 803 ip_vs_conn_fill_param(ipvs, af, sc->v4.protocol, 804 (const union nf_inet_addr *)&sc->v4.caddr, 805 sc->v4.cport, 806 (const union nf_inet_addr *)&sc->v4.vaddr, 807 sc->v4.vport, p); 808 /* Handle pe data */ 809 if (pe_data_len) { 810 if (pe_name_len) { 811 char buff[IP_VS_PENAME_MAXLEN+1]; 812 813 memcpy(buff, pe_name, pe_name_len); 814 buff[pe_name_len]=0; 815 p->pe = __ip_vs_pe_getbyname(buff); 816 if (!p->pe) { 817 IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", 818 buff); 819 return 1; 820 } 821 } else { 822 IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n"); 823 return 1; 824 } 825 826 p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC); 827 if (!p->pe_data) { 828 module_put(p->pe->module); 829 return -ENOMEM; 830 } 831 p->pe_data_len = pe_data_len; 832 } 833 return 0; 834} 835 836/* 837 * Connection Add / Update. 838 * Common for version 0 and 1 reception of backup sync_conns. 839 * Param: ... 840 * timeout is in sec. 
841 */ 842static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *param, 843 unsigned int flags, unsigned int state, 844 unsigned int protocol, unsigned int type, 845 const union nf_inet_addr *daddr, __be16 dport, 846 unsigned long timeout, __u32 fwmark, 847 struct ip_vs_sync_conn_options *opt) 848{ 849 struct ip_vs_dest *dest; 850 struct ip_vs_conn *cp; 851 852 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 853 cp = ip_vs_conn_in_get(param); 854 if (cp && ((cp->dport != dport) || 855 !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) { 856 if (!(flags & IP_VS_CONN_F_INACTIVE)) { 857 ip_vs_conn_expire_now(cp); 858 __ip_vs_conn_put(cp); 859 cp = NULL; 860 } else { 861 /* This is the expiration message for the 862 * connection that was already replaced, so we 863 * just ignore it. 864 */ 865 __ip_vs_conn_put(cp); 866 kfree(param->pe_data); 867 return; 868 } 869 } 870 } else { 871 cp = ip_vs_ct_in_get(param); 872 } 873 874 if (cp) { 875 /* Free pe_data */ 876 kfree(param->pe_data); 877 878 dest = cp->dest; 879 spin_lock_bh(&cp->lock); 880 if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE && 881 !(flags & IP_VS_CONN_F_TEMPLATE) && dest) { 882 if (flags & IP_VS_CONN_F_INACTIVE) { 883 atomic_dec(&dest->activeconns); 884 atomic_inc(&dest->inactconns); 885 } else { 886 atomic_inc(&dest->activeconns); 887 atomic_dec(&dest->inactconns); 888 } 889 } 890 flags &= IP_VS_CONN_F_BACKUP_UPD_MASK; 891 flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK; 892 cp->flags = flags; 893 spin_unlock_bh(&cp->lock); 894 if (!dest) 895 ip_vs_try_bind_dest(cp); 896 } else { 897 /* 898 * Find the appropriate destination for the connection. 899 * If it is not found the connection will remain unbound 900 * but still handled. 901 */ 902 rcu_read_lock(); 903 /* This function is only invoked by the synchronization 904 * code. 
We do not currently support heterogeneous pools 905 * with synchronization, so we can make the assumption that 906 * the svc_af is the same as the dest_af 907 */ 908 dest = ip_vs_find_dest(ipvs, type, type, daddr, dport, 909 param->vaddr, param->vport, protocol, 910 fwmark, flags); 911 912 cp = ip_vs_conn_new(param, type, daddr, dport, flags, dest, 913 fwmark); 914 rcu_read_unlock(); 915 if (!cp) { 916 kfree(param->pe_data); 917 IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); 918 return; 919 } 920 if (!(flags & IP_VS_CONN_F_TEMPLATE)) 921 kfree(param->pe_data); 922 } 923 924 if (opt) { 925 cp->in_seq = opt->in_seq; 926 cp->out_seq = opt->out_seq; 927 } 928 atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs)); 929 cp->state = state; 930 cp->old_state = cp->state; 931 /* 932 * For Ver 0 messages style 933 * - Not possible to recover the right timeout for templates 934 * - can not find the right fwmark 935 * virtual service. If needed, we can do it for 936 * non-fwmark persistent services. 937 * Ver 1 messages style. 938 * - No problem. 
939 */ 940 if (timeout) { 941 if (timeout > MAX_SCHEDULE_TIMEOUT / HZ) 942 timeout = MAX_SCHEDULE_TIMEOUT / HZ; 943 cp->timeout = timeout*HZ; 944 } else { 945 struct ip_vs_proto_data *pd; 946 947 pd = ip_vs_proto_data_get(ipvs, protocol); 948 if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table) 949 cp->timeout = pd->timeout_table[state]; 950 else 951 cp->timeout = (3*60*HZ); 952 } 953 ip_vs_conn_put(cp); 954} 955 956/* 957 * Process received multicast message for Version 0 958 */ 959static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer, 960 const size_t buflen) 961{ 962 struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer; 963 struct ip_vs_sync_conn_v0 *s; 964 struct ip_vs_sync_conn_options *opt; 965 struct ip_vs_protocol *pp; 966 struct ip_vs_conn_param param; 967 char *p; 968 int i; 969 970 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0); 971 for (i=0; i<m->nr_conns; i++) { 972 unsigned int flags, state; 973 974 if (p + SIMPLE_CONN_SIZE > buffer+buflen) { 975 IP_VS_ERR_RL("BACKUP v0, bogus conn\n"); 976 return; 977 } 978 s = (struct ip_vs_sync_conn_v0 *) p; 979 flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; 980 flags &= ~IP_VS_CONN_F_HASHED; 981 if (flags & IP_VS_CONN_F_SEQ_MASK) { 982 opt = (struct ip_vs_sync_conn_options *)&s[1]; 983 p += FULL_CONN_SIZE; 984 if (p > buffer+buflen) { 985 IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n"); 986 return; 987 } 988 } else { 989 opt = NULL; 990 p += SIMPLE_CONN_SIZE; 991 } 992 993 state = ntohs(s->state); 994 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 995 pp = ip_vs_proto_get(s->protocol); 996 if (!pp) { 997 IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n", 998 s->protocol); 999 continue; 1000 } 1001 if (state >= pp->num_states) { 1002 IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n", 1003 pp->name, state); 1004 continue; 1005 } 1006 } else { 1007 if (state >= IP_VS_CTPL_S_LAST) 1008 IP_VS_DBG(7, "BACKUP v0, Invalid tpl state %u\n", 1009 state); 
1010 } 1011 1012 ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol, 1013 (const union nf_inet_addr *)&s->caddr, 1014 s->cport, 1015 (const union nf_inet_addr *)&s->vaddr, 1016 s->vport, ¶m); 1017 1018 /* Send timeout as Zero */ 1019 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->protocol, AF_INET, 1020 (union nf_inet_addr *)&s->daddr, s->dport, 1021 0, 0, opt); 1022 } 1023} 1024 1025/* 1026 * Handle options 1027 */ 1028static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen, 1029 __u32 *opt_flags, 1030 struct ip_vs_sync_conn_options *opt) 1031{ 1032 struct ip_vs_sync_conn_options *topt; 1033 1034 topt = (struct ip_vs_sync_conn_options *)p; 1035 1036 if (plen != sizeof(struct ip_vs_sync_conn_options)) { 1037 IP_VS_DBG(2, "BACKUP, bogus conn options length\n"); 1038 return -EINVAL; 1039 } 1040 if (*opt_flags & IPVS_OPT_F_SEQ_DATA) { 1041 IP_VS_DBG(2, "BACKUP, conn options found twice\n"); 1042 return -EINVAL; 1043 } 1044 ntoh_seq(&topt->in_seq, &opt->in_seq); 1045 ntoh_seq(&topt->out_seq, &opt->out_seq); 1046 *opt_flags |= IPVS_OPT_F_SEQ_DATA; 1047 return 0; 1048} 1049 1050static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len, 1051 __u8 **data, unsigned int maxlen, 1052 __u32 *opt_flags, __u32 flag) 1053{ 1054 if (plen > maxlen) { 1055 IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen); 1056 return -EINVAL; 1057 } 1058 if (*opt_flags & flag) { 1059 IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag); 1060 return -EINVAL; 1061 } 1062 *data_len = plen; 1063 *data = p; 1064 *opt_flags |= flag; 1065 return 0; 1066} 1067/* 1068 * Process a Version 1 sync. 
connection 1069 */ 1070static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *msg_end) 1071{ 1072 struct ip_vs_sync_conn_options opt; 1073 union ip_vs_sync_conn *s; 1074 struct ip_vs_protocol *pp; 1075 struct ip_vs_conn_param param; 1076 __u32 flags; 1077 unsigned int af, state, pe_data_len=0, pe_name_len=0; 1078 __u8 *pe_data=NULL, *pe_name=NULL; 1079 __u32 opt_flags=0; 1080 int retc=0; 1081 1082 s = (union ip_vs_sync_conn *) p; 1083 1084 if (s->v6.type & STYPE_F_INET6) { 1085#ifdef CONFIG_IP_VS_IPV6 1086 af = AF_INET6; 1087 p += sizeof(struct ip_vs_sync_v6); 1088#else 1089 IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n"); 1090 retc = 10; 1091 goto out; 1092#endif 1093 } else if (!s->v4.type) { 1094 af = AF_INET; 1095 p += sizeof(struct ip_vs_sync_v4); 1096 } else { 1097 return -10; 1098 } 1099 if (p > msg_end) 1100 return -20; 1101 1102 /* Process optional params check Type & Len. */ 1103 while (p < msg_end) { 1104 int ptype; 1105 int plen; 1106 1107 if (p+2 > msg_end) 1108 return -30; 1109 ptype = *(p++); 1110 plen = *(p++); 1111 1112 if (!plen || ((p + plen) > msg_end)) 1113 return -40; 1114 /* Handle seq option p = param data */ 1115 switch (ptype & ~IPVS_OPT_F_PARAM) { 1116 case IPVS_OPT_SEQ_DATA: 1117 if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt)) 1118 return -50; 1119 break; 1120 1121 case IPVS_OPT_PE_DATA: 1122 if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data, 1123 IP_VS_PEDATA_MAXLEN, &opt_flags, 1124 IPVS_OPT_F_PE_DATA)) 1125 return -60; 1126 break; 1127 1128 case IPVS_OPT_PE_NAME: 1129 if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name, 1130 IP_VS_PENAME_MAXLEN, &opt_flags, 1131 IPVS_OPT_F_PE_NAME)) 1132 return -70; 1133 break; 1134 1135 default: 1136 /* Param data mandatory ? 
*/ 1137 if (!(ptype & IPVS_OPT_F_PARAM)) { 1138 IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n", 1139 ptype & ~IPVS_OPT_F_PARAM); 1140 retc = 20; 1141 goto out; 1142 } 1143 } 1144 p += plen; /* Next option */ 1145 } 1146 1147 /* Get flags and Mask off unsupported */ 1148 flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK; 1149 flags |= IP_VS_CONN_F_SYNC; 1150 state = ntohs(s->v4.state); 1151 1152 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 1153 pp = ip_vs_proto_get(s->v4.protocol); 1154 if (!pp) { 1155 IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n", 1156 s->v4.protocol); 1157 retc = 30; 1158 goto out; 1159 } 1160 if (state >= pp->num_states) { 1161 IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n", 1162 pp->name, state); 1163 retc = 40; 1164 goto out; 1165 } 1166 } else { 1167 if (state >= IP_VS_CTPL_S_LAST) 1168 IP_VS_DBG(7, "BACKUP, Invalid tpl state %u\n", 1169 state); 1170 } 1171 if (ip_vs_conn_fill_param_sync(ipvs, af, s, ¶m, pe_data, 1172 pe_data_len, pe_name, pe_name_len)) { 1173 retc = 50; 1174 goto out; 1175 } 1176 /* If only IPv4, just silent skip IPv6 */ 1177 if (af == AF_INET) 1178 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v4.protocol, af, 1179 (union nf_inet_addr *)&s->v4.daddr, s->v4.dport, 1180 ntohl(s->v4.timeout), ntohl(s->v4.fwmark), 1181 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) 1182 ); 1183#ifdef CONFIG_IP_VS_IPV6 1184 else 1185 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v6.protocol, af, 1186 (union nf_inet_addr *)&s->v6.daddr, s->v6.dport, 1187 ntohl(s->v6.timeout), ntohl(s->v6.fwmark), 1188 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) 1189 ); 1190#endif 1191 ip_vs_pe_put(param.pe); 1192 return 0; 1193 /* Error exit */ 1194out: 1195 IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc); 1196 return retc; 1197 1198} 1199/* 1200 * Process received multicast message and create the corresponding 1201 * ip_vs_conn entries. 
1202 * Handles Version 0 & 1 1203 */ 1204static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer, 1205 const size_t buflen) 1206{ 1207 struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer; 1208 __u8 *p, *msg_end; 1209 int i, nr_conns; 1210 1211 if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) { 1212 IP_VS_DBG(2, "BACKUP, message header too short\n"); 1213 return; 1214 } 1215 1216 if (buflen != ntohs(m2->size)) { 1217 IP_VS_DBG(2, "BACKUP, bogus message size\n"); 1218 return; 1219 } 1220 /* SyncID sanity check */ 1221 if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) { 1222 IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid); 1223 return; 1224 } 1225 /* Handle version 1 message */ 1226 if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0) 1227 && (m2->spare == 0)) { 1228 1229 msg_end = buffer + sizeof(struct ip_vs_sync_mesg); 1230 nr_conns = m2->nr_conns; 1231 1232 for (i=0; i<nr_conns; i++) { 1233 union ip_vs_sync_conn *s; 1234 unsigned int size; 1235 int retc; 1236 1237 p = msg_end; 1238 if (p + sizeof(s->v4) > buffer+buflen) { 1239 IP_VS_ERR_RL("BACKUP, Dropping buffer, too small\n"); 1240 return; 1241 } 1242 s = (union ip_vs_sync_conn *)p; 1243 size = ntohs(s->v4.ver_size) & SVER_MASK; 1244 msg_end = p + size; 1245 /* Basic sanity checks */ 1246 if (msg_end > buffer+buflen) { 1247 IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n"); 1248 return; 1249 } 1250 if (ntohs(s->v4.ver_size) >> SVER_SHIFT) { 1251 IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n", 1252 ntohs(s->v4.ver_size) >> SVER_SHIFT); 1253 return; 1254 } 1255 /* Process a single sync_conn */ 1256 retc = ip_vs_proc_sync_conn(ipvs, p, msg_end); 1257 if (retc < 0) { 1258 IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n", 1259 retc); 1260 return; 1261 } 1262 /* Make sure we have 32 bit alignment */ 1263 msg_end = p + ((size + 3) & ~3); 1264 } 1265 } else { 1266 /* Old type of message */ 1267 ip_vs_process_message_v0(ipvs, buffer, 
buflen); 1268 return; 1269 } 1270} 1271 1272 1273/* 1274 * Setup sndbuf (mode=1) or rcvbuf (mode=0) 1275 */ 1276static void set_sock_size(struct sock *sk, int mode, int val) 1277{ 1278 /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */ 1279 /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */ 1280 lock_sock(sk); 1281 if (mode) { 1282 val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2, 1283 READ_ONCE(sysctl_wmem_max)); 1284 sk->sk_sndbuf = val * 2; 1285 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 1286 } else { 1287 val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2, 1288 READ_ONCE(sysctl_rmem_max)); 1289 sk->sk_rcvbuf = val * 2; 1290 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 1291 } 1292 release_sock(sk); 1293} 1294 1295/* 1296 * Setup loopback of outgoing multicasts on a sending socket 1297 */ 1298static void set_mcast_loop(struct sock *sk, u_char loop) 1299{ 1300 /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ 1301 inet_assign_bit(MC_LOOP, sk, loop); 1302#ifdef CONFIG_IP_VS_IPV6 1303 if (READ_ONCE(sk->sk_family) == AF_INET6) { 1304 /* IPV6_MULTICAST_LOOP */ 1305 inet6_assign_bit(MC6_LOOP, sk, loop); 1306 } 1307#endif 1308} 1309 1310/* 1311 * Specify TTL for outgoing multicasts on a sending socket 1312 */ 1313static void set_mcast_ttl(struct sock *sk, u_char ttl) 1314{ 1315 struct inet_sock *inet = inet_sk(sk); 1316 1317 /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ 1318 lock_sock(sk); 1319 WRITE_ONCE(inet->mc_ttl, ttl); 1320#ifdef CONFIG_IP_VS_IPV6 1321 if (sk->sk_family == AF_INET6) { 1322 struct ipv6_pinfo *np = inet6_sk(sk); 1323 1324 /* IPV6_MULTICAST_HOPS */ 1325 WRITE_ONCE(np->mcast_hops, ttl); 1326 } 1327#endif 1328 release_sock(sk); 1329} 1330 1331/* Control fragmentation of messages */ 1332static void set_mcast_pmtudisc(struct sock *sk, int val) 1333{ 1334 struct inet_sock *inet = inet_sk(sk); 1335 1336 /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */ 1337 lock_sock(sk); 1338 
WRITE_ONCE(inet->pmtudisc, val); 1339#ifdef CONFIG_IP_VS_IPV6 1340 if (sk->sk_family == AF_INET6) { 1341 struct ipv6_pinfo *np = inet6_sk(sk); 1342 1343 /* IPV6_MTU_DISCOVER */ 1344 WRITE_ONCE(np->pmtudisc, val); 1345 } 1346#endif 1347 release_sock(sk); 1348} 1349 1350/* 1351 * Specifiy default interface for outgoing multicasts 1352 */ 1353static int set_mcast_if(struct sock *sk, struct net_device *dev) 1354{ 1355 struct inet_sock *inet = inet_sk(sk); 1356 1357 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1358 return -EINVAL; 1359 1360 lock_sock(sk); 1361 inet->mc_index = dev->ifindex; 1362 /* inet->mc_addr = 0; */ 1363#ifdef CONFIG_IP_VS_IPV6 1364 if (sk->sk_family == AF_INET6) { 1365 struct ipv6_pinfo *np = inet6_sk(sk); 1366 1367 /* IPV6_MULTICAST_IF */ 1368 WRITE_ONCE(np->mcast_oif, dev->ifindex); 1369 } 1370#endif 1371 release_sock(sk); 1372 1373 return 0; 1374} 1375 1376 1377/* 1378 * Join a multicast group. 1379 * the group is specified by a class D multicast address 224.0.0.0/8 1380 * in the in_addr structure passed in as a parameter. 
1381 */ 1382static int 1383join_mcast_group(struct sock *sk, struct in_addr *addr, struct net_device *dev) 1384{ 1385 struct ip_mreqn mreq; 1386 int ret; 1387 1388 memset(&mreq, 0, sizeof(mreq)); 1389 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); 1390 1391 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1392 return -EINVAL; 1393 1394 mreq.imr_ifindex = dev->ifindex; 1395 1396 lock_sock(sk); 1397 ret = ip_mc_join_group(sk, &mreq); 1398 release_sock(sk); 1399 1400 return ret; 1401} 1402 1403#ifdef CONFIG_IP_VS_IPV6 1404static int join_mcast_group6(struct sock *sk, struct in6_addr *addr, 1405 struct net_device *dev) 1406{ 1407 int ret; 1408 1409 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1410 return -EINVAL; 1411 1412 lock_sock(sk); 1413 ret = ipv6_sock_mc_join(sk, dev->ifindex, addr); 1414 release_sock(sk); 1415 1416 return ret; 1417} 1418#endif 1419 1420static int bind_mcastif_addr(struct socket *sock, struct net_device *dev) 1421{ 1422 __be32 addr; 1423 struct sockaddr_in sin; 1424 1425 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); 1426 if (!addr) 1427 pr_err("You probably need to specify IP address on " 1428 "multicast interface.\n"); 1429 1430 IP_VS_DBG(7, "binding socket with (%s) %pI4\n", 1431 dev->name, &addr); 1432 1433 /* Now bind the socket with the address of multicast interface */ 1434 sin.sin_family = AF_INET; 1435 sin.sin_addr.s_addr = addr; 1436 sin.sin_port = 0; 1437 1438 return kernel_bind(sock, (struct sockaddr *)&sin, sizeof(sin)); 1439} 1440 1441static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, 1442 struct ipvs_sync_daemon_cfg *c, int id) 1443{ 1444 if (AF_INET6 == c->mcast_af) { 1445 sa->in6 = (struct sockaddr_in6) { 1446 .sin6_family = AF_INET6, 1447 .sin6_port = htons(c->mcast_port + id), 1448 }; 1449 sa->in6.sin6_addr = c->mcast_group.in6; 1450 *salen = sizeof(sa->in6); 1451 } else { 1452 sa->in = (struct sockaddr_in) { 1453 .sin_family = AF_INET, 1454 .sin_port = 
htons(c->mcast_port + id), 1455 }; 1456 sa->in.sin_addr = c->mcast_group.in; 1457 *salen = sizeof(sa->in); 1458 } 1459} 1460 1461/* 1462 * Set up sending multicast socket over UDP 1463 */ 1464static int make_send_sock(struct netns_ipvs *ipvs, int id, 1465 struct net_device *dev, struct socket **sock_ret) 1466{ 1467 /* multicast addr */ 1468 union ipvs_sockaddr mcast_addr; 1469 struct socket *sock; 1470 int result, salen; 1471 1472 /* First create a socket */ 1473 result = sock_create_kern(ipvs->net, ipvs->mcfg.mcast_af, SOCK_DGRAM, 1474 IPPROTO_UDP, &sock); 1475 if (result < 0) { 1476 pr_err("Error during creation of socket; terminating\n"); 1477 goto error; 1478 } 1479 *sock_ret = sock; 1480 result = set_mcast_if(sock->sk, dev); 1481 if (result < 0) { 1482 pr_err("Error setting outbound mcast interface\n"); 1483 goto error; 1484 } 1485 1486 set_mcast_loop(sock->sk, 0); 1487 set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl); 1488 /* Allow fragmentation if MTU changes */ 1489 set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT); 1490 result = sysctl_sync_sock_size(ipvs); 1491 if (result > 0) 1492 set_sock_size(sock->sk, 1, result); 1493 1494 if (AF_INET == ipvs->mcfg.mcast_af) 1495 result = bind_mcastif_addr(sock, dev); 1496 else 1497 result = 0; 1498 if (result < 0) { 1499 pr_err("Error binding address of the mcast interface\n"); 1500 goto error; 1501 } 1502 1503 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); 1504 result = kernel_connect(sock, (struct sockaddr *)&mcast_addr, 1505 salen, 0); 1506 if (result < 0) { 1507 pr_err("Error connecting to the multicast addr\n"); 1508 goto error; 1509 } 1510 1511 return 0; 1512 1513error: 1514 return result; 1515} 1516 1517 1518/* 1519 * Set up receiving multicast socket over UDP 1520 */ 1521static int make_receive_sock(struct netns_ipvs *ipvs, int id, 1522 struct net_device *dev, struct socket **sock_ret) 1523{ 1524 /* multicast addr */ 1525 union ipvs_sockaddr mcast_addr; 1526 struct socket *sock; 1527 int result, salen; 
1528 1529 /* First create a socket */ 1530 result = sock_create_kern(ipvs->net, ipvs->bcfg.mcast_af, SOCK_DGRAM, 1531 IPPROTO_UDP, &sock); 1532 if (result < 0) { 1533 pr_err("Error during creation of socket; terminating\n"); 1534 goto error; 1535 } 1536 *sock_ret = sock; 1537 /* it is equivalent to the REUSEADDR option in user-space */ 1538 sock->sk->sk_reuse = SK_CAN_REUSE; 1539 result = sysctl_sync_sock_size(ipvs); 1540 if (result > 0) 1541 set_sock_size(sock->sk, 0, result); 1542 1543 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); 1544 sock->sk->sk_bound_dev_if = dev->ifindex; 1545 result = kernel_bind(sock, (struct sockaddr *)&mcast_addr, salen); 1546 if (result < 0) { 1547 pr_err("Error binding to the multicast addr\n"); 1548 goto error; 1549 } 1550 1551 /* join the multicast group */ 1552#ifdef CONFIG_IP_VS_IPV6 1553 if (ipvs->bcfg.mcast_af == AF_INET6) 1554 result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr, 1555 dev); 1556 else 1557#endif 1558 result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr, 1559 dev); 1560 if (result < 0) { 1561 pr_err("Error joining to the multicast group\n"); 1562 goto error; 1563 } 1564 1565 return 0; 1566 1567error: 1568 return result; 1569} 1570 1571 1572static int 1573ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) 1574{ 1575 struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; 1576 struct kvec iov; 1577 int len; 1578 1579 iov.iov_base = (void *)buffer; 1580 iov.iov_len = length; 1581 1582 len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); 1583 1584 return len; 1585} 1586 1587static int 1588ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) 1589{ 1590 int msize; 1591 int ret; 1592 1593 msize = ntohs(msg->size); 1594 1595 ret = ip_vs_send_async(sock, (char *)msg, msize); 1596 if (ret >= 0 || ret == -EAGAIN) 1597 return ret; 1598 pr_err("ip_vs_send_async error %d\n", ret); 1599 return 0; 1600} 1601 1602static int 
1603ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) 1604{ 1605 struct msghdr msg = {NULL,}; 1606 struct kvec iov = {buffer, buflen}; 1607 int len; 1608 1609 /* Receive a packet */ 1610 iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, buflen); 1611 len = sock_recvmsg(sock, &msg, MSG_DONTWAIT); 1612 if (len < 0) 1613 return len; 1614 1615 return len; 1616} 1617 1618/* Wakeup the master thread for sending */ 1619static void master_wakeup_work_handler(struct work_struct *work) 1620{ 1621 struct ipvs_master_sync_state *ms = 1622 container_of(work, struct ipvs_master_sync_state, 1623 master_wakeup_work.work); 1624 struct netns_ipvs *ipvs = ms->ipvs; 1625 1626 spin_lock_bh(&ipvs->sync_lock); 1627 if (ms->sync_queue_len && 1628 ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) { 1629 int id = (int)(ms - ipvs->ms); 1630 1631 ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE; 1632 wake_up_process(ipvs->master_tinfo[id].task); 1633 } 1634 spin_unlock_bh(&ipvs->sync_lock); 1635} 1636 1637/* Get next buffer to send */ 1638static inline struct ip_vs_sync_buff * 1639next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) 1640{ 1641 struct ip_vs_sync_buff *sb; 1642 1643 sb = sb_dequeue(ipvs, ms); 1644 if (sb) 1645 return sb; 1646 /* Do not delay entries in buffer for more than 2 seconds */ 1647 return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME); 1648} 1649 1650static int sync_thread_master(void *data) 1651{ 1652 struct ip_vs_sync_thread_data *tinfo = data; 1653 struct netns_ipvs *ipvs = tinfo->ipvs; 1654 struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id]; 1655 struct sock *sk = tinfo->sock->sk; 1656 struct ip_vs_sync_buff *sb; 1657 1658 pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " 1659 "syncid = %d, id = %d\n", 1660 ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id); 1661 1662 for (;;) { 1663 sb = next_sync_buff(ipvs, ms); 1664 if (unlikely(kthread_should_stop())) 1665 break; 1666 if (!sb) { 1667 
schedule_timeout(IPVS_SYNC_CHECK_PERIOD); 1668 continue; 1669 } 1670 while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) { 1671 /* (Ab)use interruptible sleep to avoid increasing 1672 * the load avg. 1673 */ 1674 __wait_event_interruptible(*sk_sleep(sk), 1675 sock_writeable(sk) || 1676 kthread_should_stop()); 1677 if (unlikely(kthread_should_stop())) 1678 goto done; 1679 } 1680 ip_vs_sync_buff_release(sb); 1681 } 1682 1683done: 1684 __set_current_state(TASK_RUNNING); 1685 if (sb) 1686 ip_vs_sync_buff_release(sb); 1687 1688 /* clean up the sync_buff queue */ 1689 while ((sb = sb_dequeue(ipvs, ms))) 1690 ip_vs_sync_buff_release(sb); 1691 __set_current_state(TASK_RUNNING); 1692 1693 /* clean up the current sync_buff */ 1694 sb = get_curr_sync_buff(ipvs, ms, 0); 1695 if (sb) 1696 ip_vs_sync_buff_release(sb); 1697 1698 return 0; 1699} 1700 1701 1702static int sync_thread_backup(void *data) 1703{ 1704 struct ip_vs_sync_thread_data *tinfo = data; 1705 struct netns_ipvs *ipvs = tinfo->ipvs; 1706 struct sock *sk = tinfo->sock->sk; 1707 struct udp_sock *up = udp_sk(sk); 1708 int len; 1709 1710 pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " 1711 "syncid = %d, id = %d\n", 1712 ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id); 1713 1714 while (!kthread_should_stop()) { 1715 wait_event_interruptible(*sk_sleep(sk), 1716 !skb_queue_empty_lockless(&sk->sk_receive_queue) || 1717 !skb_queue_empty_lockless(&up->reader_queue) || 1718 kthread_should_stop()); 1719 1720 /* do we have data now? 
*/ 1721 while (!skb_queue_empty_lockless(&sk->sk_receive_queue) || 1722 !skb_queue_empty_lockless(&up->reader_queue)) { 1723 len = ip_vs_receive(tinfo->sock, tinfo->buf, 1724 ipvs->bcfg.sync_maxlen); 1725 if (len <= 0) { 1726 if (len != -EAGAIN) 1727 pr_err("receiving message error\n"); 1728 break; 1729 } 1730 1731 ip_vs_process_message(ipvs, tinfo->buf, len); 1732 } 1733 } 1734 1735 return 0; 1736} 1737 1738 1739int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, 1740 int state) 1741{ 1742 struct ip_vs_sync_thread_data *ti = NULL, *tinfo; 1743 struct task_struct *task; 1744 struct net_device *dev; 1745 char *name; 1746 int (*threadfn)(void *data); 1747 int id = 0, count, hlen; 1748 int result = -ENOMEM; 1749 u16 mtu, min_mtu; 1750 1751 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 1752 IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n", 1753 sizeof(struct ip_vs_sync_conn_v0)); 1754 1755 /* increase the module use count */ 1756 if (!ip_vs_use_count_inc()) 1757 return -ENOPROTOOPT; 1758 1759 /* Do not hold one mutex and then to block on another */ 1760 for (;;) { 1761 rtnl_lock(); 1762 if (mutex_trylock(&ipvs->sync_mutex)) 1763 break; 1764 rtnl_unlock(); 1765 mutex_lock(&ipvs->sync_mutex); 1766 if (rtnl_trylock()) 1767 break; 1768 mutex_unlock(&ipvs->sync_mutex); 1769 } 1770 1771 if (!ipvs->sync_state) { 1772 count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX); 1773 ipvs->threads_mask = count - 1; 1774 } else 1775 count = ipvs->threads_mask + 1; 1776 1777 if (c->mcast_af == AF_UNSPEC) { 1778 c->mcast_af = AF_INET; 1779 c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP); 1780 } 1781 if (!c->mcast_port) 1782 c->mcast_port = IP_VS_SYNC_PORT; 1783 if (!c->mcast_ttl) 1784 c->mcast_ttl = 1; 1785 1786 dev = __dev_get_by_name(ipvs->net, c->mcast_ifn); 1787 if (!dev) { 1788 pr_err("Unknown mcast interface: %s\n", c->mcast_ifn); 1789 result = -ENODEV; 1790 goto out_early; 1791 } 1792 hlen = (AF_INET6 == 
c->mcast_af) ? 1793 sizeof(struct ipv6hdr) + sizeof(struct udphdr) : 1794 sizeof(struct iphdr) + sizeof(struct udphdr); 1795 mtu = (state == IP_VS_STATE_BACKUP) ? 1796 clamp(dev->mtu, 1500U, 65535U) : 1500U; 1797 min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1; 1798 1799 if (c->sync_maxlen) 1800 c->sync_maxlen = clamp_t(unsigned int, 1801 c->sync_maxlen, min_mtu, 1802 65535 - hlen); 1803 else 1804 c->sync_maxlen = mtu - hlen; 1805 1806 if (state == IP_VS_STATE_MASTER) { 1807 result = -EEXIST; 1808 if (ipvs->ms) 1809 goto out_early; 1810 1811 ipvs->mcfg = *c; 1812 name = "ipvs-m:%d:%d"; 1813 threadfn = sync_thread_master; 1814 } else if (state == IP_VS_STATE_BACKUP) { 1815 result = -EEXIST; 1816 if (ipvs->backup_tinfo) 1817 goto out_early; 1818 1819 ipvs->bcfg = *c; 1820 name = "ipvs-b:%d:%d"; 1821 threadfn = sync_thread_backup; 1822 } else { 1823 result = -EINVAL; 1824 goto out_early; 1825 } 1826 1827 if (state == IP_VS_STATE_MASTER) { 1828 struct ipvs_master_sync_state *ms; 1829 1830 result = -ENOMEM; 1831 ipvs->ms = kcalloc(count, sizeof(ipvs->ms[0]), GFP_KERNEL); 1832 if (!ipvs->ms) 1833 goto out; 1834 ms = ipvs->ms; 1835 for (id = 0; id < count; id++, ms++) { 1836 INIT_LIST_HEAD(&ms->sync_queue); 1837 ms->sync_queue_len = 0; 1838 ms->sync_queue_delay = 0; 1839 INIT_DELAYED_WORK(&ms->master_wakeup_work, 1840 master_wakeup_work_handler); 1841 ms->ipvs = ipvs; 1842 } 1843 } 1844 result = -ENOMEM; 1845 ti = kcalloc(count, sizeof(struct ip_vs_sync_thread_data), 1846 GFP_KERNEL); 1847 if (!ti) 1848 goto out; 1849 1850 for (id = 0; id < count; id++) { 1851 tinfo = &ti[id]; 1852 tinfo->ipvs = ipvs; 1853 if (state == IP_VS_STATE_BACKUP) { 1854 result = -ENOMEM; 1855 tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen, 1856 GFP_KERNEL); 1857 if (!tinfo->buf) 1858 goto out; 1859 } 1860 tinfo->id = id; 1861 if (state == IP_VS_STATE_MASTER) 1862 result = make_send_sock(ipvs, id, dev, &tinfo->sock); 1863 else 1864 result = make_receive_sock(ipvs, id, dev, &tinfo->sock); 1865 if 
(result < 0) 1866 goto out; 1867 1868 task = kthread_run(threadfn, tinfo, name, ipvs->gen, id); 1869 if (IS_ERR(task)) { 1870 result = PTR_ERR(task); 1871 goto out; 1872 } 1873 tinfo->task = task; 1874 } 1875 1876 /* mark as active */ 1877 1878 if (state == IP_VS_STATE_MASTER) 1879 ipvs->master_tinfo = ti; 1880 else 1881 ipvs->backup_tinfo = ti; 1882 spin_lock_bh(&ipvs->sync_buff_lock); 1883 ipvs->sync_state |= state; 1884 spin_unlock_bh(&ipvs->sync_buff_lock); 1885 1886 mutex_unlock(&ipvs->sync_mutex); 1887 rtnl_unlock(); 1888 1889 return 0; 1890 1891out: 1892 /* We do not need RTNL lock anymore, release it here so that 1893 * sock_release below can use rtnl_lock to leave the mcast group. 1894 */ 1895 rtnl_unlock(); 1896 id = min(id, count - 1); 1897 if (ti) { 1898 for (tinfo = ti + id; tinfo >= ti; tinfo--) { 1899 if (tinfo->task) 1900 kthread_stop(tinfo->task); 1901 } 1902 } 1903 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 1904 kfree(ipvs->ms); 1905 ipvs->ms = NULL; 1906 } 1907 mutex_unlock(&ipvs->sync_mutex); 1908 1909 /* No more mutexes, release socks */ 1910 if (ti) { 1911 for (tinfo = ti + id; tinfo >= ti; tinfo--) { 1912 if (tinfo->sock) 1913 sock_release(tinfo->sock); 1914 kfree(tinfo->buf); 1915 } 1916 kfree(ti); 1917 } 1918 1919 /* decrease the module use count */ 1920 ip_vs_use_count_dec(); 1921 return result; 1922 1923out_early: 1924 mutex_unlock(&ipvs->sync_mutex); 1925 rtnl_unlock(); 1926 1927 /* decrease the module use count */ 1928 ip_vs_use_count_dec(); 1929 return result; 1930} 1931 1932 1933int stop_sync_thread(struct netns_ipvs *ipvs, int state) 1934{ 1935 struct ip_vs_sync_thread_data *ti, *tinfo; 1936 int id; 1937 int retc = -EINVAL; 1938 1939 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 1940 1941 mutex_lock(&ipvs->sync_mutex); 1942 if (state == IP_VS_STATE_MASTER) { 1943 retc = -ESRCH; 1944 if (!ipvs->ms) 1945 goto err; 1946 ti = ipvs->master_tinfo; 1947 1948 /* 1949 * The lock synchronizes with sb_queue_tail(), so 
that we don't 1950 * add sync buffers to the queue, when we are already in 1951 * progress of stopping the master sync daemon. 1952 */ 1953 1954 spin_lock_bh(&ipvs->sync_buff_lock); 1955 spin_lock(&ipvs->sync_lock); 1956 ipvs->sync_state &= ~IP_VS_STATE_MASTER; 1957 spin_unlock(&ipvs->sync_lock); 1958 spin_unlock_bh(&ipvs->sync_buff_lock); 1959 1960 retc = 0; 1961 for (id = ipvs->threads_mask; id >= 0; id--) { 1962 struct ipvs_master_sync_state *ms = &ipvs->ms[id]; 1963 int ret; 1964 1965 tinfo = &ti[id]; 1966 pr_info("stopping master sync thread %d ...\n", 1967 task_pid_nr(tinfo->task)); 1968 cancel_delayed_work_sync(&ms->master_wakeup_work); 1969 ret = kthread_stop(tinfo->task); 1970 if (retc >= 0) 1971 retc = ret; 1972 } 1973 kfree(ipvs->ms); 1974 ipvs->ms = NULL; 1975 ipvs->master_tinfo = NULL; 1976 } else if (state == IP_VS_STATE_BACKUP) { 1977 retc = -ESRCH; 1978 if (!ipvs->backup_tinfo) 1979 goto err; 1980 ti = ipvs->backup_tinfo; 1981 1982 ipvs->sync_state &= ~IP_VS_STATE_BACKUP; 1983 retc = 0; 1984 for (id = ipvs->threads_mask; id >= 0; id--) { 1985 int ret; 1986 1987 tinfo = &ti[id]; 1988 pr_info("stopping backup sync thread %d ...\n", 1989 task_pid_nr(tinfo->task)); 1990 ret = kthread_stop(tinfo->task); 1991 if (retc >= 0) 1992 retc = ret; 1993 } 1994 ipvs->backup_tinfo = NULL; 1995 } else { 1996 goto err; 1997 } 1998 id = ipvs->threads_mask; 1999 mutex_unlock(&ipvs->sync_mutex); 2000 2001 /* No more mutexes, release socks */ 2002 for (tinfo = ti + id; tinfo >= ti; tinfo--) { 2003 if (tinfo->sock) 2004 sock_release(tinfo->sock); 2005 kfree(tinfo->buf); 2006 } 2007 kfree(ti); 2008 2009 /* decrease the module use count */ 2010 ip_vs_use_count_dec(); 2011 return retc; 2012 2013err: 2014 mutex_unlock(&ipvs->sync_mutex); 2015 return retc; 2016} 2017 2018/* 2019 * Initialize data struct for each netns 2020 */ 2021int __net_init ip_vs_sync_net_init(struct netns_ipvs *ipvs) 2022{ 2023 __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key); 2024 
spin_lock_init(&ipvs->sync_lock); 2025 spin_lock_init(&ipvs->sync_buff_lock); 2026 return 0; 2027} 2028 2029void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs) 2030{ 2031 int retc; 2032 2033 retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER); 2034 if (retc && retc != -ESRCH) 2035 pr_err("Failed to stop Master Daemon\n"); 2036 2037 retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP); 2038 if (retc && retc != -ESRCH) 2039 pr_err("Failed to stop Backup Daemon\n"); 2040} 2041