From 3575792e005dc9994f15ae72c1c6f401d134177d Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Fri, 17 Sep 2010 14:18:16 +0200 Subject: ipvs: extend connection flags to 32 bits - the sync protocol supports 16 bits only, so bits 0..15 should be used only for flags that should go to backup server, bits 16 and above should be allocated for flags not sent to backup. - use IP_VS_CONN_F_DEST_MASK as mask of connection flags in destination that can be changed by user space - allow IP_VS_CONN_F_ONE_PACKET to be set in destination Signed-off-by: Julian Anastasov Signed-off-by: Patrick McHardy --- include/net/ip_vs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index f976885f686f..62698a9c1631 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -366,6 +366,7 @@ struct ip_vs_conn { union nf_inet_addr caddr; /* client address */ union nf_inet_addr vaddr; /* virtual address */ union nf_inet_addr daddr; /* destination address */ + volatile __u32 flags; /* status flags */ __be16 cport; __be16 vport; __be16 dport; @@ -378,7 +379,6 @@ struct ip_vs_conn { /* Flags and state transition */ spinlock_t lock; /* lock for state transition */ - volatile __u16 flags; /* status flags */ volatile __u16 state; /* state info */ volatile __u16 old_state; /* old state, to be used for * state transition triggerd -- cgit v1.2.3 From f4bc17cdd205ebaa3807c2aa973719bb5ce6a5b2 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 21 Sep 2010 17:35:41 +0200 Subject: ipvs: netfilter connection tracking changes Add more code to IPVS to work with Netfilter connection tracking and fix some problems. - Allow IPVS to be compiled without connection tracking as in 2.6.35 and before. This can avoid keeping conntracks for all IPVS connections because this costs memory. ip_vs_ftp still depends on connection tracking and NAT as implemented for 2.6.36. - Add sysctl var "conntrack" to enable connection tracking for all IPVS connections. For loaded IPVS directors it needs tuning of nf_conntrack_max limit. - Add IP_VS_CONN_F_NFCT connection flag to request the connection to use connection tracking. This allows user space to provide this flag, for example, in dest->conn_flags. This can be useful to request connection tracking per real server instead of forcing it for all connections with the "conntrack" sysctl. This flag is set currently only by ip_vs_ftp and of course by "conntrack" sysctl. - Add ip_vs_nfct.c file to hold all connection tracking code, by this way main code should not depend of netfilter conntrack support. - Return back the ip_vs_post_routing handler as in 2.6.35 and use skb->ipvs_property=1 to allow IPVS to work without connection tracking Connection tracking: - most of the code is already in 2.6.36-rc - alter conntrack reply tuple for LVS-NAT connections when first packet from client is forwarded and conntrack state is NEW or RELATED. Additionally, alter reply for RELATED connections from real server, again for packet in original direction. - add IP_VS_XMIT_TUNNEL to confirm conntrack (without altering reply) for LVS-TUN early because we want to call nf_reset. It is needed because we add IPIP header and the original conntrack should be preserved, not destroyed. The transmitted IPIP packets can reuse same conntrack, so we do not set skb->ipvs_property. - try to destroy conntrack when the IPVS connection is destroyed. It is not fatal if conntrack disappears before that, it depends on the used timers. Fix problems from long time: - add skb->ip_summed = CHECKSUM_NONE for the LVS-TUN transmitters Signed-off-by: Julian Anastasov Signed-off-by: Patrick McHardy --- include/linux/ip_vs.h | 2 + include/net/ip_vs.h | 44 +++++- net/netfilter/ipvs/Kconfig | 13 +- net/netfilter/ipvs/Makefile | 5 +- net/netfilter/ipvs/ip_vs_conn.c | 13 ++ net/netfilter/ipvs/ip_vs_core.c | 46 ++++++- net/netfilter/ipvs/ip_vs_ctl.c | 12 ++ net/netfilter/ipvs/ip_vs_ftp.c | 146 +------------------- net/netfilter/ipvs/ip_vs_nfct.c | 292 ++++++++++++++++++++++++++++++++++++++++ net/netfilter/ipvs/ip_vs_xmit.c | 98 +++++++------- 10 files changed, 475 insertions(+), 196 deletions(-) create mode 100644 net/netfilter/ipvs/ip_vs_nfct.c (limited to 'include/net') diff --git a/include/linux/ip_vs.h b/include/linux/ip_vs.h index 003d75f6ffe1..df7728613720 100644 --- a/include/linux/ip_vs.h +++ b/include/linux/ip_vs.h @@ -90,10 +90,12 @@ #define IP_VS_CONN_F_ONE_PACKET 0x2000 /* forward only one packet */ /* Flags that are not sent to backup server start from bit 16 */ +#define IP_VS_CONN_F_NFCT (1 << 16) /* use netfilter conntrack */ /* Connection flags from destination that can be changed by user space */ #define IP_VS_CONN_F_DEST_MASK (IP_VS_CONN_F_FWD_MASK | \ IP_VS_CONN_F_ONE_PACKET | \ + IP_VS_CONN_F_NFCT | \ 0) #define IP_VS_SCHEDNAME_MAXLEN 16 diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 62698a9c1631..e8ec5231eae9 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -25,7 +25,9 @@ #include #include /* for struct ipv6hdr */ #include /* for ipv6_addr_copy */ - +#ifdef CONFIG_IP_VS_NFCT +#include +#endif /* Connections' size value needed by ip_vs_ctl.c */ extern int ip_vs_conn_tab_size; @@ -798,6 +800,7 @@ extern int sysctl_ip_vs_expire_nodest_conn; extern int sysctl_ip_vs_expire_quiescent_template; extern int sysctl_ip_vs_sync_threshold[2]; extern int sysctl_ip_vs_nat_icmp_send; +extern int sysctl_ip_vs_conntrack; extern struct ip_vs_stats ip_vs_stats; extern const struct ctl_path net_vs_ctl_path[]; @@ -955,8 +958,47 @@ static inline __wsum ip_vs_check_diff2(__be16 old, __be16 new, __wsum oldsum) return csum_partial(diff, sizeof(diff), oldsum); } +#ifdef CONFIG_IP_VS_NFCT +/* + * Netfilter connection tracking + * (from ip_vs_nfct.c) + */ +static inline int ip_vs_conntrack_enabled(void) +{ + return sysctl_ip_vs_conntrack; +} + extern void ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin); +extern int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp); +extern void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct, + struct ip_vs_conn *cp, u_int8_t proto, + const __be16 port, int from_rs); +extern void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp); + +#else + +static inline int ip_vs_conntrack_enabled(void) +{ + return 0; +} + +static inline void ip_vs_update_conntrack(struct sk_buff *skb, + struct ip_vs_conn *cp, int outin) +{ +} + +static inline int ip_vs_confirm_conntrack(struct sk_buff *skb, + struct ip_vs_conn *cp) +{ + return NF_ACCEPT; +} + +static inline void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) +{ +} +/* CONFIG_IP_VS_NFCT */ +#endif #endif /* __KERNEL__ */ diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 46a77d5c3887..af3c9f48f2d7 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -3,7 +3,7 @@ # menuconfig IP_VS tristate "IP virtual server support" - depends on NET && INET && NETFILTER && NF_CONNTRACK + depends on NET && INET && NETFILTER ---help--- IP Virtual Server support will let you build a high-performance virtual server based on cluster of two or more real servers. This @@ -235,7 +235,8 @@ comment 'IPVS application helper' config IP_VS_FTP tristate "FTP protocol helper" - depends on IP_VS_PROTO_TCP && NF_NAT + depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT + select IP_VS_NFCT ---help--- FTP is a protocol that transfers IP address and/or port number in the payload. In the virtual server via Network Address Translation, @@ -247,4 +248,12 @@ config IP_VS_FTP If you want to compile it in kernel, say Y. To compile it as a module, choose M here. If unsure, say N. +config IP_VS_NFCT + bool "Netfilter connection tracking" + depends on NF_CONNTRACK + ---help--- + The Netfilter connection tracking support allows the IPVS + connection state to be exported to the Netfilter framework + for filtering purposes. + endif # IP_VS diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile index e3baefd7066e..349fe8819b89 100644 --- a/net/netfilter/ipvs/Makefile +++ b/net/netfilter/ipvs/Makefile @@ -9,10 +9,13 @@ ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_SCTP) += ip_vs_proto_sctp.o +ip_vs-extra_objs-y := +ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o + ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ ip_vs_est.o ip_vs_proto.o \ - $(ip_vs_proto-objs-y) + $(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y) # IPVS core diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 9fe1da7bcf16..a970d9691496 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -721,6 +721,9 @@ static void ip_vs_conn_expire(unsigned long data) if (cp->control) ip_vs_control_del(cp); + if (cp->flags & IP_VS_CONN_F_NFCT) + ip_vs_conn_drop_conntrack(cp); + if (unlikely(cp->app != NULL)) ip_vs_unbind_app(cp); ip_vs_unbind_dest(cp); @@ -816,6 +819,16 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, if (unlikely(pp && atomic_read(&pp->appcnt))) ip_vs_bind_app(cp, pp); + /* + * Allow conntrack to be preserved. By default, conntrack + * is created and destroyed for every packet. + * Sometimes keeping conntrack can be useful for + * IP_VS_CONN_F_ONE_PACKET too. + */ + + if (ip_vs_conntrack_enabled()) + cp->flags |= IP_VS_CONN_F_NFCT; + /* Hash it in the ip_vs_conn_tab finally */ ip_vs_conn_hash(cp); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 319991d4d251..7fbc80d81fe8 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -537,6 +537,23 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, return NF_DROP; } +/* + * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING + * chain and is used to avoid double NAT and confirmation when we do + * not want to keep the conntrack structure + */ +static unsigned int ip_vs_post_routing(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + if (!skb->ipvs_property) + return NF_ACCEPT; + /* The packet was sent from IPVS, exit this chain */ + return NF_STOP; +} + __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) { return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); @@ -695,7 +712,10 @@ static int handle_response_icmp(int af, struct sk_buff *skb, /* do the statistics and put it back */ ip_vs_out_stats(cp, skb); - skb->ipvs_property = 1; + if (!(cp->flags & IP_VS_CONN_F_NFCT)) + skb->ipvs_property = 1; + else + ip_vs_update_conntrack(skb, cp, 0); verdict = NF_ACCEPT; out: @@ -928,17 +948,19 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, ip_vs_out_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); - ip_vs_update_conntrack(skb, cp, 0); + if (!(cp->flags & IP_VS_CONN_F_NFCT)) + skb->ipvs_property = 1; + else + ip_vs_update_conntrack(skb, cp, 0); ip_vs_conn_put(cp); - skb->ipvs_property = 1; - LeaveFunction(11); return NF_ACCEPT; drop: ip_vs_conn_put(cp); kfree_skb(skb); + LeaveFunction(11); return NF_STOLEN; } @@ -1483,6 +1505,14 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_FORWARD, .priority = 99, }, + /* Before the netfilter connection tracking, exit from POST_ROUTING */ + { + .hook = ip_vs_post_routing, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_NAT_SRC-1, + }, #ifdef CONFIG_IP_VS_IPV6 /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be @@ -1511,6 +1541,14 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_FORWARD, .priority = 99, }, + /* Before the netfilter connection tracking, exit from POST_ROUTING */ + { + .hook = ip_vs_post_routing, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_NAT_SRC-1, + }, #endif }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 7bd41d28080c..d2d842f292c6 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -88,6 +88,9 @@ int sysctl_ip_vs_expire_nodest_conn = 0; int sysctl_ip_vs_expire_quiescent_template = 0; int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; int sysctl_ip_vs_nat_icmp_send = 0; +#ifdef CONFIG_IP_VS_NFCT +int sysctl_ip_vs_conntrack; +#endif #ifdef CONFIG_IP_VS_DEBUG @@ -1580,6 +1583,15 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = proc_do_defense_mode, }, +#ifdef CONFIG_IP_VS_NFCT + { + .procname = "conntrack", + .data = &sysctl_ip_vs_conntrack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .procname = "secure_tcp", .data = &sysctl_ip_vs_secure_tcp, diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 7e9af5b76d9e..9cd375f94d61 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -20,17 +20,6 @@ * * Author: Wouter Gadeyne * - * - * Code for ip_vs_expect_related and ip_vs_expect_callback is taken from - * http://www.ssi.bg/~ja/nfct/: - * - * ip_vs_nfct.c: Netfilter connection tracking support for IPVS - * - * Portions Copyright (C) 2001-2002 - * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland. - * - * Portions Copyright (C) 2003-2008 - * Julian Anastasov */ #define KMSG_COMPONENT "IPVS" @@ -58,16 +47,6 @@ #define SERVER_STRING "227 Entering Passive Mode (" #define CLIENT_STRING "PORT " -#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u" -#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \ - &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \ - (T)->dst.protonum - -#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u" -#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \ - &((C)->vaddr.ip), ntohs((C)->vport), \ - &((C)->daddr.ip), ntohs((C)->dport), \ - (C)->protocol, (C)->state /* * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper @@ -85,6 +64,8 @@ static int ip_vs_ftp_pasv; static int ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) { + /* We use connection tracking for the command connection */ + cp->flags |= IP_VS_CONN_F_NFCT; return 0; } @@ -148,120 +129,6 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, return 1; } -/* - * Called from init_conntrack() as expectfn handler. - */ -static void -ip_vs_expect_callback(struct nf_conn *ct, - struct nf_conntrack_expect *exp) -{ - struct nf_conntrack_tuple *orig, new_reply; - struct ip_vs_conn *cp; - - if (exp->tuple.src.l3num != PF_INET) - return; - - /* - * We assume that no NF locks are held before this callback. - * ip_vs_conn_out_get and ip_vs_conn_in_get should match their - * expectations even if they use wildcard values, now we provide the - * actual values from the newly created original conntrack direction. - * The conntrack is confirmed when packet reaches IPVS hooks. - */ - - /* RS->CLIENT */ - orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum, - &orig->src.u3, orig->src.u.tcp.port, - &orig->dst.u3, orig->dst.u.tcp.port); - if (cp) { - /* Change reply CLIENT->RS to CLIENT->VS */ - new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " - FMT_TUPLE ", found inout cp=" FMT_CONN "\n", - __func__, ct, ct->status, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); - new_reply.dst.u3 = cp->vaddr; - new_reply.dst.u.tcp.port = cp->vport; - IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE - ", inout cp=" FMT_CONN "\n", - __func__, ct, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); - goto alter; - } - - /* CLIENT->VS */ - cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum, - &orig->src.u3, orig->src.u.tcp.port, - &orig->dst.u3, orig->dst.u.tcp.port); - if (cp) { - /* Change reply VS->CLIENT to RS->CLIENT */ - new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " - FMT_TUPLE ", found outin cp=" FMT_CONN "\n", - __func__, ct, ct->status, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); - new_reply.src.u3 = cp->daddr; - new_reply.src.u.tcp.port = cp->dport; - IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", " - FMT_TUPLE ", outin cp=" FMT_CONN "\n", - __func__, ct, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); - goto alter; - } - - IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuple=" FMT_TUPLE - " - unknown expect\n", - __func__, ct, ct->status, ARG_TUPLE(orig)); - return; - -alter: - /* Never alter conntrack for non-NAT conns */ - if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ) - nf_conntrack_alter_reply(ct, &new_reply); - ip_vs_conn_put(cp); - return; -} - -/* - * Create NF conntrack expectation with wildcard (optional) source port. - * Then the default callback function will alter the reply and will confirm - * the conntrack entry when the first packet comes. - */ -static void -ip_vs_expect_related(struct sk_buff *skb, struct nf_conn *ct, - struct ip_vs_conn *cp, u_int8_t proto, - const __be16 *port, int from_rs) -{ - struct nf_conntrack_expect *exp; - - BUG_ON(!ct || ct == &nf_conntrack_untracked); - - exp = nf_ct_expect_alloc(ct); - if (!exp) - return; - - if (from_rs) - nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, - nf_ct_l3num(ct), &cp->daddr, &cp->caddr, - proto, port, &cp->cport); - else - nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, - nf_ct_l3num(ct), &cp->caddr, &cp->vaddr, - proto, port, &cp->vport); - - exp->expectfn = ip_vs_expect_callback; - - IP_VS_DBG(7, "%s(): ct=%p, expect tuple=" FMT_TUPLE "\n", - __func__, ct, ARG_TUPLE(&exp->tuple)); - nf_ct_expect_related(exp); - nf_ct_expect_put(exp); -} - /* * Look at outgoing ftp packets to catch the response to a PASV command * from the server (inside-to-outside). @@ -335,7 +202,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, &cp->caddr, 0, &cp->vaddr, port, &from, port, - IP_VS_CONN_F_NO_CPORT, + IP_VS_CONN_F_NO_CPORT | + IP_VS_CONN_F_NFCT, cp->dest); if (!n_cp) return 0; @@ -371,8 +239,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, start-data, end-start, buf, buf_len); if (ret) - ip_vs_expect_related(skb, ct, n_cp, - IPPROTO_TCP, NULL, 0); + ip_vs_nfct_expect_related(skb, ct, n_cp, + IPPROTO_TCP, 0, 0); } /* @@ -487,7 +355,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, &to, port, &cp->vaddr, htons(ntohs(cp->vport)-1), &cp->daddr, htons(ntohs(cp->dport)-1), - 0, + IP_VS_CONN_F_NFCT, cp->dest); if (!n_cp) return 0; diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c new file mode 100644 index 000000000000..c038458d0290 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -0,0 +1,292 @@ +/* + * ip_vs_nfct.c: Netfilter connection tracking support for IPVS + * + * Portions Copyright (C) 2001-2002 + * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland. + * + * Portions Copyright (C) 2003-2010 + * Julian Anastasov + * + * + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + * Authors: + * Ben North + * Julian Anastasov Reorganize and sync with latest kernels + * Hannes Eder Extend NFCT support for FTP, ipvs match + * + * + * Current status: + * + * - provide conntrack confirmation for new and related connections, by + * this way we can see their proper conntrack state in all hooks + * - support for all forwarding methods, not only NAT + * - FTP support (NAT), ability to support other NAT apps with expectations + * - to correctly create expectations for related NAT connections the proper + * NF conntrack support must be already installed, eg. ip_vs_ftp requires + * nf_conntrack_ftp ... iptables_nat for the same ports (but no iptables + * NAT rules are needed) + * - alter reply for NAT when forwarding packet in original direction: + * conntrack from client in NEW or RELATED (Passive FTP DATA) state or + * when RELATED conntrack is created from real server (Active FTP DATA) + * - if iptables_nat is not loaded the Passive FTP will not work (the + * PASV response can not be NAT-ed) but Active FTP should work + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u" +#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \ + &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \ + (T)->dst.protonum + +#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u" +#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \ + &((C)->vaddr.ip), ntohs((C)->vport), \ + &((C)->daddr.ip), ntohs((C)->dport), \ + (C)->protocol, (C)->state + +void +ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + struct nf_conntrack_tuple new_tuple; + + if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) || + nf_ct_is_dying(ct)) + return; + + /* Never alter conntrack for non-NAT conns */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return; + + /* Alter reply only in original direction */ + if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + return; + + /* + * The connection is not yet in the hashtable, so we update it. + * CIP->VIP will remain the same, so leave the tuple in + * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the + * real-server we will see RIP->DIP. + */ + new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + /* + * This will also take care of UDP and other protocols. + */ + if (outin) { + new_tuple.src.u3 = cp->daddr; + if (new_tuple.dst.protonum != IPPROTO_ICMP && + new_tuple.dst.protonum != IPPROTO_ICMPV6) + new_tuple.src.u.tcp.port = cp->dport; + } else { + new_tuple.dst.u3 = cp->vaddr; + if (new_tuple.dst.protonum != IPPROTO_ICMP && + new_tuple.dst.protonum != IPPROTO_ICMPV6) + new_tuple.dst.u.tcp.port = cp->vport; + } + IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, " + "ctinfo=%d, old reply=" FMT_TUPLE + ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n", + __func__, ct, ct->status, ctinfo, + ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple), + ARG_TUPLE(&new_tuple), ARG_CONN(cp)); + nf_conntrack_alter_reply(ct, &new_tuple); +} + +int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp) +{ + return nf_conntrack_confirm(skb); +} + +/* + * Called from init_conntrack() as expectfn handler. + */ +static void ip_vs_nfct_expect_callback(struct nf_conn *ct, + struct nf_conntrack_expect *exp) +{ + struct nf_conntrack_tuple *orig, new_reply; + struct ip_vs_conn *cp; + + if (exp->tuple.src.l3num != PF_INET) + return; + + /* + * We assume that no NF locks are held before this callback. + * ip_vs_conn_out_get and ip_vs_conn_in_get should match their + * expectations even if they use wildcard values, now we provide the + * actual values from the newly created original conntrack direction. + * The conntrack is confirmed when packet reaches IPVS hooks. + */ + + /* RS->CLIENT */ + orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum, + &orig->src.u3, orig->src.u.tcp.port, + &orig->dst.u3, orig->dst.u.tcp.port); + if (cp) { + /* Change reply CLIENT->RS to CLIENT->VS */ + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " + FMT_TUPLE ", found inout cp=" FMT_CONN "\n", + __func__, ct, ct->status, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + new_reply.dst.u3 = cp->vaddr; + new_reply.dst.u.tcp.port = cp->vport; + IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE + ", inout cp=" FMT_CONN "\n", + __func__, ct, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + goto alter; + } + + /* CLIENT->VS */ + cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum, + &orig->src.u3, orig->src.u.tcp.port, + &orig->dst.u3, orig->dst.u.tcp.port); + if (cp) { + /* Change reply VS->CLIENT to RS->CLIENT */ + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " + FMT_TUPLE ", found outin cp=" FMT_CONN "\n", + __func__, ct, ct->status, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + new_reply.src.u3 = cp->daddr; + new_reply.src.u.tcp.port = cp->dport; + IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " + FMT_TUPLE ", outin cp=" FMT_CONN "\n", + __func__, ct, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + goto alter; + } + + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE + " - unknown expect\n", + __func__, ct, ct->status, ARG_TUPLE(orig)); + return; + +alter: + /* Never alter conntrack for non-NAT conns */ + if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ) + nf_conntrack_alter_reply(ct, &new_reply); + ip_vs_conn_put(cp); + return; +} + +/* + * Create NF conntrack expectation with wildcard (optional) source port. + * Then the default callback function will alter the reply and will confirm + * the conntrack entry when the first packet comes. + * Use port 0 to expect connection from any port. + */ +void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct, + struct ip_vs_conn *cp, u_int8_t proto, + const __be16 port, int from_rs) +{ + struct nf_conntrack_expect *exp; + + if (ct == NULL || nf_ct_is_untracked(ct)) + return; + + exp = nf_ct_expect_alloc(ct); + if (!exp) + return; + + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + from_rs ? &cp->daddr : &cp->caddr, + from_rs ? &cp->caddr : &cp->vaddr, + proto, port ? &port : NULL, + from_rs ? &cp->cport : &cp->vport); + + exp->expectfn = ip_vs_nfct_expect_callback; + + IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&exp->tuple)); + nf_ct_expect_related(exp); + nf_ct_expect_put(exp); +} +EXPORT_SYMBOL(ip_vs_nfct_expect_related); + +/* + * Our connection was terminated, try to drop the conntrack immediately + */ +void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + struct nf_conntrack_tuple tuple; + + if (!cp->cport) + return; + + tuple = (struct nf_conntrack_tuple) { + .dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } }; + tuple.src.u3 = cp->caddr; + tuple.src.u.all = cp->cport; + tuple.src.l3num = cp->af; + tuple.dst.u3 = cp->vaddr; + tuple.dst.u.all = cp->vport; + + IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE + " for conn " FMT_CONN "\n", + __func__, ARG_TUPLE(&tuple), ARG_CONN(cp)); + + h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple); + if (h) { + ct = nf_ct_tuplehash_to_ctrack(h); + /* Show what happens instead of calling nf_ct_kill() */ + if (del_timer(&ct->timeout)) { + IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&tuple)); + if (ct->timeout.function) + ct->timeout.function(ct->timeout.data); + } else { + IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&tuple)); + } + nf_ct_put(ct); + } else { + IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n", + __func__, ARG_TUPLE(&tuple)); + } +} + diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 49df6bea6a2d..8817afa34e6a 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -194,12 +193,37 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) dst_release(old_dst); } -#define IP_VS_XMIT(pf, skb, rt) \ +#define IP_VS_XMIT_TUNNEL(skb, cp) \ +({ \ + int __ret = NF_ACCEPT; \ + \ + if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \ + __ret = ip_vs_confirm_conntrack(skb, cp); \ + if (__ret == NF_ACCEPT) { \ + nf_reset(skb); \ + (skb)->ip_summed = CHECKSUM_NONE; \ + } \ + __ret; \ +}) + +#define IP_VS_XMIT_NAT(pf, skb, cp) \ do { \ - (skb)->ipvs_property = 1; \ + if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ + (skb)->ipvs_property = 1; \ + else \ + ip_vs_update_conntrack(skb, cp, 1); \ skb_forward_csum(skb); \ NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ - (rt)->dst.dev, dst_output); \ + skb_dst(skb)->dev, dst_output); \ +} while (0) + +#define IP_VS_XMIT(pf, skb, cp) \ +do { \ + if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ + (skb)->ipvs_property = 1; \ + skb_forward_csum(skb); \ + NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ + skb_dst(skb)->dev, dst_output); \ } while (0) @@ -271,7 +295,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, rt); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp); LeaveFunction(10); return NF_STOLEN; @@ -335,7 +359,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, rt); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp); LeaveFunction(10); return NF_STOLEN; @@ -349,36 +373,6 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, } #endif -void -ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) -{ - struct nf_conn *ct = (struct nf_conn *)skb->nfct; - struct nf_conntrack_tuple new_tuple; - - if (ct == NULL || nf_ct_is_untracked(ct) || nf_ct_is_confirmed(ct)) - return; - - /* - * The connection is not yet in the hashtable, so we update it. - * CIP->VIP will remain the same, so leave the tuple in - * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the - * real-server we will see RIP->DIP. - */ - new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - if (outin) - new_tuple.src.u3 = cp->daddr; - else - new_tuple.dst.u3 = cp->vaddr; - /* - * This will also take care of UDP and other protocols. - */ - if (outin) - new_tuple.src.u.tcp.port = cp->dport; - else - new_tuple.dst.u.tcp.port = cp->vport; - nf_conntrack_alter_reply(ct, &new_tuple); -} - /* * NAT transmitter (only for outside-to-inside nat forwarding) * Not used for related ICMP @@ -434,8 +428,6 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); - ip_vs_update_conntrack(skb, cp, 1); - /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still MTU problem. */ @@ -443,7 +435,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, rt); + IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp); LeaveFunction(10); return NF_STOLEN; @@ -451,8 +443,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, tx_error_icmp: dst_link_failure(skb); tx_error: - LeaveFunction(10); kfree_skb(skb); + LeaveFunction(10); return NF_STOLEN; tx_error_put: ip_rt_put(rt); @@ -512,8 +504,6 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); - ip_vs_update_conntrack(skb, cp, 1); - /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still MTU problem. */ @@ -521,7 +511,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, rt); + IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp); LeaveFunction(10); return NF_STOLEN; @@ -571,6 +561,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int mtu; + int ret; EnterFunction(10); @@ -655,7 +646,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - ip_local_out(skb); + ret = IP_VS_XMIT_TUNNEL(skb, cp); + if (ret == NF_ACCEPT) + ip_local_out(skb); + else if (ret == NF_DROP) + kfree_skb(skb); LeaveFunction(10); @@ -681,6 +676,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ipv6hdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int mtu; + int ret; EnterFunction(10); @@ -761,7 +757,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - ip6_local_out(skb); + ret = IP_VS_XMIT_TUNNEL(skb, cp); + if (ret == NF_ACCEPT) + ip6_local_out(skb); + else if (ret == NF_DROP) + kfree_skb(skb); LeaveFunction(10); @@ -820,7 +820,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, rt); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp); LeaveFunction(10); return NF_STOLEN; @@ -873,7 +873,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, rt); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp); LeaveFunction(10); return NF_STOLEN; @@ -947,7 +947,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, rt); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp); rc = NF_STOLEN; goto out; @@ -1022,7 +1022,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, rt); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp); rc = NF_STOLEN; goto out; -- cgit v1.2.3 From 8a8030407f55a6aaedb51167c1a2383311fcd707 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 21 Sep 2010 17:38:57 +0200 Subject: ipvs: make rerouting optional with snat_reroute Add new sysctl flag "snat_reroute". Recent kernels use ip_route_me_harder() to route LVS-NAT responses properly by VIP when there are multiple paths to client. But setups that do not have alternative default routes can skip this routing lookup by using snat_reroute=0. Signed-off-by: Julian Anastasov Signed-off-by: Patrick McHardy --- include/net/ip_vs.h | 1 + net/netfilter/ipvs/ip_vs_core.c | 37 +++++++++++++++++++++++++++++-------- net/netfilter/ipvs/ip_vs_ctl.c | 8 ++++++++ 3 files changed, 38 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index e8ec5231eae9..3915a4f4cd30 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -801,6 +801,7 @@ extern int sysctl_ip_vs_expire_quiescent_template; extern int sysctl_ip_vs_sync_threshold[2]; extern int sysctl_ip_vs_nat_icmp_send; extern int sysctl_ip_vs_conntrack; +extern int sysctl_ip_vs_snat_reroute; extern struct ip_vs_stats ip_vs_stats; extern const struct ctl_path net_vs_ctl_path[]; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 7fbc80d81fe8..06c388bf4e33 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -929,20 +929,31 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, ip_send_check(ip_hdr(skb)); } + /* + * nf_iterate does not expect change in the skb->dst->dev. + * It looks like it is not fatal to enable this code for hooks + * where our handlers are at the end of the chain list and + * when all next handlers use skb->dst->dev and not outdev. + * It will definitely route properly the inout NAT traffic + * when multiple paths are used. + */ + /* For policy routing, packets originating from this * machine itself may be routed differently to packets * passing through. We want this packet to be routed as * if it came from this machine itself. So re-compute * the routing information. */ + if (sysctl_ip_vs_snat_reroute) { #ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (ip6_route_me_harder(skb) != 0) - goto drop; - } else + if (af == AF_INET6) { + if (ip6_route_me_harder(skb) != 0) + goto drop; + } else #endif - if (ip_route_me_harder(skb, RTN_LOCAL) != 0) - goto drop; + if (ip_route_me_harder(skb, RTN_LOCAL) != 0) + goto drop; + } IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); @@ -991,8 +1002,13 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { int related, verdict = ip_vs_out_icmp_v6(skb, &related); - if (related) + if (related) { + if (sysctl_ip_vs_snat_reroute && + NF_ACCEPT == verdict && + ip6_route_me_harder(skb)) + verdict = NF_DROP; return verdict; + } ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } } else @@ -1000,8 +1016,13 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, if (unlikely(iph.protocol == IPPROTO_ICMP)) { int related, verdict = ip_vs_out_icmp(skb, &related); - if (related) + if (related) { + if (sysctl_ip_vs_snat_reroute && + NF_ACCEPT == verdict && + ip_route_me_harder(skb, RTN_LOCAL)) + verdict = NF_DROP; return verdict; + } ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index d2d842f292c6..e637cd0384b1 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -91,6 +91,7 @@ int sysctl_ip_vs_nat_icmp_send = 0; #ifdef CONFIG_IP_VS_NFCT int sysctl_ip_vs_conntrack; #endif +int sysctl_ip_vs_snat_reroute = 1; #ifdef CONFIG_IP_VS_DEBUG @@ -1599,6 +1600,13 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = proc_do_defense_mode, }, + { + .procname = "snat_reroute", + .data = &sysctl_ip_vs_snat_reroute, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #if 0 { .procname = "timeout_established", -- cgit v1.2.3 From 8b008faf92ac8f7eeb65e8cd36077601af7c46db Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 22 Sep 2010 08:36:59 +0200 Subject: netfilter: ctnetlink: allow to specify the expectation flags With this patch, you can specify the expectation flags for user-space created expectations. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- include/linux/netfilter/nf_conntrack_common.h | 4 ++++ include/linux/netfilter/nfnetlink_conntrack.h | 1 + include/net/netfilter/nf_conntrack_expect.h | 3 --- net/netfilter/nf_conntrack_netlink.c | 8 +++++++- 4 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 1afd18c855ec..fdc50cae861f 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -100,6 +100,10 @@ enum ip_conntrack_expect_events { IPEXP_NEW, /* new expectation */ }; +/* expectation flags */ +#define NF_CT_EXPECT_PERMANENT 0x1 +#define NF_CT_EXPECT_INACTIVE 0x2 + #ifdef __KERNEL__ struct ip_conntrack_stat { unsigned int searched; diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h index 9ed534c991b9..455f0ce4f430 100644 --- a/include/linux/netfilter/nfnetlink_conntrack.h +++ b/include/linux/netfilter/nfnetlink_conntrack.h @@ -161,6 +161,7 @@ enum ctattr_expect { CTA_EXPECT_ID, CTA_EXPECT_HELP_NAME, CTA_EXPECT_ZONE, + CTA_EXPECT_FLAGS, __CTA_EXPECT_MAX }; #define CTA_EXPECT_MAX (__CTA_EXPECT_MAX - 1) diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index 11e815084fcf..96bb42af5fae 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -67,9 +67,6 @@ struct nf_conntrack_expect_policy { #define NF_CT_EXPECT_CLASS_DEFAULT 0 -#define NF_CT_EXPECT_PERMANENT 0x1 -#define NF_CT_EXPECT_INACTIVE 0x2 - int nf_conntrack_expect_init(struct net *net); void nf_conntrack_expect_fini(struct net *net); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 37533a30413b..0804e0ef6500 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1577,6 +1577,7 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb, NLA_PUT_BE32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)); NLA_PUT_BE32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)); + NLA_PUT_BE32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)); helper = rcu_dereference(nfct_help(master)->helper); if (helper) NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name); @@ -1734,6 +1735,7 @@ static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { [CTA_EXPECT_ID] = { .type = NLA_U32 }, [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING }, [CTA_EXPECT_ZONE] = { .type = NLA_U16 }, + [CTA_EXPECT_FLAGS] = { .type = NLA_U32 }, }; static int @@ -1933,9 +1935,13 @@ ctnetlink_create_expect(struct net *net, u16 zone, goto out; } + if (cda[CTA_EXPECT_FLAGS]) + exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS])); + else + exp->flags = 0; + exp->class = 0; exp->expectfn = NULL; - exp->flags = 0; exp->master = ct; exp->helper = NULL; memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple)); -- cgit v1.2.3 From bc01befdcf3e40979eb518085a075cbf0aacede0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 28 Sep 2010 21:06:34 +0200 Subject: netfilter: ctnetlink: add support for user-space expectation helpers This patch adds the basic infrastructure to support user-space expectation helpers via ctnetlink and the netfilter queuing infrastructure NFQUEUE. Basically, this patch: * adds NF_CT_EXPECT_USERSPACE flag to identify user-space created expectations. I have also added a sanity check in __nf_ct_expect_check() to avoid that kernel-space helpers may create an expectation if the master conntrack has no helper assigned. * adds some branches to check if the master conntrack helper exists, otherwise we skip the code that refers to kernel-space helper such as the local expectation list and the expectation policy. * allows to set the timeout for user-space expectations with no helper assigned. * a list of expectations created from user-space that depends on ctnetlink (if this module is removed, they are deleted). * includes USERSPACE in the /proc output for expectations that have been created by a user-space helper. This patch also modifies ctnetlink to skip including the helper name in the Netlink messages if no kernel-space helper is set (since no user-space expectation has not kernel-space kernel assigned). You can access an example user-space FTP conntrack helper at: http://people.netfilter.org/pablo/userspace-conntrack-helpers/nf-ftp-helper-userspace-POC.tar.bz Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- include/linux/netfilter/nf_conntrack_common.h | 1 + include/net/netfilter/nf_conntrack_expect.h | 1 + net/netfilter/nf_conntrack_expect.c | 62 ++++++++++++++++++++------- net/netfilter/nf_conntrack_netlink.c | 46 +++++++++++++------- 4 files changed, 79 insertions(+), 31 deletions(-) (limited to 'include/net') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index fdc50cae861f..23a1a08578a8 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -103,6 +103,7 @@ enum ip_conntrack_expect_events { /* expectation flags */ #define NF_CT_EXPECT_PERMANENT 0x1 #define NF_CT_EXPECT_INACTIVE 0x2 +#define NF_CT_EXPECT_USERSPACE 0x4 #ifdef __KERNEL__ struct ip_conntrack_stat { diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index 96bb42af5fae..416b83844485 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -85,6 +85,7 @@ nf_ct_find_expectation(struct net *net, u16 zone, void nf_ct_unlink_expect(struct nf_conntrack_expect *exp); void nf_ct_remove_expectations(struct nf_conn *ct); void nf_ct_unexpect_related(struct nf_conntrack_expect *exp); +void nf_ct_remove_userspace_expectations(void); /* Allocate space for an expectation: this is mandatory before calling nf_ct_expect_related. You will have to call put afterwards. */ diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index acb29ccaa41f..b30a1f2aac00 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -38,20 +38,23 @@ static int nf_ct_expect_hash_rnd_initted __read_mostly; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; +static HLIST_HEAD(nf_ct_userspace_expect_list); + /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) { struct nf_conn_help *master_help = nfct_help(exp->master); struct net *net = nf_ct_exp_net(exp); - NF_CT_ASSERT(master_help); NF_CT_ASSERT(!timer_pending(&exp->timeout)); hlist_del_rcu(&exp->hnode); net->ct.expect_count--; hlist_del(&exp->lnode); - master_help->expecting[exp->class]--; + if (!(exp->flags & NF_CT_EXPECT_USERSPACE)) + master_help->expecting[exp->class]--; + nf_ct_expect_put(exp); NF_CT_STAT_INC(net, expect_delete); @@ -320,16 +323,21 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) atomic_inc(&exp->use); - hlist_add_head(&exp->lnode, &master_help->expectations); - master_help->expecting[exp->class]++; + if (master_help) { + hlist_add_head(&exp->lnode, &master_help->expectations); + master_help->expecting[exp->class]++; + } else if (exp->flags & NF_CT_EXPECT_USERSPACE) + hlist_add_head(&exp->lnode, &nf_ct_userspace_expect_list); hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]); net->ct.expect_count++; setup_timer(&exp->timeout, nf_ct_expectation_timed_out, (unsigned long)exp); - p = &master_help->helper->expect_policy[exp->class]; - exp->timeout.expires = jiffies + p->timeout * HZ; + if (master_help) { + p = &master_help->helper->expect_policy[exp->class]; + exp->timeout.expires = jiffies + p->timeout * HZ; + } add_timer(&exp->timeout); atomic_inc(&exp->use); @@ -380,7 +388,9 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) unsigned int h; int ret = 1; - if (!master_help->helper) { + /* Don't allow expectations created from kernel-space with no helper */ + if (!(expect->flags & NF_CT_EXPECT_USERSPACE) && + (!master_help || (master_help && !master_help->helper))) { ret = -ESHUTDOWN; goto out; } @@ -398,13 +408,16 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) } } /* Will be over limit? */ - p = &master_help->helper->expect_policy[expect->class]; - if (p->max_expected && - master_help->expecting[expect->class] >= p->max_expected) { - evict_oldest_expect(master, expect); - if (master_help->expecting[expect->class] >= p->max_expected) { - ret = -EMFILE; - goto out; + if (master_help) { + p = &master_help->helper->expect_policy[expect->class]; + if (p->max_expected && + master_help->expecting[expect->class] >= p->max_expected) { + evict_oldest_expect(master, expect); + if (master_help->expecting[expect->class] + >= p->max_expected) { + ret = -EMFILE; + goto out; + } } } @@ -439,6 +452,21 @@ out: } EXPORT_SYMBOL_GPL(nf_ct_expect_related_report); +void nf_ct_remove_userspace_expectations(void) +{ + struct nf_conntrack_expect *exp; + struct hlist_node *n, *next; + + hlist_for_each_entry_safe(exp, n, next, + &nf_ct_userspace_expect_list, lnode) { + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + nf_ct_expect_put(exp); + } + } +} +EXPORT_SYMBOL_GPL(nf_ct_remove_userspace_expectations); + #ifdef CONFIG_PROC_FS struct ct_expect_iter_state { struct seq_net_private p; @@ -529,8 +557,12 @@ static int exp_seq_show(struct seq_file *s, void *v) seq_printf(s, "PERMANENT"); delim = ","; } - if (expect->flags & NF_CT_EXPECT_INACTIVE) + if (expect->flags & NF_CT_EXPECT_INACTIVE) { seq_printf(s, "%sINACTIVE", delim); + delim = ","; + } + if (expect->flags & NF_CT_EXPECT_USERSPACE) + seq_printf(s, "%sUSERSPACE", delim); helper = rcu_dereference(nfct_help(expect->master)->helper); if (helper) { diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 0804e0ef6500..b4077be5f663 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1560,8 +1560,8 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb, const struct nf_conntrack_expect *exp) { struct nf_conn *master = exp->master; - struct nf_conntrack_helper *helper; long timeout = (exp->timeout.expires - jiffies) / HZ; + struct nf_conn_help *help; if (timeout < 0) timeout = 0; @@ -1578,9 +1578,14 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb, NLA_PUT_BE32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)); NLA_PUT_BE32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)); NLA_PUT_BE32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)); - helper = rcu_dereference(nfct_help(master)->helper); - if (helper) - NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name); + help = nfct_help(master); + if (help) { + struct nf_conntrack_helper *helper; + + helper = rcu_dereference(help->helper); + if (helper) + NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name); + } return 0; @@ -1921,24 +1926,32 @@ ctnetlink_create_expect(struct net *net, u16 zone, if (!h) return -ENOENT; ct = nf_ct_tuplehash_to_ctrack(h); - help = nfct_help(ct); - - if (!help || !help->helper) { - /* such conntrack hasn't got any helper, abort */ - err = -EOPNOTSUPP; - goto out; - } - exp = nf_ct_expect_alloc(ct); if (!exp) { err = -ENOMEM; goto out; } + help = nfct_help(ct); + if (!help) { + if (!cda[CTA_EXPECT_TIMEOUT]) { + err = -EINVAL; + goto out; + } + exp->timeout.expires = + jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ; - if (cda[CTA_EXPECT_FLAGS]) - exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS])); - else - exp->flags = 0; + exp->flags = NF_CT_EXPECT_USERSPACE; + if (cda[CTA_EXPECT_FLAGS]) { + exp->flags |= + ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS])); + } + } else { + if (cda[CTA_EXPECT_FLAGS]) { + exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS])); + exp->flags &= ~NF_CT_EXPECT_USERSPACE; + } else + exp->flags = 0; + } exp->class = 0; exp->expectfn = NULL; @@ -2109,6 +2122,7 @@ static void __exit ctnetlink_exit(void) { pr_info("ctnetlink: unregistering from nfnetlink.\n"); + nf_ct_remove_userspace_expectations(); #ifdef CONFIG_NF_CONNTRACK_EVENTS nf_ct_expect_unregister_notifier(&ctnl_notifier_exp); nf_conntrack_unregister_notifier(&ctnl_notifier); -- cgit v1.2.3 From f11017ec2d1859c661f4e2b12c4a8d250e1f47cf Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sun, 22 Aug 2010 21:37:52 +0900 Subject: IPVS: Add struct ip_vs_conn_param Signed-off-by: Simon Horman Acked-by: Julian Anastasov --- include/net/ip_vs.h | 44 +++++--- net/netfilter/ipvs/ip_vs_conn.c | 171 ++++++++++++++++---------------- net/netfilter/ipvs/ip_vs_core.c | 64 ++++++------ net/netfilter/ipvs/ip_vs_ftp.c | 43 ++++---- net/netfilter/ipvs/ip_vs_nfct.c | 12 +-- net/netfilter/ipvs/ip_vs_proto_ah_esp.c | 47 ++++----- net/netfilter/ipvs/ip_vs_sync.c | 33 +++--- 7 files changed, 211 insertions(+), 203 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 3915a4f4cd30..d4da774eca75 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -357,6 +357,15 @@ struct ip_vs_protocol { extern struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto); +struct ip_vs_conn_param { + const union nf_inet_addr *caddr; + const union nf_inet_addr *vaddr; + __be16 cport; + __be16 vport; + __u16 protocol; + u16 af; +}; + /* * IP_VS structure allocated for each dynamically scheduled connection */ @@ -626,13 +635,23 @@ enum { IP_VS_DIR_LAST, }; -extern struct ip_vs_conn *ip_vs_conn_in_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port); +static inline void ip_vs_conn_fill_param(int af, int protocol, + const union nf_inet_addr *caddr, + __be16 cport, + const union nf_inet_addr *vaddr, + __be16 vport, + struct ip_vs_conn_param *p) +{ + p->af = af; + p->protocol = protocol; + p->caddr = caddr; + p->cport = cport; + p->vaddr = vaddr; + p->vport = vport; +} -extern struct ip_vs_conn *ip_vs_ct_in_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port); +struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p); +struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p); struct ip_vs_conn * ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, @@ -640,9 +659,7 @@ struct ip_vs_conn * ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, unsigned int proto_off, int inverse); -extern struct ip_vs_conn *ip_vs_conn_out_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port); +struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p); struct ip_vs_conn * ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, @@ -658,11 +675,10 @@ static inline void __ip_vs_conn_put(struct ip_vs_conn *cp) extern void ip_vs_conn_put(struct ip_vs_conn *cp); extern void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport); -extern struct ip_vs_conn * -ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, - const union nf_inet_addr *vaddr, __be16 vport, - const union nf_inet_addr *daddr, __be16 dport, unsigned flags, - struct ip_vs_dest *dest); +struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, + const union nf_inet_addr *daddr, + __be16 dport, unsigned flags, + struct ip_vs_dest *dest); extern void ip_vs_conn_expire_now(struct ip_vs_conn *cp); extern const char * ip_vs_state_name(__u16 proto, int state); diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index a970d9691496..deeb906a797b 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -218,27 +218,26 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) /* * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. * Called for pkts coming from OUTside-to-INside. - * s_addr, s_port: pkt source address (foreign host) - * d_addr, d_port: pkt dest address (load balancer) + * p->caddr, p->cport: pkt source address (foreign host) + * p->vaddr, p->vport: pkt dest address (load balancer) */ -static inline struct ip_vs_conn *__ip_vs_conn_in_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) +static inline struct ip_vs_conn * +__ip_vs_conn_in_get(const struct ip_vs_conn_param *p) { unsigned hash; struct ip_vs_conn *cp; - hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); + hash = ip_vs_conn_hashkey(p->af, p->protocol, p->caddr, p->cport); ct_read_lock(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == af && - ip_vs_addr_equal(af, s_addr, &cp->caddr) && - ip_vs_addr_equal(af, d_addr, &cp->vaddr) && - s_port == cp->cport && d_port == cp->vport && - ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && - protocol == cp->protocol) { + if (cp->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && + ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && + p->cport == cp->cport && p->vport == cp->vport && + ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && + p->protocol == cp->protocol) { /* HIT */ atomic_inc(&cp->refcnt); ct_read_unlock(hash); @@ -251,71 +250,82 @@ static inline struct ip_vs_conn *__ip_vs_conn_in_get return NULL; } -struct ip_vs_conn *ip_vs_conn_in_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) +struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p) { struct ip_vs_conn *cp; - cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port); - if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) - cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr, - d_port); + cp = __ip_vs_conn_in_get(p); + if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) { + struct ip_vs_conn_param cport_zero_p = *p; + cport_zero_p.cport = 0; + cp = __ip_vs_conn_in_get(&cport_zero_p); + } IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n", - ip_vs_proto_name(protocol), - IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), - IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + ip_vs_proto_name(p->protocol), + IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), cp ? "hit" : "not hit"); return cp; } +static int +ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + unsigned int proto_off, int inverse, + struct ip_vs_conn_param *p) +{ + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return 1; + + if (likely(!inverse)) + ip_vs_conn_fill_param(af, iph->protocol, &iph->saddr, pptr[0], + &iph->daddr, pptr[1], p); + else + ip_vs_conn_fill_param(af, iph->protocol, &iph->daddr, pptr[1], + &iph->saddr, pptr[0], p); + return 0; +} + struct ip_vs_conn * ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse) { - __be16 _ports[2], *pptr; + struct ip_vs_conn_param p; - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) + if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p)) return NULL; - if (likely(!inverse)) - return ip_vs_conn_in_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - else - return ip_vs_conn_in_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); + return ip_vs_conn_in_get(&p); } EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto); /* Get reference to connection template */ -struct ip_vs_conn *ip_vs_ct_in_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) +struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) { unsigned hash; struct ip_vs_conn *cp; - hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); + hash = ip_vs_conn_hashkey(p->af, p->protocol, p->caddr, p->cport); ct_read_lock(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == af && - ip_vs_addr_equal(af, s_addr, &cp->caddr) && + if (cp->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && /* protocol should only be IPPROTO_IP if - * d_addr is a fwmark */ - ip_vs_addr_equal(protocol == IPPROTO_IP ? AF_UNSPEC : af, - d_addr, &cp->vaddr) && - s_port == cp->cport && d_port == cp->vport && + * p->vaddr is a fwmark */ + ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : + p->af, p->vaddr, &cp->vaddr) && + p->cport == cp->cport && p->vport == cp->vport && cp->flags & IP_VS_CONN_F_TEMPLATE && - protocol == cp->protocol) { + p->protocol == cp->protocol) { /* HIT */ atomic_inc(&cp->refcnt); goto out; @@ -327,23 +337,19 @@ struct ip_vs_conn *ip_vs_ct_in_get ct_read_unlock(hash); IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", - ip_vs_proto_name(protocol), - IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), - IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + ip_vs_proto_name(p->protocol), + IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), cp ? "hit" : "not hit"); return cp; } -/* - * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. - * Called for pkts coming from inside-to-OUTside. - * s_addr, s_port: pkt source address (inside host) - * d_addr, d_port: pkt dest address (foreign host) - */ -struct ip_vs_conn *ip_vs_conn_out_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) +/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from inside-to-OUTside. + * p->caddr, p->cport: pkt source address (inside host) + * p->vaddr, p->vport: pkt dest address (foreign host) */ +struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) { unsigned hash; struct ip_vs_conn *cp, *ret=NULL; @@ -351,16 +357,16 @@ struct ip_vs_conn *ip_vs_conn_out_get /* * Check for "full" addressed entries */ - hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port); + hash = ip_vs_conn_hashkey(p->af, p->protocol, p->vaddr, p->vport); ct_read_lock(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == af && - ip_vs_addr_equal(af, d_addr, &cp->caddr) && - ip_vs_addr_equal(af, s_addr, &cp->daddr) && - d_port == cp->cport && s_port == cp->dport && - protocol == cp->protocol) { + if (cp->af == p->af && + ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && + ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) && + p->vport == cp->cport && p->cport == cp->dport && + p->protocol == cp->protocol) { /* HIT */ atomic_inc(&cp->refcnt); ret = cp; @@ -371,9 +377,9 @@ struct ip_vs_conn *ip_vs_conn_out_get ct_read_unlock(hash); IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", - ip_vs_proto_name(protocol), - IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), - IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + ip_vs_proto_name(p->protocol), + IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), ret ? "hit" : "not hit"); return ret; @@ -385,20 +391,12 @@ ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse) { - __be16 _ports[2], *pptr; + struct ip_vs_conn_param p; - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) + if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p)) return NULL; - if (likely(!inverse)) - return ip_vs_conn_out_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - else - return ip_vs_conn_out_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); + return ip_vs_conn_out_get(&p); } EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); @@ -758,13 +756,12 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) * Create a new connection entry and hash it into the ip_vs_conn_tab */ struct ip_vs_conn * -ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, - const union nf_inet_addr *vaddr, __be16 vport, +ip_vs_conn_new(const struct ip_vs_conn_param *p, const union nf_inet_addr *daddr, __be16 dport, unsigned flags, struct ip_vs_dest *dest) { struct ip_vs_conn *cp; - struct ip_vs_protocol *pp = ip_vs_proto_get(proto); + struct ip_vs_protocol *pp = ip_vs_proto_get(p->protocol); cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); if (cp == NULL) { @@ -774,14 +771,14 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, INIT_LIST_HEAD(&cp->c_list); setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); - cp->af = af; - cp->protocol = proto; - ip_vs_addr_copy(af, &cp->caddr, caddr); - cp->cport = cport; - ip_vs_addr_copy(af, &cp->vaddr, vaddr); - cp->vport = vport; + cp->af = p->af; + cp->protocol = p->protocol; + ip_vs_addr_copy(p->af, &cp->caddr, p->caddr); + cp->cport = p->cport; + ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr); + cp->vport = p->vport; /* proto should only be IPPROTO_IP if d_addr is a fwmark */ - ip_vs_addr_copy(proto == IPPROTO_IP ? AF_UNSPEC : af, + ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, &cp->daddr, daddr); cp->dport = dport; cp->flags = flags; @@ -810,7 +807,7 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, /* Bind its packet transmitter */ #ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) + if (p->af == AF_INET6) ip_vs_bind_xmit_v6(cp); else #endif diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 70a5cacf86d5..87602a62458e 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -193,14 +193,11 @@ ip_vs_sched_persist(struct ip_vs_service *svc, struct ip_vs_iphdr iph; struct ip_vs_dest *dest; struct ip_vs_conn *ct; - int protocol = iph.protocol; __be16 dport = 0; /* destination port to forward */ - __be16 vport = 0; /* virtual service port */ unsigned int flags; + struct ip_vs_conn_param param; union nf_inet_addr snet; /* source network of the client, after masking */ - const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; - const union nf_inet_addr *vaddr = &iph.daddr; ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); @@ -232,6 +229,11 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * is created for other persistent services. */ { + int protocol = iph.protocol; + const union nf_inet_addr *vaddr = &iph.daddr; + const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; + __be16 vport = 0; + if (ports[1] == svc->port) { /* non-FTP template: * @@ -253,11 +255,12 @@ ip_vs_sched_persist(struct ip_vs_service *svc, vaddr = &fwmark; } } + ip_vs_conn_fill_param(svc->af, protocol, &snet, 0, + vaddr, vport, ¶m); } /* Check if a template already exists */ - ct = ip_vs_ct_in_get(svc->af, protocol, &snet, 0, vaddr, vport); - + ct = ip_vs_ct_in_get(¶m); if (!ct || !ip_vs_check_template(ct)) { /* No template found or the dest of the connection * template is not available. @@ -272,8 +275,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, dport = dest->port; /* Create a template */ - ct = ip_vs_conn_new(svc->af, protocol, &snet, 0,vaddr, vport, - &dest->addr, dport, + ct = ip_vs_conn_new(¶m, &dest->addr, dport, IP_VS_CONN_F_TEMPLATE, dest); if (ct == NULL) return NULL; @@ -294,12 +296,9 @@ ip_vs_sched_persist(struct ip_vs_service *svc, /* * Create a new connection according to the template */ - cp = ip_vs_conn_new(svc->af, iph.protocol, - &iph.saddr, ports[0], - &iph.daddr, ports[1], - &dest->addr, dport, - flags, - dest); + ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0], + &iph.daddr, ports[1], ¶m); + cp = ip_vs_conn_new(¶m, &dest->addr, dport, flags, dest); if (cp == NULL) { ip_vs_conn_put(ct); return NULL; @@ -366,14 +365,16 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) /* * Create a connection entry. */ - cp = ip_vs_conn_new(svc->af, iph.protocol, - &iph.saddr, pptr[0], - &iph.daddr, pptr[1], - &dest->addr, dest->port ? dest->port : pptr[1], - flags, - dest); - if (cp == NULL) - return NULL; + { + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, + pptr[0], &iph.daddr, pptr[1], &p); + cp = ip_vs_conn_new(&p, &dest->addr, + dest->port ? dest->port : pptr[1], + flags, dest); + if (!cp) + return NULL; + } IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " "d:%s:%u conn->flags:%X conn->refcnt:%d\n", @@ -429,14 +430,17 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, /* create a new connection entry */ IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); - cp = ip_vs_conn_new(svc->af, iph.protocol, - &iph.saddr, pptr[0], - &iph.daddr, pptr[1], - &daddr, 0, - IP_VS_CONN_F_BYPASS | flags, - NULL); - if (cp == NULL) - return NF_DROP; + { + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(svc->af, iph.protocol, + &iph.saddr, pptr[0], + &iph.daddr, pptr[1], &p); + cp = ip_vs_conn_new(&p, &daddr, 0, + IP_VS_CONN_F_BYPASS | flags, + NULL); + if (!cp) + return NF_DROP; + } /* statistics */ ip_vs_in_stats(cp, skb); diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 9cd375f94d61..090889a3b3af 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -195,13 +195,17 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, /* * Now update or create an connection entry for it */ - n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port, - &cp->caddr, 0); + { + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(AF_INET, iph->protocol, + &from, port, &cp->caddr, 0, &p); + n_cp = ip_vs_conn_out_get(&p); + } if (!n_cp) { - n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, - &cp->caddr, 0, - &cp->vaddr, port, - &from, port, + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(AF_INET, IPPROTO_TCP, &cp->caddr, + 0, &cp->vaddr, port, &p); + n_cp = ip_vs_conn_new(&p, &from, port, IP_VS_CONN_F_NO_CPORT | IP_VS_CONN_F_NFCT, cp->dest); @@ -347,21 +351,22 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, ip_vs_proto_name(iph->protocol), &to.ip, ntohs(port), &cp->vaddr.ip, 0); - n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol, - &to, port, - &cp->vaddr, htons(ntohs(cp->vport)-1)); - if (!n_cp) { - n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, - &to, port, + { + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(AF_INET, iph->protocol, &to, port, &cp->vaddr, htons(ntohs(cp->vport)-1), - &cp->daddr, htons(ntohs(cp->dport)-1), - IP_VS_CONN_F_NFCT, - cp->dest); - if (!n_cp) - return 0; + &p); + n_cp = ip_vs_conn_in_get(&p); + if (!n_cp) { + n_cp = ip_vs_conn_new(&p, &cp->daddr, + htons(ntohs(cp->dport)-1), + IP_VS_CONN_F_NFCT, cp->dest); + if (!n_cp) + return 0; - /* add its controller */ - ip_vs_control_add(n_cp, cp); + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } } /* diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index c038458d0290..4680647cd450 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -140,6 +140,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct, { struct nf_conntrack_tuple *orig, new_reply; struct ip_vs_conn *cp; + struct ip_vs_conn_param p; if (exp->tuple.src.l3num != PF_INET) return; @@ -154,9 +155,10 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct, /* RS->CLIENT */ orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum, - &orig->src.u3, orig->src.u.tcp.port, - &orig->dst.u3, orig->dst.u.tcp.port); + ip_vs_conn_fill_param(exp->tuple.src.l3num, orig->dst.protonum, + &orig->src.u3, orig->src.u.tcp.port, + &orig->dst.u3, orig->dst.u.tcp.port, &p); + cp = ip_vs_conn_out_get(&p); if (cp) { /* Change reply CLIENT->RS to CLIENT->VS */ new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; @@ -176,9 +178,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct, } /* CLIENT->VS */ - cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum, - &orig->src.u3, orig->src.u.tcp.port, - &orig->dst.u3, orig->dst.u.tcp.port); + cp = ip_vs_conn_in_get(&p); if (cp) { /* Change reply VS->CLIENT to RS->CLIENT */ new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c index 1892dfc12fdd..8956ef33ea6c 100644 --- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -40,6 +40,19 @@ struct isakmp_hdr { #define PORT_ISAKMP 500 +static void +ah_esp_conn_fill_param_proto(int af, const struct ip_vs_iphdr *iph, + int inverse, struct ip_vs_conn_param *p) +{ + if (likely(!inverse)) + ip_vs_conn_fill_param(af, IPPROTO_UDP, + &iph->saddr, htons(PORT_ISAKMP), + &iph->daddr, htons(PORT_ISAKMP), p); + else + ip_vs_conn_fill_param(af, IPPROTO_UDP, + &iph->daddr, htons(PORT_ISAKMP), + &iph->saddr, htons(PORT_ISAKMP), p); +} static struct ip_vs_conn * ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, @@ -47,21 +60,10 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, int inverse) { struct ip_vs_conn *cp; + struct ip_vs_conn_param p; - if (likely(!inverse)) { - cp = ip_vs_conn_in_get(af, IPPROTO_UDP, - &iph->saddr, - htons(PORT_ISAKMP), - &iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_in_get(af, IPPROTO_UDP, - &iph->daddr, - htons(PORT_ISAKMP), - &iph->saddr, - htons(PORT_ISAKMP)); - } - + ah_esp_conn_fill_param_proto(af, iph, inverse, &p); + cp = ip_vs_conn_in_get(&p); if (!cp) { /* * We are not sure if the packet is from our @@ -87,21 +89,10 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb, int inverse) { struct ip_vs_conn *cp; + struct ip_vs_conn_param p; - if (likely(!inverse)) { - cp = ip_vs_conn_out_get(af, IPPROTO_UDP, - &iph->saddr, - htons(PORT_ISAKMP), - &iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_out_get(af, IPPROTO_UDP, - &iph->daddr, - htons(PORT_ISAKMP), - &iph->saddr, - htons(PORT_ISAKMP)); - } - + ah_esp_conn_fill_param_proto(af, iph, inverse, &p); + cp = ip_vs_conn_out_get(&p); if (!cp) { IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " "%s%s %s->%s\n", diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 7ba06939829f..f68631f75f09 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -301,6 +301,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) struct ip_vs_conn *cp; struct ip_vs_protocol *pp; struct ip_vs_dest *dest; + struct ip_vs_conn_param param; char *p; int i; @@ -370,18 +371,17 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) } } - if (!(flags & IP_VS_CONN_F_TEMPLATE)) - cp = ip_vs_conn_in_get(AF_INET, s->protocol, - (union nf_inet_addr *)&s->caddr, - s->cport, - (union nf_inet_addr *)&s->vaddr, - s->vport); - else - cp = ip_vs_ct_in_get(AF_INET, s->protocol, - (union nf_inet_addr *)&s->caddr, - s->cport, - (union nf_inet_addr *)&s->vaddr, - s->vport); + { + ip_vs_conn_fill_param(AF_INET, s->protocol, + (union nf_inet_addr *)&s->caddr, + s->cport, + (union nf_inet_addr *)&s->vaddr, + s->vport, ¶m); + if (!(flags & IP_VS_CONN_F_TEMPLATE)) + cp = ip_vs_conn_in_get(¶m); + else + cp = ip_vs_ct_in_get(¶m); + } if (!cp) { /* * Find the appropriate destination for the connection. @@ -406,14 +406,9 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) else flags &= ~IP_VS_CONN_F_INACTIVE; } - cp = ip_vs_conn_new(AF_INET, s->protocol, - (union nf_inet_addr *)&s->caddr, - s->cport, - (union nf_inet_addr *)&s->vaddr, - s->vport, + cp = ip_vs_conn_new(¶m, (union nf_inet_addr *)&s->daddr, - s->dport, - flags, dest); + s->dport, flags, dest); if (dest) atomic_dec(&dest->refcnt); if (!cp) { -- cgit v1.2.3 From 85999283a21ab2dd37427fdd8c8e8af57223977c Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sun, 22 Aug 2010 21:37:53 +0900 Subject: IPVS: Add struct ip_vs_pe Signed-off-by: Simon Horman Acked-by: Julian Anastasov --- include/linux/ip_vs.h | 2 ++ include/net/ip_vs.h | 28 ++++++++++++++++- net/netfilter/ipvs/ip_vs_conn.c | 67 ++++++++++++++++++++++++++++++++++------- net/netfilter/ipvs/ip_vs_core.c | 36 +++++++++++++++++----- net/netfilter/ipvs/ip_vs_sync.c | 17 +++++++++-- 5 files changed, 129 insertions(+), 21 deletions(-) (limited to 'include/net') diff --git a/include/linux/ip_vs.h b/include/linux/ip_vs.h index df7728613720..0a9c44d64292 100644 --- a/include/linux/ip_vs.h +++ b/include/linux/ip_vs.h @@ -99,8 +99,10 @@ 0) #define IP_VS_SCHEDNAME_MAXLEN 16 +#define IP_VS_PENAME_MAXLEN 16 #define IP_VS_IFNAME_MAXLEN 16 +#define IP_VS_PEDATA_MAXLEN 255 /* * The struct ip_vs_service_user and struct ip_vs_dest_user are diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index d4da774eca75..b6b309d05e4e 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -364,6 +364,10 @@ struct ip_vs_conn_param { __be16 vport; __u16 protocol; u16 af; + + const struct ip_vs_pe *pe; + char *pe_data; + __u8 pe_data_len; }; /* @@ -416,6 +420,9 @@ struct ip_vs_conn { void *app_data; /* Application private data */ struct ip_vs_seq in_seq; /* incoming seq. struct */ struct ip_vs_seq out_seq; /* outgoing seq. struct */ + + char *pe_data; + __u8 pe_data_len; }; @@ -486,6 +493,9 @@ struct ip_vs_service { struct ip_vs_scheduler *scheduler; /* bound scheduler object */ rwlock_t sched_lock; /* lock sched_data */ void *sched_data; /* scheduler application data */ + + /* alternate persistence engine */ + struct ip_vs_pe *pe; }; @@ -549,6 +559,20 @@ struct ip_vs_scheduler { const struct sk_buff *skb); }; +/* The persistence engine object */ +struct ip_vs_pe { + struct list_head n_list; /* d-linked list head */ + char *name; /* scheduler name */ + atomic_t refcnt; /* reference counter */ + struct module *module; /* THIS_MODULE/NULL */ + + /* get the connection template, if any */ + int (*fill_param)(struct ip_vs_conn_param *p, struct sk_buff *skb); + bool (*ct_match)(const struct ip_vs_conn_param *p, + struct ip_vs_conn *ct); + u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval, + bool inverse); +}; /* * The application module object (a.k.a. app incarnation) @@ -648,6 +672,8 @@ static inline void ip_vs_conn_fill_param(int af, int protocol, p->cport = cport; p->vaddr = vaddr; p->vport = vport; + p->pe = NULL; + p->pe_data = NULL; } struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p); @@ -803,7 +829,7 @@ extern int ip_vs_unbind_scheduler(struct ip_vs_service *svc); extern struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name); extern void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler); extern struct ip_vs_conn * -ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb); +ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb); extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_protocol *pp); diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index deeb906a797b..06da21e97405 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -148,6 +148,42 @@ static unsigned int ip_vs_conn_hashkey(int af, unsigned proto, & ip_vs_conn_tab_mask; } +static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, + bool inverse) +{ + const union nf_inet_addr *addr; + __be16 port; + + if (p->pe && p->pe->hashkey_raw) + return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) & + ip_vs_conn_tab_mask; + + if (likely(!inverse)) { + addr = p->caddr; + port = p->cport; + } else { + addr = p->vaddr; + port = p->vport; + } + + return ip_vs_conn_hashkey(p->af, p->protocol, addr, port); +} + +static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) +{ + struct ip_vs_conn_param p; + + ip_vs_conn_fill_param(cp->af, cp->protocol, &cp->caddr, cp->cport, + NULL, 0, &p); + + if (cp->dest && cp->dest->svc->pe) { + p.pe = cp->dest->svc->pe; + p.pe_data = cp->pe_data; + p.pe_data_len = cp->pe_data_len; + } + + return ip_vs_conn_hashkey_param(&p, false); +} /* * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. @@ -162,7 +198,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) return 0; /* Hash by protocol, client address and port */ - hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); + hash = ip_vs_conn_hashkey_conn(cp); ct_write_lock(hash); spin_lock(&cp->lock); @@ -195,7 +231,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) int ret; /* unhash it and decrease its reference counter */ - hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); + hash = ip_vs_conn_hashkey_conn(cp); ct_write_lock(hash); spin_lock(&cp->lock); @@ -227,7 +263,7 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) unsigned hash; struct ip_vs_conn *cp; - hash = ip_vs_conn_hashkey(p->af, p->protocol, p->caddr, p->cport); + hash = ip_vs_conn_hashkey_param(p, false); ct_read_lock(hash); @@ -312,11 +348,17 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) unsigned hash; struct ip_vs_conn *cp; - hash = ip_vs_conn_hashkey(p->af, p->protocol, p->caddr, p->cport); + hash = ip_vs_conn_hashkey_param(p, false); ct_read_lock(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + if (p->pe && p->pe->ct_match) { + if (p->pe->ct_match(p, cp)) + goto out; + continue; + } + if (cp->af == p->af && ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && /* protocol should only be IPPROTO_IP if @@ -325,15 +367,14 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) p->af, p->vaddr, &cp->vaddr) && p->cport == cp->cport && p->vport == cp->vport && cp->flags & IP_VS_CONN_F_TEMPLATE && - p->protocol == cp->protocol) { - /* HIT */ - atomic_inc(&cp->refcnt); + p->protocol == cp->protocol) goto out; - } } cp = NULL; out: + if (cp) + atomic_inc(&cp->refcnt); ct_read_unlock(hash); IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", @@ -357,7 +398,7 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) /* * Check for "full" addressed entries */ - hash = ip_vs_conn_hashkey(p->af, p->protocol, p->vaddr, p->vport); + hash = ip_vs_conn_hashkey_param(p, true); ct_read_lock(hash); @@ -722,6 +763,7 @@ static void ip_vs_conn_expire(unsigned long data) if (cp->flags & IP_VS_CONN_F_NFCT) ip_vs_conn_drop_conntrack(cp); + kfree(cp->pe_data); if (unlikely(cp->app != NULL)) ip_vs_unbind_app(cp); ip_vs_unbind_dest(cp); @@ -782,6 +824,10 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, &cp->daddr, daddr); cp->dport = dport; cp->flags = flags; + if (flags & IP_VS_CONN_F_TEMPLATE && p->pe_data) { + cp->pe_data = p->pe_data; + cp->pe_data_len = p->pe_data_len; + } spin_lock_init(&cp->lock); /* @@ -832,7 +878,6 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, return cp; } - /* * /proc/net/ip_vs_conn entries */ @@ -848,7 +893,7 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { if (pos-- == 0) { seq->private = &ip_vs_conn_tab[idx]; - return cp; + return cp; } } ct_read_unlock_bh(idx); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 87602a62458e..ab9889380496 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -176,6 +176,19 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction, return pp->state_transition(cp, direction, skb, pp); } +static inline int +ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, + struct sk_buff *skb, int protocol, + const union nf_inet_addr *caddr, __be16 cport, + const union nf_inet_addr *vaddr, __be16 vport, + struct ip_vs_conn_param *p) +{ + ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p); + p->pe = svc->pe; + if (p->pe && p->pe->fill_param) + return p->pe->fill_param(p, skb); + return 0; +} /* * IPVS persistent scheduling function @@ -186,7 +199,7 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction, */ static struct ip_vs_conn * ip_vs_sched_persist(struct ip_vs_service *svc, - const struct sk_buff *skb, + struct sk_buff *skb, __be16 ports[2]) { struct ip_vs_conn *cp = NULL; @@ -255,8 +268,9 @@ ip_vs_sched_persist(struct ip_vs_service *svc, vaddr = &fwmark; } } - ip_vs_conn_fill_param(svc->af, protocol, &snet, 0, - vaddr, vport, ¶m); + if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, + vaddr, vport, ¶m)) + return NULL; } /* Check if a template already exists */ @@ -268,22 +282,30 @@ ip_vs_sched_persist(struct ip_vs_service *svc, dest = svc->scheduler->schedule(svc, skb); if (!dest) { IP_VS_DBG(1, "p-schedule: no dest found.\n"); + kfree(param.pe_data); return NULL; } if (ports[1] == svc->port && svc->port != FTPPORT) dport = dest->port; - /* Create a template */ + /* Create a template + * This adds param.pe_data to the template, + * and thus param.pe_data will be destroyed + * when the template expires */ ct = ip_vs_conn_new(¶m, &dest->addr, dport, IP_VS_CONN_F_TEMPLATE, dest); - if (ct == NULL) + if (ct == NULL) { + kfree(param.pe_data); return NULL; + } ct->timeout = svc->timeout; - } else + } else { /* set destination with the found template */ dest = ct->dest; + kfree(param.pe_data); + } dport = ports[1]; if (dport == svc->port && dest->port) @@ -322,7 +344,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * Protocols supported: TCP, UDP */ struct ip_vs_conn * -ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb) { struct ip_vs_conn *cp = NULL; struct ip_vs_iphdr iph; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index f68631f75f09..ab85aedea17e 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -288,6 +288,16 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp) ip_vs_sync_conn(cp->control); } +static inline int +ip_vs_conn_fill_param_sync(int af, int protocol, + const union nf_inet_addr *caddr, __be16 cport, + const union nf_inet_addr *vaddr, __be16 vport, + struct ip_vs_conn_param *p) +{ + /* XXX: Need to take into account persistence engine */ + ip_vs_conn_fill_param(af, protocol, caddr, cport, vaddr, vport, p); + return 0; +} /* * Process received multicast message and create the corresponding @@ -372,11 +382,14 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) } { - ip_vs_conn_fill_param(AF_INET, s->protocol, + if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol, (union nf_inet_addr *)&s->caddr, s->cport, (union nf_inet_addr *)&s->vaddr, - s->vport, ¶m); + s->vport, ¶m)) { + pr_err("ip_vs_conn_fill_param_sync failed"); + return; + } if (!(flags & IP_VS_CONN_F_TEMPLATE)) cp = ip_vs_conn_in_get(¶m); else -- cgit v1.2.3 From a3c918acd29a96aba3b46bf50136e7953a480d17 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sun, 22 Aug 2010 21:37:53 +0900 Subject: IPVS: Add persistence engine data to /proc/net/ip_vs_conn This shouldn't break compatibility with userspace as the new data is at the end of the line. I have confirmed that this doesn't break ipvsadm, the main (only?) user-space user of this data. Signed-off-by: Simon Horman Acked-by: Julian Anastasov --- include/net/ip_vs.h | 1 + net/netfilter/ipvs/ip_vs_conn.c | 25 ++++++++++++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index b6b309d05e4e..974daf52ba76 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -572,6 +572,7 @@ struct ip_vs_pe { struct ip_vs_conn *ct); u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval, bool inverse); + int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf); }; /* diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 06da21e97405..4adedefdf563 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -950,30 +950,45 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) seq_puts(seq, - "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n"); + "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n"); else { const struct ip_vs_conn *cp = v; + char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; + size_t len = 0; + + if (cp->dest && cp->dest->svc->pe && + cp->dest->svc->pe->show_pe_data) { + pe_data[0] = ' '; + len = strlen(cp->dest->svc->pe->name); + memcpy(pe_data + 1, cp->dest->svc->pe->name, len); + pe_data[len + 1] = ' '; + len += 2; + len += cp->dest->svc->pe->show_pe_data(cp, + pe_data + len); + } + pe_data[len] = '\0'; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) - seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %7lu\n", + seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " + "%pI6 %04X %-11s %7lu%s\n", ip_vs_proto_name(cp->protocol), &cp->caddr.in6, ntohs(cp->cport), &cp->vaddr.in6, ntohs(cp->vport), &cp->daddr.in6, ntohs(cp->dport), ip_vs_state_name(cp->protocol, cp->state), - (cp->timer.expires-jiffies)/HZ); + (cp->timer.expires-jiffies)/HZ, pe_data); else #endif seq_printf(seq, "%-3s %08X %04X %08X %04X" - " %08X %04X %-11s %7lu\n", + " %08X %04X %-11s %7lu%s\n", ip_vs_proto_name(cp->protocol), ntohl(cp->caddr.ip), ntohs(cp->cport), ntohl(cp->vaddr.ip), ntohs(cp->vport), ntohl(cp->daddr.ip), ntohs(cp->dport), ip_vs_state_name(cp->protocol, cp->state), - (cp->timer.expires-jiffies)/HZ); + (cp->timer.expires-jiffies)/HZ, pe_data); } return 0; } -- cgit v1.2.3 From 8be67a6617b3403551fccb67b1c624c659419515 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sun, 22 Aug 2010 21:37:54 +0900 Subject: IPVS: management of persistence engine modules This is based heavily on the scheduler management code Signed-off-by: Simon Horman Acked-by: Julian Anastasov --- include/net/ip_vs.h | 6 ++ net/netfilter/ipvs/Makefile | 2 +- net/netfilter/ipvs/ip_vs_pe.c | 147 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 154 insertions(+), 1 deletion(-) create mode 100644 net/netfilter/ipvs/ip_vs_pe.c (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 974daf52ba76..a6b93a26d4e2 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -796,6 +796,12 @@ extern int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb); extern int ip_vs_app_init(void); extern void ip_vs_app_cleanup(void); +void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe); +void ip_vs_unbind_pe(struct ip_vs_service *svc); +int register_ip_vs_pe(struct ip_vs_pe *pe); +int unregister_ip_vs_pe(struct ip_vs_pe *pe); +extern struct ip_vs_pe *ip_vs_pe_get(const char *name); +extern void ip_vs_pe_put(struct ip_vs_pe *pe); /* * IPVS protocol functions (from ip_vs_proto.c) diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile index 349fe8819b89..4a87bf3c6293 100644 --- a/net/netfilter/ipvs/Makefile +++ b/net/netfilter/ipvs/Makefile @@ -14,7 +14,7 @@ ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ - ip_vs_est.o ip_vs_proto.o \ + ip_vs_est.o ip_vs_proto.o ip_vs_pe.o \ $(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y) diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c new file mode 100644 index 000000000000..3414af70ee12 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -0,0 +1,147 @@ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include + +#include + +/* IPVS pe list */ +static LIST_HEAD(ip_vs_pe); + +/* lock for service table */ +static DEFINE_SPINLOCK(ip_vs_pe_lock); + +/* Bind a service with a pe */ +void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe) +{ + svc->pe = pe; +} + +/* Unbind a service from its pe */ +void ip_vs_unbind_pe(struct ip_vs_service *svc) +{ + svc->pe = NULL; +} + +/* Get pe in the pe list by name */ +static struct ip_vs_pe * +ip_vs_pe_getbyname(const char *pe_name) +{ + struct ip_vs_pe *pe; + + IP_VS_DBG(2, "%s(): pe_name \"%s\"\n", __func__, + pe_name); + + spin_lock_bh(&ip_vs_pe_lock); + + list_for_each_entry(pe, &ip_vs_pe, n_list) { + /* Test and get the modules atomically */ + if (pe->module && + !try_module_get(pe->module)) { + /* This pe is just deleted */ + continue; + } + if (strcmp(pe_name, pe->name)==0) { + /* HIT */ + spin_unlock_bh(&ip_vs_pe_lock); + return pe; + } + if (pe->module) + module_put(pe->module); + } + + spin_unlock_bh(&ip_vs_pe_lock); + return NULL; +} + +/* Lookup pe and try to load it if it doesn't exist */ +struct ip_vs_pe *ip_vs_pe_get(const char *name) +{ + struct ip_vs_pe *pe; + + /* Search for the pe by name */ + pe = ip_vs_pe_getbyname(name); + + /* If pe not found, load the module and search again */ + if (!pe) { + request_module("ip_vs_pe_%s", name); + pe = ip_vs_pe_getbyname(name); + } + + return pe; +} + +void ip_vs_pe_put(struct ip_vs_pe *pe) +{ + if (pe && pe->module) + module_put(pe->module); +} + +/* Register a pe in the pe list */ +int register_ip_vs_pe(struct ip_vs_pe *pe) +{ + struct ip_vs_pe *tmp; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + spin_lock_bh(&ip_vs_pe_lock); + + if (!list_empty(&pe->n_list)) { + spin_unlock_bh(&ip_vs_pe_lock); + ip_vs_use_count_dec(); + pr_err("%s(): [%s] pe already linked\n", + __func__, pe->name); + return -EINVAL; + } + + /* Make sure that the pe with this name doesn't exist + * in the pe list. + */ + list_for_each_entry(tmp, &ip_vs_pe, n_list) { + if (strcmp(tmp->name, pe->name) == 0) { + spin_unlock_bh(&ip_vs_pe_lock); + ip_vs_use_count_dec(); + pr_err("%s(): [%s] pe already existed " + "in the system\n", __func__, pe->name); + return -EINVAL; + } + } + /* Add it into the d-linked pe list */ + list_add(&pe->n_list, &ip_vs_pe); + spin_unlock_bh(&ip_vs_pe_lock); + + pr_info("[%s] pe registered.\n", pe->name); + + return 0; +} +EXPORT_SYMBOL_GPL(register_ip_vs_pe); + +/* Unregister a pe from the pe list */ +int unregister_ip_vs_pe(struct ip_vs_pe *pe) +{ + spin_lock_bh(&ip_vs_pe_lock); + if (list_empty(&pe->n_list)) { + spin_unlock_bh(&ip_vs_pe_lock); + pr_err("%s(): [%s] pe is not in the list. failed\n", + __func__, pe->name); + return -EINVAL; + } + + /* Remove it from the d-linked pe list */ + list_del(&pe->n_list); + spin_unlock_bh(&ip_vs_pe_lock); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + pr_info("[%s] pe unregistered.\n", pe->name); + + return 0; +} +EXPORT_SYMBOL_GPL(unregister_ip_vs_pe); -- cgit v1.2.3 From 0d1e71b04a04b6912e50926b9987c1e72facb1f3 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sun, 22 Aug 2010 21:37:54 +0900 Subject: IPVS: Allow configuration of persistence engines Allow the persistence engine of a virtual service to be set, edited and unset. This feature only works with the netlink user-space interface. Signed-off-by: Simon Horman Acked-by: Julian Anastasov --- include/linux/ip_vs.h | 3 +++ include/net/ip_vs.h | 1 + net/netfilter/ipvs/ip_vs_ctl.c | 57 +++++++++++++++++++++++++++++++++++++++--- 3 files changed, 58 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/linux/ip_vs.h b/include/linux/ip_vs.h index 0a9c44d64292..5f43a3b2e3ad 100644 --- a/include/linux/ip_vs.h +++ b/include/linux/ip_vs.h @@ -336,6 +336,9 @@ enum { IPVS_SVC_ATTR_NETMASK, /* persistent netmask */ IPVS_SVC_ATTR_STATS, /* nested attribute for service stats */ + + IPVS_SVC_ATTR_PE_NAME, /* name of ct retriever */ + __IPVS_SVC_ATTR_MAX, }; diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index a6b93a26d4e2..52fbe2308c38 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -444,6 +444,7 @@ struct ip_vs_service_user_kern { /* virtual service options */ char *sched_name; + char *pe_name; unsigned flags; /* virtual service flags */ unsigned timeout; /* persistent timeout in sec */ u32 netmask; /* persistent netmask */ diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 5f20caf47a1d..a697591d0e23 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1134,6 +1134,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, { int ret = 0; struct ip_vs_scheduler *sched = NULL; + struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; /* increase the module use count */ @@ -1147,6 +1148,16 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, goto out_err; } + if (u->pe_name && *u->pe_name) { + pe = ip_vs_pe_get(u->pe_name); + if (pe == NULL) { + pr_info("persistence engine module ip_vs_pe_%s " + "not found\n", u->pe_name); + ret = -ENOENT; + goto out_err; + } + } + #ifdef CONFIG_IP_VS_IPV6 if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) { ret = -EINVAL; @@ -1184,6 +1195,10 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, goto out_err; sched = NULL; + /* Bind the ct retriever */ + ip_vs_bind_pe(svc, pe); + pe = NULL; + /* Update the virtual service counters */ if (svc->port == FTPPORT) atomic_inc(&ip_vs_ftpsvc_counter); @@ -1215,6 +1230,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, kfree(svc); } ip_vs_scheduler_put(sched); + ip_vs_pe_put(pe); /* decrease the module use count */ ip_vs_use_count_dec(); @@ -1230,6 +1246,7 @@ static int ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) { struct ip_vs_scheduler *sched, *old_sched; + struct ip_vs_pe *pe = NULL, *old_pe = NULL; int ret = 0; /* @@ -1242,6 +1259,17 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) } old_sched = sched; + if (u->pe_name && *u->pe_name) { + pe = ip_vs_pe_get(u->pe_name); + if (pe == NULL) { + pr_info("persistence engine module ip_vs_pe_%s " + "not found\n", u->pe_name); + ret = -ENOENT; + goto out; + } + old_pe = pe; + } + #ifdef CONFIG_IP_VS_IPV6 if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) { ret = -EINVAL; @@ -1293,12 +1321,17 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) } } + old_pe = svc->pe; + if (pe != old_pe) { + ip_vs_unbind_pe(svc); + ip_vs_bind_pe(svc, pe); + } + out_unlock: write_unlock_bh(&__ip_vs_svc_lock); -#ifdef CONFIG_IP_VS_IPV6 out: -#endif ip_vs_scheduler_put(old_sched); + ip_vs_pe_put(old_pe); return ret; } @@ -1312,6 +1345,9 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) { struct ip_vs_dest *dest, *nxt; struct ip_vs_scheduler *old_sched; + struct ip_vs_pe *old_pe; + + pr_info("%s: enter\n", __func__); /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) @@ -1324,6 +1360,11 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) ip_vs_unbind_scheduler(svc); ip_vs_scheduler_put(old_sched); + /* Unbind persistence engine */ + old_pe = svc->pe; + ip_vs_unbind_pe(svc); + ip_vs_pe_put(old_pe); + /* Unbind app inc */ if (svc->inc) { ip_vs_app_inc_put(svc->inc); @@ -2026,6 +2067,8 @@ static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, struct ip_vs_service_user *usvc_compat) { + memset(usvc, 0, sizeof(*usvc)); + usvc->af = AF_INET; usvc->protocol = usvc_compat->protocol; usvc->addr.ip = usvc_compat->addr; @@ -2043,6 +2086,8 @@ static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, struct ip_vs_dest_user *udest_compat) { + memset(udest, 0, sizeof(*udest)); + udest->addr.ip = udest_compat->addr; udest->port = udest_compat->port; udest->conn_flags = udest_compat->conn_flags; @@ -2539,6 +2584,8 @@ static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, .len = IP_VS_SCHEDNAME_MAXLEN }, + [IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING, + .len = IP_VS_PENAME_MAXLEN }, [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, .len = sizeof(struct ip_vs_flags) }, [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, @@ -2615,6 +2662,8 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, } NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name); + if (svc->pe) + NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name); NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags); NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ); NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask); @@ -2741,11 +2790,12 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, /* If a full entry was requested, check for the additional fields */ if (full_entry) { - struct nlattr *nla_sched, *nla_flags, *nla_timeout, + struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout, *nla_netmask; struct ip_vs_flags flags; nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; + nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME]; nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; @@ -2763,6 +2813,7 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, usvc->flags = (usvc->flags & ~flags.mask) | (flags.flags & flags.mask); usvc->sched_name = nla_data(nla_sched); + usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL; usvc->timeout = nla_get_u32(nla_timeout); usvc->netmask = nla_get_u32(nla_netmask); } -- cgit v1.2.3 From 0c200d935346fe0ebde9b6dffbb683dddd166fb9 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 4 Oct 2010 20:53:18 +0200 Subject: netfilter: nf_nat: make find/put static The functions nf_nat_proto_find_get and nf_nat_proto_put are only used internally in nf_nat_core. This might break some out of tree NAT module. Signed-off-by: Stephen Hemminger Signed-off-by: Patrick McHardy --- include/net/netfilter/nf_nat_protocol.h | 3 --- net/ipv4/netfilter/nf_nat_core.c | 6 ++---- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_nat_protocol.h b/include/net/netfilter/nf_nat_protocol.h index df17bac46bf5..93cc90d28e66 100644 --- a/include/net/netfilter/nf_nat_protocol.h +++ b/include/net/netfilter/nf_nat_protocol.h @@ -45,9 +45,6 @@ struct nf_nat_protocol { extern int nf_nat_protocol_register(const struct nf_nat_protocol *proto); extern void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto); -extern const struct nf_nat_protocol *nf_nat_proto_find_get(u_int8_t protocol); -extern void nf_nat_proto_put(const struct nf_nat_protocol *proto); - /* Built-in protocols. */ extern const struct nf_nat_protocol nf_nat_protocol_tcp; extern const struct nf_nat_protocol nf_nat_protocol_udp; diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 2c084b3a8f0c..e2e00c4da883 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -47,7 +47,7 @@ __nf_nat_proto_find(u_int8_t protonum) return rcu_dereference(nf_nat_protos[protonum]); } -const struct nf_nat_protocol * +static const struct nf_nat_protocol * nf_nat_proto_find_get(u_int8_t protonum) { const struct nf_nat_protocol *p; @@ -60,14 +60,12 @@ nf_nat_proto_find_get(u_int8_t protonum) return p; } -EXPORT_SYMBOL_GPL(nf_nat_proto_find_get); -void +static void nf_nat_proto_put(const struct nf_nat_protocol *p) { module_put(p->me); } -EXPORT_SYMBOL_GPL(nf_nat_proto_put); /* We keep an extra hash for each conntrack, for fast searching. */ static inline unsigned int -- cgit v1.2.3 From eecc545856c8a0f27783a440d25f4ceaa1f95ce8 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 4 Oct 2010 23:24:21 +0200 Subject: netfilter: add missing xt_log.h file Forgot to add xt_log.h in commit a8defca0 (netfilter: ipt_LOG: add bufferisation to call printk() once) Signed-off-by: Patrick McHardy --- include/net/netfilter/xt_log.h | 54 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 include/net/netfilter/xt_log.h (limited to 'include/net') diff --git a/include/net/netfilter/xt_log.h b/include/net/netfilter/xt_log.h new file mode 100644 index 000000000000..0dfb34a5b53c --- /dev/null +++ b/include/net/netfilter/xt_log.h @@ -0,0 +1,54 @@ +#define S_SIZE (1024 - (sizeof(unsigned int) + 1)) + +struct sbuff { + unsigned int count; + char buf[S_SIZE + 1]; +}; +static struct sbuff emergency, *emergency_ptr = &emergency; + +static int sb_add(struct sbuff *m, const char *f, ...) +{ + va_list args; + int len; + + if (likely(m->count < S_SIZE)) { + va_start(args, f); + len = vsnprintf(m->buf + m->count, S_SIZE - m->count, f, args); + va_end(args); + if (likely(m->count + len < S_SIZE)) { + m->count += len; + return 0; + } + } + m->count = S_SIZE; + printk_once(KERN_ERR KBUILD_MODNAME " please increase S_SIZE\n"); + return -1; +} + +static struct sbuff *sb_open(void) +{ + struct sbuff *m = kmalloc(sizeof(*m), GFP_ATOMIC); + + if (unlikely(!m)) { + local_bh_disable(); + do { + m = xchg(&emergency_ptr, NULL); + } while (!m); + } + m->count = 0; + return m; +} + +static void sb_close(struct sbuff *m) +{ + m->buf[m->count] = 0; + printk("%s\n", m->buf); + + if (likely(m != &emergency)) + kfree(m); + else { + xchg(&emergency_ptr, m); + local_bh_enable(); + } +} + -- cgit v1.2.3 From ebbf41df4aabb6d506fa18ea8cb4c2b4388a18b9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 19 Oct 2010 10:19:06 +0200 Subject: netfilter: ctnetlink: add expectation deletion events This patch allows to listen to events that inform about expectations destroyed. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- include/linux/netfilter/nf_conntrack_common.h | 1 + include/net/netfilter/nf_conntrack_expect.h | 8 ++++++- net/netfilter/nf_conntrack_expect.c | 6 ++++-- net/netfilter/nf_conntrack_netlink.c | 30 +++++++++++++++++++-------- 4 files changed, 33 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 23a1a08578a8..50cdc2559a5a 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -98,6 +98,7 @@ enum ip_conntrack_events { enum ip_conntrack_expect_events { IPEXP_NEW, /* new expectation */ + IPEXP_DESTROY, /* destroyed expectation */ }; /* expectation flags */ diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index 416b83844485..0f8a8c587532 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -82,7 +82,13 @@ struct nf_conntrack_expect * nf_ct_find_expectation(struct net *net, u16 zone, const struct nf_conntrack_tuple *tuple); -void nf_ct_unlink_expect(struct nf_conntrack_expect *exp); +void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, + u32 pid, int report); +static inline void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) +{ + nf_ct_unlink_expect_report(exp, 0, 0); +} + void nf_ct_remove_expectations(struct nf_conn *ct); void nf_ct_unexpect_related(struct nf_conntrack_expect *exp); void nf_ct_remove_userspace_expectations(void); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index b30a1f2aac00..46e8966912b1 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -41,7 +41,8 @@ static struct kmem_cache *nf_ct_expect_cachep __read_mostly; static HLIST_HEAD(nf_ct_userspace_expect_list); /* nf_conntrack_expect helper functions */ -void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) +void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, + u32 pid, int report) { struct nf_conn_help *master_help = nfct_help(exp->master); struct net *net = nf_ct_exp_net(exp); @@ -55,11 +56,12 @@ void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) if (!(exp->flags & NF_CT_EXPECT_USERSPACE)) master_help->expecting[exp->class]--; + nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report); nf_ct_expect_put(exp); NF_CT_STAT_INC(net, expect_delete); } -EXPORT_SYMBOL_GPL(nf_ct_unlink_expect); +EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report); static void nf_ct_expectation_timed_out(unsigned long ul_expect) { diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index b4077be5f663..62bad229106b 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1632,17 +1632,20 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; struct sk_buff *skb; - unsigned int type; + unsigned int type, group; int flags = 0; - if (events & (1 << IPEXP_NEW)) { + if (events & (1 << IPEXP_DESTROY)) { + type = IPCTNL_MSG_EXP_DELETE; + group = NFNLGRP_CONNTRACK_EXP_DESTROY; + } else if (events & (1 << IPEXP_NEW)) { type = IPCTNL_MSG_EXP_NEW; flags = NLM_F_CREATE|NLM_F_EXCL; + group = NFNLGRP_CONNTRACK_EXP_NEW; } else return 0; - if (!item->report && - !nfnetlink_has_listeners(net, NFNLGRP_CONNTRACK_EXP_NEW)) + if (!item->report && !nfnetlink_has_listeners(net, group)) return 0; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); @@ -1665,8 +1668,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) rcu_read_unlock(); nlmsg_end(skb, nlh); - nfnetlink_send(skb, net, item->pid, NFNLGRP_CONNTRACK_EXP_NEW, - item->report, GFP_ATOMIC); + nfnetlink_send(skb, net, item->pid, group, item->report, GFP_ATOMIC); return 0; nla_put_failure: @@ -1849,7 +1851,13 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, } /* after list removal, usage count == 1 */ - nf_ct_unexpect_related(exp); + spin_lock_bh(&nf_conntrack_lock); + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).pid, + nlmsg_report(nlh)); + nf_ct_expect_put(exp); + } + spin_unlock_bh(&nf_conntrack_lock); /* have to put what we 'get' above. * after this line usage count == 0 */ nf_ct_expect_put(exp); @@ -1866,7 +1874,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, m_help = nfct_help(exp->master); if (!strcmp(m_help->helper->name, name) && del_timer(&exp->timeout)) { - nf_ct_unlink_expect(exp); + nf_ct_unlink_expect_report(exp, + NETLINK_CB(skb).pid, + nlmsg_report(nlh)); nf_ct_expect_put(exp); } } @@ -1880,7 +1890,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, &net->ct.expect_hash[i], hnode) { if (del_timer(&exp->timeout)) { - nf_ct_unlink_expect(exp); + nf_ct_unlink_expect_report(exp, + NETLINK_CB(skb).pid, + nlmsg_report(nlh)); nf_ct_expect_put(exp); } } -- cgit v1.2.3 From 714f095f74582764d629785f03b459a3d0503624 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Tue, 19 Oct 2010 10:38:48 +0200 Subject: ipvs: IPv6 tunnel mode IPv6 encapsulation uses a bad source address for the tunnel. i.e. VIP will be used as local-addr and encap. dst addr. Decapsulation will not accept this. Example LVS (eth1 2003::2:0:1/96, VIP 2003::2:0:100) (eth0 2003::1:0:1/96) RS (ethX 2003::1:0:5/96) tcpdump 2003::2:0:100 > 2003::1:0:5: IP6 (hlim 63, next-header TCP (6) payload length: 40) 2003::3:0:10.50991 > 2003::2:0:100.http: Flags [S], cksum 0x7312 (correct), seq 3006460279, win 5760, options [mss 1440,sackOK,TS val 1904932 ecr 0,nop,wscale 3], length 0 In Linux IPv6 impl. you can't have a tunnel with an any cast address receiving packets (I have not tried to interpret RFC 2473) To have receive capabilities the tunnel must have: - Local address set as multicast addr or an unicast addr - Remote address set as an unicast addr. - Loop back addres or Link local address are not allowed. This causes us to setup a tunnel in the Real Server with the LVS as the remote address, here you can't use the VIP address since it's used inside the tunnel. Solution Use outgoing interface IPv6 address (match against the destination). i.e. use ip6_route_output() to look up the route cache and then use ipv6_dev_get_saddr(...) to set the source address of the encapsulated packet. Additionally, cache the results in new destination fields: dst_cookie and dst_saddr and properly check the returned dst from ip6_route_output. We now add xfrm_lookup call only for the tunneling method where the source address is a local one. Signed-off-by:Hans Schillstrom Signed-off-by: Patrick McHardy --- include/net/ip_vs.h | 4 + net/netfilter/ipvs/ip_vs_xmit.c | 171 +++++++++++++++++++++------------------- 2 files changed, 96 insertions(+), 79 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 52fbe2308c38..6e8a6192e574 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -529,6 +529,10 @@ struct ip_vs_dest { spinlock_t dst_lock; /* lock of dst_cache */ struct dst_entry *dst_cache; /* destination cache entry */ u32 dst_rtos; /* RT_TOS(tos) for dst */ + u32 dst_cookie; +#ifdef CONFIG_IP_VS_IPV6 + struct in6_addr dst_saddr; +#endif /* for virtual service */ struct ip_vs_service *svc; /* service it belongs to */ diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 8817afa34e6a..b0bd8afbf368 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -26,6 +26,7 @@ #include /* for ip_route_output */ #include #include +#include #include #include #include @@ -37,26 +38,27 @@ * Destination cache to speed up outgoing route lookup */ static inline void -__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst) +__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst, + u32 dst_cookie) { struct dst_entry *old_dst; old_dst = dest->dst_cache; dest->dst_cache = dst; dest->dst_rtos = rtos; + dest->dst_cookie = dst_cookie; dst_release(old_dst); } static inline struct dst_entry * -__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) +__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos) { struct dst_entry *dst = dest->dst_cache; if (!dst) return NULL; - if ((dst->obsolete - || (dest->af == AF_INET && rtos != dest->dst_rtos)) && - dst->ops->check(dst, cookie) == NULL) { + if ((dst->obsolete || rtos != dest->dst_rtos) && + dst->ops->check(dst, dest->dst_cookie) == NULL) { dest->dst_cache = NULL; dst_release(dst); return NULL; @@ -66,15 +68,16 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) } static struct rtable * -__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) +__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos) { + struct net *net = dev_net(skb->dev); struct rtable *rt; /* Route to the other host */ struct ip_vs_dest *dest = cp->dest; if (dest) { spin_lock(&dest->dst_lock); if (!(rt = (struct rtable *) - __ip_vs_dst_check(dest, rtos, 0))) { + __ip_vs_dst_check(dest, rtos))) { struct flowi fl = { .oif = 0, .nl_u = { @@ -84,13 +87,13 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) .tos = rtos, } }, }; - if (ip_route_output_key(&init_net, &rt, &fl)) { + if (ip_route_output_key(net, &rt, &fl)) { spin_unlock(&dest->dst_lock); IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &dest->addr.ip); return NULL; } - __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst)); + __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0); IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n", &dest->addr.ip, atomic_read(&rt->dst.__refcnt), rtos); @@ -106,7 +109,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) .tos = rtos, } }, }; - if (ip_route_output_key(&init_net, &rt, &fl)) { + if (ip_route_output_key(net, &rt, &fl)) { IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &cp->daddr.ip); return NULL; @@ -117,62 +120,79 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) } #ifdef CONFIG_IP_VS_IPV6 + +static struct dst_entry * +__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, + struct in6_addr *ret_saddr, int do_xfrm) +{ + struct dst_entry *dst; + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip6_u = { + .daddr = *daddr, + }, + }, + }; + + dst = ip6_route_output(net, NULL, &fl); + if (dst->error) + goto out_err; + if (!ret_saddr) + return dst; + if (ipv6_addr_any(&fl.fl6_src) && + ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, + &fl.fl6_dst, 0, &fl.fl6_src) < 0) + goto out_err; + if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0) + goto out_err; + ipv6_addr_copy(ret_saddr, &fl.fl6_src); + return dst; + +out_err: + dst_release(dst); + IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr); + return NULL; +} + static struct rt6_info * -__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp) +__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct in6_addr *ret_saddr, int do_xfrm) { + struct net *net = dev_net(skb->dev); struct rt6_info *rt; /* Route to the other host */ struct ip_vs_dest *dest = cp->dest; + struct dst_entry *dst; if (dest) { spin_lock(&dest->dst_lock); - rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0); + rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0); if (!rt) { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip6_u = { - .daddr = dest->addr.in6, - .saddr = { - .s6_addr32 = - { 0, 0, 0, 0 }, - }, - }, - }, - }; + u32 cookie; - rt = (struct rt6_info *)ip6_route_output(&init_net, - NULL, &fl); - if (!rt) { + dst = __ip_vs_route_output_v6(net, &dest->addr.in6, + &dest->dst_saddr, + do_xfrm); + if (!dst) { spin_unlock(&dest->dst_lock); - IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", - &dest->addr.in6); return NULL; } - __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst)); - IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n", - &dest->addr.in6, + rt = (struct rt6_info *) dst; + cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; + __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie); + IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", + &dest->addr.in6, &dest->dst_saddr, atomic_read(&rt->dst.__refcnt)); } + if (ret_saddr) + ipv6_addr_copy(ret_saddr, &dest->dst_saddr); spin_unlock(&dest->dst_lock); } else { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip6_u = { - .daddr = cp->daddr.in6, - .saddr = { - .s6_addr32 = { 0, 0, 0, 0 }, - }, - }, - }, - }; - - rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); - if (!rt) { - IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", - &cp->daddr.in6); + dst = __ip_vs_route_output_v6(net, &cp->daddr.in6, ret_saddr, + do_xfrm); + if (!dst) return NULL; - } + rt = (struct rt6_info *) dst; } return rt; @@ -248,6 +268,7 @@ int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { + struct net *net = dev_net(skb->dev); struct rtable *rt; /* Route to the other host */ struct iphdr *iph = ip_hdr(skb); u8 tos = iph->tos; @@ -263,7 +284,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - if (ip_route_output_key(&init_net, &rt, &fl)) { + if (ip_route_output_key(net, &rt, &fl)) { IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n", __func__, &iph->daddr); goto tx_error_icmp; @@ -313,25 +334,18 @@ int ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { + struct net *net = dev_net(skb->dev); + struct dst_entry *dst; struct rt6_info *rt; /* Route to the other host */ struct ipv6hdr *iph = ipv6_hdr(skb); int mtu; - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip6_u = { - .daddr = iph->daddr, - .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } }, - }; EnterFunction(10); - rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); - if (!rt) { - IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6\n", - __func__, &iph->daddr); + dst = __ip_vs_route_output_v6(net, &iph->daddr, NULL, 0); + if (!dst) goto tx_error_icmp; - } + rt = (struct rt6_info *) dst; /* MTU checking */ mtu = dst_mtu(&rt->dst); @@ -397,7 +411,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) + if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos)))) goto tx_error_icmp; /* MTU checking */ @@ -472,7 +486,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - rt = __ip_vs_get_out_rt_v6(cp); + rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0); if (!rt) goto tx_error_icmp; @@ -557,7 +571,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct iphdr *old_iph = ip_hdr(skb); u8 tos = old_iph->tos; __be16 df = old_iph->frag_off; - sk_buff_data_t old_transport_header = skb->transport_header; struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int mtu; @@ -572,7 +585,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; } - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) + if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(tos)))) goto tx_error_icmp; tdev = rt->dst.dev; @@ -616,7 +629,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, old_iph = ip_hdr(skb); } - skb->transport_header = old_transport_header; + skb->transport_header = skb->network_header; /* fix old IP header checksum */ ip_send_check(old_iph); @@ -670,9 +683,9 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { struct rt6_info *rt; /* Route to the other host */ + struct in6_addr saddr; /* Source for tunnel */ struct net_device *tdev; /* Device to other host */ struct ipv6hdr *old_iph = ipv6_hdr(skb); - sk_buff_data_t old_transport_header = skb->transport_header; struct ipv6hdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int mtu; @@ -687,17 +700,17 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; } - rt = __ip_vs_get_out_rt_v6(cp); + rt = __ip_vs_get_out_rt_v6(skb, cp, &saddr, 1); if (!rt) goto tx_error_icmp; tdev = rt->dst.dev; mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); - /* TODO IPv6: do we need this check in IPv6? */ - if (mtu < 1280) { + if (mtu < IPV6_MIN_MTU) { dst_release(&rt->dst); - IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__); + IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, + IPV6_MIN_MTU); goto tx_error; } if (skb_dst(skb)) @@ -730,7 +743,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, old_iph = ipv6_hdr(skb); } - skb->transport_header = old_transport_header; + skb->transport_header = skb->network_header; skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); @@ -750,8 +763,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, be16_add_cpu(&iph->payload_len, sizeof(*old_iph)); iph->priority = old_iph->priority; memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); - iph->daddr = rt->rt6i_dst.addr; - iph->saddr = cp->vaddr.in6; /* rt->rt6i_src.addr; */ + ipv6_addr_copy(&iph->daddr, &cp->daddr.in6); + ipv6_addr_copy(&iph->saddr, &saddr); iph->hop_limit = old_iph->hop_limit; /* Another hack: avoid icmp_send in ip_fragment */ @@ -791,7 +804,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) + if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos)))) goto tx_error_icmp; /* MTU checking */ @@ -843,7 +856,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - rt = __ip_vs_get_out_rt_v6(cp); + rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0); if (!rt) goto tx_error_icmp; @@ -919,7 +932,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, * mangle and send the packet here (only for VS/NAT) */ - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos)))) + if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(ip_hdr(skb)->tos)))) goto tx_error_icmp; /* MTU checking */ @@ -993,7 +1006,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, * mangle and send the packet here (only for VS/NAT) */ - rt = __ip_vs_get_out_rt_v6(cp); + rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0); if (!rt) goto tx_error_icmp; -- cgit v1.2.3 From 8b27b10f5863a5b63e46304a71aa01463d1efac4 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:17:20 +0300 Subject: ipvs: optimize checksums for apps Avoid full checksum calculation for apps that can provide info whether csum was broken after payload mangling. For now only ip_vs_ftp mangles payload and it updates the csum, so the full recalculation is avoided for all packets. Add CHECKSUM_UNNECESSARY for snat_handler (TCP and UDP). It is needed to support SNAT from local address for the case when csum is fully recalculated. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 12 ++++++++++-- net/netfilter/ipvs/ip_vs_ftp.c | 7 ++++++- net/netfilter/ipvs/ip_vs_proto_tcp.c | 31 +++++++++++++++++++++++++------ net/netfilter/ipvs/ip_vs_proto_udp.c | 31 +++++++++++++++++++++++++------ 4 files changed, 66 insertions(+), 15 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 6e8a6192e574..adcdba9dd183 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -597,11 +597,19 @@ struct ip_vs_app { __be16 port; /* port number in net order */ atomic_t usecnt; /* usage counter */ - /* output hook: return false if can't linearize. diff set for TCP. */ + /* + * output hook: Process packet in inout direction, diff set for TCP. + * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok, + * 2=Mangled but checksum was not updated + */ int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *, struct sk_buff *, int *diff); - /* input hook: return false if can't linearize. diff set for TCP. */ + /* + * input hook: Process packet in outin direction, diff set for TCP. + * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok, + * 2=Mangled but checksum was not updated + */ int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *, struct sk_buff *, int *diff); diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 090889a3b3af..75455000ad1c 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -242,9 +242,14 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, start-data, end-start, buf, buf_len); - if (ret) + if (ret) { ip_vs_nfct_expect_related(skb, ct, n_cp, IPPROTO_TCP, 0, 0); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_UNNECESSARY; + /* csum is updated */ + ret = 1; + } } /* diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 318d011036db..64dc2954cf78 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -120,6 +120,7 @@ tcp_snat_handler(struct sk_buff *skb, struct tcphdr *tcph; unsigned int tcphoff; int oldlen; + int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -134,13 +135,20 @@ tcp_snat_handler(struct sk_buff *skb, return 0; if (unlikely(cp->app != NULL)) { + int ret; + /* Some checks before mangling */ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; /* Call application helper if needed */ - if (!ip_vs_app_pkt_out(cp, skb)) + if (!(ret = ip_vs_app_pkt_out(cp, skb))) return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - tcphoff; + else + payload_csum = 1; } tcph = (void *)skb_network_header(skb) + tcphoff; @@ -151,12 +159,13 @@ tcp_snat_handler(struct sk_buff *skb, tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, htons(oldlen), htons(skb->len - tcphoff)); - } else if (!cp->app) { + } else if (!payload_csum) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = (cp->app && pp->csum_check) ? + CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ tcph->check = 0; @@ -174,6 +183,7 @@ tcp_snat_handler(struct sk_buff *skb, skb->len - tcphoff, cp->protocol, skb->csum); + skb->ip_summed = CHECKSUM_UNNECESSARY; IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", pp->name, tcph->check, @@ -190,6 +200,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct tcphdr *tcph; unsigned int tcphoff; int oldlen; + int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -204,6 +215,8 @@ tcp_dnat_handler(struct sk_buff *skb, return 0; if (unlikely(cp->app != NULL)) { + int ret; + /* Some checks before mangling */ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; @@ -212,8 +225,13 @@ tcp_dnat_handler(struct sk_buff *skb, * Attempt ip_vs_app call. * It will fix ip_vs_conn and iph ack_seq stuff */ - if (!ip_vs_app_pkt_in(cp, skb)) + if (!(ret = ip_vs_app_pkt_in(cp, skb))) return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - tcphoff; + else + payload_csum = 1; } tcph = (void *)skb_network_header(skb) + tcphoff; @@ -226,12 +244,13 @@ tcp_dnat_handler(struct sk_buff *skb, tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, htons(oldlen), htons(skb->len - tcphoff)); - } else if (!cp->app) { + } else if (!payload_csum) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = (cp->app && pp->csum_check) ? + CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ tcph->check = 0; diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index f9290893bd93..9c558c40bfbb 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -121,6 +121,7 @@ udp_snat_handler(struct sk_buff *skb, struct udphdr *udph; unsigned int udphoff; int oldlen; + int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -135,6 +136,8 @@ udp_snat_handler(struct sk_buff *skb, return 0; if (unlikely(cp->app != NULL)) { + int ret; + /* Some checks before mangling */ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; @@ -142,8 +145,13 @@ udp_snat_handler(struct sk_buff *skb, /* * Call application helper if needed */ - if (!ip_vs_app_pkt_out(cp, skb)) + if (!(ret = ip_vs_app_pkt_out(cp, skb))) return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - udphoff; + else + payload_csum = 1; } udph = (void *)skb_network_header(skb) + udphoff; @@ -156,12 +164,13 @@ udp_snat_handler(struct sk_buff *skb, udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, htons(oldlen), htons(skb->len - udphoff)); - } else if (!cp->app && (udph->check != 0)) { + } else if (!payload_csum && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = (cp->app && pp->csum_check) ? + CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ udph->check = 0; @@ -181,6 +190,7 @@ udp_snat_handler(struct sk_buff *skb, skb->csum); if (udph->check == 0) udph->check = CSUM_MANGLED_0; + skb->ip_summed = CHECKSUM_UNNECESSARY; IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", pp->name, udph->check, (char*)&(udph->check) - (char*)udph); @@ -196,6 +206,7 @@ udp_dnat_handler(struct sk_buff *skb, struct udphdr *udph; unsigned int udphoff; int oldlen; + int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -210,6 +221,8 @@ udp_dnat_handler(struct sk_buff *skb, return 0; if (unlikely(cp->app != NULL)) { + int ret; + /* Some checks before mangling */ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; @@ -218,8 +231,13 @@ udp_dnat_handler(struct sk_buff *skb, * Attempt ip_vs_app call. * It will fix ip_vs_conn */ - if (!ip_vs_app_pkt_in(cp, skb)) + if (!(ret = ip_vs_app_pkt_in(cp, skb))) return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - udphoff; + else + payload_csum = 1; } udph = (void *)skb_network_header(skb) + udphoff; @@ -232,12 +250,13 @@ udp_dnat_handler(struct sk_buff *skb, udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, htons(oldlen), htons(skb->len - udphoff)); - } else if (!cp->app && (udph->check != 0)) { + } else if (!payload_csum && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = (cp->app && pp->csum_check) ? + CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ udph->check = 0; -- cgit v1.2.3 From cf356d69db0afef692cd640917bc70f708c27f14 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:21:07 +0300 Subject: ipvs: switch to notrack mode Change skb->ipvs_property semantic. This is preparation to support ip_vs_out processing in LOCAL_OUT. ipvs_property=1 will be used to avoid expensive lookups for traffic sent by transmitters. Now when conntrack support is not used we call ip_vs_notrack method to avoid problems in OUTPUT and POST_ROUTING hooks instead of exiting POST_ROUTING as before. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 20 +++++++++++++++++++- net/netfilter/ipvs/ip_vs_core.c | 39 ++++----------------------------------- net/netfilter/ipvs/ip_vs_xmit.c | 7 +++++-- 3 files changed, 28 insertions(+), 38 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index adcdba9dd183..0e4618470cee 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -25,7 +25,7 @@ #include #include /* for struct ipv6hdr */ #include /* for ipv6_addr_copy */ -#ifdef CONFIG_IP_VS_NFCT +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #include #endif @@ -1021,6 +1021,24 @@ static inline __wsum ip_vs_check_diff2(__be16 old, __be16 new, __wsum oldsum) return csum_partial(diff, sizeof(diff), oldsum); } +/* + * Forget current conntrack (unconfirmed) and attach notrack entry + */ +static inline void ip_vs_notrack(struct sk_buff *skb) +{ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + + if (!ct || !nf_ct_is_untracked(ct)) { + nf_reset(skb); + skb->nfct = &nf_ct_untracked_get()->ct_general; + skb->nfctinfo = IP_CT_NEW; + nf_conntrack_get(skb->nfct); + } +#endif +} + #ifdef CONFIG_IP_VS_NFCT /* * Netfilter connection tracking diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index e5fef7aef0d4..222453029b9e 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -507,23 +507,6 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, return NF_DROP; } -/* - * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING - * chain and is used to avoid double NAT and confirmation when we do - * not want to keep the conntrack structure - */ -static unsigned int ip_vs_post_routing(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - if (!skb->ipvs_property) - return NF_ACCEPT; - /* The packet was sent from IPVS, exit this chain */ - return NF_STOP; -} - __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) { return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); @@ -682,8 +665,9 @@ static int handle_response_icmp(int af, struct sk_buff *skb, /* do the statistics and put it back */ ip_vs_out_stats(cp, skb); + skb->ipvs_property = 1; if (!(cp->flags & IP_VS_CONN_F_NFCT)) - skb->ipvs_property = 1; + ip_vs_notrack(skb); else ip_vs_update_conntrack(skb, cp, 0); verdict = NF_ACCEPT; @@ -929,8 +913,9 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, ip_vs_out_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); + skb->ipvs_property = 1; if (!(cp->flags & IP_VS_CONN_F_NFCT)) - skb->ipvs_property = 1; + ip_vs_notrack(skb); else ip_vs_update_conntrack(skb, cp, 0); ip_vs_conn_put(cp); @@ -1496,14 +1481,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_FORWARD, .priority = 99, }, - /* Before the netfilter connection tracking, exit from POST_ROUTING */ - { - .hook = ip_vs_post_routing, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_NAT_SRC-1, - }, #ifdef CONFIG_IP_VS_IPV6 /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be @@ -1532,14 +1509,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_FORWARD, .priority = 99, }, - /* Before the netfilter connection tracking, exit from POST_ROUTING */ - { - .hook = ip_vs_post_routing, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP6_PRI_NAT_SRC-1, - }, #endif }; diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index b0bd8afbf368..94b53b441028 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -217,6 +217,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) ({ \ int __ret = NF_ACCEPT; \ \ + (skb)->ipvs_property = 1; \ if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \ __ret = ip_vs_confirm_conntrack(skb, cp); \ if (__ret == NF_ACCEPT) { \ @@ -228,8 +229,9 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) #define IP_VS_XMIT_NAT(pf, skb, cp) \ do { \ + (skb)->ipvs_property = 1; \ if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ - (skb)->ipvs_property = 1; \ + ip_vs_notrack(skb); \ else \ ip_vs_update_conntrack(skb, cp, 1); \ skb_forward_csum(skb); \ @@ -239,8 +241,9 @@ do { \ #define IP_VS_XMIT(pf, skb, cp) \ do { \ + (skb)->ipvs_property = 1; \ if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ - (skb)->ipvs_property = 1; \ + ip_vs_notrack(skb); \ skb_forward_csum(skb); \ NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ skb_dst(skb)->dev, dst_output); \ -- cgit v1.2.3 From 190ecd27cd7294105e3b26ca71663c7d940acbbb Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:24:37 +0300 Subject: ipvs: do not schedule conns from real servers This patch is needed to avoid scheduling of packets from local real server when we add ip_vs_in in LOCAL_OUT hook to support local client. Currently, when ip_vs_in can not find existing connection it tries to create new one by calling ip_vs_schedule. The default indication from ip_vs_schedule was if connection was scheduled to real server. If real server is not available we try to use the bypass forwarding method or to send ICMP error. But in some cases we do not want to use the bypass feature. So, add flag 'ignored' to indicate if the scheduler ignores this packet. Make sure we do not create new connections from replies. We can hit this problem for persistent services and local real server when ip_vs_in is added to LOCAL_OUT hook to handle local clients. Also, make sure ip_vs_schedule ignores SYN packets for Active FTP DATA from local real server. The FTP DATA connection should be created on SYN+ACK from client to assign correct connection daddr. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 3 ++- net/netfilter/ipvs/ip_vs_core.c | 34 ++++++++++++++++++++++++++++++++-- net/netfilter/ipvs/ip_vs_proto_sctp.c | 6 ++++-- net/netfilter/ipvs/ip_vs_proto_tcp.c | 7 +++++-- net/netfilter/ipvs/ip_vs_proto_udp.c | 6 ++++-- 5 files changed, 47 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 0e4618470cee..9d5c1b965304 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -849,7 +849,8 @@ extern int ip_vs_unbind_scheduler(struct ip_vs_service *svc); extern struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name); extern void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler); extern struct ip_vs_conn * -ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb); +ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, + struct ip_vs_protocol *pp, int *ignored); extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_protocol *pp); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 222453029b9e..0090d6d25e95 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -342,7 +342,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * Protocols supported: TCP, UDP */ struct ip_vs_conn * -ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb) +ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, + struct ip_vs_protocol *pp, int *ignored) { struct ip_vs_conn *cp = NULL; struct ip_vs_iphdr iph; @@ -350,16 +351,43 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb) __be16 _ports[2], *pptr; unsigned int flags; + *ignored = 1; ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); if (pptr == NULL) return NULL; + /* + * FTPDATA needs this check when using local real server. + * Never schedule Active FTPDATA connections from real server. + * For LVS-NAT they must be already created. For other methods + * with persistence the connection is created on SYN+ACK. + */ + if (pptr[0] == FTPDATA) { + IP_VS_DBG_PKT(12, pp, skb, 0, "Not scheduling FTPDATA"); + return NULL; + } + + /* + * Do not schedule replies from local real server. It is risky + * for fwmark services but mostly for persistent services. + */ + if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && + (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) && + (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) { + IP_VS_DBG_PKT(12, pp, skb, 0, + "Not scheduling reply for existing connection"); + __ip_vs_conn_put(cp); + return NULL; + } + /* * Persistent service */ - if (svc->flags & IP_VS_SVC_F_PERSISTENT) + if (svc->flags & IP_VS_SVC_F_PERSISTENT) { + *ignored = 0; return ip_vs_sched_persist(svc, skb, pptr); + } /* * Non-persistent service @@ -372,6 +400,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb) return NULL; } + *ignored = 0; + dest = svc->scheduler->schedule(svc, skb); if (dest == NULL) { IP_VS_DBG(1, "Schedule: no dest found.\n"); diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 4c0855cb006e..9ab5232ce019 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -31,6 +31,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, if ((sch->type == SCTP_CID_INIT) && (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, sh->dest))) { + int ignored; + if (ip_vs_todrop()) { /* * It seems that we are very loaded. @@ -44,8 +46,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ - *cpp = ip_vs_schedule(svc, skb); - if (!*cpp) { + *cpp = ip_vs_schedule(svc, skb, pp, &ignored); + if (!*cpp && !ignored) { *verdict = ip_vs_leave(svc, skb, pp); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 64dc2954cf78..85d80a66b492 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -43,9 +43,12 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, return 0; } + /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ if (th->syn && (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, th->dest))) { + int ignored; + if (ip_vs_todrop()) { /* * It seems that we are very loaded. @@ -60,8 +63,8 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ - *cpp = ip_vs_schedule(svc, skb); - if (!*cpp) { + *cpp = ip_vs_schedule(svc, skb, pp, &ignored); + if (!*cpp && !ignored) { *verdict = ip_vs_leave(svc, skb, pp); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 9c558c40bfbb..5d21f08155ed 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -46,6 +46,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, uh->dest); if (svc) { + int ignored; + if (ip_vs_todrop()) { /* * It seems that we are very loaded. @@ -60,8 +62,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ - *cpp = ip_vs_schedule(svc, skb); - if (!*cpp) { + *cpp = ip_vs_schedule(svc, skb, pp, &ignored); + if (!*cpp && !ignored) { *verdict = ip_vs_leave(svc, skb, pp); return 0; } -- cgit v1.2.3 From fc604767613b6d2036cdc35b660bc39451040a47 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:38:15 +0300 Subject: ipvs: changes for local real server This patch deals with local real servers: - Add support for DNAT to local address (different real server port). It needs ip_vs_out hook in LOCAL_OUT for both families because skb->protocol is not set for locally generated packets and can not be used to set 'af'. - Skip packets in ip_vs_in marked with skb->ipvs_property because ip_vs_out processing can be executed in LOCAL_OUT but we still have the conn_out_get check in ip_vs_in. - Ignore packets with inet->nodefrag from local stack - Require skb_dst(skb) != NULL because we use it to get struct net - Add support for changing the route to local IPv4 stack after DNAT depending on the source address type. Local client sets output route and the remote client sets input route. It looks like IPv6 does not need such rerouting because the replies use addresses from initial incoming header, not from skb route. - All transmitters now have strict checks for the destination address type: redirect from non-local address to local real server requires NAT method, local address can not be used as source address when talking to remote real server. - Now LOCALNODE is not set explicitly as forwarding method in real server to allow the connections to provide correct forwarding method to the backup server. Not sure if this breaks tools that expect to see 'Local' real server type. If needed, this can be supported with new flag IP_VS_DEST_F_LOCAL. Now it should be possible connections in backup that lost their fwmark information during sync to be forwarded properly to their daddr, even if it is local address in the backup server. By this way backup could be used as real server for DR or TUN, for NAT there are some restrictions because tuple collisions in conntracks can create problems for the traffic. - Call ip_vs_dst_reset when destination is updated in case some real server IP type is changed between local and remote. [ horms@verge.net.au: removed trailing whitespace ] Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 1 + net/netfilter/ipvs/ip_vs_core.c | 123 ++++++++++-- net/netfilter/ipvs/ip_vs_ctl.c | 18 +- net/netfilter/ipvs/ip_vs_xmit.c | 433 ++++++++++++++++++++++++++++++++-------- 4 files changed, 458 insertions(+), 117 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 9d5c1b965304..2f88d5942332 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -409,6 +409,7 @@ struct ip_vs_conn { /* packet transmitter for different forwarding methods. If it mangles the packet, it must return NF_DROP or better NF_STOLEN, otherwise this must be changed to a sk_buff **. + NF_ACCEPT can be returned when destination is local. */ int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index c4f091d5a628..a6c8aff1b47e 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -984,26 +984,34 @@ drop: } /* - * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. * Check if outgoing packet belongs to the established ip_vs_conn. */ static unsigned int -ip_vs_out(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) +ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) { struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_conn *cp; - int af; EnterFunction(11); - af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; - + /* Already marked as IPVS request or reply? */ if (skb->ipvs_property) return NF_ACCEPT; + /* Bad... Do not break raw sockets */ + if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && + af == AF_INET)) { + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(skb->sk); + + if (inet && sk->sk_family == PF_INET && inet->nodefrag) + return NF_ACCEPT; + } + + if (unlikely(!skb_dst(skb))) + return NF_ACCEPT; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { @@ -1106,6 +1114,69 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, return handle_response(af, skb, pp, cp, iph.len); } +/* + * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_out(hooknum, skb, AF_INET); +} + +/* + * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int verdict; + + /* Disable BH in LOCAL_OUT until all places are fixed */ + local_bh_disable(); + verdict = ip_vs_out(hooknum, skb, AF_INET); + local_bh_enable(); + return verdict; +} + +#ifdef CONFIG_IP_VS_IPV6 + +/* + * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_out(hooknum, skb, AF_INET6); +} + +/* + * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int verdict; + + /* Disable BH in LOCAL_OUT until all places are fixed */ + local_bh_disable(); + verdict = ip_vs_out(hooknum, skb, AF_INET6); + local_bh_enable(); + return verdict; +} + +#endif /* * Handle ICMP messages in the outside-to-inside direction (incoming). @@ -1342,6 +1413,10 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, struct ip_vs_conn *cp; int ret, restart, af, pkts; + /* Already marked as IPVS request or reply? */ + if (skb->ipvs_property) + return NF_ACCEPT; + af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); @@ -1525,13 +1600,13 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_LOCAL_IN, .priority = 100, }, - /* After packet filtering, change source only for VS/NAT */ + /* Before ip_vs_in, change source only for VS/NAT */ { - .hook = ip_vs_out, + .hook = ip_vs_local_reply4, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_INET_FORWARD, - .priority = 100, + .hooknum = NF_INET_LOCAL_OUT, + .priority = -99, }, /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ @@ -1542,6 +1617,14 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_FORWARD, .priority = 99, }, + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_reply4, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_FORWARD, + .priority = 100, + }, #ifdef CONFIG_IP_VS_IPV6 /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be @@ -1553,13 +1636,13 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_LOCAL_IN, .priority = 100, }, - /* After packet filtering, change source only for VS/NAT */ + /* Before ip_vs_in, change source only for VS/NAT */ { - .hook = ip_vs_out, + .hook = ip_vs_local_reply6, .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_FORWARD, - .priority = 100, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_OUT, + .priority = -99, }, /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ @@ -1570,6 +1653,14 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_FORWARD, .priority = 99, }, + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_reply6, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_FORWARD, + .priority = 100, + }, #endif }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 0b884d3e192f..5f5daa30b0af 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -777,20 +777,6 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; conn_flags |= IP_VS_CONN_F_INACTIVE; - /* check if local node and update the flags */ -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) { - if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) { - conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) - | IP_VS_CONN_F_LOCALNODE; - } - } else -#endif - if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) { - conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) - | IP_VS_CONN_F_LOCALNODE; - } - /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { conn_flags |= IP_VS_CONN_F_NOOUTPUT; @@ -824,6 +810,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, dest->u_threshold = udest->u_threshold; dest->l_threshold = udest->l_threshold; + spin_lock(&dest->dst_lock); + ip_vs_dst_reset(dest); + spin_unlock(&dest->dst_lock); + if (add) ip_vs_new_estimator(&dest->stats); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 63cc0feaaef6..8608882f89e3 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -67,12 +67,19 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos) return dst; } +/* + * Get route to destination or remote server + * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest, + * &4=Allow redirect from remote daddr to local + */ static struct rtable * -__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos) +__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, + __be32 daddr, u32 rtos, int rt_mode) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net(skb_dst(skb)->dev); struct rtable *rt; /* Route to the other host */ - struct ip_vs_dest *dest = cp->dest; + struct rtable *ort; /* Original route */ + int local; if (dest) { spin_lock(&dest->dst_lock); @@ -104,23 +111,95 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos) .oif = 0, .nl_u = { .ip4_u = { - .daddr = cp->daddr.ip, + .daddr = daddr, .saddr = 0, .tos = rtos, } }, }; if (ip_route_output_key(net, &rt, &fl)) { IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", - &cp->daddr.ip); + &daddr); return NULL; } } + local = rt->rt_flags & RTCF_LOCAL; + if (!((local ? 1 : 2) & rt_mode)) { + IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n", + (rt->rt_flags & RTCF_LOCAL) ? + "local":"non-local", &rt->rt_dst); + ip_rt_put(rt); + return NULL; + } + if (local && !(rt_mode & 4) && !((ort = skb_rtable(skb)) && + ort->rt_flags & RTCF_LOCAL)) { + IP_VS_DBG_RL("Redirect from non-local address %pI4 to local " + "requires NAT method, dest: %pI4\n", + &ip_hdr(skb)->daddr, &rt->rt_dst); + ip_rt_put(rt); + return NULL; + } + if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) { + IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 " + "to non-local address, dest: %pI4\n", + &ip_hdr(skb)->saddr, &rt->rt_dst); + ip_rt_put(rt); + return NULL; + } + return rt; } +/* Reroute packet to local IPv4 stack after DNAT */ +static int +__ip_vs_reroute_locally(struct sk_buff *skb) +{ + struct rtable *rt = skb_rtable(skb); + struct net_device *dev = rt->dst.dev; + struct net *net = dev_net(dev); + struct iphdr *iph = ip_hdr(skb); + + if (rt->fl.iif) { + unsigned long orefdst = skb->_skb_refdst; + + if (ip_route_input(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev)) + return 0; + refdst_drop(orefdst); + } else { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = iph->daddr, + .saddr = iph->saddr, + .tos = RT_TOS(iph->tos), + } + }, + .mark = skb->mark, + }; + struct rtable *rt; + + if (ip_route_output_key(net, &rt, &fl)) + return 0; + if (!(rt->rt_flags & RTCF_LOCAL)) { + ip_rt_put(rt); + return 0; + } + /* Drop old route. */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + } + return 1; +} + #ifdef CONFIG_IP_VS_IPV6 +static inline int __ip_vs_is_local_route6(struct rt6_info *rt) +{ + return rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK; +} + static struct dst_entry * __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, struct in6_addr *ret_saddr, int do_xfrm) @@ -155,14 +234,21 @@ out_err: return NULL; } +/* + * Get route to destination or remote server + * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest, + * &4=Allow redirect from remote daddr to local + */ static struct rt6_info * -__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct in6_addr *ret_saddr, int do_xfrm) +__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest, + struct in6_addr *daddr, struct in6_addr *ret_saddr, + int do_xfrm, int rt_mode) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net(skb_dst(skb)->dev); struct rt6_info *rt; /* Route to the other host */ - struct ip_vs_dest *dest = cp->dest; + struct rt6_info *ort; /* Original route */ struct dst_entry *dst; + int local; if (dest) { spin_lock(&dest->dst_lock); @@ -188,13 +274,38 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp, ipv6_addr_copy(ret_saddr, &dest->dst_saddr); spin_unlock(&dest->dst_lock); } else { - dst = __ip_vs_route_output_v6(net, &cp->daddr.in6, ret_saddr, - do_xfrm); + dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm); if (!dst) return NULL; rt = (struct rt6_info *) dst; } + local = __ip_vs_is_local_route6(rt); + if (!((local ? 1 : 2) & rt_mode)) { + IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6\n", + local ? "local":"non-local", daddr); + dst_release(&rt->dst); + return NULL; + } + if (local && !(rt_mode & 4) && + !((ort = (struct rt6_info *) skb_dst(skb)) && + __ip_vs_is_local_route6(ort))) { + IP_VS_DBG_RL("Redirect from non-local address %pI6 to local " + "requires NAT method, dest: %pI6\n", + &ipv6_hdr(skb)->daddr, daddr); + dst_release(&rt->dst); + return NULL; + } + if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&ipv6_hdr(skb)->saddr) & + IPV6_ADDR_LOOPBACK)) { + IP_VS_DBG_RL("Stopping traffic from loopback address %pI6 " + "to non-local address, dest: %pI6\n", + &ipv6_hdr(skb)->saddr, daddr); + dst_release(&rt->dst); + return NULL; + } + return rt; } #endif @@ -227,23 +338,27 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) __ret; \ }) -#define IP_VS_XMIT_NAT(pf, skb, cp) \ +#define IP_VS_XMIT_NAT(pf, skb, cp, local) \ do { \ (skb)->ipvs_property = 1; \ if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ ip_vs_notrack(skb); \ else \ ip_vs_update_conntrack(skb, cp, 1); \ + if (local) \ + return NF_ACCEPT; \ skb_forward_csum(skb); \ NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ skb_dst(skb)->dev, dst_output); \ } while (0) -#define IP_VS_XMIT(pf, skb, cp) \ +#define IP_VS_XMIT(pf, skb, cp, local) \ do { \ (skb)->ipvs_property = 1; \ if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ ip_vs_notrack(skb); \ + if (local) \ + return NF_ACCEPT; \ skb_forward_csum(skb); \ NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ skb_dst(skb)->dev, dst_output); \ @@ -258,7 +373,7 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { /* we do not touch skb and do not need pskb ptr */ - return NF_ACCEPT; + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); } @@ -271,27 +386,15 @@ int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { - struct net *net = dev_net(skb->dev); struct rtable *rt; /* Route to the other host */ struct iphdr *iph = ip_hdr(skb); - u8 tos = iph->tos; int mtu; - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip4_u = { - .daddr = iph->daddr, - .saddr = 0, - .tos = RT_TOS(tos), } }, - }; EnterFunction(10); - if (ip_route_output_key(net, &rt, &fl)) { - IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n", - __func__, &iph->daddr); + if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, + RT_TOS(iph->tos), 2))) goto tx_error_icmp; - } /* MTU checking */ mtu = dst_mtu(&rt->dst); @@ -319,7 +422,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, cp); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0); LeaveFunction(10); return NF_STOLEN; @@ -337,18 +440,14 @@ int ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { - struct net *net = dev_net(skb->dev); - struct dst_entry *dst; struct rt6_info *rt; /* Route to the other host */ struct ipv6hdr *iph = ipv6_hdr(skb); int mtu; EnterFunction(10); - dst = __ip_vs_route_output_v6(net, &iph->daddr, NULL, 0); - if (!dst) + if (!(rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr, NULL, 0, 2))) goto tx_error_icmp; - rt = (struct rt6_info *) dst; /* MTU checking */ mtu = dst_mtu(&rt->dst); @@ -376,7 +475,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, cp); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0); LeaveFunction(10); return NF_STOLEN; @@ -401,6 +500,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct rtable *rt; /* Route to the other host */ int mtu; struct iphdr *iph = ip_hdr(skb); + int local; EnterFunction(10); @@ -414,16 +514,40 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos)))) + if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + RT_TOS(iph->tos), 1|2|4))) goto tx_error_icmp; + local = rt->rt_flags & RTCF_LOCAL; + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG_RL_PKT(10, pp, skb, 0, "ip_vs_nat_xmit(): " + "stopping DNAT to local address"); + goto tx_error_put; + } + } +#endif + + /* From world but DNAT to loopback address? */ + if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) { + IP_VS_DBG_RL_PKT(1, pp, skb, 0, "ip_vs_nat_xmit(): " + "stopping DNAT to loopback address"); + goto tx_error_put; + } /* MTU checking */ mtu = dst_mtu(&rt->dst); if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { - ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); - goto tx_error; + goto tx_error_put; } /* copy-on-write the packet before mangling it */ @@ -433,16 +557,27 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - /* mangle the packet */ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) - goto tx_error; + goto tx_error_put; ip_hdr(skb)->daddr = cp->daddr.ip; ip_send_check(ip_hdr(skb)); + if (!local) { + /* drop old route */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + } else { + ip_rt_put(rt); + /* + * Some IPv4 replies get local address from routes, + * not from iph, so while we DNAT after routing + * we need this second input/output route. + */ + if (!__ip_vs_reroute_locally(skb)) + goto tx_error; + } + IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length @@ -452,7 +587,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp); + IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local); LeaveFunction(10); return NF_STOLEN; @@ -475,6 +610,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, { struct rt6_info *rt; /* Route to the other host */ int mtu; + int local; EnterFunction(10); @@ -489,18 +625,44 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0); - if (!rt) + if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + 0, 1|2|4))) goto tx_error_icmp; + local = __ip_vs_is_local_route6(rt); + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG_RL_PKT(10, pp, skb, 0, + "ip_vs_nat_xmit_v6(): " + "stopping DNAT to local address"); + goto tx_error_put; + } + } +#endif + + /* From world but DNAT to loopback address? */ + if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { + IP_VS_DBG_RL_PKT(1, pp, skb, 0, + "ip_vs_nat_xmit_v6(): " + "stopping DNAT to loopback address"); + goto tx_error_put; + } /* MTU checking */ mtu = dst_mtu(&rt->dst); if (skb->len > mtu) { - dst_release(&rt->dst); icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit_v6(): frag needed for"); - goto tx_error; + goto tx_error_put; } /* copy-on-write the packet before mangling it */ @@ -510,14 +672,19 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - /* mangle the packet */ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) goto tx_error; - ipv6_hdr(skb)->daddr = cp->daddr.in6; + ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &cp->daddr.in6); + + if (!local || !skb->dev) { + /* drop the old route when skb is not shared */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + } else { + /* destined to loopback, do we need to change route? */ + dst_release(&rt->dst); + } IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); @@ -528,7 +695,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp); + IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local); LeaveFunction(10); return NF_STOLEN; @@ -588,16 +755,20 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; } - if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(tos)))) + if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + RT_TOS(tos), 1|2))) goto tx_error_icmp; + if (rt->rt_flags & RTCF_LOCAL) { + ip_rt_put(rt); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); + } tdev = rt->dst.dev; mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (mtu < 68) { - ip_rt_put(rt); IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); - goto tx_error; + goto tx_error_put; } if (skb_dst(skb)) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); @@ -607,9 +778,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if ((old_iph->frag_off & htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error; + goto tx_error_put; } /* @@ -678,6 +848,9 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, kfree_skb(skb); LeaveFunction(10); return NF_STOLEN; +tx_error_put: + ip_rt_put(rt); + goto tx_error; } #ifdef CONFIG_IP_VS_IPV6 @@ -703,27 +876,29 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; } - rt = __ip_vs_get_out_rt_v6(skb, cp, &saddr, 1); - if (!rt) + if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, + &saddr, 1, 1|2))) goto tx_error_icmp; + if (__ip_vs_is_local_route6(rt)) { + dst_release(&rt->dst); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1); + } tdev = rt->dst.dev; mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); if (mtu < IPV6_MIN_MTU) { - dst_release(&rt->dst); IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, IPV6_MIN_MTU); - goto tx_error; + goto tx_error_put; } if (skb_dst(skb)) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - dst_release(&rt->dst); IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error; + goto tx_error_put; } /* @@ -789,6 +964,9 @@ tx_error: kfree_skb(skb); LeaveFunction(10); return NF_STOLEN; +tx_error_put: + dst_release(&rt->dst); + goto tx_error; } #endif @@ -807,8 +985,13 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos)))) + if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + RT_TOS(iph->tos), 1|2))) goto tx_error_icmp; + if (rt->rt_flags & RTCF_LOCAL) { + ip_rt_put(rt); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); + } /* MTU checking */ mtu = dst_mtu(&rt->dst); @@ -836,7 +1019,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, cp); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0); LeaveFunction(10); return NF_STOLEN; @@ -859,9 +1042,13 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0); - if (!rt) + if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + 0, 1|2))) goto tx_error_icmp; + if (__ip_vs_is_local_route6(rt)) { + dst_release(&rt->dst); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1); + } /* MTU checking */ mtu = dst_mtu(&rt->dst); @@ -889,7 +1076,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, cp); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0); LeaveFunction(10); return NF_STOLEN; @@ -915,6 +1102,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct rtable *rt; /* Route to the other host */ int mtu; int rc; + int local; EnterFunction(10); @@ -935,16 +1123,43 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, * mangle and send the packet here (only for VS/NAT) */ - if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(ip_hdr(skb)->tos)))) + if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + RT_TOS(ip_hdr(skb)->tos), 1|2|4))) goto tx_error_icmp; + local = rt->rt_flags & RTCF_LOCAL; + + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG(10, "%s(): " + "stopping DNAT to local address %pI4\n", + __func__, &cp->daddr.ip); + goto tx_error_put; + } + } +#endif + + /* From world but DNAT to loopback address? */ + if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) { + IP_VS_DBG(1, "%s(): " + "stopping DNAT to loopback %pI4\n", + __func__, &cp->daddr.ip); + goto tx_error_put; + } /* MTU checking */ mtu = dst_mtu(&rt->dst); if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { - ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error; + goto tx_error_put; } /* copy-on-write the packet before mangling it */ @@ -954,16 +1169,27 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; - /* drop the old route when skb is not shared */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - ip_vs_nat_icmp(skb, pp, cp, 0); + if (!local) { + /* drop the old route when skb is not shared */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + } else { + ip_rt_put(rt); + /* + * Some IPv4 replies get local address from routes, + * not from iph, so while we DNAT after routing + * we need this second input/output route. + */ + if (!__ip_vs_reroute_locally(skb)) + goto tx_error; + } + /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, cp); + IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local); rc = NF_STOLEN; goto out; @@ -989,6 +1215,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct rt6_info *rt; /* Route to the other host */ int mtu; int rc; + int local; EnterFunction(10); @@ -1009,17 +1236,44 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, * mangle and send the packet here (only for VS/NAT) */ - rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0); - if (!rt) + if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + 0, 1|2|4))) goto tx_error_icmp; + local = __ip_vs_is_local_route6(rt); + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG(10, "%s(): " + "stopping DNAT to local address %pI6\n", + __func__, &cp->daddr.in6); + goto tx_error_put; + } + } +#endif + + /* From world but DNAT to loopback address? */ + if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { + IP_VS_DBG(1, "%s(): " + "stopping DNAT to loopback %pI6\n", + __func__, &cp->daddr.in6); + goto tx_error_put; + } + /* MTU checking */ mtu = dst_mtu(&rt->dst); if (skb->len > mtu) { - dst_release(&rt->dst); icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error; + goto tx_error_put; } /* copy-on-write the packet before mangling it */ @@ -1029,16 +1283,21 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; - /* drop the old route when skb is not shared */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - ip_vs_nat_icmp_v6(skb, pp, cp, 0); + if (!local || !skb->dev) { + /* drop the old route when skb is not shared */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + } else { + /* destined to loopback, do we need to change route? */ + dst_release(&rt->dst); + } + /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, cp); + IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local); rc = NF_STOLEN; goto out; -- cgit v1.2.3 From 0d79641a96d612aaa6d57a4d4f521d7ed9c9ccdd Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:46:17 +0300 Subject: ipvs: provide address family for debugging As skb->protocol is not valid in LOCAL_OUT add parameter for address family in packet debugging functions. Even if ports are not present in AH and ESP change them to use ip_vs_tcpudp_debug_packet to show at least valid addresses as before. This patch removes the last user of skb->protocol in IPVS. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 17 ++++++----- net/netfilter/ipvs/ip_vs_core.c | 41 +++++++++++++++----------- net/netfilter/ipvs/ip_vs_proto.c | 8 ++--- net/netfilter/ipvs/ip_vs_proto_ah_esp.c | 52 ++------------------------------- net/netfilter/ipvs/ip_vs_proto_sctp.c | 2 +- net/netfilter/ipvs/ip_vs_proto_tcp.c | 4 +-- net/netfilter/ipvs/ip_vs_proto_udp.c | 4 +-- net/netfilter/ipvs/ip_vs_xmit.c | 18 +++++++----- 8 files changed, 54 insertions(+), 92 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 2f88d5942332..b7bbd6c28cfa 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -136,24 +136,24 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len, if (net_ratelimit()) \ printk(KERN_DEBUG pr_fmt(msg), ##__VA_ARGS__); \ } while (0) -#define IP_VS_DBG_PKT(level, pp, skb, ofs, msg) \ +#define IP_VS_DBG_PKT(level, af, pp, skb, ofs, msg) \ do { \ if (level <= ip_vs_get_debug_level()) \ - pp->debug_packet(pp, skb, ofs, msg); \ + pp->debug_packet(af, pp, skb, ofs, msg); \ } while (0) -#define IP_VS_DBG_RL_PKT(level, pp, skb, ofs, msg) \ +#define IP_VS_DBG_RL_PKT(level, af, pp, skb, ofs, msg) \ do { \ if (level <= ip_vs_get_debug_level() && \ net_ratelimit()) \ - pp->debug_packet(pp, skb, ofs, msg); \ + pp->debug_packet(af, pp, skb, ofs, msg); \ } while (0) #else /* NO DEBUGGING at ALL */ #define IP_VS_DBG_BUF(level, msg...) do {} while (0) #define IP_VS_ERR_BUF(msg...) do {} while (0) #define IP_VS_DBG(level, msg...) do {} while (0) #define IP_VS_DBG_RL(msg...) do {} while (0) -#define IP_VS_DBG_PKT(level, pp, skb, ofs, msg) do {} while (0) -#define IP_VS_DBG_RL_PKT(level, pp, skb, ofs, msg) do {} while (0) +#define IP_VS_DBG_PKT(level, af, pp, skb, ofs, msg) do {} while (0) +#define IP_VS_DBG_RL_PKT(level, af, pp, skb, ofs, msg) do {} while (0) #endif #define IP_VS_BUG() BUG() @@ -345,7 +345,7 @@ struct ip_vs_protocol { int (*app_conn_bind)(struct ip_vs_conn *cp); - void (*debug_packet)(struct ip_vs_protocol *pp, + void (*debug_packet)(int af, struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg); @@ -828,7 +828,8 @@ extern int ip_vs_set_state_timeout(int *table, int num, const char *const *names, const char *name, int to); extern void -ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, +ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, + const struct sk_buff *skb, int offset, const char *msg); extern struct ip_vs_protocol ip_vs_protocol_tcp; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 5fbcf67af8ec..b4e51e9c5a04 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -365,7 +365,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, * with persistence the connection is created on SYN+ACK. */ if (pptr[0] == FTPDATA) { - IP_VS_DBG_PKT(12, pp, skb, 0, "Not scheduling FTPDATA"); + IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, + "Not scheduling FTPDATA"); return NULL; } @@ -376,7 +377,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) && (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) { - IP_VS_DBG_PKT(12, pp, skb, 0, + IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, "Not scheduling reply for existing connection"); __ip_vs_conn_put(cp); return NULL; @@ -617,10 +618,10 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, skb->ip_summed = CHECKSUM_UNNECESSARY; if (inout) - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered outgoing ICMP"); else - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered incoming ICMP"); } @@ -662,11 +663,13 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, skb->ip_summed = CHECKSUM_PARTIAL; if (inout) - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered outgoing ICMPv6"); + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, + (void *)ciph - (void *)iph, + "Forwarding altered outgoing ICMPv6"); else - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered incoming ICMPv6"); + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, + (void *)ciph - (void *)iph, + "Forwarding altered incoming ICMPv6"); } #endif @@ -798,7 +801,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related, pp->dont_defrag)) return NF_ACCEPT; - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for"); + IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, + "Checking outgoing ICMP for"); offset += cih->ihl * 4; @@ -874,7 +878,8 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) return NF_ACCEPT; - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for"); + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, + "Checking outgoing ICMPv6 for"); offset += sizeof(struct ipv6hdr); @@ -922,7 +927,7 @@ static unsigned int handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int ihl) { - IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); + IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet"); if (!skb_make_writable(skb, ihl)) goto drop; @@ -967,7 +972,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, ip_route_me_harder(skb, RTN_LOCAL) != 0) goto drop; - IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); + IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT"); ip_vs_out_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); @@ -1117,7 +1122,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) } } } - IP_VS_DBG_PKT(12, pp, skb, 0, + IP_VS_DBG_PKT(12, af, pp, skb, 0, "ip_vs_out: packet continues traversal as normal"); return NF_ACCEPT; } @@ -1253,7 +1258,8 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) pp->dont_defrag)) return NF_ACCEPT; - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for"); + IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, + "Checking incoming ICMP for"); offset += cih->ihl * 4; @@ -1364,7 +1370,8 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) return NF_ACCEPT; - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for"); + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, + "Checking incoming ICMPv6 for"); offset += sizeof(struct ipv6hdr); @@ -1492,12 +1499,12 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) if (unlikely(!cp)) { /* sorry, all this trouble for a no-hit :) */ - IP_VS_DBG_PKT(12, pp, skb, 0, + IP_VS_DBG_PKT(12, af, pp, skb, 0, "ip_vs_in: packet continues traversal as normal"); return NF_ACCEPT; } - IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet"); + IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); /* Check the server status */ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index 027f654799fe..c53998390877 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -172,8 +172,8 @@ ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, else if (ih->frag_off & htons(IP_OFFSET)) sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr); else { - __be16 _ports[2], *pptr -; + __be16 _ports[2], *pptr; + pptr = skb_header_pointer(skb, offset + ih->ihl*4, sizeof(_ports), _ports); if (pptr == NULL) @@ -223,13 +223,13 @@ ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, void -ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, +ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg) { #ifdef CONFIG_IP_VS_IPV6 - if (skb->protocol == htons(ETH_P_IPV6)) + if (af == AF_INET6) ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); else #endif diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c index 8956ef33ea6c..3a0461117d3f 100644 --- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -117,54 +117,6 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, return 0; } - -static void -ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ - char buf[256]; - struct iphdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "TRUNCATED"); - else - sprintf(buf, "%pI4->%pI4", &ih->saddr, &ih->daddr); - - pr_debug("%s: %s %s\n", msg, pp->name, buf); -} - -#ifdef CONFIG_IP_VS_IPV6 -static void -ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ - char buf[256]; - struct ipv6hdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "TRUNCATED"); - else - sprintf(buf, "%pI6->%pI6", &ih->saddr, &ih->daddr); - - pr_debug("%s: %s %s\n", msg, pp->name, buf); -} -#endif - -static void -ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (skb->protocol == htons(ETH_P_IPV6)) - ah_esp_debug_packet_v6(pp, skb, offset, msg); - else -#endif - ah_esp_debug_packet_v4(pp, skb, offset, msg); -} - - static void ah_esp_init(struct ip_vs_protocol *pp) { /* nothing to do now */ @@ -195,7 +147,7 @@ struct ip_vs_protocol ip_vs_protocol_ah = { .register_app = NULL, .unregister_app = NULL, .app_conn_bind = NULL, - .debug_packet = ah_esp_debug_packet, + .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, /* ISAKMP */ .set_state_timeout = NULL, }; @@ -219,7 +171,7 @@ struct ip_vs_protocol ip_vs_protocol_esp = { .register_app = NULL, .unregister_app = NULL, .app_conn_bind = NULL, - .debug_packet = ah_esp_debug_packet, + .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, /* ISAKMP */ }; #endif diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 9ab5232ce019..d254345bfda7 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -176,7 +176,7 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) if (val != cmp) { /* CRC failure, dump it. */ - IP_VS_DBG_RL_PKT(0, pp, skb, 0, + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 85d80a66b492..f6c5200e2146 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -300,7 +300,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) skb->len - tcphoff, ipv6_hdr(skb)->nexthdr, skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } @@ -311,7 +311,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) skb->len - tcphoff, ip_hdr(skb)->protocol, skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 5d21f08155ed..9d106a06bb0a 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -314,7 +314,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) skb->len - udphoff, ipv6_hdr(skb)->nexthdr, skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } @@ -325,7 +325,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) skb->len - udphoff, ip_hdr(skb)->protocol, skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 97b5361c036e..de04ea39cde8 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -543,7 +543,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { - IP_VS_DBG_RL_PKT(10, pp, skb, 0, "ip_vs_nat_xmit(): " + IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0, + "ip_vs_nat_xmit(): " "stopping DNAT to local address"); goto tx_error_put; } @@ -552,7 +553,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* From world but DNAT to loopback address? */ if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) { - IP_VS_DBG_RL_PKT(1, pp, skb, 0, "ip_vs_nat_xmit(): " + IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): " "stopping DNAT to loopback address"); goto tx_error_put; } @@ -561,7 +562,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, mtu = dst_mtu(&rt->dst); if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); + IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0, + "ip_vs_nat_xmit(): frag needed for"); goto tx_error_put; } @@ -593,7 +595,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; } - IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); + IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still @@ -654,7 +656,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { - IP_VS_DBG_RL_PKT(10, pp, skb, 0, + IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0, "ip_vs_nat_xmit_v6(): " "stopping DNAT to local address"); goto tx_error_put; @@ -665,7 +667,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* From world but DNAT to loopback address? */ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { - IP_VS_DBG_RL_PKT(1, pp, skb, 0, + IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0, "ip_vs_nat_xmit_v6(): " "stopping DNAT to loopback address"); goto tx_error_put; @@ -680,7 +682,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb->dev = net->loopback_dev; } icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - IP_VS_DBG_RL_PKT(0, pp, skb, 0, + IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0, "ip_vs_nat_xmit_v6(): frag needed for"); goto tx_error_put; } @@ -706,7 +708,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, dst_release(&rt->dst); } - IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); + IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still -- cgit v1.2.3 From 106e4c26b1529e559d1aae777f11b4f8f7bafc26 Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Thu, 21 Oct 2010 12:45:14 +0200 Subject: tproxy: kick out TIME_WAIT sockets in case a new connection comes in with the same tuple Without tproxy redirections an incoming SYN kicks out conflicting TIME_WAIT sockets, in order to handle clients that reuse ports within the TIME_WAIT period. The same mechanism didn't work in case TProxy is involved in finding the proper socket, as the time_wait processing code looked up the listening socket assuming that the listener addr/port matches those of the established connection. This is not the case with TProxy as the listener addr/port is possibly changed with the tproxy rule. Signed-off-by: Balazs Scheidler Signed-off-by: KOVACS Krisztian Signed-off-by: Patrick McHardy --- include/net/netfilter/nf_tproxy_core.h | 6 ++- net/netfilter/nf_tproxy_core.c | 29 +++++++++++---- net/netfilter/xt_TPROXY.c | 68 +++++++++++++++++++++++++++++++--- net/netfilter/xt_socket.c | 2 +- 4 files changed, 90 insertions(+), 15 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tproxy_core.h b/include/net/netfilter/nf_tproxy_core.h index 208b46f4d6d2..b3a8942a4e6a 100644 --- a/include/net/netfilter/nf_tproxy_core.h +++ b/include/net/netfilter/nf_tproxy_core.h @@ -8,12 +8,16 @@ #include #include +#define NFT_LOOKUP_ANY 0 +#define NFT_LOOKUP_LISTENER 1 +#define NFT_LOOKUP_ESTABLISHED 2 + /* look up and get a reference to a matching socket */ extern struct sock * nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, - const struct net_device *in, bool listening); + const struct net_device *in, int lookup_type); static inline void nf_tproxy_put_sock(struct sock *sk) diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c index 5490fc37c92d..8589e5e08115 100644 --- a/net/netfilter/nf_tproxy_core.c +++ b/net/netfilter/nf_tproxy_core.c @@ -22,21 +22,34 @@ struct sock * nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, - const struct net_device *in, bool listening_only) + const struct net_device *in, int lookup_type) { struct sock *sk; /* look up socket */ switch (protocol) { case IPPROTO_TCP: - if (listening_only) - sk = __inet_lookup_listener(net, &tcp_hashinfo, - daddr, ntohs(dport), - in->ifindex); - else + switch (lookup_type) { + case NFT_LOOKUP_ANY: sk = __inet_lookup(net, &tcp_hashinfo, saddr, sport, daddr, dport, in->ifindex); + break; + case NFT_LOOKUP_LISTENER: + sk = inet_lookup_listener(net, &tcp_hashinfo, + daddr, dport, + in->ifindex); + break; + case NFT_LOOKUP_ESTABLISHED: + sk = inet_lookup_established(net, &tcp_hashinfo, + saddr, sport, daddr, dport, + in->ifindex); + break; + default: + WARN_ON(1); + sk = NULL; + break; + } break; case IPPROTO_UDP: sk = udp4_lib_lookup(net, saddr, sport, daddr, dport, @@ -47,8 +60,8 @@ nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, sk = NULL; } - pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, listener only: %d, sock %p\n", - protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), listening_only, sk); + pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n", + protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk); return sk; } diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 21bb2aff6b8f..e0b6900a92c1 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -24,6 +24,57 @@ #include #include +/** + * tproxy_handle_time_wait() - handle TCP TIME_WAIT reopen redirections + * @skb: The skb being processed. + * @par: Iptables target parameters. + * @sk: The TIME_WAIT TCP socket found by the lookup. + * + * We have to handle SYN packets arriving to TIME_WAIT sockets + * differently: instead of reopening the connection we should rather + * redirect the new connection to the proxy if there's a listener + * socket present. + * + * tproxy_handle_time_wait() consumes the socket reference passed in. + * + * Returns the listener socket if there's one, the TIME_WAIT socket if + * no such listener is found, or NULL if the TCP header is incomplete. + */ +static struct sock * +tproxy_handle_time_wait(struct sk_buff *skb, const struct xt_action_param *par, struct sock *sk) +{ + const struct iphdr *iph = ip_hdr(skb); + const struct xt_tproxy_target_info *tgi = par->targinfo; + struct tcphdr _hdr, *hp; + + hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr); + if (hp == NULL) { + inet_twsk_put(inet_twsk(sk)); + return NULL; + } + + if (hp->syn && !hp->rst && !hp->ack && !hp->fin) { + /* SYN to a TIME_WAIT socket, we'd rather redirect it + * to a listener socket if there's one */ + struct sock *sk2; + + sk2 = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, + iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr, + hp->source, tgi->lport ? tgi->lport : hp->dest, + par->in, NFT_LOOKUP_LISTENER); + if (sk2) { + /* yeah, there's one, let's kill the TIME_WAIT + * socket and redirect to the listener + */ + inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); + inet_twsk_put(inet_twsk(sk)); + sk = sk2; + } + } + + return sk; +} + static unsigned int tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par) { @@ -37,11 +88,18 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par) return NF_DROP; sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, - iph->saddr, - tgi->laddr ? tgi->laddr : iph->daddr, - hp->source, - tgi->lport ? tgi->lport : hp->dest, - par->in, true); + iph->saddr, iph->daddr, + hp->source, hp->dest, + par->in, NFT_LOOKUP_ESTABLISHED); + + /* UDP has no TCP_TIME_WAIT state, so we never enter here */ + if (sk && sk->sk_state == TCP_TIME_WAIT) + sk = tproxy_handle_time_wait(skb, par, sk); + else if (!sk) + sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, + iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr, + hp->source, tgi->lport ? tgi->lport : hp->dest, + par->in, NFT_LOOKUP_LISTENER); /* NOTE: assign_sock consumes our sk reference */ if (sk && nf_tproxy_assign_sock(skb, sk)) { diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 1ca89908cbad..266faa0a2fbf 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -142,7 +142,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, #endif sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol, - saddr, daddr, sport, dport, par->in, false); + saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY); if (sk != NULL) { bool wildcard; bool transparent = true; -- cgit v1.2.3 From 6006db84a91838813cdad8a6622a4e39efe9ea47 Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Thu, 21 Oct 2010 12:47:34 +0200 Subject: tproxy: add lookup type checks for UDP in nf_tproxy_get_sock_v4() Also, inline this function as the lookup_type is always a literal and inlining removes branches performed at runtime. Signed-off-by: Balazs Scheidler Signed-off-by: KOVACS Krisztian Signed-off-by: Patrick McHardy --- include/net/netfilter/nf_tproxy_core.h | 116 ++++++++++++++++++++++++++++++++- net/netfilter/nf_tproxy_core.c | 48 -------------- 2 files changed, 114 insertions(+), 50 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tproxy_core.h b/include/net/netfilter/nf_tproxy_core.h index b3a8942a4e6a..1027d7f901a0 100644 --- a/include/net/netfilter/nf_tproxy_core.h +++ b/include/net/netfilter/nf_tproxy_core.h @@ -13,11 +13,123 @@ #define NFT_LOOKUP_ESTABLISHED 2 /* look up and get a reference to a matching socket */ -extern struct sock * + + +/* This function is used by the 'TPROXY' target and the 'socket' + * match. The following lookups are supported: + * + * Explicit TProxy target rule + * =========================== + * + * This is used when the user wants to intercept a connection matching + * an explicit iptables rule. In this case the sockets are assumed + * matching in preference order: + * + * - match: if there's a fully established connection matching the + * _packet_ tuple, it is returned, assuming the redirection + * already took place and we process a packet belonging to an + * established connection + * + * - match: if there's a listening socket matching the redirection + * (e.g. on-port & on-ip of the connection), it is returned, + * regardless if it was bound to 0.0.0.0 or an explicit + * address. The reasoning is that if there's an explicit rule, it + * does not really matter if the listener is bound to an interface + * or to 0. The user already stated that he wants redirection + * (since he added the rule). + * + * "socket" match based redirection (no specific rule) + * =================================================== + * + * There are connections with dynamic endpoints (e.g. FTP data + * connection) that the user is unable to add explicit rules + * for. These are taken care of by a generic "socket" rule. It is + * assumed that the proxy application is trusted to open such + * connections without explicit iptables rule (except of course the + * generic 'socket' rule). In this case the following sockets are + * matched in preference order: + * + * - match: if there's a fully established connection matching the + * _packet_ tuple + * + * - match: if there's a non-zero bound listener (possibly with a + * non-local address) We don't accept zero-bound listeners, since + * then local services could intercept traffic going through the + * box. + * + * Please note that there's an overlap between what a TPROXY target + * and a socket match will match. Normally if you have both rules the + * "socket" match will be the first one, effectively all packets + * belonging to established connections going through that one. + */ +static inline struct sock * nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, - const struct net_device *in, int lookup_type); + const struct net_device *in, int lookup_type) +{ + struct sock *sk; + + /* look up socket */ + switch (protocol) { + case IPPROTO_TCP: + switch (lookup_type) { + case NFT_LOOKUP_ANY: + sk = __inet_lookup(net, &tcp_hashinfo, + saddr, sport, daddr, dport, + in->ifindex); + break; + case NFT_LOOKUP_LISTENER: + sk = inet_lookup_listener(net, &tcp_hashinfo, + daddr, dport, + in->ifindex); + + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too */ + + break; + case NFT_LOOKUP_ESTABLISHED: + sk = inet_lookup_established(net, &tcp_hashinfo, + saddr, sport, daddr, dport, + in->ifindex); + break; + default: + WARN_ON(1); + sk = NULL; + break; + } + break; + case IPPROTO_UDP: + sk = udp4_lib_lookup(net, saddr, sport, daddr, dport, + in->ifindex); + if (sk && lookup_type != NFT_LOOKUP_ANY) { + int connected = (sk->sk_state == TCP_ESTABLISHED); + int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0); + + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too */ + if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) || + (lookup_type == NFT_LOOKUP_LISTENER && connected)) { + sock_put(sk); + sk = NULL; + } + } + break; + default: + WARN_ON(1); + sk = NULL; + } + + pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n", + protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk); + + return sk; +} + static inline void nf_tproxy_put_sock(struct sock *sk) diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c index 8589e5e08115..db655638d76d 100644 --- a/net/netfilter/nf_tproxy_core.c +++ b/net/netfilter/nf_tproxy_core.c @@ -18,54 +18,6 @@ #include #include -struct sock * -nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, - const __be32 saddr, const __be32 daddr, - const __be16 sport, const __be16 dport, - const struct net_device *in, int lookup_type) -{ - struct sock *sk; - - /* look up socket */ - switch (protocol) { - case IPPROTO_TCP: - switch (lookup_type) { - case NFT_LOOKUP_ANY: - sk = __inet_lookup(net, &tcp_hashinfo, - saddr, sport, daddr, dport, - in->ifindex); - break; - case NFT_LOOKUP_LISTENER: - sk = inet_lookup_listener(net, &tcp_hashinfo, - daddr, dport, - in->ifindex); - break; - case NFT_LOOKUP_ESTABLISHED: - sk = inet_lookup_established(net, &tcp_hashinfo, - saddr, sport, daddr, dport, - in->ifindex); - break; - default: - WARN_ON(1); - sk = NULL; - break; - } - break; - case IPPROTO_UDP: - sk = udp4_lib_lookup(net, saddr, sport, daddr, dport, - in->ifindex); - break; - default: - WARN_ON(1); - sk = NULL; - } - - pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n", - protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk); - - return sk; -} -EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v4); static void nf_tproxy_destructor(struct sk_buff *skb) -- cgit v1.2.3 From 093d282321daeb19c107e5f1f16d7f68484f3ade Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Thu, 21 Oct 2010 13:06:43 +0200 Subject: tproxy: fix hash locking issue when using port redirection in __inet_inherit_port() When __inet_inherit_port() is called on a tproxy connection the wrong locks are held for the inet_bind_bucket it is added to. __inet_inherit_port() made an implicit assumption that the listener's port number (and thus its bind bucket). Unfortunately, if you're using the TPROXY target to redirect skbs to a transparent proxy that assumption is not true anymore and things break. This patch adds code to __inet_inherit_port() so that it can handle this case by looking up or creating a new bind bucket for the child socket and updates callers of __inet_inherit_port() to gracefully handle __inet_inherit_port() failing. Reported by and original patch from Stephen Buck . See http://marc.info/?t=128169268200001&r=1&w=2 for the original discussion. Signed-off-by: KOVACS Krisztian Signed-off-by: Patrick McHardy --- include/net/inet_hashtables.h | 2 +- net/dccp/ipv4.c | 10 +++++++--- net/dccp/ipv6.c | 10 +++++++--- net/ipv4/inet_hashtables.c | 28 ++++++++++++++++++++++++++-- net/ipv4/tcp_ipv4.c | 10 +++++++--- net/ipv6/tcp_ipv6.c | 12 ++++++++---- 6 files changed, 56 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 74358d1b3f43..e9c2ed8af864 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -245,7 +245,7 @@ static inline int inet_sk_listen_hashfn(const struct sock *sk) } /* Caller must disable local BH processing. */ -extern void __inet_inherit_port(struct sock *sk, struct sock *child); +extern int __inet_inherit_port(struct sock *sk, struct sock *child); extern void inet_put_port(struct sock *sk); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index d4a166f0f391..3f69ea114829 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -392,7 +392,7 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, newsk = dccp_create_openreq_child(sk, req, skb); if (newsk == NULL) - goto exit; + goto exit_nonewsk; sk_setup_caps(newsk, dst); @@ -409,16 +409,20 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, dccp_sync_mss(newsk, dst_mtu(dst)); + if (__inet_inherit_port(sk, newsk) < 0) { + sock_put(newsk); + goto exit; + } __inet_hash_nolisten(newsk, NULL); - __inet_inherit_port(sk, newsk); return newsk; exit_overflow: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); +exit_nonewsk: + dst_release(dst); exit: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); - dst_release(dst); return NULL; } diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 6e3f32575df7..dca711df9b60 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -564,7 +564,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, newsk = dccp_create_openreq_child(sk, req, skb); if (newsk == NULL) - goto out; + goto out_nonewsk; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks @@ -632,18 +632,22 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; newinet->inet_rcv_saddr = LOOPBACK4_IPV6; + if (__inet_inherit_port(sk, newsk) < 0) { + sock_put(newsk); + goto out; + } __inet6_hash(newsk, NULL); - __inet_inherit_port(sk, newsk); return newsk; out_overflow: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); +out_nonewsk: + dst_release(dst); out: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); if (opt != NULL && opt != np->opt) sock_kfree_s(sk, opt, opt->tot_len); - dst_release(dst); return NULL; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index fb7ad5a21ff3..1b344f30b463 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -101,19 +101,43 @@ void inet_put_port(struct sock *sk) } EXPORT_SYMBOL(inet_put_port); -void __inet_inherit_port(struct sock *sk, struct sock *child) +int __inet_inherit_port(struct sock *sk, struct sock *child) { struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; - const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->inet_num, + unsigned short port = inet_sk(child)->inet_num; + const int bhash = inet_bhashfn(sock_net(sk), port, table->bhash_size); struct inet_bind_hashbucket *head = &table->bhash[bhash]; struct inet_bind_bucket *tb; spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; + if (tb->port != port) { + /* NOTE: using tproxy and redirecting skbs to a proxy + * on a different listener port breaks the assumption + * that the listener socket's icsk_bind_hash is the same + * as that of the child socket. We have to look up or + * create a new bind bucket for the child here. */ + struct hlist_node *node; + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (net_eq(ib_net(tb), sock_net(sk)) && + tb->port == port) + break; + } + if (!node) { + tb = inet_bind_bucket_create(table->bind_bucket_cachep, + sock_net(sk), head, port); + if (!tb) { + spin_unlock(&head->lock); + return -ENOMEM; + } + } + } sk_add_bind_node(child, &tb->owners); inet_csk(child)->icsk_bind_hash = tb; spin_unlock(&head->lock); + + return 0; } EXPORT_SYMBOL_GPL(__inet_inherit_port); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a0232f3a358b..8f8527d41682 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1422,7 +1422,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) - goto exit; + goto exit_nonewsk; newsk->sk_gso_type = SKB_GSO_TCPV4; sk_setup_caps(newsk, dst); @@ -1469,16 +1469,20 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, } #endif + if (__inet_inherit_port(sk, newsk) < 0) { + sock_put(newsk); + goto exit; + } __inet_hash_nolisten(newsk, NULL); - __inet_inherit_port(sk, newsk); return newsk; exit_overflow: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); +exit_nonewsk: + dst_release(dst); exit: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); - dst_release(dst); return NULL; } EXPORT_SYMBOL(tcp_v4_syn_recv_sock); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index fe6d40418c0b..ba5258ef1c57 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1409,7 +1409,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk = tcp_create_openreq_child(sk, req, skb); if (newsk == NULL) - goto out; + goto out_nonewsk; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks @@ -1497,18 +1497,22 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, } #endif + if (__inet_inherit_port(sk, newsk) < 0) { + sock_put(newsk); + goto out; + } __inet6_hash(newsk, NULL); - __inet_inherit_port(sk, newsk); return newsk; out_overflow: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); -out: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); +out_nonewsk: if (opt && opt != np->opt) sock_kfree_s(sk, opt, opt->tot_len); dst_release(dst); +out: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return NULL; } -- cgit v1.2.3 From e97c3e278e951501c2f385de70c3ceacdea78c4a Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Thu, 21 Oct 2010 16:03:43 +0200 Subject: tproxy: split off ipv6 defragmentation to a separate module Like with IPv4, TProxy needs IPv6 defragmentation but does not require connection tracking. Since defragmentation was coupled with conntrack, I split off the two, creating an nf_defrag_ipv6 module, similar to the already existing nf_defrag_ipv4. Signed-off-by: Balazs Scheidler Signed-off-by: KOVACS Krisztian Signed-off-by: Patrick McHardy --- include/net/netfilter/ipv6/nf_defrag_ipv6.h | 6 ++ net/ipv6/netfilter/Makefile | 5 +- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 78 +-------------- net/ipv6/netfilter/nf_conntrack_reasm.c | 14 ++- net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 131 +++++++++++++++++++++++++ 5 files changed, 156 insertions(+), 78 deletions(-) create mode 100644 include/net/netfilter/ipv6/nf_defrag_ipv6.h create mode 100644 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c (limited to 'include/net') diff --git a/include/net/netfilter/ipv6/nf_defrag_ipv6.h b/include/net/netfilter/ipv6/nf_defrag_ipv6.h new file mode 100644 index 000000000000..94dd54d76b48 --- /dev/null +++ b/include/net/netfilter/ipv6/nf_defrag_ipv6.h @@ -0,0 +1,6 @@ +#ifndef _NF_DEFRAG_IPV6_H +#define _NF_DEFRAG_IPV6_H + +extern void nf_defrag_ipv6_enable(void); + +#endif /* _NF_DEFRAG_IPV6_H */ diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index aafbba30c899..3f8e4a3d83ce 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -11,10 +11,11 @@ obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o # objects for l3 independent conntrack -nf_conntrack_ipv6-objs := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o nf_conntrack_reasm.o +nf_conntrack_ipv6-objs := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o +nf_defrag_ipv6-objs := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o # l3 independent conntrack -obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o +obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o nf_defrag_ipv6.o # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index ff43461704be..c8af58b22562 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -29,6 +28,7 @@ #include #include #include +#include #include static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, @@ -189,53 +189,6 @@ out: return nf_conntrack_confirm(skb); } -static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, - struct sk_buff *skb) -{ - u16 zone = NF_CT_DEFAULT_ZONE; - - if (skb->nfct) - zone = nf_ct_zone((struct nf_conn *)skb->nfct); - -#ifdef CONFIG_BRIDGE_NETFILTER - if (skb->nf_bridge && - skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) - return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone; -#endif - if (hooknum == NF_INET_PRE_ROUTING) - return IP6_DEFRAG_CONNTRACK_IN + zone; - else - return IP6_DEFRAG_CONNTRACK_OUT + zone; - -} - -static unsigned int ipv6_defrag(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct sk_buff *reasm; - - /* Previously seen (loopback)? */ - if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) - return NF_ACCEPT; - - reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(hooknum, skb)); - /* queued */ - if (reasm == NULL) - return NF_STOLEN; - - /* error occured or not fragmented */ - if (reasm == skb) - return NF_ACCEPT; - - nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in, - (struct net_device *)out, okfn); - - return NF_STOLEN; -} - static unsigned int __ipv6_conntrack_in(struct net *net, unsigned int hooknum, struct sk_buff *skb, @@ -287,13 +240,6 @@ static unsigned int ipv6_conntrack_local(unsigned int hooknum, } static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = { - { - .hook = ipv6_defrag, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, - }, { .hook = ipv6_conntrack_in, .owner = THIS_MODULE, @@ -308,13 +254,6 @@ static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = { .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_CONNTRACK, }, - { - .hook = ipv6_defrag, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, - }, { .hook = ipv6_confirm, .owner = THIS_MODULE, @@ -386,10 +325,6 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = { .nlattr_tuple_size = ipv6_nlattr_tuple_size, .nlattr_to_tuple = ipv6_nlattr_to_tuple, .nla_policy = ipv6_nla_policy, -#endif -#ifdef CONFIG_SYSCTL - .ctl_table_path = nf_net_netfilter_sysctl_path, - .ctl_table = nf_ct_ipv6_sysctl_table, #endif .me = THIS_MODULE, }; @@ -403,16 +338,12 @@ static int __init nf_conntrack_l3proto_ipv6_init(void) int ret = 0; need_conntrack(); + nf_defrag_ipv6_enable(); - ret = nf_ct_frag6_init(); - if (ret < 0) { - pr_err("nf_conntrack_ipv6: can't initialize frag6.\n"); - return ret; - } ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp6); if (ret < 0) { pr_err("nf_conntrack_ipv6: can't register tcp.\n"); - goto cleanup_frag6; + return ret; } ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp6); @@ -450,8 +381,6 @@ static int __init nf_conntrack_l3proto_ipv6_init(void) nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6); cleanup_tcp: nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6); - cleanup_frag6: - nf_ct_frag6_cleanup(); return ret; } @@ -463,7 +392,6 @@ static void __exit nf_conntrack_l3proto_ipv6_fini(void) nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6); nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6); - nf_ct_frag6_cleanup(); } module_init(nf_conntrack_l3proto_ipv6_init); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 138a8b362706..489d71b844ac 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -73,7 +73,7 @@ static struct inet_frags nf_frags; static struct netns_frags nf_init_frags; #ifdef CONFIG_SYSCTL -struct ctl_table nf_ct_ipv6_sysctl_table[] = { +struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_timeout", .data = &nf_init_frags.timeout, @@ -97,6 +97,8 @@ struct ctl_table nf_ct_ipv6_sysctl_table[] = { }, { } }; + +static struct ctl_table_header *nf_ct_frag6_sysctl_header; #endif static unsigned int nf_hashfn(struct inet_frag_queue *q) @@ -623,11 +625,21 @@ int nf_ct_frag6_init(void) inet_frags_init_net(&nf_init_frags); inet_frags_init(&nf_frags); + nf_ct_frag6_sysctl_header = register_sysctl_paths(nf_net_netfilter_sysctl_path, + nf_ct_frag6_sysctl_table); + if (!nf_ct_frag6_sysctl_header) { + inet_frags_fini(&nf_frags); + return -ENOMEM; + } + return 0; } void nf_ct_frag6_cleanup(void) { + unregister_sysctl_table(nf_ct_frag6_sysctl_header); + nf_ct_frag6_sysctl_header = NULL; + inet_frags_fini(&nf_frags); nf_init_frags.low_thresh = 0; diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c new file mode 100644 index 000000000000..99abfb53bab9 --- /dev/null +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -0,0 +1,131 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, + struct sk_buff *skb) +{ + u16 zone = NF_CT_DEFAULT_ZONE; + + if (skb->nfct) + zone = nf_ct_zone((struct nf_conn *)skb->nfct); + +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge && + skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) + return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone; +#endif + if (hooknum == NF_INET_PRE_ROUTING) + return IP6_DEFRAG_CONNTRACK_IN + zone; + else + return IP6_DEFRAG_CONNTRACK_OUT + zone; + +} + +static unsigned int ipv6_defrag(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *reasm; + + /* Previously seen (loopback)? */ + if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) + return NF_ACCEPT; + + reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(hooknum, skb)); + /* queued */ + if (reasm == NULL) + return NF_STOLEN; + + /* error occured or not fragmented */ + if (reasm == skb) + return NF_ACCEPT; + + nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in, + (struct net_device *)out, okfn); + + return NF_STOLEN; +} + +static struct nf_hook_ops ipv6_defrag_ops[] = { + { + .hook = ipv6_defrag, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, + }, + { + .hook = ipv6_defrag, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, + }, +}; + +static int __init nf_defrag_init(void) +{ + int ret = 0; + + ret = nf_ct_frag6_init(); + if (ret < 0) { + pr_err("nf_defrag_ipv6: can't initialize frag6.\n"); + return ret; + } + ret = nf_register_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); + if (ret < 0) { + pr_err("nf_defrag_ipv6: can't register hooks\n"); + goto cleanup_frag6; + } + return ret; + +cleanup_frag6: + nf_ct_frag6_cleanup(); + return ret; + +} + +static void __exit nf_defrag_fini(void) +{ + nf_unregister_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); + nf_ct_frag6_cleanup(); +} + +void nf_defrag_ipv6_enable(void) +{ +} +EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable); + +module_init(nf_defrag_init); +module_exit(nf_defrag_fini); + +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From aa976fc011efa1f0e3290c6c9addf7c20757f885 Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Thu, 21 Oct 2010 16:05:41 +0200 Subject: tproxy: added udp6_lib_lookup function Just like with IPv4, we need access to the UDP hash table to look up local sockets, but instead of exporting the global udp_table, export a lookup function. Signed-off-by: Balazs Scheidler Signed-off-by: KOVACS Krisztian Signed-off-by: Patrick McHardy --- include/net/udp.h | 3 +++ net/ipv6/udp.c | 8 ++++++++ 2 files changed, 11 insertions(+) (limited to 'include/net') diff --git a/include/net/udp.h b/include/net/udp.h index a184d3496b13..200b82848c9a 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -183,6 +183,9 @@ extern int udp_lib_setsockopt(struct sock *sk, int level, int optname, extern struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif); +extern struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, + int dif); /* * SNMP statistics for UDP and UDP-Lite diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 33e368318d47..c84dad432114 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -320,6 +320,14 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, udptable); } +struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, int dif) +{ + return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table); +} +EXPORT_SYMBOL_GPL(udp6_lib_lookup); + + /* * This should be easy, if there is something there we * return it, otherwise we block. -- cgit v1.2.3 From 3b9afb29917f4ab08decf358ecfd354a72a91ac0 Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Thu, 21 Oct 2010 16:12:14 +0200 Subject: tproxy: added IPv6 socket lookup function to nf_tproxy_core Signed-off-by: Balazs Scheidler Signed-off-by: KOVACS Krisztian Signed-off-by: Patrick McHardy --- include/net/netfilter/nf_tproxy_core.h | 72 +++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tproxy_core.h b/include/net/netfilter/nf_tproxy_core.h index 1027d7f901a0..cd85b3bc8327 100644 --- a/include/net/netfilter/nf_tproxy_core.h +++ b/include/net/netfilter/nf_tproxy_core.h @@ -5,7 +5,8 @@ #include #include #include -#include +#include +#include #include #define NFT_LOOKUP_ANY 0 @@ -130,6 +131,75 @@ nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, return sk; } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static inline struct sock * +nf_tproxy_get_sock_v6(struct net *net, const u8 protocol, + const struct in6_addr *saddr, const struct in6_addr *daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in, int lookup_type) +{ + struct sock *sk; + + /* look up socket */ + switch (protocol) { + case IPPROTO_TCP: + switch (lookup_type) { + case NFT_LOOKUP_ANY: + sk = inet6_lookup(net, &tcp_hashinfo, + saddr, sport, daddr, dport, + in->ifindex); + break; + case NFT_LOOKUP_LISTENER: + sk = inet6_lookup_listener(net, &tcp_hashinfo, + daddr, ntohs(dport), + in->ifindex); + + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too */ + + break; + case NFT_LOOKUP_ESTABLISHED: + sk = __inet6_lookup_established(net, &tcp_hashinfo, + saddr, sport, daddr, ntohs(dport), + in->ifindex); + break; + default: + WARN_ON(1); + sk = NULL; + break; + } + break; + case IPPROTO_UDP: + sk = udp6_lib_lookup(net, saddr, sport, daddr, dport, + in->ifindex); + if (sk && lookup_type != NFT_LOOKUP_ANY) { + int connected = (sk->sk_state == TCP_ESTABLISHED); + int wildcard = ipv6_addr_any(&inet6_sk(sk)->rcv_saddr); + + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too */ + if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) || + (lookup_type == NFT_LOOKUP_LISTENER && connected)) { + sock_put(sk); + sk = NULL; + } + } + break; + default: + WARN_ON(1); + sk = NULL; + } + + pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n", + protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk); + + return sk; +} +#endif static inline void nf_tproxy_put_sock(struct sock *sk) -- cgit v1.2.3