From 5191f4d82daf22b7ee9446f83527d2795e225974 Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Thu, 29 Jan 2015 19:34:42 +0100 Subject: netfilter: nft_compat: add ebtables support This patch extends nft_compat to support ebtables extensions. ebtables verdict codes are translated to the ones used by the nf_tables engine, so we can properly use ebtables target extensions from nft_compat. This patch extends previous work by Giuseppe Longo . Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 63 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 265e190f2218..c598f74063a1 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -19,6 +19,7 @@ #include #include #include +#include #include static int nft_compat_chain_validate_dependency(const char *tablename, @@ -40,6 +41,7 @@ static int nft_compat_chain_validate_dependency(const char *tablename, union nft_entry { struct ipt_entry e4; struct ip6t_entry e6; + struct ebt_entry ebt; }; static inline void @@ -50,9 +52,9 @@ nft_compat_set_par(struct xt_action_param *par, void *xt, const void *xt_info) par->hotdrop = false; } -static void nft_target_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) +static void nft_target_eval_xt(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) { void *info = nft_expr_priv(expr); struct xt_target *target = expr->ops->data; @@ -66,7 +68,7 @@ static void nft_target_eval(const struct nft_expr *expr, if (pkt->xt.hotdrop) ret = NF_DROP; - switch(ret) { + switch (ret) { case XT_CONTINUE: data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; break; @@ -74,7 +76,41 @@ static void nft_target_eval(const struct nft_expr *expr, data[NFT_REG_VERDICT].verdict = ret; break; } - return; +} + +static void nft_target_eval_bridge(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + void *info = nft_expr_priv(expr); + struct xt_target *target = expr->ops->data; + struct sk_buff *skb = pkt->skb; + int ret; + + nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info); + + ret = target->target(skb, &pkt->xt); + + if (pkt->xt.hotdrop) + ret = NF_DROP; + + switch (ret) { + case EBT_ACCEPT: + data[NFT_REG_VERDICT].verdict = NF_ACCEPT; + break; + case EBT_DROP: + data[NFT_REG_VERDICT].verdict = NF_DROP; + break; + case EBT_CONTINUE: + data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; + break; + case EBT_RETURN: + data[NFT_REG_VERDICT].verdict = NFT_RETURN; + break; + default: + data[NFT_REG_VERDICT].verdict = ret; + break; + } } static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = { @@ -100,6 +136,10 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par, entry->e6.ipv6.proto = proto; entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0; break; + case NFPROTO_BRIDGE: + entry->ebt.ethproto = proto; + entry->ebt.invflags = inv ? EBT_IPROTO : 0; + break; } par->entryinfo = entry; par->target = target; @@ -307,6 +347,10 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, entry->e6.ipv6.proto = proto; entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0; break; + case NFPROTO_BRIDGE: + entry->ebt.ethproto = proto; + entry->ebt.invflags = inv ? EBT_IPROTO : 0; + break; } par->entryinfo = entry; par->match = match; @@ -490,6 +534,9 @@ nfnl_compat_get(struct sock *nfnl, struct sk_buff *skb, case AF_INET6: fmt = "ip6t_%s"; break; + case NFPROTO_BRIDGE: + fmt = "ebt_%s"; + break; default: pr_err("nft_compat: unsupported protocol %d\n", nfmsg->nfgen_family); @@ -663,13 +710,17 @@ nft_target_select_ops(const struct nft_ctx *ctx, nft_target->ops.type = &nft_target_type; nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize)); - nft_target->ops.eval = nft_target_eval; nft_target->ops.init = nft_target_init; nft_target->ops.destroy = nft_target_destroy; nft_target->ops.dump = nft_target_dump; nft_target->ops.validate = nft_target_validate; nft_target->ops.data = target; + if (family == NFPROTO_BRIDGE) + nft_target->ops.eval = nft_target_eval_bridge; + else + nft_target->ops.eval = nft_target_eval_xt; + list_add(&nft_target->head, &nft_target_list); return &nft_target->ops; -- cgit v1.2.3 From 4c1017aa80c95a74703139bb95c4ce0d130efe4d Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 30 Jan 2015 07:46:33 +0000 Subject: netfilter: nft_lookup: add missing attribute validation for NFTA_LOOKUP_SET_ID Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_lookup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 6404a726d17b..9615b8b9fb37 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -39,6 +39,7 @@ static void nft_lookup_eval(const struct nft_expr *expr, static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { [NFTA_LOOKUP_SET] = { .type = NLA_STRING }, + [NFTA_LOOKUP_SET_ID] = { .type = NLA_U32 }, [NFTA_LOOKUP_SREG] = { .type = NLA_U32 }, [NFTA_LOOKUP_DREG] = { .type = NLA_U32 }, }; -- cgit v1.2.3 From bf250a1fa769f2eb8fc7a4e28b3b523e9cb67eef Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 10 Feb 2015 11:37:29 -0500 Subject: ipv6: Partial checksum only UDP packets ip6_append_data is used by other protocols and some of them can't be partially checksummed. Only partially checksum UDP protocol. Fixes: 32dce968dd987a (ipv6: Allow for partial checksums on non-ufo packets) Reported-by: Sabrina Dubroca Tested-by: Sabrina Dubroca Signed-off-by: Vladislav Yasevich Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index d33df4cbd872..7deebf102cba 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1273,7 +1273,7 @@ emsgsize: /* If this is the first and only packet and device * supports checksum offloading, let's use it. */ - if (!skb && + if (!skb && sk->sk_protocol == IPPROTO_UDP && length + fragheaderlen < mtu && rt->dst.dev->features & NETIF_F_V6_CSUM && !exthdrlen) -- cgit v1.2.3 From 80ad0d4a7a75158f2824d541e4802c88aba4f063 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Tue, 10 Feb 2015 13:33:37 -0500 Subject: rds: rds_cong_queue_updates needs to defer the congestion update transmission When the RDS transport is TCP, we cannot inline the call to rds_send_xmit from rds_cong_queue_update because (a) we are already holding the sock_lock in the recv path, and will deadlock when tcp_setsockopt/tcp_sendmsg try to get the sock lock (b) cong_queue_update does an irqsave on the rds_cong_lock, and this will trigger warnings (for a good reason) from functions called out of sock_lock. This patch reverts the change introduced by 2fa57129d ("RDS: Bypass workqueue when queueing cong updates"). The patch has been verified for both RDS/TCP as well as RDS/RDMA to ensure that there are not regressions for either transport: - for verification of RDS/TCP a client-server unit-test was used, with the server blocked in gdb and thus unable to drain its rcvbuf, eventually triggering a RDS congestion update. - for RDS/RDMA, the standard IB regression tests were used Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/cong.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/cong.c b/net/rds/cong.c index e5b65acd650b..e6144b8246fd 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -221,7 +221,21 @@ void rds_cong_queue_updates(struct rds_cong_map *map) list_for_each_entry(conn, &map->m_conn_list, c_map_item) { if (!test_and_set_bit(0, &conn->c_map_queued)) { rds_stats_inc(s_cong_update_queued); - rds_send_xmit(conn); + /* We cannot inline the call to rds_send_xmit() here + * for two reasons (both pertaining to a TCP transport): + * 1. When we get here from the receive path, we + * are already holding the sock_lock (held by + * tcp_v4_rcv()). So inlining calls to + * tcp_setsockopt and/or tcp_sendmsg will deadlock + * when it tries to get the sock_lock()) + * 2. Interrupts are masked so that we can mark the + * the port congested from both send and recv paths. + * (See comment around declaration of rdc_cong_lock). + * An attempt to get the sock_lock() here will + * therefore trigger warnings. + * Defer the xmit to rds_send_worker() instead. + */ + queue_delayed_work(rds_wq, &conn->c_send_w, 0); } } -- cgit v1.2.3 From b35725a285768d85b5ba1be7fe5002654a65ce88 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 10 Feb 2015 13:35:16 -0800 Subject: openvswitch: Reset key metadata for packet execution. Userspace packet execute command pass down flow key for given packet. But userspace can skip some parameter with zero value. Therefore kernel needs to initialize key metadata to zero. Fixes: 0714812134 ("openvswitch: Eliminate memset() from flow_extract.") Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/flow.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index e2c348b8baca..50ec42f170a0 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -717,6 +717,8 @@ int ovs_flow_key_extract_userspace(const struct nlattr *attr, { int err; + memset(key, 0, OVS_SW_FLOW_KEY_METADATA_SIZE); + /* Extract metadata from netlink attributes. */ err = ovs_nla_get_flow_metadata(attr, key, log); if (err) -- cgit v1.2.3 From 13101602c4a9f653d59af9469040797bc5b361ca Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 11 Feb 2015 11:23:38 +0100 Subject: openvswitch: Add missing initialization in validate_and_copy_set_tun() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit net/openvswitch/flow_netlink.c: In function ‘validate_and_copy_set_tun’: net/openvswitch/flow_netlink.c:1749: warning: ‘err’ may be used uninitialized in this function If ipv4_tun_from_nlattr() returns a different positive value than OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, err will be uninitialized, and validate_and_copy_set_tun() may return an undefined value instead of a zero success indicator. Initialize err to zero to fix this. Fixes: 1dd144cf5b4b47e1 ("openvswitch: Support VXLAN Group Policy extension") Signed-off-by: Geert Uytterhoeven Acked-by: Thomas Graf Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 993281e6278d..3829328c5a76 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1746,7 +1746,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, struct sw_flow_key key; struct ovs_tunnel_info *tun_info; struct nlattr *a; - int err, start, opts_type; + int err = 0, start, opts_type; ovs_match_init(&match, &key, NULL); opts_type = ipv4_tun_from_nlattr(nla_data(attr), &match, false, log); -- cgit v1.2.3 From 26c4f7da3e413da697a7beb22ad496390eda7da0 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 10 Feb 2015 16:30:27 -0800 Subject: net: Fix remcsum in GRO path to not change packet Remote checksum offload processing is currently the same for both the GRO and non-GRO path. When the remote checksum offload option is encountered, the checksum field referred to is modified in the packet. So in the GRO case, the packet is modified in the GRO path and then the operation is skipped when the packet goes through the normal path based on skb->remcsum_offload. There is a problem in that the packet may be modified in the GRO path, but then forwarded off host still containing the remote checksum option. A remote host will again perform RCO but now the checksum verification will fail since GRO RCO already modified the checksum. To fix this, we ensure that GRO restores a packet to it's original state before returning. In this model, when GRO processes a remote checksum option it still changes the checksum per the algorithm but on return from lower layer processing the checksum is restored to its original value. In this patch we add define gro_remcsum structure which is passed to skb_gro_remcsum_process to save offset and delta for the checksum being changed. After lower layer processing, skb_gro_remcsum_cleanup is called to restore the checksum before returning from GRO. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/fou.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 92ddea1e6457..7fa8d36e56d4 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -71,12 +71,6 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, size_t offset = ntohs(pd[1]); size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); - if (skb->remcsum_offload) { - /* Already processed in GRO path */ - skb->remcsum_offload = 0; - return guehdr; - } - if (!pskb_may_pull(skb, plen)) return NULL; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; @@ -214,7 +208,8 @@ out_unlock: static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, struct guehdr *guehdr, void *data, - size_t hdrlen, u8 ipproto) + size_t hdrlen, u8 ipproto, + struct gro_remcsum *grc) { __be16 *pd = data; size_t start = ntohs(pd[0]); @@ -222,7 +217,7 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); if (skb->remcsum_offload) - return guehdr; + return NULL; if (!NAPI_GRO_CB(skb)->csum_valid) return NULL; @@ -234,7 +229,8 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, return NULL; } - skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset); + skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen, + start, offset, grc); skb->remcsum_offload = 1; @@ -254,6 +250,9 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, void *data; u16 doffset = 0; int flush = 1; + struct gro_remcsum grc; + + skb_gro_remcsum_init(&grc); off = skb_gro_offset(skb); len = off + sizeof(*guehdr); @@ -295,7 +294,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, if (flags & GUE_PFLAG_REMCSUM) { guehdr = gue_gro_remcsum(skb, off, guehdr, data + doffset, hdrlen, - guehdr->proto_ctype); + guehdr->proto_ctype, &grc); if (!guehdr) goto out; @@ -345,6 +344,7 @@ out_unlock: rcu_read_unlock(); out: NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_remcsum_cleanup(skb, &grc); return pp; } -- cgit v1.2.3 From 6db93ea13b7937c0523ea1f977b322f1347a5633 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 10 Feb 2015 16:30:29 -0800 Subject: udp: Set SKB_GSO_UDP_TUNNEL* in UDP GRO path Properly set GSO types and skb->encapsulation in the UDP tunnel GRO complete so that packets are properly represented for GSO. This sets SKB_GSO_UDP_TUNNEL or SKB_GSO_UDP_TUNNEL_CSUM depending on whether non-zero checksums were received, and sets SKB_GSO_TUNNEL_REMCSUM if the remote checksum option was processed. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/udp_offload.c | 13 ++++++++++++- net/ipv6/udp_offload.c | 6 +++++- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index d10f6f4ead27..4915d8284a86 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -402,6 +402,13 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff) } rcu_read_unlock(); + + if (skb->remcsum_offload) + skb_shinfo(skb)->gso_type |= SKB_GSO_TUNNEL_REMCSUM; + + skb->encapsulation = 1; + skb_set_inner_mac_header(skb, nhoff + sizeof(struct udphdr)); + return err; } @@ -410,9 +417,13 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff) const struct iphdr *iph = ip_hdr(skb); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); - if (uh->check) + if (uh->check) { + skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr, iph->daddr, 0); + } else { + skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL; + } return udp_gro_complete(skb, nhoff); } diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index a56276996b72..ab889bb16b3c 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -161,9 +161,13 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff) const struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); - if (uh->check) + if (uh->check) { + skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; uh->check = ~udp_v6_check(skb->len - nhoff, &ipv6h->saddr, &ipv6h->daddr, 0); + } else { + skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL; + } return udp_gro_complete(skb, nhoff); } -- cgit v1.2.3 From 15e2396d4e3ce23188852b74d924107982c63b42 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 10 Feb 2015 16:30:31 -0800 Subject: net: Infrastructure for CHECKSUM_PARTIAL with remote checsum offload This patch adds infrastructure so that remote checksum offload can set CHECKSUM_PARTIAL instead of calling csum_partial and writing the modfied checksum field. Add skb_remcsum_adjust_partial function to set an skb for using CHECKSUM_PARTIAL with remote checksum offload. Changed skb_remcsum_process and skb_gro_remcsum_process to take a boolean argument to indicate if checksum partial can be set or the checksum needs to be modified using the normal algorithm. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/dev.c | 1 + net/ipv4/fou.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index d030575532a2..48c6ecb18f3c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4024,6 +4024,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; NAPI_GRO_CB(skb)->udp_mark = 0; + NAPI_GRO_CB(skb)->gro_remcsum_start = 0; /* Setup for GRO checksum validation */ switch (skb->ip_summed) { diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 7fa8d36e56d4..d320f575cc62 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -75,7 +75,7 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, return NULL; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; - skb_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset); + skb_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset, true); return guehdr; } @@ -230,7 +230,7 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, } skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen, - start, offset, grc); + start, offset, grc, true); skb->remcsum_offload = 1; -- cgit v1.2.3 From fe881ef11cf0220f118816181930494d484c4883 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 10 Feb 2015 16:30:33 -0800 Subject: gue: Use checksum partial with remote checksum offload Change remote checksum handling to set checksum partial as default behavior. Added an iflink parameter to configure not using checksum partial (calling csum_partial to update checksum). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/fou.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index d320f575cc62..ff069f6597ac 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -22,14 +22,18 @@ static LIST_HEAD(fou_list); struct fou { struct socket *sock; u8 protocol; + u8 flags; u16 port; struct udp_offload udp_offloads; struct list_head list; }; +#define FOU_F_REMCSUM_NOPARTIAL BIT(0) + struct fou_cfg { u16 type; u8 protocol; + u8 flags; struct udp_port_cfg udp_config; }; @@ -64,7 +68,8 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) } static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, - void *data, size_t hdrlen, u8 ipproto) + void *data, size_t hdrlen, u8 ipproto, + bool nopartial) { __be16 *pd = data; size_t start = ntohs(pd[0]); @@ -75,7 +80,8 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, return NULL; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; - skb_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset, true); + skb_remcsum_process(skb, (void *)guehdr + hdrlen, + start, offset, nopartial); return guehdr; } @@ -136,7 +142,9 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) if (flags & GUE_PFLAG_REMCSUM) { guehdr = gue_remcsum(skb, guehdr, data + doffset, - hdrlen, guehdr->proto_ctype); + hdrlen, guehdr->proto_ctype, + !!(fou->flags & + FOU_F_REMCSUM_NOPARTIAL)); if (!guehdr) goto drop; @@ -209,7 +217,7 @@ out_unlock: static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, struct guehdr *guehdr, void *data, size_t hdrlen, u8 ipproto, - struct gro_remcsum *grc) + struct gro_remcsum *grc, bool nopartial) { __be16 *pd = data; size_t start = ntohs(pd[0]); @@ -230,7 +238,7 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, } skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen, - start, offset, grc, true); + start, offset, grc, nopartial); skb->remcsum_offload = 1; @@ -250,6 +258,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, void *data; u16 doffset = 0; int flush = 1; + struct fou *fou = container_of(uoff, struct fou, udp_offloads); struct gro_remcsum grc; skb_gro_remcsum_init(&grc); @@ -294,7 +303,9 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, if (flags & GUE_PFLAG_REMCSUM) { guehdr = gue_gro_remcsum(skb, off, guehdr, data + doffset, hdrlen, - guehdr->proto_ctype, &grc); + guehdr->proto_ctype, &grc, + !!(fou->flags & + FOU_F_REMCSUM_NOPARTIAL)); if (!guehdr) goto out; @@ -455,6 +466,7 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, sk = sock->sk; + fou->flags = cfg->flags; fou->port = cfg->udp_config.local_udp_port; /* Initial for fou type */ @@ -541,6 +553,7 @@ static struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = { [FOU_ATTR_AF] = { .type = NLA_U8, }, [FOU_ATTR_IPPROTO] = { .type = NLA_U8, }, [FOU_ATTR_TYPE] = { .type = NLA_U8, }, + [FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, }, }; static int parse_nl_config(struct genl_info *info, @@ -571,6 +584,9 @@ static int parse_nl_config(struct genl_info *info, if (info->attrs[FOU_ATTR_TYPE]) cfg->type = nla_get_u8(info->attrs[FOU_ATTR_TYPE]); + if (info->attrs[FOU_ATTR_REMCSUM_NOPARTIAL]) + cfg->flags |= FOU_F_REMCSUM_NOPARTIAL; + return 0; } -- cgit v1.2.3 From 4762fb980465463734f02c67c67f40beb8903f73 Mon Sep 17 00:00:00 2001 From: Jan Stancek Date: Wed, 11 Feb 2015 14:06:23 +0100 Subject: ipv6: fix possible deadlock in ip6_fl_purge / ip6_fl_gc Use spin_lock_bh in ip6_fl_purge() to prevent following potentially deadlock scenario between ip6_fl_purge() and ip6_fl_gc() timer. ================================= [ INFO: inconsistent lock state ] 3.19.0 #1 Not tainted --------------------------------- inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. swapper/5/0 [HC0[0]:SC1[1]:HE1:SE0] takes: (ip6_fl_lock){+.?...}, at: [] ip6_fl_gc+0x2d/0x180 {SOFTIRQ-ON-W} state was registered at: [] __lock_acquire+0x4a0/0x10b0 [] lock_acquire+0xc4/0x2b0 [] _raw_spin_lock+0x3d/0x80 [] ip6_flowlabel_net_exit+0x28/0x110 [] ops_exit_list.isra.1+0x39/0x60 [] cleanup_net+0x100/0x1e0 [] process_one_work+0x20a/0x830 [] worker_thread+0x11b/0x460 [] kthread+0x104/0x120 [] ret_from_fork+0x7c/0xb0 irq event stamp: 84640 hardirqs last enabled at (84640): [] _raw_spin_unlock_irq+0x30/0x50 hardirqs last disabled at (84639): [] _raw_spin_lock_irq+0x1f/0x80 softirqs last enabled at (84628): [] _local_bh_enable+0x21/0x50 softirqs last disabled at (84629): [] irq_exit+0x12d/0x150 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(ip6_fl_lock); lock(ip6_fl_lock); *** DEADLOCK *** Signed-off-by: Jan Stancek Signed-off-by: David S. Miller --- net/ipv6/ip6_flowlabel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 2f780cba6e12..f45d6db50a45 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -172,7 +172,7 @@ static void __net_exit ip6_fl_purge(struct net *net) { int i; - spin_lock(&ip6_fl_lock); + spin_lock_bh(&ip6_fl_lock); for (i = 0; i <= FL_HASH_MASK; i++) { struct ip6_flowlabel *fl; struct ip6_flowlabel __rcu **flp; @@ -190,7 +190,7 @@ static void __net_exit ip6_fl_purge(struct net *net) flp = &fl->next; } } - spin_unlock(&ip6_fl_lock); + spin_unlock_bh(&ip6_fl_lock); } static struct ip6_flowlabel *fl_intern(struct net *net, -- cgit v1.2.3 From 9672723973f1e70189b8409eb2da189a980481c5 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 12 Feb 2015 15:17:27 +0100 Subject: bridge: netfilter: Move sysctl-specific error code inside #ifdef MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_SYSCTL=n: net/bridge/br_netfilter.c: In function ‘br_netfilter_init’: net/bridge/br_netfilter.c:996: warning: label ‘err1’ defined but not used Move the label and the code after it inside the existing #ifdef to get rid of the warning. Signed-off-by: Geert Uytterhoeven Signed-off-by: David S. Miller --- net/bridge/br_netfilter.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 65728e0dc4ff..0ee453fad3de 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -987,15 +987,12 @@ static int __init br_netfilter_init(void) if (brnf_sysctl_header == NULL) { printk(KERN_WARNING "br_netfilter: can't register to sysctl.\n"); - ret = -ENOMEM; - goto err1; + nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + return -ENOMEM; } #endif printk(KERN_NOTICE "Bridge firewalling registered\n"); return 0; -err1: - nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); - return ret; } static void __exit br_netfilter_fini(void) -- cgit v1.2.3 From ba34e6d9d346fe4e05d7e417b9edf5140772d34c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 Feb 2015 04:47:12 -0800 Subject: tcp: make sure skb is not shared before using skb_get() IPv6 can keep a copy of SYN message using skb_get() in tcp_v6_conn_request() so that caller wont free the skb when calling kfree_skb() later. Therefore TCP fast open has to clone the skb it is queuing in child->sk_receive_queue, as all skbs consumed from receive_queue are freed using __kfree_skb() (ie assuming skb->users == 1) Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Fixes: 5b7ed0892f2af ("tcp: move fastopen functions to tcp_fastopen.c") Signed-off-by: David S. Miller --- net/ipv4/tcp_fastopen.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 53db2c309572..ea82fd492c1b 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -134,6 +134,7 @@ static bool tcp_fastopen_create_child(struct sock *sk, struct tcp_sock *tp; struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; struct sock *child; + u32 end_seq; req->num_retrans = 0; req->num_timeout = 0; @@ -185,20 +186,35 @@ static bool tcp_fastopen_create_child(struct sock *sk, /* Queue the data carried in the SYN packet. We need to first * bump skb's refcnt because the caller will attempt to free it. + * Note that IPv6 might also have used skb_get() trick + * in tcp_v6_conn_request() to keep this SYN around (treq->pktopts) + * So we need to eventually get a clone of the packet, + * before inserting it in sk_receive_queue. * * XXX (TFO) - we honor a zero-payload TFO request for now, * (any reason not to?) but no need to queue the skb since * there is no data. How about SYN+FIN? */ - if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1) { - skb = skb_get(skb); - skb_dst_drop(skb); - __skb_pull(skb, tcp_hdr(skb)->doff * 4); - skb_set_owner_r(skb, child); - __skb_queue_tail(&child->sk_receive_queue, skb); - tp->syn_data_acked = 1; + end_seq = TCP_SKB_CB(skb)->end_seq; + if (end_seq != TCP_SKB_CB(skb)->seq + 1) { + struct sk_buff *skb2; + + if (unlikely(skb_shared(skb))) + skb2 = skb_clone(skb, GFP_ATOMIC); + else + skb2 = skb_get(skb); + + if (likely(skb2)) { + skb_dst_drop(skb2); + __skb_pull(skb2, tcp_hdrlen(skb)); + skb_set_owner_r(skb2, child); + __skb_queue_tail(&child->sk_receive_queue, skb2); + tp->syn_data_acked = 1; + } else { + end_seq = TCP_SKB_CB(skb)->seq + 1; + } } - tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq; sk->sk_data_ready(sk); bh_unlock_sock(child); sock_put(child); -- cgit v1.2.3 From 26ad0b83587fb6e9a20eef388b0587ada3da5d06 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Thu, 12 Feb 2015 09:58:48 -0800 Subject: openvswitch: Fix key serialization. Fix typo where mask is used rather than key. Fixes: 74ed7ab9264("openvswitch: Add support for unique flow IDs.") Reported-by: Joe Stringer Signed-off-by: Pravin B Shelar Acked-by: Joe Stringer Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 3829328c5a76..216f20b90aa5 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1516,7 +1516,7 @@ int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb) /* Called with ovs_mutex or RCU read lock. */ int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb) { - return ovs_nla_put_key(&flow->mask->key, &flow->key, + return ovs_nla_put_key(&flow->key, &flow->key, OVS_FLOW_ATTR_KEY, false, skb); } -- cgit v1.2.3 From 3b4711757d7903ab6fa88a9e7ab8901b8227da60 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 12 Feb 2015 16:14:08 -0800 Subject: ipv6: fix ipv6_cow_metrics for non DST_HOST case ipv6_cow_metrics() currently assumes only DST_HOST routes require dynamic metrics allocation from inetpeer. The assumption breaks when ndisc discovered router with RTAX_MTU and RTAX_HOPLIMIT metric. Refer to ndisc_router_discovery() in ndisc.c and note that dst_metric_set() is called after the route is created. This patch creates the metrics array (by calling dst_cow_metrics_generic) in ipv6_cow_metrics(). Test: radvd.conf: interface qemubr0 { AdvLinkMTU 1300; AdvCurHopLimit 30; prefix fd00:face:face:face::/64 { AdvOnLink on; AdvAutonomous on; AdvRouterAddr off; }; }; Before: [root@qemu1 ~]# ip -6 r show | egrep -v unreachable fd00:face:face:face::/64 dev eth0 proto kernel metric 256 expires 27sec fe80::/64 dev eth0 proto kernel metric 256 default via fe80::74df:d0ff:fe23:8ef2 dev eth0 proto ra metric 1024 expires 27sec After: [root@qemu1 ~]# ip -6 r show | egrep -v unreachable fd00:face:face:face::/64 dev eth0 proto kernel metric 256 expires 27sec mtu 1300 fe80::/64 dev eth0 proto kernel metric 256 mtu 1300 default via fe80::74df:d0ff:fe23:8ef2 dev eth0 proto ra metric 1024 expires 27sec mtu 1300 hoplimit 30 Fixes: 8e2ec639173f325 (ipv6: don't use inetpeer to store metrics for routes.) Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 98565ce0ebcd..4688bd4d7f59 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -141,7 +141,7 @@ static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) u32 *p = NULL; if (!(rt->dst.flags & DST_HOST)) - return NULL; + return dst_cow_metrics_generic(dst, old); peer = rt6_get_peer_create(rt); if (peer) { -- cgit v1.2.3 From 4a26e453d99a06e3f1548569d7d405ce38878b78 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Sat, 14 Feb 2015 22:26:34 +0900 Subject: net/core: Fix warning while make xmldocs caused by dev.c This patch fix following warning wile make xmldocs. Warning(.//net/core/dev.c:5345): No description found for parameter 'bonding_info' Warning(.//net/core/dev.c:5345): Excess function parameter 'netdev_bonding_info' description in 'netdev_bonding_info_change' This warning starts to appear after following patch was added into Linus's tree during merger period. commit 61bd3857ff2c7daf756d49b41e6277bbdaa8f789 net/core: Add event for a change in slave state Signed-off-by: Masanari Iida Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 48c6ecb18f3c..8f9710c62e20 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5336,7 +5336,7 @@ EXPORT_SYMBOL(netdev_upper_dev_unlink); /** * netdev_bonding_info_change - Dispatch event about slave change * @dev: device - * @netdev_bonding_info: info to dispatch + * @bonding_info: info to dispatch * * Send NETDEV_BONDING_INFO to netdev notifiers with info. * The caller must hold the RTNL lock. -- cgit v1.2.3 From ca9f1fd263e14765a4c213e46940876ad78fce28 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sat, 14 Feb 2015 13:47:54 -0500 Subject: net: spelling fixes Spelling errors caught by codespell. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/filter.c | 2 +- net/core/pktgen.c | 2 +- net/ipv4/devinet.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index ec9baea10c16..f6bdc2b1ba01 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -531,7 +531,7 @@ do_pass: *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k); break; - /* Unkown instruction. */ + /* Unknown instruction. */ default: goto err; } diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 9fa25b0ea145..b4899f5b7388 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -97,7 +97,7 @@ * New xmit() return, do_div and misc clean up by Stephen Hemminger * 040923 * - * Randy Dunlap fixed u64 printk compiler waring + * Randy Dunlap fixed u64 printk compiler warning * * Remove FCS from BW calculation. Lennert Buytenhek * New time handling. Lennert Buytenhek 041213 diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index f0b4a31d7bd6..3a8985c94581 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1186,7 +1186,7 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) no_in_dev: /* Not loopback addresses on loopback should be preferred - in this case. It is importnat that lo is the first interface + in this case. It is important that lo is the first interface in dev_base list. */ for_each_netdev_rcu(net, dev) { -- cgit v1.2.3 From 7afb8886a05be68e376655539a064ec672de8a8e Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Fri, 13 Feb 2015 13:56:53 -0800 Subject: rtnetlink: call ->dellink on failure when ->newlink exists Ignacy reported that when eth0 is down and add a vlan device on top of it like: ip link add link eth0 name eth0.1 up type vlan id 1 We will get a refcount leak: unregister_netdevice: waiting for eth0.1 to become free. Usage count = 2 The problem is when rtnl_configure_link() fails in rtnl_newlink(), we simply call unregister_device(), but for stacked device like vlan, we almost do nothing when we unregister the upper device, more work is done when we unregister the lower device, so call its ->dellink(). Reported-by: Ignacy Gawedzki Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 5be499b6a2d2..ab293a3066b3 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2162,7 +2162,14 @@ replay: } err = rtnl_configure_link(dev, ifm); if (err < 0) { - unregister_netdevice(dev); + if (ops->newlink) { + LIST_HEAD(list_kill); + + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + } else { + unregister_netdevice(dev); + } goto out; } -- cgit v1.2.3 From 19334920eaf7df3f69950b040ede6c7598425a5b Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 16 Feb 2015 21:23:51 -0800 Subject: net: dsa: Set valid phy interface type If the phy interface mode is not found in devicetree, or if devicetree is not configured, of_get_phy_mode returns -ENODEV. The current code sets the phy interface mode to the return value from of_get_phy_mode without checking if it is valid. This invalid phy interface mode is passed as parameter to of_phy_connect or to phy_connect_direct. This sets the phy interface mode to the invalid value, which in turn causes problems for any code using phydev->interface. Fixes: b31f65fb4383 ("net: dsa: slave: Fix autoneg for phys on switch MDIO bus") Fixes: 0d8bcdd383b8 ("net: dsa: allow for more complex PHY setups") Cc: Florian Fainelli Cc: Andrew Lunn Signed-off-by: Guenter Roeck Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index d104ae15836f..f23deadf42a0 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -521,10 +521,13 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, struct device_node *phy_dn, *port_dn; bool phy_is_fixed = false; u32 phy_flags = 0; - int ret; + int mode, ret; port_dn = cd->port_dn[p->port]; - p->phy_interface = of_get_phy_mode(port_dn); + mode = of_get_phy_mode(port_dn); + if (mode < 0) + mode = PHY_INTERFACE_MODE_NA; + p->phy_interface = mode; phy_dn = of_parse_phandle(port_dn, "phy-handle", 0); if (of_phy_is_fixed_link(port_dn)) { @@ -559,6 +562,8 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, if (!p->phy) return -ENODEV; + /* Use already configured phy mode */ + p->phy_interface = p->phy->interface; phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link, p->phy_interface); } else { -- cgit v1.2.3