From ba1960257c5980f9b58057995ce3394bd8e48ca3 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Tue, 10 Jan 2012 15:19:54 +0200 Subject: mac80211: update oper_channel on ibss join Commit 13c40c5 ("mac80211: Add HT operation modes for IBSS") broke ibss operation by mistakenly removing the local->oper_channel update (causing ibss to start on the wrong channel). fix it. Signed-off-by: Eliad Peller Acked-by: Simon Wunderlich Signed-off-by: John W. Linville --- net/mac80211/ibss.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index b3d76b756cd5..a4643969a13b 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -106,6 +106,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, sdata->drop_unencrypted = capability & WLAN_CAPABILITY_PRIVACY ? 1 : 0; + local->oper_channel = chan; channel_type = ifibss->channel_type; if (channel_type > NL80211_CHAN_HT20 && !cfg80211_can_beacon_sec_chan(local->hw.wiphy, chan, channel_type)) -- cgit v1.2.3 From 405385f8ce7a2ed8f82e216d88b5282142e1288b Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Wed, 11 Jan 2012 13:11:50 +0200 Subject: mac80211: set bss_conf.idle when vif is connected __ieee80211_recalc_idle() iterates through the vifs, sets bss_conf.idle = true if they are disconnected, and increases "count" if they are not (which later gets evaluated in order to determine whether the device is idle). However, the loop doesn't set bss_conf.idle = false (along with increasing "count"), causing the device idle state and the vif idle state to get out of sync in some cases. Signed-off-by: Eliad Peller Signed-off-by: John W. Linville --- net/mac80211/iface.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index e47768cb8cb3..01a21c2f6ab3 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1314,6 +1314,7 @@ u32 __ieee80211_recalc_idle(struct ieee80211_local *local) continue; } /* count everything else */ + sdata->vif.bss_conf.idle = false; count++; } -- cgit v1.2.3 From b49ba04a3a0382e7314d990707c21094c410425a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 19 Jan 2012 08:20:57 -0800 Subject: iwlwifi: fix PCI-E transport "inta" race When an interrupt comes in, we read the reason bits and collect them into "trans_pcie->inta". This happens with the spinlock held. However, there's a bug resetting this variable -- that happens after the spinlock has been released. This means that it is possible for interrupts to be missed if the reset happens after some other interrupt reasons were already added to the variable. I found this by code inspection, looking for a reason that we sometimes see random commands time out. It seems possible that this causes such behaviour, but I can't say for sure right now since it happens extremely infrequently on my test systems. Cc: stable@vger.kernel.org [3.2] Signed-off-by: Johannes Berg Signed-off-by: Wey-Yi Guy Signed-off-by: John W. Linville --- drivers/net/wireless/iwlwifi/iwl-trans-pcie-rx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/iwlwifi/iwl-trans-pcie-rx.c b/drivers/net/wireless/iwlwifi/iwl-trans-pcie-rx.c index 752493f00406..65d1f05007be 100644 --- a/drivers/net/wireless/iwlwifi/iwl-trans-pcie-rx.c +++ b/drivers/net/wireless/iwlwifi/iwl-trans-pcie-rx.c @@ -972,11 +972,11 @@ void iwl_irq_tasklet(struct iwl_trans *trans) } #endif - spin_unlock_irqrestore(&trans->shrd->lock, flags); - /* saved interrupt in inta variable now we can reset trans_pcie->inta */ trans_pcie->inta = 0; + spin_unlock_irqrestore(&trans->shrd->lock, flags); + /* Now service all interrupt bits discovered above. */ if (inta & CSR_INT_BIT_HW_ERR) { IWL_ERR(trans, "Hardware error detected. Restarting.\n"); -- cgit v1.2.3 From 68315801dbf3ab2001679fd2074c9dc5dcf87dfa Mon Sep 17 00:00:00 2001 From: James Chapman Date: Wed, 25 Jan 2012 02:39:05 +0000 Subject: l2tp: l2tp_ip - fix possible oops on packet receive When a packet is received on an L2TP IP socket (L2TPv3 IP link encapsulation), the l2tpip socket's backlog_rcv function calls xfrm4_policy_check(). This is not necessary, since it was called before the skb was added to the backlog. With CONFIG_NET_NS enabled, xfrm4_policy_check() will oops if skb->dev is null, so this trivial patch removes the call. This bug has always been present, but only when CONFIG_NET_NS is enabled does it cause problems. Most users are probably using UDP encapsulation for L2TP, hence the problem has only recently surfaced. EIP: 0060:[] EFLAGS: 00210246 CPU: 0 EIP is at l2tp_ip_recvmsg+0xd4/0x2a7 EAX: 00000001 EBX: d77b5180 ECX: 00000000 EDX: 00200246 ESI: 00000000 EDI: d63cbd30 EBP: d63cbd18 ESP: d63cbcf4 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 Call Trace: [] sock_common_recvmsg+0x31/0x46 [] __sock_recvmsg_nosec+0x45/0x4d [] __sock_recvmsg+0x31/0x3b [] sock_recvmsg+0x96/0xab [] ? might_fault+0x47/0x81 [] ? might_fault+0x47/0x81 [] ? _copy_from_user+0x31/0x115 [] ? copy_from_user+0x8/0xa [] ? verify_iovec+0x3e/0x78 [] __sys_recvmsg+0x10a/0x1aa [] ? sock_recvmsg+0x0/0xab [] ? __lock_acquire+0xbdf/0xbee [] ? do_page_fault+0x193/0x375 [] ? fcheck_files+0x9b/0xca [] ? fget_light+0x2a/0x9c [] sys_recvmsg+0x2b/0x43 [] sys_socketcall+0x16d/0x1a5 [] ? trace_hardirqs_on_thunk+0xc/0x10 [] sysenter_do_call+0x12/0x38 Code: c6 05 8c ea a8 c1 01 e8 0c d4 d9 ff 85 f6 74 07 3e ff 86 80 00 00 00 b9 17 b6 2b c1 ba 01 00 00 00 b8 78 ed 48 c1 e8 23 f6 d9 ff 76 0c 68 28 e3 30 c1 68 2d 44 41 c1 e8 89 57 01 00 83 c4 0c Signed-off-by: James Chapman Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/l2tp/l2tp_ip.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index d21e7ebd91ca..55670ec3cd0f 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -393,11 +393,6 @@ static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb) { int rc; - if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) - goto drop; - - nf_reset(skb); - /* Charge it to the socket, dropping if the queue is full. */ rc = sock_queue_rcv_skb(sk, skb); if (rc < 0) -- cgit v1.2.3 From 2b05ad33e1e624e7f08b8676d270dc7725403b7e Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Wed, 25 Jan 2012 08:34:51 +0000 Subject: tcp: bind() fix autoselection to share ports The current code checks for conflicts when the application requests a specific port. If there is no conflict, then the request is granted. On the other hand, the port autoselection done by the kernel fails when all ports are bound even when there is a port with no conflict available. The fix changes port autoselection to check if there is a conflict and use it if not. Signed-off-by: Flavio Leitner Signed-off-by: Marcelo Ricardo Leitner Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 2e4e24476c4c..ecd19b5a7ee2 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -128,6 +128,11 @@ again: goto have_snum; } } + if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) { + spin_unlock(&head->lock); + snum = rover; + goto have_snum; + } goto next; } break; -- cgit v1.2.3 From fddb7b5761f104f034a0e708ece756d9b2eb2cac Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Wed, 25 Jan 2012 08:34:52 +0000 Subject: tcp: bind() optimize port allocation Port autoselection finds a port and then drop the lock, then right after that, gets the hash bucket again and lock it. Fix it to go direct. Signed-off-by: Flavio Leitner Signed-off-by: Marcelo Ricardo Leitner Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index ecd19b5a7ee2..19d66cefd7d3 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -123,15 +123,13 @@ again: smallest_size = tb->num_owners; smallest_rover = rover; if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) { - spin_unlock(&head->lock); snum = smallest_rover; - goto have_snum; + goto tb_found; } } if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) { - spin_unlock(&head->lock); snum = rover; - goto have_snum; + goto tb_found; } goto next; } -- cgit v1.2.3 From 073862ba5d249c20bd5c49fc6d904ff0e1f6a672 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Jan 2012 00:41:38 +0000 Subject: netns: fix net_alloc_generic() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a new net namespace is created, we should attach to it a "struct net_generic" with enough slots (even empty), or we can hit the following BUG_ON() : [ 200.752016] kernel BUG at include/net/netns/generic.h:40! ... [ 200.752016] [] ? get_cfcnfg+0x3a/0x180 [ 200.752016] [] ? lockdep_rtnl_is_held+0x10/0x20 [ 200.752016] [] caif_device_notify+0x2e/0x530 [ 200.752016] [] notifier_call_chain+0x67/0x110 [ 200.752016] [] raw_notifier_call_chain+0x11/0x20 [ 200.752016] [] call_netdevice_notifiers+0x32/0x60 [ 200.752016] [] register_netdevice+0x196/0x300 [ 200.752016] [] register_netdev+0x19/0x30 [ 200.752016] [] loopback_net_init+0x4a/0xa0 [ 200.752016] [] ops_init+0x42/0x180 [ 200.752016] [] setup_net+0x6b/0x100 [ 200.752016] [] copy_net_ns+0x86/0x110 [ 200.752016] [] create_new_namespaces+0xd9/0x190 net_alloc_generic() should take into account the maximum index into the ptr array, as a subsystem might use net_generic() anytime. This also reduces number of reallocations in net_assign_generic() Reported-by: Sasha Levin Tested-by: Sasha Levin Signed-off-by: Eric Dumazet Cc: Sjur Brændeland Cc: Eric W. Biederman Cc: Pavel Emelyanov Signed-off-by: David S. Miller --- net/core/net_namespace.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index aefcd7acbffa..0e950fda9a0a 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -30,6 +30,20 @@ EXPORT_SYMBOL(init_net); #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ +static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; + +static struct net_generic *net_alloc_generic(void) +{ + struct net_generic *ng; + size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]); + + ng = kzalloc(generic_size, GFP_KERNEL); + if (ng) + ng->len = max_gen_ptrs; + + return ng; +} + static int net_assign_generic(struct net *net, int id, void *data) { struct net_generic *ng, *old_ng; @@ -43,8 +57,7 @@ static int net_assign_generic(struct net *net, int id, void *data) if (old_ng->len >= id) goto assign; - ng = kzalloc(sizeof(struct net_generic) + - id * sizeof(void *), GFP_KERNEL); + ng = net_alloc_generic(); if (ng == NULL) return -ENOMEM; @@ -59,7 +72,6 @@ static int net_assign_generic(struct net *net, int id, void *data) * the old copy for kfree after a grace period. */ - ng->len = id; memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*)); rcu_assign_pointer(net->gen, ng); @@ -161,18 +173,6 @@ out_undo: goto out; } -static struct net_generic *net_alloc_generic(void) -{ - struct net_generic *ng; - size_t generic_size = sizeof(struct net_generic) + - INITIAL_NET_GEN_PTRS * sizeof(void *); - - ng = kzalloc(generic_size, GFP_KERNEL); - if (ng) - ng->len = INITIAL_NET_GEN_PTRS; - - return ng; -} #ifdef CONFIG_NET_NS static struct kmem_cache *net_cachep; @@ -483,6 +483,7 @@ again: } return error; } + max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id); } error = __register_pernet_operations(list, ops); if (error) { -- cgit v1.2.3 From 40206dd98f066d596d4280558fc5f798165861c7 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Thu, 26 Jan 2012 07:23:23 +0000 Subject: xen-netfront: correct MAX_TX_TARGET calculation. Signed-off-by: Wei Liu Signed-off-by: David S. Miller --- drivers/net/xen-netfront.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index fa679057630f..698b905058dd 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -68,7 +68,7 @@ struct netfront_cb { #define NET_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE) #define NET_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE) -#define TX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256) +#define TX_MAX_TARGET min_t(int, NET_TX_RING_SIZE, 256) struct netfront_stats { u64 rx_packets; -- cgit v1.2.3 From f2b3ee9e4200b32d113b1bd3c93f9a836c97357c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 26 Jan 2012 10:34:35 +0000 Subject: ipv6: Fix ip_gre lockless xmits. Tunnel devices set NETIF_F_LLTX to bypass HARD_TX_LOCK. Sit and ipip set this unconditionally in ops->setup, but gre enables it conditionally after parameter passing in ops->newlink. This is not called during tunnel setup as below, however, so GRE tunnels are still taking the lock. modprobe ip_gre ip tunnel add test0 mode gre remote 10.5.1.1 dev lo ip link set test0 up ip addr add 10.6.0.1 dev test0 # cat /sys/class/net/test0/features # $DIR/test_tunnel_xmit 10 10.5.2.1 ip route add 10.5.2.0/24 dev test0 ip tunnel del test0 The newlink callback is only called in rtnl_netlink, and only if the device is new, as it calls register_netdevice internally. Gre tunnels are created at 'ip tunnel add' with ioctl SIOCADDTUNNEL, which calls ipgre_tunnel_locate, which calls register_netdev. rtnl_newlink is called at 'ip link set', but skips ops->newlink and the device is up with locking still enabled. The equivalent ipip tunnel works fine, btw (just substitute 'method gre' for 'method ipip'). On kernels before /sys/class/net/*/features was removed [1], the first commented out line returns 0x6000 with method gre, which indicates that NETIF_F_LLTX (0x1000) is not set. With ipip, it reports 0x7000. This test cannot be used on recent kernels where the sysfs file is removed (and ETHTOOL_GFEATURES does not currently work for tunnel devices, because they lack dev->ethtool_ops). The second commented out line calls a simple transmission test [2] that sends on 24 cores at maximum rate. Results of a single run: ipip: 19,372,306 gre before patch: 4,839,753 gre after patch: 19,133,873 This patch replicates the condition check in ipgre_newlink to ipgre_tunnel_locate. It works for me, both with oseq on and off. This is the first time I looked at rtnetlink and iproute2 code, though, so someone more knowledgeable should probably check the patch. Thanks. The tail of both functions is now identical, by the way. To avoid code duplication, I'll be happy to rework this and merge the two. [1] http://patchwork.ozlabs.org/patch/104610/ [2] http://kernel.googlecode.com/files/xmit_udp_parallel.c Signed-off-by: Willem de Bruijn Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 2b53a1f7abf6..6b3ca5ba4450 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -422,6 +422,10 @@ static struct ip_tunnel *ipgre_tunnel_locate(struct net *net, if (register_netdevice(dev) < 0) goto failed_free; + /* Can use a lockless transmit, unless we generate output sequences */ + if (!(nt->parms.o_flags & GRE_SEQ)) + dev->features |= NETIF_F_LLTX; + dev_hold(dev); ipgre_tunnel_link(ign, nt); return nt; -- cgit v1.2.3 From f18da14565819ba43b8321237e2426a2914cc2ef Mon Sep 17 00:00:00 2001 From: Stefan Gula Date: Thu, 26 Jan 2012 11:01:06 +0000 Subject: net: RTNETLINK adjusting values of min_ifinfo_dump_size Setting link parameters on a netdevice changes the value of if_nlmsg_size(), therefore it is necessary to recalculate min_ifinfo_dump_size. Signed-off-by: Stefan Gula Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f16444bc6cbb..65aebd450027 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1509,6 +1509,9 @@ errout: if (send_addr_notify) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + min_ifinfo_dump_size = max_t(u16, if_nlmsg_size(dev), + min_ifinfo_dump_size); + return err; } -- cgit v1.2.3 From 9018e93948c6f8f95fbcc9fa05f6c403d6adb406 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Thu, 26 Jan 2012 12:09:28 +0000 Subject: net: explicitly add jump_label.h header to sock.h Commit 36a1211970193ce215de50ed1e4e1272bc814df1 removed linux/module.h include statement from one of the headers that end up in net/sock.h. It was providing us with static_branch() definition implicitly, so after its removal the build got broken. To fix this, and avoid having this happening in the future, let me do the right thing and include linux/jump_label.h explicitly in sock.h. Signed-off-by: Glauber Costa Reported-by: Randy Dunlap CC: David S. Miller Signed-off-by: David S. Miller --- include/net/sock.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/net/sock.h b/include/net/sock.h index 4c69ac165e6b..91c1c8baf020 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -55,6 +55,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 5ee4433efe99b9f39f6eff5052a177bbcfe72cea Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 26 Jan 2012 14:02:55 +0000 Subject: netns: Fail conspicously if someone uses net_generic at an inappropriate time. By definition net_generic should never be called when it can return NULL. Fail conspicously with a BUG_ON to make it clear when people mess up that a NULL return should never happen. Recently there was a bug in the CAIF subsystem where it was registered with register_pernet_device instead of register_pernet_subsys. It was erroneously concluded that net_generic could validly return NULL and that net_assign_generic was buggy (when it was just inefficient). Hopefully this BUG_ON will prevent people to coming to similar erroneous conclusions in the futrue. Signed-off-by: Eric W. Biederman Tested-by: Sasha Levin Signed-off-by: David S. Miller --- include/net/netns/generic.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h index 3419bf5cd154..d55f43443335 100644 --- a/include/net/netns/generic.h +++ b/include/net/netns/generic.h @@ -41,6 +41,7 @@ static inline void *net_generic(const struct net *net, int id) ptr = ng->ptr[id - 1]; rcu_read_unlock(); + BUG_ON(!ptr); return ptr; } #endif -- cgit v1.2.3 From 8a8ee9aff6c3077dd9c2c7a77478e8ed362b96c6 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 26 Jan 2012 14:04:53 +0000 Subject: net caif: Register properly as a pernet subsystem. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit caif is a subsystem and as such it needs to register with register_pernet_subsys instead of register_pernet_device. Among other problems using register_pernet_device was resulting in net_generic being called before the caif_net structure was allocated. Which has been causing net_generic to fail with either BUG_ON's or by return NULL pointers. A more ugly problem that could be caused is packets in flight why the subsystem is shutting down. To remove confusion also remove the cruft cause by inappropriately trying to fix this bug. With the aid of the previous patch I have tested this patch and confirmed that using register_pernet_subsys makes the failure go away as it should. Signed-off-by: Eric W. Biederman Acked-by: Sjur Brændeland Tested-by: Sasha Levin Signed-off-by: David S. Miller --- net/caif/caif_dev.c | 22 ++-------------------- net/caif/cfcnfg.c | 1 - 2 files changed, 2 insertions(+), 21 deletions(-) diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 673728add60b..82c57069415f 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -59,8 +59,6 @@ struct cfcnfg *get_cfcnfg(struct net *net) { struct caif_net *caifn; caifn = net_generic(net, caif_net_id); - if (!caifn) - return NULL; return caifn->cfg; } EXPORT_SYMBOL(get_cfcnfg); @@ -69,8 +67,6 @@ static struct caif_device_entry_list *caif_device_list(struct net *net) { struct caif_net *caifn; caifn = net_generic(net, caif_net_id); - if (!caifn) - return NULL; return &caifn->caifdevs; } @@ -99,8 +95,6 @@ static struct caif_device_entry *caif_device_alloc(struct net_device *dev) struct caif_device_entry *caifd; caifdevs = caif_device_list(dev_net(dev)); - if (!caifdevs) - return NULL; caifd = kzalloc(sizeof(*caifd), GFP_KERNEL); if (!caifd) @@ -120,8 +114,6 @@ static struct caif_device_entry *caif_get(struct net_device *dev) struct caif_device_entry_list *caifdevs = caif_device_list(dev_net(dev)); struct caif_device_entry *caifd; - if (!caifdevs) - return NULL; list_for_each_entry_rcu(caifd, &caifdevs->list, list) { if (caifd->netdev == dev) @@ -321,8 +313,6 @@ void caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev, struct caif_device_entry_list *caifdevs; caifdevs = caif_device_list(dev_net(dev)); - if (!cfg || !caifdevs) - return; caifd = caif_device_alloc(dev); if (!caifd) return; @@ -374,8 +364,6 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, cfg = get_cfcnfg(dev_net(dev)); caifdevs = caif_device_list(dev_net(dev)); - if (!cfg || !caifdevs) - return 0; caifd = caif_get(dev); if (caifd == NULL && dev->type != ARPHRD_CAIF) @@ -507,9 +495,6 @@ static struct notifier_block caif_device_notifier = { static int caif_init_net(struct net *net) { struct caif_net *caifn = net_generic(net, caif_net_id); - if (WARN_ON(!caifn)) - return -EINVAL; - INIT_LIST_HEAD(&caifn->caifdevs.list); mutex_init(&caifn->caifdevs.lock); @@ -527,9 +512,6 @@ static void caif_exit_net(struct net *net) caif_device_list(net); struct cfcnfg *cfg = get_cfcnfg(net); - if (!cfg || !caifdevs) - return; - rtnl_lock(); mutex_lock(&caifdevs->lock); @@ -569,7 +551,7 @@ static int __init caif_device_init(void) { int result; - result = register_pernet_device(&caif_net_ops); + result = register_pernet_subsys(&caif_net_ops); if (result) return result; @@ -582,7 +564,7 @@ static int __init caif_device_init(void) static void __exit caif_device_exit(void) { - unregister_pernet_device(&caif_net_ops); + unregister_pernet_subsys(&caif_net_ops); unregister_netdevice_notifier(&caif_device_notifier); dev_remove_pack(&caif_packet_type); } diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c index 598aafb4cb51..ba9cfd47778a 100644 --- a/net/caif/cfcnfg.c +++ b/net/caif/cfcnfg.c @@ -309,7 +309,6 @@ int caif_connect_client(struct net *net, struct caif_connect_request *conn_req, int err; struct cfctrl_link_param param; struct cfcnfg *cfg = get_cfcnfg(net); - caif_assert(cfg != NULL); rcu_read_lock(); err = caif_connect_req_to_link_param(cfg, conn_req, ¶m); -- cgit v1.2.3 From 4acb41903b2f99f3dffd4c3df9acc84ca5942cb2 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 30 Jan 2012 01:20:17 +0000 Subject: net/tcp: Fix tcp memory limits initialization when !CONFIG_SYSCTL sysctl_tcp_mem() initialization was moved to sysctl_tcp_ipv4.c in commit 3dc43e3e4d0b52197d3205214fe8f162f9e0c334, since it became a per-ns value. That code, however, will never run when CONFIG_SYSCTL is disabled, leading to bogus values on those fields - causing hung TCP sockets. This patch fixes it by keeping an initialization code in tcp_init(). It will be overwritten by the first net namespace init if CONFIG_SYSCTL is compiled in, and do the right thing if it is compiled out. It is also named properly as tcp_init_mem(), to properly signal its non-sysctl side effect on TCP limits. Reported-by: Ingo Molnar Signed-off-by: Glauber Costa Cc: David S. Miller Link: http://lkml.kernel.org/r/4F22D05A.8030604@parallels.com [ renamed the function, tidied up the changelog a bit ] Signed-off-by: Ingo Molnar Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ net/ipv4/sysctl_net_ipv4.c | 1 + net/ipv4/tcp.c | 16 +++++++++++++--- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 0118ea999f67..d49db0113a06 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -311,6 +311,8 @@ extern struct proto tcp_prot; #define TCP_ADD_STATS_USER(net, field, val) SNMP_ADD_STATS_USER((net)->mib.tcp_statistics, field, val) #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val) +extern void tcp_init_mem(struct net *net); + extern void tcp_v4_err(struct sk_buff *skb, u32); extern void tcp_shutdown (struct sock *sk, int how); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4aa7e9dc0cbb..4cb9cd2f2c39 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -814,6 +814,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) net->ipv4.sysctl_rt_cache_rebuild_count = 4; + tcp_init_mem(net); limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9bcdec3ad772..06373b4a449a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3216,6 +3216,16 @@ static int __init set_thash_entries(char *str) } __setup("thash_entries=", set_thash_entries); +void tcp_init_mem(struct net *net) +{ + /* Set per-socket limits to no more than 1/128 the pressure threshold */ + unsigned long limit = nr_free_buffer_pages() / 8; + limit = max(limit, 128UL); + net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; + net->ipv4.sysctl_tcp_mem[1] = limit; + net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2; +} + void __init tcp_init(void) { struct sk_buff *skb = NULL; @@ -3276,9 +3286,9 @@ void __init tcp_init(void) sysctl_tcp_max_orphans = cnt / 2; sysctl_max_syn_backlog = max(128, cnt / 256); - /* Set per-socket limits to no more than 1/128 the pressure threshold */ - limit = ((unsigned long)init_net.ipv4.sysctl_tcp_mem[1]) - << (PAGE_SHIFT - 7); + tcp_init_mem(&init_net); + limit = nr_free_buffer_pages() / 8; + limit = max(limit, 128UL); max_share = min(4UL*1024*1024, limit); sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; -- cgit v1.2.3 From 5b35e1e6e9ca651e6b291c96d1106043c9af314a Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sat, 28 Jan 2012 17:29:46 +0000 Subject: tcp: fix tcp_trim_head() to adjust segment count with skb MSS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit fixes tcp_trim_head() to recalculate the number of segments in the skb with the skb's existing MSS, so trimming the head causes the skb segment count to be monotonically non-increasing - it should stay the same or go down, but not increase. Previously tcp_trim_head() used the current MSS of the connection. But if there was a decrease in MSS between original transmission and ACK (e.g. due to PMTUD), this could cause tcp_trim_head() to counter-intuitively increase the segment count when trimming bytes off the head of an skb. This violated assumptions in tcp_tso_acked() that tcp_trim_head() only decreases the packet count, so that packets_acked in tcp_tso_acked() could underflow, leading tcp_clean_rtx_queue() to pass u32 pkts_acked values as large as 0xffffffff to ca_ops->pkts_acked(). As an aside, if tcp_trim_head() had really wanted the skb to reflect the current MSS, it should have called tcp_set_skb_tso_segs() unconditionally, since a decrease in MSS would mean that a single-packet skb should now be sliced into multiple segments. Signed-off-by: Neal Cardwell Acked-by: Nandita Dukkipati Acked-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8c8de2780c7a..4ff3b6dc74fc 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1141,11 +1141,9 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) sk_mem_uncharge(sk, len); sock_set_flag(sk, SOCK_QUEUE_SHRUNK); - /* Any change of skb->len requires recalculation of tso - * factor and mss. - */ + /* Any change of skb->len requires recalculation of tso factor. */ if (tcp_skb_pcount(skb) > 1) - tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk)); + tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); return 0; } -- cgit v1.2.3 From 6f01fd6e6f6809061b56e78f1e8d143099716d70 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 28 Jan 2012 16:11:03 +0000 Subject: af_unix: fix EPOLLET regression for stream sockets Commit 0884d7aa24 (AF_UNIX: Fix poll blocking problem when reading from a stream socket) added a regression for epoll() in Edge Triggered mode (EPOLLET) Appropriate fix is to use skb_peek()/skb_unlink() instead of skb_dequeue(), and only call skb_unlink() when skb is fully consumed. This remove the need to requeue a partial skb into sk_receive_queue head and the extra sk->sk_data_ready() calls that added the regression. This is safe because once skb is given to sk_receive_queue, it is not modified by a writer, and readers are serialized by u->readlock mutex. This also reduce number of spinlock acquisition for small reads or MSG_PEEK users so should improve overall performance. Reported-by: Nick Mathewson Signed-off-by: Eric Dumazet Cc: Alexey Moiseytsev Signed-off-by: David S. Miller --- net/unix/af_unix.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index aad8fb699989..85d3bb7490aa 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1918,7 +1918,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, struct sk_buff *skb; unix_state_lock(sk); - skb = skb_dequeue(&sk->sk_receive_queue); + skb = skb_peek(&sk->sk_receive_queue); if (skb == NULL) { unix_sk(sk)->recursion_level = 0; if (copied >= target) @@ -1958,11 +1958,8 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, if (check_creds) { /* Never glue messages from different writers */ if ((UNIXCB(skb).pid != siocb->scm->pid) || - (UNIXCB(skb).cred != siocb->scm->cred)) { - skb_queue_head(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); + (UNIXCB(skb).cred != siocb->scm->cred)) break; - } } else { /* Copy credentials */ scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred); @@ -1977,8 +1974,6 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, chunk = min_t(unsigned int, skb->len, size); if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) { - skb_queue_head(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); if (copied == 0) copied = -EFAULT; break; @@ -1993,13 +1988,10 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, if (UNIXCB(skb).fp) unix_detach_fds(siocb->scm, skb); - /* put the skb back if we didn't use it up.. */ - if (skb->len) { - skb_queue_head(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); + if (skb->len) break; - } + skb_unlink(skb, &sk->sk_receive_queue); consume_skb(skb); if (siocb->scm->fp) @@ -2010,9 +2002,6 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, if (UNIXCB(skb).fp) siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp); - /* put message back and return */ - skb_queue_head(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); break; } } while (size); -- cgit v1.2.3