From ba1960257c5980f9b58057995ce3394bd8e48ca3 Mon Sep 17 00:00:00 2001
From: Eliad Peller <eliad@wizery.com>
Date: Tue, 10 Jan 2012 15:19:54 +0200
Subject: mac80211: update oper_channel on ibss join

Commit 13c40c5 ("mac80211: Add HT operation modes for IBSS") broke
ibss operation by mistakenly removing the local->oper_channel
update (causing ibss to start on the wrong channel). fix it.

Signed-off-by: Eliad Peller <eliad@wizery.com>
Acked-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/ibss.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index b3d76b756cd5..a4643969a13b 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -106,6 +106,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 
 	sdata->drop_unencrypted = capability & WLAN_CAPABILITY_PRIVACY ? 1 : 0;
 
+	local->oper_channel = chan;
 	channel_type = ifibss->channel_type;
 	if (channel_type > NL80211_CHAN_HT20 &&
 	    !cfg80211_can_beacon_sec_chan(local->hw.wiphy, chan, channel_type))
-- 
cgit v1.2.3


From 405385f8ce7a2ed8f82e216d88b5282142e1288b Mon Sep 17 00:00:00 2001
From: Eliad Peller <eliad@wizery.com>
Date: Wed, 11 Jan 2012 13:11:50 +0200
Subject: mac80211: set bss_conf.idle when vif is connected

__ieee80211_recalc_idle() iterates through the vifs,
sets bss_conf.idle = true if they are disconnected,
and increases "count" if they are not (which later
gets evaluated in order to determine whether the
device is idle).

However, the loop doesn't set bss_conf.idle = false
(along with increasing "count"), causing the device
idle state and the vif idle state to get out of sync
in some cases.

Signed-off-by: Eliad Peller <eliad@wizery.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/iface.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index e47768cb8cb3..01a21c2f6ab3 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1314,6 +1314,7 @@ u32 __ieee80211_recalc_idle(struct ieee80211_local *local)
 			continue;
 		}
 		/* count everything else */
+		sdata->vif.bss_conf.idle = false;
 		count++;
 	}
 
-- 
cgit v1.2.3


From b49ba04a3a0382e7314d990707c21094c410425a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 19 Jan 2012 08:20:57 -0800
Subject: iwlwifi: fix PCI-E transport "inta" race

When an interrupt comes in, we read the reason
bits and collect them into "trans_pcie->inta".
This happens with the spinlock held. However,
there's a bug resetting this variable -- that
happens after the spinlock has been released.
This means that it is possible for interrupts
to be missed if the reset happens after some
other interrupt reasons were already added to
the variable.

I found this by code inspection, looking for a
reason that we sometimes see random commands
time out. It seems possible that this causes
such behaviour, but I can't say for sure right
now since it happens extremely infrequently on
my test systems.

Cc: stable@vger.kernel.org [3.2]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Wey-Yi Guy <wey-yi.w.guy@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/iwlwifi/iwl-trans-pcie-rx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/iwlwifi/iwl-trans-pcie-rx.c b/drivers/net/wireless/iwlwifi/iwl-trans-pcie-rx.c
index 752493f00406..65d1f05007be 100644
--- a/drivers/net/wireless/iwlwifi/iwl-trans-pcie-rx.c
+++ b/drivers/net/wireless/iwlwifi/iwl-trans-pcie-rx.c
@@ -972,11 +972,11 @@ void iwl_irq_tasklet(struct iwl_trans *trans)
 	}
 #endif
 
-	spin_unlock_irqrestore(&trans->shrd->lock, flags);
-
 	/* saved interrupt in inta variable now we can reset trans_pcie->inta */
 	trans_pcie->inta = 0;
 
+	spin_unlock_irqrestore(&trans->shrd->lock, flags);
+
 	/* Now service all interrupt bits discovered above. */
 	if (inta & CSR_INT_BIT_HW_ERR) {
 		IWL_ERR(trans, "Hardware error detected.  Restarting.\n");
-- 
cgit v1.2.3


From 68315801dbf3ab2001679fd2074c9dc5dcf87dfa Mon Sep 17 00:00:00 2001
From: James Chapman <jchapman@katalix.com>
Date: Wed, 25 Jan 2012 02:39:05 +0000
Subject: l2tp: l2tp_ip - fix possible oops on packet receive

When a packet is received on an L2TP IP socket (L2TPv3 IP link
encapsulation), the l2tpip socket's backlog_rcv function calls
xfrm4_policy_check(). This is not necessary, since it was called
before the skb was added to the backlog. With CONFIG_NET_NS enabled,
xfrm4_policy_check() will oops if skb->dev is null, so this trivial
patch removes the call.

This bug has always been present, but only when CONFIG_NET_NS is
enabled does it cause problems. Most users are probably using UDP
encapsulation for L2TP, hence the problem has only recently
surfaced.

EIP: 0060:[<c12bb62b>] EFLAGS: 00210246 CPU: 0
EIP is at l2tp_ip_recvmsg+0xd4/0x2a7
EAX: 00000001 EBX: d77b5180 ECX: 00000000 EDX: 00200246
ESI: 00000000 EDI: d63cbd30 EBP: d63cbd18 ESP: d63cbcf4
 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
Call Trace:
 [<c1218568>] sock_common_recvmsg+0x31/0x46
 [<c1215c92>] __sock_recvmsg_nosec+0x45/0x4d
 [<c12163a1>] __sock_recvmsg+0x31/0x3b
 [<c1216828>] sock_recvmsg+0x96/0xab
 [<c10b2693>] ? might_fault+0x47/0x81
 [<c10b2693>] ? might_fault+0x47/0x81
 [<c1167fd0>] ? _copy_from_user+0x31/0x115
 [<c121e8c8>] ? copy_from_user+0x8/0xa
 [<c121ebd6>] ? verify_iovec+0x3e/0x78
 [<c1216604>] __sys_recvmsg+0x10a/0x1aa
 [<c1216792>] ? sock_recvmsg+0x0/0xab
 [<c105a99b>] ? __lock_acquire+0xbdf/0xbee
 [<c12d5a99>] ? do_page_fault+0x193/0x375
 [<c10d1200>] ? fcheck_files+0x9b/0xca
 [<c10d1259>] ? fget_light+0x2a/0x9c
 [<c1216bbb>] sys_recvmsg+0x2b/0x43
 [<c1218145>] sys_socketcall+0x16d/0x1a5
 [<c11679f0>] ? trace_hardirqs_on_thunk+0xc/0x10
 [<c100305f>] sysenter_do_call+0x12/0x38
Code: c6 05 8c ea a8 c1 01 e8 0c d4 d9 ff 85 f6 74 07 3e ff 86 80 00 00 00 b9 17 b6 2b c1 ba 01 00 00 00 b8 78 ed 48 c1 e8 23 f6 d9 ff <ff> 76 0c 68 28 e3 30 c1 68 2d 44 41 c1 e8 89 57 01 00 83 c4 0c

Signed-off-by: James Chapman <jchapman@katalix.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_ip.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index d21e7ebd91ca..55670ec3cd0f 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -393,11 +393,6 @@ static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb)
 {
 	int rc;
 
-	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
-		goto drop;
-
-	nf_reset(skb);
-
 	/* Charge it to the socket, dropping if the queue is full. */
 	rc = sock_queue_rcv_skb(sk, skb);
 	if (rc < 0)
-- 
cgit v1.2.3


From 2b05ad33e1e624e7f08b8676d270dc7725403b7e Mon Sep 17 00:00:00 2001
From: Flavio Leitner <fbl@redhat.com>
Date: Wed, 25 Jan 2012 08:34:51 +0000
Subject: tcp: bind() fix autoselection to share ports

The current code checks for conflicts when the application
requests a specific port.  If there is no conflict, then
the request is granted.

On the other hand, the port autoselection done by the kernel
fails when all ports are bound even when there is a port
with no conflict available.

The fix changes port autoselection to check if there is a
conflict and use it if not.

Signed-off-by: Flavio Leitner <fbl@redhat.com>
Signed-off-by: Marcelo Ricardo Leitner <mleitner@redhat.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_connection_sock.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 2e4e24476c4c..ecd19b5a7ee2 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -128,6 +128,11 @@ again:
 							goto have_snum;
 						}
 					}
+					if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
+						spin_unlock(&head->lock);
+						snum = rover;
+						goto have_snum;
+					}
 					goto next;
 				}
 			break;
-- 
cgit v1.2.3


From fddb7b5761f104f034a0e708ece756d9b2eb2cac Mon Sep 17 00:00:00 2001
From: Flavio Leitner <fbl@redhat.com>
Date: Wed, 25 Jan 2012 08:34:52 +0000
Subject: tcp: bind() optimize port allocation

Port autoselection finds a port and then drop the lock,
then right after that, gets the hash bucket again and lock it.

Fix it to go direct.

Signed-off-by: Flavio Leitner <fbl@redhat.com>
Signed-off-by: Marcelo Ricardo Leitner <mleitner@redhat.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_connection_sock.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index ecd19b5a7ee2..19d66cefd7d3 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -123,15 +123,13 @@ again:
 						smallest_size = tb->num_owners;
 						smallest_rover = rover;
 						if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) {
-							spin_unlock(&head->lock);
 							snum = smallest_rover;
-							goto have_snum;
+							goto tb_found;
 						}
 					}
 					if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
-						spin_unlock(&head->lock);
 						snum = rover;
-						goto have_snum;
+						goto tb_found;
 					}
 					goto next;
 				}
-- 
cgit v1.2.3


From 073862ba5d249c20bd5c49fc6d904ff0e1f6a672 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 26 Jan 2012 00:41:38 +0000
Subject: netns: fix net_alloc_generic()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a new net namespace is created, we should attach to it a "struct
net_generic" with enough slots (even empty), or we can hit the following
BUG_ON() :

[  200.752016] kernel BUG at include/net/netns/generic.h:40!
...
[  200.752016]  [<ffffffff825c3cea>] ? get_cfcnfg+0x3a/0x180
[  200.752016]  [<ffffffff821cf0b0>] ? lockdep_rtnl_is_held+0x10/0x20
[  200.752016]  [<ffffffff825c41be>] caif_device_notify+0x2e/0x530
[  200.752016]  [<ffffffff810d61b7>] notifier_call_chain+0x67/0x110
[  200.752016]  [<ffffffff810d67c1>] raw_notifier_call_chain+0x11/0x20
[  200.752016]  [<ffffffff821bae82>] call_netdevice_notifiers+0x32/0x60
[  200.752016]  [<ffffffff821c2b26>] register_netdevice+0x196/0x300
[  200.752016]  [<ffffffff821c2ca9>] register_netdev+0x19/0x30
[  200.752016]  [<ffffffff81c1c67a>] loopback_net_init+0x4a/0xa0
[  200.752016]  [<ffffffff821b5e62>] ops_init+0x42/0x180
[  200.752016]  [<ffffffff821b600b>] setup_net+0x6b/0x100
[  200.752016]  [<ffffffff821b6466>] copy_net_ns+0x86/0x110
[  200.752016]  [<ffffffff810d5789>] create_new_namespaces+0xd9/0x190

net_alloc_generic() should take into account the maximum index into the
ptr array, as a subsystem might use net_generic() anytime.

This also reduces number of reallocations in net_assign_generic()

Reported-by: Sasha Levin <levinsasha928@gmail.com>
Tested-by: Sasha Levin <levinsasha928@gmail.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Sjur Brændeland <sjur.brandeland@stericsson.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index aefcd7acbffa..0e950fda9a0a 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -30,6 +30,20 @@ EXPORT_SYMBOL(init_net);
 
 #define INITIAL_NET_GEN_PTRS	13 /* +1 for len +2 for rcu_head */
 
+static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
+
+static struct net_generic *net_alloc_generic(void)
+{
+	struct net_generic *ng;
+	size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
+
+	ng = kzalloc(generic_size, GFP_KERNEL);
+	if (ng)
+		ng->len = max_gen_ptrs;
+
+	return ng;
+}
+
 static int net_assign_generic(struct net *net, int id, void *data)
 {
 	struct net_generic *ng, *old_ng;
@@ -43,8 +57,7 @@ static int net_assign_generic(struct net *net, int id, void *data)
 	if (old_ng->len >= id)
 		goto assign;
 
-	ng = kzalloc(sizeof(struct net_generic) +
-			id * sizeof(void *), GFP_KERNEL);
+	ng = net_alloc_generic();
 	if (ng == NULL)
 		return -ENOMEM;
 
@@ -59,7 +72,6 @@ static int net_assign_generic(struct net *net, int id, void *data)
 	 * the old copy for kfree after a grace period.
 	 */
 
-	ng->len = id;
 	memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));
 
 	rcu_assign_pointer(net->gen, ng);
@@ -161,18 +173,6 @@ out_undo:
 	goto out;
 }
 
-static struct net_generic *net_alloc_generic(void)
-{
-	struct net_generic *ng;
-	size_t generic_size = sizeof(struct net_generic) +
-		INITIAL_NET_GEN_PTRS * sizeof(void *);
-
-	ng = kzalloc(generic_size, GFP_KERNEL);
-	if (ng)
-		ng->len = INITIAL_NET_GEN_PTRS;
-
-	return ng;
-}
 
 #ifdef CONFIG_NET_NS
 static struct kmem_cache *net_cachep;
@@ -483,6 +483,7 @@ again:
 			}
 			return error;
 		}
+		max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id);
 	}
 	error = __register_pernet_operations(list, ops);
 	if (error) {
-- 
cgit v1.2.3


From 40206dd98f066d596d4280558fc5f798165861c7 Mon Sep 17 00:00:00 2001
From: Wei Liu <wei.liu2@citrix.com>
Date: Thu, 26 Jan 2012 07:23:23 +0000
Subject: xen-netfront: correct MAX_TX_TARGET calculation.

Signed-off-by: Wei Liu <wei.liu2@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netfront.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index fa679057630f..698b905058dd 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -68,7 +68,7 @@ struct netfront_cb {
 
 #define NET_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
 #define NET_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE)
-#define TX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
+#define TX_MAX_TARGET min_t(int, NET_TX_RING_SIZE, 256)
 
 struct netfront_stats {
 	u64			rx_packets;
-- 
cgit v1.2.3


From f2b3ee9e4200b32d113b1bd3c93f9a836c97357c Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 26 Jan 2012 10:34:35 +0000
Subject: ipv6: Fix ip_gre lockless xmits.

Tunnel devices set NETIF_F_LLTX to bypass HARD_TX_LOCK.  Sit and
ipip set this unconditionally in ops->setup, but gre enables it
conditionally after parameter passing in ops->newlink. This is
not called during tunnel setup as below, however, so GRE tunnels are
still taking the lock.

modprobe ip_gre
ip tunnel add test0 mode gre remote 10.5.1.1 dev lo
ip link set test0 up
ip addr add 10.6.0.1 dev test0
 # cat /sys/class/net/test0/features
 # $DIR/test_tunnel_xmit 10 10.5.2.1
ip route add 10.5.2.0/24 dev test0
ip tunnel del test0

The newlink callback is only called in rtnl_netlink, and only if
the device is new, as it calls register_netdevice internally. Gre
tunnels are created at 'ip tunnel add' with ioctl SIOCADDTUNNEL,
which calls ipgre_tunnel_locate, which calls register_netdev.
rtnl_newlink is called at 'ip link set', but skips ops->newlink
and the device is up with locking still enabled. The equivalent
ipip tunnel works fine, btw (just substitute 'method gre' for
'method ipip').

On kernels before /sys/class/net/*/features was removed [1],
the first commented out line returns 0x6000 with method gre,
which indicates that NETIF_F_LLTX (0x1000) is not set. With ipip,
it reports 0x7000. This test cannot be used on recent kernels where
the sysfs file is removed (and ETHTOOL_GFEATURES does not currently
work for tunnel devices, because they lack dev->ethtool_ops).

The second commented out line calls a simple transmission test [2]
that sends on 24 cores at maximum rate. Results of a single run:

ipip:			19,372,306
gre before patch:	 4,839,753
gre after patch:	19,133,873

This patch replicates the condition check in ipgre_newlink to
ipgre_tunnel_locate. It works for me, both with oseq on and off.
This is the first time I looked at rtnetlink and iproute2 code,
though, so someone more knowledgeable should probably check the
patch. Thanks.

The tail of both functions is now identical, by the way. To avoid
code duplication, I'll be happy to rework this and merge the two.

[1] http://patchwork.ozlabs.org/patch/104610/
[2] http://kernel.googlecode.com/files/xmit_udp_parallel.c

Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 2b53a1f7abf6..6b3ca5ba4450 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -422,6 +422,10 @@ static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
 	if (register_netdevice(dev) < 0)
 		goto failed_free;
 
+	/* Can use a lockless transmit, unless we generate output sequences */
+	if (!(nt->parms.o_flags & GRE_SEQ))
+		dev->features |= NETIF_F_LLTX;
+
 	dev_hold(dev);
 	ipgre_tunnel_link(ign, nt);
 	return nt;
-- 
cgit v1.2.3


From f18da14565819ba43b8321237e2426a2914cc2ef Mon Sep 17 00:00:00 2001
From: Stefan Gula <steweg@gmail.com>
Date: Thu, 26 Jan 2012 11:01:06 +0000
Subject: net: RTNETLINK adjusting values of min_ifinfo_dump_size

Setting link parameters on a netdevice changes the value
of if_nlmsg_size(), therefore it is necessary to recalculate
min_ifinfo_dump_size.

Signed-off-by: Stefan Gula <steweg@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f16444bc6cbb..65aebd450027 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1509,6 +1509,9 @@ errout:
 
 	if (send_addr_notify)
 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	min_ifinfo_dump_size = max_t(u16, if_nlmsg_size(dev),
+				     min_ifinfo_dump_size);
+
 	return err;
 }
 
-- 
cgit v1.2.3


From 9018e93948c6f8f95fbcc9fa05f6c403d6adb406 Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@parallels.com>
Date: Thu, 26 Jan 2012 12:09:28 +0000
Subject: net: explicitly add jump_label.h header to sock.h

Commit 36a1211970193ce215de50ed1e4e1272bc814df1 removed linux/module.h
include statement from one of the headers that end up in net/sock.h.
It was providing us with static_branch() definition implicitly, so
after its removal the build got broken.

To fix this, and avoid having this happening in the future,
let me do the right thing and include linux/jump_label.h
explicitly in sock.h.

Signed-off-by: Glauber Costa <glommer@parallels.com>
Reported-by: Randy Dunlap <rdunlap@xenotime.net>
CC: David S. Miller <davem@davemloft.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/net/sock.h b/include/net/sock.h
index 4c69ac165e6b..91c1c8baf020 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -55,6 +55,7 @@
 #include <linux/uaccess.h>
 #include <linux/memcontrol.h>
 #include <linux/res_counter.h>
+#include <linux/jump_label.h>
 
 #include <linux/filter.h>
 #include <linux/rculist_nulls.h>
-- 
cgit v1.2.3


From 5ee4433efe99b9f39f6eff5052a177bbcfe72cea Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 26 Jan 2012 14:02:55 +0000
Subject: netns: Fail conspicously if someone uses net_generic at an
 inappropriate time.

By definition net_generic should never be called when it can return
NULL.  Fail conspicously with a BUG_ON to make it clear when people mess
up that a NULL return should never happen.

Recently there was a bug in the CAIF subsystem where it was registered
with register_pernet_device instead of register_pernet_subsys.  It was
erroneously concluded that net_generic could validly return NULL and
that net_assign_generic was buggy (when it was just inefficient).
Hopefully this BUG_ON will prevent people to coming to similar erroneous
conclusions in the futrue.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Tested-by: Sasha Levin <levinsasha928@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/generic.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h
index 3419bf5cd154..d55f43443335 100644
--- a/include/net/netns/generic.h
+++ b/include/net/netns/generic.h
@@ -41,6 +41,7 @@ static inline void *net_generic(const struct net *net, int id)
 	ptr = ng->ptr[id - 1];
 	rcu_read_unlock();
 
+	BUG_ON(!ptr);
 	return ptr;
 }
 #endif
-- 
cgit v1.2.3


From 8a8ee9aff6c3077dd9c2c7a77478e8ed362b96c6 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 26 Jan 2012 14:04:53 +0000
Subject: net caif: Register properly as a pernet subsystem.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

caif is a subsystem and as such it needs to register with
register_pernet_subsys instead of register_pernet_device.

Among other problems using register_pernet_device was resulting in
net_generic being called before the caif_net structure was allocated.
Which has been causing net_generic to fail with either BUG_ON's or by
return NULL pointers.

A more ugly problem that could be caused is packets in flight why the
subsystem is shutting down.

To remove confusion also remove the cruft cause by inappropriately
trying to fix this bug.

With the aid of the previous patch I have tested this patch and
confirmed that using register_pernet_subsys makes the failure go away as
it should.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: Sjur Brændeland <sjur.brandeland@stericsson.com>
Tested-by: Sasha Levin <levinsasha928@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/caif/caif_dev.c | 22 ++--------------------
 net/caif/cfcnfg.c   |  1 -
 2 files changed, 2 insertions(+), 21 deletions(-)

diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c
index 673728add60b..82c57069415f 100644
--- a/net/caif/caif_dev.c
+++ b/net/caif/caif_dev.c
@@ -59,8 +59,6 @@ struct cfcnfg *get_cfcnfg(struct net *net)
 {
 	struct caif_net *caifn;
 	caifn = net_generic(net, caif_net_id);
-	if (!caifn)
-		return NULL;
 	return caifn->cfg;
 }
 EXPORT_SYMBOL(get_cfcnfg);
@@ -69,8 +67,6 @@ static struct caif_device_entry_list *caif_device_list(struct net *net)
 {
 	struct caif_net *caifn;
 	caifn = net_generic(net, caif_net_id);
-	if (!caifn)
-		return NULL;
 	return &caifn->caifdevs;
 }
 
@@ -99,8 +95,6 @@ static struct caif_device_entry *caif_device_alloc(struct net_device *dev)
 	struct caif_device_entry *caifd;
 
 	caifdevs = caif_device_list(dev_net(dev));
-	if (!caifdevs)
-		return NULL;
 
 	caifd = kzalloc(sizeof(*caifd), GFP_KERNEL);
 	if (!caifd)
@@ -120,8 +114,6 @@ static struct caif_device_entry *caif_get(struct net_device *dev)
 	struct caif_device_entry_list *caifdevs =
 	    caif_device_list(dev_net(dev));
 	struct caif_device_entry *caifd;
-	if (!caifdevs)
-		return NULL;
 
 	list_for_each_entry_rcu(caifd, &caifdevs->list, list) {
 		if (caifd->netdev == dev)
@@ -321,8 +313,6 @@ void caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev,
 	struct caif_device_entry_list *caifdevs;
 
 	caifdevs = caif_device_list(dev_net(dev));
-	if (!cfg || !caifdevs)
-		return;
 	caifd = caif_device_alloc(dev);
 	if (!caifd)
 		return;
@@ -374,8 +364,6 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
 
 	cfg = get_cfcnfg(dev_net(dev));
 	caifdevs = caif_device_list(dev_net(dev));
-	if (!cfg || !caifdevs)
-		return 0;
 
 	caifd = caif_get(dev);
 	if (caifd == NULL && dev->type != ARPHRD_CAIF)
@@ -507,9 +495,6 @@ static struct notifier_block caif_device_notifier = {
 static int caif_init_net(struct net *net)
 {
 	struct caif_net *caifn = net_generic(net, caif_net_id);
-	if (WARN_ON(!caifn))
-		return -EINVAL;
-
 	INIT_LIST_HEAD(&caifn->caifdevs.list);
 	mutex_init(&caifn->caifdevs.lock);
 
@@ -527,9 +512,6 @@ static void caif_exit_net(struct net *net)
 	    caif_device_list(net);
 	struct cfcnfg *cfg =  get_cfcnfg(net);
 
-	if (!cfg || !caifdevs)
-		return;
-
 	rtnl_lock();
 	mutex_lock(&caifdevs->lock);
 
@@ -569,7 +551,7 @@ static int __init caif_device_init(void)
 {
 	int result;
 
-	result = register_pernet_device(&caif_net_ops);
+	result = register_pernet_subsys(&caif_net_ops);
 
 	if (result)
 		return result;
@@ -582,7 +564,7 @@ static int __init caif_device_init(void)
 
 static void __exit caif_device_exit(void)
 {
-	unregister_pernet_device(&caif_net_ops);
+	unregister_pernet_subsys(&caif_net_ops);
 	unregister_netdevice_notifier(&caif_device_notifier);
 	dev_remove_pack(&caif_packet_type);
 }
diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c
index 598aafb4cb51..ba9cfd47778a 100644
--- a/net/caif/cfcnfg.c
+++ b/net/caif/cfcnfg.c
@@ -309,7 +309,6 @@ int caif_connect_client(struct net *net, struct caif_connect_request *conn_req,
 	int err;
 	struct cfctrl_link_param param;
 	struct cfcnfg *cfg = get_cfcnfg(net);
-	caif_assert(cfg != NULL);
 
 	rcu_read_lock();
 	err = caif_connect_req_to_link_param(cfg, conn_req, &param);
-- 
cgit v1.2.3


From 4acb41903b2f99f3dffd4c3df9acc84ca5942cb2 Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@parallels.com>
Date: Mon, 30 Jan 2012 01:20:17 +0000
Subject: net/tcp: Fix tcp memory limits initialization when !CONFIG_SYSCTL

sysctl_tcp_mem() initialization was moved to sysctl_tcp_ipv4.c
in commit 3dc43e3e4d0b52197d3205214fe8f162f9e0c334, since it
became a per-ns value.

That code, however, will never run when CONFIG_SYSCTL is
disabled, leading to bogus values on those fields - causing hung
TCP sockets.

This patch fixes it by keeping an initialization code in
tcp_init(). It will be overwritten by the first net namespace
init if CONFIG_SYSCTL is compiled in, and do the right thing if
it is compiled out.

It is also named properly as tcp_init_mem(), to properly signal
its non-sysctl side effect on TCP limits.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: David S. Miller <davem@davemloft.net>
Link: http://lkml.kernel.org/r/4F22D05A.8030604@parallels.com
[ renamed the function, tidied up the changelog a bit ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h          |  2 ++
 net/ipv4/sysctl_net_ipv4.c |  1 +
 net/ipv4/tcp.c             | 16 +++++++++++++---
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0118ea999f67..d49db0113a06 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -311,6 +311,8 @@ extern struct proto tcp_prot;
 #define TCP_ADD_STATS_USER(net, field, val) SNMP_ADD_STATS_USER((net)->mib.tcp_statistics, field, val)
 #define TCP_ADD_STATS(net, field, val)	SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
 
+extern void tcp_init_mem(struct net *net);
+
 extern void tcp_v4_err(struct sk_buff *skb, u32);
 
 extern void tcp_shutdown (struct sock *sk, int how);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4aa7e9dc0cbb..4cb9cd2f2c39 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -814,6 +814,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 
 	net->ipv4.sysctl_rt_cache_rebuild_count = 4;
 
+	tcp_init_mem(net);
 	limit = nr_free_buffer_pages() / 8;
 	limit = max(limit, 128UL);
 	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9bcdec3ad772..06373b4a449a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3216,6 +3216,16 @@ static int __init set_thash_entries(char *str)
 }
 __setup("thash_entries=", set_thash_entries);
 
+void tcp_init_mem(struct net *net)
+{
+	/* Set per-socket limits to no more than 1/128 the pressure threshold */
+	unsigned long limit = nr_free_buffer_pages() / 8;
+	limit = max(limit, 128UL);
+	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
+	net->ipv4.sysctl_tcp_mem[1] = limit;
+	net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
+}
+
 void __init tcp_init(void)
 {
 	struct sk_buff *skb = NULL;
@@ -3276,9 +3286,9 @@ void __init tcp_init(void)
 	sysctl_tcp_max_orphans = cnt / 2;
 	sysctl_max_syn_backlog = max(128, cnt / 256);
 
-	/* Set per-socket limits to no more than 1/128 the pressure threshold */
-	limit = ((unsigned long)init_net.ipv4.sysctl_tcp_mem[1])
-		<< (PAGE_SHIFT - 7);
+	tcp_init_mem(&init_net);
+	limit = nr_free_buffer_pages() / 8;
+	limit = max(limit, 128UL);
 	max_share = min(4UL*1024*1024, limit);
 
 	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
-- 
cgit v1.2.3


From 5b35e1e6e9ca651e6b291c96d1106043c9af314a Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Sat, 28 Jan 2012 17:29:46 +0000
Subject: tcp: fix tcp_trim_head() to adjust segment count with skb MSS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit fixes tcp_trim_head() to recalculate the number of
segments in the skb with the skb's existing MSS, so trimming the head
causes the skb segment count to be monotonically non-increasing - it
should stay the same or go down, but not increase.

Previously tcp_trim_head() used the current MSS of the connection. But
if there was a decrease in MSS between original transmission and ACK
(e.g. due to PMTUD), this could cause tcp_trim_head() to
counter-intuitively increase the segment count when trimming bytes off
the head of an skb. This violated assumptions in tcp_tso_acked() that
tcp_trim_head() only decreases the packet count, so that packets_acked
in tcp_tso_acked() could underflow, leading tcp_clean_rtx_queue() to
pass u32 pkts_acked values as large as 0xffffffff to
ca_ops->pkts_acked().

As an aside, if tcp_trim_head() had really wanted the skb to reflect
the current MSS, it should have called tcp_set_skb_tso_segs()
unconditionally, since a decrease in MSS would mean that a
single-packet skb should now be sliced into multiple segments.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Nandita Dukkipati <nanditad@google.com>
Acked-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8c8de2780c7a..4ff3b6dc74fc 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1141,11 +1141,9 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 	sk_mem_uncharge(sk, len);
 	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
 
-	/* Any change of skb->len requires recalculation of tso
-	 * factor and mss.
-	 */
+	/* Any change of skb->len requires recalculation of tso factor. */
 	if (tcp_skb_pcount(skb) > 1)
-		tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk));
+		tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
 
 	return 0;
 }
-- 
cgit v1.2.3


From 6f01fd6e6f6809061b56e78f1e8d143099716d70 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sat, 28 Jan 2012 16:11:03 +0000
Subject: af_unix: fix EPOLLET regression for stream sockets

Commit 0884d7aa24 (AF_UNIX: Fix poll blocking problem when reading from
a stream socket) added a regression for epoll() in Edge Triggered mode
(EPOLLET)

Appropriate fix is to use skb_peek()/skb_unlink() instead of
skb_dequeue(), and only call skb_unlink() when skb is fully consumed.

This remove the need to requeue a partial skb into sk_receive_queue head
and the extra sk->sk_data_ready() calls that added the regression.

This is safe because once skb is given to sk_receive_queue, it is not
modified by a writer, and readers are serialized by u->readlock mutex.

This also reduce number of spinlock acquisition for small reads or
MSG_PEEK users so should improve overall performance.

Reported-by: Nick Mathewson <nickm@freehaven.net>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Alexey Moiseytsev <himeraster@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index aad8fb699989..85d3bb7490aa 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1918,7 +1918,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 		struct sk_buff *skb;
 
 		unix_state_lock(sk);
-		skb = skb_dequeue(&sk->sk_receive_queue);
+		skb = skb_peek(&sk->sk_receive_queue);
 		if (skb == NULL) {
 			unix_sk(sk)->recursion_level = 0;
 			if (copied >= target)
@@ -1958,11 +1958,8 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 		if (check_creds) {
 			/* Never glue messages from different writers */
 			if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
-			    (UNIXCB(skb).cred != siocb->scm->cred)) {
-				skb_queue_head(&sk->sk_receive_queue, skb);
-				sk->sk_data_ready(sk, skb->len);
+			    (UNIXCB(skb).cred != siocb->scm->cred))
 				break;
-			}
 		} else {
 			/* Copy credentials */
 			scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
@@ -1977,8 +1974,6 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 
 		chunk = min_t(unsigned int, skb->len, size);
 		if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
-			skb_queue_head(&sk->sk_receive_queue, skb);
-			sk->sk_data_ready(sk, skb->len);
 			if (copied == 0)
 				copied = -EFAULT;
 			break;
@@ -1993,13 +1988,10 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 			if (UNIXCB(skb).fp)
 				unix_detach_fds(siocb->scm, skb);
 
-			/* put the skb back if we didn't use it up.. */
-			if (skb->len) {
-				skb_queue_head(&sk->sk_receive_queue, skb);
-				sk->sk_data_ready(sk, skb->len);
+			if (skb->len)
 				break;
-			}
 
+			skb_unlink(skb, &sk->sk_receive_queue);
 			consume_skb(skb);
 
 			if (siocb->scm->fp)
@@ -2010,9 +2002,6 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 			if (UNIXCB(skb).fp)
 				siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
 
-			/* put message back and return */
-			skb_queue_head(&sk->sk_receive_queue, skb);
-			sk->sk_data_ready(sk, skb->len);
 			break;
 		}
 	} while (size);
-- 
cgit v1.2.3